### Predicting on Test Data

#### Load Libraries

In [1]:
# Data Analysis
import numpy as np
import csv
import pandas as pd
pd.set_option('display.max_rows', 100)
pd.set_option("display.max_colwidth", 30)
pd.set_option("display.max_columns", 30)

# Text Preprocessing
from utils import sent2features

# Saving and loading Model 
import pickle

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Vinubalan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Vinubalan\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


#### Load Test Data

In [2]:
df_test = pd.read_csv("data/test_new.txt", sep = "\s+",quoting=csv.QUOTE_NONE)
print(df_test.shape)
df_test.head()

(19697, 2)


Unnamed: 0,sentence_id,word
0,276,Iran
1,276,this
2,276,week
3,276,restarted
4,276,parts


#### Load Model

In [3]:
model = pickle.load(open("model/ner_model.pickle", 'rb'))

In [4]:
model.transition_features_

{('B-gpe', 'B-gpe'): -3.163222,
 ('B-gpe', 'O'): 0.350024,
 ('B-gpe', 'B-per'): 0.769702,
 ('B-gpe', 'I-per'): -2.482244,
 ('B-gpe', 'I-geo'): -0.925704,
 ('B-gpe', 'B-tim'): -1.535012,
 ('B-gpe', 'I-tim'): -0.369862,
 ('B-gpe', 'B-org'): 1.708464,
 ('B-gpe', 'I-org'): -2.755583,
 ('B-gpe', 'I-eve'): -0.274743,
 ('B-gpe', 'I-gpe'): 3.643499,
 ('B-gpe', 'I-art'): -0.006919,
 ('O', 'B-gpe'): 0.40253,
 ('O', 'O'): 1.901419,
 ('O', 'B-per'): 1.000093,
 ('O', 'I-per'): -4.48834,
 ('O', 'B-geo'): 0.454614,
 ('O', 'I-geo'): -4.881346,
 ('O', 'B-tim'): 0.567111,
 ('O', 'I-tim'): -4.884157,
 ('O', 'B-org'): 0.586498,
 ('O', 'I-org'): -5.490619,
 ('O', 'B-eve'): 0.203691,
 ('O', 'I-eve'): -2.3337,
 ('O', 'I-gpe'): -2.518099,
 ('O', 'B-art'): 0.443093,
 ('O', 'I-art'): -2.715411,
 ('O', 'I-nat'): -1.200186,
 ('B-per', 'B-gpe'): -0.104804,
 ('B-per', 'O'): -0.136357,
 ('B-per', 'B-per'): -3.173376,
 ('B-per', 'I-per'): 4.57733,
 ('B-per', 'B-geo'): -0.02149,
 ('B-per', 'B-tim'): -1.493628,
 ('B-pe

In [None]:
xx = pd.DataFrame([[keys[0],keys[1],values] for keys, values in model.state_features_.items()], columns = ['from','to','score'])
xx.head()

In [None]:
xx[xx['from'] == '+1:word.lemma:this']

#### Data preprocessing - Test Data

Extract sentences and created features

In [None]:
test_sentences = []

for i in df_test.sentence_id.unique():
    test_sentences.append(df_test[df_test.sentence_id == i].word.tolist())

X_test= [sent2features(s) for s in test_sentences]

pd.DataFrame(X_test[0])

In [None]:
pd.DataFrame(X_test[0]).shape

#### Predict on Test Data

In [None]:
y_pred = model.predict(X_test)
y_pred_flat = [x for sublist in y_pred for x in sublist]
df_test['Tag'] = y_pred_flat
df_test.head()

In [None]:
X_test[0]

In [None]:
pd.options.display.float_format = '{:20,.6f}'.format
pd.set_option('display.max_colwidth', 1000)


In [None]:
n = []

X_test[0][2]

In [None]:
pd.DataFrame(model.predict_marginals(X_test[0][0])).astype(object)

In [None]:
pd.DataFrame(model.predict_marginals(X_test[1])).astype(object)

In [None]:
z = {'B-gpe': 0.0024998276781207167, 'O': 0.0022653554608588968, 'B-per': 0.0032870215197021864, 'I-per': 0.003739451059759381, 'B-geo': 0.018977163419604436, 'I-geo': 0.004111864475637663, 'B-tim': 0.25482866993206466, 'I-tim': 0.3874132306931088, 'B-org': 0.01360795767420368, 'I-org': 0.004959731996447832, 'B-eve': 0.008249502870830032, 'I-eve': 0.003556243831611179, 'I-gpe': 0.0037888238760149687, 'B-nat': 0.0031689754607647706, 'B-art': 0.15253235038936294, 'I-art': 0.12991061769661125, 'I-nat': 0.003103211965296966}

In [None]:
pd.DataFrame(z.items)

#### Save the results

In [None]:
df_test.to_csv('results/test_results.csv', index = False)