In [16]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn as sk
import nltk

In [17]:
df_ner_test = pd.read_csv('data/ner-test.tsv', delimiter='\t')

# adding POS tags
def pos_tagging(token):
    return nltk.pos_tag([token])[0][1]

df_ner_test['POS'] = df_ner_test['token'].apply(pos_tagging)
df_ner_test.head()


Unnamed: 0,sentence id,token id,token,BIO NER tag,POS
0,0,0,I,O,PRP
1,0,1,would,O,MD
2,0,2,n't,O,RB
3,0,3,be,O,VB
4,0,4,caught,O,NN


**Added the NLTK pos_tags to the test set because they are a useful feature for NERC.**

In [31]:
#### Named Entity Recognition

test_features = []
test_gold_labels = []

rows = df_ner_test.iterrows()
for i, row in rows:
    token = row['token']
    
    a_dict = {
        'word': token,                # the word itself
        'pos': row['POS'],                   # the part of speech
        'word[-3:]': token[-3:],      # the last three characters of the word
        'word[-2:]': token[-2:],      # the last two characters of the word
        'upper': token[0].isupper(),  # whether the first letter is uppercase
        'title': token.istitle(),      # whether the word is titlecased 
                                 
        # 'upper' and 'lower' will be slightly different because
        # of words like 'EU'
    }
    
    # If we have a previous word, add it to the features list to improve NERC
    if i > 0:
        a_dict['prev-word'] = df_ner_test.iloc[i - 1]['token']

    test_features.append(a_dict)
    test_gold_labels.append(row['BIO NER tag'])