In [12]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn as sk
import nltk

In [27]:
df_ner_test = pd.read_csv('data/ner-test.tsv', delimiter='\t')

# adding POS tags
def pos_tagging(token):
    return nltk.pos_tag([token])[0][1]

df_ner_test['POS'] = df_ner_test['token'].apply(pos_tagging)
df_ner_test.head(20)

Unnamed: 0,sentence id,token id,token,BIO NER tag,POS
0,0,0,I,O,PRP
1,0,1,would,O,MD
2,0,2,n't,O,RB
3,0,3,be,O,VB
4,0,4,caught,O,NN
5,0,5,dead,O,JJ
6,0,6,watching,O,VBG
7,0,7,the,O,DT
8,0,8,NFL,B-ORG,NN
9,0,9,if,O,IN


**Added the NLTK pos_tags to the test set because they are a useful feature for NERC.**

In [28]:
test_features = []
test_gold_labels = []

test_words = df_ner_test.iterrows()
for i, row in test_words:
    token: str = row['token']
    
    a_dict = {
        'word': token,                # the word itself
        'pos': row['POS'],                   # the part of speech
        'word[-3:]': token[-3:],      # the last three characters of the word
        'word[-2:]': token[-2:],      # the last two characters of the word
        'upper': token[0].isupper(),  # whether the first letter is uppercase
        'title': token.istitle(),      # whether the word is titlecased 
        'prev-word': None
                                 
        # 'upper' and 'lower' will be slightly different because
        # of words like 'EU'
    }
    
    #If we have a previous word, add it to the features list to improve NERC
    if i > 0:
        # only get previous word if they're part of the same sentence
        prev_row = df_ner_test.iloc[i - 1]
        
        if prev_row['sentence id'] == row['sentence id']:
            a_dict['prev-word'] = prev_row['token']

    test_features.append(a_dict)
    test_gold_labels.append(row['BIO NER tag'])


In [29]:
from nltk.corpus.reader import ConllCorpusReader
### Adapt the path to point to the CONLL2003 folder on your local machine
train = ConllCorpusReader('./data/CONLL2003', 'train.txt', ['words', 'pos', 'ignore', 'chunk'])
training_features = []
training_gold_labels = []

train_words = list(train.iob_words())

In [32]:
from collections import Counter
print(Counter(train_words[i][2] for i in range(len(train_words))))
print(Counter(df_ner_test['BIO NER tag']))

for i, (token, pos, ne_label) in enumerate(train_words):
   if token == '' or token == 'DOCSTART':
      continue
   
   a_dict = {
      'word': token,                # the word itself
      'pos': pos,                   # the part of speech
      'word[-3:]': token[-3:],      # the last three characters of the word
      'word[-2:]': token[-2:],      # the last two characters of the word
      'upper': token[0].isupper(),  # whether the first letter is uppercase
      'title': token.istitle(),     # whether the word is titlecased 
      'prev-word': None,       # the previous word (None if it's the first word in the dataset)
   }
   
   # If the sentence is not ending
   if i > 0:
      prev_token = train_words[i - 1][0]
      if prev_token != '.':
         a_dict['prev-word'] = prev_token
      

   training_features.append(a_dict)
   training_gold_labels.append(ne_label)

Counter({'O': 169578, 'B-LOC': 7140, 'B-PER': 6600, 'B-ORG': 6321, 'I-PER': 4528, 'I-ORG': 3704, 'B-MISC': 3438, 'I-LOC': 1157, 'I-MISC': 1155})
Counter({'O': 160, 'I-WORK_OF_ART': 9, 'B-PER': 6, 'I-ORG': 6, 'B-WORK_OF_ART': 4, 'B-ORG': 3, 'I-PER': 3, 'B-DATE': 1, 'I-DATE': 1})


In [8]:
import gensim
path = "C:\\Users\\Yari\\Downloads\\GoogleNews-vectors-negative300.bin"

word_embedding_model = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True)

In [19]:
from sklearn.metrics import classification_report
def get_train_vectors_labels() -> tuple[list, list]:
    input_vectors = []
    labels = []
    
    for token, _, ne_label in train_words:
        if token =='' or token == 'DOCSTART':
            continue
        
        if token in word_embedding_model:
            vector = word_embedding_model[token]
        else:
            vector = [0]*300
            
        input_vectors.append(vector)
        labels.append(ne_label)
        
    return (input_vectors, labels)

def get_vectors_labels(data: pd.DataFrame) -> tuple[list, list]:
    input_vectors = []
    labels = []
    
    for _, row in data.iterrows():
        token = row['token']
        ne_label = row['BIO NER tag']

        if token in word_embedding_model:
            vector = word_embedding_model[token]
        else:
            vector = [0] * 300
        input_vectors.append(vector)
        labels.append(ne_label)
            
    return (input_vectors, labels)

input_vectors, labels = get_train_vectors_labels()
test_input_vectors, test_labels = get_vectors_labels(df_ner_test)

In [20]:
from sklearn import svm

lin_clf = svm.LinearSVC()
lin_clf.fit(input_vectors, labels)
predicted_labels = lin_clf.predict(test_input_vectors)

report = classification_report(test_labels, predicted_labels)
print(report)



               precision    recall  f1-score   support

       B-DATE       0.00      0.00      0.00         1
       B-MISC       0.00      0.00      0.00         0
        B-ORG       0.40      0.67      0.50         3
        B-PER       0.67      0.67      0.67         6
B-WORK_OF_ART       0.00      0.00      0.00         4
       I-DATE       0.00      0.00      0.00         1
        I-LOC       0.00      0.00      0.00         0
        I-ORG       0.00      0.00      0.00         6
        I-PER       0.50      0.33      0.40         3
I-WORK_OF_ART       0.00      0.00      0.00         9
            O       0.91      1.00      0.96       160

     accuracy                           0.87       193
    macro avg       0.23      0.24      0.23       193
 weighted avg       0.79      0.87      0.83       193



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
