In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn as sk
import nltk
from datasets import load_dataset
import json

dataset = load_dataset("tner/ontonotes5")
ontonotes5_train_dataset = pd.DataFrame(dataset['train'])

# Read the JSON file
with open('dataset_label.json', 'r') as file:
    labels = json.load(file)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
labels = { v:k for k,v in labels.items() }

training_tokens = []

for row in ontonotes5_train_dataset.iterrows():
    zipped = list(zip(row[1]['tokens'], row[1]['tags']))
    
    for item in zipped:
        training_tokens.append((item[0], labels[item[1]], nltk.pos_tag([item[0]])[0][1]))

print(training_tokens[0:10])

[('People', 'O', 'NNS'), ('start', 'O', 'NN'), ('their', 'O', 'PRP$'), ('own', 'O', 'JJ'), ('businesses', 'O', 'NNS'), ('for', 'O', 'IN'), ('many', 'O', 'JJ'), ('reasons', 'O', 'NNS'), ('.', 'O', '.'), ('But', 'O', 'CC')]


In [3]:
df_ner_test = pd.read_csv('data/ner-test.tsv', delimiter='\t')

# adding POS tags
def pos_tagging(token):
    return nltk.pos_tag([token])[0][1]

df_ner_test['POS'] = df_ner_test['token'].apply(pos_tagging)
df_ner_test.head(20)

Unnamed: 0,sentence id,token id,token,BIO NER tag,POS
0,0,0,I,O,PRP
1,0,1,would,O,MD
2,0,2,n't,O,RB
3,0,3,be,O,VB
4,0,4,caught,O,NN
5,0,5,dead,O,JJ
6,0,6,watching,O,VBG
7,0,7,the,O,DT
8,0,8,NFL,B-ORG,NN
9,0,9,if,O,IN


**Added the NLTK pos_tags to the test set because they are a useful feature for NERC.**

In [4]:
test_features = []
test_gold_labels = []

test_words = df_ner_test.iterrows()
for i, row in test_words:
    token: str = row['token']
    
    a_dict = {
        'word': token,                # the word itself
        'pos': row['POS'],                   # the part of speech
        'word[-3:]': token[-3:],      # the last three characters of the word
        'word[-2:]': token[-2:],      # the last two characters of the word
        'upper': token[0].isupper(),  # whether the first letter is uppercase
        'title': token.istitle(),      # whether the word is titlecased 
        'prev-word': "!NEWSENTENCE"     # We had None type if the first word but that screwed with the 
                                 
        # 'upper' and 'lower' will be slightly different because
        # of words like 'EU'
    }
    
    #If we have a previous word, add it to the features list to improve NERC
    if i > 0:
        # only get previous word if they're part of the same sentence
        prev_row = df_ner_test.iloc[i - 1]
        
        if prev_row['sentence id'] == row['sentence id']:
            a_dict['prev-word'] = prev_row['token']

    test_features.append(a_dict)
    test_gold_labels.append(row['BIO NER tag'])


In [5]:
test_features[0:10]

[{'word': 'I',
  'pos': 'PRP',
  'word[-3:]': 'I',
  'word[-2:]': 'I',
  'upper': True,
  'title': True,
  'prev-word': '!NEWSENTENCE'},
 {'word': 'would',
  'pos': 'MD',
  'word[-3:]': 'uld',
  'word[-2:]': 'ld',
  'upper': False,
  'title': False,
  'prev-word': 'I'},
 {'word': "n't",
  'pos': 'RB',
  'word[-3:]': "n't",
  'word[-2:]': "'t",
  'upper': False,
  'title': False,
  'prev-word': 'would'},
 {'word': 'be',
  'pos': 'VB',
  'word[-3:]': 'be',
  'word[-2:]': 'be',
  'upper': False,
  'title': False,
  'prev-word': "n't"},
 {'word': 'caught',
  'pos': 'NN',
  'word[-3:]': 'ght',
  'word[-2:]': 'ht',
  'upper': False,
  'title': False,
  'prev-word': 'be'},
 {'word': 'dead',
  'pos': 'JJ',
  'word[-3:]': 'ead',
  'word[-2:]': 'ad',
  'upper': False,
  'title': False,
  'prev-word': 'caught'},
 {'word': 'watching',
  'pos': 'VBG',
  'word[-3:]': 'ing',
  'word[-2:]': 'ng',
  'upper': False,
  'title': False,
  'prev-word': 'dead'},
 {'word': 'the',
  'pos': 'DT',
  'word[-3:]':

In [6]:
# from nltk.corpus.reader import ConllCorpusReader
# ### Adapt the path to point to the CONLL2003 folder on your local machine
# train = ConllCorpusReader('./data/CONLL2003', 'train.txt', ['words', 'pos', 'ignore', 'chunk'])

# train_words = list(train.iob_words())

In [7]:
training_features = []
training_gold_labels = []

for i, (token, ne_label, pos) in enumerate(training_tokens):
   if token == '' or token == 'DOCSTART':
      continue
   
   a_dict = {
      'word': token,                # the word itself
      'pos': pos,                   # the part of speech
      'word[-3:]': token[-3:],      # the last three characters of the word
      'word[-2:]': token[-2:],      # the last two characters of the word
      'upper': token[0].isupper(),  # whether the first letter is uppercase
      'title': token.istitle(),     # whether the word is titlecased 
      'prev-word': "!NEWSENTENCE",       # the previous word (!NEWSENTENCE if it's the first word)
   }
   
   # If the sentence is not ending
   if i > 0:
      prev_token = training_tokens[i - 1][0]
      if prev_token != '.':
         a_dict['prev-word'] = prev_token
      

   training_features.append(a_dict)
   training_gold_labels.append(ne_label)

In [8]:
print(training_features[:30])

[{'word': 'People', 'pos': 'NNS', 'word[-3:]': 'ple', 'word[-2:]': 'le', 'upper': True, 'title': True, 'prev-word': '!NEWSENTENCE'}, {'word': 'start', 'pos': 'NN', 'word[-3:]': 'art', 'word[-2:]': 'rt', 'upper': False, 'title': False, 'prev-word': 'People'}, {'word': 'their', 'pos': 'PRP$', 'word[-3:]': 'eir', 'word[-2:]': 'ir', 'upper': False, 'title': False, 'prev-word': 'start'}, {'word': 'own', 'pos': 'JJ', 'word[-3:]': 'own', 'word[-2:]': 'wn', 'upper': False, 'title': False, 'prev-word': 'their'}, {'word': 'businesses', 'pos': 'NNS', 'word[-3:]': 'ses', 'word[-2:]': 'es', 'upper': False, 'title': False, 'prev-word': 'own'}, {'word': 'for', 'pos': 'IN', 'word[-3:]': 'for', 'word[-2:]': 'or', 'upper': False, 'title': False, 'prev-word': 'businesses'}, {'word': 'many', 'pos': 'JJ', 'word[-3:]': 'any', 'word[-2:]': 'ny', 'upper': False, 'title': False, 'prev-word': 'for'}, {'word': 'reasons', 'pos': 'NNS', 'word[-3:]': 'ons', 'word[-2:]': 'ns', 'upper': False, 'title': False, 'prev-w

In [9]:
from collections import Counter
Counter(training_tokens[i][1] for i in range(len(training_tokens)))


Counter({'O': 939111,
         'I-ORG': 18246,
         'B-PERSON': 15429,
         'B-GPE': 15405,
         'I-DATE': 13333,
         'B-ORG': 12820,
         'I-PERSON': 11147,
         'B-DATE': 10922,
         'B-CARDINAL': 7355,
         'B-NORP': 6870,
         'I-MONEY': 4912,
         'I-GPE': 3679,
         'I-PERCENT': 2498,
         'B-MONEY': 2411,
         'I-WORK_OF_ART': 2400,
         'I-CARDINAL': 2289,
         'B-PERCENT': 1763,
         'B-ORDINAL': 1640,
         'I-EVENT': 1605,
         'B-LOC': 1514,
         'I-TIME': 1507,
         'I-FAC': 1467,
         'I-LOC': 1395,
         'I-QUANTITY': 1235,
         'B-TIME': 1233,
         'B-WORK_OF_ART': 974,
         'B-FAC': 860,
         'I-LAW': 785,
         'B-EVENT': 748,
         'B-QUANTITY': 657,
         'B-PRODUCT': 606,
         'I-PRODUCT': 576,
         'I-NORP': 446,
         'B-LANGUAGE': 304,
         'B-LAW': 282,
         'I-LANGUAGE': 13,
         'I-ORDINAL': 5})

In [10]:
Counter(df_ner_test['BIO NER tag'])

Counter({'O': 160,
         'I-WORK_OF_ART': 9,
         'B-PERSON': 6,
         'I-ORG': 6,
         'B-WORK_OF_ART': 4,
         'B-ORG': 3,
         'I-PERSON': 3,
         'B-DATE': 1,
         'I-DATE': 1})

In [11]:
# #count nans in training_features
# for key in training_features[0].keys():
#     print(key, sum(1 for i in training_features if i[key] is None))

In [12]:
len(test_features), len(training_features)

(193, 1088442)

In [13]:
from sklearn.feature_extraction import DictVectorizer

# Concat all features
all_features = training_features.copy()
all_features.extend(test_features)

vec = DictVectorizer()
the_array = vec.fit_transform(all_features)
print(the_array.shape)

vec_training_features = the_array[:len(training_features)]
vec_test_features = the_array[len(training_features):]

print(vec_training_features.shape, vec_test_features.shape)

(1088635, 89833)
(1088442, 89833) (193, 89833)


In [14]:
from sklearn import svm
lin_clf = svm.LinearSVC()
from sklearn.metrics import classification_report

lin_clf.fit(vec_training_features, training_gold_labels)
predicted = lin_clf.predict(vec_test_features)

report = classification_report(test_gold_labels, predicted)

print(report)



               precision    recall  f1-score   support

   B-CARDINAL       0.00      0.00      0.00         0
       B-DATE       1.00      1.00      1.00         1
        B-GPE       0.00      0.00      0.00         0
        B-ORG       0.67      0.67      0.67         3
     B-PERSON       0.86      1.00      0.92         6
B-WORK_OF_ART       0.00      0.00      0.00         4
       I-DATE       0.50      1.00      0.67         1
      I-EVENT       0.00      0.00      0.00         0
        I-GPE       0.00      0.00      0.00         0
        I-ORG       0.50      0.17      0.25         6
     I-PERSON       1.00      0.33      0.50         3
I-WORK_OF_ART       0.00      0.00      0.00         9
            O       0.92      0.99      0.96       160

     accuracy                           0.89       193
    macro avg       0.42      0.40      0.38       193
 weighted avg       0.84      0.89      0.86       193



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
import gensim
path = "C:\\Users\\Yari\\Downloads\\GoogleNews-vectors-negative300.bin"

word_embedding_model = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True)

In [None]:
from sklearn.metrics import classification_report
def get_train_vectors_labels() -> tuple[list, list]:
    input_vectors = []
    labels = []
    
    for token, _, ne_label in training_tokens:
        if token =='' or token == 'DOCSTART':
            continue
        
        if token in word_embedding_model:
            vector = word_embedding_model[token]
        else:
            vector = [0]*300
            
        input_vectors.append(vector)
        labels.append(ne_label)
        
    return (input_vectors, labels)

def get_vectors_labels(data: pd.DataFrame) -> tuple[list, list]:
    input_vectors = []
    labels = []
    
    for _, row in data.iterrows():
        token = row['token']
        ne_label = row['BIO NER tag']

        if token in word_embedding_model:
            vector = word_embedding_model[token]
        else:
            vector = [0] * 300
        input_vectors.append(vector)
        labels.append(ne_label)
            
    return (input_vectors, labels)

input_vectors, labels = get_train_vectors_labels()
test_input_vectors, test_labels = get_vectors_labels(df_ner_test)

In [None]:
from sklearn import svm

lin_clf = svm.LinearSVC()
lin_clf.fit(input_vectors, labels)
predicted_labels = lin_clf.predict(test_input_vectors)

report = classification_report(test_labels, predicted_labels)
print(report)