In [0]:
# Reference: https://towardsdatascience.com/named-entity-recognition-and-classification-with-scikit-learn-f05372f07ba2
# Mount Google Drive into the Colab runtime at /content/gdrive so the
# dataset files stored there can be copied into the working directory.
from google.colab import drive
drive.mount('/content/gdrive')

In [0]:
# Copy data to google colab from google drive and unzip
# This may take 1-2 minutes
!cp gdrive/My\ Drive/ADBI\ Project/Dataset/ner_dataset.csv.zip .
!cp gdrive/My\ Drive/ADBI\ Project/Dataset/barack.txt .
!unzip ner_dataset.csv.zip

Archive:  ner_dataset.csv.zip
replace ner_dataset.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

KeyboardInterrupt: ignored

In [0]:
# List the working directory to check which files are present.
! ls 

## Implementation

In [0]:
import pandas as pd
import re
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

### Load Data

In [0]:
# Only the first N_ROWS rows of the CSV are used in this notebook. Passing
# `nrows` makes the parser stop there instead of loading the complete file
# and slicing it afterwards.
N_ROWS = 10000
df = pd.read_csv('ner_dataset.csv', encoding="ISO-8859-1", nrows=N_ROWS)
df.head()

In [0]:
# Number of missing cells in each column.
df.isna().sum()

## Data Preprocessing

In [0]:
# Forward-fill: every missing cell takes the value of the closest non-missing
# cell above it in the same column. `DataFrame.ffill()` is the supported
# spelling; `fillna(method='ffill')` is deprecated in recent pandas releases.
df = df.ffill()
# Distinct sentence ids, distinct words and distinct tags in the frame.
df['Sentence #'].nunique(), df.Word.nunique(), df.Tag.nunique()

#### Tags not evenly distributed

In [0]:
# Number of rows per tag, shown as a frame with the columns 'Tag' and 'counts'.
(
    df.groupby('Tag')
      .size()
      .reset_index(name='counts')
)

### Train-Test split - 67-33%

In [0]:
# Every column except 'Tag' is turned into one record (dict) per row and
# encoded by DictVectorizer into a dense matrix.
feature_records = df.drop(columns='Tag').to_dict('records')
v = DictVectorizer(sparse=False)
X = v.fit_transform(feature_records)

# Labels and the sorted list of distinct labels.
y = df.Tag.values
classes = np.unique(y).tolist()

# Hold out 33% of the rows for testing; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=0
)
X_train.shape, y_train.shape

In [0]:
# Labels used in the classification reports: a copy of `classes` without its
# last entry (the popped label is shown as the cell output).
# NOTE(review): presumably the last entry of the sorted label list is the 'O'
# tag - confirm with the displayed value.
new_classes = list(classes)
new_classes.pop()

## 1) Perceptron

In [0]:
# Perceptron trained through partial_fit, which needs the full label list.
# NOTE(review): training goes through partial_fit rather than fit - confirm
# that max_iter has the intended effect on this code path.
per = Perceptron(max_iter=5, n_jobs=-1, verbose=10)
per.partial_fit(X_train, y_train, classes=classes)

In [0]:
# Per-label precision/recall/F1 on the held-out rows, restricted to new_classes.
per_pred = per.predict(X_test)
print(classification_report(y_true=y_test, y_pred=per_pred, labels=new_classes))

## 2) LR - SGD

In [0]:
# SGD-trained linear classifier with default settings, fitted via partial_fit
# and scored on the held-out rows for the labels in new_classes.
sgd = SGDClassifier()
sgd.partial_fit(X_train, y_train, classes=classes)
sgd_pred = sgd.predict(X_test)
print(classification_report(y_true=y_test, y_pred=sgd_pred, labels=new_classes))

## 3) Naive Bayes classifier for multinomial models

In [0]:
# Multinomial naive Bayes (alpha=0.01), fitted via partial_fit and scored on
# the held-out rows for the labels in new_classes.
nb = MultinomialNB(alpha=0.01)
nb.partial_fit(X_train, y_train, classes=classes)
nb_pred = nb.predict(X_test)
print(classification_report(y_true=y_test, y_pred=nb_pred, labels=new_classes))

In [0]:
# 4) Passive Aggressive Classifier
# Default settings, fitted via partial_fit and scored on the held-out rows
# for the labels in new_classes.
pa = PassiveAggressiveClassifier()
pa.partial_fit(X_train, y_train, classes=classes)
pa_pred = pa.predict(X_test)
print(classification_report(y_true=y_test, y_pred=pa_pred, labels=new_classes))

## CRF - Conditional Random Fields 

In [0]:
# Install sklearn_crfsuite, which the CRF cells below import.
! pip install sklearn_crfsuite

In [0]:
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from collections import Counter

### Sentences

In [0]:
class SentenceGetter(object):
    """Group a token-level frame into sentences of (word, POS, tag) triples.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with the columns 'Sentence #', 'Word', 'POS' and 'Tag'.

    Attributes
    ----------
    grouped : pandas.Series
        One list of (word, POS, tag) tuples per distinct 'Sentence #' value,
        indexed by that value.
    sentences : list
        The entries of ``grouped`` as a plain list, in ``grouped`` order.
    n_sent : int
        Number of the sentence that ``get_next`` will look up next (starts at 1).
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        # Collapse the rows of one sentence into a list of aligned triples.
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s['Word'].values.tolist(),
                                                           s['POS'].values.tolist(),
                                                           s['Tag'].values.tolist())]
        self.grouped = self.data.groupby('Sentence #').apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        """Return the sentence labelled 'Sentence: <n_sent>' and advance.

        Returns
        -------
        list or None
            The list of (word, POS, tag) tuples, or None when no sentence
            with that label exists. The counter only advances on success.
        """
        key = 'Sentence: {}'.format(self.n_sent)
        try:
            s = self.grouped[key]
        except KeyError:
            # Only a missing label means "no more sentences"; any other
            # error is a real problem and must not be swallowed.
            return None
        self.n_sent += 1
        return s
# Group the token frame by sentence; `sentences` is a list with one list of
# (word, POS, tag) tuples per sentence.
getter = SentenceGetter(df)
sentences = getter.sentences

### Feature Extraction and Train-Test split

In [0]:
def word2features(sent, i):
    """Build the feature dict for the token at position ``i`` of ``sent``.

    ``sent`` is a sequence of tuples whose first two items are the word and
    its POS tag. The result holds features of the token itself plus features
    of its left and right neighbours; 'BOS' / 'EOS' flags replace a missing
    left / right neighbour.
    """
    token, tag = sent[i][0], sent[i][1]

    # Features of the current token.
    feats = {'bias': 1.0}
    feats['word.lower()'] = token.lower()
    feats['word[-3:]'] = token[-3:]
    feats['word[-2:]'] = token[-2:]
    feats['word.isupper()'] = token.isupper()
    feats['word.istitle()'] = token.istitle()
    feats['word.isdigit()'] = token.isdigit()
    feats['postag'] = tag
    feats['postag[:2]'] = tag[:2]

    # Left neighbour, or the beginning-of-sentence flag.
    if i <= 0:
        feats['BOS'] = True
    else:
        prev_token, prev_tag = sent[i - 1][0], sent[i - 1][1]
        feats['-1:word.lower()'] = prev_token.lower()
        feats['-1:word.istitle()'] = prev_token.istitle()
        feats['-1:word.isupper()'] = prev_token.isupper()
        feats['-1:postag'] = prev_tag
        feats['-1:postag[:2]'] = prev_tag[:2]

    # Right neighbour, or the end-of-sentence flag.
    if i >= len(sent) - 1:
        feats['EOS'] = True
    else:
        next_token, next_tag = sent[i + 1][0], sent[i + 1][1]
        feats['+1:word.lower()'] = next_token.lower()
        feats['+1:word.istitle()'] = next_token.istitle()
        feats['+1:word.isupper()'] = next_token.isupper()
        feats['+1:postag'] = next_tag
        feats['+1:postag[:2]'] = next_tag[:2]

    return feats
  
def sent2features(sent):
    """Return one word2features dict per token of ``sent``, in order."""
    feature_dicts = []
    for position in range(len(sent)):
        feature_dicts.append(word2features(sent, position))
    return feature_dicts
def sent2labels(sent):
    """Return the label (third item) of every (token, postag, label) triple."""
    labels = []
    for _token, _postag, label in sent:
        labels.append(label)
    return labels
def sent2tokens(sent):
    """Return the token (first item) of every (token, postag, label) triple."""
    tokens = []
    for token, _postag, _label in sent:
        tokens.append(token)
    return tokens

# One feature sequence and one label sequence per sentence.
X = list(map(sent2features, sentences))
y = list(map(sent2labels, sentences))

# Hold out 33% of the sentences for testing; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=0
)

### CRF model

In [0]:
# Hyper-parameters of the CRF, collected in one place.
crf_params = dict(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True,
)
crf = sklearn_crfsuite.CRF(**crf_params)

# Train on the training sentences and label the held-out ones.
crf.fit(X_train, y_train)
y_pred = crf.predict(X_test)

# Token-level report over the labels in new_classes.
crf_report = metrics.flat_classification_report(y_test, y_pred, labels=new_classes)
print(crf_report)

In [0]:
# Transition
def print_transitions(trans_features):
    """Print one 'from -> to weight' line per ((from, to), weight) item."""
    for edge, weight in trans_features:
        source_label, target_label = edge
        line = "%-6s -> %-7s %0.6f" % (source_label, target_label, weight)
        print(line)
# Rank the learned label-to-label transition weights once, then show both ends.
ranked_transitions = Counter(crf.transition_features_).most_common()
print("Top likely transitions:")
print_transitions(Counter(crf.transition_features_).most_common(10))
print("\nTop unlikely transitions:")
print_transitions(ranked_transitions[-10:])

# Check the state features 
def print_state_features(state_features):
    """Print one 'weight label attribute' line per ((attr, label), weight) item."""
    for key, weight in state_features:
        attribute, label = key
        line = "%0.6f %-8s %s" % (weight, label, attribute)
        print(line)
# Rank the learned (attribute, label) weights once, then show both ends.
ranked_state_features = Counter(crf.state_features_).most_common()
print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(10))
print("\nTop negative:")
print_state_features(ranked_state_features[-10:])


Top likely transitions:
B-org  -> I-org   4.680201
B-per  -> I-per   4.242035
I-org  -> I-org   4.198692
I-per  -> I-per   3.991878
B-art  -> I-art   3.688363
B-eve  -> I-eve   3.636531
I-art  -> I-art   3.464745
B-gpe  -> I-gpe   3.365085
O      -> O       3.119727
B-geo  -> I-geo   2.966060

Top unlikely transitions:
B-tim  -> B-gpe   -0.627473
B-geo  -> I-art   -0.651209
B-org  -> I-per   -0.664598
B-geo  -> I-per   -0.781181
B-gpe  -> I-per   -0.828000
I-org  -> I-per   -0.837846
O      -> I-per   -0.970128
O      -> I-art   -1.289684
O      -> I-tim   -1.493870
O      -> I-org   -1.870850
Top positive:
5.860922 O        bias
3.722280 O        BOS
3.721897 B-tim    word[-3:]:day
3.690997 B-tim    word[-2:]:ay
3.149401 B-gpe    word.istitle()
3.047903 B-tim    word[-2:]:0s
2.978401 B-geo    -1:word.lower():in
2.502835 O        postag:NN
2.437516 B-tim    -1:word.lower():in
2.386781 B-gpe    postag:JJ

Top negative:
-1.410707 B-gpe    -1:word.lower():from
-1.434462 B-gpe    word[-3:]

### ELI5

In [0]:
# Install and import eli5, then render the weights of the fitted CRF,
# limited to the top 10 entries.
! pip install eli5
import eli5
eli5.show_weights(crf, top=10)
# Alternative views kept for reference: restrict the shown targets, or keep
# only features whose name matches a regular expression.
# eli5.show_weights(crf, top=10, targets=['O', 'B-org', 'I-per'])
# eli5.show_weights(crf, top=10, feature_re='^word\.is',horizontal_layout=False, show=['targets'])

## NER with spaCy

In [0]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
# Load the pipeline provided by the installed en_core_web_sm package.
nlp = en_core_web_sm.load()
from pprint import pprint

###  Preprocess the dataset and convert words into paragraph

In [0]:
def agg_func(s):
    """Return the (word, POS, tag) triples of one sentence group."""
    return list(zip(s['Word'].values.tolist(),
                    s['POS'].values.tolist(),
                    s['Tag'].values.tolist()))


# One list of triples per sentence.
grouped = list(df.groupby('Sentence #').apply(agg_func))

sentences = []      # each sentence as a space-joined string
iob_tag_list = []   # flat list of (word, tag) pairs over all sentences
no_of_words = 0
for sent in grouped:
    no_of_words += len(sent)
    sentences.append(" ".join(triple[0] for triple in sent))
    iob_tag_list.extend((triple[0], triple[2]) for triple in sent)

# The whole slice as a single space-joined paragraph.
text = " ".join(sentences)
print(text)
print(no_of_words)

### spaCy model evaluation

In [0]:

# Run the spaCy pipeline over the paragraph.
doc = nlp(text)
# pprint([(X.text, X.label_) for X in doc.ents])
# pprint([(X, X.ent_iob_) for X in doc])

# (token, IOB flag) for every spaCy token, next to the (word, tag) pairs
# taken from the dataset.
predictions = [(token, token.ent_iob_) for token in doc]
actual_pair = iob_tag_list

# Compare how many tokens each side has.
print(len(predictions))
print(len(actual_pair))

# Rebuild the (word, tag) list so that hyphenated words occupy one row per
# part plus one row per joining hyphen, every row carrying the word's tag.
modified_actual_pair = []
for pair in actual_pair:
  token, tag = pair[0], pair[1]
  has_digit = re.search(r'\d', token) is not None
  only_hyphens = '-' * len(token) == token

  if '-' in token and not has_digit and not only_hyphens:
    # e.g. 'a-b-c' -> 'a', '-', 'b', '-', 'c'; blank parts are discarded.
    parts = [w for w in token.split('-') if w.strip() != '']
    for position, part in enumerate(parts):
      if position > 0:
        modified_actual_pair.append(('-', tag))
      modified_actual_pair.append((part, tag))
  elif has_digit and len(token.split('-')) > 2:
    # NOTE(review): tokens that contain a digit and at least two hyphens are
    # left out of the list, exactly as before. The previous body of this
    # branch only computed two unused slices and read `word` where `words`
    # was meant, which could raise NameError. Decide how these tokens should
    # be split before relying on the alignment.
    continue
  else:
    modified_actual_pair.append(pair)

# Token counts after the re-split, for comparison with the spaCy side.
print(len(predictions))
print(len(modified_actual_pair))
# count = 0
# for act in actual_pair:
#   if '-' in act[0] and not re.search(r'\d', act[0]) and '-'*len(act[0]) != act[0]:
#     count += 1
#     print(act[0])

# index mismatch handling
# Insert a ('-', 'O') pair at each listed position of modified_actual_pair.
# NOTE(review): the positions are hard-coded; presumably they were found by
# inspecting where the two token lists diverge for the current data slice.
# They need to be re-derived whenever the data, the spaCy model or the
# splitting logic above changes.
index_list = [963, 972, 1968]
for index in index_list:
  modified_actual_pair.insert(index, ('-','O'))

# Position counter for the disabled mismatch-printing loop below.
count = 0
# for pred, actual in zip(predictions, modified_actual_pair):
  
#   if str(pred[0]) != str(actual[0]) :
#     print(pred[0], actual[0], str(count))
    
#   count += 1