In [0]:
import pandas as pd
import numpy as np

In [0]:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [0]:
# Authenticate the Colab user, then hand the application-default credentials
# to PyDrive; the resulting `drive` client is what later cells use to fetch files.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [0]:
# Shareable Google Drive link of the NER dataset CSV.
link = 'https://drive.google.com/open?id=1RlXPcm035izLrJypwPlGITo2L2RoO6oz'

# Everything after the first '=' is the Drive file id. Splitting at most once
# keeps the two-name unpacking valid even if the id itself contained '=', and
# the name `file_id` (rather than `id`) leaves the builtin `id()` usable in
# every later cell.
_prefix, file_id = link.split('=', 1)

# Download the file into the working directory, then parse it.
downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile('ner_dataset.csv')

df = pd.read_csv("ner_dataset.csv", encoding="ISO-8859-1")

In [0]:
# Forward-fill missing cells from the previous row.
# `DataFrame.ffill()` is the supported spelling: `fillna(method="ffill")` is
# deprecated since pandas 2.1 and emits a FutureWarning.
# NOTE(review): presumably "Sentence #" is only populated on the first token
# row of each sentence, which is why the fill is needed — verify against the CSV.
data = df.ffill()

In [0]:

# Distinct values of the raw "Word" column; set-based, so the order is arbitrary.
words = [*set(df["Word"].values)]

In [0]:
class SentenceGetter(object):
    """Group a token-level DataFrame into per-sentence lists of (word, POS, tag).

    The frame must have the columns "Sentence #", "Word", "POS" and "Tag".

    Attributes:
        n_sent: 1-based number of the sentence the next `get_next()` call returns.
        data: the DataFrame passed to the constructor.
        empty: False until `get_next()` is asked for a sentence that does not
            exist, then True.
        grouped: Series indexed by the "Sentence #" value; each element is that
            sentence's list of (word, POS, tag) tuples.
        sentences: plain list of the elements of `grouped`, in its index order
            (groupby sorts the keys, so for string keys "Sentence: 10" comes
            before "Sentence: 2").
    """

    def __init__(self, data):
        """Group `data` by "Sentence #" once, up front."""
        self.n_sent = 1
        self.data = data
        self.empty = False
        # Select only the columns the aggregation reads, so the grouping column
        # is never handed to `apply` (deprecated behaviour in pandas >= 2.2).
        self.grouped = (
            self.data.groupby("Sentence #")[["Word", "POS", "Tag"]]
            .apply(self._to_tuples)
        )
        self.sentences = list(self.grouped)

    @staticmethod
    def _to_tuples(group):
        """Turn one sentence's rows into a list of (word, POS, tag) tuples."""
        return list(zip(group["Word"].values.tolist(),
                        group["POS"].values.tolist(),
                        group["Tag"].values.tolist()))

    def get_next(self):
        """Return the sentence numbered `n_sent` and advance the counter.

        Returns:
            The list of (word, POS, tag) tuples stored under the key
            "Sentence: <n_sent>", or None when no such sentence exists; in
            that case `empty` is set to True and `n_sent` is left unchanged.
        """
        key = "Sentence: {}".format(self.n_sent)
        try:
            sentence = self.grouped[key]
        except KeyError:
            # Only a missing key means "no more sentences"; any other error
            # is a real problem and must not be silently swallowed.
            self.empty = True
            return None
        self.n_sent += 1
        return sentence

In [0]:
# Group the forward-filled frame into sentences (the grouping runs once, in the constructor).
getter = SentenceGetter(data)

In [0]:
# Fetch one sentence via get_next(); on a fresh getter this is the one keyed "Sentence: 1".
sent = getter.get_next()

In [0]:
# All sentences, each a list of (word, POS, tag) tuples.
sentences = getter.sentences

In [0]:
def word2features(sent, i):
    """Build the CRF feature dict for the token at position `i` of `sent`.

    Args:
        sent: sequence of token tuples; element 0 of each tuple is the word and
            element 1 its POS tag (both strings). Further elements are ignored.
        i: index of the token to describe.

    Returns:
        dict mapping feature names to values: features of the token itself,
        features of the previous token (or 'BOS' when there is none) and
        features of the next token (or 'EOS' when there is none).
    """
    current = sent[i]
    word, postag = current[0], current[1]

    # Features of the token itself.
    features = {}
    features['bias'] = 1.0
    features['word.lower()'] = word.lower()
    features['word[-3:]'] = word[-3:]
    features['word[-2:]'] = word[-2:]
    features['word.isupper()'] = word.isupper()
    features['word.istitle()'] = word.istitle()
    features['word.isdigit()'] = word.isdigit()
    features['postag'] = postag
    features['postag[:2]'] = postag[:2]

    # Left neighbour, or a beginning-of-sentence marker.
    if not i > 0:
        features['BOS'] = True
    else:
        left = sent[i - 1]
        left_word, left_postag = left[0], left[1]
        features['-1:word.lower()'] = left_word.lower()
        features['-1:word.istitle()'] = left_word.istitle()
        features['-1:word.isupper()'] = left_word.isupper()
        features['-1:postag'] = left_postag
        features['-1:postag[:2]'] = left_postag[:2]

    # Right neighbour, or an end-of-sentence marker.
    if not i < len(sent) - 1:
        features['EOS'] = True
    else:
        right = sent[i + 1]
        right_word, right_postag = right[0], right[1]
        features['+1:word.lower()'] = right_word.lower()
        features['+1:word.istitle()'] = right_word.istitle()
        features['+1:word.isupper()'] = right_word.isupper()
        features['+1:postag'] = right_postag
        features['+1:postag[:2]'] = right_postag[:2]

    return features


def sent2features(sent):
    """Return one `word2features` dict per position of `sent`, in order."""
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

def sent2labels(sent):
    """Return the label (third field) of each (token, postag, label) triple in `sent`."""
    labels = []
    for _token, _postag, label in sent:
        labels.append(label)
    return labels

def sent2tokens(sent):
    """Return the token (first field) of each (token, postag, label) triple in `sent`."""
    tokens = []
    for token, _postag, _label in sent:
        tokens.append(token)
    return tokens

In [0]:
# Per-sentence feature dicts (X) and the matching per-sentence label lists (y).
X = list(map(sent2features, sentences))
y = list(map(sent2labels, sentences))

In [0]:
pip install sklearn-crfsuite



In [0]:
from sklearn_crfsuite import CRF

In [0]:
# CRF sequence tagger trained with L-BFGS. Per the sklearn-crfsuite docs, c1 and
# c2 are the L1 and L2 regularization coefficients, and
# all_possible_transitions=True also generates transition features for label
# pairs that never occur in the training data.
crf = CRF(algorithm='lbfgs',
          c1=0.1,
          c2=0.1,
          max_iterations=100,
          all_possible_transitions=True)

In [0]:
from sklearn.model_selection import cross_val_predict
from sklearn_crfsuite.metrics import flat_classification_report

In [0]:
# Cross-validated predictions with 5 folds; per scikit-learn's cross_val_predict
# docs, each sentence is predicted by a model fitted without that sentence's fold.
pred = cross_val_predict(estimator=crf, X=X, y=y, cv=5)

In [0]:
# Score the cross-validated predictions against the true tags and show the report.
report = flat_classification_report(y_pred=pred, y_true=y)
print(report)

In [0]:
# Fit the model on all sentences; the prediction and score cells below use this fitted `crf`.
crf.fit(X, y)

In [0]:
import spacy
import en_core_web_sm
# Load the en_core_web_sm spaCy pipeline; pos_tagger() uses it to tokenize and POS-tag raw text.
nlp = en_core_web_sm.load()
def pos_tagger(sent):
    """Tokenize and POS-tag raw text with the module-level spaCy pipeline `nlp`.

    Args:
        sent: the sentence text handed to `nlp`.

    Returns:
        list of (token.text, token.tag_) tuples, one per token, in document order.
    """
    return [(tok.text, tok.tag_) for tok in nlp(sent)]

In [0]:
# End-to-end check on raw text: spaCy (text, tag) pairs -> CRF features -> predicted tag sequence.
# pos_tagger() yields 2-tuples; that is enough because word2features only reads fields 0 and 1.
x=crf.predict_single(sent2features(pos_tagger("The Duke of Cambridge today labelled the Christchurch terrorist atrocity an “unspeakable act of hate” as he made an emotional visit to the city’s al-Noor mosque.")))
print(x)


In [0]:
# Feature dicts for an all-lowercase sentence. Only the features are built here:
# `x` is rebound to them and is not passed to `crf` in this cell.
x=sent2features(pos_tagger("mahmoud manaa is student in faculity of computers and information"))

In [190]:
# NOTE(review): X / y are the same data `crf` was fitted on, so this is a
# training-set score, not a held-out estimate; the cross-validated report is the fairer figure.
crf.score(X, y)

0.9862832892258542