<a href="https://colab.research.google.com/github/xmansyx/namedEntityRecognizer/blob/master/CRF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
pip install sklearn-crfsuite


Collecting sklearn-crfsuite
  Downloading https://files.pythonhosted.org/packages/25/74/5b7befa513482e6dee1f3dd68171a6c9dfc14c0eaa00f885ffeba54fe9b0/sklearn_crfsuite-0.3.6-py2.py3-none-any.whl
Collecting python-crfsuite>=0.8.3 (from sklearn-crfsuite)
[?25l  Downloading https://files.pythonhosted.org/packages/2f/86/cfcd71edca9d25d3d331209a20f6314b6f3f134c29478f90559cee9ce091/python_crfsuite-0.9.6-cp36-cp36m-manylinux1_x86_64.whl (754kB)
[K    100% |████████████████████████████████| 757kB 20.9MB/s 
Installing collected packages: python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.6 sklearn-crfsuite-0.3.6


In [0]:
from sklearn_crfsuite import CRF
from sklearn.model_selection import cross_val_predict
from sklearn_crfsuite.metrics import flat_classification_report
import pandas as pd
import numpy as np

In [7]:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

[?25l[K    1% |▎                               | 10kB 15.3MB/s eta 0:00:01[K    2% |▋                               | 20kB 1.8MB/s eta 0:00:01[K    3% |█                               | 30kB 2.6MB/s eta 0:00:01[K    4% |█▎                              | 40kB 1.7MB/s eta 0:00:01[K    5% |█▋                              | 51kB 2.1MB/s eta 0:00:01[K    6% |██                              | 61kB 2.5MB/s eta 0:00:01[K    7% |██▎                             | 71kB 2.9MB/s eta 0:00:01[K    8% |██▋                             | 81kB 3.3MB/s eta 0:00:01[K    9% |███                             | 92kB 3.6MB/s eta 0:00:01[K    10% |███▎                            | 102kB 2.8MB/s eta 0:00:01[K    11% |███▋                            | 112kB 2.8MB/s eta 0:00:01[K    12% |████                            | 122kB 4.0MB/s eta 0:00:01[K    13% |████▎                           | 133kB 4.0MB/s eta 0:00:01[K    14% |████▋                           | 143kB 7.4MB/s eta 0:00:01[

In [0]:
# Authenticate the Colab user, then reuse the resulting application-default
# credentials for PyDrive so that `drive` can fetch files from Google Drive.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [0]:
# Shareable Google Drive link to the NER corpus; the Drive file id is the
# value that follows "id=" in the query string.
link = 'https://drive.google.com/open?id=1RlXPcm035izLrJypwPlGITo2L2RoO6oz'

# Split only on the first '=' and keep the right-hand side. The result is
# stored in `file_id` so that the builtin `id` is not shadowed.
file_id = link.split('=', 1)[1]

# Download the file into the runtime's working directory.
downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile('ner_dataset.csv')

# The CSV is read as ISO-8859-1 (Latin-1).
df = pd.read_csv("ner_dataset.csv", encoding="ISO-8859-1")

In [0]:
# Forward-fill missing cells so each row inherits the last non-null value above
# it (presumably "Sentence #" is only set on the first token of a sentence --
# confirm against the CSV). `DataFrame.ffill()` is the supported spelling of
# the deprecated `fillna(method="ffill")`.
data = df.ffill()

In [0]:

# Distinct values of the "Word" column. They pass through a set, so the
# resulting list has no guaranteed order.
words = list({token for token in df["Word"].values})

In [0]:
class SentenceGetter(object):
    """Group a token-per-row DataFrame into sentences.

    ``data`` must provide the columns ``"Sentence #"``, ``"Word"``, ``"POS"``
    and ``"Tag"``. Rows that share a ``"Sentence #"`` value form one sentence,
    represented as a list of ``(word, pos, tag)`` tuples in row order.

    Attributes:
        n_sent: Number used to build the id (``"Sentence: <n_sent>"``) of the
            sentence that ``get_next`` returns next; starts at 1.
        data: The DataFrame passed to the constructor.
        empty: Set to ``False`` here and never updated by this class; kept
            for backward compatibility.
        grouped: Series indexed by sentence id whose values are the sentences.
        sentences: The same sentences as a plain list, in ``grouped`` order.
            ``groupby`` sorts the ids as strings by default, so this order is
            lexicographic rather than numeric.
    """

    @staticmethod
    def _to_triples(group):
        """Turn one sentence's rows into a list of (word, pos, tag) tuples."""
        return list(zip(group["Word"].tolist(),
                        group["POS"].tolist(),
                        group["Tag"].tolist()))

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        # Select only the columns the aggregation reads, so ``apply`` does not
        # operate on the grouping column (newer pandas warns about that).
        token_columns = ["Word", "POS", "Tag"]
        self.grouped = (
            self.data.groupby("Sentence #")[token_columns].apply(self._to_triples)
        )
        self.sentences = list(self.grouped)

    def get_next(self):
        """Return the next sentence and advance the cursor.

        Returns:
            The list of ``(word, pos, tag)`` tuples for ``"Sentence: <n_sent>"``,
            or ``None`` when no sentence has that id (the cursor is then left
            unchanged).
        """
        key = "Sentence: {}".format(self.n_sent)
        try:
            sentence = self.grouped[key]
        except KeyError:
            # Only a missing id means "no more sentences"; any other error is
            # a real bug and must surface instead of being silently dropped.
            return None
        self.n_sent += 1
        return sentence

In [0]:
# Group the forward-filled token table into sentences.
getter = SentenceGetter(data)

In [0]:
# Fetch the sentence with id "Sentence: 1" (None if there is no such id);
# a successful call advances the getter's internal counter.
sent = getter.get_next()

In [0]:
# All sentences, each a list of (word, pos, tag) tuples.
sentences = getter.sentences

In [0]:
def word2features(sent, i):
    """Build the feature dict for the token at position ``i`` of ``sent``.

    ``sent`` is an indexable sequence of tuples whose item 0 is the word and
    item 1 its POS tag; any further items are ignored. The result describes
    the token itself (lower-cased form, 3- and 2-character suffixes, casing and
    digit flags, POS tag and its 2-character prefix) plus the lower-cased form,
    casing flags and POS information of the previous and next tokens. When a
    neighbour is missing, ``'BOS'`` / ``'EOS'`` is set to ``True`` instead.
    """
    token = sent[i][0]
    pos = sent[i][1]

    feats = {}
    feats['bias'] = 1.0
    feats['word.lower()'] = token.lower()
    feats['word[-3:]'] = token[-3:]
    feats['word[-2:]'] = token[-2:]
    feats['word.isupper()'] = token.isupper()
    feats['word.istitle()'] = token.istitle()
    feats['word.isdigit()'] = token.isdigit()
    feats['postag'] = pos
    feats['postag[:2]'] = pos[:2]

    # Left context, or the beginning-of-sentence marker.
    if i <= 0:
        feats['BOS'] = True
    else:
        left_token = sent[i-1][0]
        left_pos = sent[i-1][1]
        feats['-1:word.lower()'] = left_token.lower()
        feats['-1:word.istitle()'] = left_token.istitle()
        feats['-1:word.isupper()'] = left_token.isupper()
        feats['-1:postag'] = left_pos
        feats['-1:postag[:2]'] = left_pos[:2]

    # Right context, or the end-of-sentence marker.
    if i >= len(sent)-1:
        feats['EOS'] = True
    else:
        right_token = sent[i+1][0]
        right_pos = sent[i+1][1]
        feats['+1:word.lower()'] = right_token.lower()
        feats['+1:word.istitle()'] = right_token.istitle()
        feats['+1:word.isupper()'] = right_token.isupper()
        feats['+1:postag'] = right_pos
        feats['+1:postag[:2]'] = right_pos[:2]

    return feats


def sent2features(sent):
    """Return one ``word2features`` dict per position of ``sent``, in order."""
    features = []
    for position in range(len(sent)):
        features.append(word2features(sent, position))
    return features

def sent2labels(sent):
    """Return the third item (the label) of every 3-tuple in ``sent``."""
    labels = []
    for _word, _pos, tag in sent:
        labels.append(tag)
    return labels

def sent2tokens(sent):
    """Return the first item (the token text) of every 3-tuple in ``sent``."""
    tokens = []
    for word, _pos, _tag in sent:
        tokens.append(word)
    return tokens

In [0]:
# Per-sentence feature dicts and labels, index-aligned with each other
# and with `sentences`.
X = list(map(sent2features, sentences))
y = list(map(sent2labels, sentences))

In [0]:
# CRF tagger configuration (see the sklearn-crfsuite CRF reference for the
# exact meaning of each option):
#   algorithm='lbfgs'             - training algorithm
#   c1, c2                        - regularisation coefficients (L1 and L2
#                                   respectively, per the library docs)
#   max_iterations=100            - cap on optimiser iterations
#   all_possible_transitions=True - also model label transitions not seen in
#                                   the training data (per the library docs)
crf = CRF(algorithm='lbfgs',
          c1=0.1,
          c2=0.1,
          max_iterations=100,
          all_possible_transitions=True)

In [0]:
# 5-fold cross-validated predictions for every sentence; per scikit-learn,
# each prediction comes from a model fitted without that sentence's fold.
pred = cross_val_predict(estimator=crf, X=X, y=y, cv=5)

In [0]:
# Per-label precision/recall/F1 of the cross-validated predictions against
# the gold labels; the text report is printed in a later cell.
report = flat_classification_report(y_pred=pred, y_true=y)


In [22]:
# Fit the tagger on the full dataset. Per scikit-learn, cross_val_predict fits
# clones of the estimator, so `crf` itself must be fitted before predicting.
crf.fit(X, y)

CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=True, averaging=None, c=None, c1=0.1, c2=0.1,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False)

In [0]:
import spacy
import en_core_web_sm
# Load the en_core_web_sm spaCy pipeline once; pos_tagger below reuses it.
nlp = en_core_web_sm.load()
def pos_tagger(sent):
    """Run ``sent`` through the module-level ``nlp`` pipeline.

    Returns a list of ``(token.text, token.tag_)`` pairs, one per token of the
    resulting document, in document order.
    """
    return [(token.text, token.tag_) for token in nlp(sent)]

In [24]:
# Show the classification report built from the cross-validated predictions.
print(report)

              precision    recall  f1-score   support

       B-art       0.35      0.11      0.16       402
       B-eve       0.51      0.35      0.41       308
       B-geo       0.86      0.90      0.88     37644
       B-gpe       0.97      0.94      0.95     15870
       B-nat       0.65      0.37      0.47       201
       B-org       0.78      0.72      0.75     20143
       B-per       0.84      0.81      0.82     16990
       B-tim       0.93      0.88      0.90     20333
       I-art       0.11      0.03      0.04       297
       I-eve       0.34      0.21      0.26       253
       I-geo       0.81      0.80      0.80      7414
       I-gpe       0.92      0.54      0.68       198
       I-nat       0.56      0.27      0.37        51
       I-org       0.80      0.79      0.80     16784
       I-per       0.84      0.89      0.87     17251
       I-tim       0.84      0.77      0.80      6528
           O       0.99      0.99      0.99    887908

   micro avg       0.97   

In [25]:
# NOTE: this scores the model on the same X, y it was just fitted on, so the
# value is a training-set figure, not a held-out estimate.
crf.score(X, y)

0.9865226617075555

In [37]:
# Tag one example sentence with the fitted CRF. The sentence is POS-tagged
# once and the result reused for both the prediction and the printout
# (previously the same literal was tagged twice).
sample_sentence = "Jim bought 300 shares of Acme Corp. in 2006"
sample_tagged = pos_tagger(sample_sentence)

# `x` holds the predicted label sequence, one label per (token, POS) pair.
x = crf.predict_single(sent2features(sample_tagged))
print(sample_tagged)

[('Jim', 'NNP'), ('bought', 'VBD'), ('300', 'CD'), ('shares', 'NNS'), ('of', 'IN'), ('Acme', 'NNP'), ('Corp.', 'NNP'), ('in', 'IN'), ('2006', 'CD')]
