In [24]:
import pandas as pd

In [30]:
df = pd.read_csv('ner_dataset.csv.gz', compression='gzip', encoding='ISO-8859-1')

df =df.fillna(method='ffill')#inplace NaN instead of 0


def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:#len()means here length of all characters and spaces in sent.
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]#len=11 so range 0, 10

def sent2labels(sent):
    return [label for token, postag, label in sent]


# just a function
agg_func = lambda s: [(w, p, t) for w, p, t in zip(s['Word'].values.tolist(), 
                                                   s['POS'].values.tolist(), 
                                                   s['Tag'].values.tolist())]#s=sentence

grouped_df = df.groupby('Sentence #').apply(agg_func)

print(grouped_df[grouped_df.index == 'Sentence: 1'].values)

sentences = [s for s in grouped_df]


from sklearn.model_selection import train_test_split
import numpy as np

X = np.array([sent2features(s) for s in sentences])
y = np.array([sent2labels(s) for s in sentences])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)


import sklearn_crfsuite

crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                           c1=0.1,
                           c2=0.1,
                           max_iterations=50,
                           all_possible_transitions=True,
                           verbose=True,)

[list([('Thousands', 'NNS', 'O'), ('of', 'IN', 'O'), ('demonstrators', 'NNS', 'O'), ('have', 'VBP', 'O'), ('marched', 'VBN', 'O'), ('through', 'IN', 'O'), ('London', 'NNP', 'B-geo'), ('to', 'TO', 'O'), ('protest', 'VB', 'O'), ('the', 'DT', 'O'), ('war', 'NN', 'O'), ('in', 'IN', 'O'), ('Iraq', 'NNP', 'B-geo'), ('and', 'CC', 'O'), ('demand', 'VB', 'O'), ('the', 'DT', 'O'), ('withdrawal', 'NN', 'O'), ('of', 'IN', 'O'), ('British', 'JJ', 'B-gpe'), ('troops', 'NNS', 'O'), ('from', 'IN', 'O'), ('that', 'DT', 'O'), ('country', 'NN', 'O'), ('.', '.', 'O')])]


  X = np.array([sent2features(s) for s in sentences])
  y = np.array([sent2labels(s) for s in sentences])


In [None]:
crf.fit(X_train, y_train)

In [26]:
import re

text = """Three more countries have joined an "international grand
committee" of parliaments, adding

to calls forFacebook’s boss, Mark
Zuckerberg, to give evidence on misinformation to the coalition. Brazil,
Latvia and Singapore bring the total to eight different parliaments across
the world, with plans to send representatives to London on 27 November
with the intention of hearing from Zuckerberg. Since the Cambridge
Analytica scandal broke, the Facebook chief has only appeared in front of
two legislatures: the American Senate and House of Representatives, and
the European parliament. Facebook has consistently rebuffed attempts from
others, including the UK and Canadian parliaments, to hear from Zuckerberg.
He added that an article in the New York Times on Thursday, in which the
paper alleged a pattern of behaviour from Facebook to "delay, deny and
deflect" negative news stories, "raises further questions about how recent
data breaches were allegedly dealt with within Facebook."""

In [27]:
text = re.sub(r'\n', '', text)

In [28]:
import nltk

text_tokens = nltk.word_tokenize(text)
text_pos = nltk.pos_tag(text_tokens)
text_pos[:10]

[('Three', 'CD'),
 ('more', 'JJR'),
 ('countries', 'NNS'),
 ('have', 'VBP'),
 ('joined', 'VBN'),
 ('an', 'DT'),
 ('``', '``'),
 ('international', 'JJ'),
 ('grandcommittee', 'NN'),
 ("''", "''")]

In [None]:
labels = crf.predict(features)
doc_labels = labels[0]
doc_labels[10:20]

In [None]:
text_ner = [(token, tag) for token, tag in zip(text_tokens, doc_labels)]
print(text_ner)

In [None]:
# extract and display all named entities
named_entities = []
temp_entity_name = "
temp_named_entity = None
for term, tag in text_ner:
    if tag != 'O':
        temp_entity_name = ' '.join([temp_entity_name, term]).strip()
        temp_named_entity = (temp_entity_name, tag)
    else:
        if temp_named_entity:
            named_entities.append(temp_named_entity)
            temp_entity_name = "
            temp_named_entity = None

In [None]:
pd.DataFrame(named_entities, columns=['Entity', 'Tag'])