<h1> Andrey Lukyanenko NER talk (sberloga) adaption for new dataset

In [1]:
import sklearn_crfsuite
from sklearn_crfsuite import metrics
from sklearn_crfsuite import scorers
from sklearn.model_selection import train_test_split
import eli5
import spacy
from spacy.training import offsets_to_biluo_tags
import pandas as pd
import numpy as np

from collections import Counter
from ast import literal_eval
import json


In [2]:
DATA_PATH = '../datasets'

<h2>Loading dataset

This dataset is taken from Annotated Corpus for Named Entity Recognition by Abhinav Walia dataset and then processed.

Annotated Corpus for Named Entity Recognition is annotated Corpus for Named Entity Recognition using GMB(Groningen Meaning Bank) corpus for entity classification with enhanced and popular features by Natural Language Processing applied to the data set.

Essential info about entities:

geo = Geographical Entity
org = Organization
per = Person
gpe = Geopolitical Entity
tim = Time indicator
art = Artifact
eve = Event
nat = Natural Phenomenon

In [3]:
df = pd.read_csv(f'{DATA_PATH}/ner.csv')

In [4]:
df.POS = df.POS.apply(literal_eval)
df.Tag = df.Tag.apply(literal_eval)

In [5]:
df.head()

Unnamed: 0,Sentence #,Sentence,POS,Tag
0,Sentence: 1,Thousands of demonstrators have marched throug...,"[NNS, IN, NNS, VBP, VBN, IN, NNP, TO, VB, DT, ...","[O, O, O, O, O, O, B-geo, O, O, O, O, O, B-geo..."
1,Sentence: 2,Families of soldiers killed in the conflict jo...,"[NNS, IN, NNS, VBN, IN, DT, NN, VBD, DT, NNS, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,Sentence: 3,They marched from the Houses of Parliament to ...,"[PRP, VBD, IN, DT, NNS, IN, NN, TO, DT, NN, IN...","[O, O, O, O, O, O, O, O, O, O, O, B-geo, I-geo..."
3,Sentence: 4,"Police put the number of marchers at 10,000 wh...","[NNS, VBD, DT, NN, IN, NNS, IN, CD, IN, NNS, V...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
4,Sentence: 5,The protest comes on the eve of the annual con...,"[DT, NN, VBZ, IN, DT, NN, IN, DT, JJ, NN, IN, ...","[O, O, O, O, O, O, O, O, O, O, O, B-geo, O, O,..."


In [6]:
df.shape

(47959, 4)

In [7]:
df.Sentence[11]

'The European Union , with U.S. backing , has threatened to refer Iran to the U.N. Security Council , which could impose sanctions if it finds Tehran has violated the Nuclear Non-Proliferation treaty .'

In [8]:
df, df_test = train_test_split(df, test_size=0.25, random_state=42)

In [9]:
df.to_csv(f'{DATA_PATH}/train.csv')
df_test.to_csv(f'{DATA_PATH}/test.csv')

<h2>Sklearn_crfsuite

In [10]:
#transform data to tuples like (word, POS, ent)

train_data = []
for _, row in df.iterrows():
    tokens = [token for token in zip(row['Sentence'].split(), row['POS'], row['Tag'])]
    train_data.append(tokens)

In [11]:
#transform data to tuples like (word, POS, ent)

test_data = []
for _, row in df_test.iterrows():
    tokens = [token for token in zip(row['Sentence'].split(), row['POS'], row['Tag'])]
    test_data.append(tokens)

In [12]:
train_data[:10]

[[('The', 'DT', 'O'),
  ('new', 'JJ', 'O'),
  ('laws', 'NNS', 'O'),
  ('also', 'RB', 'O'),
  ('call', 'VBP', 'O'),
  ('for', 'IN', 'O'),
  ('longer', 'JJR', 'O'),
  ('prison', 'NN', 'O'),
  ('sentences', 'NNS', 'O'),
  ('for', 'IN', 'O'),
  ('journalists', 'NNS', 'O'),
  ('convicted', 'VBN', 'O'),
  ('of', 'IN', 'O'),
  ('defamation', 'NN', 'O'),
  ('.', '.', 'O')],
 [('Pakistan', 'NNP', 'B-geo'),
  ("'s", 'POS', 'O'),
  ('military', 'JJ', 'O'),
  ('says', 'VBZ', 'O'),
  ('it', 'PRP', 'O'),
  ('has', 'VBZ', 'O'),
  ('killed', 'VBN', 'O'),
  ('about', 'IN', 'O'),
  ('60', 'CD', 'O'),
  ('pro-Taliban', 'JJ', 'O'),
  ('militants', 'NNS', 'O'),
  ('in', 'IN', 'O'),
  ('heavy', 'JJ', 'O'),
  ('fighting', 'NN', 'O'),
  ('in', 'IN', 'O'),
  ('the', 'DT', 'O'),
  ('country', 'NN', 'O'),
  ("'s", 'POS', 'O'),
  ('northwest', 'NN', 'O'),
  ('.', '.', 'O')],
 [('General', 'NNP', 'B-org'),
  ('Pinochet', 'NNP', 'I-org'),
  (',', ',', 'O'),
  ('who', 'WP', 'O'),
  ('turns', 'VBZ', 'O'),
  ('90', 'C

In [13]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,            
        })
    else:
        features['BOS'] = True
    
    if i < len(sent) - 1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
        })
    else:
        features['EOS'] = True
        
    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]   

In [14]:
len(train_data), len(test_data)

(35969, 11990)

In [15]:
X_train = [sent2features(s) for s in train_data]
y_train = [sent2labels(s) for s in train_data]
X_test = [sent2features(s) for s in test_data]
y_test = [sent2labels(s) for s in test_data]

In [16]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True,
    verbose=True,
)

crf.fit(X_train, y_train)

loading training data to CRFsuite: 100%|███████████████████████████████████████| 35969/35969 [00:05<00:00, 6178.54it/s]



Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 129530
Seconds required: 1.501

L-BFGS optimization
c1: 0.100000
c2: 0.100000
num_memories: 6
max_iterations: 100
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=2.09  loss=1344629.99 active=128587 feature_norm=1.00
Iter 2   time=3.04  loss=794857.90 active=127758 feature_norm=3.67
Iter 3   time=1.01  loss=655526.98 active=121935 feature_norm=3.24
Iter 4   time=3.02  loss=498390.57 active=123982 feature_norm=2.84
Iter 5   time=1.02  loss=433732.22 active=126519 feature_norm=3.39
Iter 6   time=1.03  loss=349165.66 active=127375 feature_norm=4.66
Iter 7   time=1.03  loss=266234.34 active=116645 feature_norm=6.84
Iter 8   time=1.00  loss=218325.90 active=105016 feature_norm=9.00
Iter 9   time=1.01  loss=188108.00 active=103133 feature_norm



CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=100, verbose=True)

In [17]:
all_ents = list(crf.classes_)
ents = [ent for ent in all_ents if ent != 'O']

In [18]:
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=all_ents)

0.9718179726241668

In [19]:
# y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=ents)

0.8547421833825358

In [20]:
# group B and I results
sorted_ents = sorted(
    ents,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_ents, digits=3, zero_division=0
))



              precision    recall  f1-score   support

       B-art      0.405     0.143     0.211       105
       I-art      0.056     0.013     0.021        79
       B-eve      0.479     0.338     0.397        68
       I-eve      0.415     0.293     0.343        58
       B-geo      0.867     0.908     0.887      9583
       I-geo      0.827     0.811     0.819      1856
       B-gpe      0.968     0.941     0.954      3986
       I-gpe      0.846     0.647     0.733        51
       B-nat      0.714     0.391     0.505        64
       I-nat      0.583     0.438     0.500        16
       B-org      0.800     0.745     0.772      4884
       I-org      0.832     0.799     0.816      4179
       B-per      0.842     0.826     0.834      4239
       I-per      0.849     0.903     0.875      4287
       B-tim      0.932     0.880     0.905      5041
       I-tim      0.828     0.763     0.794      1640

   micro avg      0.864     0.850     0.857     40136
   macro avg      0.703   

In [21]:
def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

print("Top likely transitions:")
print_transitions(Counter(crf.transition_features_).most_common(20))

print("\nTop unlikely transitions:")
print_transitions(Counter(crf.transition_features_).most_common()[-20:])

Top likely transitions:
B-eve  -> I-eve   6.016204
B-nat  -> I-nat   5.912091
I-art  -> I-art   5.683562
I-eve  -> I-eve   5.657436
B-art  -> I-art   5.412966
B-geo  -> I-geo   5.185098
B-org  -> I-org   4.773786
I-nat  -> I-nat   4.721207
I-org  -> I-org   4.661529
I-tim  -> I-tim   4.515197
B-gpe  -> I-gpe   4.194296
B-tim  -> I-tim   4.143584
O      -> O       4.029554
I-geo  -> I-geo   3.906596
I-gpe  -> I-gpe   3.789573
B-per  -> I-per   3.752899
I-per  -> I-per   2.951249
O      -> B-per   1.861282
O      -> B-tim   1.653621
B-geo  -> B-tim   1.240877

Top unlikely transitions:
B-geo  -> B-geo   -3.933086
B-gpe  -> I-per   -3.995429
B-org  -> B-org   -4.079195
B-per  -> I-org   -4.181320
I-org  -> B-org   -4.300398
B-geo  -> I-org   -4.344379
O      -> I-art   -4.365107
B-geo  -> I-per   -4.456436
B-gpe  -> I-geo   -4.461123
B-tim  -> B-tim   -4.909021
I-org  -> I-per   -4.918186
B-org  -> I-per   -4.981201
B-gpe  -> I-org   -5.013117
I-per  -> B-per   -5.699825
O      -> I-per  

In [22]:
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))

print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(30))

print("\nTop negative:")
print_state_features(Counter(crf.state_features_).most_common()[-30:])

Top positive:
7.713548 B-tim    word[-3:]:Day
7.699178 O        word.lower():last
7.180446 O        word.lower():month
6.843241 B-org    word.lower():philippine
6.487287 B-geo    word.lower():caribbean
6.480748 B-per    word.lower():vice
6.449320 B-gpe    word.lower():niger
6.394158 B-tim    word.lower():multi-candidate
6.373373 B-tim    word.lower():one-fifth
6.354920 B-gpe    word.lower():afghan
6.168210 B-org    word.lower():mid-march
6.096259 B-tim    word.lower():weekend
6.061384 B-tim    word[-3:]:day
6.042216 B-geo    word.lower():mid-march
5.961533 B-per    word.lower():obama
5.916545 B-gpe    word.lower():nepal
5.871175 B-geo    word.lower():europe
5.832318 B-tim    word[-3:]:60s
5.806659 B-tim    word.lower():one-year
5.644425 B-org    -1:word.lower():rice
5.580649 B-org    word.lower():al-qaida
5.560970 O        word.lower():year
5.552939 B-tim    word.lower():january
5.508367 B-per    word.lower():president
5.481061 O        word.lower():chairman
5.430922 B-tim    word.lowe

In [23]:
eli5.show_weights(crf, top=10)

From \ To,O,B-art,I-art,B-eve,I-eve,B-geo,I-geo,B-gpe,I-gpe,B-nat,I-nat,B-org,I-org,B-per,I-per,B-tim,I-tim
O,4.03,0.257,-4.365,0.55,-3.817,1.214,-7.062,0.203,-3.09,0.426,-2.863,1.178,-6.585,1.861,-6.043,1.654,-6.624
B-art,-0.059,0.0,5.413,0.0,-0.122,-0.645,-1.277,-1.269,-0.744,0.0,0.0,0.02,-1.929,-1.48,-2.053,-0.339,-1.218
I-art,-0.099,-0.492,5.684,0.0,0.0,0.0,-0.868,-0.831,0.0,0.0,0.0,-0.921,-1.304,-1.433,-2.049,-0.715,-1.191
B-eve,-0.813,0.0,-0.414,-0.497,6.016,-1.025,-1.053,-1.367,-0.569,-0.238,0.0,-1.584,-1.55,-2.071,-1.726,0.013,-1.235
I-eve,0.017,0.0,-0.042,-2.08,5.657,-1.173,-0.929,-0.623,-0.033,0.0,0.0,-0.971,-1.161,-1.557,-1.383,-1.526,-1.327
B-geo,0.632,0.505,-2.494,-0.119,-1.744,-3.933,5.185,-0.048,-3.833,-0.177,-1.002,-0.312,-4.344,-1.283,-4.456,1.241,-3.405
I-geo,-0.022,0.991,-1.725,-0.962,-1.074,-3.103,3.907,-1.53,-1.945,0.0,-0.721,-1.107,-3.38,-1.327,-3.687,0.263,-2.724
B-gpe,0.706,-1.48,-2.208,-0.839,-2.453,-0.321,-4.461,-6.207,4.194,-0.486,-1.028,0.904,-5.013,-0.643,-3.995,-0.359,-3.262
I-gpe,-0.033,0.0,0.0,0.0,0.0,-1.558,-0.879,-0.604,3.79,0.0,0.0,-0.993,-0.896,-0.352,-0.802,-1.089,-1.066
B-nat,-0.361,0.0,0.0,0.0,0.0,0.569,-0.2,-0.591,-0.006,-0.086,5.912,-0.378,-0.574,-1.5,-1.406,-0.816,-0.434

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,Unnamed: 14_level_0,Unnamed: 15_level_0,Unnamed: 16_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5,Unnamed: 11_level_5,Unnamed: 12_level_5,Unnamed: 13_level_5,Unnamed: 14_level_5,Unnamed: 15_level_5,Unnamed: 16_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6,Unnamed: 9_level_6,Unnamed: 10_level_6,Unnamed: 11_level_6,Unnamed: 12_level_6,Unnamed: 13_level_6,Unnamed: 14_level_6,Unnamed: 15_level_6,Unnamed: 16_level_6
Weight?,Feature,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7,Unnamed: 10_level_7,Unnamed: 11_level_7,Unnamed: 12_level_7,Unnamed: 13_level_7,Unnamed: 14_level_7,Unnamed: 15_level_7,Unnamed: 16_level_7
Weight?,Feature,Unnamed: 2_level_8,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8,Unnamed: 9_level_8,Unnamed: 10_level_8,Unnamed: 11_level_8,Unnamed: 12_level_8,Unnamed: 13_level_8,Unnamed: 14_level_8,Unnamed: 15_level_8,Unnamed: 16_level_8
Weight?,Feature,Unnamed: 2_level_9,Unnamed: 3_level_9,Unnamed: 4_level_9,Unnamed: 5_level_9,Unnamed: 6_level_9,Unnamed: 7_level_9,Unnamed: 8_level_9,Unnamed: 9_level_9,Unnamed: 10_level_9,Unnamed: 11_level_9,Unnamed: 12_level_9,Unnamed: 13_level_9,Unnamed: 14_level_9,Unnamed: 15_level_9,Unnamed: 16_level_9
Weight?,Feature,Unnamed: 2_level_10,Unnamed: 3_level_10,Unnamed: 4_level_10,Unnamed: 5_level_10,Unnamed: 6_level_10,Unnamed: 7_level_10,Unnamed: 8_level_10,Unnamed: 9_level_10,Unnamed: 10_level_10,Unnamed: 11_level_10,Unnamed: 12_level_10,Unnamed: 13_level_10,Unnamed: 14_level_10,Unnamed: 15_level_10,Unnamed: 16_level_10
Weight?,Feature,Unnamed: 2_level_11,Unnamed: 3_level_11,Unnamed: 4_level_11,Unnamed: 5_level_11,Unnamed: 6_level_11,Unnamed: 7_level_11,Unnamed: 8_level_11,Unnamed: 9_level_11,Unnamed: 10_level_11,Unnamed: 11_level_11,Unnamed: 12_level_11,Unnamed: 13_level_11,Unnamed: 14_level_11,Unnamed: 15_level_11,Unnamed: 16_level_11
Weight?,Feature,Unnamed: 2_level_12,Unnamed: 3_level_12,Unnamed: 4_level_12,Unnamed: 5_level_12,Unnamed: 6_level_12,Unnamed: 7_level_12,Unnamed: 8_level_12,Unnamed: 9_level_12,Unnamed: 10_level_12,Unnamed: 11_level_12,Unnamed: 12_level_12,Unnamed: 13_level_12,Unnamed: 14_level_12,Unnamed: 15_level_12,Unnamed: 16_level_12
Weight?,Feature,Unnamed: 2_level_13,Unnamed: 3_level_13,Unnamed: 4_level_13,Unnamed: 5_level_13,Unnamed: 6_level_13,Unnamed: 7_level_13,Unnamed: 8_level_13,Unnamed: 9_level_13,Unnamed: 10_level_13,Unnamed: 11_level_13,Unnamed: 12_level_13,Unnamed: 13_level_13,Unnamed: 14_level_13,Unnamed: 15_level_13,Unnamed: 16_level_13
Weight?,Feature,Unnamed: 2_level_14,Unnamed: 3_level_14,Unnamed: 4_level_14,Unnamed: 5_level_14,Unnamed: 6_level_14,Unnamed: 7_level_14,Unnamed: 8_level_14,Unnamed: 9_level_14,Unnamed: 10_level_14,Unnamed: 11_level_14,Unnamed: 12_level_14,Unnamed: 13_level_14,Unnamed: 14_level_14,Unnamed: 15_level_14,Unnamed: 16_level_14
Weight?,Feature,Unnamed: 2_level_15,Unnamed: 3_level_15,Unnamed: 4_level_15,Unnamed: 5_level_15,Unnamed: 6_level_15,Unnamed: 7_level_15,Unnamed: 8_level_15,Unnamed: 9_level_15,Unnamed: 10_level_15,Unnamed: 11_level_15,Unnamed: 12_level_15,Unnamed: 13_level_15,Unnamed: 14_level_15,Unnamed: 15_level_15,Unnamed: 16_level_15
Weight?,Feature,Unnamed: 2_level_16,Unnamed: 3_level_16,Unnamed: 4_level_16,Unnamed: 5_level_16,Unnamed: 6_level_16,Unnamed: 7_level_16,Unnamed: 8_level_16,Unnamed: 9_level_16,Unnamed: 10_level_16,Unnamed: 11_level_16,Unnamed: 12_level_16,Unnamed: 13_level_16,Unnamed: 14_level_16,Unnamed: 15_level_16,Unnamed: 16_level_16
+7.699,word.lower():last,,,,,,,,,,,,,,,
+7.180,word.lower():month,,,,,,,,,,,,,,,
+5.561,word.lower():year,,,,,,,,,,,,,,,
+5.481,word.lower():chairman,,,,,,,,,,,,,,,
+5.354,word.lower():internet,,,,,,,,,,,,,,,
+5.228,BOS,,,,,,,,,,,,,,,
+5.120,word.lower():columbia,,,,,,,,,,,,,,,
+4.980,postag:VBD,,,,,,,,,,,,,,,
+4.824,postag:PRP,,,,,,,,,,,,,,,
… 7205 more positive …,… 7205 more positive …,,,,,,,,,,,,,,,

Weight?,Feature
+7.699,word.lower():last
+7.180,word.lower():month
+5.561,word.lower():year
+5.481,word.lower():chairman
+5.354,word.lower():internet
+5.228,BOS
+5.120,word.lower():columbia
+4.980,postag:VBD
+4.824,postag:PRP
… 7205 more positive …,… 7205 more positive …

Weight?,Feature
+4.177,word.lower():twitter
+4.090,+1:word.lower():enkhbayar
+3.952,+1:word.lower():boots
+3.847,-1:word.lower():engine
+3.825,word.lower():canal
+3.809,word.lower():nevirapine
+3.605,word.lower():english
+3.492,word.lower():russian
+3.338,-1:word.lower():shown
+3.276,+1:word.lower():al-arabiya

Weight?,Feature
+3.098,-1:word.lower():boeing
+2.703,+1:word.lower():came
+2.587,+1:word.lower():expands
+2.296,-1:word.lower():cajun
+2.236,+1:word.lower():gained
+1.989,+1:word.lower():early
+1.928,-1:word.lower():hitler
+1.926,word[-3:]:und
+1.890,word.lower():flowers
+1.843,+1:word.lower():airport

Weight?,Feature
+3.923,-1:word.lower():falklands
+3.766,word.lower():ramadan
+3.578,word.lower():games
+3.386,word[-3:]:II
+3.386,word.lower():ii
+3.244,word.lower():olympic
+3.240,word[-3:]:pic
+3.047,-1:word.lower():midnight
+3.035,word.lower():hopman
+2.878,-1:word.lower():celebrated

Weight?,Feature
+2.816,+1:word.lower():caused
+2.618,word.lower():games
+2.560,+1:word.lower():era
+2.474,word[-3:]:Day
+2.449,word.lower():day
+2.316,+1:word.lower():without
+2.280,+1:word.lower():now
+2.176,-1:word.lower():jewish
+2.174,+1:word.lower():tore
+2.130,word.lower():dean

Weight?,Feature
+6.487,word.lower():caribbean
+6.042,word.lower():mid-march
+5.871,word.lower():europe
+4.964,-1:word.lower():serb
+4.756,word.lower():martian
+4.345,word.lower():quake-zone
+4.300,+1:word.lower():phoned
+4.224,+1:word.lower():moqtada
+4.203,word.lower():beijing
… 4982 more positive …,… 4982 more positive …

Weight?,Feature
+4.260,word.lower():led-invasion
+4.026,word.lower():city
+3.749,word.lower():island
+3.696,+1:word.lower():regional
+3.387,+1:word.lower():french
+3.372,word.lower():holiday
+3.323,word.lower():shogunate
+3.323,-1:word.lower():tokugawa
+3.008,-1:word.lower():gulf
+2.922,-1:word.lower():sumatran

Weight?,Feature
+6.449,word.lower():niger
+6.355,word.lower():afghan
+5.917,word.lower():nepal
+5.275,word.lower():iranian
+4.864,word.lower():iraqi
+4.812,word.lower():azerbaijan
+4.807,word.lower():spaniard
+4.781,word.lower():jordan
+4.781,word.lower():korean
+4.756,word.lower():argentine

Weight?,Feature
+4.892,+1:word.lower():mayor
+3.962,-1:word.lower():democratic
+3.421,+1:word.lower():developed
+3.335,-1:word.lower():soviet
+2.901,+1:word.lower():invaded
+2.881,word.lower():cypriot
+2.864,word.lower():cypriots
+2.837,word[-3:]:iot
+2.685,+1:word.lower():health
+2.685,+1:word.lower():under

Weight?,Feature
+5.140,word.lower():marburg
+4.881,word.lower():katrina
+4.313,word.lower():rita
+3.610,word[-3:]:ita
+3.572,word[-3:]:urg
+3.499,+1:word.lower():strain
+3.336,word[-3:]:5N1
+3.336,word.lower():h5n1
+2.891,word[-3:]:ACC
+2.891,word.lower():acc

Weight?,Feature
+2.720,word.lower():rita
+2.555,word[-3:]:ita
+2.266,-1:word.lower():hurricanes
+1.988,+1:word.lower():slammed
+1.889,word.lower():flu
+1.819,+1:word.lower():outbreak
+1.809,-1:word.lower():type
+1.633,-1:postag:NN
+1.465,+1:word.lower():last
+1.458,word.lower():katrina

Weight?,Feature
+6.843,word.lower():philippine
+6.168,word.lower():mid-march
+5.644,-1:word.lower():rice
+5.581,word.lower():al-qaida
+5.182,word.lower():hamas
+4.871,word.lower():university
+4.800,word.lower():service
+4.770,word.lower():taleban
+4.736,word.lower():hezbollah
+4.736,word.lower():congress

Weight?,Feature
+3.735,-1:word.lower():&
+3.441,word.lower():member-countries
+3.407,+1:word.lower():ohlmert
+3.352,word.lower():member-states
+3.349,-1:word.lower():associated
+3.248,+1:word.lower():mulgueta
+3.226,+1:word.lower():reporter
+3.172,-1:word.lower():decathlon
+3.156,word.lower():times
… 5590 more positive …,… 5590 more positive …

Weight?,Feature
+6.481,word.lower():vice
+5.962,word.lower():obama
+5.508,word.lower():president
+4.994,word.lower():senator
+4.817,word.lower():prime
+4.815,word.lower():greenspan
+4.713,word.lower():hall
+4.433,word[-3:]:Mr.
+4.433,word.lower():mr.
+4.226,word.lower():spears

Weight?,Feature
+3.965,+1:word.lower():advisor
+3.613,-1:postag:NN
+3.120,word.lower():vice
+3.059,+1:word.lower():shinawatra
+2.979,-1:word.lower():condoleezza
+2.968,-1:word.lower():viktor
+2.926,+1:word.lower():gao
+2.810,-1:word.lower():michael
… 4571 more positive …,… 4571 more positive …
… 1007 more negative …,… 1007 more negative …

Weight?,Feature
+7.714,word[-3:]:Day
+6.394,word.lower():multi-candidate
+6.373,word.lower():one-fifth
+6.096,word.lower():weekend
+6.061,word[-3:]:day
+5.832,word[-3:]:60s
+5.807,word.lower():one-year
+5.553,word.lower():january
+5.431,word.lower():february
+5.368,word.lower():june

Weight?,Feature
+4.656,word[-3:]:Day
+4.563,-1:word.lower():this
+4.481,+1:word.lower():stocky
+4.337,word[-3:]:.m.
+3.995,word[-3:]:day
+3.983,+1:word.lower():old
+3.931,word.lower():working-age
+3.649,+1:word.lower():jose
+3.609,+1:word.lower():toure
+3.470,+1:word.lower():population


In [24]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=5,
    c2=0.01,
    max_iterations=100,
    all_possible_transitions=True,
    verbose=True,
)

crf.fit(X_train, y_train)

loading training data to CRFsuite: 100%|███████████████████████████████████████| 35969/35969 [00:06<00:00, 5862.49it/s]



Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 129530
Seconds required: 1.505

L-BFGS optimization
c1: 5.000000
c2: 0.010000
num_memories: 6
max_iterations: 100
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=2.09  loss=1344680.66 active=36712 feature_norm=1.00
Iter 2   time=3.21  loss=795059.92 active=35135 feature_norm=3.67
Iter 3   time=1.02  loss=655724.07 active=27373 feature_norm=3.24
Iter 4   time=3.08  loss=498801.47 active=22632 feature_norm=2.84
Iter 5   time=1.05  loss=433965.61 active=22356 feature_norm=3.39
Iter 6   time=1.05  loss=351656.39 active=20918 feature_norm=4.65
Iter 7   time=1.08  loss=271041.95 active=17320 feature_norm=6.78
Iter 8   time=1.12  loss=221111.98 active=16030 feature_norm=8.83
Iter 9   time=1.14  loss=193114.62 active=15017 feature_norm=10.42
It

CRF(algorithm='lbfgs', all_possible_transitions=True, c1=5, c2=0.01,
    keep_tempfiles=None, max_iterations=100, verbose=True)

In [25]:
eli5.show_weights(crf, top=10)

From \ To,O,B-art,I-art,B-eve,I-eve,B-geo,I-geo,B-gpe,I-gpe,B-nat,I-nat,B-org,I-org,B-per,I-per,B-tim,I-tim
O,3.243,0.759,-2.417,0.532,-2.139,1.309,-5.911,0.271,-1.771,0.442,-1.282,1.238,-6.021,1.862,-4.0,1.271,-5.832
B-art,0.0,0.0,7.371,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
I-art,-0.487,0.0,7.159,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B-eve,-0.526,0.0,0.0,0.0,7.435,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
I-eve,-0.412,0.0,0.0,0.0,6.738,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B-geo,0.416,0.0,0.0,0.0,0.0,-1.538,6.062,0.094,0.0,0.0,0.0,0.0,-2.659,-1.097,-1.655,1.257,-1.941
I-geo,0.058,0.0,0.0,0.0,0.0,-0.734,5.237,0.0,0.0,0.0,0.0,0.0,-1.58,-0.784,-0.702,0.104,-1.057
B-gpe,0.969,0.0,0.0,0.0,0.0,0.072,-1.49,-4.748,6.037,0.0,0.0,0.913,-3.342,0.002,-1.041,-0.173,-1.214
I-gpe,-0.165,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.071,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B-nat,-0.43,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.341,0.0,0.0,0.0,0.0,0.0,0.0

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,Unnamed: 14_level_0,Unnamed: 15_level_0,Unnamed: 16_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5,Unnamed: 11_level_5,Unnamed: 12_level_5,Unnamed: 13_level_5,Unnamed: 14_level_5,Unnamed: 15_level_5,Unnamed: 16_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6,Unnamed: 9_level_6,Unnamed: 10_level_6,Unnamed: 11_level_6,Unnamed: 12_level_6,Unnamed: 13_level_6,Unnamed: 14_level_6,Unnamed: 15_level_6,Unnamed: 16_level_6
Weight?,Feature,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7,Unnamed: 10_level_7,Unnamed: 11_level_7,Unnamed: 12_level_7,Unnamed: 13_level_7,Unnamed: 14_level_7,Unnamed: 15_level_7,Unnamed: 16_level_7
Weight?,Feature,Unnamed: 2_level_8,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8,Unnamed: 9_level_8,Unnamed: 10_level_8,Unnamed: 11_level_8,Unnamed: 12_level_8,Unnamed: 13_level_8,Unnamed: 14_level_8,Unnamed: 15_level_8,Unnamed: 16_level_8
Weight?,Feature,Unnamed: 2_level_9,Unnamed: 3_level_9,Unnamed: 4_level_9,Unnamed: 5_level_9,Unnamed: 6_level_9,Unnamed: 7_level_9,Unnamed: 8_level_9,Unnamed: 9_level_9,Unnamed: 10_level_9,Unnamed: 11_level_9,Unnamed: 12_level_9,Unnamed: 13_level_9,Unnamed: 14_level_9,Unnamed: 15_level_9,Unnamed: 16_level_9
Weight?,Feature,Unnamed: 2_level_10,Unnamed: 3_level_10,Unnamed: 4_level_10,Unnamed: 5_level_10,Unnamed: 6_level_10,Unnamed: 7_level_10,Unnamed: 8_level_10,Unnamed: 9_level_10,Unnamed: 10_level_10,Unnamed: 11_level_10,Unnamed: 12_level_10,Unnamed: 13_level_10,Unnamed: 14_level_10,Unnamed: 15_level_10,Unnamed: 16_level_10
Weight?,Feature,Unnamed: 2_level_11,Unnamed: 3_level_11,Unnamed: 4_level_11,Unnamed: 5_level_11,Unnamed: 6_level_11,Unnamed: 7_level_11,Unnamed: 8_level_11,Unnamed: 9_level_11,Unnamed: 10_level_11,Unnamed: 11_level_11,Unnamed: 12_level_11,Unnamed: 13_level_11,Unnamed: 14_level_11,Unnamed: 15_level_11,Unnamed: 16_level_11
Weight?,Feature,Unnamed: 2_level_12,Unnamed: 3_level_12,Unnamed: 4_level_12,Unnamed: 5_level_12,Unnamed: 6_level_12,Unnamed: 7_level_12,Unnamed: 8_level_12,Unnamed: 9_level_12,Unnamed: 10_level_12,Unnamed: 11_level_12,Unnamed: 12_level_12,Unnamed: 13_level_12,Unnamed: 14_level_12,Unnamed: 15_level_12,Unnamed: 16_level_12
Weight?,Feature,Unnamed: 2_level_13,Unnamed: 3_level_13,Unnamed: 4_level_13,Unnamed: 5_level_13,Unnamed: 6_level_13,Unnamed: 7_level_13,Unnamed: 8_level_13,Unnamed: 9_level_13,Unnamed: 10_level_13,Unnamed: 11_level_13,Unnamed: 12_level_13,Unnamed: 13_level_13,Unnamed: 14_level_13,Unnamed: 15_level_13,Unnamed: 16_level_13
Weight?,Feature,Unnamed: 2_level_14,Unnamed: 3_level_14,Unnamed: 4_level_14,Unnamed: 5_level_14,Unnamed: 6_level_14,Unnamed: 7_level_14,Unnamed: 8_level_14,Unnamed: 9_level_14,Unnamed: 10_level_14,Unnamed: 11_level_14,Unnamed: 12_level_14,Unnamed: 13_level_14,Unnamed: 14_level_14,Unnamed: 15_level_14,Unnamed: 16_level_14
Weight?,Feature,Unnamed: 2_level_15,Unnamed: 3_level_15,Unnamed: 4_level_15,Unnamed: 5_level_15,Unnamed: 6_level_15,Unnamed: 7_level_15,Unnamed: 8_level_15,Unnamed: 9_level_15,Unnamed: 10_level_15,Unnamed: 11_level_15,Unnamed: 12_level_15,Unnamed: 13_level_15,Unnamed: 14_level_15,Unnamed: 15_level_15,Unnamed: 16_level_15
Weight?,Feature,Unnamed: 2_level_16,Unnamed: 3_level_16,Unnamed: 4_level_16,Unnamed: 5_level_16,Unnamed: 6_level_16,Unnamed: 7_level_16,Unnamed: 8_level_16,Unnamed: 9_level_16,Unnamed: 10_level_16,Unnamed: 11_level_16,Unnamed: 12_level_16,Unnamed: 13_level_16,Unnamed: 14_level_16,Unnamed: 15_level_16,Unnamed: 16_level_16
+5.604,BOS,,,,,,,,,,,,,,,
+5.341,word.lower():month,,,,,,,,,,,,,,,
+4.990,word.lower():last,,,,,,,,,,,,,,,
+4.873,word.lower():year,,,,,,,,,,,,,,,
+4.595,bias,,,,,,,,,,,,,,,
+4.569,word.lower():chairman,,,,,,,,,,,,,,,
+4.229,word.lower():week,,,,,,,,,,,,,,,
… 496 more positive …,… 496 more positive …,,,,,,,,,,,,,,,
… 408 more negative …,… 408 more negative …,,,,,,,,,,,,,,,
-4.500,postag:NNP,,,,,,,,,,,,,,,

Weight?,Feature
+5.604,BOS
+5.341,word.lower():month
+4.990,word.lower():last
+4.873,word.lower():year
+4.595,bias
+4.569,word.lower():chairman
+4.229,word.lower():week
… 496 more positive …,… 496 more positive …
… 408 more negative …,… 408 more negative …
-4.500,postag:NNP

Weight?,Feature
+2.452,word.lower():english
+1.899,word[-3:]:ook
+1.552,word.lower():facebook
+1.119,word[-3:]:ish
+1.094,-1:postag:``
+1.012,+1:word.isupper()
+0.682,postag:NNP
+0.469,"-1:word.lower():"""
… 11 more positive …,… 11 more positive …
… 6 more negative …,… 6 more negative …

Weight?,Feature
+0.395,-1:word.istitle()
+0.351,word.istitle()
+0.242,word.isupper()
+0.180,-1:word.isupper()
+0.074,+1:word.lower():.
+0.063,+1:postag:.
… 1 more positive …,… 1 more positive …
… 2 more negative …,… 2 more negative …
-0.127,-1:postag:JJ
-0.128,postag:NNP

Weight?,Feature
+2.414,word.lower():ii
+2.414,word[-3:]:II
+2.304,-1:word.lower():war
+1.588,word.lower():olympic
+1.584,word[-3:]:pic
+1.559,word.lower():hurricane
+1.304,word[-3:]:mas
+1.235,+1:word.lower():open
+1.187,word.isupper()
+0.914,+1:word.lower():war

Weight?,Feature
+2.264,word[-3:]:Day
+2.238,word.lower():day
+1.524,-1:word.lower():hurricane
+1.438,-1:word.lower():war
+1.206,word.lower():games
+1.116,word.lower():open
+1.112,word[-3:]:pen
+0.978,-1:word.lower():world
+0.835,word.isupper()
+0.712,word[-3:]:War

Weight?,Feature
+3.799,-1:word.lower():mr.
+3.594,word.lower():israel
+3.422,word.lower():beijing
+3.412,word.lower():caribbean
+3.088,word.lower():republic
+3.078,word.lower():iran
+2.832,word.lower():martian
+2.743,word.lower():europe
+2.740,word.lower():britain
+2.624,word.lower():east

Weight?,Feature
+3.501,word.lower():airport
+3.006,-1:word.lower():san
+2.800,word.lower():republic
+2.203,-1:word.lower():middle
+2.183,word.lower():island
+2.156,-1:word.lower():hong
+2.025,-1:word.lower():new
+1.957,-1:word.lower():of
+1.788,word.lower():city
+1.768,-1:word.lower():the

Weight?,Feature
+6.420,word.lower():niger
+5.665,word.lower():afghan
+5.125,word.lower():nepal
+4.597,word.istitle()
+4.028,word.lower():german
+3.907,word.lower():korean
+3.713,word.lower():israeli
+3.524,word.lower():cuban
+3.475,word.lower():pakistani
+3.392,word.lower():azerbaijan

Weight?,Feature
+2.897,-1:word.lower():bosnian
+2.118,-1:word.lower():north
+1.405,word.lower():cypriots
+1.333,-1:postag:NNP
+1.138,postag:JJ
+0.792,word.lower():cypriot
+0.788,word[-3:]:iot
+0.787,word[-3:]:can
… 11 more positive …,… 11 more positive …
… 5 more negative …,… 5 more negative …

Weight?,Feature
+6.747,word.lower():katrina
+4.142,word.lower():marburg
+2.959,word.lower():rita
+2.901,word[-3:]:ita
+2.622,word.lower():h5n1
+2.622,word[-3:]:5N1
+2.454,word[-3:]:urg
+1.782,word.isupper()
+1.217,word.lower():aids
+1.217,word[-3:]:IDS

Weight?,Feature
1.944,word.lower():rita
1.917,word[-3:]:ita
1.165,word.lower():katrina
0.901,word[-3:]:ina
0.839,-1:word.lower():hurricane
0.325,-1:word.istitle()
-0.624,bias

Weight?,Feature
+5.474,word.lower():philippine
+5.394,word.lower():al-qaida
+4.461,word.lower():hamas
+3.912,-1:word.lower():niger
+3.729,word.lower():congress
+3.703,-1:word.lower():senator
+3.203,-1:word.lower():mr.
+2.962,word.lower():hezbollah
+2.938,word.lower():taleban
+2.906,word.lower():xinhua

Weight?,Feature
+3.450,-1:word.lower():&
+2.290,-1:word.lower():for
+2.083,word.lower():court
+1.960,word.lower():bank
+1.954,-1:postag:CC
+1.948,word.lower():department
+1.936,+1:word.lower():post
+1.902,+1:word.lower():hamas
… 252 more positive …,… 252 more positive …
… 61 more negative …,… 61 more negative …

Weight?,Feature
+5.608,word.lower():president
+4.468,word.lower():prime
+3.938,BOS
+3.704,word.lower():vice
+3.618,word.lower():western
+3.504,word.lower():hall
+3.083,word.lower():senator
+2.987,word[-3:]:Mr.
+2.987,word.lower():mr.
+2.947,word.lower():obama

Weight?,Feature
+2.449,-1:postag:NN
+2.187,-1:word.lower():condoleezza
+1.805,-1:word.lower():bin
+1.686,word.lower():obama
+1.563,+1:word.lower():reports
+1.522,-1:word.lower():'
+1.486,word.lower():rice
+1.452,word.lower():annan
+1.427,+1:word.lower():of
… 112 more positive …,… 112 more positive …

Weight?,Feature
+10.302,word[-3:]:day
+6.480,word[-3:]:Day
+5.598,word.lower():march
+5.545,word.lower():june
+4.380,word.lower():august
+4.154,word.lower():january
+4.017,word.lower():february
+4.003,word.lower():may
+3.932,+1:word.lower():week
+3.579,word[-3:]:60s

Weight?,Feature
+8.638,word[-3:]:day
+5.053,word[-3:]:.m.
+3.809,word[-3:]:Day
+3.575,postag:CD
+2.810,word[-3:]:ber
+2.511,word.lower():decades
+2.089,+1:word.lower():month
+2.088,word.lower():june
+2.016,word.lower():january
+1.978,word.lower():may


In [26]:
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=all_ents), metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=ents)

(0.9668833847645976, 0.8283769556113116)

<h2>Spacy

In [27]:
with open(f'../data/ner.bio', 'w', encoding='utf8') as file:
    for _, row in df.iterrows():
        for word, _, ent in zip(row['Sentence'].split(), row['POS'], row['Tag']):
            file.write(f'{word}\t{ent}\n')
        file.write('\n')

In [28]:
with open(f'../data/ner_test.bio', 'w', encoding='utf8') as file:
    for _, row in df_test.iterrows():
        for word, _, ent in zip(row['Sentence'].split(), row['POS'], row['Tag']):
            file.write(f'{word}\t{ent}\n')
        file.write('\n')

with open(f'../data/ner_train.bio', 'w', encoding='utf8') as file:
    for sent in train_data:
        for token in sent:
            file.write(f'{token[0]}\t{token[2]}\n')
        file.write('\n')

with open(f'../data/ner_test.bio', 'w', encoding='utf8') as file:
    for sent in test_data:
        for token in sent:
            file.write(f'{token[0]}\t{token[2]}\n')
        file.write('\n')

In [29]:
!python -m spacy init config base_config.cfg -p ner


[x] The provided output file already exists. To force overwriting the config
file, set the --force or -F flag.



In [30]:
!python -m spacy init fill-config base_config.cfg config.cfg

[!] Nothing to auto-fill: base config is already complete
[+] Saved config
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [31]:
!python -m spacy convert ../data/ner.bio spacy_data -t spacy -c ner

[i] Auto-detected token-per-line NER format
[i] Grouping every 1 sentences into a document.
[!] To generate better training data, you may want to group sentences into
documents with `-n 10`.
[+] Generated output file (35969 documents): ..\spacy_data\ner.spacy


In [32]:
!python -m spacy convert ../data/ner_test.bio spacy_data -t spacy -c ner

[i] Auto-detected token-per-line NER format
[i] Grouping every 1 sentences into a document.
[!] To generate better training data, you may want to group sentences into
documents with `-n 10`.
[+] Generated output file (11990 documents): ..\spacy_data\ner_test.spacy


In [33]:
# works in cli
# !python -m spacy train config.cfg --output ./output --paths.train ../spacy_data/ner_train.spacy --paths.dev ../spacy_data/ner_test.spacy

In [34]:
!python -m spacy evaluate output/model-best ../spacy_data/ner_test.spacy

[i] Using CPU
[1m

TOK     -    
NER P   85.23
NER R   81.04
NER F   83.08
SPEED   4474 

[1m

          P       R       F
geo   86.91   87.56   87.23
org   75.66   65.17   70.03
tim   91.81   82.92   87.14
per   74.98   74.81   74.89
gpe   94.92   93.23   94.06
eve   78.57   16.18   26.83
nat   65.22   23.44   34.48
art    0.00    0.00    0.00



<h2>To biluo

In [35]:
nlp = spacy.load("en_core_web_sm")

In [36]:
with open(f'../spacy_data/ner.json', 'r') as f:
    d = json.load(f)

In [37]:
with open(f'../spacy_data/ner_test.json', 'r') as f:
    d_test = json.load(f)

In [38]:
d[0]['paragraphs'][50]['sentences']

[{'tokens': [{'id': 0, 'orth': 'The', 'space': ' ', 'tag': '-', 'ner': 'O'},
   {'id': 1, 'orth': 'area', 'space': ' ', 'tag': '-', 'ner': 'O'},
   {'id': 2, 'orth': 'became', 'space': ' ', 'tag': '-', 'ner': 'O'},
   {'id': 3, 'orth': 'a', 'space': ' ', 'tag': '-', 'ner': 'O'},
   {'id': 4, 'orth': 'refuge', 'space': ' ', 'tag': '-', 'ner': 'O'},
   {'id': 5, 'orth': 'for', 'space': ' ', 'tag': '-', 'ner': 'O'},
   {'id': 6, 'orth': 'many', 'space': ' ', 'tag': '-', 'ner': 'O'},
   {'id': 7, 'orth': 'al-Qaida', 'space': ' ', 'tag': '-', 'ner': 'U-org'},
   {'id': 8, 'orth': 'and', 'space': ' ', 'tag': '-', 'ner': 'O'},
   {'id': 9, 'orth': 'Taleban', 'space': ' ', 'tag': '-', 'ner': 'U-org'},
   {'id': 10, 'orth': 'fighters', 'space': ' ', 'tag': '-', 'ner': 'O'},
   {'id': 11, 'orth': 'after', 'space': ' ', 'tag': '-', 'ner': 'O'},
   {'id': 12, 'orth': 'the', 'space': ' ', 'tag': '-', 'ner': 'O'},
   {'id': 13, 'orth': 'Taleban', 'space': ' ', 'tag': '-', 'ner': 'U-org'},
   {'id': 

In [39]:
from typing import List, Tuple, Union
def convert_to_biluo(text: str = '',
                     entities: List[Tuple] = None,
                     tokens: list = None,
                     missing: str = 'O') -> Tuple[Union[List[str], list, None], List[str]]:
    """
    Tokenize text and return text tokens and ner labels.

    Args:
        text: text
        entities: labels in spacy format
        tokens: already tokenized text, if you want it
        missing: lable for tokens without entities

    Returns:
        tokenized text and labels
    """

    # create dicts with start/end position of token and its index
    starts = []
    ends = []
    cur_index = 0
    tokens = text.split() if tokens is None else tokens

    for token in tokens:
        starts.append(cur_index)
        ends.append(cur_index + len(token))
        cur_index += len(token) + 1

    starts = {k: v for v, k in enumerate(starts)}
    ends = {k: v for v, k in enumerate(ends)}

    # this will be a list with token labels
    biluo = ["-" for _ in text.split()]

    # check that there are no overlapping entities
    entities_indexes = [list(range(i[0], i[1])) for i in entities]
    if max(Counter([i for j in entities_indexes for i in j]).values()) > 1:
        raise ValueError('You have overlapping entities')

    tokens_in_ents = {}

    # Handle entity cases
    for start_char, end_char, label in entities:
        for token_index in range(start_char, end_char):
            tokens_in_ents[token_index] = (start_char, end_char, label)
        start_token = starts.get(start_char)
        end_token = ends.get(end_char)
        # Only interested if the tokenization is correct
        if start_token is not None and end_token is not None:
            if start_token == end_token:
                biluo[start_token] = f"U-{label}"
            else:
                biluo[start_token] = f"B-{label}"
                for i in range(start_token + 1, end_token):
                    biluo[i] = f"I-{label}"
                biluo[end_token] = f"L-{label}"

    # put missing value for tokens without labels
    entity_chars = set()
    for start_char, end_char, label in entities:
        for i in range(start_char, end_char):
            entity_chars.add(i)

    for ind, token in enumerate(tokens):
        for i in range(list(starts.keys())[ind], list(ends.keys())[ind]):
            if i in entity_chars:
                break
        else:
            biluo[ind] = missing

    return tokens, biluo

In [40]:
%%time
# convert the data
new_data = []
biluo_labels = []
for i in range(len(d[0]['paragraphs'])):
    tokens_dict = d[0]['paragraphs'][i]['sentences'][0]['tokens']
    tokens = [i['orth'] for i in tokens_dict]
    if len([i['orth'] for i in tokens_dict]) > 1:
        
        text = ' '.join(tokens)
        doc = nlp(text)
        entities = d[0]['paragraphs'][i]['entities']

        new_ents = offsets_to_biluo_tags(doc, entities)
        if entities == []:
            new_ents = ['O'] * len(tokens)
        new_data.append(tokens)
        
        biluo_labels.append(new_ents)
        if len(tokens) != len(new_ents):
            
            ents2 = convert_to_biluo(text, entities)[1]
            biluo_labels[-1] = ents2

Wall time: 12min 54s


In [41]:
df = pd.DataFrame({'sent_id': [i for j in [[i] * len(s) for i, s in enumerate(new_data)] for i in j],
                   'data': [i for j in new_data for i in j],
                   'entities': [i for j in biluo_labels for i in j]})
df.head()

Unnamed: 0,sent_id,data,entities
0,0,Thousands,O
1,0,of,O
2,0,demonstrators,O
3,0,have,O
4,0,marched,O
