In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for input_path in (
    os.path.join(folder, entry)
    for folder, _subdirs, entries in os.walk('/kaggle/input')
    for entry in entries
):
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Download and preprocess data

In [2]:
!curl -LJ "https://raw.githubusercontent.com/ningshixian/NER-CONLL2003/master/data/train.txt" -o "train.txt"
!curl -LJ "https://raw.githubusercontent.com/ningshixian/NER-CONLL2003/master/data/valid.txt" -o "valid.txt"
!curl -LJ "https://raw.githubusercontent.com/ningshixian/NER-CONLL2003/master/data/test.txt" -o "test.txt"

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 3206k  100 3206k    0     0  7287k      0 --:--:-- --:--:-- --:--:-- 7287k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  808k  100  808k    0     0   885k      0 --:--:-- --:--:-- --:--:--  885k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  730k  100  730k    0     0  2609k      0 --:--:-- --:--:-- --:--:-- 2618k


In [3]:
# Peek at the first five lines of the downloaded training file to see its column layout.
!head -5 train.txt

-DOCSTART- -X- -X- O

EU NNP B-NP B-ORG
rejects VBZ B-VP O
German JJ B-NP B-MISC


In [4]:
import nltk
from nltk.corpus.reader import ConllCorpusReader

In [5]:
# Parse each split with the same four-column layout. The third column is ignored and the
# fourth (the NER tag) is declared as "chunk", so iob_sents() yields
# (token, POS tag, NER tag) triples for every sentence.
train_sentences, valid_sentences, test_sentences = (
    ConllCorpusReader("./", split_file, ["words", "pos", "ignore", "chunk"]).iob_sents()
    for split_file in ("train.txt", "valid.txt", "test.txt")
)

Remove empty sentences (length 0), which show up in the parsed corpus because of errors in the data.

In [6]:
# Materialise each split as a plain list and drop zero-length sentences.
# The validation split is filtered too, so all three splits are cleaned the same way.
train_sentences = [s for s in train_sentences if len(s) > 0]
valid_sentences = [s for s in valid_sentences if len(s) > 0]
test_sentences = [s for s in test_sentences if len(s) > 0]

In [7]:
# Show the first training sentence as a list of (token, POS tag, NER tag) triples.
train_sentences[0]

[('EU', 'NNP', 'B-ORG'),
 ('rejects', 'VBZ', 'O'),
 ('German', 'JJ', 'B-MISC'),
 ('call', 'NN', 'O'),
 ('to', 'TO', 'O'),
 ('boycott', 'VB', 'O'),
 ('British', 'JJ', 'B-MISC'),
 ('lamb', 'NN', 'O'),
 ('.', '.', 'O')]

In [8]:
# Show the first test sentence as a list of (token, POS tag, NER tag) triples.
test_sentences[0]

[('SOCCER', 'NN', 'O'),
 ('-', ':', 'O'),
 ('JAPAN', 'NNP', 'B-LOC'),
 ('GET', 'VB', 'O'),
 ('LUCKY', 'NNP', 'O'),
 ('WIN', 'NNP', 'O'),
 (',', ',', 'O'),
 ('CHINA', 'NNP', 'B-PER'),
 ('IN', 'IN', 'O'),
 ('SURPRISE', 'DT', 'O'),
 ('DEFEAT', 'NN', 'O'),
 ('.', '.', 'O')]

In [9]:
# Number of sentences left in the training and test splits after removing empty ones.
print(f"Length of training set: {len(train_sentences)}")
print(f"Length of testing set: {len(test_sentences)}")

Length of training set: 14041
Length of testing set: 3453


Make a pandas DataFrame from the training data, instead of working with a list of tuples.

In [10]:
# One record per token of the training set; an all-empty record is appended after
# every sentence so that sentence boundaries stay visible in the flat table.
frame = []

for sentence in train_sentences:
    frame.extend(
        {"token": entry[0], "postag": entry[1], "label": entry[2]}
        for entry in sentence
    )
    frame.append({"token": "", "postag": "", "label": ""})

df = pd.DataFrame(frame)

In [11]:
# Show every column, but at most 10 rows, when a DataFrame is displayed.
# Full option names are used on purpose: pandas resolves abbreviated names by pattern
# matching, and a bare "max_rows" is ambiguous on recent pandas versions (it also matches
# "styler.render.max_rows"), which makes set_option raise OptionError.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 10)

In [12]:
# Display the token-level frame; the row limit set in the previous cell keeps the output short.
df

Unnamed: 0,token,postag,label
0,EU,NNP,B-ORG
1,rejects,VBZ,O
2,German,JJ,B-MISC
3,call,NN,O
4,to,TO,O
...,...,...,...
217657,Swansea,NN,B-ORG
217658,1,CD,O
217659,Lincoln,NNP,B-ORG
217660,2,CD,O


# Feature extraction

In [13]:
def word2feat(sentence, idx):
    """Build the CRF feature dictionary for the token at position ``idx``.

    Args:
        sentence: Sequence of items whose element 0 is the token string and
            whose element 1 is its POS-tag string, e.g. ``(token, postag, label)``.
        idx: Position within ``sentence`` of the token to describe.

    Returns:
        dict: Features of the token itself (lower-cased form, its last three and
        last two characters, upper/title/digit flags, the POS tag and the first
        two characters of the tag), followed by casing and POS features of the
        previous token (``-1:`` keys) or a ``"BOS"`` flag when there is none,
        and of the next token (``+1:`` keys) or an ``"EOS"`` flag when there is
        none.
    """
    token = sentence[idx][0]
    tag = sentence[idx][1]

    # Features describing the token itself.
    feats = {
        "bias": 1.0,
        "word.lower()": token.lower(),
        "word[-3:]": token[-3:],
        "word[-2:]": token[-2:],
        "word.isupper()": token.isupper(),
        "word.istitle()": token.istitle(),
        "word.isdigit()": token.isdigit(),
        "postag": tag,
        "postag[:2]": tag[:2],
    }

    # Left context: either flag the sentence start or describe the previous token.
    if not idx > 0:
        feats["BOS"] = True
    else:
        prev_token = sentence[idx - 1][0]
        prev_tag = sentence[idx - 1][1]
        feats["-1:word.lower()"] = prev_token.lower()
        feats["-1:word.istitle()"] = prev_token.istitle()
        feats["-1:word.isupper()"] = prev_token.isupper()
        feats["-1:postag"] = prev_tag
        feats["-1:postag[:2]"] = prev_tag[:2]

    # Right context: either flag the sentence end or describe the next token.
    if not idx < len(sentence) - 1:
        feats["EOS"] = True
    else:
        next_token = sentence[idx + 1][0]
        next_tag = sentence[idx + 1][1]
        feats["+1:word.lower()"] = next_token.lower()
        feats["+1:word.istitle()"] = next_token.istitle()
        feats["+1:word.isupper()"] = next_token.isupper()
        feats["+1:postag"] = next_tag
        feats["+1:postag[:2]"] = next_tag[:2]

    return feats

In [14]:
def sent2feat(sentence):
    """Return one ``word2feat`` feature dictionary per position of ``sentence``, in order."""
    features = []
    for position in range(len(sentence)):
        features.append(word2feat(sentence, position))
    return features

In [15]:
def sent2labels(sentence):
    """Return the third element (the label) of every three-item entry in ``sentence``."""
    labels = []
    for _token, _postag, label in sentence:
        labels.append(label)
    return labels

In [16]:
def sent2tokens(sentence):
    """Return the first element (the token) of every three-item entry in ``sentence``."""
    tokens = []
    for token, _postag, _label in sentence:
        tokens.append(token)
    return tokens

In [17]:
# Example: the feature dictionaries produced for each token of the first training sentence.
sent2feat(train_sentences[0])

[{'bias': 1.0,
  'word.lower()': 'eu',
  'word[-3:]': 'EU',
  'word[-2:]': 'EU',
  'word.isupper()': True,
  'word.istitle()': False,
  'word.isdigit()': False,
  'postag': 'NNP',
  'postag[:2]': 'NN',
  'BOS': True,
  '+1:word.lower()': 'rejects',
  '+1:word.istitle()': False,
  '+1:word.isupper()': False,
  '+1:postag': 'VBZ',
  '+1:postag[:2]': 'VB'},
 {'bias': 1.0,
  'word.lower()': 'rejects',
  'word[-3:]': 'cts',
  'word[-2:]': 'ts',
  'word.isupper()': False,
  'word.istitle()': False,
  'word.isdigit()': False,
  'postag': 'VBZ',
  'postag[:2]': 'VB',
  '-1:word.lower()': 'eu',
  '-1:word.istitle()': False,
  '-1:word.isupper()': True,
  '-1:postag': 'NNP',
  '-1:postag[:2]': 'NN',
  '+1:word.lower()': 'german',
  '+1:word.istitle()': True,
  '+1:word.isupper()': False,
  '+1:postag': 'JJ',
  '+1:postag[:2]': 'JJ'},
 {'bias': 1.0,
  'word.lower()': 'german',
  'word[-3:]': 'man',
  'word[-2:]': 'an',
  'word.isupper()': False,
  'word.istitle()': True,
  'word.isdigit()': False

Build the CRF training and evaluation data (the evaluation features are built from the test sentences).

In [18]:
# Per-sentence feature sequences and the matching per-sentence label sequences for training.
X_train = list(map(sent2feat, train_sentences))
y_train = list(map(sent2labels, train_sentences))

In [19]:
# The evaluation data comes from the *test* split, so features and labels are both named
# after it (previously the features were called X_valid while the labels were y_test).
X_test = [sent2feat(s) for s in test_sentences]
y_test = [sent2labels(s) for s in test_sentences]

# Backward-compatible alias: later cells still refer to the evaluation features as X_valid.
X_valid = X_test

In [20]:
# Sanity check on the first training sentence: the number of feature dicts should equal the
# number of labels; then show the features and the label of its third token.
print(len(X_train[0]))
print(len(y_train[0]))
print(X_train[0][2])
print(y_train[0][2])

9
9
{'bias': 1.0, 'word.lower()': 'german', 'word[-3:]': 'man', 'word[-2:]': 'an', 'word.isupper()': False, 'word.istitle()': True, 'word.isdigit()': False, 'postag': 'JJ', 'postag[:2]': 'JJ', '-1:word.lower()': 'rejects', '-1:word.istitle()': False, '-1:word.isupper()': False, '-1:postag': 'VBZ', '-1:postag[:2]': 'VB', '+1:word.lower()': 'call', '+1:word.istitle()': False, '+1:word.isupper()': False, '+1:postag': 'NN', '+1:postag[:2]': 'NN'}
B-MISC


# Training

In [21]:
!pip install git+https://github.com/MeMartijn/updated-sklearn-crfsuite.git#egg=sklearn_crfsuite

Collecting sklearn_crfsuite
  Cloning https://github.com/MeMartijn/updated-sklearn-crfsuite.git to /tmp/pip-install-ihhzcviu/sklearn-crfsuite_8fbd1a0c421b48f4a587864f2b775a7e
  Running command git clone --filter=blob:none --quiet https://github.com/MeMartijn/updated-sklearn-crfsuite.git /tmp/pip-install-ihhzcviu/sklearn-crfsuite_8fbd1a0c421b48f4a587864f2b775a7e
  Resolved https://github.com/MeMartijn/updated-sklearn-crfsuite.git to commit 675038761b4405f04691a83339d04903790e2b95
  Preparing metadata (setup.py) ... [?25ldone
Collecting python-crfsuite>=0.8.3
  Downloading python_crfsuite-0.9.8-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (965 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m965.4/965.4 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hBuilding wheels for collected packages: sklearn_crfsuite
  Building wheel for sklearn_crfsuite (setup.py) ... [?25ldone
[?25h  Created wheel for sklearn_crfsuite: filename=sklearn_crfsu

In [22]:
import sklearn_crfsuite

In [23]:
# Configure the CRF tagger. Nothing is trained here; training happens when crf.fit() is called.
crf = sklearn_crfsuite.CRF(
    algorithm = "lbfgs",  # optimiser; the training log reports "L-BFGS optimization"
    c1 = 0.1,  # presumably the L1 regularisation coefficient - confirm in the sklearn-crfsuite docs
    c2 = 0.1,  # presumably the L2 regularisation coefficient - confirm in the sklearn-crfsuite docs
    max_iterations = 100,  # upper bound on optimiser iterations
    all_possible_transitions = True,  # NOTE(review): presumably also creates transition features for tag pairs unseen in training - confirm
    verbose = True  # print the CRFsuite training log while fitting
)

In [24]:
# Train the CRF on the training features and labels (verbose=True prints the CRFsuite log).
crf.fit(X_train, y_train)

loading training data to CRFsuite: 100%|██████████| 14041/14041 [00:03<00:00, 3908.75it/s]



Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 86687
Seconds required: 0.630

L-BFGS optimization
c1: 0.100000
c2: 0.100000
num_memories: 6
max_iterations: 100
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=1.01  loss=232367.34 active=86384 feature_norm=1.00
Iter 2   time=0.52  loss=217022.74 active=83848 feature_norm=3.45
Iter 3   time=0.51  loss=161383.49 active=83844 feature_norm=2.99
Iter 4   time=1.53  loss=119552.24 active=83579 feature_norm=2.74
Iter 5   time=0.51  loss=94630.26 active=86129 feature_norm=3.20
Iter 6   time=0.52  loss=88330.83 active=85472 feature_norm=3.56
Iter 7   time=0.52  loss=71522.54 active=80105 feature_norm=5.12
Iter 8   time=0.53  loss=61055.57 active=64864 feature_norm=6.38
Iter 9   time=0.54  loss=54560.88 active=61779 feature_norm=7.80
Iter 10  t

CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.1, c2=0.1,
    max_iterations=100, verbose=True)

# Evaluation

In [25]:
from sklearn_crfsuite import metrics

In [26]:
# All tag classes known to the fitted CRF.
# NOTE(review): this list includes the majority "O" tag, so any weighted-average score computed
# with labels=labels is dominated by "O"; consider excluding it to judge entity-level quality.
labels = list(crf.classes_)
print(labels)

['B-ORG', 'O', 'B-MISC', 'B-PER', 'I-PER', 'B-LOC', 'I-ORG', 'I-MISC', 'I-LOC']


In [27]:
# Predict a tag sequence for every evaluation sentence.
# Despite its name, X_valid holds the features built from test_sentences.
y_pred = crf.predict(X_valid)

In [28]:
# Token-level accuracy over all test sentences (sequences are flattened before scoring).
print(metrics.flat_accuracy_score(y_test, y_pred))

0.9564337245612146


In [29]:
# Support-weighted F1 over every tag in `labels`.
# NOTE(review): `labels` contains "O", by far the most frequent tag, so this score mostly reflects "O".
print(metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels))

0.9561699839034588


In [30]:
# Support-weighted precision over every tag in `labels` (which includes "O").
print(metrics.flat_precision_score(y_test, y_pred,
                      average='weighted', labels=labels))

0.9562818434672782


In [31]:
# Support-weighted recall over every tag in `labels` (which includes "O").
print(metrics.flat_recall_score(y_test, y_pred,
                      average='weighted', labels=labels))

0.9564337245612146


In [32]:
# Order tags by everything after their first character (the entity type) and then by the
# first character, so the B- and I- tags of one entity type end up next to each other.
sorted_labels = list(labels)
sorted_labels.sort(key=lambda tag: (tag[1:], tag[0]))

In [33]:
# Per-tag precision / recall / F1 / support, with rows in the order given by sorted_labels.
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

              precision    recall  f1-score   support

           O      0.989     0.989     0.989     38323
       B-LOC      0.856     0.814     0.834      1668
       I-LOC      0.745     0.626     0.681       257
      B-MISC      0.819     0.754     0.785       702
      I-MISC      0.688     0.653     0.670       216
       B-ORG      0.775     0.727     0.750      1661
       I-ORG      0.679     0.734     0.705       835
       B-PER      0.822     0.860     0.841      1617
       I-PER      0.861     0.951     0.904      1156

    accuracy                          0.956     46435
   macro avg      0.804     0.790     0.795     46435
weighted avg      0.956     0.956     0.956     46435



Confusion matrix

In [34]:
from itertools import chain
from sklearn.metrics import confusion_matrix

In [35]:
# Flatten the per-sentence tag sequences into one flat list of tags each.
flat_true = list(chain.from_iterable(y_test))
flat_pred = list(chain.from_iterable(y_pred))

# Fix the row/column order explicitly (the sorted union of all tags, which is also the order
# scikit-learn falls back to when no labels are given) and attach the tag names, so the
# matrix can be read without guessing which row or column belongs to which tag.
cm_labels = sorted(set(flat_true) | set(flat_pred))
print(pd.DataFrame(
    confusion_matrix(flat_true, flat_pred, labels=cm_labels),
    index=pd.Index(cm_labels, name="true"),
    columns=pd.Index(cm_labels, name="predicted"),
))

[[ 1357    26   163    53     1     1    10     4    53]
 [   17   529    36    26     0     6     2     2    84]
 [  122    39  1207   156     2     1    17     4   113]
 [   49     9    58  1391     1     1     9    14    85]
 [    4     0     0     0   161     0    56    22    14]
 [    2     7     1     1     4   141    17    15    28]
 [    9     2    19     6    31    15   613    83    57]
 [    0     0     1     4     5     2    39  1099     6]
 [   26    34    72    55    11    38   140    33 37914]]
