In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [3]:
from itertools import chain

import json
import nltk
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

## Read dataset

In [4]:
CRF_TRAIN = "../Data/Generated_dataset/crf_dataset/crf_train.txt"
CRF_DEV = "../Data/Generated_dataset/crf_dataset/crf_dev.txt"
CRF_TEST = "../Data/Generated_dataset/crf_dataset/crf_test.txt"

In [5]:
train_sents = []
def read_dataset(file_name):
    sentences = []
    with open (file_name, 'r') as f:
        for line in f:
            sent = json.loads(line)
            sent_set = []
            for item in sent:
                sent_set.append(tuple(item))
            sentences.append(sent_set)
    return sentences

train_sents = read_dataset(CRF_TRAIN)
dev_sents = read_dataset(CRF_DEV)
test_sents = read_dataset(CRF_TEST)

In [6]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],        
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True
        
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True
                
    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

In [7]:
sent2features(train_sents[0])[0]

{'bias': 1.0,
 'word.lower()': 'move',
 'word[-3:]': 'ove',
 'word[-2:]': 've',
 'word.isupper()': False,
 'word.istitle()': False,
 'word.isdigit()': False,
 'postag': 'VERB',
 'postag[:2]': 'VE',
 'BOS': True,
 '+1:word.lower()': 'the',
 '+1:word.istitle()': False,
 '+1:word.isupper()': False,
 '+1:postag': 'DET',
 '+1:postag[:2]': 'DE'}

In [8]:
%%time
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents] 

CPU times: user 1.68 s, sys: 646 ms, total: 2.32 s
Wall time: 2.32 s


In [18]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    c1=0.2, 
    c2=0.1, 
    max_iterations=100, 
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 1min 10s, sys: 7.04 s, total: 1min 17s
Wall time: 1min 29s




CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.2, c2=0.1,
    keep_tempfiles=None, max_iterations=100)

In [11]:
labels = list(crf.classes_)
labels.remove('O')
labels

['B-TARGET', 'I-TARGET', 'B-RECEPTACLE', 'I-RECEPTACLE']

In [36]:
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels)

0.628554993061262

In [37]:
# group B and I results
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

              precision    recall  f1-score   support

B-RECEPTACLE      0.551     0.514     0.532      5983
I-RECEPTACLE      0.434     0.455     0.444      1152
    B-TARGET      0.826     0.703     0.760      6236
    I-TARGET      0.790     0.502     0.614      1837

   micro avg      0.672     0.586     0.626     15208
   macro avg      0.650     0.544     0.588     15208
weighted avg      0.684     0.586     0.629     15208



In [31]:
%%time
# define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    max_iterations=100, 
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score, 
                        average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space, 
                        cv=3, 
                        verbose=1, 
                        n_jobs=-2, 
                        n_iter=30, 
                        scoring=f1_scorer)
rs.fit(X_train, y_train)

Fitting 3 folds for each of 30 candidates, totalling 90 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=-2)]: Done  36 tasks      | elapsed: 31.0min
[Parallel(n_jobs=-2)]: Done  90 out of  90 | elapsed: 77.2min finished


CPU times: user 1h 13min 27s, sys: 2min 29s, total: 1h 15min 56s
Wall time: 1h 17min 58s


RandomizedSearchCV(cv=3,
                   estimator=CRF(algorithm='lbfgs',
                                 all_possible_transitions=True,
                                 keep_tempfiles=None, max_iterations=100),
                   n_iter=30, n_jobs=-2,
                   param_distributions={'c1': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f24899dd2e8>,
                                        'c2': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f24899dd470>},
                   scoring=make_scorer(flat_f1_score, average=weighted, labels=['B-TARGET', 'I-TARGET', 'B-RECEPTACLE', 'I-RECEPTACLE']),
                   verbose=1)

In [32]:
crf = rs.best_estimator_
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

best params: {'c1': 0.0433462845150516, 'c2': 0.01316779153024307}
best CV score: 0.49676629501730885
model size: 0.21M


In [33]:
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels)

0.628554993061262

In [34]:
from collections import Counter

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

print("Top likely transitions:")
print_transitions(Counter(crf.transition_features_).most_common(20))

print("\nTop unlikely transitions:")
print_transitions(Counter(crf.transition_features_).most_common()[-20:])

Top likely transitions:
B-TARGET -> I-TARGET 4.048525
I-TARGET -> I-TARGET 3.335785
B-RECEPTACLE -> I-RECEPTACLE 2.926770
O      -> O       1.882036
I-TARGET -> I-RECEPTACLE 0.466125
O      -> B-RECEPTACLE 0.449737
I-RECEPTACLE -> I-RECEPTACLE 0.374634
O      -> B-TARGET 0.241345
I-RECEPTACLE -> O       -0.002101
B-RECEPTACLE -> O       -0.165732
I-TARGET -> B-RECEPTACLE -0.268988
B-TARGET -> O       -0.330918
I-TARGET -> O       -0.361394
I-TARGET -> B-TARGET -1.553355
B-RECEPTACLE -> B-TARGET -1.563485
B-RECEPTACLE -> I-TARGET -2.115421
I-RECEPTACLE -> B-RECEPTACLE -2.285759
B-TARGET -> I-RECEPTACLE -2.523690
B-TARGET -> B-TARGET -2.715554
I-RECEPTACLE -> B-TARGET -3.021333

Top unlikely transitions:
O      -> B-RECEPTACLE 0.449737
I-RECEPTACLE -> I-RECEPTACLE 0.374634
O      -> B-TARGET 0.241345
I-RECEPTACLE -> O       -0.002101
B-RECEPTACLE -> O       -0.165732
I-TARGET -> B-RECEPTACLE -0.268988
B-TARGET -> O       -0.330918
I-TARGET -> O       -0.361394
I-TARGET -> B-TARGET -1.553

In [35]:
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))    

print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(30))

print("\nTop negative:")
print_state_features(Counter(crf.state_features_).most_common()[-30:])

Top positive:
12.153116 B-RECEPTACLE -1:word.lower():safe
9.843691 B-TARGET word.lower():dispenser
9.220662 I-TARGET -1:word.lower():fire
8.615246 O        postag:ADP
7.619490 B-TARGET word.lower():clothing
7.214269 O        -1:word.lower():safe
7.065473 B-RECEPTACLE -1:word.lower():original
6.988626 I-TARGET word.lower():card
6.769381 I-TARGET word.lower():tablet
6.516453 I-TARGET -1:word.lower():bed
6.394268 B-TARGET word.lower():frying
6.335617 O        word.lower():lower
6.274567 O        word[-2:]:st
6.253033 O        postag:PRON
6.210318 B-TARGET +1:word.lower():a
6.171586 O        word.lower():then
6.122281 I-RECEPTACLE -1:word.lower():toiler
6.049479 I-TARGET -1:word.lower():bath
6.033224 O        word.lower():right
6.005406 O        word.lower():one
5.917266 O        -1:word.lower():credit
5.864150 B-TARGET word.lower():cleaner
5.809009 O        word.lower():toiler
5.807791 I-TARGET word.lower():dispenser
5.749456 I-TARGET +1:word.lower():thing
5.665179 O        -1:word.lower(