In [45]:
from nltk.corpus import brown
import spacy
from sklearn.model_selection import train_test_split
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

In [2]:
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [3]:
sents = brown.sents(categories=['news', 'editorial', 'reviews'])

In [187]:
nlp = spacy.load('en_core_web_md')

In [196]:
from spacy.tokens import Doc
from spacy.lang.en import English

nlp = English()

In [235]:
def word2features(sent, i):
    token = sent[i]
    word = sent[i].text
    postag = token.pos_
    features = {
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
    }
    if i > 0:
        word1 = sent[i-1].text
        postag1 = sent[i-1].pos_
        features.update({
            '-1:word.istitle()': word1.istitle(),
            '-1:postag': postag1,
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1].text
        postag1 = sent[i+1].pos_
        features.update({
            '+1:word.istitle()': word1.istitle(),
            '+1:postag': postag1,
        })
    else:
        features['EOS'] = True
    return features

def sent2features(sent):
    doc = Doc(nlp.vocab, words=sent)
    return [word2features(doc, i) for i in range(len(doc))]

def word2labels(sent, i):
    label='false'
    if i < len(sent)-1:
        n = sent[i+1].text
        label='true' if (n=='.') else 'false'
    return label
    
def sent2labels(sent):
    doc = Doc(nlp.vocab, words=sent)
    return [word2labels(doc, i) for i in range(len(doc))]

In [236]:
d = [s for s in sents[:500]]

In [237]:
%%time
data = [sent2features(s) for s in d]
labels = [sent2labels(s) for s in d]
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.33, random_state=42)

CPU times: user 84.4 ms, sys: 2.71 ms, total: 87.1 ms
Wall time: 85.9 ms


In [238]:
len(y_train)

335

In [239]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 165 ms, sys: 1.54 ms, total: 167 ms
Wall time: 166 ms


CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=True,
    averaging=None, c=None, c1=0.1, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

In [240]:
labels = list(crf.classes_)

In [241]:
y_pred = crf.predict(X_test)

In [242]:
metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels)

0.9454762921364931

In [243]:
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

              precision    recall  f1-score   support

       false      0.963     1.000     0.981      3872
        true      0.000     0.000     0.000       147

    accuracy                          0.963      4019
   macro avg      0.482     0.500     0.491      4019
weighted avg      0.928     0.963     0.945      4019



  _warn_prf(average, modifier, msg_start, len(result))


In [217]:
import json
data=None
with open('06-language-as-sequence/run-on-test.json') as json_file:
    data = json.load(json_file)

In [228]:
validate, y_validate = [], []
for o in data:
    s = []
    labels=[]
    for el in o:
        s.append(el[0])
        labels.append('true' if el[1] else 'false')
    validate.append(s)
    y_validate.append(labels)
    
X_validate = [sent2features(s) for s in validate]

In [231]:
y_pred = crf.predict(X_validate)

In [232]:
metrics.flat_f1_score(y_validate, y_pred,
                      average='weighted', labels=labels)

0.9611602139232006

In [234]:
print(metrics.flat_classification_report(
    y_validate, y_pred, labels=sorted_labels, digits=3
))

              precision    recall  f1-score   support

       false      0.966     0.959     0.962      4542
        true      0.000     0.000     0.000       155

    accuracy                          0.927      4697
   macro avg      0.483     0.479     0.481      4697
weighted avg      0.934     0.927     0.930      4697

