In [1]:
from nltk.corpus import brown
import spacy
from sklearn.model_selection import train_test_split
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
import eli5



In [2]:
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [3]:
sents = brown.sents(categories=['news', 'editorial', 'reviews'])

In [4]:
nlp = spacy.load('en_core_web_md')

In [239]:
import spacy 
nlp = spacy.load("en_core_web_md")

from spacy.lang.char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER
from spacy.lang.char_classes import CONCAT_QUOTES, LIST_ELLIPSES, LIST_ICONS
from spacy.util import compile_infix_regex
infixes = (
    LIST_ELLIPSES
    + LIST_ICONS
    + [
        r"(?<=[0-9])[+\-\*^](?=[0-9-])",
        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
        ),
        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
        # EDIT: commented out regex that splits on hyphens between letters:
        #r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
        r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
    ]
)

infix_re = compile_infix_regex(infixes)
nlp.tokenizer.infix_finditer = infix_re.finditer

In [240]:
def prevent_sentence_boundaries(doc):
    if doc[0].text=='"':
        with doc.retokenize() as retokenizer:
            retokenizer.merge(doc[:2])
    return doc

nlp.add_pipe(prevent_sentence_boundaries, before='parser')

In [241]:
def join_sent(l): 
    sent = ""
    first = True
    for el in l: 
        if el[0] in ["'", "."]:
            sent+=el
        else:
            if first:
                sent+=el
                first = False
            else:
                sent+=(" "+el)
    return sent

In [242]:
def word2features(sent, i):
    token = sent[i]
    word = sent[i].text
    postag = token.pos_
    features = {
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
    }
    if i > 0:
        word1 = sent[i-1].text
        postag1 = sent[i-1].pos_
        features.update({
            '-1:word.istitle()': word1.istitle(),
            '-1:postag': postag1,
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1].text
        postag1 = sent[i+1].pos_
        features.update({
            '+1:word.istitle()': word1.istitle(),
            '+1:postag': postag1,
        })
    else:
        features['EOS'] = True
    return features

def sent2features(sent):
    doc = nlp(join_sent(sent))
    return [word2features(doc, i) for i in range(len(doc))]

def word2labels(sent, i):
    label='false'
    if i < len(sent)-1:
        n = sent[i+1].tag_
        label='true' if (n=='.') else 'false'
    return label
    
def sent2labels(sent):
    doc = nlp(join_sent(sent))
    return [word2labels(doc, i) for i in range(len(doc))]

In [243]:
d = [s for s in sents[:500]]

In [244]:
%%time
data = [sent2features(s) for s in d]
labels = [sent2labels(s) for s in d]
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.33, random_state=42)

CPU times: user 6.7 s, sys: 47.8 ms, total: 6.75 s
Wall time: 6.76 s


In [245]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 164 ms, sys: 3.02 ms, total: 167 ms
Wall time: 166 ms


CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=True,
    averaging=None, c=None, c1=0.1, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

In [246]:
eli5.show_weights(crf, top=30)



From \ To,false,true
False,2.692,-0.233
True,-0.713,-5.234

Weight?,Feature
Weight?,Feature
+5.799,BOS
+2.510,postag:ADP
+2.351,postag:PART
+2.239,EOS
+1.974,postag:AUX
+1.455,postag:INTJ
+1.393,+1:postag:NOUN
+1.315,postag:CCONJ
+1.236,+1:postag:ADP
+1.018,+1:postag:VERB

Weight?,Feature
+5.799,BOS
+2.510,postag:ADP
+2.351,postag:PART
+2.239,EOS
+1.974,postag:AUX
+1.455,postag:INTJ
+1.393,+1:postag:NOUN
+1.315,postag:CCONJ
+1.236,+1:postag:ADP
+1.018,+1:postag:VERB

Weight?,Feature
3.051,+1:postag:PUNCT
0.524,postag:ADV
0.333,word.isdigit()
0.267,postag:NOUN
0.149,-1:postag:PUNCT
0.118,postag:PRON
0.076,-1:postag:PART
0.045,word.istitle()
-0.001,postag:DET
-0.002,postag:VERB


In [269]:
labels = list(crf.classes_)

In [270]:
y_pred = crf.predict(X_test)

In [271]:
metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels)

0.9445628418168468

In [272]:
print(metrics.flat_classification_report(
    y_test, y_pred, labels=labels, digits=3
))

              precision    recall  f1-score   support

       false      0.964     0.993     0.978      3995
        true      0.171     0.039     0.064       153

    accuracy                          0.958      4148
   macro avg      0.568     0.516     0.521      4148
weighted avg      0.935     0.958     0.945      4148



In [273]:
import json
data=None
with open('06-language-as-sequence/run-on-test.json') as json_file:
    data = json.load(json_file)

In [274]:
validate, y_validate = [], []
for o in data:
    s = []
    labels=[]
    for el in o:
        s.append(el[0])
        labels.append(str(el[1]).lower())
    validate.append(s)
    y_validate.append(labels)
    
X_validate = [sent2features(s) for s in validate]

In [275]:
y_pred = crf.predict(X_validate)

In [277]:
metrics.flat_f1_score(y_validate, y_pred,
                      average='weighted', labels=labels)

0.9769438940450106

In [282]:
print(metrics.flat_classification_report(
    y_validate, y_pred, labels=l, digits=3
))

              precision    recall  f1-score   support

       false      0.967     0.990     0.978      4541
        true      0.000     0.000     0.000       155

    accuracy                          0.957      4696
   macro avg      0.483     0.495     0.489      4696
weighted avg      0.935     0.957     0.946      4696

