# Data Preparation

In [1]:
import sklearn
import sklearn_crfsuite
import string
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn_crfsuite import metrics
from joblib import dump, load

In [2]:
def read(file):
    """Load a tagged corpus from a UTF-8 text file.

    Each line of the file is treated as one sentence made of
    whitespace-separated ``word/TAG`` tokens.

    Args:
        file: Path of the corpus file to open.

    Returns:
        A list with one entry per line of the file; each entry is a list of
        ``(word, tag)`` tuples.  A blank line yields an empty list.

    Raises:
        IndexError: If a token contains no ``'/'`` separator.
    """
    with open(file, 'r', encoding='utf8') as handle:
        lines = handle.read().splitlines()

    sentences = []
    for line in lines:
        tagged = []
        for token in line.split():
            # Split on the LAST slash only, so a word that itself contains
            # '/' (e.g. "1/2/NUM") keeps it as part of the word.
            pieces = token.rsplit('/', 1)
            tagged.append((pieces[0], pieces[1]))
        sentences.append(tagged)
    return sentences

In [3]:
# Directory holding the tagged corpus files.  Kept in one place so the
# notebook can be pointed at another checkout by editing a single line.
CORPUS_DIR = 'C:/Users/owcap/Documents/Learning/CS221/Project/corpus'

# Each set is a list of sentences; each sentence is a list of (word, tag) tuples.
train_set = read(CORPUS_DIR + '/train.txt')
test_set = read(CORPUS_DIR + '/test.txt')

In [4]:
# Feature extraction for the CRF model
def word2features(sent, i):
    """Build the feature dictionary for one token of a sentence.

    Only the token text (element 0 of the tuple) is used; the tag in
    element 1 is not turned into a feature.

    Args:
        sent: Sequence of ``(word, tag)`` tuples making up the sentence.
        i: Index of the token within ``sent``.

    Returns:
        A dict mapping feature names to values: the raw and lower-cased word,
        digit / title-case / upper-case flags, whether the word contains a
        hyphen, and whether the token is first or last in the sentence.
    """
    token = sent[i][0]

    return {
        'word': token,
        'word.lower()': token.lower(),
        'number': token.isdigit(),
        'word.istitle()': token.istitle(),
        'word.isupper()': token.isupper(),
        'has_hyphen': '-' in token,
        # Positional flags: sentence-initial / sentence-final token.
        'is_first': i == 0,
        'is_last': i == len(sent) - 1,
    }


def sent2features(sent):
    """Return the list of feature dicts for a sentence, one per token.

    Args:
        sent: Sequence of ``(word, tag)`` tuples.

    Returns:
        ``[word2features(sent, 0), word2features(sent, 1), ...]``.
    """
    return [word2features(sent, position) for position, _ in enumerate(sent)]


def sent2labels(sent):
    """Return the tag (element 1) of every token tuple in ``sent``, in order."""
    tags = []
    for pair in sent:
        tags.append(pair[1])
    return tags


def sent2tokens(sent):
    """Return the word (element 0) of every token tuple in ``sent``, in order."""
    words = []
    for pair in sent:
        words.append(pair[0])
    return words

In [5]:
# Per-sentence feature dicts (X) and tag sequences (y) for each split.
X_train = list(map(sent2features, train_set))
y_train = list(map(sent2labels, train_set))

X_test = list(map(sent2features, test_set))
y_test = list(map(sent2labels, test_set))

In [6]:
# CRF trained with L-BFGS; c1 and c2 are the L1 and L2 regularisation
# coefficients (see the sklearn-crfsuite CRF documentation).
crf = sklearn_crfsuite.CRF(algorithm='lbfgs', c1=0.1, c2=0.25)

In [7]:
# Train the CRF on the per-sentence feature dicts and their tag sequences.
crf.fit(X_train, y_train)



CRF(algorithm='lbfgs', c1=0.1, c2=0.25, keep_tempfiles=None)

In [None]:
# Save the model to 'crf.joblib' (path relative to the working directory) for GUI use.
dump(crf,'crf.joblib')

In [8]:
# Tags to report metrics for: every class the CRF learned except 'X'.
# Filtering (rather than list.remove) avoids a ValueError when 'X' does not
# occur among the learned classes.
labels = [label for label in crf.classes_ if label != 'X']

In [9]:
# Metrics on the train set: weighted F1 over `labels` and overall token accuracy.
ypred = crf.predict(X_train)
print('F1 score on the train set = {}\n'.format(metrics.flat_f1_score(y_train, ypred, average='weighted', labels=labels)))
print('Accuracy on the train set = {}\n'.format(metrics.flat_accuracy_score(y_train, ypred)))

# Row order for the per-tag classification report.  The tags here are plain
# names (NOUN, VERB, ...) with no B-/I- style prefix to strip, so a plain
# alphabetical sort gives a predictable, readable order.
sorted_labels = sorted(labels)

F1 score on the train set = 0.9635272620515002

Accuracy on the train set = 0.9633225458468176



In [10]:
# Row order for the per-tag reports: plain alphabetical, since the tags carry
# no B-/I- style prefix.  Defined before its first use in this cell.
sorted_labels = sorted(labels)

# Per-tag report on the TRAIN set; `ypred` still holds the train-set
# predictions computed in the previous cell at this point.
print('Train set classification report: \n\n{}'.format(metrics.flat_classification_report(
    y_train, ypred, labels=sorted_labels, digits=3
)))

# Metrics on the test set; `ypred` is rebound to the test-set predictions.
ypred = crf.predict(X_test)
print('F1 score on the test set = {}\n'.format(metrics.flat_f1_score(
    y_test, ypred, average='weighted', labels=labels)))
print('Accuracy on the test set = {}\n'.format(metrics.flat_accuracy_score(y_test, ypred)))

# Uncomment for the per-tag report on the test set:
# print('Test set classification report: \n\n{}'.format(metrics.flat_classification_report(y_test, ypred, labels=sorted_labels, digits=3)))

Train set classification report: 

              precision    recall  f1-score   support

        PART      0.962     1.000     0.980        25
       CCONJ      1.000     0.986     0.993        73
       SCONJ      0.733     0.759     0.746        58
         ADJ      0.946     0.951     0.949       185
         ADP      0.989     0.979     0.984       288
         ADV      0.941     0.929     0.935       154
        VERB      0.967     0.928     0.947       319
         DET      0.965     0.968     0.966       282
        NOUN      0.957     0.978     0.967       547
        PRON      0.994     0.953     0.973       171
       PROPN      0.926     0.980     0.952       153
         NUM      1.000     0.975     0.987        40
       PUNCT      0.997     0.997     0.997       366
         AUX      0.957     0.973     0.965       113
         SYM      1.000     1.000     1.000         3

   micro avg      0.963     0.964     0.963      2777
   macro avg      0.956     0.957     0.956  

  average, "true nor predicted", 'F-score is', len(true_sum)
