In [28]:
import sklearn_crfsuite
from sklearn_crfsuite import metrics
import numpy as np
import sklearn
import csv

In [29]:
def extract_sents_from_conll(inputfile):
    """Read a tab-separated token file and group its rows into sentences.

    Every non-empty row becomes a tuple of its columns. A row whose first
    column is the literal string "Sent_end" closes the current sentence, and
    that marker row is kept as the sentence's last element. No header
    handling is done here, so a header row ends up as the first element of
    the first sentence. Rows that follow the last "Sent_end" marker are not
    returned.

    Args:
        inputfile: path to a UTF-8 encoded, tab-delimited file.

    Returns:
        A list of sentences, each a list of row tuples.
    """
    sents = []
    current_sent = []
    # The context manager closes the handle (a bare open() never did);
    # newline='' is the mode the csv module documents for its input files.
    with open(inputfile, encoding="utf-8", newline='') as infile:
        # NOTE(review): csv.reader uses its default quoting rules here, so a
        # token consisting of a double quote may be parsed as a field
        # delimiter - confirm how the data files were written.
        for row in csv.reader(infile, delimiter='\t'):
            if not row:
                # Blank line: csv yields an empty list and row[0] would raise
                # IndexError, so skip it instead of crashing.
                continue
            current_sent.append(tuple(row))
            # note that this is a simplification that works well for this particular data, in other situations, you may need to do more advanced preprocessing to identify sentence boundaries
            if row[0] == "Sent_end":
                sents.append(current_sent)
                current_sent = []
    return sents


In [30]:
# Smoke check: parse the toy training file into sentences.
sents = extract_sents_from_conll("Toy_data_train.tsv")

# Show the first sentence as a list of row tuples.
print(sents[0])

[('Word', 'AR_label'), ('The', 'O'), ('Ninth', 'O'), ('Circle', 'O'), (':', 'O'), ('The', 'O'), ('Hellish', 'O'), ('View', 'O'), ('from', 'O'), ('Inside', 'O'), ('the', 'O'), ('Beltway', 'O'), (',', 'O'), ('#', 'O'), ('2', 'O'), ('.', 'O'), ('Sent_end', 'O')]


In [31]:
def sent2tokens(sent):
    """Return the tokens of a sentence, dropping the labels.

    Args:
        sent: iterable of (token, label) pairs.

    Returns:
        A list with the first element of every pair, in order.
    """
    tokens = []
    for word, _tag in sent:
        tokens.append(word)
    return tokens


In [32]:
# Smoke check: token column of the first toy sentence.
test =  sent2tokens(sents[0])

print(test)

['Word', 'The', 'Ninth', 'Circle', ':', 'The', 'Hellish', 'View', 'from', 'Inside', 'the', 'Beltway', ',', '#', '2', '.', 'Sent_end']


In [33]:
def sent2labels(sent):
    """Return the labels of a sentence, dropping the tokens.

    Args:
        sent: iterable of (token, label) pairs.

    Returns:
        A list with the second element of every pair, in order.
    """
    tags = []
    for _word, tag in sent:
        tags.append(tag)
    return tags

In [34]:
# Smoke check: label column of the first toy sentence.
test2 = sent2labels(sents[0])

print(test2)

['AR_label', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [35]:
def token2features(sent, i):
    """Build the feature dictionary for the token at position ``i``.

    Args:
        sent: sequence of rows whose first element is the token string.
        i: index of the token inside ``sent``.

    Returns:
        A dict with a constant 'bias' of 1.0 and the lower-cased 'token',
        plus 'BOS': True for the first position and 'EOS': True for the
        last position of the sentence.
    """
    token = sent[i][0]

    features = {
        'bias': 1.0,
        'token': token.lower()
    }

    if i == 0:
        features['BOS'] = True

    # Checked independently of BOS (not `elif`): in a one-token sentence the
    # same position is both the beginning and the end.
    if i == len(sent) - 1:
        features['EOS'] = True

    return features


In [36]:
# Smoke check: features of the first row of the first toy sentence.
features= token2features(sents[0], i=0)

print(features)
# Confirm that a single token is represented as a plain dict.
print(type(features))

{'bias': 1.0, 'token': 'word', 'BOS': True}
<class 'dict'>


In [37]:
def sent2features(sent):
    """Return one feature dict per position of ``sent``, in order.

    Each dict is produced by ``token2features(sent, position)``.
    """
    feature_dicts = []
    for position in range(len(sent)):
        feature_dicts.append(token2features(sent, position))
    return feature_dicts

In [38]:
# Smoke check: feature dicts for every row of the first toy sentence.
test1 =sent2features(sents[0])

# Print the raw sentence first so it can be compared with its features.
print(sents[0])

print(test1)

[('Word', 'AR_label'), ('The', 'O'), ('Ninth', 'O'), ('Circle', 'O'), (':', 'O'), ('The', 'O'), ('Hellish', 'O'), ('View', 'O'), ('from', 'O'), ('Inside', 'O'), ('the', 'O'), ('Beltway', 'O'), (',', 'O'), ('#', 'O'), ('2', 'O'), ('.', 'O'), ('Sent_end', 'O')]
[{'bias': 1.0, 'token': 'word', 'BOS': True}, {'bias': 1.0, 'token': 'the'}, {'bias': 1.0, 'token': 'ninth'}, {'bias': 1.0, 'token': 'circle'}, {'bias': 1.0, 'token': ':'}, {'bias': 1.0, 'token': 'the'}, {'bias': 1.0, 'token': 'hellish'}, {'bias': 1.0, 'token': 'view'}, {'bias': 1.0, 'token': 'from'}, {'bias': 1.0, 'token': 'inside'}, {'bias': 1.0, 'token': 'the'}, {'bias': 1.0, 'token': 'beltway'}, {'bias': 1.0, 'token': ','}, {'bias': 1.0, 'token': '#'}, {'bias': 1.0, 'token': '2'}, {'bias': 1.0, 'token': '.'}, {'bias': 1.0, 'token': 'sent_end', 'EOS': True}]


In [39]:
def train_crf_model(X_train, y_train):
    """Fit a sklearn-crfsuite CRF on the given training sequences.

    Args:
        X_train: list of sentences, each a list of feature dicts.
        y_train: list of sentences, each a list of label strings.

    Returns:
        The fitted ``sklearn_crfsuite.CRF`` instance.
    """
    # Fixed baseline configuration; values are passed to the CRF unchanged.
    hyperparameters = {
        'algorithm': 'lbfgs',
        'c1': 0.1,
        'c2': 0.1,
        'max_iterations': 100,
        'all_possible_transitions': True,
    }
    model = sklearn_crfsuite.CRF(**hyperparameters)
    model.fit(X_train, y_train)
    return model


In [40]:
def create_crf_model(trainingfile):
    """Load a training file, featurize it and return a CRF trained on it.

    Args:
        trainingfile: path handed to ``extract_sents_from_conll``.

    Returns:
        The model returned by ``train_crf_model``.
    """
    sentences = extract_sents_from_conll(trainingfile)
    # Features first, then gold labels, one entry per sentence.
    X_train = list(map(sent2features, sentences))
    y_train = list(map(sent2labels, sentences))
    return train_crf_model(X_train, y_train)


In [41]:
def run_crf_model(crf, evaluationfile):
    """Apply a trained CRF to an evaluation file.

    Args:
        crf: model object exposing ``predict``.
        evaluationfile: path handed to ``extract_sents_from_conll``.

    Returns:
        A tuple ``(y_pred, X_test, y_test)``: the predicted labels, the
        feature dicts they were predicted from, and the gold labels.
    """
    sentences = extract_sents_from_conll(evaluationfile)
    X_test = list(map(sent2features, sentences))
    y_test = list(map(sent2labels, sentences))
    predictions = crf.predict(X_test)
    return predictions, X_test, y_test


In [42]:
def write_out_evaluation(eval_data, pred_labels, outputfile):
    """Write one ``token<TAB>predicted_label`` line per token to ``outputfile``.

    Args:
        eval_data: list of sentences, each a list of feature dicts; the
            value stored under the 'token' key is what gets written.
        pred_labels: list of sentences, each a list of predicted label
            strings, aligned with ``eval_data``.
        outputfile: path of the UTF-8 text file to create or overwrite.
    """
    # The context manager guarantees the file is flushed and closed; the
    # handle used to be left open when the function returned.
    with open(outputfile, 'w', encoding="utf-8") as outfile:
        for evalsents, predsents in zip(eval_data, pred_labels):
            for data, pred in zip(evalsents, predsents):
                # str() keeps the write working even if 'token' is missing
                # (the line then starts with the text "None").
                token = str(data.get('token'))
                outfile.write(token + "\t" + pred + "\n")

In [43]:
def run_and_evaluate_crf_model(trainingfile, evaluationfile, outputfile):
    """Train a CRF, predict on the evaluation file, save and score the result.

    Args:
        trainingfile: path of the file the model is trained on.
        evaluationfile: path of the file the model is evaluated on.
        outputfile: path the token/prediction pairs are written to.

    Prints a per-label classification report followed by accuracy and
    weighted precision, recall and F1. Returns nothing.
    """
    crf = create_crf_model(trainingfile)
    # Leave 'O' and 'AR_label' out of the per-label report. Filtering (rather
    # than list.remove) does not raise ValueError when the model never saw
    # one of them.
    labels = [label for label in crf.classes_ if label not in ('O', 'AR_label')]
    y_pred, X_test, y_test = run_crf_model(crf, evaluationfile)
    write_out_evaluation(X_test, y_pred, outputfile)
    print('The predictions are written on the output file.')
    print(metrics.flat_classification_report(y_test, y_pred, labels=labels, digits=4))
    # NOTE(review): the scores below are called without `labels=`, so unlike
    # the report above they are not restricted to the filtered label list.
    print('Accuracy score for sequence items')
    print(metrics.flat_accuracy_score(y_test, y_pred))
    print('Precision score for sequence items')
    print(metrics.flat_precision_score(y_test, y_pred, average='weighted'))
    print('Recall score for sequence items')
    print(metrics.flat_recall_score(y_test, y_pred, average='weighted'))
    print('F1 score score for sequence items')
    print(metrics.flat_f1_score(y_test, y_pred, average='weighted'))

In [44]:
# Paths for the full experiment, given as bare file names (resolved against
# the current working directory): training data, evaluation data, and the
# file the predictions are written to.
trainingfile = "polnear_withBIO_train.tsv"
evaluationfile = "polnear_withBIO_dev.tsv"
outputfile = "polnear_output_CRF_baseline.tsv"

In [45]:
# Train, predict, write the predictions and print the evaluation scores.
run_and_evaluate_crf_model(trainingfile, evaluationfile, outputfile)

The predictions are written on the output file.




              precision    recall  f1-score   support

    B-SOURCE     0.7425    0.4589    0.5673      1948
       B-CUE     0.8055    0.5237    0.6348      2190
   B-CONTENT     0.5933    0.3958    0.4748      2193
   I-CONTENT     0.7173    0.7389    0.7279     36881
       I-CUE     0.4045    0.2096    0.2761      1808
    I-SOURCE     0.6111    0.4393    0.5111      4070

   micro avg     0.7036    0.6585    0.6803     49090
   macro avg     0.6457    0.4611    0.5320     49090
weighted avg     0.6964    0.6585    0.6715     49090

Accuracy score for sequence items
0.6814877471712454
Precision score for sequence items


  _warn_prf(average, modifier, msg_start, len(result))


0.6790784854694446
Recall score for sequence items
0.6814877471712454
F1 score score for sequence items
0.6761123604225054
