# CRF

Import pandas and NumPy, then load the data.

In [0]:
import pandas as pd
import numpy as np

# Read the latin1-encoded CSV and keep only the four columns used downstream.
data = (
    pd.read_csv('Dataset.csv', encoding='latin1')
    .loc[:, ['Sentence #', 'Word', 'POS', 'Tag']]
)

Make a list of all distinct words in the dataset (train and test set).

In [0]:
# Distinct words in order of first appearance. Series.unique() yields the same
# ordering on every run, whereas iterating a set of strings depends on the
# interpreter's per-process string hash seed and so changes between kernels.
words = data['Word'].unique().tolist()

In [0]:
# Vocabulary size: the number of entries in `words`.
n_words = len(words)

Sentence getter

In [0]:
class SentenceGetter(object):
    """Group a token-level DataFrame into sentences.

    ``data`` must provide the columns "Sentence #", "Word", "POS" and "Tag".
    Rows are grouped by "Sentence #" and every group is turned into a list of
    ``(word, pos, tag)`` tuples.

    Attributes:
        n_sent: number used to build the key of the sentence that the next
            ``get_next()`` call looks up; starts at 1.
        data: the DataFrame passed to the constructor.
        empty: initialised to False; not read or updated by this class.
        grouped: result of the group-by, indexed by the "Sentence #" value,
            one tuple list per sentence.
        sentences: all tuple lists, in the iteration order of ``grouped``.
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False

        def agg_func(s):
            # One (word, pos, tag) tuple per row of the sentence group.
            return list(zip(s["Word"].values.tolist(),
                            s["POS"].values.tolist(),
                            s["Tag"].values.tolist()))

        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = list(self.grouped)

    def get_next(self):
        """Return the next sentence, or None when there is none.

        Looks up the key ``"Sentence: <n_sent>"`` in ``grouped``. On success
        the counter is advanced and the tuple list is returned. A missing key
        returns None and leaves the counter untouched. Any other error is
        propagated instead of being reported as "no more sentences".
        """
        key = "Sentence: {}".format(self.n_sent)
        try:
            s = self.grouped[key]
        except KeyError:
            # No sentence with this label: signal exhaustion to the caller.
            return None
        self.n_sent += 1
        return s

In [0]:
# Group the token rows of `data` into per-sentence lists of (word, POS, tag) tuples.
getter = SentenceGetter(data)

In [0]:
# Fetch one sentence as a sanity check: on this first call the entry keyed
# "Sentence: 1" (None if that key is absent).
sent = getter.get_next()

In [0]:
# All sentences, each a list of (word, POS, tag) tuples.
sentences = getter.sentences


Get a word's features

In [0]:
def word2features(sent, i):
    """Build the feature dict for the token at position ``i`` of ``sent``.

    ``sent`` is a sequence of items whose first two fields are the word and
    its POS tag; both are converted with ``str()`` before use.

    The dict always holds the token's own features (bias, lower-cased form,
    2- and 3-character suffixes, case/digit flags, POS tag and its 2-character
    prefix). It then holds either the previous token's features under
    ``-1:`` keys or ``'BOS'`` when there is no previous token, and either the
    next token's features under ``+1:`` keys or ``'EOS'`` when there is no
    next token.
    """
    token = str(sent[i][0])
    pos = str(sent[i][1])

    feats = {
        'bias': 1.0,
        'word.lower()': token.lower(),
        'word[-3:]': token[-3:],
        'word[-2:]': token[-2:],
        'word.isupper()': token.isupper(),
        'word.istitle()': token.istitle(),
        'word.isdigit()': token.isdigit(),
        'postag': pos,
        'postag[:2]': pos[:2],
    }

    # Left context.
    if i <= 0:
        feats['BOS'] = True
    else:
        prev_token, prev_pos = str(sent[i - 1][0]), str(sent[i - 1][1])
        feats['-1:word.lower()'] = prev_token.lower()
        feats['-1:word.istitle()'] = prev_token.istitle()
        feats['-1:word.isupper()'] = prev_token.isupper()
        feats['-1:postag'] = prev_pos
        feats['-1:postag[:2]'] = prev_pos[:2]

    # Right context.
    if i >= len(sent) - 1:
        feats['EOS'] = True
    else:
        next_token, next_pos = str(sent[i + 1][0]), str(sent[i + 1][1])
        feats['+1:word.lower()'] = next_token.lower()
        feats['+1:word.istitle()'] = next_token.istitle()
        feats['+1:word.isupper()'] = next_token.isupper()
        feats['+1:postag'] = next_pos
        feats['+1:postag[:2]'] = next_pos[:2]

    return feats


def sent2features(sent):
    """Return one word2features() dict per position of ``sent``, in order."""
    rows = []
    for idx in range(len(sent)):
        rows.append(word2features(sent, idx))
    return rows

def sent2labels(sent):
    """Return the label (third field) of each (token, postag, label) triple."""
    labels = []
    for _token, _postag, tag in sent:
        labels.append(tag)
    return labels

def sent2tokens(sent):
    """Return the token (first field) of each (token, postag, label) triple."""
    tokens = []
    for word, _postag, _label in sent:
        tokens.append(word)
    return tokens

In [0]:
# Per-sentence inputs and targets, aligned index-for-index with `sentences`.
X = list(map(sent2features, sentences))   # feature dicts, one list per sentence
X1 = list(map(str, sentences))            # string form of each sentence's tuple list
y = list(map(sent2labels, sentences))     # label sequence of each sentence

The model

In [0]:
!pip install sklearn_crfsuite

In [0]:
!pip install seqeval

In [0]:
import os
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn_crfsuite import CRF
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report
from nervaluate import Evaluator

# Cross-Validate
# 5-fold stratified CV over sentences. Folds are taken in dataset order
# (shuffle=False). random_state is left unset because scikit-learn >= 0.24
# raises ValueError when a seed is given while shuffling is disabled.
skf = StratifiedKFold(5, shuffle=False)

oos_y = []        # label sequences of every held-out sentence
oos_pred = []     # predicted label sequences, aligned with oos_y
oos_x = []        # feature sequences of the held-out sentences
resultslist = []  # one nervaluate `results` dict per fold

# Not used in this cell; kept so the name still exists for any later cell.
label_encoder = LabelEncoder()

# Stratification target per sentence: 1 if it contains a 'B' label, else 0.
strata = [1 if 'B' in ya else 0 for ya in y]

for fold, (train, test) in enumerate(skf.split(X, strata), start=1):
    print(f"Fold #{fold}")

    # Index the Python lists directly: sentences have different lengths, and
    # np.array() on such ragged nested lists raises ValueError on NumPy >= 1.24.
    x_train = [X[i] for i in train]
    y_train = [y[i] for i in train]
    x_test = [X[i] for i in test]
    y_test = [y[i] for i in test]

    crf = CRF(algorithm='lbfgs',
              c1=10,
              c2=0.1,
              max_iterations=100,
              all_possible_transitions=False)

    crf.fit(x_train, y_train)

    pred = crf.predict(x_test)

    print(classification_report(y_test, pred))
    # NOTE(review): tags=[''] presumably matches labels that are bare
    # 'B'/'I'/'O' with no entity-type suffix - confirm against nervaluate.
    evaluator = Evaluator(y_test, pred, tags = [''], loader='list')
    results, results_per_tag = evaluator.evaluate()
    resultslist.append(results)

    # Build the oos prediction lists: extend (not append) keeps them flat,
    # one entry per held-out sentence, so no ragged concatenation is needed.
    oos_y.extend(y_test)
    oos_x.extend(x_test)
    oos_pred.extend(pred)


Results

In [0]:
# Calculate the standard deviation (SD) of the B and I scores across folds
# (fill the per-fold scores into the template below).

#np.std(np.array(['Fill in the scores']))

In [0]:
# Standard deviation, across the CV folds, of the 'partial' precision.
# np.std uses ddof=0 (population SD) by default.
# NOTE(review): the cell was headed "partial and exact" but only the 'partial'
# precision is computed - presumably the keys are swapped by hand; confirm.
partial_precisions = [fold_result['partial']['precision'] for fold_result in resultslist]
np.std(np.array(partial_precisions))

In [0]:
# seqeval report over the pooled out-of-fold labels and predictions of all folds.
print(classification_report(oos_y, oos_pred))

In [0]:
# nervaluate scoring of the pooled out-of-fold predictions; evaluate() is
# unpacked into overall results and per-tag results.
evaluator = Evaluator(oos_y, oos_pred, tags= [''], loader='list')
results, results_per_tag = evaluator.evaluate()

In [0]:
# Display the overall nervaluate results for the pooled predictions.
results