# CRF
In this lab session, you are going to train and test a linear-chain CRF model. Before starting, specify "python 2" in the environment parameters.

In [0]:
!pip install python-crfsuite

In [0]:
from itertools import chain
import nltk
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
import sklearn
import pycrfsuite

print(sklearn.__version__)

# Let's use CoNLL 2002 data to build a NER system

CoNLL2002 corpus is available in NLTK. We use Spanish data.

In [0]:
nltk.download('conll2002')
nltk.corpus.conll2002.fileids()

In [0]:
%%time
train_sents = list(nltk.corpus.conll2002.iob_sents('esp.train'))
test_sents = list(nltk.corpus.conll2002.iob_sents('esp.testb'))

Data format:

In [0]:
train_sents[0]

## Features

Next, define some features. In this example we use word identity, word suffix, word shape and word POS tag; also, some information from nearby words is used. 

This makes a simple baseline, but you certainly can add and remove some features to get (much?) better results - experiment with it.

In [0]:
def word2features(sent, i):
    """Describe token ``i`` of ``sent`` as a list of feature strings.

    Each item of ``sent`` is indexed as ``item[0]`` (the word) and
    ``item[1]`` (its POS tag).  The returned list contains, in order:

    * a constant ``'bias'`` feature;
    * features of the token itself: lower-cased form, last three and last two
      characters, upper-case / title-case / digit flags, the POS tag and its
      first two characters;
    * for the previous token (prefix ``-1:``) and then the next token (prefix
      ``+1:``): lower-cased form, title-case and upper-case flags, the POS tag
      and its first two characters.  When there is no such neighbour the
      single marker ``'BOS'`` (no previous token) or ``'EOS'`` (no next token)
      is emitted instead.
    """
    token = sent[i][0]
    tag = sent[i][1]

    # Features of the current token.
    feats = ['bias']
    feats.append('word.lower=' + token.lower())
    feats.append('word[-3:]=' + token[-3:])
    feats.append('word[-2:]=' + token[-2:])
    feats.append('word.isupper=%s' % token.isupper())
    feats.append('word.istitle=%s' % token.istitle())
    feats.append('word.isdigit=%s' % token.isdigit())
    feats.append('postag=' + tag)
    feats.append('postag[:2]=' + tag[:2])

    # (feature prefix, neighbour index, neighbour exists?, boundary marker)
    neighbours = (
        ('-1:', i - 1, i > 0, 'BOS'),
        ('+1:', i + 1, i < len(sent) - 1, 'EOS'),
    )
    for prefix, j, exists, boundary in neighbours:
        if not exists:
            feats.append(boundary)
            continue
        near_token = sent[j][0]
        near_tag = sent[j][1]
        feats.append(prefix + 'word.lower=' + near_token.lower())
        feats.append(prefix + 'word.istitle=%s' % near_token.istitle())
        feats.append(prefix + 'word.isupper=%s' % near_token.isupper())
        feats.append(prefix + 'postag=' + near_tag)
        feats.append(prefix + 'postag[:2]=' + near_tag[:2])

    return feats


def sent2features(sent):
    """Return one feature list per token of ``sent``, in token order.

    Each entry is the result of ``word2features(sent, position)``.
    """
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

def sent2labels(sent):
    """Return the label (third field) of every ``(token, postag, label)`` item."""
    labels = []
    for _token, _postag, label in sent:
        labels.append(label)
    return labels

def sent2tokens(sent):
    """Return the token (first field) of every ``(token, postag, label)`` item."""
    tokens = []
    for token, _postag, _label in sent:
        tokens.append(token)
    return tokens

This is what word2features extracts:

In [0]:
sent2features(train_sents[0])[0]

Extract the features from the data:

In [0]:
%%time
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

## Train the model

To train the model, we create pycrfsuite.Trainer, load the training data and call 'train' method. 
First, create pycrfsuite.Trainer and load the training data to CRFsuite:

In [0]:
%%time
trainer = pycrfsuite.Trainer(verbose=False)

for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

Set training parameters. We will use L-BFGS training algorithm (it is default) with Elastic Net (L1 + L2) regularization.

In [0]:
trainer.set_params({
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 50,  # stop earlier

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})

Possible parameters for the default training algorithm:

In [0]:
trainer.params()

Train the model:

In [0]:
%%time
trainer.train('conll2002-esp.crfsuite')

trainer.train saves model to a file:

In [0]:
!ls -lh ./conll2002-esp.crfsuite

We can also get information about the final state of the model by looking at the trainer's logparser. If we had tagged our input data using the optional `group` argument of `append`, and had used the optional `holdout` argument of `train`, there would be information about the trainer's performance on the holdout set as well. 

In [0]:
trainer.logparser.last_iteration

We can also get this information for every step using trainer.logparser.iterations

In [0]:
# Number of logged training iterations, followed by the record of the last
# one.  The text is formatted into a single string so the line parses and
# prints identically with the Python 2 print statement and the Python 3
# print function (the rest of the notebook already uses the print(...) form).
print("%d %s" % (len(trainer.logparser.iterations),
                 trainer.logparser.iterations[-1]))

## Make predictions

To use the trained model, create pycrfsuite.Tagger, open the model and use "tag" method:

In [0]:
tagger = pycrfsuite.Tagger()
tagger.open('conll2002-esp.crfsuite')

Let's tag a sentence to see how it works:

In [0]:
# Tag the first test sentence and show the prediction above the gold labels.
example_sent = test_sents[0]
print(' '.join(sent2tokens(example_sent)))

# Build every line as ONE string: with the Python 2 kernel this notebook asks
# for, print("a", b) is a print statement applied to a tuple and would display
# ('Predicted:', '...') instead of the two values.  The padding keeps both
# label rows aligned.
print("Predicted: " + ' '.join(tagger.tag(sent2features(example_sent))))
print("Correct:   " + ' '.join(sent2labels(example_sent)))

## Evaluate the model

In [0]:
def bio_classification_report(y_true, y_pred):
    """
    Build a token-level classification report for BIO-encoded sequences.

    ``y_true`` and ``y_pred`` are lists of label sequences.  Both are
    flattened, encoded with a ``LabelBinarizer`` fitted on the true labels,
    and passed to ``classification_report`` restricted to every class except
    "O".  Rows are ordered by the part of the tag after the first "-" and
    then by the part before it.

    Note that it requires scikit-learn 0.15+ (or a version from github master)
    to calculate averages properly!
    """
    def entity_then_prefix(tag):
        # 'B-LOC' -> ['LOC', 'B'], so tags of one entity type stay together.
        return tag.split('-', 1)[::-1]

    binarizer = LabelBinarizer()
    flat_true = list(chain.from_iterable(y_true))
    flat_pred = list(chain.from_iterable(y_pred))
    true_matrix = binarizer.fit_transform(flat_true)
    pred_matrix = binarizer.transform(flat_pred)

    # Column index of each class in the binarized matrices.
    column_of = {}
    for column, cls in enumerate(binarizer.classes_):
        column_of[cls] = column

    # Every class except the "outside" label, in report order.
    entity_tags = list(set(binarizer.classes_) - {'O'})
    entity_tags.sort(key=entity_then_prefix)

    report_columns = [column_of[tag] for tag in entity_tags]
    return classification_report(
        true_matrix,
        pred_matrix,
        labels=report_columns,
        target_names=entity_tags,
    )

Predict entity labels for all sentences in our testing set ('testb' Spanish data):

In [0]:
%%time
y_pred = [tagger.tag(xseq) for xseq in X_test]

..and check the result. Note this report is not comparable to results in CONLL2002 papers because here we check per-token results (not per-entity). Per-entity numbers will be worse.  

In [0]:
print(bio_classification_report(y_test, y_pred))

## Let's check what the classifier learned

In [0]:
from collections import Counter
info = tagger.info()

def print_transitions(trans_features):
    """Print one ``from -> to weight`` line per transition feature.

    ``trans_features`` is an iterable of ``((label_from, label_to), weight)``
    pairs; labels are left-aligned in fixed-width columns and the weight is
    shown with six decimals.
    """
    row_template = "%-6s -> %-7s %0.6f"
    for transition, weight in trans_features:
        source_label, target_label = transition
        print(row_template % (source_label, target_label, weight))

print("Top likely transitions:")
print_transitions(Counter(info.transitions).most_common(15))

print("\nTop unlikely transitions:")
print_transitions(Counter(info.transitions).most_common()[-15:])

We can see that, for example, it is very likely that the beginning of an organization name (B-ORG) will be followed by a token inside organization name (I-ORG), but transitions to I-ORG from tokens with other labels are penalized. Also note I-PER -> B-LOC transition: a positive weight means that model thinks that a person name is often followed by a location.

Check the state features:

In [0]:
def print_state_features(state_features):
    """Print one ``weight label attribute`` line per state feature.

    ``state_features`` is an iterable of ``((attr, label), weight)`` pairs;
    the weight is shown with six decimals and the label is left-aligned in a
    fixed-width column.
    """
    row_template = "%0.6f %-6s %s"
    for feature, weight in state_features:
        attribute, tag = feature
        print(row_template % (weight, tag, attribute))

print("Top positive:")
print_state_features(Counter(info.state_features).most_common(20))

print("\nTop negative:")
print_state_features(Counter(info.state_features).most_common()[-20:])

Some observations:

* **8.743642 B-ORG  word.lower=psoe-progresistas** - the model remembered names of some entities - maybe it is overfit, or maybe our features are not adequate, or maybe remembering is indeed helpful;
* **5.195429 I-LOC  -1:word.lower=calle**: "calle" means "street" in Spanish; the model learns that if the previous word was "calle" then the token is likely part of a location;
* **-3.529449 O      word.isupper=True**, **-2.913103 O      word.istitle=True**: UPPERCASED or TitleCased words are likely entities of some kind;
* **-2.585756 O      postag=NP** - proper nouns (NP is a proper noun in the Spanish tagset) are often entities.

# Coding your own CRF inference routine

To help you, we use a library named flexcrf that provides some inference routines you will use to test your Viterbi algorithm. The following command downloads and unzips it.

In [0]:
!if [[ ! -d flexcrf_tp ]]; then wget http://stelat.eu/wp-content/uploads/2020/03/flexcrf_tp.zip && unzip flexcrf_tp.zip;fi

Here, we import some functions


In [0]:
import cPickle as pickle

import numpy as np
from pycrfsuite import Tagger
from flexcrf_tp.models.linear_chain import (_feat_fun_values,
                                            _compute_all_potentials,
                                            _forward_score,
                                            _backward_score,
                                            _partition_fun_value,
                                            _posterior_score)

from flexcrf_tp.crfsuite2flexcrf import convert_data_to_flexcrf

# Viterbi decoder
Now you can complete the viterbi_decoder function:

In [0]:


# -- Define viterbi_decoder here:

def viterbi_decoder(m_xy, n=None, log_version=True):
    r"""
    Performs MAP inference, determining $y = \argmax_y P(y|x)$, using the
    Viterbi algorithm.

    This is an exercise scaffold: the temporary arrays are allocated and the
    first row of scores is initialised, but the score recursion and the
    backtracking step are left to be written.  Until both placeholders are
    filled in, the returned `y_pred` is the uninitialised array created by
    `np.empty`.

    Parameters
    ----------
    m_xy : ndarray, shape (n_obs, n_labels, n_labels)
        Values of log-potentials ($\log M_i(y_{i-1}, y_i, x)$)
        computed based on feature functions f_xy and/or user-defined potentials
        `psi_xy`. At t=0, m_xy[0, 0, :] contains values of $\log M_1(y_0, y_1)$
        with $y_0$ the fixed initial state.

    n : integer, default=None
        Time position up to which to decode the optimal sequence; if not
        specified (default) the score is computed for the whole sequence.

    log_version : bool, default=True
        Not read by the scaffold code in this function.
        NOTE(review): presumably indicates that `m_xy` holds log-potentials
        (so scores add rather than multiply) -- confirm against flexcrf.

    Returns
    -------
    y_pred : ndarray, shape (n_obs,)
        Predicted optimal sequence of labels.

    TODO: Cythonise this function for more efficiency.
    """

    # Default: decode the whole sequence.
    if n is None:
        n = m_xy.shape[0]

    # Here we provide the temporary variables required by the viterbi algorithm.
    n_labels = m_xy.shape[2]
    # Output label sequence; uninitialised until the backtracking step fills it.
    y_pred = np.empty(n, dtype=int)
    # Score table, one row per position and one column per label; only row 0
    # is initialised here, from the potentials out of the fixed initial state.
    delta = np.empty((n, n_labels))
    delta[0, :] = m_xy[0, 0, :]
    # Integer table of the same shape, intended for the backtracking pointers.
    btrack = np.empty((n, n_labels), dtype=int)

    # Viterbi scores
    #YOUR CODE HERE
    
    # Backtracking
    #YOUR CODE HERE
    
    return y_pred

# Test your Viterbi decoder
Check that your Viterbi decoder produces the same output as pycrfsuite.

In [0]:


# -- Load data and crfsuite model and convert them-------------------------

RECREATE = True  # set to True to recreate flexcrf data with new model

CRFSUITE_MODEL_FILE = './conll2002-esp.crfsuite'
CRFSUITE_MODEL_INFO_FILE = './conll2002-esp.crfsuite-model-info.dump'

CRFSUITE_TEST_DATA_FILE = './conll2002-esp_crfsuite-test-data.dump'
FLEXCRF_TEST_DATA_FILE = './conll2002-esp_flexcrf-test-data.dump'

# crfsuite model
tagger = Tagger()
tagger.open(CRFSUITE_MODEL_FILE)

# try/finally guarantees the tagger is closed even if the comparison below
# raises (for instance while viterbi_decoder is still being written).
try:
    model = tagger.info()
    # Alternative: load a previously dumped model description instead, i.e.
    # pickle.load the file CRFSUITE_MODEL_INFO_FILE (opened in 'rb' mode).
    data = {'X': X_test, 'y': y_test}
    # Alternative: load previously dumped test data instead, i.e.
    # pickle.load the file CRFSUITE_TEST_DATA_FILE (opened in 'rb' mode).

    if RECREATE:
        dataset, thetas = convert_data_to_flexcrf(data, model, n_seq=3)
        # 'with' closes the dump file so its content is flushed to disk.
        with open(FLEXCRF_TEST_DATA_FILE, 'wb') as dump_file:
            pickle.dump({'dataset': dataset, 'thetas': thetas}, dump_file)
    else:
        # NOTE: unpickling can execute arbitrary code; only load a dump that
        # was written by this notebook.  Pickles are read in binary mode.
        with open(FLEXCRF_TEST_DATA_FILE, 'rb') as dump_file:
            dd = pickle.load(dump_file)
        dataset = dd['dataset']
        thetas = dd['thetas']

    # -- Start classification ------------------------------------------------

    # Every print below emits one pre-formatted string, so the output is the
    # same with the Python 2 print statement and the Python 3 print function.
    for seq in range(len(dataset)):

        # -- with pycrfsuite
        s_ = tagger.tag(data['X'][seq])
        # Map the predicted label names to the integer ids listed in the model.
        y_ = np.array([int(model.labels[s]) for s in s_])
        prob_ = tagger.probability(s_)

        print("\n-- With crfsuite:")
        print("labels:\n%s \n%s" % (s_, y_))
        print("probability:\t %f" % prob_)

        # -- with flexcrf
        f_xy, y = dataset[seq]

        theta = thetas[seq]

        m_xy, f_m_xy = _compute_all_potentials(f_xy, theta)

        y_pred = viterbi_decoder(m_xy)

        alpha = _forward_score(m_xy)
        #beta = _backward_score(m_xy)
        z_x = _partition_fun_value(alpha)

        # compare flexcrf prob to crfsuite prob: score crfsuite's own label
        # sequence (y_) with the flexcrf routines
        f_x = _feat_fun_values(f_xy, y_, with_f_x_sum=False)
        prob0 = np.exp(_posterior_score(f_x=f_x, theta=theta, z_x=z_x))
        print("flexcrf prob:\t %f" % prob0)

        # same computation for the sequence returned by viterbi_decoder
        f_x = _feat_fun_values(f_xy, y_pred, with_f_x_sum=False)
        prob = np.exp(_posterior_score(f_x=f_x, theta=theta, z_x=z_x))

        print("-- With flexcrf:")
        print("labels:\n%s" % (y_pred,))
        # array_equal compares shapes as well as values and always yields a
        # single boolean.
        print("equal predictions:  %s" % np.array_equal(y_pred, y_))
        print("probability:\t %f" % prob)
        print("delta:\t %f" % abs(prob - prob_))
finally:
    tagger.close()