# CRF for multi-class uncertainty cues recognition

The notebook is based on [this tutorial](https://sklearn-crfsuite.readthedocs.io/en/latest/tutorial.html#let-s-use-conll-2002-data-to-build-a-ner-system).

In [1]:
import sklearn_crfsuite
import pickle
import pandas as pd

from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

# Load data

- The `train.pkl` and `test.pkl` files are downloadable from [here](https://1drv.ms/u/s!AvPkt_QxBozXk7BiazucDqZkVxLo6g?e=IisuM6) (OneDrive).
- Alternatively, you can download the full `szeged_fixed.pkl` corpus (from the same OneDrive folder) and split the data yourself.
- Make sure to place the data in the same location (`../data/train_dev_test/`) or edit the paths.

In [2]:
train = pd.read_pickle('../data/train_dev_test/train.pkl').fillna('')
test = pd.read_pickle('../data/train_dev_test/test.pkl').fillna('')

# Convert features to ```crfsuite``` format

In [3]:
def sent2features(df):
    return df.drop(['sentence_id', 'labels'], axis=1).to_dict(orient='records')

def sent2labels(df):
    return df.labels.to_list()

In [4]:
X_train = [lst for lst in train.groupby('sentence_id').apply(sent2features).to_list()]
y_train = [lst for lst in train.groupby('sentence_id').apply(sent2labels)]

In [5]:
X_test = [lst for lst in test.groupby('sentence_id').apply(sent2features).to_list()]
y_test = [lst for lst in test.groupby('sentence_id').apply(sent2labels)]

# Hyperparameter optimization

In [6]:
# define fixed parameters
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)

# define parameters to search
params_space = {
    'c1': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    'c2': [0.01, 0.02, 0.03, 0.05, 0.07, 0.09, 0.1, 0.2],
}

# metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score, average='macro')

# search
rs = RandomizedSearchCV(
    crf,
    params_space,
    cv=3,
    verbose=1,
    scoring=f1_scorer,
    n_iter=30,
    n_jobs=4,
)

rs.fit(X_train, y_train)

Fitting 3 folds for each of 30 candidates, totalling 90 fits
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed: 17.4min
[Parallel(n_jobs=4)]: Done  90 out of  90 | elapsed: 32.7min finished


RandomizedSearchCV(cv=3,
                   estimator=CRF(algorithm='lbfgs',
                                 all_possible_transitions=True,
                                 keep_tempfiles=None, max_iterations=100),
                   n_iter=30, n_jobs=4,
                   param_distributions={'c1': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6,
                                               0.7, 0.8, 0.9],
                                        'c2': [0.01, 0.02, 0.03, 0.05, 0.07,
                                               0.09, 0.1, 0.2]},
                   scoring=make_scorer(flat_f1_score, average=macro),
                   verbose=1)

In [7]:
# best results
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)

best params: {'c2': 0.05, 'c1': 0.7}
best CV score: 0.7980690350824143


# Evaluation (optimized params model)

In [8]:
crf = rs.best_estimator_
y_pred = crf.predict(X_test)

## Classification report

In [9]:
print(metrics.flat_classification_report(y_test, y_pred, digits=3))

              precision    recall  f1-score   support

           C      0.998     0.999     0.998    104937
           D      0.860     0.824     0.842       142
           E      0.872     0.776     0.821       624
           I      0.817     0.802     0.809       111
           N      0.785     0.593     0.675        86

    accuracy                          0.997    105900
   macro avg      0.866     0.799     0.829    105900
weighted avg      0.997     0.997     0.997    105900



# Save model

In [11]:
with open('../model/crf.pkl','wb') as f:
    pickle.dump(crf, f)