In [None]:
pip install sklearn_crfsuite

In [None]:
pip install datasets

In [3]:
from sklearn_crfsuite import CRF
from sklearn.metrics import classification_report
import random
import itertools
import numpy as np
import scipy
from sklearn import metrics
from sklearn_crfsuite.utils import flatten
from datasets import load_dataset
import warnings
warnings.filterwarnings('ignore')

### Load Dataset

In [None]:
dataset = load_dataset("conll2003")
train_set = dataset["train"]
validation_set = dataset["validation"]
test_set = dataset["test"]

In [5]:
# A function to convert digit labels to string labels
def convert_ner_tags(dataset):
  # Label mapping from integer to string
  label_map = {0: 'O', 1: 'PER', 2: 'B-PER', 3: 'I-PER', 4: 'B-ORG', 5: 'I-ORG', 6: 'B-LOC', 7: 'I-LOC', 8: 'B-MISC', 9: 'I-MISC'}
  list_sent = []
  list_labels = []
  for sent in dataset:
    words = []
    label = []
    for word, label_idx in zip(sent['tokens'], sent['ner_tags']):
        label.append(label_map[label_idx])
        words.append(word)
    list_sent.append(words)
    list_labels.append(label)
  return list_sent, list_labels

In [6]:
# Define the train, validation and test set
train_ner, train_label_ner = convert_ner_tags(train_set)
validation_ner, validation_label_ner  = convert_ner_tags(validation_set)
test_ner, test_label_ner = convert_ner_tags(test_set)

In [7]:
print('Train',len(train_ner),len(train_label_ner))
print('Test',len(test_ner),len(test_label_ner))
print('Validation',len(validation_ner),len(validation_label_ner))

Train 14041 14041
Test 3453 3453
Validation 3250 3250


In [8]:
# Increase size of train set
new_train = []
new_train.extend(train_ner)
new_train.extend(validation_ner)

new_label = []
new_label.extend(train_label_ner)
new_label.extend(validation_label_ner)

print('Full',len(new_train),len(new_label))

Full 17291 17291


### Train CRF Model

In [9]:
# Initialize the CRF model with class weighting
crf = CRF(algorithm='lbfgs', c1=0.25, c2=0.03, max_iterations=400, all_possible_states=True)
crf.fit(new_train, new_label)
labels = list(crf.classes_)

### Evaluation

In [10]:
y_pred = crf.predict(test_ner)
print(test_label_ner[0])
print(y_pred[0])

['O', 'O', 'I-ORG', 'O', 'O', 'O', 'O', 'PER', 'O', 'O', 'O', 'O']
['I-PER', 'O', 'I-ORG', 'O', 'I-PER', 'B-ORG', 'O', 'I-ORG', 'O', 'I-ORG', 'O', 'O']


In [11]:
# Calculate evaluation metrics
report = classification_report(flatten(test_label_ner), flatten(y_pred), digits=3)
print(report)

              precision    recall  f1-score   support

       B-LOC      0.280     0.128     0.176       257
      B-MISC      0.157     0.185     0.170       216
       B-ORG      0.250     0.166     0.200       835
       B-PER      0.519     0.556     0.537      1156
       I-LOC      0.212     0.068     0.103       702
       I-ORG      0.407     0.330     0.365      1668
       I-PER      0.390     0.184     0.250      1661
           O      0.910     0.965     0.937     38323
         PER      0.510     0.400     0.448      1617

    accuracy                          0.848     46435
   macro avg      0.404     0.331     0.354     46435
weighted avg      0.820     0.848     0.831     46435



In [12]:
from sklearn.model_selection import cross_val_score, cross_val_predict, RandomizedSearchCV

In [13]:
labels = list(filter(lambda a: a != 'O', labels))
print(labels)

['I-PER', 'I-LOC', 'PER', 'B-PER', 'I-ORG', 'B-ORG', 'B-MISC', 'B-LOC']


In [14]:
crf3 = CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring='f1_macro')

rs.fit(new_train, new_label)

AttributeError: ignored

In [69]:
full_set = []
full_set.extend(train_ner)
full_set.extend(validation_ner)
full_set.extend(test_ner)

full_label = []
full_label.extend(train_label_ner)
full_label.extend(validation_label_ner)
full_label.extend(test_label_ner)

In [70]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.base import clone

def randomized_search_cv(param_distributions, X_train, y_train,  n_iter, cv):
    best_score = None
    best_params = None
    it = 0
    for _ in range(n_iter):
        print("\n\n§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§ Iteration ", it, "§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§")
        ii=0
        for param_name, distribution in param_distributions.items():
            param_value = distribution.rvs()
            if ii == 0:
              c_one = param_value
            else:
              c_two = param_value
            ii += 1

        estimator = CRF(algorithm='lbfgs', c1= c_one, c2= c_two, max_iterations=300, all_possible_transitions=True)

        cv_results = []
        f = 0
        for train_index, test_index in cv.split(X_train):
            X_train_fold, X_val_fold = np.array(X_train)[train_index], np.array(X_train)[test_index]
            y_train_fold, y_val_fold = np.array(y_train)[train_index], np.array(y_train)[test_index]

            estimator.fit(X_train_fold, y_train_fold)
            # Evaluation
            y_pred = crf.predict(X_val_fold)
            
            # Calculate evaluation metrics for the current fold
            report = classification_report(flatten(y_val_fold), flatten(y_pred), digits =5)
            print("******************* Report of Fold", f+1, "*******************")
            print(report)
            # Extract precision, recall, and F1-score from the classification report
            metrics = report.split()[-4:]
            f1_scores = (float(metrics[2]))
            cv_results.append(f1_scores)
            print("F1_Score of Fold", f+1, " = ", f1_scores)
            f+=1

        mean_score = np.mean(cv_results)
        print("Mean of F1_Scores of iteration", it, "= ", mean_score, "\n")
        if best_score is None or mean_score > best_score:
            best_score = mean_score
            best_params = (c_one, c_two)
            print("#### Best C1 and C2: ", best_params, "####")
        it +=1

    estimator.fit(X_train, y_train)

    return estimator, best_params

In [71]:
params_space = { 'c1': scipy.stats.expon(scale=0.25), 'c2': scipy.stats.expon(scale=0.5)}

kfold = KFold(n_splits=6, shuffle=True)

trained_model, best_c1c2 = randomized_search_cv(params_space, full_set, full_label, n_iter=5, cv=kfold)
print(best_c1c2)



§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§ Iteration  0 §§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§
******************* Report of Fold 1 *******************
              precision    recall  f1-score   support

       B-LOC    0.27000   0.10037   0.14634       269
      B-MISC    0.30769   0.18293   0.22945       328
       B-ORG    0.23785   0.16368   0.19391       837
       B-PER    0.51150   0.53527   0.52311      1205
       I-LOC    0.28384   0.07567   0.11949       859
       I-ORG    0.40330   0.31334   0.35267      1717
       I-PER    0.35006   0.17824   0.23621      1526
           O    0.91069   0.96830   0.93861     41417
         PER    0.49102   0.38886   0.43401      1687

    accuracy                        0.85272     49845
   macro avg    0.41844   0.32296   0.35264     49845
weighted avg    0.82267   0.85272   0.83424     49845

F1_Score of Fold 1  =  0.83424
******************* Report of Fold 2 *******************
              precision    recall  f1-score   support

       B-L

In [72]:
best_crf = CRF(algorithm='lbfgs', c1= best_c1c2[0], c2= best_c1c2[1], max_iterations=300, all_possible_transitions=True)

In [73]:
y_pred = trained_model.predict(test_ner)
print(test_label_ner[0])
print(y_pred[0])
# Calculate evaluation metrics
report = classification_report(flatten(test_label_ner), flatten(y_pred), digits=3)
print(report)

['O', 'O', 'I-ORG', 'O', 'O', 'O', 'O', 'PER', 'O', 'O', 'O', 'O']
['I-PER', 'O', 'I-ORG', 'O', 'I-PER', 'B-ORG', 'O', 'I-ORG', 'O', 'I-ORG', 'O', 'O']
              precision    recall  f1-score   support

       B-LOC      0.323     0.121     0.176       257
      B-MISC      0.203     0.199     0.201       216
       B-ORG      0.254     0.169     0.203       835
       B-PER      0.528     0.556     0.542      1156
       I-LOC      0.266     0.077     0.119       702
       I-ORG      0.396     0.304     0.344      1668
       I-PER      0.400     0.203     0.269      1661
           O      0.908     0.966     0.936     38323
         PER      0.517     0.404     0.454      1617

    accuracy                          0.849     46435
   macro avg      0.422     0.333     0.360     46435
weighted avg      0.821     0.849     0.831     46435

