In [80]:
from sklearn_crfsuite import CRF
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
from sklearn.metrics import classification_report
from sklearn.utils import class_weight
import random
import itertools
import numpy as np
import scipy
import pandas as pd
from sklearn import metrics
from sklearn_crfsuite.utils import flatten
from datasets import load_dataset
import warnings
warnings.filterwarnings('ignore')

### Load Dataset

In [81]:
dataset = load_dataset("conll2003")
train_set = dataset["train"]
validation_set = dataset["validation"]
test_set = dataset["test"]

Found cached dataset conll2003 (C:/Users/vahid/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98)


  0%|          | 0/3 [00:00<?, ?it/s]

In [82]:
# A function to convert digit labels to string labels
def convert_ner_tags(dataset):
  # Label mapping from integer to string
  label_map = {0: 'O', 1: 'PER', 2: 'B-PER', 3: 'I-PER', 4: 'B-ORG', 5: 'I-ORG', 6: 'B-LOC', 7: 'I-LOC', 8: 'B-MISC', 9: 'I-MISC'}
  list_sent = []
  list_labels = []
  for sent in dataset:
    words = []
    label = []
    for word, label_idx in zip(sent['tokens'], sent['ner_tags']):
        label.append(label_map[label_idx])
        words.append(word)
    list_sent.append(words)
    list_labels.append(label)
  return list_sent, list_labels

In [83]:
# Define the train, validation and test set
train_ner, train_label_ner = convert_ner_tags(train_set)
validation_ner, validation_label_ner  = convert_ner_tags(validation_set)
test_ner, test_label_ner = convert_ner_tags(test_set)

In [84]:
print('Train',len(train_ner),len(train_label_ner))
print('Test',len(test_ner),len(test_label_ner))
print('Validation',len(validation_ner),len(validation_label_ner))

Train 14041 14041
Test 3453 3453
Validation 3250 3250


In [85]:
# Increase size of train set
new_train = []
new_train.extend(train_ner)
new_train.extend(validation_ner)

new_label = []
new_label.extend(train_label_ner)
new_label.extend(validation_label_ner)

print('Full',len(new_train),len(new_label))

Full 17291 17291


### Weights of Classes

In [86]:
# Compute class weights
from sklearn.utils.class_weight import compute_class_weight
import pandas as pd

class_weights = compute_class_weight(class_weight = "balanced", classes= np.unique(flatten(train_label_ner)), y= flatten(train_label_ner))

# Convert the class weights to a dictionary
class_weights_dict = dict(zip(np.unique(flatten(train_label_ner)), class_weights))
pd_weights = pd.DataFrame.from_dict(class_weights_dict, "index", columns=['Weight'])
display(pd_weights)

Unnamed: 0,Weight
B-LOC,19.554499
B-MISC,19.58836
B-ORG,6.108141
B-PER,4.996589
I-LOC,6.580732
I-ORG,3.168705
I-PER,3.579268
O,0.133417
PER,3.427963


In [87]:
import random

# A function to reduce the O labels
def reduce(xx, yy):
  x = np.copy(xx)
  y = np.copy(yy)

# Iterate over each sublist in x and its corresponding sublist in y
  for i, (sublist_x, sublist_y) in enumerate(zip(x, y)):
      indices_to_delete = []
      
      for j, label in enumerate(sublist_y):
          if label == "O":
              indices_to_delete.append(j)

      num_deletions = random.randint(0, len(indices_to_delete))
      indices_to_delete = indices_to_delete[:num_deletions]
      for rem, index in enumerate(indices_to_delete):
          del sublist_x[index - rem]
          del sublist_y[index - rem]
      
      # Update the modified sublists in x and y
      x[i] = sublist_x
      y[i] = sublist_y

  return x, y

In [90]:
# Initialize the CRF model with class weighting
crf = CRF(algorithm='lbfgs', c1=0.1, c2=1, max_iterations=300, all_possible_states=True)
X, Y = reduce(new_train, new_label)
print(X[0])
print(Y[0])

# Fit the CRF model on the modified training data
crf.fit(X, Y)
print("done")

y_pred = crf.predict(test_ner)
print(test_label_ner[0])
print(y_pred[0])

# Calculate evaluation metrics
report = classification_report(flatten(test_label_ner), flatten(y_pred), digits=3)
print(report)

['EU', 'German', 'British']
['I-PER', 'I-LOC', 'I-LOC']
done
['O', 'O', 'I-ORG', 'O', 'O', 'O', 'O', 'PER', 'O', 'O', 'O', 'O']
['I-PER', 'O', 'I-ORG', 'I-LOC', 'B-MISC', 'B-MISC', 'O', 'I-PER', 'B-ORG', 'I-PER', 'I-PER', 'O']
              precision    recall  f1-score   support

       B-LOC      0.269     0.276     0.273       257
      B-MISC      0.190     0.421     0.262       216
       B-ORG      0.312     0.304     0.308       835
       B-PER      0.490     0.596     0.538      1156
       I-LOC      0.220     0.164     0.188       702
       I-ORG      0.296     0.380     0.333      1668
       I-PER      0.281     0.267     0.274      1661
           O      0.948     0.931     0.940     38323
         PER      0.456     0.447     0.451      1617

    accuracy                          0.834     46435
   macro avg      0.385     0.421     0.396     46435
weighted avg      0.843     0.834     0.838     46435

