# NERC task

# CRF

In [34]:
import pandas as pd
import sklearn_crfsuite
from sklearn_crfsuite import metrics

## Load the data

In [35]:
train_data = pd.read_csv("vrije-project-TM/data/Training_data/kaggle/ner_dataset.csv", encoding="latin1")
test_data = pd.read_csv("vrije-project-TM/data/NER-test.tsv", sep="\t", encoding="latin1")

## Preprocess the train data

In [36]:
train_data = train_data.ffill()

transformed_train_data = train_data.drop('POS', axis=1)

     Sentence #           Word    Tag
0   Sentence: 1      Thousands      O
1   Sentence: 1             of      O
2   Sentence: 1  demonstrators      O
3   Sentence: 1           have      O
4   Sentence: 1        marched      O
5   Sentence: 1        through      O
6   Sentence: 1         London  B-geo
7   Sentence: 1             to      O
8   Sentence: 1        protest      O
9   Sentence: 1            the      O
10  Sentence: 1            war      O
11  Sentence: 1             in      O
12  Sentence: 1           Iraq  B-geo
13  Sentence: 1            and      O
14  Sentence: 1         demand      O
15  Sentence: 1            the      O
16  Sentence: 1     withdrawal      O
17  Sentence: 1             of      O
18  Sentence: 1        British  B-gpe
19  Sentence: 1         troops      O


## Preprocess the test data

In [38]:
transformed_test_data = pd.DataFrame(columns=["Sentence #", "Word", "Tag"])

sentence_labels = []
current_sentence = None
    
for idx, row in test_data.iterrows():
    if row['sentence_id'] != current_sentence:
        sentence_labels.append(f"Sentence: {row['sentence_id'] + 1}")
        current_sentence = row['sentence_id']
    else:
        sentence_labels.append(f"Sentence: {row['sentence_id'] + 1}")
        
converted_test_labels = []

for label in test_data['BIO_NER_tag'].values:
    match label:
        case "O":
            converted_test_labels.append("O")
        case "B-PERSON":
            converted_test_labels.append("B-per")
        case "I-PERSON":
            converted_test_labels.append("I-per")
        case "B-ORG":
            converted_test_labels.append("B-org")
        case "I-ORG":
            converted_test_labels.append("I-org")
        case "B-LOCATION":
            converted_test_labels.append("B-geo")
        case "I-LOCATION":
            converted_test_labels.append("I-geo")
        case "B-WORK_OF_ART":
            converted_test_labels.append("B-art")
        case "I-WORK_OF_ART":
            converted_test_labels.append("I-art")
        case "B-ORG":
            converted_test_labels.append("B-org")
        case "I-ORG":
            converted_test_labels.append("I-org")
        
transformed_test_data['Sentence #'] = sentence_labels
transformed_test_data['Word'] = test_data['token']
transformed_test_data['Tag'] = converted_test_labels

In [108]:
transformed_test_data.head(20)

Unnamed: 0,Sentence #,Word,Tag
0,Sentence: 1,If,O
1,Sentence: 1,you're,O
2,Sentence: 1,visiting,O
3,Sentence: 1,Paris,B-geo
4,Sentence: 1,",",O
5,Sentence: 1,make,O
6,Sentence: 1,sure,O
7,Sentence: 1,to,O
8,Sentence: 1,see,O
9,Sentence: 1,the,O


## Check the labels in the train and test data:

In [107]:
tag_counts_test = transformed_test_data['Tag'].value_counts()
tag_counts_train = transformed_train_data['Tag'].value_counts()
print("Test data tag counts:" , tag_counts_test)
print()
print("Train data tag counts:" , tag_counts_train)

Test data tag counts: Tag
O        159
I-per     13
B-per     12
B-org      8
I-art      8
B-art      6
I-org      5
B-geo      3
I-geo      2
Name: count, dtype: int64

Train data tag counts: Tag
O        887908
B-geo     37644
B-tim     20333
B-org     20143
I-per     17251
B-per     16990
I-org     16784
B-gpe     15870
I-geo      7414
I-tim      6528
B-art       402
B-eve       308
I-art       297
I-eve       253
B-nat       201
I-gpe       198
I-nat        51
Name: count, dtype: int64


## Create a sentence getter class

In [41]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, t) for w, t in zip(s["Word"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [42]:
train_sentences = SentenceGetter(transformed_train_data).sentences
test_sentences = SentenceGetter(transformed_test_data).sentences

  self.grouped = self.data.groupby("Sentence #").apply(agg_func)
  self.grouped = self.data.groupby("Sentence #").apply(agg_func)


In [90]:
print(test_sentences[0:3])  

["If you're visiting Paris , make sure to see the Louvre , as they exhibit the Mona Lisa !", 'Amazon , Google and Meta control a huge share of the technology market globally .', 'Did you hear Pharoah Sanders recorded an album with Floating Points ?']


## Create features for the NERC task

In [44]:
def word2features(sent, i):
    word = sent[i][0]
    
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(), 
        'word[-3:]': word[-3:], 
        'word[-2:]': word[-2:], 
        'word.isupper()': word.isupper(), 
        'word.istitle()': word.istitle(), 
        'word.isdigit()': word.isdigit(),
    }
    if i > 0:
        word1 = sent[i-1][0] 
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
        })
    else:
        features['BOS'] = True 

    if i < len(sent)-1:
        word1 = sent[i+1][0] 
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
        })
    else:
        features['EOS'] = True 

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, label in sent]

def sent2tokens(sent):
    return [token for token, label in sent]

## Create the features for the train and test data

In [45]:
X_train = [sent2features(s) for s in train_sentences]
y_train = [sent2labels(s) for s in train_sentences]

X_test = [sent2features(s) for s in test_sentences]
y_test = [sent2labels(s) for s in test_sentences]

## Train a CRF model

In [46]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    c1=0.1, 
    c2=0.1, 
    max_iterations=100, 
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

## Evaluate the model on the train data

In [47]:
y_pred = crf.predict(X_test)
labels = list(crf.classes_)
print("Labels:", labels)
print("Train classification report:")
print(metrics.flat_classification_report(
    y_test, y_pred, labels=labels, digits=3
))

Labels: ['O', 'B-geo', 'B-gpe', 'B-tim', 'B-org', 'I-geo', 'B-per', 'I-per', 'I-org', 'I-tim', 'B-art', 'I-art', 'B-nat', 'I-gpe', 'I-nat', 'B-eve', 'I-eve']
Train classification report:
              precision    recall  f1-score   support

           O      0.891     0.975     0.931       159
       B-geo      0.429     1.000     0.600         3
       B-gpe      0.000     0.000     0.000         0
       B-tim      0.000     0.000     0.000         0
       B-org      0.500     0.375     0.429         8
       I-geo      0.667     1.000     0.800         2
       B-per      0.889     0.667     0.762        12
       I-per      1.000     0.538     0.700        13
       I-org      0.500     0.600     0.545         5
       I-tim      0.000     0.000     0.000         0
       B-art      0.000     0.000     0.000         6
       I-art      0.000     0.000     0.000         8
       B-nat      0.000     0.000     0.000         0
       I-gpe      0.000     0.000     0.000         0
  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [49]:
labels = list(crf.classes_)
labels.remove('O')

y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred, 
                      average='weighted', labels=labels)

sorted_labels = sorted(
    labels, 
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

              precision    recall  f1-score   support

       B-art      0.000     0.000     0.000         6
       I-art      0.000     0.000     0.000         8
       B-eve      0.000     0.000     0.000         0
       I-eve      0.000     0.000     0.000         0
       B-geo      0.429     1.000     0.600         3
       I-geo      0.667     1.000     0.800         2
       B-gpe      0.000     0.000     0.000         0
       I-gpe      0.000     0.000     0.000         0
       B-nat      0.000     0.000     0.000         0
       I-nat      0.000     0.000     0.000         0
       B-org      0.500     0.375     0.429         8
       I-org      0.500     0.600     0.545         5
       B-per      0.889     0.667     0.762        12
       I-per      1.000     0.538     0.700        13
       B-tim      0.000     0.000     0.000         0
       I-tim      0.000     0.000     0.000         0

   micro avg      0.619     0.456     0.525        57
   macro avg      0.249   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Visualize the results

## Transformers

In [92]:
from simpletransformers.ner import NERModel
bert_model = NERModel(
        model_type="bert",
        model_name="dslim/bert-base-NER",
        use_cuda=False
)

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [99]:
bert_predictions, raw_outputs = bert_model.predict(test_sentences)

  0%|          | 0/1 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

## Convert the predictions to the required format

In [104]:
converted_bert_predictions = []
label_mapping = {
        'B-PER': 'B-per',
        'I-PER': 'I-per',
        'B-ORG': 'B-org', 
        'I-ORG': 'I-org',
        'B-LOC': 'B-geo',
        'I-LOC': 'I-geo',
        'B-MISC': 'B-art',
        'I-MISC': 'I-art',
        'O': 'O'
}
for sentence_prediction in bert_predictions:
    converted_sentence_prediction = []
    
    for word_prediction in sentence_prediction:
        for _, label in word_prediction.items():
            converted_sentence_prediction.append(label_mapping[label])
            
    converted_bert_predictions.append(converted_sentence_prediction)

[['O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'O', 'B-org', 'O', 'O', 'O', 'O', 'O', 'B-art', 'I-art', 'O'], ['B-org', 'O', 'B-org', 'O', 'B-org', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'B-per', 'I-per', 'O', 'O', 'O', 'O', 'B-org', 'I-org', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'B-per', 'O', 'O', 'O', 'B-art', 'O', 'O', 'O', 'O', 'O', 'O', 'B-per', 'I-per', 'O', 'O', 'O'], ['B-per', 'I-per', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'B-per', 'O', 'O', 'O', 'B-per', 'O', 'O', 'O', 'O', 'O', 'O', 'B-art', 'O', 'I-org', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'B-art', 'I-art', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-art', 'I-art', 'I-art', 'I-art', 'I-art', 'B-geo', 'O'], ['B-per', 'I-per', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-org', 'O'], ['O', 'O', 'O', 'O', 'O', 'B-geo', 'I-geo', 'O', 'O', 'O', 'O', 'B-geo', 'I-geo', 'O'], ['B-art', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-per', 'I-p

## Evaluate the BERT model predictions

In [116]:
labels = list(set([label for sublist in converted_bert_predictions for label in sublist]))
sorted_labels = sorted(
    labels, 
    key=lambda name: (name[1:], name[0])
)

print(metrics.flat_classification_report(
    y_test, converted_bert_predictions, labels=sorted_labels, digits=3
))

              precision    recall  f1-score   support

           O      0.768     0.792     0.780       159
       B-art      0.143     0.167     0.154         6
       I-art      0.143     0.125     0.133         8
       B-geo      0.250     0.333     0.286         3
       I-geo      0.000     0.000     0.000         2
       B-org      0.125     0.125     0.125         8
       I-org      0.000     0.000     0.000         5
       B-per      0.100     0.083     0.091        12
       I-per      0.000     0.000     0.000        13

    accuracy                          0.606       216
   macro avg      0.170     0.181     0.174       216
weighted avg      0.588     0.606     0.597       216


## Evaluate the BERT model predictions with removed 'O' label

In [115]:
labels.remove('O')
metrics.flat_f1_score(y_test, converted_bert_predictions, 
                      average='weighted', labels=labels)

sorted_labels = sorted(
    labels, 
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, converted_bert_predictions, labels=sorted_labels, digits=3
))

              precision    recall  f1-score   support

       B-art      0.143     0.167     0.154         6
       I-art      0.143     0.125     0.133         8
       B-geo      0.250     0.333     0.286         3
       I-geo      0.000     0.000     0.000         2
       B-org      0.125     0.125     0.125         8
       I-org      0.000     0.000     0.000         5
       B-per      0.100     0.083     0.091        12
       I-per      0.000     0.000     0.000        13

   micro avg      0.096     0.088     0.092        57
   macro avg      0.095     0.104     0.099        57
weighted avg      0.087     0.088     0.087        57


## Visualize the results