In [None]:
import os
import torch

import pandas as pd
import numpy as np

import Bert4NER.config as config
import Bert4NER.model.model as model
import Bert4NER.utils.utils as utils
import Bert4NER.utils.engine as engine
import Bert4NER.dataset.dataset as dataset

from sklearn.preprocessing import LabelEncoder

In [None]:
#hide
# Read the raw NER csv (latin-1 encoded) from the configured data directory and preview it.
df = pd.read_csv(
    filepath_or_buffer=config.DATA_PATH / 'ner_datasetreference.csv',
    encoding='latin-1',
)
df.head(n=5)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


We use the forward-fill method in pandas to fill the NaNs in the `Sentence #` column, so that every word carries the id of the sentence it belongs to.

In [None]:
#hide
# Preview the forward-filled sentence ids without modifying df.
# Series.ffill() replaces fillna(method='ffill'), which is deprecated in recent pandas.
df['Sentence #'].ffill()

0              Sentence: 1
1              Sentence: 1
2              Sentence: 1
3              Sentence: 1
4              Sentence: 1
                ...       
1048570    Sentence: 47959
1048571    Sentence: 47959
1048572    Sentence: 47959
1048573    Sentence: 47959
1048574    Sentence: 47959
Name: Sentence #, Length: 1048575, dtype: object

In [None]:
#hide
# Persist the forward-filled sentence ids so every row is labelled with its sentence.
# Series.ffill() replaces fillna(method='ffill'), which is deprecated in recent pandas.
df['Sentence #'] = df['Sentence #'].ffill()

In total, we can see that there are 47959 sentences in our dataset.

In [None]:
# Number of distinct sentence ids (NaN, if any, is counted as its own value).
df['Sentence #'].nunique(dropna=False)

47959

Now let us encode all the labels for every word in every sentence

In [None]:
#hide
# One encoder per label column: part-of-speech labels and entity tags.
le_pos, le_tag = LabelEncoder(), LabelEncoder()

In [None]:
#hide
# Fit each encoder on its label column and store the integer codes alongside the raw labels.
df["encoded_POS"] = le_pos.fit_transform(df["POS"])
df["encoded_Tag"] = le_tag.fit_transform(df["Tag"])

In [None]:
#hide
# Show the label vocabularies learned by the two encoders (POS first, then entity tags).
le_pos.classes_, le_tag.classes_

(array(['$', ',', '.', ':', ';', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ',
        'JJR', 'JJS', 'LRB', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'PDT',
        'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'RRB', 'TO', 'UH',
        'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB',
        '``'], dtype=object),
 array(['B-art', 'B-eve', 'B-geo', 'B-gpe', 'B-nat', 'B-org', 'B-per',
        'B-tim', 'I-art', 'I-eve', 'I-geo', 'I-gpe', 'I-nat', 'I-org',
        'I-per', 'I-tim', 'O'], dtype=object))

We can now group the df according to the `Sentence #` and use it to curate the `Word`, `POS` and `Tag`

In [None]:
# Group the token rows by sentence id so each sentence's columns can be collected together.
sentence_group = df.groupby('Sentence #')

In [None]:
#hide
# Collect, per sentence, the list of words and the lists of encoded POS / tag labels.
grouped_words, grouped_POS, grouped_Tag = (
    sentence_group[column].apply(list)
    for column in ('Word', 'encoded_POS', 'encoded_Tag')
)

In [None]:
#hide
# Inspect the per-sentence word lists (indexed by sentence id).
grouped_words

Sentence #
Sentence: 1        [Thousands, of, demonstrators, have, marched, ...
Sentence: 10       [Iranian, officials, say, they, expect, to, ge...
Sentence: 100      [Helicopter, gunships, Saturday, pounded, mili...
Sentence: 1000     [They, left, after, a, tense, hour-long, stand...
Sentence: 10000    [U.N., relief, coordinator, Jan, Egeland, said...
                                         ...                        
Sentence: 9995     [Opposition, leader, Mir, Hossein, Mousavi, ha...
Sentence: 9996     [On, Thursday, ,, Iranian, state, media, publi...
Sentence: 9997     [Following, Iran, 's, disputed, June, 12, elec...
Sentence: 9998     [Since, then, ,, authorities, have, held, publ...
Sentence: 9999     [The, United, Nations, is, praising, the, use,...
Name: Word, Length: 47959, dtype: object

In [None]:
# Drop the sentence-id index and keep only the underlying arrays of per-sentence lists.
sentences, tags, pos = grouped_words.values, grouped_Tag.values, grouped_POS.values

In [None]:
#hide
# The three arrays must line up one-to-one, so their lengths should all match.
tuple(len(seq) for seq in (sentences, tags, pos))

(47959, 47959, 47959)

In [None]:
# Rebuild sentences, tags and pos through the project helper; this overwrites the arrays
# assembled manually in the cells above.
# NOTE(review): presumably utils.process_data performs the same fill/encode/group steps — confirm.
sentences, tags, pos = utils.process_data(df)

In [None]:
# Sanity check: the helper's three outputs should have the same number of sentences.
tuple(len(seq) for seq in (sentences, tags, pos))

(47959, 47959, 47959)

Now let's construct the dataset

In [None]:
# Flat array of the integer-encoded POS label for every token row.
df['encoded_POS'].to_numpy()

array([19, 10, 19, ..., 29,  7, 16])

In [None]:
#hide
# Wrap the sentences together with their POS and tag label sequences in the project's dataset class.
ds = dataset.EntityDataset(texts=sentences, pos=pos, tags=tags)

In [None]:
# Inspect the first encoded example (a dict of tensors, as the output below shows).
ds[0]

{'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0]),
 'input_ids': tensor([  101,  5190,  1997, 28337,  2031,  9847,  2083,  2414,  2000,  6186,
          1996,  2162,  1999,  5712,  1998,  5157,  1996, 10534,  1997,  2329,
          3629,  2013,  2008,  2406,  1012,   102,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,   

## testing how to calculate metrics in engine

In [None]:
from sklearn.metrics import f1_score, accuracy_score

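Let me change a couple of values in a copy of the first example's POS targets, so that the "prediction" no longer matches the target exactly.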

In [None]:
# Work on a copy so the dataset's own target tensor is left untouched,
# then overwrite positions 0 and 9 with label 1 to simulate two wrong predictions.
v = torch.clone(ds[0]['target_pos'])
v[[0, 9]] = 1

In [None]:
# Display the perturbed copy of the POS targets.
v

tensor([ 1, 19, 10, 19, 35, 34, 10, 17, 29,  1,  7, 16, 10, 17,  5, 31,  7, 16,
        10, 11, 19, 10,  7, 16,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])

In [None]:
# Macro F1 of the targets against themselves: the perfect-score baseline.
f1_score(y_true=ds[0]['target_pos'], y_pred=ds[0]['target_pos'], average='macro')

1.0

In [None]:
# Macro F1 of the true targets against the perturbed copy.
f1_score(y_true=ds[0]['target_pos'], y_pred=v, average='macro')

0.9044029672170375

In [None]:
# Accuracy of the targets against themselves: the perfect-score baseline.
accuracy_score(y_true=ds[0]['target_pos'], y_pred=ds[0]['target_pos'])

1.0

In [None]:
# Accuracy of the true targets against the perturbed copy.
accuracy_score(y_true=ds[0]['target_pos'], y_pred=v)

0.9838709677419355

### score across n batches

To calculate the score across batches, we loop through the sequences in each batch, compare each sequence to its corresponding target, and then take the average of the per-sequence scores.

In [None]:
#batch size of 1 due to unsqueeze
# Build the (1, seq_len) "batch" once instead of re-fetching ds[0] on every access.
batch_targets = ds[0]['target_pos'].unsqueeze(0)

# Score every sequence in the batch against its target (here: against itself).
scores = []
for target_seq in batch_targets:
    scores.append(f1_score(target_seq, target_seq, average='macro'))

# Average over ALL per-sequence scores; dividing only the last score by the batch size
# would be wrong for any batch with more than one sequence.
sum(scores) / len(scores)

1.0

## Checking what the model output looks like

In [None]:
# Instantiate the project model, passing the number of tag classes first and the number of POS classes second.
modeller = model.EntityModel(len(le_tag.classes_), len(le_pos.classes_))

In [None]:
# Build a loader over the processed data using the project helper, with bs=1.
# NOTE(review): presumably bs is the batch size and the result is a torch DataLoader — confirm in utils.
dl = utils.create_loader(sentences, tags, pos, bs=1)

In [None]:
# Pull only the first batch from the loader; enough to inspect the model's output.
batch = next(iter(dl))

In [None]:
# Run one forward pass, feeding the three encoded inputs positionally in the order the model is called with.
out = modeller(
    *(batch[key] for key in ('input_ids', 'attention_mask', 'token_type_ids'))
)

As we can see, the first output of the model has shape (bs, seq_len, num_labels), where the last dimension equals the number of labels for that head (17 here, matching the number of tag classes).

In [None]:
# Shape of the first model output.
out[0].size()

torch.Size([1, 124, 17])

In order to get the prediction for each token, we apply softmax over the label dimension, i.e. dim 2 (num_labels). This gives us probabilities of shape (bs, seq_len, num_labels).

In [None]:
# Normalise the scores over the label dimension to get per-token class probabilities.
torch.softmax(out[0], dim=2)

tensor([[[0.0404, 0.0521, 0.0482,  ..., 0.0444, 0.0364, 0.0606],
         [0.0319, 0.0480, 0.0334,  ..., 0.0288, 0.0905, 0.0457],
         [0.0404, 0.0434, 0.0445,  ..., 0.0310, 0.0434, 0.0524],
         ...,
         [0.0510, 0.0459, 0.0506,  ..., 0.0383, 0.0445, 0.0519],
         [0.0468, 0.0566, 0.0393,  ..., 0.0487, 0.0357, 0.0538],
         [0.0350, 0.0545, 0.0411,  ..., 0.0470, 0.0683, 0.0414]]],
       grad_fn=<SoftmaxBackward>)

We can then take the argmax over the label dimension (dim 2) to get, for each token in the sequence, the label with the highest probability.

This is basically a classification problem for each token in the sequence

In [None]:
# Pick, for every token, the index of the most probable label.
torch.argmax(torch.softmax(out[0], dim=2), dim=2)

tensor([[ 9,  6, 12, 10, 11, 11, 11, 10, 11,  5,  3,  2, 11,  9,  9,  8, 10, 13,
          9, 11, 11, 13,  6,  9,  6,  4,  9,  5,  9,  5, 11, 10, 11, 11,  9, 11,
          8,  2,  9,  3,  5, 10,  8, 13, 10,  9,  9,  5,  9,  9,  3,  2, 13,  9,
          9,  9,  9,  9, 11, 10, 10,  1,  9,  5,  5,  5,  7,  9,  5,  5, 10,  5,
         12, 11,  9,  9,  2,  3,  5,  5, 11,  5,  5,  9,  3,  9, 11, 11, 10, 11,
          1,  8, 10,  9, 13,  9,  8,  9,  5,  5, 14, 12,  9, 11,  9,  9,  2,  6,
          9,  5,  5, 11,  5, 11, 11,  5,  9, 11,  5,  5,  9,  9,  9,  5]])