In [None]:
#default_exp dataset.dataset

In [None]:
#export
import os
import torch
import transformers

import pandas as pd
import numpy as np
import Bert4NER.config as config

from sklearn.preprocessing import LabelEncoder

In [None]:
#export
class EntityDataset(torch.utils.data.Dataset):
    """Map-style dataset that turns pre-split sentences into fixed-length model inputs.

    Each item is one sentence given as a sequence of words, with one encoded POS
    label and one encoded entity tag per word. Every word is tokenized on its own
    with ``config.TOKENIZER`` and its labels are repeated for each sub-token it
    produces, so the label sequences stay aligned with the token ids.

    Args:
        texts: indexable collection of sentences; each sentence is a sequence of words.
        pos: indexable collection, parallel to ``texts``, of per-word POS label ids.
        tags: indexable collection, parallel to ``texts``, of per-word tag label ids.
        le_pos: encoder used for the POS labels; stored but not used by this class.
        le_tags: encoder used for the tag labels; stored but not used by this class.
    """

    def __init__(self, texts, pos, tags, le_pos, le_tags):
        self.texts = texts
        self.pos = pos
        self.tags = tags
        self.le_tags = le_tags
        self.le_pos = le_pos

    def __len__(self):
        """Return the number of sentences."""
        return len(self.texts)

    @staticmethod
    def _special_token_id(tokenizer, attr, default):
        """Return ``tokenizer.<attr>``, or ``default`` when it is missing or None."""
        token_id = getattr(tokenizer, attr, None)
        return default if token_id is None else token_id

    def __getitem__(self, item):
        """Build the padded model inputs and label sequences for sentence ``item``.

        Returns:
            dict with keys ``input_ids``, ``attention_mask``, ``token_type_ids``,
            ``target_pos`` and ``target_tag``; each value is a ``torch.long``
            tensor of length ``config.MAX_SEQ_LEN``.

        Note:
            Special-token and padding positions receive label id 0, which cannot
            be told apart from a genuine class id 0.
            NOTE(review): confirm the loss/metrics downstream account for this.
        """
        words = self.texts[item]
        word_pos = self.pos[item]
        word_tags = self.tags[item]

        tokenizer = config.TOKENIZER
        max_len = config.MAX_SEQ_LEN

        # Take the special ids from the tokenizer instead of hard-coding the
        # BERT vocabulary values; the previous literals remain as fallbacks.
        cls_id = self._special_token_id(tokenizer, 'cls_token_id', 101)
        sep_id = self._special_token_id(tokenizer, 'sep_token_id', 102)
        pad_id = self._special_token_id(tokenizer, 'pad_token_id', 0)

        tokens = []
        target_pos = []
        target_tag = []

        # Tokenize word by word so each word's labels can be copied onto
        # every sub-token that word is split into.
        for i, word in enumerate(words):
            sub_tokens = tokenizer.encode(
                word,
                add_special_tokens=False,
                truncation=True
            )
            n_sub = len(sub_tokens)
            tokens.extend(sub_tokens)
            target_pos.extend([word_pos[i]] * n_sub)
            target_tag.extend([word_tags[i]] * n_sub)

        # Reserve two positions for the leading and trailing special tokens.
        body_len = max_len - 2
        tokens = tokens[:body_len]
        target_pos = target_pos[:body_len]
        target_tag = target_tag[:body_len]

        tokens = [cls_id] + tokens + [sep_id]
        target_pos = [0] + target_pos + [0]
        target_tag = [0] + target_tag + [0]

        mask = [1] * len(tokens)
        token_type_ids = [0] * len(tokens)

        # Right-pad everything to the fixed sequence length; padded positions
        # get attention_mask 0.
        pad_len = max_len - len(tokens)

        tokens = tokens + ([pad_id] * pad_len)
        mask = mask + ([0] * pad_len)
        token_type_ids = token_type_ids + ([0] * pad_len)
        target_pos = target_pos + ([0] * pad_len)
        target_tag = target_tag + ([0] * pad_len)

        return {
            'input_ids': torch.tensor(tokens, dtype=torch.long),
            'attention_mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'target_pos': torch.tensor(target_pos, dtype=torch.long),
            'target_tag': torch.tensor(target_tag, dtype=torch.long),
        }

In [None]:
#hide
# Load the NER CSV (one row per word, as the preview shows) from the configured data directory.
# NOTE(review): read as latin-1, presumably because the file is not valid UTF-8 — confirm.
df = pd.read_csv(config.DATA_PATH/'ner_datasetreference.csv', encoding='latin-1')
# Preview the first rows.
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


Only the first word of each sentence carries a value in the `Sentence #` column; the remaining rows are NaN. We use pandas' forward fill to propagate each sentence label down to the rows that follow it.

In [None]:
#hide
# Preview the forward fill: each NaN takes the most recent non-null sentence label.
# Series.ffill() replaces fillna(method='ffill'), which pandas has deprecated.
df['Sentence #'].ffill()

0              Sentence: 1
1              Sentence: 1
2              Sentence: 1
3              Sentence: 1
4              Sentence: 1
                ...       
1048570    Sentence: 47959
1048571    Sentence: 47959
1048572    Sentence: 47959
1048573    Sentence: 47959
1048574    Sentence: 47959
Name: Sentence #, Length: 1048575, dtype: object

In [None]:
#hide
# Persist the forward-filled sentence labels so every row knows its sentence.
# Series.ffill() replaces fillna(method='ffill'), which pandas has deprecated.
df['Sentence #'] = df['Sentence #'].ffill()

In total, we can see that there are 47959 sentences in our dataset.

In [None]:
# Number of distinct sentence labels (a NaN label, if any, counts as one value).
df['Sentence #'].nunique(dropna=False)

47959

Now let us encode the `POS` and `Tag` labels of every word as integer ids.

In [None]:
#hide
# One independent encoder per label set: POS labels and entity tags.
le_pos, le_tag = LabelEncoder(), LabelEncoder()

In [None]:
#hide
# Fit each encoder on its column and store the resulting integer ids next to the raw labels.
df['encoded_POS'] = le_pos.fit_transform(df['POS'])
df['encoded_Tag'] = le_tag.fit_transform(df['Tag'])

In [None]:
#hide
le_pos.classes_, le_tag.classes_

(array(['$', ',', '.', ':', ';', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ',
        'JJR', 'JJS', 'LRB', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'PDT',
        'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'RRB', 'TO', 'UH',
        'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB',
        '``'], dtype=object),
 array(['B-art', 'B-eve', 'B-geo', 'B-gpe', 'B-nat', 'B-org', 'B-per',
        'B-tim', 'I-art', 'I-eve', 'I-geo', 'I-gpe', 'I-nat', 'I-org',
        'I-per', 'I-tim', 'O'], dtype=object))

We can now group the dataframe by `Sentence #` and collect the `Word`, encoded `POS` and encoded `Tag` values of each sentence into lists.

In [None]:
# Group the rows by sentence label. groupby sorts the keys by default and the labels
# are strings, so the groups come out in lexicographic order ('Sentence: 1',
# 'Sentence: 10', 'Sentence: 100', ...) rather than in file order.
sentence_group = df.groupby('Sentence #')

In [None]:
#hide
# Gather each sentence's words and encoded labels into one Python list per sentence.
grouped_words, grouped_POS, grouped_Tag = (
    sentence_group[column].apply(list)
    for column in ('Word', 'encoded_POS', 'encoded_Tag')
)

In [None]:
#hide
grouped_words

Sentence #
Sentence: 1        [Thousands, of, demonstrators, have, marched, ...
Sentence: 10       [Iranian, officials, say, they, expect, to, ge...
Sentence: 100      [Helicopter, gunships, Saturday, pounded, mili...
Sentence: 1000     [They, left, after, a, tense, hour-long, stand...
Sentence: 10000    [U.N., relief, coordinator, Jan, Egeland, said...
                                         ...                        
Sentence: 9995     [Opposition, leader, Mir, Hossein, Mousavi, ha...
Sentence: 9996     [On, Thursday, ,, Iranian, state, media, publi...
Sentence: 9997     [Following, Iran, 's, disputed, June, 12, elec...
Sentence: 9998     [Since, then, ,, authorities, have, held, publ...
Sentence: 9999     [The, United, Nations, is, praising, the, use,...
Name: Word, Length: 47959, dtype: object

In [None]:
# Drop the pandas index and keep the per-sentence lists as plain arrays.
sentences, tags, pos = (
    grouped.values for grouped in (grouped_words, grouped_Tag, grouped_POS)
)

In [None]:
#hide
# The three arrays are parallel, so they should all have one entry per sentence.
tuple(len(seq) for seq in (sentences, tags, pos))

(47959, 47959, 47959)

Now let's construct the dataset

In [None]:
df.encoded_POS.values

array([19, 10, 19, ..., 29,  7, 16])

In [None]:
#hide
# Wrap the parallel per-sentence arrays, together with their encoders, in the dataset.
ds = EntityDataset(
    texts=sentences,
    pos=pos,
    tags=tags,
    le_pos=le_pos,
    le_tags=le_tag,
)

In [None]:
ds[0]

{'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0]),
 'input_ids': tensor([  101,  5190,  1997, 28337,  2031,  9847,  2083,  2414,  2000,  6186,
          1996,  2162,  1999,  5712,  1998,  5157,  1996, 10534,  1997,  2329,
          3629,  2013,  2008,  2406,  1012,   102,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,   

In [None]:
from sklearn.metrics import f1_score, accuracy_score

In [None]:
# Copy the true POS labels and overwrite two positions to imitate wrong predictions.
v = ds[0]['target_pos'].clone()
v[9] = 1
v[0] = 1

In [None]:
v

tensor([ 1, 19, 10, 19, 35, 34, 10, 17, 29,  1,  7, 16, 10, 17,  5, 31,  7, 16,
        10, 11, 19, 10,  7, 16,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])

In [None]:
f1_score(ds[0]['target_pos'], ds[0]['target_pos'], average='macro')

1.0

Average macro-F1 score across n batches (here n = 1):

In [None]:
# Batch size of 1 because of the unsqueeze(0).
# Fetch the sample once: every ds[0] call re-tokenizes the whole sentence.
target_batch = ds[0]['target_pos'].unsqueeze(0)

scores = []
for i in range(target_batch.shape[0]):
    score = f1_score(target_batch[i], target_batch[i], average='macro')
    scores.append(score)

# Average over ALL per-sample scores; dividing only the last `score` by the
# batch size is wrong as soon as there is more than one sample.
sum(scores) / len(scores)

1.0

In [None]:
# Macro-F1 of the corrupted copy against the true labels. It is computed over every
# position of the padded sequence, so special-token and padding positions are scored too.
f1_score(ds[0]['target_pos'], v, average='macro')

0.9044029672170375

In [None]:
accuracy_score(ds[0]['target_pos'], ds[0]['target_pos'])

1.0

In [None]:
# Accuracy of the corrupted copy: v differs from the true labels at two positions, and
# the denominator is the full padded length, so padding positions count as correct.
accuracy_score(ds[0]['target_pos'], v)

0.9838709677419355

In [None]:
import Bert4NER.model.model as model

In [None]:
# Build the model with one label set per output: POS classes and entity-tag classes.
# The POS classes were previously passed for both arguments.
# NOTE(review): assumes HasocModel's second argument is the tag label set — confirm
# against its signature.
modeller = model.HasocModel(le_pos.classes_, le_tag.classes_)

In [None]:
# No batch_size is given, so the loader yields one sample per batch.
dl = torch.utils.data.DataLoader(ds)

In [None]:
# Pull the first batch from the loader to use as a smoke-test input.
batch = next(iter(dl))

In [None]:
# Forward pass on the single batch. Autograd is active here (the outputs carry a grad_fn).
# NOTE(review): wrap in torch.no_grad() if this is only meant for inspection.
out = modeller(batch['input_ids'], batch['attention_mask'], batch['token_type_ids'])

In [None]:
dl.batch_size

1

In [None]:
out[0].shape

torch.Size([1, 124, 42])

In [None]:
out[0].argmax(dim=2).shape

torch.Size([1, 124])

In [None]:
out[0].argmax(dim=2)

tensor([[12, 33, 31,  5, 16, 37, 16, 16, 35,  3,  3,  2, 35, 35, 25, 12, 25, 16,
         16, 12, 21, 16, 22, 16,  2, 33,  3,  3,  4,  3,  3, 37, 37, 37, 37, 37,
         39, 37, 37,  3, 12,  3, 33, 31, 37,  6, 25,  6, 16, 19, 12, 12, 32, 12,
         31,  3, 37,  3,  3,  9,  3, 31, 37, 29,  3, 37,  3, 25, 37, 37,  3,  3,
         37,  6,  3, 37, 37, 16, 12, 31, 12, 31, 37,  3,  4,  3,  3, 37,  3, 12,
         12, 37, 37, 37, 25,  4,  3, 12, 29,  6, 37, 37, 37, 16, 12, 37,  2, 32,
         12, 29, 16,  3, 16,  3, 36,  4, 31, 27,  3,  3,  3,  3,  3,  4]])

In [None]:
out[0]

tensor([[[-0.4539,  0.0314, -0.1590,  ...,  0.0065,  0.4704, -0.3819],
         [-0.3163,  0.5107, -0.6074,  ...,  0.1804, -0.2988, -0.0019],
         [-0.5365,  0.0143, -0.1526,  ..., -0.1443, -0.0552,  0.3893],
         ...,
         [-0.1756, -0.1651, -0.0453,  ...,  0.1317,  0.3725, -0.3099],
         [-0.2844, -0.1919, -0.0015,  ..., -0.1084,  0.1476, -0.3613],
         [-0.2135,  0.2744, -0.1936,  ..., -0.4689, -0.1695,  0.0111]]],
       grad_fn=<AddBackward0>)

In [None]:
out[0].softmax(dim=2)

tensor([[[0.0142, 0.0231, 0.0191,  ..., 0.0226, 0.0359, 0.0153],
         [0.0167, 0.0381, 0.0125,  ..., 0.0274, 0.0170, 0.0228],
         [0.0126, 0.0219, 0.0186,  ..., 0.0187, 0.0205, 0.0319],
         ...,
         [0.0199, 0.0201, 0.0227,  ..., 0.0271, 0.0345, 0.0174],
         [0.0187, 0.0205, 0.0248,  ..., 0.0223, 0.0288, 0.0173],
         [0.0177, 0.0289, 0.0181,  ..., 0.0137, 0.0185, 0.0222]]],
       grad_fn=<SoftmaxBackward>)

In [None]:
# Softmax keeps the ordering of the logits along dim 2, so this argmax gives the same
# class ids as taking argmax of the raw logits directly.
out[0].softmax(dim=2).argmax(dim=2)

tensor([[12, 33, 31,  5, 16, 37, 16, 16, 35,  3,  3,  2, 35, 35, 25, 12, 25, 16,
         16, 12, 21, 16, 22, 16,  2, 33,  3,  3,  4,  3,  3, 37, 37, 37, 37, 37,
         39, 37, 37,  3, 12,  3, 33, 31, 37,  6, 25,  6, 16, 19, 12, 12, 32, 12,
         31,  3, 37,  3,  3,  9,  3, 31, 37, 29,  3, 37,  3, 25, 37, 37,  3,  3,
         37,  6,  3, 37, 37, 16, 12, 31, 12, 31, 37,  3,  4,  3,  3, 37,  3, 12,
         12, 37, 37, 37, 25,  4,  3, 12, 29,  6, 37, 37, 37, 16, 12, 37,  2, 32,
         12, 29, 16,  3, 16,  3, 36,  4, 31, 27,  3,  3,  3,  3,  3,  4]])

In [None]:
batch['input_ids'].shape

torch.Size([1, 124])

In [None]:
batch['input_ids']

tensor([[  101,  5190,  1997, 28337,  2031,  9847,  2083,  2414,  2000,  6186,
          1996,  2162,  1999,  5712,  1998,  5157,  1996, 10534,  1997,  2329,
          3629,  2013,  2008,  2406,  1012,   102,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0]])