In [None]:
#default_exp dataset.dataset

In [None]:
#export
import os
import torch
import transformers

import pandas as pd
import numpy as np
import Bert4NER.config as config

from sklearn.preprocessing import LabelEncoder

In [None]:
#export
class EntityDataset(torch.utils.data.Dataset):
    """Map-style dataset turning word-level sentences into fixed-length model inputs.

    Every word of a sentence is encoded with ``config.TOKENIZER`` and the
    word's POS / tag label is repeated once per produced sub-token. The
    sub-token sequence is cut to ``config.MAX_SEQ_LEN - 2`` entries, wrapped
    with the ids 101 and 102, and right-padded with zeros up to
    ``config.MAX_SEQ_LEN``.

    Args:
        texts: indexable collection of sentences, each a sequence of words.
        pos: indexable collection of per-word POS labels, aligned with ``texts``.
        tags: indexable collection of per-word tag labels, aligned with ``texts``.
    """

    def __init__(self, texts, pos, tags):
        self.texts = texts
        self.pos = pos
        self.tags = tags

    def __len__(self):
        """Return the number of sentences."""
        return len(self.texts)

    def __getitem__(self, item):
        """Build the tensors for the sentence stored at index ``item``.

        Returns:
            dict with the keys ``input_ids``, ``attention_mask``,
            ``token_type_ids``, ``target_pos`` and ``target_tag``; every value
            is a ``torch.long`` tensor.
        """
        words = self.texts[item]
        word_pos = self.pos[item]
        word_tags = self.tags[item]

        piece_ids, pos_labels, tag_labels = [], [], []

        # Encode word by word so that each sub-token can inherit the label
        # of the word it came from.
        for idx, word in enumerate(words):
            encoded = config.TOKENIZER.encode(
                word, add_special_tokens=False, truncation=True
            )
            repeat = len(encoded)
            piece_ids += encoded
            pos_labels += [word_pos[idx]] * repeat
            tag_labels += [word_tags[idx]] * repeat

        # Leave room for the two wrapping ids added next.
        keep = config.MAX_SEQ_LEN - 2

        # NOTE(review): 101 / 102 look like the [CLS] / [SEP] ids of the
        # standard BERT vocabulary - confirm they match config.TOKENIZER.
        input_ids = [101] + piece_ids[:keep] + [102]
        # NOTE(review): wrapper and padding positions get label 0, which may
        # also be a real encoded class id - presumably masked out downstream
        # via attention_mask; verify against the loss function.
        target_pos = [0] + pos_labels[:keep] + [0]
        target_tag = [0] + tag_labels[:keep] + [0]

        real_len = len(input_ids)
        padding = [0] * (config.MAX_SEQ_LEN - real_len)

        features = {
            'input_ids': input_ids + padding,
            'attention_mask': [1] * real_len + padding,
            'token_type_ids': [0] * real_len + padding,
            'target_pos': target_pos + padding,
            'target_tag': target_tag + padding,
        }
        return {
            name: torch.tensor(values, dtype=torch.long)
            for name, values in features.items()
        }

In [None]:
#hide
# Load the raw NER CSV from the configured data directory (decoded as latin-1)
# and preview its first rows.
df = pd.read_csv(config.DATA_PATH/'ner_datasetreference.csv', encoding='latin-1')
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


We use the forward-fill method in pandas to fill all the NaNs for each sentence in the `Sentence #` column.

In [None]:
#hide
# Preview only (the result is not assigned): forward-fill the NaNs in `Sentence #`.
# `.ffill()` replaces the deprecated `fillna(method='ffill')` spelling.
df['Sentence #'].ffill()

0              Sentence: 1
1              Sentence: 1
2              Sentence: 1
3              Sentence: 1
4              Sentence: 1
                ...       
1048570    Sentence: 47959
1048571    Sentence: 47959
1048572    Sentence: 47959
1048573    Sentence: 47959
1048574    Sentence: 47959
Name: Sentence #, Length: 1048575, dtype: object

In [None]:
#hide
# Store the forward-filled column so every row carries its sentence id.
# `.ffill()` replaces the deprecated `fillna(method='ffill')` spelling.
df['Sentence #'] = df['Sentence #'].ffill()

In total, we can see that there are 47959 sentences in our dataset.

In [None]:
len(df['Sentence #'].unique())

47959

Now let us encode all the labels for every word in every sentence

In [None]:
#hide
# One independent encoder per label column: POS first, Tag second.
le_pos, le_tag = LabelEncoder(), LabelEncoder()

In [None]:
#hide
# Fit each encoder on its column and store the integer ids next to the raw labels.
df["encoded_POS"] = le_pos.fit_transform(df["POS"])
df["encoded_Tag"] = le_tag.fit_transform(df["Tag"])

In [None]:
#hide
le_pos.classes_, le_tag.classes_

(array(['$', ',', '.', ':', ';', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ',
        'JJR', 'JJS', 'LRB', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'PDT',
        'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'RRB', 'TO', 'UH',
        'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB',
        '``'], dtype=object),
 array(['B-art', 'B-eve', 'B-geo', 'B-gpe', 'B-nat', 'B-org', 'B-per',
        'B-tim', 'I-art', 'I-eve', 'I-geo', 'I-gpe', 'I-nat', 'I-org',
        'I-per', 'I-tim', 'O'], dtype=object))

We can now group the df according to the `Sentence #` and use it to curate the `Word`, `POS` and `Tag`

In [None]:
# Group the rows by sentence id. The group keys are strings, so groups come out
# in string-sorted order ('Sentence: 1', 'Sentence: 10', 'Sentence: 100', ...),
# not in numeric order.
sentence_group = df.groupby('Sentence #')

In [None]:
#hide
# Collapse each sentence group into one Python list per column:
# the words, the encoded POS ids and the encoded Tag ids.
grouped_words = sentence_group['Word'].apply(list)
grouped_POS = sentence_group['encoded_POS'].apply(list)
grouped_Tag = sentence_group['encoded_Tag'].apply(list)

In [None]:
#hide
grouped_words

Sentence #
Sentence: 1        [Thousands, of, demonstrators, have, marched, ...
Sentence: 10       [Iranian, officials, say, they, expect, to, ge...
Sentence: 100      [Helicopter, gunships, Saturday, pounded, mili...
Sentence: 1000     [They, left, after, a, tense, hour-long, stand...
Sentence: 10000    [U.N., relief, coordinator, Jan, Egeland, said...
                                         ...                        
Sentence: 9995     [Opposition, leader, Mir, Hossein, Mousavi, ha...
Sentence: 9996     [On, Thursday, ,, Iranian, state, media, publi...
Sentence: 9997     [Following, Iran, 's, disputed, June, 12, elec...
Sentence: 9998     [Since, then, ,, authorities, have, held, publ...
Sentence: 9999     [The, United, Nations, is, praising, the, use,...
Name: Word, Length: 47959, dtype: object

In [None]:
# Extract the per-sentence lists from the grouped Series as arrays;
# these are the inputs handed to EntityDataset.
sentences = grouped_words.values
tags = grouped_Tag.values
pos = grouped_POS.values

In [None]:
#hide
len(sentences), len(tags), len(pos)

(47959, 47959, 47959)

Now let's construct the dataset

In [None]:
df.encoded_POS.values

array([19, 10, 19, ..., 29,  7, 16])

In [None]:
#hide
# Build the dataset from the per-sentence words and their encoded POS / tag labels.
ds = EntityDataset(texts=sentences, pos=pos, tags=tags)

In [None]:
ds[0]

{'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0]),
 'input_ids': tensor([  101,  5190,  1997, 28337,  2031,  9847,  2083,  2414,  2000,  6186,
          1996,  2162,  1999,  5712,  1998,  5157,  1996, 10534,  1997,  2329,
          3629,  2013,  2008,  2406,  1012,   102,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,   

In [None]:
from sklearn.metrics import f1_score, accuracy_score

In [None]:
# Copy the first sample's POS targets and overwrite positions 9 and 0 with
# label 1, giving a slightly "wrong" vector to compare against the true
# targets in the metric cells.
v = ds[0]['target_pos'].clone()
v[[9, 0]] = 1

In [None]:
v

tensor([ 1, 19, 10, 19, 35, 34, 10, 17, 29,  1,  7, 16, 10, 17,  5, 31,  7, 16,
        10, 11, 19, 10,  7, 16,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])

In [None]:
f1_score(ds[0]['target_pos'], ds[0]['target_pos'], average='macro')

1.0

F1 score averaged across n batches

In [None]:
# Simulate a batch of size 1 by adding a leading batch dimension with unsqueeze.
# The sample is built once here instead of re-tokenizing ds[0] on every access.
batch_targets = ds[0]['target_pos'].unsqueeze(0)
scores = []
for i in range(batch_targets.shape[0]):
    # Each row is scored against itself, so every per-sample macro F1 is 1.0.
    score = f1_score(batch_targets[i], batch_targets[i], average='macro')
    scores.append(score)
# Average over ALL collected scores; the previous version divided only the
# last `score` by the batch size, which is wrong for batches larger than 1.
sum(scores)/batch_targets.shape[0]

1.0

In [None]:
f1_score(ds[0]['target_pos'], v, average='macro')

0.9044029672170375

In [None]:
accuracy_score(ds[0]['target_pos'], ds[0]['target_pos'])

1.0

In [None]:
accuracy_score(ds[0]['target_pos'], v)

0.9838709677419355

In [None]:
import Bert4NER.model.model as model

In [None]:
# Instantiate the model, passing the number of tag classes and then the
# number of POS classes, as counted by the fitted label encoders.
modeller = model.EntityModel(len(le_tag.classes_), len(le_pos.classes_))

In [None]:
# Wrap the dataset in a DataLoader; no batch_size is passed, so the
# DataLoader default applies.
dl = torch.utils.data.DataLoader(ds)

In [None]:
# Pull the first batch from the loader for a quick smoke test.
batch = next(iter(dl))

In [None]:
# Forward pass on that single batch: input ids, attention mask and
# token type ids are passed positionally, in that order.
out = modeller(batch['input_ids'], batch['attention_mask'], batch['token_type_ids'])

In [None]:
dl.batch_size

1

In [None]:
out[0].shape

torch.Size([1, 124, 17])

In [None]:
out[0].argmax(dim=2).shape

torch.Size([1, 124])

In [None]:
out[0].argmax(dim=2)

tensor([[ 0,  4,  4,  1, 15,  6, 15, 13, 15,  7, 15, 15, 15,  7, 15,  9, 15, 14,
         13,  2, 15, 15, 15, 15,  7, 15,  4, 11,  9,  6, 11,  2,  9, 11, 11, 11,
         11, 11,  7, 11,  1,  9, 11,  6,  9, 11,  9,  9, 11, 15, 15, 11,  5, 11,
          6, 13, 11,  4, 11, 11, 11,  9, 11, 11,  0, 16, 11, 11,  9, 11,  6,  9,
         13, 11, 11, 11, 15,  1, 15,  4, 10,  4,  4, 10,  4, 11, 11, 13, 11, 11,
         11, 11, 11,  1, 11, 10, 15, 11,  1, 11,  8,  9, 11,  4, 11,  9, 15,  2,
         15,  4,  4, 13,  4, 11, 11, 11, 11,  2,  7,  5, 11, 10,  5, 11]])

In [None]:
out[0]

tensor([[[ 0.8333, -0.1145, -0.3688,  ...,  0.2512, -0.0550,  0.3903],
         [-0.1125,  0.2988, -0.0009,  ..., -0.1386,  0.2160,  0.3472],
         [-0.0033,  0.3756, -0.1734,  ..., -0.3377,  0.1118,  0.5452],
         ...,
         [-0.3518, -0.1361,  0.0036,  ..., -0.1492,  0.2902,  0.0686],
         [ 0.0101, -0.1048,  0.0638,  ...,  0.0149, -0.0476, -0.2547],
         [-0.3496,  0.1182, -0.2184,  ...,  0.0937,  0.2976,  0.2573]]],
       grad_fn=<AddBackward0>)

In [None]:
out[0].softmax(dim=2)

tensor([[[0.1181, 0.0458, 0.0355,  ..., 0.0660, 0.0486, 0.0758],
         [0.0472, 0.0713, 0.0528,  ..., 0.0460, 0.0656, 0.0748],
         [0.0515, 0.0753, 0.0435,  ..., 0.0369, 0.0578, 0.0892],
         ...,
         [0.0433, 0.0537, 0.0617,  ..., 0.0530, 0.0822, 0.0659],
         [0.0620, 0.0553, 0.0655,  ..., 0.0623, 0.0586, 0.0476],
         [0.0385, 0.0614, 0.0438,  ..., 0.0599, 0.0735, 0.0706]]],
       grad_fn=<SoftmaxBackward>)

In [None]:
out[0].softmax(dim=2).argmax(dim=2)

tensor([[ 0,  4,  4,  1, 15,  6, 15, 13, 15,  7, 15, 15, 15,  7, 15,  9, 15, 14,
         13,  2, 15, 15, 15, 15,  7, 15,  4, 11,  9,  6, 11,  2,  9, 11, 11, 11,
         11, 11,  7, 11,  1,  9, 11,  6,  9, 11,  9,  9, 11, 15, 15, 11,  5, 11,
          6, 13, 11,  4, 11, 11, 11,  9, 11, 11,  0, 16, 11, 11,  9, 11,  6,  9,
         13, 11, 11, 11, 15,  1, 15,  4, 10,  4,  4, 10,  4, 11, 11, 13, 11, 11,
         11, 11, 11,  1, 11, 10, 15, 11,  1, 11,  8,  9, 11,  4, 11,  9, 15,  2,
         15,  4,  4, 13,  4, 11, 11, 11, 11,  2,  7,  5, 11, 10,  5, 11]])

In [None]:
batch['input_ids'].shape

torch.Size([1, 124])

In [None]:
batch['input_ids']

tensor([[  101,  5190,  1997, 28337,  2031,  9847,  2083,  2414,  2000,  6186,
          1996,  2162,  1999,  5712,  1998,  5157,  1996, 10534,  1997,  2329,
          3629,  2013,  2008,  2406,  1012,   102,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0]])

In [None]:
batch['attention_mask'][0]

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0])

In [None]:
batch['input_ids'][0]

tensor([  101,  5190,  1997, 28337,  2031,  9847,  2083,  2414,  2000,  6186,
         1996,  2162,  1999,  5712,  1998,  5157,  1996, 10534,  1997,  2329,
         3629,  2013,  2008,  2406,  1012,   102,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0])

In [None]:
batch['input_ids'][0] * batch['attention_mask'][0]

tensor([  101,  5190,  1997, 28337,  2031,  9847,  2083,  2414,  2000,  6186,
         1996,  2162,  1999,  5712,  1998,  5157,  1996, 10534,  1997,  2329,
         3629,  2013,  2008,  2406,  1012,   102,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0])