In [None]:
#default_exp dataset.dataset

In [None]:
#export
import os
import torch
import transformers

import pandas as pd
import numpy as np
import Hasoc.config as config

In [None]:
#hide
# Load the training frame (with the kfold_* fold-assignment columns) from the
# project's configured data directory.
df = pd.read_csv(config.DATA_PATH/'fold_df.csv')

In [None]:
#hide
# Peek at the first two rows to check the available columns.
df.head(2)

Unnamed: 0,tweet_id,text,task1,task2,ID,kfold_task1,kfold_task2
0,1.126953e+18,"We need a word for ‘going somewhere alone,sitt...",NOT,NONE,hasoc_2020_en_1503,2,0
1,1.123482e+18,RT @RiverCityLabs: Come and work from our spac...,NOT,NONE,hasoc_2020_en_3570,2,0


In [None]:
#hide
from sklearn.preprocessing import LabelEncoder

# Only the fitted encoder is needed in this cell; the integer codes themselves
# are not used here, so fit without transforming.
le = LabelEncoder().fit(df.task1)
# Display the learned label vocabulary (index in this array == encoded id).
le.classes_

array(['HOF', 'NOT'], dtype=object)

In [None]:
#hide
# Store the integer-encoded task1 labels (position of each label in le.classes_)
# as a new column on the frame.
df['task1_encoded'] = le.transform(df.task1.values)

In [None]:
#hide
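# NOTE: reference only, not executed. BertDataset reads its tokenizer and max
# length from config.TOKENIZER / config.MAX_LEN; presumably those mirror the
# values sketched below (confirm against Hasoc.config).
# TOKENIZER = transformers.BertTokenizer.from_pretrained(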
#             pretrained_model_name_or_path='bert-base-uncased',
#             do_lower_case=True,
#             # force_download = True,
#         )

# MAX_LEN = 72

In [None]:
#export
class BertDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes raw text for a BERT-style classifier.

    Every item is a dict of fixed-length ``torch.long`` tensors
    (``input_ids``, ``attention_mask``, ``token_type_ids``), truncated by the
    tokenizer and right-padded here to ``config.MAX_LEN``. Unless ``is_test``
    is set, the dict also carries ``targets``: a one-hot ``torch.long`` vector
    for the sample's label.

    Args:
        text: indexable sequence of strings.
        target: indexable sequence of integer class ids aligned with ``text``.
            May be omitted only when ``is_test`` is True.
        is_test: when True, items contain no ``targets`` entry.
        num_classes: width of the one-hot target. Defaults to the number of
            distinct values in ``target``; pass it explicitly when a split may
            not contain every class.

    Raises:
        ValueError: if ``target`` is None while ``is_test`` is False.
    """

    def __init__(self, text, target=None, is_test=False, num_classes=None):
        if target is None and not is_test:
            # Fail at construction instead of with an obscure TypeError later.
            raise ValueError("target is required unless is_test=True")

        self.text, self.target = text, target
        self.tokenizer = config.TOKENIZER
        self.max_len = config.MAX_LEN
        self.is_test = is_test

        # Computed once: doing np.unique over all labels inside __getitem__
        # would repeat an O(n log n) scan for every single sample.
        if num_classes is None and target is not None:
            num_classes = len(np.unique(target))
        self.num_classes = num_classes

        # Pad with the tokenizer's own pad id (0 for BERT vocabularies);
        # fall back to 0 when the tokenizer does not define one.
        pad_id = getattr(self.tokenizer, 'pad_token_id', None)
        self.pad_token_id = 0 if pad_id is None else pad_id

    def __len__(self):
        # Length comes from the text so that target-less test sets work too.
        return len(self.text)

    def __getitem__(self, i):
        """Return the tokenized, padded tensors (and one-hot target) for sample ``i``."""
        # Collapse runs of whitespace (newlines, tabs, repeats) to single spaces.
        text = ' '.join(self.text[i].split())

        # Tokenize as a single sequence, truncated to at most max_len tokens.
        out = self.tokenizer.encode_plus(text, None,
                                         add_special_tokens=True,
                                         max_length=self.max_len,
                                         truncation=True)

        ids = out['input_ids']
        mask = out['attention_mask']
        token_type_ids = out['token_type_ids']

        # Right-pad everything to max_len; padded positions are masked out.
        padding_length = self.max_len - len(ids)
        ids = ids + [self.pad_token_id] * padding_length
        mask = mask + [0] * padding_length
        token_type_ids = token_type_ids + [0] * padding_length

        item = {
            'input_ids': torch.tensor(ids, dtype=torch.long),
            'attention_mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
        }
        if not self.is_test:
            item['targets'] = self.onehot(self.num_classes, self.target[i])
        return item

    @staticmethod
    def onehot(size, target):
        """Return a ``torch.long`` vector of length ``size`` with a 1 at index ``target``."""
        vec = torch.zeros(size, dtype=torch.long)
        vec[target] = 1
        return vec

    def get_labels(self):
        """Return the targets as a plain list (empty when no targets were given)."""
        return [] if self.target is None else list(self.target)

In [None]:
#hide
# Smoke test: build the dataset from the tweet text and the encoded task1 labels.
d = BertDataset(df.text.values, df.task1_encoded.values)

In [None]:
#hide
# Inspect one item: padded token tensors plus the one-hot target.
d[10]

{'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'input_ids': tensor([  101, 23917, 10958,  3676,  3540,  3791,  2115,  2393,  1012,  2023,
          2388,  1997, 20662,  8178,  2179,  2041,  2016,  2018,  4456,  2096,
          2016,  2001,  1022,  1011,  2706,  6875,  1012,  1529, 16770,  1024,
          1013,  1013,  1056,  1012,  2522,  1013,  1057,  3501,  2278,  2575,
         12514,  6777,  2080,   102,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0]),
 'targets': tensor([0, 1]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0

In [None]:
# Take the one-hot target of the first sample.
c = d[0]['targets']

In [None]:
# argmax over the one-hot vector recovers the integer class id.
c.argmax(dim=-1)

tensor(1)