## Solution

Dataset 클래스 안에 tokenizer를 호출하는 encoding 함수를 만들어서 해결한다  

`__getitem__`의 반환값을 `{'input_ids': torch.tensor(), 'attention_mask': torch.tensor(), 'labels': torch.tensor()}`  
형식으로 만든다  
  
각 샘플의 input_ids와 attention_mask는 1차원 torch tensor여야 한다 (padding과 batch 차원 추가는 data collator가 처리한다)

In [1]:
import torch
from torch.utils.data import Dataset, DataLoader

In [2]:
# Draw one row of three standard-normal values; the result has shape (1, 3).
torch.randn((1, 3))

tensor([[-0.7643, -0.3253,  0.3366]])

In [3]:
import numpy as np

# Variable-width rows used to experiment with ragged inputs:
# torch tensors of widths 1..4 and 3..6, then numpy arrays with the same widths.
tmp = [torch.randn(1, width) for width in (1, 2, 3, 4)]
tmp2 = [torch.randn(1, width) for width in (3, 4, 5, 6)]
tmp3 = [np.random.randn(1, width) for width in (1, 2, 3, 4)]
tmp4 = [np.random.randn(1, width) for width in (3, 4, 5, 6)]
tmp

[tensor([[-1.2275]]),
 tensor([[-1.0601, -1.9544]]),
 tensor([[-0.7470,  0.7065, -0.6278]]),
 tensor([[2.3750, 0.3567, 1.7337, 0.9199]])]

In [40]:
from transformers import M2M100Tokenizer, DataCollatorWithPadding

# `batched` is an argument of `datasets.Dataset.map`, not a tokenizer loading
# option, so it is not passed to `from_pretrained`.
tokenizer = M2M100Tokenizer.from_pretrained('facebook/m2m100_418M')
# Collator used as the DataLoader `collate_fn`; it pads the items of a batch
# to a common length using the tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [63]:
class MyDataset(Dataset):
    """Map-style dataset that tokenizes the source text of a sample on demand.

    Each element of ``samples`` is a dict holding raw text under keys such as
    ``'src_text'`` and ``'tgt_text'``. An item is the tokenizer output for the
    sample's ``'src_text'`` with ``input_ids`` and ``attention_mask``
    flattened to 1-D, so a padding collator can pad and stack them.
    """

    def __init__(self, samples, tokenizer):
        """Store the raw samples and the tokenizer used to encode them.

        Args:
            samples: Indexable sequence of dicts keyed by text field name.
            tokenizer: Callable invoked as
                ``tokenizer(text, return_tensors='pt')``; its result must
                expose ``'input_ids'`` and ``'attention_mask'`` tensors.
        """
        super().__init__()
        self.samples = samples
        self.tokenizer = tokenizer

    def _encode_sample(self, sample, key):
        """Tokenize ``sample[key]`` and return the encoding with 1-D tensors."""
        token = self.tokenizer(sample[key], return_tensors='pt')
        # Collapse any leading batch dimension so each item is a flat sequence.
        token['input_ids'] = token['input_ids'].reshape(-1)
        token['attention_mask'] = token['attention_mask'].reshape(-1)
        return token

    def _encoding(self, key):
        """Return the encodings of field ``key`` for every sample, in order."""
        return [self._encode_sample(sample, key) for sample in self.samples]

    def __getitem__(self, idx):
        """Return the encoded ``'src_text'`` of the sample(s) at ``idx``.

        Only the requested sample is tokenized; encoding the whole dataset on
        every access would make one pass over the data quadratic in its size.

        Args:
            idx: Integer index, or a slice (which yields a list of encodings).

        Raises:
            IndexError: If ``idx`` is out of range. This also terminates
                plain ``for item in dataset`` iteration.
        """
        if isinstance(idx, slice):
            return [self._encode_sample(sample, 'src_text')
                    for sample in self.samples[idx]]
        return self._encode_sample(self.samples[idx], 'src_text')

    def __len__(self):
        """Return the number of samples."""
        return len(self.samples)

In [64]:
# Three toy parallel sentences: source text under 'src_text', target under 'tgt_text'.
samples = [
    dict(src_text='why so serious', tgt_text='왤케 진지빰'),
    dict(src_text='I am a boy', tgt_text='난 소년헤헤'),
    dict(src_text='my mind do it fire', tgt_text='왈왈소리'),
]

dataset = MyDataset(samples=samples, tokenizer=tokenizer)

In [16]:
# Fetch the second sample through normal subscript syntax.
# NOTE(review): the saved output below shows nested 'src_text'/'tgt_text' dicts,
# which the MyDataset.__getitem__ defined above does not return — re-run to refresh.
dataset[1]

{'src_text': {'input_ids': [128022, 203, 257, 8, 24091, 2], 'attention_mask': [1, 1, 1, 1, 1, 1]},
 'tgt_text': {'input_ids': [128022, 56888, 6864, 3639, 118323, 118323, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}}

In [17]:
# A for-loop already calls iter() on its iterable, so the dataset is looped over directly.
# NOTE(review): the saved output below shows nested 'src_text'/'tgt_text' dicts,
# which the MyDataset.__getitem__ defined above does not return — re-run to refresh.
for item in dataset:
    print(item)

{'src_text': {'input_ids': [128022, 120764, 324, 123659, 2], 'attention_mask': [1, 1, 1, 1, 1]}, 'tgt_text': {'input_ids': [128022, 22, 3, 55792, 14258, 1452, 3, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}}
{'src_text': {'input_ids': [128022, 203, 257, 8, 24091, 2], 'attention_mask': [1, 1, 1, 1, 1, 1]}, 'tgt_text': {'input_ids': [128022, 56888, 6864, 3639, 118323, 118323, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}}
{'src_text': {'input_ids': [128022, 1949, 9963, 61, 862, 24923, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}, 'tgt_text': {'input_ids': [128022, 22, 126732, 126732, 3889, 1509, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}}


In [65]:
# Print every encoded item; each one holds 1-D 'input_ids' and 'attention_mask'.
for encoded in dataset:
    print(encoded)

{'input_ids': tensor([128022, 120764,    324, 123659,      2]), 'attention_mask': tensor([1, 1, 1, 1, 1])}
{'input_ids': tensor([128022,    203,    257,      8,  24091,      2]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1])}
{'input_ids': tensor([128022,   1949,   9963,     61,    862,  24923,      2]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1])}


In [68]:
# Put all three samples in one batch; the collator pads them to a common length.
dataloader = DataLoader(
    dataset=dataset,
    collate_fn=data_collator,
    batch_size=3,
)
for batch in dataloader:
    print(batch)

{'input_ids': tensor([[128022, 120764,    324, 123659,      2,      1,      1],
        [128022,    203,    257,      8,  24091,      2,      1],
        [128022,   1949,   9963,     61,    862,  24923,      2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1]])}


### HF Course

In [69]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

checkpoint = "bert-base-uncased"
raw_datasets = load_dataset("glue", "mrpc")
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    """Tokenize the ``sentence1``/``sentence2`` pair(s) in ``example`` with truncation."""
    first, second = example["sentence1"], example["sentence2"]
    return tokenizer(first, second, truncation=True)


data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Tokenize in batches, drop the columns the collator should not see,
# and rename the target column to "labels".
tokenized_datasets = (
    raw_datasets.map(tokenize_function, batched=True)
    .remove_columns(["sentence1", "sentence2", "idx"])
    .rename_column("label", "labels")
)
tokenized_datasets.set_format("torch")
# Not displayed: only the last expression of a cell is echoed.
tokenized_datasets["train"].column_names

train_dataloader = DataLoader(
    dataset=tokenized_datasets["train"],
    batch_size=8,
    shuffle=True,
    collate_fn=data_collator,
)

Reusing dataset glue (/home/jaehoon/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at /home/jaehoon/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-079b17889f69c334.arrow
Loading cached processed dataset at /home/jaehoon/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-7c0047ab1a488ed1.arrow
Loading cached processed dataset at /home/jaehoon/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-2d555fabc6ea4aba.arrow


In [70]:
# Show only the first tokenized training example.
for example in tokenized_datasets['train']:
    print(example)
    break

{'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1]), 'input_ids': tensor([  101,  2572,  3217,  5831,  5496,  2010,  2567,  1010,  3183,  2002,
         2170,  1000,  1996,  7409,  1000,  1010,  1997,  9969,  4487, 23809,
         3436,  2010,  3350,  1012,   102,  7727,  2000,  2032,  2004,  2069,
         1000,  1996,  7409,  1000,  1010,  2572,  3217,  5831,  5496,  2010,
         2567,  1997,  9969,  4487, 23809,  3436,  2010,  3350,  1012,   102]), 'labels': tensor(1), 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1])}


In [71]:
# Show only the first padded batch produced by the DataLoader.
for batch in train_dataloader:
    print(batch)
    break

{'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 