In [None]:
import torch
import gc
import logging

# Suppress every log record of severity WARNING and below for the whole
# process. The integer constant is passed because that is the documented
# argument type of logging.disable().
logging.disable(logging.WARNING)

# Drop unreachable Python objects, then release cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()

- **pd.read_json 사용 가능하도록 사전 준비**

In [None]:
import pandas as pd
import json

# Step 1: load the raw corpus and keep only its first 1000 entries.
with open('testing.json', encoding='utf-8') as f:
    data = json.load(f)[:1000]

# Step 2: write that subset out as ASCII-escaped JSON so pandas can parse it.
with open('test2.json', 'w') as f:
    f.write(json.dumps(data, ensure_ascii=True))

# Step 3: round-trip through pandas to rewrite the same file as JSON Lines
# (one record per line), the layout needed for chunked reading.
data = pd.read_json('test2.json')
data.to_json('test2.json', orient='records', lines=True)

## Customizing Iterable Dataset

In [None]:
from torch.utils.data import IterableDataset
import pandas as pd
import torch
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

# Name of the pretrained checkpoint the tokenizer is loaded from.
checkpoint = 'facebook/m2m100_418M'
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# `batched` is an argument of `datasets.Dataset.map`, not a tokenizer loading
# option, so it is no longer forwarded to `from_pretrained`.
tokenizer = M2M100Tokenizer.from_pretrained(checkpoint)

class MyDataset(IterableDataset):
    """Stream tokenized translation pairs from a JSON Lines file.

    Every record in the file must provide the fields ``src_lang``,
    ``src_text``, ``tgt_lang`` and ``tgt_text``. The file is read lazily in
    chunks, so it never has to fit in memory as a whole.

    Args:
        path: Path of the JSON Lines file to read.
        tokenizer: Tokenizer object with ``src_lang`` / ``tgt_lang``
            attributes, an ``as_target_tokenizer()`` context manager and a
            ``__call__`` accepting ``return_tensors='pt'``.
        chunksize: Number of records pandas loads per chunk (default 10,
            the value that used to be hard-coded).
    """

    def __init__(self, path, tokenizer, chunksize=10):
        self.tokenizer = tokenizer
        self.path = path
        self.chunksize = chunksize

    def __iter__(self):
        """Yield ``(tokenized_data, length, worker_id)`` for each record.

        ``tokenized_data`` is a dict of 1-D tensors under the keys
        ``input_ids``, ``attention_mask`` and ``labels``; ``length`` lists
        their sizes in that order; ``worker_id`` is the DataLoader worker id,
        or ``-1`` when iterating in the main process.
        """
        # Worker info is constant for the lifetime of this iterator, so it is
        # looked up once instead of once per sample.
        worker = torch.utils.data.get_worker_info()
        if worker is None:
            worker_id, shard, num_shards = -1, 0, 1
        else:
            # Each DataLoader worker gets its own copy of the dataset. Without
            # sharding, every worker would replay the whole file and samples
            # would be duplicated, so records are dealt out round-robin.
            worker_id, shard, num_shards = worker.id, worker.id, worker.num_workers

        reader = pd.read_json(self.path, lines=True, chunksize=self.chunksize)
        record_number = 0
        try:
            for chunk in reader:
                # Iterate over the rows actually present: the last chunk may
                # hold fewer than `chunksize` records.
                for record in chunk.to_dict('records'):
                    belongs_to_this_worker = record_number % num_shards == shard
                    record_number += 1
                    if not belongs_to_this_worker:
                        continue

                    # The language attributes must be set before each call
                    # because the language can differ from record to record.
                    self.tokenizer.src_lang = record['src_lang']
                    inputs = self.tokenizer(record['src_text'], return_tensors='pt')
                    self.tokenizer.tgt_lang = record['tgt_lang']
                    with self.tokenizer.as_target_tokenizer():
                        labels = self.tokenizer(record['tgt_text'], return_tensors='pt').input_ids

                    # Flatten the (1, seq_len) tensors to 1-D.
                    tokenized_data = {
                        'input_ids': inputs['input_ids'].reshape(-1),
                        'attention_mask': inputs['attention_mask'].reshape(-1),
                        'labels': labels.reshape(-1),
                    }
                    length = [
                        len(tokenized_data['input_ids']),
                        len(tokenized_data['attention_mask']),
                        len(tokenized_data['labels']),
                    ]
                    yield tokenized_data, length, worker_id
        finally:
            # Release the file handle even when the consumer stops early.
            reader.close()
                

# Stream the JSON Lines file written earlier in this notebook through the tokenizer.
dataset = MyDataset(path='test2.json', tokenizer=tokenizer)

## Customizing Collate Fn

In [None]:
from torch.utils.data import DataLoader

def collate_fn(data, pad_token_id=0, label_pad_token_id=-100):
    '''Pad variable-length samples into one rectangular batch.

    Args:
        data: Sequence with one ``(sample, lengths, worker_id)`` tuple per
            batch element, where ``sample`` looks like
            ``{'input_ids': tensor([...]), 'attention_mask': tensor([...]),
            'labels': tensor([...])}`` and ``lengths`` looks like
            ``[32, 32, 30]``. Only ``sample`` is used; sizes are read from
            the tensors themselves.
        pad_token_id: Fill value for ``input_ids``. Defaults to 0, the value
            used before. Padded positions are masked out by
            ``attention_mask``; pass ``tokenizer.pad_token_id`` to use the
            tokenizer's real pad id.
        label_pad_token_id: Fill value for ``labels``. Defaults to -100, the
            target value PyTorch's cross-entropy loss ignores by default, so
            padding no longer contributes to the loss. Padding with 0 made
            the model learn to predict token id 0 at every padded position.

    Returns:
        Dict with ``input_ids``, ``attention_mask`` and ``labels`` as int64
        tensors of shape ``(len(data), longest sequence for that key)``.

    Raises:
        ValueError: If ``data`` is empty.
    '''
    samples, _, _ = zip(*data)

    def pad_stack(key, fill_value):
        # Start from a tensor filled with the padding value and copy each
        # sequence into the left part of its row.
        sequences = [sample[key] for sample in samples]
        width = max(len(seq) for seq in sequences)
        padded = torch.full((len(sequences), width), fill_value, dtype=torch.long)
        for row, seq in enumerate(sequences):
            padded[row, :len(seq)] = seq
        return padded

    return {
        'input_ids': pad_stack('input_ids', pad_token_id),
        'attention_mask': pad_stack('attention_mask', 0),
        'labels': pad_stack('labels', label_pad_token_id),
    }

# Batch two samples at a time; collate_fn pads them to a common length.
dataloader = DataLoader(dataset=dataset, batch_size=2, collate_fn=collate_fn)

# Smoke test (uncomment to inspect the first batch):
# for batch in dataloader:
#     print(batch)
#     break

In [None]:
from transformers import AdamW, get_scheduler

model = M2M100ForConditionalGeneration.from_pretrained(checkpoint)
model.to(device)  # move the model weights to the training device
# NOTE(review): newer transformers releases deprecate their own AdamW in
# favour of torch.optim.AdamW — confirm against the installed version.
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 20

# The dataset is an IterableDataset without __len__, so the number of
# optimizer steps per epoch is derived from the JSON Lines file (one record
# per non-empty line) and the DataLoader batch size. This replaces a
# hard-coded 10000 steps per epoch that did not match the data, which skewed
# the linear learning-rate schedule and the progress-bar total.
with open(dataset.path, encoding='utf-8') as f:
    num_samples = sum(1 for line in f if line.strip())
steps_per_epoch = -(-num_samples // dataloader.batch_size)  # ceiling division: the last partial batch is kept
num_training_steps = num_epochs * steps_per_epoch
# num_eval_steps = num_epochs * len(eval_dataloader)
lr_scheduler = get_scheduler('linear',
                            optimizer=optimizer,
                            num_warmup_steps=0,
                            num_training_steps=num_training_steps)

In [None]:
# Colab 에서는 돌아감

from tqdm.notebook import tqdm
progress_bar = tqdm(range(num_training_steps))

model.train()
train_metric = []  # mean training loss of each epoch, in epoch order
for epoch in range(num_epochs):
    train_loss = 0.0
    num_batches = 0
    for batch in dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}  # move the batch to the model's device
        output = model(**batch)
        loss = output.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

        train_loss += loss.item()
        num_batches += 1
    # Average over the batches seen in this epoch. Dividing by
    # num_training_steps (the total across all epochs) under-reported the
    # loss by a factor of the epoch count. max() guards an empty dataloader.
    train_loss /= max(num_batches, 1)
    train_metric.append(train_loss)

## + High Level HF Implementation

In [None]:
from datasets import Dataset
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
from torch.utils.data import DataLoader
import torch

# Name of the pretrained checkpoint the tokenizer is loaded from.
checkpoint = 'facebook/m2m100_418M'
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# `batched` is an argument of `datasets.Dataset.map`, not a tokenizer loading
# option, so it is no longer forwarded to `from_pretrained`.
tokenizer = M2M100Tokenizer.from_pretrained(checkpoint)

def tokenize_function(data):
    """Tokenize translation pairs with the module-level ``tokenizer``.

    Two input layouts are accepted:

    * A mapping of column name to list of values, which is what
      ``Dataset.map(..., batched=True)`` passes in. The result is a dict of
      columns (``input_ids``, ``attention_mask``, ``labels``) holding plain
      Python lists, one entry per input row, as ``Dataset.map`` requires.
    * An iterable of row dicts. The result is a list with one dict of 1-D
      tensors per row.

    Every row must provide ``src_lang``, ``src_text``, ``tgt_lang`` and
    ``tgt_text``.

    Fixes over the earlier version: iterating a batched mapping walked over
    column names instead of rows, and one shared dict was appended for every
    row, so all entries aliased the last sample.
    """
    fields = ('src_lang', 'src_text', 'tgt_lang', 'tgt_text')
    columnar = hasattr(data, 'keys')
    if columnar:
        # Transpose the dict of columns into one dict per row.
        rows = [dict(zip(fields, values)) for values in zip(*(data[name] for name in fields))]
    else:
        rows = list(data)

    samples = []
    for line in rows:
        # Languages are set per row because they may differ between rows.
        tokenizer.src_lang = line['src_lang']
        inputs = tokenizer(line['src_text'], return_tensors='pt')
        tokenizer.tgt_lang = line['tgt_lang']
        with tokenizer.as_target_tokenizer():
            labels = tokenizer(line['tgt_text'], return_tensors='pt').input_ids
        # Build a fresh dict per row so samples never share storage.
        samples.append({
            'input_ids': inputs['input_ids'].reshape(-1),
            'attention_mask': inputs['attention_mask'].reshape(-1),
            'labels': labels.reshape(-1),
        })

    if columnar:
        return {
            key: [sample[key].tolist() for sample in samples]
            for key in ('input_ids', 'attention_mask', 'labels')
        }
    return samples

# Load the raw corpus, tokenize it in batches across four worker processes,
# then split the tokenized rows into train and test subsets.
raw_dataset = Dataset.from_json('testing.json')
tokenized_dataset = raw_dataset.map(
    tokenize_function,
    batched=True,
    num_proc=4,
).train_test_split()