In [1]:
import json
with open('testing.json', encoding='utf-8') as f:
    data = json.load(f)
    data = data[:10000]

In [2]:
import torch
import gc
gc.collect()
torch.cuda.empty_cache()

In [3]:
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
from torch.utils.data import Dataset, DataLoader

checkpoint = 'facebook/m2m100_418M'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = M2M100ForConditionalGeneration.from_pretrained(checkpoint)
model.to(device)
tokenizer = M2M100Tokenizer.from_pretrained(checkpoint, batched=True)

def tokenize_function(data):
    samples = []
    tokenized_data = {'input_ids':None, 'attention_mask':None, 'labels':None}
    for line in data:
        tokenizer.src_lang = line['src_lang']
        inputs = tokenizer(line['src_text'], return_tensors='pt')
        tokenizer.tgt_lang = line['tgt_lang']
        with tokenizer.as_target_tokenizer():
            labels = tokenizer(line['tgt_text'], return_tensors='pt').input_ids
        tokenized_data['input_ids'] = inputs['input_ids'].reshape(-1)
        tokenized_data['attention_mask'] = inputs['attention_mask'].reshape(-1)
        tokenized_data['labels'] = labels.reshape(-1)
        samples.append(tokenized_data)
    return samples

tokenized_data = tokenize_function(data)
del data

In [4]:
tokenized_data[0:2]

[{'input_ids': tensor([128052,    244,  14653,  20104,  16258,  44668,  28012,   8990,   3660,
            4813,  52890,   5448,   8726,   1452,  36617,     12,    522,  12700,
            2973, 119686,    668,  27253,  12689,   7796,   3639,    526,   7375,
            4463,  12094,   5757,  56083,  29147,  45680,  96238,   1452,  17736,
          107521,      4,   5117,   7796,   3639,   1028,  67777,    273,  34452,
             526,  28012,   8990,   3660,  50838,   1384,  68882,    949,   5448,
            8726,  36169,  63767,  10340,      5,      2]),
  'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
  'labels': tensor([128022,  53191,   1197,    441,  38997,    432,   1197,     33,  14653,
            3900,  15945,   3593,    193,   1197,   2594,   1006,   1713,    963,
             432,   5273,  3342

In [4]:
class MyDataset(Dataset):
    def __init__(self, samples):
        super().__init__()
        self.samples = samples
    def __len__(self):
        return len(self.samples)
    def __getitem__(self, idx):
        return self.samples[idx]

# def collate_fn_pad(batch):

from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
dataset = MyDataset(tokenized_data)
dataloader = DataLoader(dataset, shuffle=True, batch_size=2, collate_fn=data_collator)

In [5]:
from transformers import AdamW, get_scheduler
from transformers import TrainingArguments

num_epoch = 10
num_tot_steps = num_epoch * len(dataloader)
optimizer = AdamW(model.parameters(), lr=5e-5)
lr_scheduler = get_scheduler('linear', optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_tot_steps)

In [6]:
from tqdm.notebook import tqdm

pbar = tqdm(range(num_tot_steps))

model.train()
for epoch in range(num_epoch):
    train_loss = 0.0
    for batch in dataloader:
        batch = {k:v.to(device) for k, v in batch.items()}
        output = model(**batch)
        loss = output.loss
        loss.backward()
        
        optimizer.step()
        lr_scheduler.step()
        
        train_loss += loss.item()
        pbar.update(1)
    train_loss /= num_tot_steps
    print(f'train loss : {train_loss}')

  0%|          | 0/50000 [00:00<?, ?it/s]

RuntimeError: CUDA out of memory. Tried to allocate 502.00 MiB (GPU 0; 6.00 GiB total capacity; 4.43 GiB already allocated; 44.50 MiB free; 4.58 GiB reserved in total by PyTorch)

## Train / Test Split 추가

In [None]:
from datasets import Dataset

raw_dataset = Dataset.from_json('testing.json')
tokenized_dataset = raw_dataset.map(tokenize_function, batched=True, num_proc=4)
tokenized_dataset = tokenized_dataset.train_test_split()

Using custom data configuration default-f560f2d793950f4e


Downloading and preparing dataset json/default to C:\Users\CPB06GameN\.cache\huggingface\datasets\json\default-f560f2d793950f4e\0.0.0...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]