## HF A to Z : Trainer 사용

#### tokenize_function
> Map-style 사용을 위한 함수  
- tokenizer 실행시 메모리 관리를 위해 **함수**를 만들고 **map**을 사용한다
  
#### DataCollatorWithPadding  
  
>동적 패딩방식
- **배치마다 가장 큰 seq_len를 기준**으로, 배치별로 서로 다르게 패딩을 한다

In [27]:
import gc
gc.collect()

4065

In [49]:
import logging
logging.disable(logging.WARNING)

In [19]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification
from datasets import load_metric
import numpy as np
import torch

checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# map-style을 활용하여 **메모리를 절약**하기 위해 func을 만든다
def tokenize_function(example):
    return tokenizer(example['sentence1'], example['sentence2'], truncation=True)

def compute_metrics(eval_preds):
    metric = load_metric('glue', 'mrpc')
    logits, labels = eval_preds
    preds = np.argmax(logits, axis=-1)
    return metric.compute(predictions=preds, references=labels)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer) # 동적 패딩
training_args = TrainingArguments('test-trainer',
                                  evaluation_strategy='epoch', # 저장 경로 test-trainer
                                  per_device_train_batch_size=4,
                                  per_device_eval_batch_size=4) 
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

raw_dataset = load_dataset('glue', 'mrpc')
tokenized_dataset = raw_dataset.map(tokenize_function, batched=True)

trainer = Trainer(model,
                 training_args,
                 train_dataset=tokenized_dataset['train'],
                 eval_dataset=tokenized_dataset['validation'],
                 data_collator=data_collator,
                 tokenizer=tokenizer,
                 compute_metrics=compute_metrics)

trainer.train()

  0%|          | 0/3 [00:00<?, ?it/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6405,0.469604,0.781863,0.833645
2,0.5806,0.72063,0.821078,0.873043
3,0.4016,0.739696,0.840686,0.889643


TrainOutput(global_step=2751, training_loss=0.5209580161102639, metrics={'train_runtime': 488.3194, 'train_samples_per_second': 22.534, 'train_steps_per_second': 5.634, 'total_flos': 377309475606720.0, 'train_loss': 0.5209580161102639, 'epoch': 3.0})

## HF A to Z : Pytorch 사용

- metric 함수를 따로 만들지 않아도 된다  
  
  
- dataset의 필요한 col만 추출해서 직접 DataLoader에 태워야 한다


In [14]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import TrainingArguments
from transformers import AutoModelForSequenceClassification
from datasets import load_metric
import numpy as np
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def tokenize_function(example):
    return tokenizer(example['sentence1'], example['sentence2'], truncation=True)

training_args = TrainingArguments('del-dir', evaluation_strategy='epoch')
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
model.to(device) # model-gpu

from torch.utils.data import DataLoader

raw_dataset = load_dataset('glue', 'mrpc')
tokenized_datasets = raw_dataset.map(tokenize_function, batched=True)

# col : sentence1  sentence2  idx  label  attention_mask  input_ids  token_type_ids
tokenized_datasets = tokenized_datasets.remove_columns(['sentence1', 'sentence2', 'idx'])
tokenized_datasets = tokenized_datasets.rename_column('label', 'labels')
tokenized_datasets.set_format('torch')

train_dataloader = DataLoader(tokenized_datasets['train'], shuffle=True, batch_size=4, collate_fn=data_collator)
eval_dataloader = DataLoader(tokenized_datasets['validation'], batch_size=4, collate_fn=data_collator)

  0%|          | 0/3 [00:00<?, ?it/s]

In [51]:
from transformers import AdamW, get_scheduler

optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
num_eval_steps = num_epochs * len(eval_dataloader)
lr_scheduler = get_scheduler('linear',
                            optimizer=optimizer,
                            num_warmup_steps=0,
                            num_training_steps=num_training_steps)

In [52]:
from tqdm.notebook import tqdm

metrics = {'train':[], 'validation':[]}

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    train_loss = 0.0
    for batch in train_dataloader:
        batch = {k:v.to(device) for k, v in batch.items()} # data-gpu
        output = model(**batch)
        loss = output.loss
        loss.backward()
        
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
        
        train_loss += loss.item()
    train_loss /= num_training_steps
    metrics['train'] = train_loss


progress_bar = tqdm(range(num_eval_steps))
metric = load_metric('glue', 'mrpc')
model.eval()
for epoch in range(num_epochs):
    for batch in eval_dataloader:
        batch = {k:v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            output = model(**batch)
        logits = output.logits
        preds = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=preds, references=batch['labels'])
        progress_bar.update(1)
    metric.compute()
    metrics['validation'] = metric

  0%|          | 0/2751 [00:00<?, ?it/s]

  0%|          | 0/306 [00:00<?, ?it/s]

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  if data_struct.dtype == np.object:  # pytorch tensors cannot be instantied from an array of objects
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  if data_struct.dtype == np.object:  # pytorch tensors cannot be instantied from an array of objects


In [59]:
metric = metrics['validation']
metric

Metric(name: "glue", features: {'predictions': Value(dtype='int64', id=None), 'references': Value(dtype='int64', id=None)}, usage: """
Compute GLUE evaluation metric associated to each GLUE dataset.
Args:
    predictions: list of predictions to score.
        Each translation should be tokenized into a list of tokens.
    references: list of lists of references for each translation.
        Each reference should be tokenized into a list of tokens.
Returns: depending on the GLUE subset, one or several of:
    "accuracy": Accuracy
    "f1": F1 score
    "pearson": Pearson Correlation
    "spearmanr": Spearman Correlation
    "matthews_correlation": Matthew Correlation
Examples:

    >>> glue_metric = datasets.load_metric('glue', 'sst2')  # 'sst2' or any of ["mnli", "mnli_mismatched", "mnli_matched", "qnli", "rte", "wnli", "hans"]
    >>> references = [0, 1]
    >>> predictions = [0, 1]
    >>> results = glue_metric.compute(predictions=predictions, references=references)
    >>> print(res

## HF A to Z : Accelerator + Pytorch 사용

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import TrainingArguments
from transfomrers import AutoModelForSequenceClassification
from datasets import load_metric
import numpy as np
import torch

checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def tokenize_function(example):
    return tokenize(example['sentence1'], example['sentence2'], truncation=True)

training_args = TrainingArguments('del-dir', evalutation_strategy='epoch')
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

raw_dataset = load_dataset('glue', 'mrpc')
tokenized_datasets = raw_dataset.map(tokenize_function, batched=True)


from torch.utils.data import DataLoader

tokenized_datasets = tokenized_datasets.remove_columns(['sentence1', 'sentence2', 'idx'])
tokenized_datasets = tokenzied_datasets.rename_column('label', 'labels')
tokenized_datasets.set_format('torch')

train_dataloader = DataLoader(tokenized_datasets['train'], shuffle=True, batch_size=4, collate_fn=data_collator)
eval_dataloader = DataLoader(tokenized_datasets['validation'], batch_size=4, collate_fn=data_collator)

In [None]:
from transformer import AdamW, get_scheduler
from accelerate import Accelerator

optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 3
num_training_steps = epoch * len(train_dataloader)
num_eval_steps = epoch * len(eval_dataloader)
lr_scheduler = get_scheduler('linear',
                            optimizer=optimizer,
                            num_warmup_steps=0,
                            num_training_steps=num_training_steps)

accelerator = Accelerator()
train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare(train_dataloader,
                                                                         eval_dataloader,
                                                                         model,
                                                                         optimizer)