#### Trainer provide easily train or fine-tune transformer models
<div>
<img src="image/trainer1.png" width=800>
</div>

#### Preprocessing the GLUE MRPC dataset using dynamic padding

In [6]:
from transformers import AutoTokenizer, DataCollatorWithPadding
from datasets import load_dataset

raw_datasets = load_dataset("glue", "mrpc")
print(raw_datasets.column_names)
print(raw_datasets.num_rows)
print(raw_datasets)
checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(examples):
    return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer)

{'train': ['sentence1', 'sentence2', 'label', 'idx'], 'validation': ['sentence1', 'sentence2', 'label', 'idx'], 'test': ['sentence1', 'sentence2', 'label', 'idx']}
{'train': 3668, 'validation': 408, 'test': 1725}
DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})




#### Set the model and training arguments before creating the trainer

In [7]:
from transformers import AutoModelForSequenceClassification, TrainingArguments

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

training_args = TrainingArguments(
    output_dir="test-trainer",
    per_device_train_batch_size=16,
    per_gpu_eval_batch_size=16,
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### We can then pass everything to teh ***Trainer*** class and start training

In [13]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)
trainer.train()


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mqiyaoxue[0m ([33mqiyaoxue-university-of-pittsburgh[0m). Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss


TrainOutput(global_step=290, training_loss=0.31281146345467403, metrics={'train_runtime': 45.8426, 'train_samples_per_second': 400.064, 'train_steps_per_second': 6.326, 'total_flos': 824195380915200.0, 'train_loss': 0.31281146345467403, 'epoch': 5.0})

### The ***predict*** methdod allow us to get the predictions of our model on a whole dataset, the metrics can be computed based on the prediction

In [19]:
predictions = trainer.predict(tokenized_datasets["validation"])
print(predictions)
print(predictions.predictions.shape, predictions.label_ids.shape)

Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.


Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.


PredictionOutput(predictions=array([[-1.6088499 ,  1.5501962 ],
       [ 2.1464198 , -1.6830001 ],
       [ 1.8887107 , -1.7898929 ],
       [-1.5972127 ,  1.5385838 ],
       [ 1.774671  , -1.4977522 ],
       [-1.5907019 ,  1.5072527 ],
       [-1.5342594 ,  1.4334662 ],
       [-1.4576026 ,  1.2536845 ],
       [-1.5580225 ,  1.4583511 ],
       [-1.5831745 ,  1.5435964 ],
       [-1.5711699 ,  1.5502669 ],
       [ 1.1144499 , -1.3078934 ],
       [ 2.1658502 , -1.9159585 ],
       [-1.3888607 ,  1.1904985 ],
       [-1.6151892 ,  1.551818  ],
       [ 1.1762064 , -1.4395137 ],
       [-1.610661  ,  1.5660318 ],
       [ 0.243424  , -0.82058716],
       [-1.6221073 ,  1.5586569 ],
       [-0.4881922 ,  0.12062579],
       [ 1.7689115 , -1.564496  ],
       [-1.5147029 ,  1.3050042 ],
       [-0.516321  ,  0.12758185],
       [-1.6041856 ,  1.5550395 ],
       [-1.6243608 ,  1.5212587 ],
       [ 2.0356233 , -1.7156696 ],
       [-0.35949177, -0.04092238],
       [-1.5880136 ,  1.56

In [24]:
import numpy as np
from datasets import load_metric

metric = load_metric("glue", "mrpc")
preds = np.argmax(predictions.predictions, axis=-1)
metric.compute(predictions=preds, references=predictions.label_ids)

{'accuracy': 0.8357843137254902, 'f1': 0.8854700854700854}

#### To monitor metrics during training, we need to define a compute_metrics function and pass it to the ***Trainer***

In [25]:
metric = load_metric("glue", "mrpc")

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

training_args = TrainingArguments(output_dir="test-trainer", evaluation_strategy="epoch")
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

trainer = Trainer(
    model=model,
    TrainingArguments=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)