# Lab – Large Language Models (LLM) — Notebook complet (TP)
**Master 2 Informatique – VMI | Multi-modalité et IA générative (IFLCE055)**

Ce notebook suit **exactement** l'énoncé :
1. Inference without training  
2. Linear probing  
3. Fine-tuning  
(+ section "tokens / IDs / embeddings" demandée)

> Dataset: `cornell-movie-review-data/rotten_tomatoes`  
> Modèle: `cardiffnlp/twitter-roberta-base-sentiment-latest`


## 0) Installation & authentification Hugging Face (si nécessaire)


In [3]:
!pip -q install -U transformers datasets evaluate scikit-learn accelerate

#  login interactif
!huggingface-cli login

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/512.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m512.3/512.3 kB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m133.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m

    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  

## 1) Imports & seed


In [4]:
import numpy as np
import pandas as pd
import torch

from datasets import load_dataset
from transformers import (
    pipeline,
    AutoTokenizer,
    AutoModel,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer
)

from sklearn.metrics import classification_report, accuracy_score, f1_score


In [5]:
torch.manual_seed(42)
np.random.seed(42)

device = 0 if torch.cuda.is_available() else -1
print("CUDA:", torch.cuda.is_available(), "| device:", device)


CUDA: True | device: 0


# Part 1 — Inference without training
## 1.1 Charger le dataset


In [6]:
data = load_dataset("rotten_tomatoes")
data


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

train.parquet:   0%|          | 0.00/699k [00:00<?, ?B/s]

validation.parquet:   0%|          | 0.00/90.0k [00:00<?, ?B/s]

test.parquet:   0%|          | 0.00/92.2k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8530 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1066 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1066 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 8530
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
})

In [7]:
# structure
print(data)
print("Splits:", list(data.keys()))
print("Train columns:", data["train"].column_names)
print("Label names (0/1):", set(data["train"]["label"]))


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 8530
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
})
Splits: ['train', 'validation', 'test']
Train columns: ['text', 'label']
Label names (0/1): {0, 1}


In [8]:
# first example
data["train"][0]
from collections import Counter
Counter(data["train"]["label"])


{'text': 'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .',
 'label': 1}

## 1.2 Charger le pipeline sentiment-analysis


In [9]:
clf = pipeline(
    task="sentiment-analysis",
    model="cardiffnlp/twitter-roberta-base-sentiment-latest",
    return_all_scores=True,
    device=device
)

# Petit test
clf("I love this movie!")


config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


model.safetensors:   0%|          | 0.00/501M [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Device set to use cuda:0


[[{'label': 'negative', 'score': 0.003461067797616124},
  {'label': 'neutral', 'score': 0.01182967983186245},
  {'label': 'positive', 'score': 0.9847092628479004}]]

## 1.3 Inférence sur le split test + Classification report


In [10]:
def map_label(label_str: str) -> str:
    # Harmonise différents formats possibles: "LABEL_2", "positive", "POSITIVE", etc.
    l = label_str.strip().lower()
    if "pos" in l:
        return "POSITIVE"
    if "neg" in l:
        return "NEGATIVE"
    if "neu" in l:
        return "NEGATIVE"  # règle choisie
    # fallback: renvoyer tel quel
    return label_str

y_true = []
y_pred = []

for sample in data["test"]:
    scores = clf(sample["text"])[0]  # list of dicts
    best = max(scores, key=lambda x: x["score"])
    pred = map_label(best["label"])

    y_pred.append(pred)
    y_true.append("POSITIVE" if sample["label"] == 1 else "NEGATIVE")

print("Example preds:", list(zip(y_true[:5], y_pred[:5])))


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Example preds: [('POSITIVE', 'POSITIVE'), ('POSITIVE', 'POSITIVE'), ('POSITIVE', 'NEGATIVE'), ('POSITIVE', 'POSITIVE'), ('POSITIVE', 'NEGATIVE')]


In [11]:
print(classification_report(y_true, y_pred, digits=4))


              precision    recall  f1-score   support

    NEGATIVE     0.6821    0.9418    0.7912       533
    POSITIVE     0.9061    0.5610    0.6929       533

    accuracy                         0.7514      1066
   macro avg     0.7941    0.7514    0.7421      1066
weighted avg     0.7941    0.7514    0.7421      1066



In [30]:
from collections import Counter
Counter(data["train"]["label"])


Counter({1: 4265, 0: 4265})

# Affichage Tokens / IDs / Embeddings (exigé)


In [12]:
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
base_model = AutoModel.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")

prompt = "I love this movie!"
inputs = tokenizer(prompt, return_tensors="pt")

with torch.no_grad():
    outputs = base_model(**inputs)

token_embeddings = outputs.last_hidden_state[0]     # (seq_len, hidden_size)
input_ids = inputs["input_ids"][0]                 # (seq_len,)

rows = []
for tid, emb in zip(input_ids, token_embeddings):
    rows.append({
        "Token": tokenizer.decode(tid),
        "ID": int(tid),
        "Embedding": emb.cpu().numpy()
    })

df_tokens = pd.DataFrame(rows)
df_tokens


Unnamed: 0,Token,ID,Embedding
0,<s>,0,"[-0.057010423, 0.67449325, -0.24589822, -0.401..."
1,I,100,"[-0.08691577, 0.25903338, -0.22076899, 0.10727..."
2,love,657,"[-0.3427853, 0.48895428, 0.054818332, 0.023184..."
3,this,42,"[-0.18056506, 0.20334326, 0.14085312, -0.06437..."
4,movie,1569,"[-0.3546601, 0.29174262, 0.101843774, -0.03130..."
5,!,328,"[-0.11802462, 0.16298954, 0.20535679, 0.113920..."
6,</s>,2,"[-0.057030816, 0.6745283, -0.24589291, -0.4016..."


> Note: la colonne `Embedding` contient des vecteurs longs (≈ 768 dim).  
Tu peux afficher seulement les 5 premières dimensions pour lisibilité :


In [13]:
df_small = df_tokens.copy()
df_small["Embedding"] = df_small["Embedding"].apply(lambda v: v[:5])
df_small


Unnamed: 0,Token,ID,Embedding
0,<s>,0,"[-0.057010423, 0.67449325, -0.24589822, -0.401..."
1,I,100,"[-0.08691577, 0.25903338, -0.22076899, 0.10727..."
2,love,657,"[-0.3427853, 0.48895428, 0.054818332, 0.023184..."
3,this,42,"[-0.18056506, 0.20334326, 0.14085312, -0.06437..."
4,movie,1569,"[-0.3546601, 0.29174262, 0.101843774, -0.03130..."
5,!,328,"[-0.11802462, 0.16298954, 0.20535679, 0.113920..."
6,</s>,2,"[-0.057030816, 0.6745283, -0.24589291, -0.4016..."


# Part 2 — Linear Probing
Objectif: geler le LLM et entraîner **seulement** la tête de classification.


## 2.1 Préparation : tokenisation du dataset


In [14]:
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")

def tokenize_batch(batch):
    return tokenizer(batch["text"], truncation=True)

tokenized = data.map(tokenize_batch, batched=True)
tokenized


Map:   0%|          | 0/8530 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/1066 [00:00<?, ? examples/s]

Map:   0%|          | 0/1066 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 8530
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 1066
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 1066
    })
})

In [15]:
# Data collator (padding dynamique)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


## 2.2 Modèle + gel des couches (freeze)



In [16]:
from transformers import AutoModelForSequenceClassification

model_lp = AutoModelForSequenceClassification.from_pretrained(
    "cardiffnlp/twitter-roberta-base-sentiment-latest",
    num_labels=2,
    ignore_mismatched_sizes=True,  # <-- clé
)

# Freeze tout sauf la tête
for param in model_lp.base_model.parameters():
    param.requires_grad = False

trainable = sum(p.numel() for p in model_lp.parameters() if p.requires_grad)
total = sum(p.numel() for p in model_lp.parameters())
print(f"Trainable params: {trainable:,} / {total:,} ({100*trainable/total:.4f}%)")


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpo

Trainable params: 592,130 / 124,647,170 (0.4750%)


## 2.3 Entraînement Trainer + évaluation


In [17]:
from sklearn.metrics import precision_recall_fscore_support

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds)
    return {"accuracy": acc, "f1": f1}

from transformers import TrainingArguments

training_args_lp = TrainingArguments(
    output_dir="./results_linear_probing",
    learning_rate=5e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.0,
    logging_steps=50
)


trainer_lp = Trainer(
    model=model_lp,
    args=training_args_lp,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer_lp.train()


  trainer_lp = Trainer(
  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 3


[34m[1mwandb[0m: You chose "Don't visualize my results"


Step,Training Loss
50,0.4601
100,0.4547
150,0.3989
200,0.4251
250,0.3986
300,0.4507
350,0.4173
400,0.4273
450,0.4034
500,0.4082


TrainOutput(global_step=1602, training_loss=0.4005081894134314, metrics={'train_runtime': 148.0905, 'train_samples_per_second': 172.8, 'train_steps_per_second': 10.818, 'total_flos': 635885976372000.0, 'train_loss': 0.4005081894134314, 'epoch': 3.0})

In [18]:
# Évaluation
lp_test = trainer_lp.evaluate(tokenized["test"])
lp_test


{'eval_loss': 0.37845900654792786,
 'eval_accuracy': 0.8358348968105066,
 'eval_f1': 0.8350612629594723,
 'eval_runtime': 6.0174,
 'eval_samples_per_second': 177.153,
 'eval_steps_per_second': 5.65,
 'epoch': 3.0}

In [19]:
metrics_val = trainer_lp.evaluate()
print("VAL:", metrics_val)

metrics_test = trainer_lp.evaluate(tokenized["test"])
print("TEST:", metrics_test)


VAL: {'eval_loss': 0.33616429567337036, 'eval_accuracy': 0.8630393996247655, 'eval_f1': 0.8632958801498127, 'eval_runtime': 3.0247, 'eval_samples_per_second': 352.432, 'eval_steps_per_second': 11.241, 'epoch': 3.0}
TEST: {'eval_loss': 0.37845900654792786, 'eval_accuracy': 0.8358348968105066, 'eval_f1': 0.8350612629594723, 'eval_runtime': 3.0654, 'eval_samples_per_second': 347.755, 'eval_steps_per_second': 11.092, 'epoch': 3.0}


# Part 3 — Fine-Tuning
 Objectif : entraîner tout le modèle (ou partiellement).

## 3.1 Fine-tuning complet (aucune couche gelée)


In [20]:
from transformers import AutoModelForSequenceClassification

model_ft = AutoModelForSequenceClassification.from_pretrained(
    "cardiffnlp/twitter-roberta-base-sentiment-latest",
    num_labels=2,
    ignore_mismatched_sizes=True
)

trainable = sum(p.numel() for p in model_ft.parameters() if p.requires_grad)
total = sum(p.numel() for p in model_ft.parameters())
print(f"Trainable params: {trainable:,} / {total:,} ({100*trainable/total:.4f}%)")


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpo

Trainable params: 124,647,170 / 124,647,170 (100.0000%)


In [21]:
from transformers import TrainingArguments

training_args_ft = TrainingArguments(
    output_dir="./results_finetune_full",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=50
)


trainer_ft = Trainer(
    model=model_ft,
    args=training_args_ft,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer_ft.train()


  trainer_ft = Trainer(


Step,Training Loss
50,0.4592
100,0.4089
150,0.349
200,0.3243
250,0.3263
300,0.336
350,0.3202
400,0.3338
450,0.3201
500,0.3073


TrainOutput(global_step=1602, training_loss=0.23189121090368683, metrics={'train_runtime': 443.3574, 'train_samples_per_second': 57.719, 'train_steps_per_second': 3.613, 'total_flos': 635885976372000.0, 'train_loss': 0.23189121090368683, 'epoch': 3.0})

In [22]:
# Évaluation
ft_test = trainer_ft.evaluate(tokenized["test"])
ft_test

{'eval_loss': 0.5735285878181458,
 'eval_accuracy': 0.8827392120075047,
 'eval_f1': 0.880838894184938,
 'eval_runtime': 3.0387,
 'eval_samples_per_second': 350.809,
 'eval_steps_per_second': 11.189,
 'epoch': 3.0}

In [23]:
print("Validation:")
print(trainer_ft.evaluate())

print("Test:")
print(trainer_ft.evaluate(tokenized["test"]))

Validation:


{'eval_loss': 0.47964048385620117, 'eval_accuracy': 0.8921200750469043, 'eval_f1': 0.892623716153128, 'eval_runtime': 3.0211, 'eval_samples_per_second': 352.853, 'eval_steps_per_second': 11.254, 'epoch': 3.0}
Test:
{'eval_loss': 0.5735285878181458, 'eval_accuracy': 0.8827392120075047, 'eval_f1': 0.880838894184938, 'eval_runtime': 3.0996, 'eval_samples_per_second': 343.912, 'eval_steps_per_second': 10.969, 'epoch': 3.0}


## 3.2 Fine-tuning partiel : geler quelques couches de l’encodeur
(Étape 2 de la Part 3)

Ici on gèle les **2 premières couches** de l’encodeur.  
Sur RoBERTa, les couches ressemblent souvent à `roberta.encoder.layer.<i>`.
On applique une règle robuste via `named_parameters()`.


In [24]:
model_pf = AutoModelForSequenceClassification.from_pretrained(
    "cardiffnlp/twitter-roberta-base-sentiment-latest",
    ignore_mismatched_sizes=True
)

# Geler les 2 premières couches
for name, param in model_pf.named_parameters():
    if "encoder.layer.0" in name or "encoder.layer.1" in name:
        param.requires_grad = False

trainable = sum(p.numel() for p in model_pf.parameters() if p.requires_grad)
total = sum(p.numel() for p in model_pf.parameters())
print(f"Trainable params: {trainable:,} / {total:,} ({100*trainable/total:.4f}%)")


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Trainable params: 96,296,451 / 124,647,939 (77.2547%)


In [25]:
from transformers import TrainingArguments

training_args_pf = TrainingArguments(
    output_dir="./results_finetune_partial",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=50
)


trainer_pf = Trainer(
    model=model_pf,
    args=training_args_pf,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer_pf.train()


  trainer_pf = Trainer(


Step,Training Loss
50,0.4501
100,0.3968
150,0.36
200,0.3432
250,0.3268
300,0.3478
350,0.3214
400,0.3341
450,0.3175
500,0.3216


TrainOutput(global_step=1602, training_loss=0.24347048968113913, metrics={'train_runtime': 368.2697, 'train_samples_per_second': 69.487, 'train_steps_per_second': 4.35, 'total_flos': 635891685735600.0, 'train_loss': 0.24347048968113913, 'epoch': 3.0})

In [26]:
# Évaluation test
pf_test = trainer_pf.evaluate(tokenized["test"])
pf_test


{'eval_loss': 0.5418431162834167,
 'eval_accuracy': 0.875234521575985,
 'eval_f1': 0.87248322147651,
 'eval_runtime': 3.0959,
 'eval_samples_per_second': 344.331,
 'eval_steps_per_second': 10.982,
 'epoch': 3.0}

In [27]:
print("Validation:")
print(trainer_pf.evaluate())

print("Test:")
print(trainer_pf.evaluate(tokenized["test"]))


Validation:


{'eval_loss': 0.450612872838974, 'eval_accuracy': 0.8939962476547842, 'eval_f1': 0.8950789229340761, 'eval_runtime': 3.0138, 'eval_samples_per_second': 353.71, 'eval_steps_per_second': 11.282, 'epoch': 3.0}
Test:
{'eval_loss': 0.5418431162834167, 'eval_accuracy': 0.875234521575985, 'eval_f1': 0.87248322147651, 'eval_runtime': 3.0705, 'eval_samples_per_second': 347.177, 'eval_steps_per_second': 11.073, 'epoch': 3.0}


# 4) Comparaison finale (Part 1 vs Part 2 vs Part 3)



In [28]:
print("Linear probing (test):", lp_test)
print("Fine-tune full (test):", ft_test)
print("Fine-tune partial (test):", pf_test)

Linear probing (test): {'eval_loss': 0.37845900654792786, 'eval_accuracy': 0.8358348968105066, 'eval_f1': 0.8350612629594723, 'eval_runtime': 6.0174, 'eval_samples_per_second': 177.153, 'eval_steps_per_second': 5.65, 'epoch': 3.0}
Fine-tune full (test): {'eval_loss': 0.5735285878181458, 'eval_accuracy': 0.8827392120075047, 'eval_f1': 0.880838894184938, 'eval_runtime': 3.0387, 'eval_samples_per_second': 350.809, 'eval_steps_per_second': 11.189, 'epoch': 3.0}
Fine-tune partial (test): {'eval_loss': 0.5418431162834167, 'eval_accuracy': 0.875234521575985, 'eval_f1': 0.87248322147651, 'eval_runtime': 3.0959, 'eval_samples_per_second': 344.331, 'eval_steps_per_second': 10.982, 'epoch': 3.0}
