# Fine-tuning the text classification model

In [3]:
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [4]:
# Imports
import os
import numpy as np
from datasets import load_dataset, load_metric
from tqdm import tqdm
import torch

from transformers import (
    AutoTokenizer, 
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments, 
    Seq2SeqTrainer
)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Settings
model_checkpoint = "Helsinki-NLP/opus-mt-en-fr"
max_length = 128
source_lang = "en"
target_lan = "fr"

# Data preparation

In [87]:
dataset = load_dataset("kde4", lang1=source_lang, lang2=target_lan, trust_remote_code=True)

In [23]:
print(f"DATASET TYPE: {type(dataset)}")
print(f"DATASET INFO: {dataset.items()}")
print(f"DATASET COL NAMES: {dataset.column_names}")

DATASET TYPE: <class 'datasets.dataset_dict.DatasetDict'>
DATASET INFO: dict_items([('train', Dataset({
    features: ['id', 'translation'],
    num_rows: 210173
}))])
DATASET COL NAMES: {'train': ['id', 'translation']}


In [24]:
split_datasets = dataset["train"].train_test_split(train_size=0.9, seed=20)

In [25]:
split_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 189155
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 21018
    })
})

In [26]:
split_datasets["validation"] = split_datasets.pop("test")

In [27]:
split_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 189155
    })
    validation: Dataset({
        features: ['id', 'translation'],
        num_rows: 21018
    })
})

## Examination of train/test examples

In [93]:
split_datasets["train"][1]["translation"]

{'en': 'Default to expanded threads',
 'fr': 'Par défaut, développer les fils de discussion'}

In [94]:
split_datasets["validation"][1]["translation"]

{'en': 'Customize Formatting', 'fr': 'Personnaliser le formatage'}

In [99]:
for i in range(0, 50):
    print(split_datasets["validation"][i]["translation"])

{'en': 'User and Group Permissions', 'fr': "Droits d'accès de l'utilisateur et du groupe"}
{'en': 'Customize Formatting', 'fr': 'Personnaliser le formatage'}
{'en': 'This filter will apply a grayish look to the icon. Click Setup... to configure the intensity of this filter. Note that it is customary for most user interfaces to use this effect for disabled icons only.', 'fr': "Ce filtre appliquera un ton gris à l'icône. Cliquez Configurer... pour configurer l'intensité de ce filtre. Remarquez qu'il est courant pour la plupart des interfaces utilisateurs d'utiliser cet effet pour désactiver seulement les icônes."}
{'en': '%1: Failed to schedule after early start. Negative float=%2', 'fr': '%1 & #160;: impossible de planifier après le démarrage anticipé. Marge négative=%2'}
{'en': 'The next step in the wizard is to select whether to store the certificate in a file or send it directly to a & ca;. You will have to specify the filename or email address to send the certificate request to.', '

# Tokenizer

In [28]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt")

In [29]:
def preprocess_function(examples):
    inputs = [ex["en"] for ex in examples["translation"]]
    targets = [ex["fr"] for ex in examples["translation"]]
    model_inputs = tokenizer(
        inputs, text_target=targets, max_length=max_length, truncation=True
    )
    return model_inputs

In [30]:
tokenized_datasets = split_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=split_datasets["train"].column_names,
)

Map: 100%|█| 189155/189155 [00:19<00:00, 
Map: 100%|█| 21018/21018 [00:02<00:00, 93


In [31]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 189155
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 21018
    })
})

# Base model

In [18]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint) 

In [19]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="pt")

In [20]:
batch = data_collator([tokenized_datasets["train"][i] for i in range(1, 3)])
batch.keys()

dict_keys(['input_ids', 'attention_mask', 'labels', 'decoder_input_ids'])

In [21]:
batch["labels"]

tensor([[  577,  5891,     2,  3184,    16,  2542,     5,  1710,     0,  -100,
          -100,  -100,  -100,  -100,  -100,  -100],
        [ 1211,     3,    49,  9409,  1211,     3, 29140,   817,  3124,   817,
           550,  7032,  5821,  7907, 12649,     0]])

In [22]:
batch["decoder_input_ids"]

tensor([[59513,   577,  5891,     2,  3184,    16,  2542,     5,  1710,     0,
         59513, 59513, 59513, 59513, 59513, 59513],
        [59513,  1211,     3,    49,  9409,  1211,     3, 29140,   817,  3124,
           817,   550,  7032,  5821,  7907, 12649]])

In [23]:
tokenized_datasets["train"]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 189155
})

In [24]:
tokenized_datasets["validation"]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 21018
})

In [38]:
def postprocess_text(preds: list, labels: list) -> tuple:
    """Performs post processing on the prediction text and labels"""

    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels


def compute_metrics(eval_preds: tuple) -> dict:
    """computes bleu score and other performance metrics """

    metric = load_metric("sacrebleu")
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

    preds, labels = eval_preds

    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]

    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}

    return result

In [39]:
model_args = Seq2SeqTrainingArguments(
    f"{model_checkpoint}-finetuned-{source_lang}-to-{target_lan}",
    evaluation_strategy="epoch",
    learning_rate=2e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.02,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True
)

In [40]:
trainer = Seq2SeqTrainer(
    model,
    model_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [41]:
trainer.train()

Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,1.1039,1.265297,46.2701,14.6208
2,0.9271,1.06554,49.5692,15.3653
3,0.6556,0.976362,53.227,15.1121


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}


TrainOutput(global_step=70935, training_loss=0.9738361365364748, metrics={'train_runtime': 7888.3575, 'train_samples_per_second': 71.937, 'train_steps_per_second': 8.992, 'total_flos': 6052512803586048.0, 'train_loss': 0.9738361365364748, 'epoch': 3.0})

In [42]:
model_args_2 = Seq2SeqTrainingArguments(
    f"{model_checkpoint}-finetuned-{source_lang}-to-{target_lan}_2",
    evaluation_strategy="epoch",
    learning_rate=2e-6,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.02,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True
)
trainer_2 = Seq2SeqTrainer(
    model,
    model_args_2,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)
trainer_2.train()

Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,0.4009,0.99735,53.642,15.0515
2,0.3767,0.998905,53.633,15.0929
3,0.6023,1.001612,53.5724,15.0708


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}


TrainOutput(global_step=70935, training_loss=0.42000702155974934, metrics={'train_runtime': 8007.4456, 'train_samples_per_second': 70.867, 'train_steps_per_second': 8.859, 'total_flos': 6052512803586048.0, 'train_loss': 0.42000702155974934, 'epoch': 3.0})

In [43]:
model_args_3 = Seq2SeqTrainingArguments(
    f"{model_checkpoint}-finetuned-{source_lang}-to-{target_lan}_3",
    evaluation_strategy="epoch",
    learning_rate=2e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.02,
    save_total_limit=3,
    num_train_epochs=6,
    predict_with_generate=True
)
trainer_3 = Seq2SeqTrainer(
    model,
    model_args_3,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)
trainer_3.train()

Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,1.1088,1.326754,44.2527,14.224
2,1.0135,1.189795,46.8585,15.4934
3,0.8813,1.104855,50.1752,14.7153
4,0.6843,1.047095,52.0816,15.1871
5,0.5007,1.025684,53.8655,14.9982
6,0.357,1.026634,54.5389,15.0712


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}


TrainOutput(global_step=141870, training_loss=0.7543482693807766, metrics={'train_runtime': 15624.3198, 'train_samples_per_second': 72.639, 'train_steps_per_second': 9.08, 'total_flos': 1.2110411474731008e+16, 'train_loss': 0.7543482693807766, 'epoch': 6.0})

In [44]:
model_args_4 = Seq2SeqTrainingArguments(
    f"{model_checkpoint}-finetuned-{source_lang}-to-{target_lan}_4",
    evaluation_strategy="epoch",
    learning_rate=2e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.02,
    save_total_limit=3,
    num_train_epochs=10,
    predict_with_generate=True
)
trainer_4 = Seq2SeqTrainer(
    model,
    model_args_4,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],tokenize
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)
trainer_4.train()

Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,1.0529,1.305142,46.2635,14.8345
2,0.9775,1.230409,46.2903,15.3757
3,0.8953,1.179044,48.2515,14.9831
4,0.7847,1.147221,49.5961,15.1395
5,0.6676,1.12911,50.9564,14.9688
6,0.5747,1.106718,52.2019,15.0094
7,0.4796,1.104587,52.9134,15.0704
8,0.3668,1.116773,53.7643,15.0362
9,0.2585,1.134115,53.8917,15.1109
10,0.1769,1.156149,54.4786,15.1086


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}


TrainOutput(global_step=236450, training_loss=0.5959363181456423, metrics={'train_runtime': 26584.714, 'train_samples_per_second': 71.152, 'train_steps_per_second': 8.894, 'total_flos': 2.0189238766534656e+16, 'train_loss': 0.5959363181456423, 'epoch': 10.0})

# Using the model

In [32]:
my_model_checkpoint = "Helsinki-NLP/opus-mt-en-fr-finetuned-en-to-fr_3/checkpoint-141500"

my_model = AutoModelForSeq2SeqLM.from_pretrained(my_model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(my_model_checkpoint, return_tensors="pt")

In [33]:
text = "Hello, my name is Leo!"
tokenized_text = tokenizer(text, return_tensors="pt")
result = my_model.generate(**tokenized_text)
tokenizer.decode(result[0], skip_special_tokens=True)

'Bonjour, mon nom est Leo & #160;!'

## Using the model on a larger corpus

In [49]:
test_sentences = [
    "Lions are known as the kings of the jungle due to their majestic appearance.",
    "Elephants are the largest land mammals on Earth, known for their long trunks and big ears.",
    "Dolphins are highly intelligent marine mammals that often display playful behavior.",
    "Kangaroos are marsupials native to Australia and are known for their powerful hind legs and pouches.",
    "Penguins are flightless birds that spend most of their lives in the water and are excellent swimmers.",
    "Giraffes have long necks that allow them to reach high leaves in trees, making them the tallest animals on land.",
    "Butterflies undergo a remarkable transformation from caterpillars to beautiful, colorful insects.",
    "Cheetahs are the fastest land animals, capable of reaching speeds up to 60 miles per hour.",
    "Whales are the largest animals on Earth, with some species growing to over 100 feet in length.",
    "Honeybees play a vital role in pollinating plants and are known for their complex hive structures."
]    

In [129]:
all_translations = []

for sentence in tqdm(test_sentences):
    tokenized_sentence = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
    raw_translation = my_model.generate(**tokenized_sentence)
    translation = tokenizer.decode(raw_translation[0], skip_special_tokens=True)
    all_translations.append(translation.replace("« & #160;", "").replace("& #160; »", "").replace("& #160;", "").replace("  ", " "))

all_translations

100%|████████████████████████████████████████████████████████████████████| 10/10 [00:05<00:00,  1.93it/s]


["Les Lunes sont connues comme étant la valeur de l'effet de leur présence d'accolade.",
 'Les élements sont les plus grands moment de vie sur la Terre, connus pour leurs longues pauses et sa grande myrinthe.',
 'Les Dolphins sont un comportement très intelligent et on peut y voir souvent le comportement playable.',
 "Kangaroos est un marasicien natif à l'Australie et sont connus pour leur puissant hen-tête hen-hung and yes.",
 "Les Penguins sont des volumineux qui passent de la plupart de leurs vies dans l'eau et sont d'excellentes averses.",
 "Les combrés de Giraffe ont des colonnes longues qui permettent d'atteindre les grands sauts d'arborescence, en leur rendant les plus grands utility sur les immeubles.",
 "L' grâce à des franges qui s'évaluent d'une transformation de l'épingle de lune enroulée pour rendre agréables, colorées. Name_BAR_plasma contain white spaces contain white spaces and non latin1 characters.",
 "Cheetahs sont les YSTAbout Devices, capable d'atteindre des vitess

## Using the model on a sentences batch

In [139]:
tokenized_batch = tokenizer(test_sentences, return_tensors="pt", padding=True, truncation=True)
translated_batch = my_model.generate(**tokenized_batch)
tokenizer.batch_decode(translated_batch, skip_special_tokens=True)

["Les Lunes sont connues comme étant la valeur de l'effet de leur présence d'accolade.",
 'Les élements sont les plus grands moment de vie sur la Terre, connus pour leurs longues pauses et sa grande myrinthe.',
 'Les Dolphins sont un comportement très intelligent et on peut y voir souvent le comportement playable.',
 "Kangaroos est un marasicien natif à l'Australie et sont connus pour leur puissant hen-tête hen-hung and yes.",
 "Les Penguins sont des volumineux qui passent de la plupart de leurs vies dans l'eau et sont d'excellentes averses.",
 "Les combrés de Giraffe ont des colonnes longues qui permettent d'atteindre les grands sauts d'arborescence, en leur rendant les plus grands utility sur les immeubles.",
 "L' grâce à des franges qui s'évaluent d'une transformation de l'épingle de lune enroulée pour rendre agréables, colorées. Name_BAR_plasma contain white spaces contain white spaces and non latin1 characters.",
 "Cheetahs sont les YSTAbout Devices, capable d'atteindre des vitess