In [14]:
import warnings
warnings.filterwarnings('ignore')

import torch
import pandas as pd 
from datasets import load_dataset, Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [15]:
training_data = pd.read_csv("../../Data/Prepared/DCDD_43K_Parallel_Dataset.csv")
validation_data = pd.read_csv("../../Data/Prepared/DCDD_10K_Parallel_Dataset.csv")

In [16]:
#Convert the pandas Dataframe into Hugging Face Dataset
# dataset = load_dataset("csv", data_files={"train": "../../Data/Prepared/DCDD_43K_Parallel_Dataset.csv"})

#Split the dataset into train (80%) and validation (20%)
# train_test_split = dataset["train"].train_test_split(test_size=0.2)
training_data = Dataset.from_pandas(training_data)
validation_data = Dataset.from_pandas(validation_data)

dataset = DatasetDict({
    "train": training_data,
    "validation": validation_data
})

In [17]:
dataset

DatasetDict({
    train: Dataset({
        features: ['eng', 'dzo'],
        num_rows: 43700
    })
    validation: Dataset({
        features: ['eng', 'dzo'],
        num_rows: 10000
    })
})

In [18]:
model_checkpoint = "facebook/nllb-200-distilled-600M"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, src_lang="eng_Latn", tgt_lang="dzo_Tibt")

In [19]:
print(tokenizer)

NllbTokenizerFast(name_or_path='facebook/nllb-200-distilled-600M', vocab_size=256204, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>', 'additional_special_tokens': ['ace_Arab', 'ace_Latn', 'acm_Arab', 'acq_Arab', 'aeb_Arab', 'afr_Latn', 'ajp_Arab', 'aka_Latn', 'amh_Ethi', 'apc_Arab', 'arb_Arab', 'ars_Arab', 'ary_Arab', 'arz_Arab', 'asm_Beng', 'ast_Latn', 'awa_Deva', 'ayr_Latn', 'azb_Arab', 'azj_Latn', 'bak_Cyrl', 'bam_Latn', 'ban_Latn', 'bel_Cyrl', 'bem_Latn', 'ben_Beng', 'bho_Deva', 'bjn_Arab', 'bjn_Latn', 'bod_Tibt', 'bos_Latn', 'bug_Latn', 'bul_Cyrl', 'cat_Latn', 'ceb_Latn', 'ces_Latn', 'cjk_Latn', 'ckb_Arab', 'crh_Latn', 'cym_Latn', 'dan_Latn', 'deu_Latn', 'dik_Latn', 'dyu_Latn', 'dzo_Tibt', 'ell_Grek', 'eng_Latn', 'epo_Latn', 'est_Latn', 'eus_Latn', 'ewe_Latn', 'fao_Latn', 'pes_Arab'

In [20]:
def tokenize_function(examples):
    # Tokenize input and target texts separately and add them to the model_inputs dictionary
    tokenized_inputs = tokenizer(examples["eng"], padding="max_length", truncation=True, max_length=128)
    tokenized_targets = tokenizer(examples["dzo"], padding="max_length", truncation=True, max_length=128)
    
    # Ensure the "labels" field contains the tokenized targets
    model_inputs = {k: v for k, v in tokenized_inputs.items()}
    model_inputs["labels"] = tokenized_targets["input_ids"]
    return model_inputs

tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/43700 [00:00<?, ? examples/s]

Map: 100%|██████████| 43700/43700 [00:05<00:00, 7564.90 examples/s]
Map: 100%|██████████| 10000/10000 [00:01<00:00, 7283.75 examples/s]


In [21]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['eng', 'dzo', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 43700
    })
    validation: Dataset({
        features: ['eng', 'dzo', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 10000
    })
})

In [22]:
print(tokenized_datasets['train'][0]['input_ids'])
print(len(tokenized_datasets['train'][0]['input_ids']))

[256047, 142413, 6606, 452, 349, 19912, 468, 54621, 109, 9336, 698, 48950, 519, 3525, 199234, 186860, 155, 6741, 3487, 108, 11895, 8355, 2209, 176, 248071, 13527, 248, 108, 14577, 190599, 248075, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
128


In [23]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model_checkpoint)

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

model.to("cuda")

training_args = Seq2SeqTrainingArguments(
    output_dir="../Finetuned Checkpoints/V4/",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=10,
    num_train_epochs=3,
    predict_with_generate=True
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator
)
trainer.train()
model.save_pretrained("../Finetuned Checkpoints/V4/eng_to_dzo_nllb_finetuned_v2")
tokenizer.save_pretrained("../Finetuned Checkpoints/V4/eng_to_dzo_nllb_finetuned_v2")

Epoch,Training Loss,Validation Loss
1,0.1657,0.226107
2,0.1502,0.21939
3,0.1374,0.218375


('../Finetuned Checkpoints/V4/eng_to_dzo_nllb_finetuned_v2/tokenizer_config.json',
 '../Finetuned Checkpoints/V4/eng_to_dzo_nllb_finetuned_v2/special_tokens_map.json',
 '../Finetuned Checkpoints/V4/eng_to_dzo_nllb_finetuned_v2/tokenizer.json')