In [1]:
import warnings
warnings.filterwarnings('ignore')

import torch
import pandas as pd 
from datasets import load_dataset, Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [2]:
data = pd.read_csv("../../Data/Prepared/DCDD_43K_Parallel_Dataset.csv")

In [3]:
data.shape

(43700, 2)

In [4]:
data.columns

Index(['eng', 'dzo'], dtype='object')

In [5]:
data

Unnamed: 0,eng,dzo
0,Construction of the Padtselling Thubten Sherub...,གསར་སྤང་རྫོང་ཁག་ནང་ལུ་ པད་ཚལ་གླིང་ཐུབ་བསྟན་བཤད...
1,Around 80 percent of the works have been compl...,ལཱ་བརྒྱ་ཆ་༨༠ དེ་ཅིག་མཇུག་བསྡུ་སྟེ་ཡོདཔ་ཨིན་མས།
2,Construction of the Lhakhang which began in 20...,སྤྱི་ལོ་༢༠༡༦ ལས་བཞེངས་ནི་འགོ་བཙུགས་ཡོད་པའི་ལྷ་...
3,The Dorji Lopen of the Zhung Dratshang appoint...,གཞུང་གྲྭ་ཚང་གི་རྡོ་རྗེ་སློབ་དཔོན་གྱིས་ ད་རིས་ ...
4,Lam Yeshi is the new Lam of Khujula Goenpa in ...,བླམ་ཡེ་ཤེས་འདི་ དབང་འདུས་ཕོ་བྲང་རྫོང་ཁག་འོག་གི...
...,...,...
43695,He was drafted into the army.,ཁོ་ དམག་མི་ནང་བསྡུ་ནུག།
43696,He was in critical condition.,ཁོ་ ཚབས་ཆེན་གྱི་གནས་སྟངས་ནང་འདུག།
43697,He was innocent of the crime.,ཁོ་ ཁྲིམས་འགལ་འདི་ནང་ ཉེས་པ་མེདཔ་ཨིན་མས།
43698,He was jealous of my success.,ཁོ་ ངེ་གི་གྲུབ་འབྲས་ལུ་ མིག་ཏོ་ཚ་ནུག།


In [23]:
#Convert the pandas Dataframe into Hugging Face Dataset
dataset = load_dataset("csv", data_files={"train": "../../Data/Prepared/DCDD_43K_Parallel_Dataset.csv"})

#Split the dataset into train (80%) and validation (20%)
train_test_split = dataset["train"].train_test_split(test_size=0.2) 

dataset = DatasetDict({
    "train": train_test_split["train"],
    "validation": train_test_split["test"]
})

In [24]:
dataset

DatasetDict({
    train: Dataset({
        features: ['eng', 'dzo'],
        num_rows: 34960
    })
    validation: Dataset({
        features: ['eng', 'dzo'],
        num_rows: 8740
    })
})

In [25]:
model_checkpoint = "facebook/nllb-200-distilled-600M"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, src_lang="eng_Latn", tgt_lang="dzo_Tibt")

In [26]:
print(tokenizer)

NllbTokenizerFast(name_or_path='facebook/nllb-200-distilled-600M', vocab_size=256204, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>', 'additional_special_tokens': ['ace_Arab', 'ace_Latn', 'acm_Arab', 'acq_Arab', 'aeb_Arab', 'afr_Latn', 'ajp_Arab', 'aka_Latn', 'amh_Ethi', 'apc_Arab', 'arb_Arab', 'ars_Arab', 'ary_Arab', 'arz_Arab', 'asm_Beng', 'ast_Latn', 'awa_Deva', 'ayr_Latn', 'azb_Arab', 'azj_Latn', 'bak_Cyrl', 'bam_Latn', 'ban_Latn', 'bel_Cyrl', 'bem_Latn', 'ben_Beng', 'bho_Deva', 'bjn_Arab', 'bjn_Latn', 'bod_Tibt', 'bos_Latn', 'bug_Latn', 'bul_Cyrl', 'cat_Latn', 'ceb_Latn', 'ces_Latn', 'cjk_Latn', 'ckb_Arab', 'crh_Latn', 'cym_Latn', 'dan_Latn', 'deu_Latn', 'dik_Latn', 'dyu_Latn', 'dzo_Tibt', 'ell_Grek', 'eng_Latn', 'epo_Latn', 'est_Latn', 'eus_Latn', 'ewe_Latn', 'fao_Latn', 'pes_Arab'

In [27]:
def tokenize_function(examples):
    # Tokenize input and target texts separately and add them to the model_inputs dictionary
    tokenized_inputs = tokenizer(examples["eng"], padding="max_length", truncation=True, max_length=128)
    tokenized_targets = tokenizer(examples["dzo"], padding="max_length", truncation=True, max_length=128)
    
    # Ensure the "labels" field contains the tokenized targets
    model_inputs = {k: v for k, v in tokenized_inputs.items()}
    model_inputs["labels"] = tokenized_targets["input_ids"]
    return model_inputs

tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 34960/34960 [00:04<00:00, 7402.54 examples/s]
Map: 100%|██████████| 8740/8740 [00:01<00:00, 7160.38 examples/s]


In [28]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['eng', 'dzo', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 34960
    })
    validation: Dataset({
        features: ['eng', 'dzo', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 8740
    })
})

In [29]:
print(tokenized_datasets['train'][0]['input_ids'])
print(len(tokenized_datasets['train'][0]['input_ids']))

[256047, 2820, 248116, 119, 15880, 42098, 248075, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
128


In [30]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model_checkpoint)

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

model.to("cuda")

training_args = Seq2SeqTrainingArguments(
    output_dir="../Finetuned Checkpoints/V4/",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=10,
    num_train_epochs=3,
    predict_with_generate=True
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator
)
trainer.train()
model.save_pretrained("../Finetuned Checkpoints/V4/eng_to_dzo_nllb_finetuned_v2")
tokenizer.save_pretrained("../Finetuned Checkpoints/V4/eng_to_dzo_nllb_finetuned_v2")

Epoch,Training Loss,Validation Loss
1,0.1735,0.1606
2,0.1538,0.149581
3,0.142,0.147199


('../Finetuned Checkpoints/V4/eng_to_dzo_nllb_finetuned_v2/tokenizer_config.json',
 '../Finetuned Checkpoints/V4/eng_to_dzo_nllb_finetuned_v2/special_tokens_map.json',
 '../Finetuned Checkpoints/V4/eng_to_dzo_nllb_finetuned_v2/tokenizer.json')