In [9]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd 
from transformers import AutoTokenizer
from datasets import load_dataset, Dataset, DatasetDict

In [10]:
# Read the prepared English-Dzongkha parallel CSV into a DataFrame for inspection.
data = pd.read_csv(
    filepath_or_buffer="../../Data/Prepared/DCDD_43K_Parallel_Dataset.csv",
)

In [11]:
# Show the (rows, columns) shape of the loaded frame.
data.shape

(43700, 2)

In [12]:
# Show the column labels of the loaded frame.
data.columns

Index(['eng', 'dzo'], dtype='object')

In [13]:
# Bare last expression: renders the frame as this cell's output.
data

Unnamed: 0,eng,dzo
0,Construction of the Padtselling Thubten Sherub...,གསར་སྤང་རྫོང་ཁག་ནང་ལུ་ པད་ཚལ་གླིང་ཐུབ་བསྟན་བཤད...
1,Around 80 percent of the works have been compl...,ལཱ་བརྒྱ་ཆ་༨༠ དེ་ཅིག་མཇུག་བསྡུ་སྟེ་ཡོདཔ་ཨིན་མས།
2,Construction of the Lhakhang which began in 20...,སྤྱི་ལོ་༢༠༡༦ ལས་བཞེངས་ནི་འགོ་བཙུགས་ཡོད་པའི་ལྷ་...
3,The Dorji Lopen of the Zhung Dratshang appoint...,གཞུང་གྲྭ་ཚང་གི་རྡོ་རྗེ་སློབ་དཔོན་གྱིས་ ད་རིས་ ...
4,Lam Yeshi is the new Lam of Khujula Goenpa in ...,བླམ་ཡེ་ཤེས་འདི་ དབང་འདུས་ཕོ་བྲང་རྫོང་ཁག་འོག་གི...
...,...,...
43695,He was drafted into the army.,ཁོ་ དམག་མི་ནང་བསྡུ་ནུག།
43696,He was in critical condition.,ཁོ་ ཚབས་ཆེན་གྱི་གནས་སྟངས་ནང་འདུག།
43697,He was innocent of the crime.,ཁོ་ ཁྲིམས་འགལ་འདི་ནང་ ཉེས་པ་མེདཔ་ཨིན་མས།
43698,He was jealous of my success.,ཁོ་ ངེ་གི་གྲུབ་འབྲས་ལུ་ མིག་ཏོ་ཚ་ནུག།


In [14]:
# Load the parallel corpus directly from the CSV as a Hugging Face DatasetDict.
# (This reads the file again; it does not convert the pandas frame `data`.)
dataset = load_dataset("csv", data_files={"train": "../../Data/Prepared/DCDD_43K_Parallel_Dataset.csv"})

# Split into train (80%) and validation (20%). The shuffle is seeded so that
# Restart & Run All reproduces the same partition and validation metrics stay
# comparable between runs.
train_test_split = dataset["train"].train_test_split(test_size=0.2, seed=42)

# Re-key the held-out "test" portion as "validation".
dataset = DatasetDict({
    "train": train_test_split["train"],
    "validation": train_test_split["test"]
})

In [17]:
model_checkpoint = "facebook/nllb-200-distilled-600M"

# NLLB marks every sequence with a language-code token, so the translation
# direction has to be declared on the tokenizer: English (Latin script) as the
# source and Dzongkha (Tibetan script) as the target.
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
    src_lang="eng_Latn",
    tgt_lang="dzo_Tibt",
)

def tokenize_function(examples, max_length=128, src_lang="eng_Latn", tgt_lang="dzo_Tibt"):
    """Tokenize a batch of English/Dzongkha sentence pairs for seq2seq training.

    Args:
        examples: Batch mapping with an "eng" list (source sentences) and a
            "dzo" list (target sentences), as passed by ``map(batched=True)``.
        max_length: Truncation limit, in tokens, applied to both sides.
        src_lang: Language code set on the tokenizer for the source text.
        tgt_lang: Language code set on the tokenizer for the target text.

    Returns:
        A plain dict holding the tokenizer's source-side fields plus a
        "labels" field with the tokenized targets.
    """
    # Pin the translation direction here so the labels are encoded with the
    # target-language code no matter how the global tokenizer was constructed.
    tokenizer.src_lang = src_lang
    tokenizer.tgt_lang = tgt_lang

    # `text_target` makes the tokenizer encode the targets in target mode and
    # return them under "labels". No static padding is applied: padding the
    # labels with real pad ids would make the loss score padding positions.
    # The seq2seq data collator pads each batch dynamically instead.
    encoded = tokenizer(
        examples["eng"],
        text_target=examples["dzo"],
        truncation=True,
        max_length=max_length,
    )
    return dict(encoded)

# Tokenize both splits; batched mode hands tokenize_function lists of rows.
tokenized_datasets = dataset.map(
    function=tokenize_function,
    batched=True,
)
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import AutoModelForSeq2SeqLM

# Load the pretrained seq2seq model first: the collator below needs the model
# object itself, not the checkpoint name.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# The collator's `model` argument is meant to be the model being trained, so
# it can prepare decoder inputs from the labels. The checkpoint string that
# was passed before has no such method, so the argument had no effect.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# NOTE: the recorded run of this cell failed with
# "ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`".
# Install accelerate>=0.26.0 in the kernel environment before running.
training_args = Seq2SeqTrainingArguments(
    output_dir="../Finetuned Checkpoints/",
    evaluation_strategy="epoch",  # evaluate on the validation split once per epoch
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=10,  # cap on the number of checkpoints kept in output_dir
    num_train_epochs=3,
    predict_with_generate=True  # evaluate with generate() rather than a plain forward pass
)

# Wire everything together: hyper-parameters and model, the tokenizer and
# collator used for batching, and the two tokenized splits.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    eval_dataset=tokenized_datasets["validation"],
    train_dataset=tokenized_datasets["train"],
)
# Run fine-tuning, then write the model and the tokenizer into the same
# output directory.
trainer.train()
for artifact in (model, tokenizer):
    artifact.save_pretrained("eng_to_dzo_nllb_finetuned")

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>=0.26.0'`