In [1]:
!pip install transformers[torch] accelerate -U
!pip install datasets
!pip install rouge_score
!pip install sacrebleu
!pip install evaluate
import warnings
warnings.filterwarnings("ignore")

Collecting transformers[torch]
  Downloading transformers-4.40.2-py3-none-any.whl (9.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.30.0-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.4/302.4 kB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->transformers[torch])
  Using

In [2]:
import os
import torch
import random
import evaluate
import transformers
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.nn.functional as F
from typing import Optional
from dataclasses import dataclass
from time import perf_counter
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset, disable_progress_bar
from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    EarlyStoppingCallback
)

disable_progress_bar()


In [3]:
@dataclass
class Config:
    cache_dir: str = "./translation"
    data_dir: str = os.path.join(cache_dir)
    source_lang: str = "eng"
    target_lang: str = "hing"

    batch_size: int = 16
    num_workers: int = 4
    seed: int = 42
    max_source_length: int = 32
    max_target_length: int = 32

    lr: float = 0.0005
    weight_decay: float = 0.01
    epochs: int = 5
    device: torch.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model_checkpoint: str = "google/mt5-small"

    def __post_init__(self):
        random.seed(self.seed)
        np.random.seed(self.seed)
        torch.manual_seed(self.seed)
        torch.cuda.manual_seed_all(self.seed)

In [4]:
config = Config()


In [5]:
import pandas as pd
train_df = pd.read_csv('train_combined.csv')
val_df = pd.read_csv('val_combined.csv')
test_df = pd.read_csv('test_combined.csv')

In [6]:
from datasets import Dataset, DatasetDict,Features, Value

features = Features({"eng": Value("string"), "hing": Value("string")})

# Create DatasetDict
dataset_dict = DatasetDict({
    "train": Dataset.from_pandas(train_df, features=features),
    "val": Dataset.from_pandas(val_df, features=features),
    "test": Dataset.from_pandas(test_df, features=features)
})

dataset_dict

DatasetDict({
    train: Dataset({
        features: ['eng', 'hing'],
        num_rows: 16119
    })
    val: Dataset({
        features: ['eng', 'hing'],
        num_rows: 1883
    })
    test: Dataset({
        features: ['eng', 'hing'],
        num_rows: 960
    })
})

In [7]:
sample = dataset_dict["train"][0]
sample

{'eng': 'hi', 'hing': 'hi'}

In [8]:
rouge_score = evaluate.load("rouge", cache_dir=config.cache_dir)
bleu_score = evaluate.load("bleu", cache_dir=config.cache_dir)
sacrebleu_score = evaluate.load("sacrebleu", cache_dir=config.cache_dir)

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [9]:
tokenizer = AutoTokenizer.from_pretrained(config.model_checkpoint, cache_dir=config.cache_dir)

model_name = config.model_checkpoint.split("/")[-1]
fine_tuned_model_checkpoint = os.path.join(
    config.cache_dir,
    f"{model_name}_{config.source_lang}-{config.target_lang}",
    "checkpoint-4500"
)
if os.path.isdir(fine_tuned_model_checkpoint):
    do_train = False
    model = AutoModelForSeq2SeqLM.from_pretrained(fine_tuned_model_checkpoint, cache_dir=config.cache_dir)
else:
    do_train = True
    model = AutoModelForSeq2SeqLM.from_pretrained(config.model_checkpoint, cache_dir=config.cache_dir)

print("number of parameters:", model.num_parameters())

tokenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

number of parameters: 300176768


In [10]:
def batch_tokenize_fn(examples):
    sources = examples[config.source_lang]
    targets = examples[config.target_lang]
    model_inputs = tokenizer(sources, max_length=config.max_source_length, truncation=True)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=config.max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [11]:
dataset_dict_tokenized = dataset_dict.map(
    batch_tokenize_fn,
    batched=True,
    remove_columns=dataset_dict["train"].column_names
)
dataset_dict_tokenized

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 16119
    })
    val: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1883
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 960
    })
})

In [12]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

features = [dataset_dict_tokenized["train"][i] for i in range(2)]
output = data_collator(features)
output

{'input_ids': tensor([[  1823,      1,      0,      0,      0,      0,      0],
        [  2119, 150684,    634,   3031,    521,   2354,      1]]), 'attention_mask': tensor([[1, 1, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1]]), 'labels': tensor([[ 1823,     1,  -100,  -100,  -100,  -100,  -100,  -100],
        [24993,   405, 54129,   266, 13194,   269, 54096,     1]]), 'decoder_input_ids': tensor([[    0,  1823,     1,     0,     0,     0,     0,     0],
        [    0, 24993,   405, 54129,   266, 13194,   269, 54096]])}

In [13]:
model_name = config.model_checkpoint.split("/")[-1]
output_dir = os.path.join(config.cache_dir, f"{model_name}_{config.source_lang}-{config.target_lang}")

training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="steps",
    learning_rate=config.lr,
    per_device_train_batch_size=config.batch_size,
    per_device_eval_batch_size=config.batch_size,
    weight_decay=config.weight_decay,
    save_total_limit=2,
    num_train_epochs=config.epochs,
    predict_with_generate=True,
    load_best_model_at_end=True,
    greater_is_better=True,
    metric_for_best_model="rougeL",
    gradient_accumulation_steps=40,
    do_train=do_train,
    fp16=False
)

In [14]:
from nltk.translate.bleu_score import sentence_bleu
weights = [(1, 0, 0, 0),
           (0.5, 0.5),
           (0.33, 0.33, 0.33, 0),
           (0.25, 0.25, 0.25, 0.25),
           (0.2, 0.2, 0.2, 0.2, 0.2),
           (0.16, 0.16, 0.16, 0.16, 0.16, 0.16),
           (0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14),
           (0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125)]

def bleu_score(refs, candidates):
  score = [0] * len(weights)
  for i in range(len(refs)):
    for j, w in enumerate(weights):
      score[j] += sentence_bleu([refs[i]], candidates[i], weights=w)

  for i in range(len(score)):
    score[i] = score[i] / len(refs)
    score[i] = round(score[i], 6)
  return sum(score) / len(score)


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # Decode generated summaries, which is in ids into text
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    # Decode labels, a.k.a. reference summaries into text
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = rouge_score.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        rouge_types=["rouge1", "rouge2", "rougeL"]
    )
    score = sacrebleu_score.compute(
        predictions=decoded_preds,
        references=decoded_labels
    )
    bluee= bleu_score(decoded_preds,decoded_labels)
    result["sacrebleu"] = score["score"]
    result['bleu'] = bluee
    return {k: round(v, 4) for k, v in result.items()}

In [15]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
trainer = Seq2SeqTrainer(
    model,
    args=training_args,
    train_dataset=dataset_dict_tokenized["train"],
    eval_dataset=dataset_dict_tokenized["val"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback()]
)

In [16]:
# should take around 4117.78 seconds on a single V100 GPU
if trainer.args.do_train:
    os.environ["DISABLE_MLFLOW_INTEGRATION"] = "TRUE"
    t1_start = perf_counter()
    train_output = trainer.train()
    t1_stop = perf_counter()
    print("Training elapsed time:", t1_stop - t1_start)

    # saving the model which allows us to leverage
    # .from_pretrained(model_path)
    trainer.save_model(fine_tuned_model_checkpoint)
    print(train_output)

Step,Training Loss,Validation Loss


Training elapsed time: 1066.789826912
TrainOutput(global_step=125, training_loss=5.8176689453125, metrics={'train_runtime': 1066.3108, 'train_samples_per_second': 75.583, 'train_steps_per_second': 0.117, 'total_flos': 2586316117370880.0, 'train_loss': 5.8176689453125, 'epoch': 4.9603174603174605})


In [17]:
trainer.evaluate()


{'eval_loss': 3.2499141693115234,
 'eval_rouge1': 0.3228,
 'eval_rouge2': 0.101,
 'eval_rougeL': 0.2974,
 'eval_sacrebleu': 4.8032,
 'eval_bleu': 0.3318,
 'eval_runtime': 50.4445,
 'eval_samples_per_second': 37.328,
 'eval_steps_per_second': 2.339,
 'epoch': 4.9603174603174605}

In [18]:
def generate_translation(model, tokenizer, example):
    """print out the source, target and predicted raw text."""
    source = example[config.source_lang]
    target = example[config.target_lang]
    input_ids = tokenizer(source)["input_ids"]
    input_ids = torch.LongTensor(input_ids).view(1, -1).to(model.device)
    generated_ids = model.generate(input_ids, max_new_tokens=20)
    prediction = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    return prediction

In [19]:
example = dataset_dict['val'][2]
source = example['eng']
target = example['hing']
prediction=generate_translation(model, tokenizer, example)
print('source: ', source)
print('target: ', target)
print('prediction: ', prediction)

source:  Just a moment. I have to read the document.
target:  Just a moment. Mujhe document read karna padega.
prediction:  Mujhe a moment. Mujhe document read kiya.


In [20]:
# test_translated = []
# for sent in dataset_dict['test']:
#   out=generate_translation(model, tokenizer, sent)
#   test_translated.append(out)


In [21]:
# import pandas as pd

# # Load your CSV file into a DataFrame
# df = pd.read_csv('test_combined.csv')


# # Add the new array as a third column to the DataFrame
# df['third_column'] = test_translated

# # Write the DataFrame back to a new CSV file
# df.to_csv('test_results.csv', index=False)


In [22]:
def evaluate_on_test_set(model, tokenizer, test_dataset):
    # Tokenize test dataset
    rouge_scores1=[]
    rouge_scores2=[]
    rouge_scoresl=[]
    bleu_scores=[]
    sacrebleu_scores=[]
    test_dataset_tokenized = test_dataset.map(
        batch_tokenize_fn,
        batched=True,
        remove_columns=test_dataset.column_names
    )

    # Evaluate model on test dataset
    results = trainer.evaluate(eval_dataset=test_dataset_tokenized)

    # Compute ROUGE, BLEU, and sacreBLEU scores
    rouge_score1 = results["eval_rouge1"]
    rouge_score2 = results["eval_rouge2"]
    rouge_scorel = results["eval_rougeL"]
    bleu_score = results["eval_bleu"]
    sacrebleu_score = results["eval_sacrebleu"]

    # Print or log the scores
    # print("ROUGE Scores:", rouge_scores)
    # print("BLEU Score:", bleu_score)
    # print("SacreBLEU Score:", sacrebleu_score)
    rouge_scores1.append(rouge_score1)
    rouge_scores2.append(rouge_score2)
    rouge_scoresl.append(rouge_scorel)
    bleu_scores.append(bleu_score)
    sacrebleu_scores.append(sacrebleu_score)
    return rouge_scores1,rouge_scores2,rouge_scoresl,bleu_scores,sacrebleu_scores
# Assuming you have a test dataset called test_dataset
test_dataset = dataset_dict["test"]
rouge_scores1,rouge_scores2,rouge_scoresl,bleu_scores,sacrebleu_scores=evaluate_on_test_set(model, tokenizer, test_dataset)


In [23]:
print("on test set: ")
print("RoUGE Score 1: ",np.mean(rouge_scores1))
print("RoUGE Score 2: ",np.mean(rouge_scores2))
print("RoUGE Score L: ",np.mean(rouge_scoresl))
print("BlEU Score: ",np.mean(bleu_scores))
print("SACREBLEU Score: ",np.mean(sacrebleu_scores))

on test set: 
RoUGE Score 1:  0.294
RoUGE Score 2:  0.0809
RoUGE Score L:  0.2746
BlEU Score:  0.3166
SACREBLEU Score:  4.444


In [24]:
example = dataset_dict['test'][2]
source = example['eng']
target = example['hing']
prediction=generate_translation(model, tokenizer, example)
print('source: ', source)
print('target: ', target)
print('prediction: ', prediction)

source:  Hello. How are you? I am not entirely sure about what question to ask, so I'll just ask: do you think the critics were fair in their critique of the movie?
target:  Hello. Kaise ho? Main ekdum sure nahi hun ki kya question puchoon, to main bas yahi puchunga : kya tumhe lagta hai critics is movie ka critique karne mein sahi they?
prediction:  hello. Kya tumhe lagta hai ki tumhe critics fair hai?


In [28]:
example = dataset_dict['test'][3]
source = example['eng']
target = example['hing']
prediction=generate_translation(model, tokenizer, example)
print('source: ', source)
print('target: ', target)
print('prediction: ', prediction)

source:  I agree with them that Ruffalo was great in the movie. 
target:  Main agree karta hun Ruffalo movie mein great tha
prediction:  Mujhe agree karta hai ki Ruffalo movie ke liye great hai
