In [1]:
!pip install transformers[torch] accelerate -U
!pip install datasets
!pip install rouge_score
!pip install sacrebleu
!pip install evaluate
import warnings
warnings.filterwarnings("ignore")


Collecting transformers[torch]
  Downloading transformers-4.40.2-py3-none-any.whl (9.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.30.0-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.4/302.4 kB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->transformers[torch])
  Using

In [3]:
import os
import torch
import random
import evaluate
import transformers
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.nn.functional as F
from typing import Optional
from dataclasses import dataclass
from time import perf_counter
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset, disable_progress_bar
from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    EarlyStoppingCallback,
    MBartForConditionalGeneration,
    MBart50TokenizerFast
)

disable_progress_bar()


In [4]:
@dataclass
class Config:
    cache_dir: str = "./translation"
    data_dir: str = os.path.join(cache_dir)
    source_lang: str = "eng"
    target_lang: str = "hing"

    batch_size: int = 16
    num_workers: int = 4
    seed: int = 42
    max_source_length: int = 32
    max_target_length: int = 32

    lr: float = 0.0005
    weight_decay: float = 0.01
    epochs: int = 5
    device: torch.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model_checkpoint: str = "facebook/mbart-large-50-many-to-many-mmt"

    def __post_init__(self):
        random.seed(self.seed)
        np.random.seed(self.seed)
        torch.manual_seed(self.seed)
        torch.cuda.manual_seed_all(self.seed)

In [5]:
config = Config()
config

Config(cache_dir='./translation', data_dir='./translation', source_lang='eng', target_lang='hing', batch_size=16, num_workers=4, seed=42, max_source_length=32, max_target_length=32, lr=0.0005, weight_decay=0.01, epochs=3, device=device(type='cuda'), model_checkpoint='facebook/mbart-large-50-many-to-many-mmt')

In [6]:
import pandas as pd
train_df = pd.read_csv('train_combined.csv')
val_df = pd.read_csv('val_combined.csv')
test_df = pd.read_csv('test_combined.csv')

In [7]:
from datasets import Dataset, DatasetDict,Features, Value

features = Features({"eng": Value("string"), "hing": Value("string")})

# Create DatasetDict
dataset_dict = DatasetDict({
    "train": Dataset.from_pandas(train_df, features=features),
    "val": Dataset.from_pandas(val_df, features=features),
    "test": Dataset.from_pandas(test_df, features=features)
})

dataset_dict

DatasetDict({
    train: Dataset({
        features: ['eng', 'hing'],
        num_rows: 16119
    })
    val: Dataset({
        features: ['eng', 'hing'],
        num_rows: 1883
    })
    test: Dataset({
        features: ['eng', 'hing'],
        num_rows: 960
    })
})

In [8]:
sample = dataset_dict["train"][0]
sample

{'eng': 'hi', 'hing': 'hi'}

In [9]:
rouge_score = evaluate.load("rouge", cache_dir=config.cache_dir)
bleu_score = evaluate.load("bleu", cache_dir=config.cache_dir)
sacrebleu_score = evaluate.load("sacrebleu", cache_dir=config.cache_dir)

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [13]:
tokenizer = MBart50TokenizerFast.from_pretrained(config.model_checkpoint, cache_dir=config.cache_dir)

model_name = config.model_checkpoint.split("/")[-1]
fine_tuned_model_checkpoint = os.path.join(
    config.cache_dir,
    f"{model_name}_{config.source_lang}-{config.target_lang}",
    "checkpoint-4500"
)
if os.path.isdir(fine_tuned_model_checkpoint):
    do_train = False
    model = MBartForConditionalGeneration.from_pretrained(fine_tuned_model_checkpoint, cache_dir=config.cache_dir)
else:
    do_train = True
    model = MBartForConditionalGeneration.from_pretrained(config.model_checkpoint, cache_dir=config.cache_dir)

print("number of parameters:", model.num_parameters())

tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

number of parameters: 610879488


In [14]:
def batch_tokenize_fn(examples):
    
    sources = examples[config.source_lang]
    targets = examples[config.target_lang]
    model_inputs = tokenizer(sources, max_length=config.max_source_length, truncation=True)

    labels = tokenizer(targets, max_length=config.max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [15]:
dataset_dict_tokenized = dataset_dict.map(
    batch_tokenize_fn,
    batched=True,
    remove_columns=dataset_dict["train"].column_names
)
dataset_dict_tokenized

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 16119
    })
    val: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1883
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 960
    })
})

In [16]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

features = [dataset_dict_tokenized["train"][i] for i in range(2)]
output = data_collator(features)
output

{'input_ids': tensor([[250004,   1274,      2,      1,      1,      1,      1,      1,      1],
        [250004,   2367,    931,    686,    478,   6777,    398,   1957,      2]]), 'attention_mask': tensor([[1, 1, 1, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'labels': tensor([[250004,   1274,      2,   -100,   -100,   -100,   -100,   -100,   -100],
        [250004,  10029,     86,   1939,    172,  14277,      8,  52568,      2]]), 'decoder_input_ids': tensor([[     2, 250004,   1274,      2,      1,      1,      1,      1,      1],
        [     2, 250004,  10029,     86,   1939,    172,  14277,      8,  52568]])}

In [17]:
model_name = config.model_checkpoint.split("/")[-1]
output_dir = os.path.join(config.cache_dir, f"{model_name}_{config.source_lang}-{config.target_lang}")

args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="steps",
    learning_rate=config.lr,
    per_device_train_batch_size=config.batch_size,
    per_device_eval_batch_size=config.batch_size,
    weight_decay=config.weight_decay,
    save_total_limit=2,
    num_train_epochs=config.epochs,
    predict_with_generate=True,
    load_best_model_at_end=True,
    greater_is_better=True,
    metric_for_best_model="rougeL",
    gradient_accumulation_steps=40,
    do_train=do_train,
    fp16=False
)

In [18]:
from nltk.translate.bleu_score import sentence_bleu
weights = [(1, 0, 0, 0),
           (0.5, 0.5),
           (0.33, 0.33, 0.33, 0),
           (0.25, 0.25, 0.25, 0.25),
           (0.2, 0.2, 0.2, 0.2, 0.2),
           (0.16, 0.16, 0.16, 0.16, 0.16, 0.16),
           (0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14),
           (0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125)]

def bleu_score(refs, candidates):
  score = [0] * len(weights)
  for i in range(len(refs)):
    for j, w in enumerate(weights):
      score[j] += sentence_bleu([refs[i]], candidates[i], weights=w)

  for i in range(len(score)):
    score[i] = score[i] / len(refs)
    score[i] = round(score[i], 6)
  return sum(score) / len(score)


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = rouge_score.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        rouge_types=["rouge1", "rouge2", "rougeL"]
    )
    score = sacrebleu_score.compute(
        predictions=decoded_preds,
        references=decoded_labels
    )
    bluee= bleu_score(decoded_preds,decoded_labels)
    result["sacrebleu"] = score["score"]
    result['bleu'] = bluee
    return {k: round(v, 4) for k, v in result.items()}

In [19]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=dataset_dict_tokenized["train"],
    eval_dataset=dataset_dict_tokenized["val"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback()]
)

In [20]:
if trainer.args.do_train:
    os.environ["DISABLE_MLFLOW_INTEGRATION"] = "TRUE"
    t1_start = perf_counter()
    train_output = trainer.train()
    t1_stop = perf_counter()
    print("Training elapsed time:", t1_stop - t1_start)
    trainer.save_model(fine_tuned_model_checkpoint)
    print(train_output)

Step,Training Loss,Validation Loss


Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}


Training elapsed time: 1721.231703568
TrainOutput(global_step=75, training_loss=3.05042236328125, metrics={'train_runtime': 1720.9072, 'train_samples_per_second': 28.1, 'train_steps_per_second': 0.044, 'total_flos': 3153275396554752.0, 'train_loss': 3.05042236328125, 'epoch': 2.9761904761904763})


In [21]:
trainer.evaluate()


{'eval_loss': 2.43168568611145,
 'eval_rouge1': 0.4453,
 'eval_rouge2': 0.1851,
 'eval_rougeL': 0.4165,
 'eval_sacrebleu': 12.0542,
 'eval_bleu': 0.4678,
 'eval_runtime': 220.0887,
 'eval_samples_per_second': 8.556,
 'eval_steps_per_second': 0.536,
 'epoch': 2.9761904761904763}

In [22]:
def generate_translation(model, tokenizer, example):
    """print out the source, target and predicted raw text."""
    source = example[config.source_lang]
    target = example[config.target_lang]
    input_ids = tokenizer(source)["input_ids"]
    input_ids = torch.LongTensor(input_ids).view(1, -1).to(model.device)
    generated_ids = model.generate(input_ids, max_new_tokens=20)
    prediction = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    return prediction

In [23]:
example = dataset_dict['val'][2]
source = example['eng']
target = example['hing']
prediction=generate_translation(model, tokenizer, example)
print('source: ', source)
print('target: ', target)
print('prediction: ', prediction)

source:  Just a moment. I have to read the document.
target:  Just a moment. Mujhe document read karna padega.
prediction:  Just a moment. Mujhe document padhna padega.


In [24]:
def evaluate_on_test_set(model, tokenizer, test_dataset):
    # Tokenize test dataset
    rouge_scores1=[]
    rouge_scores2=[]
    rouge_scoresl=[]
    bleu_scores=[]
    sacrebleu_scores=[]
    test_dataset_tokenized = test_dataset.map(
        batch_tokenize_fn,
        batched=True,
        remove_columns=test_dataset.column_names
    )

    # Evaluate model on test dataset
    results = trainer.evaluate(eval_dataset=test_dataset_tokenized)

    # Compute ROUGE, BLEU, and sacreBLEU scores
    rouge_score1 = results["eval_rouge1"]
    rouge_score2 = results["eval_rouge2"]
    rouge_scorel = results["eval_rougeL"]
    bleu_score = results["eval_bleu"]
    sacrebleu_score = results["eval_sacrebleu"]

    # Print or log the scores
    # print("ROUGE Scores:", rouge_scores)
    # print("BLEU Score:", bleu_score)
    # print("SacreBLEU Score:", sacrebleu_score)
    rouge_scores1.append(rouge_score1)
    rouge_scores2.append(rouge_score2)
    rouge_scoresl.append(rouge_scorel)
    bleu_scores.append(bleu_score)
    sacrebleu_scores.append(sacrebleu_score)
    return rouge_scores1,rouge_scores2,rouge_scoresl,bleu_scores,sacrebleu_scores
# Assuming you have a test dataset called test_dataset
test_dataset = dataset_dict["test"]
rouge_scores1,rouge_scores2,rouge_scoresl,bleu_scores,sacrebleu_scores=evaluate_on_test_set(model, tokenizer, test_dataset)


In [25]:
print("on test set: ")
print("RoUGE Score 1: ",np.mean(rouge_scores1))
print("RoUGE Score 2: ",np.mean(rouge_scores2))
print("RoUGE Score L: ",np.mean(rouge_scoresl))
print("BlEU Score: ",np.mean(bleu_scores))
print("SACREBLEU Score: ",np.mean(sacrebleu_scores))

on test set: 
RoUGE Score 1:  0.418
RoUGE Score 2:  0.1677
RoUGE Score L:  0.3948
BlEU Score:  0.4561
SACREBLEU Score:  11.231


In [26]:
test_translated = []
for sent in dataset_dict['test']:
  out=generate_translation(model, tokenizer, sent)
  test_translated.append(out)


In [27]:
import pandas as pd

# Load your CSV file into a DataFrame
df = pd.read_csv('test_combined.csv')


# Add the new array as a third column to the DataFrame
df['third_column'] = test_translated

# Write the DataFrame back to a new CSV file
df.to_csv('test_results_mbart.csv', index=False)
