### Implementation 1

In [30]:
#pip install datasets
#pip install transformers
#pip install sacrebleu==1.5.1 datasets portalocker==2.0.0 xxhash==2.0.2
#pip install sentencepiece

In [None]:
from datasets import load_dataset
from transformers import BertTokenizer
from transformers import BertForSequenceClassification, TrainingArguments, Trainer
from datasets import load_metric
import numpy as np
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

# sacreBLEU metric handle; the live compute_metrics in this notebook does not
# call it, but the sacreBLEU-based evaluation variant does.
metric = load_metric("sacrebleu")

# Small slices of WMT16 de-en so the whole notebook runs quickly end to end.
train_dataset = load_dataset('wmt16', 'de-en', split='train[:100]')
eval_dataset = load_dataset('wmt16', 'de-en', split='test[:10]')

max_input_length = 1000
max_target_length = 1000
# Keys of each example's "translation" dict in the WMT16 de-en data.
source_lang = "de"
target_lang = "en"

# The tokenizer and the model MUST come from the same checkpoint: token ids
# produced by a different vocabulary (e.g. a BERT tokenizer) do not line up
# with the mBART embedding table, so the pretrained weights would be fed
# meaningless ids.
model_checkpoint = 'facebook/mbart-large-cc25'

# mBART identifies languages with locale-style codes rather than the plain
# two-letter dataset keys.
mbart_lang_codes = {"de": "de_DE", "en": "en_XX"}

# Load the mBART tokenizer, configured for German -> English
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
    src_lang=mbart_lang_codes[source_lang],
    tgt_lang=mbart_lang_codes[target_lang],
)

# Load the mBART encoder-decoder model for machine translation.
# AutoModelForSeq2SeqLM is already imported above and resolves to the mBART
# conditional-generation class for this checkpoint, so no extra import is needed.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# Generation has to start from the target-language code; set it when the
# checkpoint config leaves it unset (mirrors the Hugging Face translation
# example -- TODO confirm against the installed transformers version).
if model.config.decoder_start_token_id is None:
    model.config.decoder_start_token_id = tokenizer.convert_tokens_to_ids(
        mbart_lang_codes[target_lang]
    )

def preprocess_function(examples):
    """Tokenize one batch of translation pairs for seq2seq training.

    Args:
        examples: A batch whose "translation" entry is a sequence of mappings
            indexed by language code; the ``source_lang`` and ``target_lang``
            entries of each mapping are read.

    Returns:
        The tokenizer output for the source sentences, with an added
        "labels" entry holding the "input_ids" of the tokenized targets.
    """
    pairs = examples["translation"]
    source_texts = [pair[source_lang] for pair in pairs]
    target_texts = [pair[target_lang] for pair in pairs]

    encoded = tokenizer(source_texts, max_length=max_input_length, truncation=True)

    # Targets are tokenized inside the tokenizer's target-side context.
    with tokenizer.as_target_tokenizer():
        encoded_targets = tokenizer(
            target_texts, max_length=max_target_length, truncation=True
        )

    encoded["labels"] = encoded_targets["input_ids"]
    return encoded

# Apply the same preprocessing to both splits; batched=True hands
# preprocess_function a batch of examples per call instead of a single one.
tokenized_datasets = train_dataset.map(function=preprocess_function, batched=True)
encoded_val_dataset = eval_dataset.map(function=preprocess_function, batched=True)


# Per-device batch size shared by training and evaluation.
batch_size = 32

# Run configuration for the Seq2SeqTrainer. Outputs are written under
# "test-translation" and logs under ./logs; evaluation runs once per epoch.
# Publishing to the Hub (push_to_hub / push_to_hub_model_id) is left disabled.
training_args = Seq2SeqTrainingArguments(
    output_dir="test-translation",
    logging_dir='./logs',
    # Schedule
    num_train_epochs=1,
    evaluation_strategy="epoch",
    save_total_limit=3,
    # Optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    fp16=False,
    # Batching
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluate on generated sequences so text metrics can be computed.
    predict_with_generate=True,
)


# Collator that batches the tokenized examples; it is given the model as well
# as the tokenizer.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
import nltk

import nltk
from rouge_score import rouge_scorer

def postprocess_text(preds, labels):
    """Normalise decoded predictions and references for metric computation.

    Args:
        preds: Iterable of decoded prediction strings.
        labels: Iterable of decoded reference strings.

    Returns:
        A ``(preds, labels)`` tuple where every prediction has surrounding
        whitespace stripped and every reference is stripped and wrapped in a
        one-element list (one reference per prediction).
    """
    cleaned_preds = []
    for prediction in preds:
        cleaned_preds.append(prediction.strip())

    wrapped_labels = []
    for reference in labels:
        wrapped_labels.append([reference.strip()])

    return cleaned_preds, wrapped_labels

# def postprocess_text(preds, labels):
#     preds = [" ".join(pred.strip().split()) for pred in preds]  # Join list of strings
#     labels = [[" ".join(label.strip().split())] for label in labels]  # Join list of strings
#     return preds, labels

def compute_metrics(eval_preds):
    """Compute word-level BLEU-1 / BLEU-2 and mean generated length.

    Args:
        eval_preds: A ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may itself be a tuple, in which case its first
            element is used.

    Returns:
        dict with:
            "bleu1": corpus BLEU with unigram weights, rounded to 4 places.
            "bleu2": corpus BLEU with uniform uni/bigram weights, rounded
                to 4 places.
            "gen_len": mean number of non-padding tokens per prediction.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id

    # -100 marks positions to ignore and cannot be decoded. It is replaced by
    # the pad id in the labels and, defensively, in the predictions too, so
    # that it is neither decoded nor counted in gen_len.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    # corpus_bleu works on token sequences. Passing raw strings makes it
    # iterate over characters and yields a character-level score, so split
    # into whitespace tokens first.
    hypotheses = [pred.split() for pred in decoded_preds]
    references = [[ref.split() for ref in refs] for refs in decoded_labels]

    # Compute BLEU-1 and BLEU-2 scores
    corpus_bleu = nltk.translate.bleu_score.corpus_bleu
    bleu1 = corpus_bleu(references, hypotheses, weights=(1, 0))
    bleu2 = corpus_bleu(references, hypotheses, weights=(0.5, 0.5))

    result = {"bleu1": round(bleu1, 4), "bleu2": round(bleu2, 4)}

    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)

    return result

# Define the Seq2SeqTrainer.
# Evaluation uses the held-out split (encoded_val_dataset); evaluating on the
# training split would report metrics on data the model has already seen.
trainer = Seq2SeqTrainer(
    model,
    training_args,
    train_dataset=tokenized_datasets,
    eval_dataset=encoded_val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)
trainer.train()

In [2]:
# import matplotlib.pyplot as plt

# # Get the training and validation loss and steps from the trainer object
# train_loss = trainer.history["train_loss"]
# val_loss = trainer.history["eval_loss"]
# steps = list(range(len(train_loss)))

# # Plot the training and validation loss over the steps
# plt.plot(steps, train_loss, label="Training loss")
# plt.plot(steps, val_loss, label="Validation loss")

# plt.title("Training and validation loss")
# plt.xlabel("Steps")
# plt.ylabel("Loss")
# plt.legend()

# plt.show()

In [None]:
# def postprocess_text(preds, labels):
#     preds = [pred.strip() for pred in preds]
#     labels = [[label.strip()] for label in labels]
#     return preds, labels
# def compute_metrics(eval_preds):
#     preds, labels = eval_preds
#     if isinstance(preds, tuple):
#         preds = preds[0]
#     print(preds,preds.shape,22222222222)
#     decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
#     print(decoded_preds,len(decoded_preds),111111)
#     # Replace -100 in the labels as we can't decode them.
#     labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
#     decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
#     print(decoded_labels,len(decoded_labels),111111)
#     # Some simple post-processing
#     decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
#     print(decoded_preds,len(decoded_preds),222222222222)
#     result = metric.compute(predictions=decoded_preds, references=decoded_labels)
#     result = {"bleu": result["score"]}

#     prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
#     result["gen_len"] = np.mean(prediction_lens)
#     result = {k: round(v, 4) for k, v in result.items()}
#     return result

# trainer = Seq2SeqTrainer(
#     model,
#     training_args,
#     train_dataset=tokenized_datasets,
#     eval_dataset=tokenized_datasets,
#     data_collator=data_collator,
#     tokenizer=tokenizer,
#     compute_metrics=compute_metrics
# )

# trainer.train()