In [None]:
from tqdm import tqdm
import sys
import os
# `os.path.dirname` of a bare filename is "", so the joined path is just "..":
# this appends the parent of the current working directory to the import search
# path (presumably so the `marian` package imported below can be found — confirm
# the notebook is always launched from its own folder).
sys.path.append(os.path.abspath(os.path.join(os.path.dirname("marianMT_fine_tunning.ipynb"), "..")))
from datasets import Dataset

from marian.marianMT import MarianMt

In [None]:
# Checkpoint identifier handed to the project's MarianMt wrapper.
PRETRAINED_CHECKPOINT = "Helsinki-NLP/opus-mt-en-de"
model = MarianMt(PRETRAINED_CHECKPOINT)

In [None]:
# Read the parallel corpus: line i of train.en is paired with line i of train.de.
translation_pairs_en2de = {"en": [], "de": []}
with open("../../data/train.en", encoding="utf-8") as f_en, \
     open("../../data/train.de", encoding="utf-8") as f_de:

    # NOTE(review): zip stops at the shorter file, so a length mismatch between
    # the two files is silently truncated — verify the files are aligned.
    for en, de in zip(f_en, f_de):
        # Strip only the trailing newline. Slicing with [:-1] would also cut a
        # real character off the last line when the file has no final newline.
        translation_pairs_en2de["en"].append(en.rstrip("\n"))
        translation_pairs_en2de["de"].append(de.rstrip("\n"))

In [None]:
# Preview only the first few sentences per language; displaying the whole
# dictionary would dump every loaded sentence into the notebook output.
{lang: sentences[:5] for lang, sentences in translation_pairs_en2de.items()}

{'en': ['Resumption of the session',
  'I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999 ,  and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period .',
  "Although ,  as you will have seen ,  the dreaded  ' millennium bug '  failed to materialise ,  still the people in a number of countries suffered a series of natural disasters that truly were dreadful .",
  'You have requested a debate on this subject in the course of the next few days ,  during this part - session .',
  "In the meantime ,  I should like to observe a minute '  s silence ,  as a number of Members have requested ,  on behalf of all the victims concerned ,  particularly those of the terrible storms ,  in the various countries of the European Union .",
  "Please rise ,  then ,  for this minute '  s silence .",
  "( The House rose and observed a minute '  s silence )",
  'Madam President ,  on a point of order .',
  'You w

In [None]:
# Turn the column-oriented dict into one {"en": ..., "de": ...} record per pair.
source_sentences = translation_pairs_en2de["en"]
target_sentences = translation_pairs_en2de["de"]
translation_pairs_en2de_input_output = [
    dict(en=english, de=german)
    for english, german in zip(source_sentences, target_sentences)
]


In [None]:
# Preview the first few records instead of displaying the entire list.
translation_pairs_en2de_input_output[:5]

[{'en': 'Resumption of the session',
  'de': 'Wiederaufnahme der Sitzungsperiode'},
 {'en': 'I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999 ,  and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period .',
  'de': 'Ich erkläre die am Freitag ,  dem 17 .  Dezember unterbrochene Sitzungsperiode des Europäischen Parlaments für wiederaufgenommen ,  wünsche Ihnen nochmals alles Gute zum Jahreswechsel und hoffe ,  daß Sie schöne Ferien hatten .'},
 {'en': "Although ,  as you will have seen ,  the dreaded  ' millennium bug '  failed to materialise ,  still the people in a number of countries suffered a series of natural disasters that truly were dreadful .",
  'de': 'Wie Sie feststellen konnten ,  ist der gefürchtete  " Millenium - Bug  "  nicht eingetreten .  Doch sind Bürger einiger unserer Mitgliedstaaten Opfer von schrecklichen Naturkatastrophen geworden .'},
 {'en': 'You have requested a d

In [None]:
# Build a `datasets.Dataset` from the list of {"en", "de"} records.
dataset = Dataset.from_list(translation_pairs_en2de_input_output)

In [None]:
# Show the dataset summary (feature names and number of rows).
dataset

Dataset({
    features: ['en', 'de'],
    num_rows: 4505307
})

In [None]:
# Hold out 10% of the pairs for validation.
# A fixed seed makes the split reproducible across kernel restarts, and binding
# the result to a new name (instead of overwriting `dataset`) keeps this cell
# re-runnable: the split result no longer replaces the original Dataset.
SPLIT_SEED = 42
dataset_splits = dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)
train_data = dataset_splits["train"]
val_data = dataset_splits["test"]


In [None]:
def preprocess(example):
    """Tokenize the English and German texts and attach the German ids as labels.

    Args:
        example: mapping with "en" and "de" entries. Because the function is
            applied through `Dataset.map(..., batched=True)` elsewhere in this
            notebook, each entry is presumably a list of strings — confirm.

    Returns:
        The object returned by `model.tokenize_str` for the English text, with
        an extra "labels" entry holding the "input_ids" of the German text.

    NOTE(review): `model.tokenize_str` is a project helper not visible here.
    If it pads, the padding ids end up in "labels" as-is; and if it always uses
    the source-side tokenizer, the German text may need target-side
    tokenization — verify both against the helper.
    """
    encoded_source = model.tokenize_str(example["en"])
    encoded_target = model.tokenize_str(example["de"])
    encoded_source["labels"] = encoded_target["input_ids"]
    return encoded_source

In [None]:
# Illustrative sketch only (not real data): the keys one tokenized record is
# expected to carry after `preprocess`. The `...` entries stand for elided
# values and evaluate to Python's `Ellipsis` when this cell is run.
# NOTE(review): the numbers are made up; real token ids and the padding id
# depend on the tokenizer.
{
    'input_ids':       [ 45, 678, 902, 0, 0, 0, ..., 0 ],  # encoded source text
    'attention_mask':  [ 1,   1,   1, 0, 0, 0, ..., 0 ],   # 1 = real token, 0 = padding
    'labels':          [ 72, 981, 1257, 0, 0, 0, ..., 0 ]  # encoded target text used as labels
}


{'input_ids': [45, 678, 902, 0, 0, 0, Ellipsis, 0],
 'attention_mask': [1, 1, 1, 0, 0, 0, Ellipsis, 0],
 'labels': [72, 981, 1257, 0, 0, 0, Ellipsis, 0]}

In [None]:
# Apply `preprocess` in batched mode to the training split, then the validation split.
tokenized_train, tokenized_val = (
    split.map(preprocess, batched=True) for split in (train_data, val_data)
)


Map:   0%|          | 0/4054776 [00:00<?, ? examples/s]

Map:   0%|          | 0/450531 [00:00<?, ? examples/s]

In [None]:
# Show the tokenized training split summary (feature names and number of rows).
tokenized_train

Dataset({
    features: ['en', 'de', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 4054776
})

In [None]:
from transformers import DataCollatorForSeq2Seq

# Pass the underlying model (the same object the trainer receives via
# `model.get_model()`), not the MarianMt wrapper: the collator's `model`
# argument is meant to be the model being trained.
data_collator = DataCollatorForSeq2Seq(model.get_tokenizer(), model=model.get_model())


In [None]:
import sacrebleu

def compute_metrics(eval_preds):
    """Compute corpus-level BLEU for a batch of generated predictions.

    Args:
        eval_preds: pair `(preds, labels)` of token-id arrays as handed over by
            the trainer. `preds` may itself be a tuple, in which case its first
            element is used.

    Returns:
        dict with a single "bleu" entry holding the sacreBLEU corpus score.
    """
    import numpy as np

    tokenizer = model.get_tokenizer()
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is the ignore index used to pad labels (and gathered predictions);
    # it is not a valid token id, so map it back to the pad token before
    # decoding. The pad token is then dropped by skip_special_tokens=True.
    # NOTE(review): assumes the tokenizer defines `pad_token_id` — confirm.
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # sacreBLEU takes a list of reference streams; there is one reference per hypothesis.
    bleu = sacrebleu.corpus_bleu(decoded_preds, [decoded_labels])
    return {"bleu": bleu.score}


In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Training configuration: evaluate and checkpoint once per epoch, and generate
# full translations during evaluation so BLEU can be computed.
# NOTE(review): newer transformers releases rename `evaluation_strategy` and the
# trainer's `tokenizer` argument — confirm against the installed version.
training_args = Seq2SeqTrainingArguments(
    output_dir="./marian-finetuned-en2de",
    logging_dir="./logs",
    num_train_epochs=3,
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    predict_with_generate=True,
)

# Only the first 10000 training rows and the first 1000 validation rows are used.
train_subset = tokenized_train.select(range(10000))
eval_subset = tokenized_val.select(range(1000))

trainer = Seq2SeqTrainer(
    model=model.get_model(),
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_subset,
    eval_dataset=eval_subset,
    tokenizer=model.get_tokenizer(),
    compute_metrics=compute_metrics,
)


  trainer = Seq2SeqTrainer(


In [None]:
# Run fine-tuning with the configuration above (evaluation and checkpointing
# are set to happen once per epoch).
# NOTE(review): the recorded output shows a KeyboardInterrupt, i.e. the saved
# run was stopped manually before it finished.
trainer.train()


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# Evaluate on the trainer's eval dataset and print the resulting metrics dict.
# NOTE(review): the recorded output is `NameError: name 'trainer' is not
# defined` — this cell was last run in a kernel where the trainer cell had not
# been executed; re-run the notebook top to bottom.
metrics = trainer.evaluate()
print(metrics)


NameError: name 'trainer' is not defined

: 

In [None]:
# Persist the fine-tuned model and its tokenizer side by side in one folder.
save_dir = "./marian-finetuned-en2de"
trainer.save_model(save_dir)
model.get_tokenizer().save_pretrained(save_dir)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# One row per logged event; training-loss rows and evaluation rows are separate
# entries, so each metric column is NaN on the rows of the other kind.
log_history = trainer.state.log_history
df = pd.DataFrame(log_history)

# Drop the NaN rows per series before plotting: matplotlib breaks a line at
# every NaN, so NaN-interleaved points would otherwise never be connected.
if "loss" in df.columns:
    train_log = df.dropna(subset=["loss"])
    plt.plot(train_log["step"], train_log["loss"], label="Train Loss")
if "eval_loss" in df.columns:
    eval_log = df.dropna(subset=["eval_loss"])
    # Markers keep the evaluation points visible even when there are very few.
    plt.plot(eval_log["step"], eval_log["eval_loss"], marker="o", label="Eval Loss")

plt.xlabel("Steps")
plt.ylabel("Loss")
plt.legend()
plt.title("Loss over time")
plt.show()


In [None]:

# The trainer logs evaluation metrics with an "eval_" prefix, so the BLEU value
# returned by `compute_metrics` is stored as "eval_bleu". Fall back to "bleu"
# for logs that already carry the unprefixed name, and expose the column as
# "bleu" so downstream cells can keep using that name.
bleu_column = "eval_bleu" if "eval_bleu" in df.columns else "bleu"
bleu_df = df[df[bleu_column].notnull()].rename(columns={bleu_column: "bleu"})
plt.figure(figsize=(8, 5))
plt.plot(bleu_df["step"], bleu_df["bleu"], marker='o', label="BLEU score")
plt.xlabel("Step")
plt.ylabel("BLEU score")
plt.title("BLEU over time")
plt.grid(True)
plt.legend()
plt.show()

In [None]:
# BLEU against epoch, with one x tick per evaluated epoch.
fig, ax = plt.subplots(figsize=(8, 5))
epochs = bleu_df["epoch"]
ax.plot(epochs, bleu_df["bleu"], marker="o", linestyle="-", color="teal", label="BLEU per epoch")
ax.set_xlabel("Epoch")
ax.set_ylabel("BLEU score")
ax.set_title("BLEU score per Epoch")
ax.grid(True)
ax.set_xticks(epochs)
ax.legend()
fig.tight_layout()
plt.show()


In [None]:
# Keep only rows that have epoch, eval loss and BLEU together. The trainer logs
# the BLEU value under "eval_bleu" (metric keys get an "eval_" prefix); it is
# exposed here as "bleu", with a fallback for logs that already use that name.
bleu_column = "eval_bleu" if "eval_bleu" in df.columns else "bleu"
metrics_df = (
    df[["epoch", "eval_loss", bleu_column]]
    .rename(columns={bleu_column: "bleu"})
    .dropna()
)


In [None]:
# Dual-axis figure: eval loss on the left y-axis, BLEU on the right y-axis,
# both plotted against the epoch.
fig, ax1 = plt.subplots(figsize=(8, 5))

# Left axis (red): evaluation loss.
ax1.plot(metrics_df["epoch"], metrics_df["eval_loss"], color="red", marker="o", label="Eval Loss")
ax1.set_xlabel("Epoch")
ax1.set_ylabel("Eval Loss", color="red")
ax1.tick_params(axis="y", labelcolor="red")

# Right axis (blue): a second y-axis sharing the same x-axis, used for BLEU.
ax2 = ax1.twinx()
ax2.plot(metrics_df["epoch"], metrics_df["bleu"], color="blue", marker="s", label="BLEU Score")
ax2.set_ylabel("BLEU Score", color="blue")
ax2.tick_params(axis="y", labelcolor="blue")

# No legend call is made, so the `label=` values above are not displayed; the
# coloured axis labels identify the two series instead.
# NOTE(review): plt.title / plt.grid act on pyplot's current axes — presumably
# ax2 after twinx(); confirm if the grid should follow the left axis instead.
plt.title("Eval Loss vs BLEU Score per Epoch")
fig.tight_layout()
plt.grid(True)
plt.show()
