In [14]:
!pip install datasets transformers==4.51.3 nltk evaluate tqdm bert_score wandb




In [15]:
from tqdm import tqdm
import sys
import os
import nltk
nltk.download('punkt')
from nltk.translate.bleu_score import corpus_bleu
sys.path.append(os.path.abspath(os.path.join(os.path.dirname("fine_tunning.ipynb"), "..")))
from datasets import load_dataset
import random
import numpy as np
# from transformers_models.marian.marianMT import


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [16]:
SEED_VALUE = 42

# Seed every random number generator the notebook touches so repeated runs match:
# `random` drives the text-noise helpers, NumPy is used by the metric code, and
# torch covers the model side (e.g. dropout during fine-tuning).
import torch  # imported here because the notebook's main torch import sits in a later cell

random.seed(SEED_VALUE)
np.random.seed(SEED_VALUE)
torch.manual_seed(SEED_VALUE)

In [17]:
from nltk.corpus import wordnet
nltk.download('wordnet')
nltk.download('omw-1.4')

# Returns a WordNet synonym for a word, or None when no different lemma is found

def get_synonym(word):
    """Return a WordNet synonym for ``word``, or ``None`` if none is available.

    Only the first synset returned by ``wordnet.synsets(word)`` is inspected.
    Its lemma names are scanned in order (underscores turned into spaces) and
    the first one that differs from ``word``, ignoring case, is returned.
    """
    synsets = wordnet.synsets(word)
    if not synsets:
        return None

    target = word.lower()
    candidates = (lemma.name().replace("_", " ") for lemma in synsets[0].lemmas())
    # next() with a default gives None when every lemma equals the word itself.
    return next((name for name in candidates if name.lower() != target), None)


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [18]:
# Maps each letter to the keys directly left and right of it in the same QWERTY row
def build_qwerty_neighbors():
    """Build a lookup of horizontally adjacent keys on a QWERTY keyboard.

    Returns:
        dict mapping each lowercase letter to a string holding the key to its
        left and/or right within the same keyboard row (left first). Keys at a
        row edge have a single neighbour.
    """
    keyboard_rows = ("qwertyuiop", "asdfghjkl", "zxcvbnm")
    adjacency = {}

    for keys in keyboard_rows:
        last = len(keys) - 1
        for pos in range(len(keys)):
            left = keys[pos - 1] if pos > 0 else ""
            right = keys[pos + 1] if pos < last else ""
            adjacency[keys[pos]] = left + right

    return adjacency

# Built once at cell execution and shared by the typo helper.
QWERTY_NEIGHBORS = build_qwerty_neighbors()


In [19]:
#INTRODUCE TYPOS
def typo_char(c):
    """Return a keyboard-adjacent replacement for ``c``, imitating a mistyped key.

    A letter present in ``QWERTY_NEIGHBORS`` (looked up case-insensitively) is
    replaced by one of its neighbours chosen at random. The case of the input
    is carried over, so a mistyped capital stays a capital instead of silently
    turning lowercase. Any other character is returned unchanged.
    """
    neighbours = QWERTY_NEIGHBORS.get(c.lower())
    if not neighbours:
        # Not a mapped letter (digit, punctuation, umlaut, ...): nothing to mistype.
        return c

    replacement = random.choice(neighbours)
    return replacement.upper() if c.isupper() else replacement

In [20]:
# Each word is hit with probability noise_prob (20% by default) by one randomly chosen noise type
def _synonym_for_token(word):
    """Swap ``word`` for a synonym, keeping punctuation attached to either end."""
    edge_punct = '.,?!'
    core = word.strip(edge_punct)
    if not core:
        return word
    synonym = get_synonym(core.lower())
    if not synonym:
        return word
    # Re-attach whatever strip() removed so "dog." does not lose its full stop.
    lead = word[:len(word) - len(word.lstrip(edge_punct))]
    trail = word[len(word.rstrip(edge_punct)):]
    return lead + synonym + trail


def _corrupt_chars(word, char_prob=0.2):
    """Apply typo / duplicate / delete / replace noise to individual characters."""
    pieces = []
    for char in word:
        if random.random() >= char_prob:
            pieces.append(char)
            continue
        char_noise_type = random.choice(["typo", "duplicate", "delete", "replace"])
        if char_noise_type == "typo":
            pieces.append(typo_char(char))
        elif char_noise_type == "duplicate":
            pieces.append(char * 2)
        elif char_noise_type == "replace":
            pieces.append(random.choice("abcdefghijklmnopqrstuvwxyz"))
        # "delete": the character is simply not emitted.
    return "".join(pieces)


def add_noise(text, noise_prob=0.2):
    """Return ``text`` with random word- and character-level noise injected.

    The text is split on whitespace. Each word is, with probability
    ``noise_prob``, hit by one noise type chosen uniformly from: delete the
    word, duplicate it, swap it with the following word, replace it with a
    WordNet synonym, or corrupt individual characters.

    Args:
        text: input sentence.
        noise_prob: per-word probability of applying noise.

    Returns:
        The noisy sentence, words joined by single spaces.
    """
    words = text.split()
    noisy_words = []

    # Walk by position rather than by value: a swap has to consume the next
    # word, and repeated words must not be confused with their first occurrence.
    pos = 0
    while pos < len(words):
        word = words[pos]
        pos += 1

        if random.random() >= noise_prob:
            noisy_words.append(word)
            continue

        noise_type = random.choice(["delete_word", "duplicate_word", "shuffle", "synonym", "char_noise"])

        if noise_type == "delete_word":
            continue

        elif noise_type == "duplicate_word":
            noisy_words.extend([word, word])

        elif noise_type == "shuffle":
            if pos < len(words):
                # Emit the pair in swapped order and skip the neighbour so it
                # is not emitted a second time on the next iteration.
                noisy_words.extend([words[pos], word])
                pos += 1
            else:
                # Last word: there is nothing to swap with, keep it as is.
                noisy_words.append(word)

        elif noise_type == "synonym":
            noisy_words.append(_synonym_for_token(word))

        elif noise_type == "char_noise":
            noisy_word = _corrupt_chars(word)
            # Every character may have been deleted; an empty token would only
            # add a stray double space to the joined sentence.
            if noisy_word:
                noisy_words.append(noisy_word)

    return " ".join(noisy_words)

In [21]:
def retrieve_data(max_length=30, add_text_noise=False, noise_level=0.2,
                  subset_size=200000, train_size=50000, val_size=3000, test_size=3000):
    """Load WMT14 de-en and return short-sentence train/validation/test splits.

    The first ``subset_size`` training pairs are taken, pairs where either side
    has more than ``max_length`` whitespace-separated words are dropped, and
    the remaining rows are cut into consecutive train / validation / test
    ranges.

    Args:
        max_length: maximum number of words allowed on each side of a pair.
        add_text_noise: when True, run ``add_noise`` over both languages.
        noise_level: per-word noise probability forwarded to ``add_noise``.
        subset_size: number of raw training pairs to consider.
        train_size, val_size, test_size: number of rows in each split.

    Returns:
        dict with keys "train", "validation" and "test".

    Raises:
        ValueError: if fewer filtered rows remain than the splits require.

    NOTE(review): noise is applied to the German side and to the
    validation/test rows as well, so references are noisy too — confirm this
    is intended.
    """
    dataset = load_dataset("wmt14", "de-en")

    train_split = dataset["train"]
    raw_subset = train_split.select(range(min(subset_size, len(train_split))))

    def is_short(example):
        pair = example["translation"]
        return len(pair["de"].split()) <= max_length and len(pair["en"].split()) <= max_length

    filtered = raw_subset.filter(is_short)

    needed = train_size + val_size + test_size
    if len(filtered) < needed:
        raise ValueError(
            f"Only {len(filtered)} sentence pairs remain after filtering, "
            f"but {needed} are required for the requested splits."
        )

    # Keep only the rows that end up in a split before doing any further work,
    # so noise is not computed for rows that are thrown away afterwards.
    selected = filtered.select(range(needed))

    if add_text_noise:
        def apply_noise(example):
            pair = example["translation"]
            # German first, then English: same order of random draws as before.
            return {
                "translation": {
                    "de": add_noise(pair["de"], noise_level),
                    "en": add_noise(pair["en"], noise_level),
                }
            }

        selected = selected.map(apply_noise)

    val_end = train_size + val_size
    return {
        "train": selected.select(range(train_size)),
        "validation": selected.select(range(train_size, val_end)),
        "test": selected.select(range(val_end, needed))
    }

In [22]:
def translated(n, model):
    """Translate ``n`` by delegating to ``model.translate_text`` and return its result."""
    translate = model.translate_text
    return translate(n)

In [24]:
data = retrieve_data()


ValueError: Invalid pattern: '**' can only be an entire path component

In [None]:
print(data["train"].shape)
print(data["test"].shape)
print(data["validation"].shape)

In [None]:
data["validation"]["translation"][:10]

In [None]:
data["test"]["translation"][:10]

In [None]:
data["train"]["translation"][:10]

In [None]:
from transformers import MarianMTModel, MarianTokenizer
from torch.utils.data import Dataset, DataLoader
import torch

model_name = "Helsinki-NLP/opus-mt-en-de"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

In [None]:
# Split the training pairs into two parallel lists, one per language.
english, german = [], []
for n in data["train"]["translation"]:
    english.append(n["en"])
    german.append(n["de"])

In [None]:
data

In [None]:
def preprocess(batch):
    """Tokenise a batch of translation pairs for seq2seq fine-tuning.

    English is the source and German the target. Both sides are truncated and
    padded to 40 tokens. Padding positions in the labels are replaced by -100.

    Args:
        batch: a batched dataset slice whose "translation" column holds dicts
            with "en" and "de" entries.

    Returns:
        The tokenizer output for the source texts with a "labels" entry added.
    """
    pairs = batch["translation"]
    src_texts = [ex["en"] for ex in pairs]
    tgt_texts = [ex["de"] for ex in pairs]

    # `text_target` tokenises the labels in the same call and stores them under
    # "labels"; it replaces the deprecated `as_target_tokenizer()` context manager.
    model_inputs = tokenizer(
        src_texts,
        text_target=tgt_texts,
        truncation=True,
        padding="max_length",
        max_length=40
    )

    pad_id = tokenizer.pad_token_id
    # -100 is the value substituted for label padding (compute_metrics maps it back).
    model_inputs["labels"] = [
        [(token if token != pad_id else -100) for token in seq]
        for seq in model_inputs["labels"]
    ]
    return model_inputs


In [None]:
# Tokenise the two splits used during training; the test split is translated from raw text later.
tokenized_data = {
    split: data[split].map(preprocess, batched=True)
    for split in ("train", "validation")
}


In [None]:
from tqdm import tqdm
import evaluate

# Baseline run: translate the test split with the model as loaded, before any fine-tuning.
bleu = evaluate.load("bleu")
test_subset = data["test"].select(range(3000))

src_texts, references = [], []
for ex in test_subset:
    pair = ex["translation"]
    src_texts.append(pair["en"])
    references.append([pair["de"]])  # one single-element reference list per sentence

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
batch_size = 16
predictions = []

for start in tqdm(range(0, len(src_texts), batch_size), desc="Translating"):
    batch = src_texts[start:start + batch_size]
    encoded = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=60)
    inputs = {name: tensor.to(model.device) for name, tensor in encoded.items()}

    # Inference only: no gradients needed.
    with torch.no_grad():
        outputs = model.generate(**inputs, num_beams=4, max_length=60, early_stopping=True)
    predictions.extend(
        tokenizer.batch_decode(outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False)
    )


In [None]:
references[:10]

In [None]:
predictions[:10]


In [None]:
meteor = evaluate.load("meteor")
bertscore = evaluate.load("bertscore")


In [None]:
bleu_score = bleu.compute(predictions=predictions, references=references)
meteor_score = meteor.compute(predictions=predictions, references=[r[0] for r in references])
bert_score = bertscore.compute(predictions=predictions, references=[r[0] for r in references], lang="de")


In [None]:
bleu_score_BEFORE = bleu_score
meteor_score_BEFORE = meteor_score
bert_score_BEFORE = bert_score

In [None]:


# Report the scores of the model before fine-tuning.
print(f"BLEU before fine-tunning score: {bleu_score['bleu']:.4f}")

print("METEOR:", meteor_score["meteor"])
print("BERTScore:")
# BERTScore yields one value per sentence; reduce each component to its mean.
bert_precision, bert_recall, bert_f1 = (
    sum(bert_score[part]) / len(bert_score[part]) for part in ("precision", "recall", "f1")
)
print(f"  Precision: {bert_precision:.4f}")
print(f"  Recall:    {bert_recall:.4f}")
print(f"  F1:        {bert_f1:.4f}")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Bar chart of the baseline metrics.
# The frame gets its own name: binding it to `data` would overwrite the dataset
# dict returned by retrieve_data() and break any cell that is (re-)run afterwards.
metrics_df = pd.DataFrame({
    'Metric': ['BLEU', 'METEOR', 'Precision(BERT)', 'Recall(BERT)', 'F1(BERT)'],
    'Score': [bleu_score['bleu'], meteor_score["meteor"], bert_precision, bert_recall, bert_f1]
})

sns.set(style="white", context="talk")
palette = sns.color_palette("viridis", len(metrics_df))

plt.figure(figsize=(10, 6))
ax = sns.barplot(x='Metric', y='Score', data=metrics_df, palette=palette)

# Print each value just above its bar.
for i, row in metrics_df.iterrows():
    ax.text(i, row['Score'] + 0.025, f"{row['Score']:.4f}",
            ha='center', va='bottom',  fontsize=12)

plt.title("Evaluation Metrics Before Fine-Tuning", fontsize=18, pad=20)
plt.ylim(0, 1.1)
plt.ylabel("Score", fontsize=14)
plt.xlabel("")
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
sns.despine()
ax.yaxis.grid(True, linestyle='--', alpha=0.7)
ax.set_axisbelow(True)
plt.tight_layout()
plt.show()


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

In [None]:
from evaluate import load

bleu_metric = load("bleu")

def compute_metrics(eval_preds):
    """Score generated translations against the references during evaluation.

    Args:
        eval_preds: pair ``(predictions, labels)`` of token-id arrays handed
            over by the trainer.

    Returns:
        dict with corpus BLEU, METEOR and the mean BERTScore precision,
        recall and F1 (BERTScore is computed with ``lang="de"``).
    """
    preds, labels = eval_preds
    # Predictions can arrive as a tuple whose first element holds the token ids.
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # -100 is not a vocabulary id, so it is mapped back to the pad token before
    # decoding. Labels always contain it (see preprocess); predictions may contain
    # it when the trainer pads gathered batches — NOTE(review): version dependent.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    nested_references = [[lbl] for lbl in decoded_labels]

    # Local names end in *_result so they do not shadow the notebook-level
    # `bleu`, `meteor_score` and `bert_score` variables.
    bleu_result = bleu_metric.compute(predictions=decoded_preds, references=nested_references)
    meteor_result = meteor.compute(predictions=decoded_preds, references=nested_references)
    bert_result = bertscore.compute(predictions=decoded_preds, references=decoded_labels, lang="de")

    def _mean(scores):
        return sum(scores) / len(scores)

    return {
        "bleu": bleu_result["bleu"],
        "meteor": meteor_result["meteor"],
        "bertscore_precision": _mean(bert_result['precision']),
        "bertscore_recall": _mean(bert_result['recall']),
        "bertscore_f1": _mean(bert_result['f1'])
    }


In [None]:
!pip install --upgrade transformers


In [None]:
import transformers
print(transformers.__version__)

In [None]:
import transformers
print(transformers.Seq2SeqTrainingArguments.__init__.__code__.co_varnames)



In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"


In [None]:
from transformers import Seq2SeqTrainingArguments

# Fine-tuning configuration, gathered in one mapping and expanded into the arguments object.
training_config = {
    "output_dir": "./marianmt-finetuned",
    "per_device_train_batch_size": 4,
    "per_device_eval_batch_size": 4,
    "learning_rate": 5e-5,
    "num_train_epochs": 7,
    # Evaluate and checkpoint once per epoch; at most two checkpoints are kept.
    "eval_strategy": "epoch",
    "save_strategy": "epoch",
    "save_total_limit": 2,
    # Generate full translations at evaluation time so compute_metrics can decode them.
    "predict_with_generate": True,
    "logging_dir": "./logs",
    "logging_steps": 500,
    # Mixed precision only when a GPU is present.
    "fp16": torch.cuda.is_available(),
    "load_best_model_at_end": True,
    # "metric_for_best_model": "bleu",
    # "greater_is_better": True,
}

training_args = Seq2SeqTrainingArguments(**training_config)


In [None]:
from transformers import Seq2SeqTrainer, DataCollatorForSeq2Seq
from transformers import EarlyStoppingCallback

# Wire model, data, collator and metric function together.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["validation"],
    # `processing_class` is the current name of the deprecated `tokenizer` argument.
    processing_class=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
    compute_metrics=compute_metrics,
    #callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],

)

trainer.train()


In [None]:
import matplotlib.pyplot as plt
import math

# Keep only the log records that carry an "epoch" field so each metric can be plotted against it.
log_history   = trainer.state.log_history
epoch_entries = [record for record in log_history if "epoch" in record]

# Log key -> panel title. Insertion order determines the panel order.
names = {
    "loss": "Training Loss",
    "learning_rate": "Learning Rate",
    "eval_loss": "Validation Loss",
    "eval_bleu": "BLEU",
    "eval_meteor": "METEOR",
    "eval_bertscore_precision": "BERTScore Precision",
    "eval_bertscore_recall": "BERTScore Recall",
    "eval_bertscore_f1": "BERTScore F1"
}
metrics = list(names)

n = len(metrics)
cols = 4
rows = math.ceil(n / cols)
fig, axes = plt.subplots(rows, cols, figsize=(4*cols, 3*rows))
axes = axes.flatten()

cmap = plt.get_cmap('tab10')

for i, (ax, metric) in enumerate(zip(axes, metrics)):
    points = [(e["epoch"], e[metric]) for e in epoch_entries if metric in e]
    if len(points) == 0:
        # Metric never logged: hide its panel.
        ax.set_visible(False)
        continue
    epochs, values = zip(*points)
    title = names.get(metric, metric)

    ax.plot(epochs, values, marker='o', linestyle='-', color=cmap(i % 10), label=title)
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.set_xlabel("Epoch")
    ax.set_ylabel(title)
    ax.grid(linestyle='--', alpha=0.5)

# Hide any grid cells beyond the last metric.
for ax in axes[n:]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()


In [None]:
# Rebuild the dataset dict (the name `data` may have been rebound by a plotting cell)
# and take the model and tokenizer from the trainer for the post-training evaluation.
data = retrieve_data()
model = trainer.model
# `processing_class` is the current name of the deprecated `trainer.tokenizer` attribute.
tokenizer = trainer.processing_class

In [None]:
# Translate the test split again, now with the fine-tuned model, using exactly
# the same generation and decoding settings as the baseline run.
bleu = evaluate.load("bleu")
test_subset = data["test"].select(range(3000))

src_texts = [ex["translation"]["en"] for ex in test_subset]
references = [[ex["translation"]["de"]] for ex in test_subset]

batch_size = 16
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Make sure dropout is off: the model has just been through training.
model.eval()
predictions = []

for i in tqdm(range(0, len(src_texts), batch_size), desc="Translating"):
    batch = src_texts[i:i + batch_size]
    inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=60)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=60, num_beams=4, early_stopping=True)
    # clean_up_tokenization_spaces=False matches the baseline decoding; without it the
    # before/after scores would be computed on differently post-processed text.
    preds = tokenizer.batch_decode(outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False)
    predictions.extend(preds)

In [None]:
references[:10]

In [None]:
predictions[:10]

In [None]:
bleu_score = bleu.compute(predictions=predictions, references=references)
meteor_score = meteor.compute(predictions=predictions, references=[r[0] for r in references])
bert_score = bertscore.compute(predictions=predictions, references=[r[0] for r in references], lang="de")

In [None]:
bleu_score_AFTER = bleu_score
meteor_score_AFTER = meteor_score
bert_score_AFTER = bert_score

In [None]:
# BERTScore yields one value per sentence; reduce each component to its mean.
bert_precision, bert_recall, bert_f1 = (
    sum(bert_score[part]) / len(bert_score[part]) for part in ("precision", "recall", "f1")
)

# Report the scores of the fine-tuned model.
print(f"BLEU:   {bleu_score['bleu']:.4f}")
print(f"METEOR: {meteor_score['meteor']:.4f}")
print("BERTScore:")
print(f"  Precision: {bert_precision:.4f}")
print(f"  Recall:    {bert_recall:.4f}")
print(f"  F1:        {bert_f1:.4f}")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Bar chart of the metrics after fine-tuning.
# The frame gets its own name so the dataset dict bound to `data` is not overwritten.
metrics_df = pd.DataFrame({
    'Metric': ['BLEU', 'METEOR', 'Precision(BERT)', 'Recall(BERT)', 'F1(BERT)'],
    'Score': [bleu_score['bleu'], meteor_score["meteor"], bert_precision, bert_recall, bert_f1]
})

sns.set(style="white", context="talk")
palette = sns.color_palette("viridis", len(metrics_df))

plt.figure(figsize=(10, 6))
ax = sns.barplot(x='Metric', y='Score', data=metrics_df, palette=palette)

# Print each value just above its bar.
for i, row in metrics_df.iterrows():
    ax.text(i, row['Score'] + 0.025, f"{row['Score']:.4f}",
            ha='center', va='bottom',  fontsize=12)

plt.title("Evaluation Metrics After Fine-Tuning", fontsize=18, pad=20)
plt.ylim(0, 1.1)
plt.ylabel("Score", fontsize=14)
plt.xlabel("")
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
sns.despine()
ax.yaxis.grid(True, linestyle='--', alpha=0.7)
ax.set_axisbelow(True)
plt.tight_layout()
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Collapse the stored results into one number per metric, in panel order.
bert_parts = ("precision", "recall", "f1")
before = [bleu_score_BEFORE["bleu"], meteor_score_BEFORE["meteor"]]
before += [np.mean(bert_score_BEFORE[part]) for part in bert_parts]
after = [bleu_score_AFTER["bleu"], meteor_score_AFTER["meteor"]]
after += [np.mean(bert_score_AFTER[part]) for part in bert_parts]

metrics = [
    "BLEU",
    "METEOR",
    "BERTScore Precision",
    "BERTScore Recall",
    "BERTScore F1"
]

rows, cols = 2, 3
fig, axes = plt.subplots(rows, cols, figsize=(14, 8), constrained_layout=True)
axes = axes.flatten()

# One panel per metric with a "Before" and an "After" bar.
for ax, title, pair in zip(axes, metrics, zip(before, after)):
    vals = list(pair)
    ax.bar(["Before", "After"], vals, color=["orange", "green"], width=0.7)
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.set_ylabel("Score")
    ax.grid(axis='y', linestyle='--', alpha=0.5)

    # Leave headroom above the taller bar for the value labels.
    top = max(vals) * 1.25
    ax.set_ylim(0, top)

    for i, v in enumerate(vals):
        ax.text(i, v + top * 0.02, f"{v:.3f}", ha='center', va='bottom', fontsize=10)

# The grid has more cells than metrics; blank out the unused ones.
for ax in axes[len(metrics):]:
    ax.axis("off")



plt.show()


In [None]:
# model.save_pretrained("marianmt-finetuned")
# tokenizer.save_pretrained("marianmt-finetuned")
