# Environment & Dependencies  
Run the next cell **once** (commented by default) to install the dependencies.

In [1]:
# (⚠️ Uncomment the next line if you are in a fresh environment) : You might need to restart your kernel afterwards!
!pip install -q \
    transformers==4.41.0 \
    peft==0.10.0 \
    datasets==2.19.0 \
    evaluate==0.4.1 \
    accelerate==0.28.0 \
    hazm==0.9.1 \
    sacrebleu \
    jupyterlab \
    tqdm \
    openpyxl \
    numpy==1.26.4 \
    "fsspec==2023.12.2" \
    "jupyter-client<8.0" \
    "scipy>=1.12.0" \
    "thinc<8.3.6"


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.8/43.8 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m60.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.1/199.1 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m26.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [1]:
# 📦 Imports
import pandas as pd
import numpy as np
from tqdm import tqdm
from hazm import Normalizer
from datasets import Dataset, DatasetDict
from transformers import (AutoTokenizer, AutoModelForSeq2SeqLM,
                          DataCollatorForSeq2Seq, Seq2SeqTrainingArguments,
                          Seq2SeqTrainer)


# Data Loading & Normalization  
Point `FILE_PATH` to the Excel file of **ParsMap** dataset.
1. Keep only the *informal* and *formal* columns.  
2. Clean each sentence with `hazm.Normalizer`.  
3. Create `train`, `validation`, and `test` splits (90 / 5 / 5 %).  


In [2]:
FILE_PATH = "/content/ParsMap.xlsx"

# 1. Load & select columns
df = (
    pd.read_excel(FILE_PATH)
      .loc[:, ['inFormalForm', 'formalForm']]
      .rename(columns={'inFormalForm': 'input',
                       'formalForm':     'target'})
)

# 2a. Drop rows where either input or target is NaN
df = df.dropna(subset=['input', 'target'])  

# 2b. Normalize each sentence
normalizer = Normalizer()
df['input']  = df['input'].apply(lambda s: normalizer.normalize(s))
df['target'] = df['target'].apply(lambda s: normalizer.normalize(s))  #

# 3. Convert to HF Dataset, shuffle, and split
full_ds    = Dataset.from_pandas(df, preserve_index=False)
shuffled   = full_ds.shuffle(seed=42)

# 90% train, 10% tmp
train_split = shuffled.train_test_split(test_size=0.10, seed=42)
# Split the 10% tmp into 5% val, 5% test
val_test    = train_split['test'].train_test_split(test_size=0.50, seed=42)

dataset = DatasetDict({
    'train':      train_split['train'],
    'validation': val_test['train'],
    'test':       val_test['test']
})

dataset


DatasetDict({
    train: Dataset({
        features: ['input', 'target'],
        num_rows: 45011
    })
    validation: Dataset({
        features: ['input', 'target'],
        num_rows: 2501
    })
    test: Dataset({
        features: ['input', 'target'],
        num_rows: 2501
    })
})

# Token‑length Statistics  
Before padding/truncation, inspect sequence lengths to decide `max_length` for **inputs** and **targets**.  
Write a helper `length_stats()` that returns *min, max, mean, 95‑percentile*.  


In [4]:
from transformers import AutoTokenizer
import numpy as np

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained('google/mt5-base', use_fast=False)

def length_stats(texts):
    """
    Compute token‐length statistics over a list of strings.
    Returns a dict with min, max, mean, and 95th percentile.
    """
    # tokenize each string and record lengths
    lengths = [len(tokenizer.encode(t, add_special_tokens=True)) for t in texts]

    stats = {
        'min': int(np.min(lengths)),
        'max': int(np.max(lengths)),
        'mean': float(np.mean(lengths)),
        'p95': int(np.percentile(lengths, 95))
    }
    return stats

# Compute stats on train split
input_stats  = length_stats(dataset['train']['input'])
target_stats = length_stats(dataset['train']['target'])

print('Input stats :', input_stats)
print('Target stats:', target_stats)

# Decide sensible max lengths—e.g. use the 95th percentile so you cover most samples
MAX_SOURCE_LEN = input_stats['p95']
MAX_TARGET_LEN = target_stats['p95']

print(f"→ Setting MAX_SOURCE_LEN = {MAX_SOURCE_LEN}, MAX_TARGET_LEN = {MAX_TARGET_LEN}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/376 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/702 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Input stats : {'min': 3, 'max': 146, 'mean': 22.69538557241563, 'p95': 45}
Target stats: {'min': 4, 'max': 150, 'mean': 24.66330452556042, 'p95': 48}
→ Setting MAX_SOURCE_LEN = 45, MAX_TARGET_LEN = 48


## Tokenization function  
Complete `preprocess_function` so that it returns `input_ids`, `attention_mask`, and `labels` truncated/padded to the lengths chosen above.

In [5]:
from transformers import DataCollatorForSeq2Seq

# Assume MAX_SOURCE_LEN and MAX_TARGET_LEN already defined
# and tokenizer initialized as above.

def preprocess_function(batch):
    # Tokenize inputs
    model_inputs = tokenizer(
        batch['input'],
        max_length=MAX_SOURCE_LEN,
        padding='max_length',
        truncation=True,
        return_attention_mask=True
    )

    # Tokenize targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            batch['target'],
            max_length=MAX_TARGET_LEN,
            padding='max_length',
            truncation=True
        )['input_ids']

    # Replace padding token id's in labels with -100 so they are ignored by the loss
    labels = [
        [(label if label != tokenizer.pad_token_id else -100) for label in label_seq]
        for label_seq in labels
    ]

    model_inputs['labels'] = labels
    return model_inputs

# Apply to the HF Dataset
tokenised_ds = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset['train'].column_names
)

tokenised_ds  # Now contains input_ids, attention_mask, labels for train/val/test


Map:   0%|          | 0/45011 [00:00<?, ? examples/s]



Map:   0%|          | 0/2501 [00:00<?, ? examples/s]

Map:   0%|          | 0/2501 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 45011
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2501
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2501
    })
})

# Model & LoRA Configuration  
Instantiate *mT5‑base* and wrap it with **LoRA**.  
Read the LoRA paper and, based on its insights and your available GPU resources, experiment with the *rank r*, `lora_alpha`, and target modules.”


In [6]:
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForSeq2SeqLM

# LoRA hyperparameters (to be tuned)
# - r: low-rank dimension (typical 4–16)
# - lora_alpha: scaling factor (typical 16–32)
# - target_modules: which submodules to inject (usually all Query/Value projections)

lora_config = LoraConfig(
    r=8,                        # start in the middle of the recommended range [4,16]
    lora_alpha=32,              # default high scaling for stable gradients
    target_modules=['q', 'v'],  # inject into Query and Value projection layers
    lora_dropout=0.1,
    bias='none',
    task_type='SEQ_2_SEQ_LM'
)

# 1. Load base model and apply LoRA
base_model = AutoModelForSeq2SeqLM.from_pretrained('google/mt5-base')
# 2. Ensure pad and decoder_start are set
base_model.config.pad_token_id = tokenizer.pad_token_id
base_model.config.decoder_start_token_id = tokenizer.pad_token_id

# 3. Wrap with LoRA
model = get_peft_model(base_model, lora_config)

# Print out trainable vs. frozen parameters
model.print_trainable_parameters()


pytorch_model.bin:   0%|          | 0.00/2.33G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

trainable params: 884,736 || all params: 583,286,016 || trainable%: 0.15168133226770175


# Fine‑tuning  
Define `Seq2SeqTrainingArguments` and train for **3 epochs**  
Log training loss and evaluate on the validation set each epoch.  


In [7]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq

training_args = Seq2SeqTrainingArguments(
    output_dir="./mt5-lora-formalization",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    logging_strategy="steps",
    logging_steps=5,
    save_strategy="epoch",
    save_total_limit=2,
    learning_rate=5e-5,
    weight_decay=0.01,
    predict_with_generate=True,
    fp16=False,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    push_to_hub=False,
    report_to=["none"],
)

data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    padding='longest',
    label_pad_token_id=-100
)


trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenised_ds['train'],
    eval_dataset=tokenised_ds['validation'],
    data_collator=data_collator,
    tokenizer=tokenizer
)

# 🚀 Train
trainer.train()




Epoch,Training Loss,Validation Loss
1,1.6263,1.133689
2,1.6044,1.00197
3,1.3255,0.969353




TrainOutput(global_step=16881, training_loss=1.9242897641071952, metrics={'train_runtime': 4084.2533, 'train_samples_per_second': 33.062, 'train_steps_per_second': 4.133, 'total_flos': 1.426269759155712e+16, 'train_loss': 1.9242897641071952, 'epoch': 3.0})

# Inference  
Generate the *formal* version of **5 custom informal sentences** using **greedy decoding** *and* your `MAX_TARGET_LEN`.  


In [8]:
import torch

# 1. Informal examples
example_inputs = [
    "واسه چی اینقدر دیر اومدی؟",
    "دیروز کلاسو ول کردم رفتم تو خیابون چرخیدم.",
    "آدم باید همیشه سرش تو کار خودش باشه.",
    "این پروژه رو کی تحویل بدیم؟",
    "یه کم بیشتر تلاش کن تا نتیجه بگیری."
]

# 2. Prefix + tokenize
PREFIX = "formalize: "
inputs = tokenizer(
    [PREFIX + s for s in example_inputs],
    max_length=MAX_SOURCE_LEN,
    padding="longest",
    truncation=True,
    return_tensors="pt"
).to(model.device)

# 3. Generate with pad as decoder start
with torch.no_grad():
    outputs = model.generate(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_length=MAX_TARGET_LEN,
        num_beams=1,
        early_stopping=True,
        decoder_start_token_id=tokenizer.pad_token_id
    )

# 4. Decode
formalized = tokenizer.batch_decode(outputs, skip_special_tokens=True)
for inf, form in zip(example_inputs, formalized):
    print(f"- Informal: {inf}\n  Formal : {form}\n")




- Informal: واسه چی اینقدر دیر اومدی؟
  Formal : formalize: واسه چی اینقدر دیر آمدی؟

- Informal: دیروز کلاسو ول کردم رفتم تو خیابون چرخیدم.
  Formal : امروز کلاسو را ول کردم و رفتم در خیابان چرخیدم.

- Informal: آدم باید همیشه سرش تو کار خودش باشه.
  Formal : formalize: آدم باید همیشه سرش در کار خودش باشد.

- Informal: این پروژه رو کی تحویل بدیم؟
  Formal : formalize این پروژه را کی تحویل بدهیم؟

- Informal: یه کم بیشتر تلاش کن تا نتیجه بگیری.
  Formal : formalize: یک کم بیشتر تلاش کن تا نتیجه بگیری.



# Evaluation  
Compute **BLEU** on the *test* split and report **perplexity** on *validation*.  
Explain briefly what each metric captures for this task.  


In [17]:
import evaluate, sacrebleu
import math
from transformers import Seq2SeqTrainer

# 1. Generate predictions on the test split
test_results = trainer.predict(tokenised_ds['test'])
predictions = tokenizer.batch_decode(
    test_results.predictions,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=True
)

# References for sacreBLEU expect a list of reference‐lists
references = [[t] for t in dataset['test']['target']]

bleu = sacrebleu.corpus_bleu(predictions, references)
print("Test BLEU:", bleu.score)

# 3. Compute Perplexity as before
eval_metrics = trainer.evaluate(tokenised_ds['validation'])
eval_loss    = eval_metrics["eval_loss"]
perplexity   = math.exp(eval_loss)
print("Validation Perplexity:", perplexity)


Test BLEU: 48.892302243490086


Validation Perplexity: 2.63623957158902


BLEU and perplexity give us different angles on our Persian sentence formalization task. A BLEU score of around 48.9 shows how much our model's outputs match up with the human "formal" references, which means there's a decent level of similarity (but it might not catch all the good paraphrases). A perplexity score of about 2.64 on the validation set shows how unsure the model is when it comes to predicting the target sentences; lower numbers mean the model thinks those formal sentences are easier to predict based on what it has learned. Together, these metrics help us balance between getting the words right (BLEU) and making sure the model is confident in coming up with smooth, formal-sounding text (perplexity).


# Stochastic Decoding & Diversity Analysis  

Read *Holtzman et al. 2020* — *The Curious Case of Neural Text Degeneration* — to understand how different **stochastic decoding** strategies (like temperature, top‑k, and top‑p sampling) can lead to generating multiple diverse outputs from the same input prompt.

Implement these decoding strategies and experiment with several input examples to observe how the outputs vary.

In [18]:
def sample_outputs(prompt: str,
                   num_return_sequences: int = 5,
                   max_length: int = 128,
                   temperature: float = 1.0,
                   top_k: int = 0,
                   top_p: float = 1.0):
    """
    Generate multiple outputs from the fine‐tuned model using stochastic decoding.
    - temperature: float > 0; higher values increase randomness.
    - top_k: int ≥ 0; 0 means no top-k filtering.
    - top_p: float ∈ (0,1]; cumulative prob. threshold for nucleus sampling.
    """
    # Tokenize prompt
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Generate with sampling
    outputs = model.generate(
        **inputs,
        max_length=max_length,
        do_sample=True,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        num_return_sequences=num_return_sequences,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id
    )

    # Decode and return as list of strings
    return [
        tokenizer.decode(o, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        for o in outputs
    ]

# Example prompt (informal Persian)
prompt = "تو مطمئنی که بابا بلده گره دوتائی به کفشم بزنه وقتی که من صبحها میخوام برم مدرسه؟"

# Generate 5 samples with nucleus sampling (p=0.95), temp=0.9
samples = sample_outputs(
    prompt,
    num_return_sequences=5,
    temperature=0.9,
    top_k=0,
    top_p=0.95
)

print(*samples, sep="\n---\n")


تو مطمئنی که بابا بلده گره دوتایی به کفشم بزنه وقتی که من صبح ها می خواهم رفت مدرسه؟
---
تو مطمئنی که بابا بلده گره دوتایی به کفشان بزنه وقتی که من صبح ها میخوام برم مدرسه؟
---
تو مطمئنی که بابا بلده گره دوتی به کفشم بزنه وقتی که من صبح ها می خواهم به مدرسه برم؟
---
تو مطمئنی که بابا بلده گره دوتایی به کفشم بزند وقتی که من صبح ها میخوام برم مدرسه؟
---
تو مطمئنی که بابا بلده گره دوتایی به کفشم بزنه وقتی که من صبح ها میخوام برم مدرسه؟


# Discussion

# LoRA Hyper-parameters and Their Effects

**Rank (r):** Controls the number of trainable parameters added. A higher r increases model capacity but also memory use and overfitting risk; a smaller r uses less compute and often trains more stably.

**Alpha (α):** Scales the low-rank updates (∆W×α/r). Raising α effectively raises the learning rate for those adapters—too large and training can diverge, too small and convergence slows.

**Dropout:** Injects noise into the LoRA updates. A modest rate (e.g. 0.1) helps regularize and prevents overfitting; excessive dropout, however, can impede learning and destabilize gradients.

----
# Deterministic vs. Stochastic Decoding

**Deterministic (Greedy/Beam):** Always picks the highest-probability tokens or beams. Tends to produce repetitive, “bland” text because it maximizes likelihood without regard for diversity.

**Stochastic (Temp/Top-k/Top-p):** Samples from a truncated distribution.
- **Temperature (T)** flattens or sharpens probabilities (higher T → more randomness).
- **Top-k** limits choices to the k most likely tokens.
- **Top-p (nucleus)** samples from the smallest set whose cumulative probability ≥ p, trimming the unreliable tail. These methods inject controlled randomness, yielding more varied yet coherent outputs.
----
# Suggested Improvement

**Back-Translation Data Augmentation:** Generate additional informal–formal pairs by translating informal sentences into a pivot language and back to Persian. This creates pseudo-formal variants, enriching your training set and improving model robustness without extra human labeling.


# Key References  

| Topic | Paper |
|-------|------------------------------|
| Corpus | *Ehsani et al.* “Developing an Informal‑Formal Persian Corpus.” 🇮🇷 |
| Model | *Xue et al.* “mT5: A Massively Multilingual Pre‑trained Text‑to‑Text Transformer.” TACL 2021 |
| Fine‑tuning | *Hu et al.* “LoRA: Low‑Rank Adaptation of Large Language Models.” ICML 2022 |
| Decoding | *Holtzman et al.* “The Curious Case of Neural Text Degeneration.” ICLR 2020 |
