In [None]:
import os; os.kill(os.getpid(), 9) #restarting the session


In [None]:
!pip -q install transformers datasets accelerate evaluate rouge-score sentencepiece

In [None]:
!pip install evaluate

In [None]:
!pip install rouge_score

In [None]:
!pip install -U transformers accelerate datasets evaluate rouge-score sentencepiece


In [None]:
!pip uninstall -y transformers accelerate datasets evaluate rouge-score sentencepiece
!pip install "transformers==4.46.2" "accelerate==1.2.1" datasets evaluate rouge-score sentencepiece
import os; os.kill(os.getpid(), 9)


In [None]:
# ==== Turn off W&B logging for the whole process ====
import os
os.environ.update({"WANDB_DISABLED": "true"})

# ==== Imports ====
import pandas as pd
import numpy as np
import evaluate
import matplotlib.pyplot as plt
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    TrainingArguments,
    Trainer
)
import torch

# ==== Paths (ROND: no chunking) ====
# One processed CSV per split, all following the same naming pattern.
TRAIN, VAL, TEST = (
    f"./rond_{split}_processed.csv" for split in ("train", "val", "test")
)

# ==== Load CSVs → HuggingFace Datasets ====
def load_split(path):
    """Read one processed CSV split and return it as a `Dataset` of text pairs.

    The CSV must contain `input` and `output` columns. When an `instruction`
    column is present, it is prepended to the input to form the source text.

    Args:
        path: Path of the CSV file to read.

    Returns:
        A `Dataset` with two string columns: `source` and `target`.

    Raises:
        KeyError: If the `input` or `output` column is missing from the CSV.
    """
    df = pd.read_csv(path)

    missing = [col for col in ("input", "output") if col not in df.columns]
    if missing:
        raise KeyError(f"{path}: missing required column(s) {missing}")

    # A row without a reference output cannot be trained on or scored, and a
    # null target would make the tokenizer fail later on.
    df = df.dropna(subset=["output"])

    # Fill missing text with "" first: astype(str) alone would turn NaN into
    # the literal word "nan" inside the prompt.
    inputs = df["input"].fillna("").astype(str)
    if "instruction" in df.columns:
        instructions = df["instruction"].fillna("").astype(str)
        source = "Instruction: " + instructions + "\nInput: " + inputs
    else:
        source = inputs

    pairs = pd.DataFrame({"source": source, "target": df["output"].astype(str)})
    # Rows may have been dropped above; do not carry the pandas index over as
    # an extra dataset column.
    return Dataset.from_pandas(pairs, preserve_index=False)

# Load each CSV split and collect the results under their split names.
ds = DatasetDict({
    split_name: load_split(csv_path)
    for split_name, csv_path in (
        ("train", TRAIN),
        ("validation", VAL),
        ("test", TEST),
    )
})

# ==== Model & Tokenizer ====
# Both are loaded from the same pretrained checkpoint name.
MODEL_NAME = "facebook/bart-base"
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

# Truncation limits (in tokens) used by `preprocess` for sources and targets.
max_source_len, max_target_len = 512, 128

def preprocess(batch):
    """Tokenize a batch of examples into model inputs and labels.

    Args:
        batch: Mapping with `source` and `target` entries, each a list of
            strings (the form a batched `Dataset.map` passes in).

    Returns:
        The tokenizer output for the sources, truncated to `max_source_len`,
        with an added `labels` entry holding the target token ids truncated
        to `max_target_len`.
    """
    model_in = tokenizer(batch["source"], max_length=max_source_len, truncation=True)
    # `text_target=` is the supported replacement for the deprecated
    # `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=batch["target"], max_length=max_target_len, truncation=True)
    model_in["labels"] = labels["input_ids"]
    return model_in

# Tokenize every split; the original text columns are removed from the result.
tokenized = ds.map(
    preprocess,
    batched=True,
    remove_columns=ds["train"].column_names,
)
# Batch collator built from the tokenizer and the model.
collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# ==== Metrics (ROUGE-L) ====
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    """Compute ROUGE-L between decoded predictions and decoded labels.

    Args:
        eval_pred: A `(predictions, labels)` pair. Predictions may be a tuple
            whose first element is used, and may be either logits (3-D) or
            token ids (2-D), as numpy arrays or torch tensors. Labels use
            -100 at ignored positions.

    Returns:
        A dict with a single key, `rougeL`.
    """
    preds, labels = eval_pred

    # 🧠 Handle tuple outputs: only the first element is scored
    if isinstance(preds, tuple):
        preds = preds[0]

    # Convert torch tensors → numpy BEFORE inspecting the rank, so tensor
    # logits are reduced to token ids as well
    if isinstance(preds, torch.Tensor):
        preds = preds.detach().cpu().numpy()
    if isinstance(labels, torch.Tensor):
        labels = labels.detach().cpu().numpy()
    preds = np.asarray(preds)
    labels = np.asarray(labels)

    if preds.ndim == 3:
        preds = preds.argmax(axis=-1)  # convert logits → token IDs

    pad_id = tokenizer.pad_token_id
    ignored = labels == -100

    # Positions ignored in the labels carry no target, so whatever was
    # predicted there must not be decoded and scored.
    if preds.shape == labels.shape:
        preds = np.where(ignored, pad_id, preds)
    # Negative ids (e.g. -100 padding) cannot be decoded by the tokenizer.
    preds = np.where(preds < 0, pad_id, preds)

    # Replace -100 (ignored tokens) with pad_token_id
    labels = np.where(ignored, pad_id, labels)

    # Decode
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute ROUGE
    scores = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    return {"rougeL": scores["rougeL"]}

# ==== TrainingArguments ====
args = TrainingArguments(
    output_dir="./bart_rond_baseline",
    eval_strategy="epoch",  # current name of the deprecated `evaluation_strategy`
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    # fp16 mixed precision needs a CUDA device; fall back to full precision
    # instead of failing at construction time on CPU-only machines.
    fp16=torch.cuda.is_available(),
    logging_dir="./logs",
    logging_steps=100,
    report_to=[],  # Disable wandb/tensorboard
    disable_tqdm=False,
)

# ==== Trainer ====
def _logits_to_token_ids(logits, labels):
    """Reduce one batch of model outputs to predicted token ids.

    Passed to the Trainer as `preprocess_logits_for_metrics`, so that only
    token ids are accumulated during evaluation rather than the full
    vocabulary-sized logits (and any extra model outputs) for every example.

    Args:
        logits: Logits tensor, or a tuple whose first element is the logits.
        labels: Labels of the batch (unused; part of the hook's signature).

    Returns:
        Tensor of token ids obtained by argmax over the last dimension.
    """
    if isinstance(logits, tuple):
        logits = logits[0]
    return logits.argmax(dim=-1)


def _compute_metrics_from_token_ids(eval_pred):
    """Make accumulated token ids decodable, then delegate to `compute_metrics`.

    When batches of different lengths are concatenated, the Trainer pads the
    predictions with -100, which the tokenizer cannot decode. Those entries
    are replaced with the pad token id before scoring.
    """
    preds, labels = eval_pred
    preds = np.where(preds == -100, tokenizer.pad_token_id, preds)
    return compute_metrics((preds, labels))


trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    tokenizer=tokenizer,
    data_collator=collator,
    compute_metrics=_compute_metrics_from_token_ids,
    preprocess_logits_for_metrics=_logits_to_token_ids,
)

# ==== Train ====
# The status messages below had their emoji corrupted by an encoding
# round-trip; the intended characters are restored.
print("🚀 Training started...\n")
train_result = trainer.train()

# ==== Evaluate on test set ====
print("\n🔍 Evaluating on test set...\n")
# metric_key_prefix="test" makes the reported metric names start with "test"
test_metrics = trainer.evaluate(tokenized["test"], metric_key_prefix="test")
print("✅ Test set metrics:", test_metrics)

# ==== Visualize Training Progress ====
# Plot the logged training loss against the step at which it was recorded.
if hasattr(trainer, "state") and trainer.state.log_history:
    # Keep only log entries that carry a "loss" key.
    train_logs = [entry for entry in trainer.state.log_history if "loss" in entry]
    if train_logs:
        # Fall back to the entry's position when no step was recorded.
        steps = [entry.get("step", position) for position, entry in enumerate(train_logs)]
        losses = [entry["loss"] for entry in train_logs]

        fig, ax = plt.subplots(figsize=(7, 4))
        ax.plot(steps, losses, marker='o', label="Training Loss")
        ax.set_title("Training Progress")
        ax.set_xlabel("Steps")
        ax.set_ylabel("Loss")
        ax.grid(True)
        ax.legend()
        plt.show()
