In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# ============================================================
# 0. SETUP
# ============================================================

# Install dependencies
!pip install transformers datasets evaluate nltk sacrebleu rouge-score bitsandbytes peft accelerate --quiet

# Import standard libraries
import json
import math
import gc
from pathlib import Path

# Import third-party libraries
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, GenerationConfig
from datasets import load_dataset, load_from_disk
import evaluate
import pandas as pd
from tqdm import tqdm

# Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone


In [3]:
# Set project path
project_path = input("➡️ Enter the full path to the project folder (ex: /content/drive/MyDrive/MyProject) : ").strip()
relative_path = Path(project_path)

➡️ Enter the full path to the project folder (ex: /content/drive/MyDrive/MyProject) : /content/drive/MyDrive/Deep Learning Project


In [None]:
# ============================================================
# 1. MODEL LOADING
# ============================================================

# ----------------------------
# 1.1 Load fine-tuned model
# ----------------------------
model_path = str(relative_path / "my-qwen-model")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    trust_remote_code=True,
    quantization_config=bnb_config,
    device_map="auto"
)
model.eval()


Qwen3ForCausalLM(
  (model): Qwen3Model(
    (embed_tokens): Embedding(151936, 1024)
    (layers): ModuleList(
      (0-27): 28 x Qwen3DecoderLayer(
        (self_attn): Qwen3Attention(
          (q_proj): lora.Linear4bit(
            (base_layer): Linear4bit(in_features=1024, out_features=2048, bias=False)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.05, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=1024, out_features=16, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=16, out_features=2048, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
            (lora_magnitude_vector): ModuleDict()
          )
          (k_proj): Linear4bit(in_features=1024, out_features=1024, bias=False)
          (v_proj): lora.Linear4bit(
            (base_layer): Linear4bit(in_features=102

In [None]:
# ----------------------------
# 1.2 Load base (pretrained) model
# ----------------------------
base_model_name = "Qwen/Qwen3-0.6B"
base_tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
base_tokenizer.pad_token = base_tokenizer.eos_token
base_tokenizer.padding_side = "left"

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    trust_remote_code=True,
    quantization_config=bnb_config,
    device_map="auto"
)
base_model.eval()

Qwen3ForCausalLM(
  (model): Qwen3Model(
    (embed_tokens): Embedding(151936, 1024)
    (layers): ModuleList(
      (0-27): 28 x Qwen3DecoderLayer(
        (self_attn): Qwen3Attention(
          (q_proj): Linear4bit(in_features=1024, out_features=2048, bias=False)
          (k_proj): Linear4bit(in_features=1024, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=1024, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=2048, out_features=1024, bias=False)
          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
        )
        (mlp): Qwen3MLP(
          (gate_proj): Linear4bit(in_features=1024, out_features=3072, bias=False)
          (up_proj): Linear4bit(in_features=1024, out_features=3072, bias=False)
          (down_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): Qwen3RMSNorm((1024,), eps=1e-06)
 

In [None]:
# ---------------------------
# 1.3 Fix generation_config for Qwen
# ---------------------------

gen_config = GenerationConfig(
    do_sample=False,
    repetition_penalty=1.2,
    no_repeat_ngram_size=3,
    eos_token_id=tokenizer.eos_token_id,
)

model.generation_config = gen_config
base_model.generation_config = gen_config


In [None]:
# ============================================================
# 2. DATASET PREPARATION
# ============================================================

# ----------------------------
# 2.1 Load dataset
# ----------------------------

dataset = load_dataset("csv", data_files=str(relative_path / "medDataset_processed.csv"), split="train")
train_val, test = dataset.train_test_split(test_size=0.1, seed=42).values()
questions = test["Question"]
answers = test["Answer"]


# ----------------------------
# 2.2 Load tokenized test dataset
# ----------------------------

tokenized_test = load_from_disk(str(relative_path / "tokenized_test"))
tokenized_test.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# ============================================================
# 3. PREDICTION GENERATION
# ============================================================

# ----------------------------
# 3.1 Prediction generation function
# ----------------------------

def generate_predictions(
    texts,
    tokenizer,
    model,
    gen_config,
    batch_size=4,
    save_every=100,
    save_path="partial_preds.json"
):
    preds = []
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    for i in tqdm(range(0, len(texts), batch_size), desc="Generating predictions"):
        batch_questions = texts[i:i + batch_size]

        # Match fine-tuning prompt
        formatted_prompts = [
          "<|im_start|>user\n" +
          q +
          "\n<|im_end|>\n" +
          "<|im_start|>assistant\n"
          for q in batch_questions
        ]

        # Tokenize
        inputs = tokenizer(
            formatted_prompts,
            return_tensors="pt",
            padding=True,
            truncation=False,
        ).to(device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=128,
                do_sample=gen_config.do_sample,
                repetition_penalty=gen_config.repetition_penalty,
                no_repeat_ngram_size=gen_config.no_repeat_ngram_size,
                eos_token_id=gen_config.eos_token_id,
                pad_token_id=tokenizer.eos_token_id,
            )


        # Decode & clean outputs
        batch_preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        cleaned_preds = []
        for p in batch_preds:
            if "<|im_start|>assistant" in p:
                p = p.split("<|im_start|>assistant")[1]
                p = p.split("<|im_end|>")[0]
            p = p.replace("<think>", "").replace("</think>", "").strip()
            cleaned_preds.append(p)
        preds.extend(cleaned_preds)

        # Save progress periodically
        if (i + batch_size) % (save_every * batch_size) == 0 or (i + batch_size) >= len(texts):
            with open(save_path, "w") as f:
                json.dump(preds, f, ensure_ascii=False, indent=2)
            print(f"Saved progress to {save_path} ({len(preds)} samples)")

        # Memory cleanup
        del inputs, outputs, batch_preds
        torch.cuda.empty_cache()
        gc.collect()

    return preds


In [None]:
# ----------------------------
# 3.2 Generate predictions (fine-tuned model)
# ----------------------------
predictions = generate_predictions(
    texts=questions,
    tokenizer=tokenizer,
    model=model,
    gen_config=gen_config,
    batch_size=8,
    save_every=100,
    save_path=str(relative_path / "partial_preds.json")
)
with open(str(relative_path / "final_predictions.json"), "w") as f:
    json.dump(predictions, f, ensure_ascii=False, indent=2)
print("Fine-tuned model predictions saved.")

Generating predictions:   0%|          | 0/206 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Generating predictions:  48%|████▊     | 99/206 [25:37<27:40, 15.51s/it]

Saved progress to /content/drive/MyDrive/Colab Notebooks/partial_preds.json (800 samples)


Generating predictions:  97%|█████████▋| 199/206 [51:22<01:47, 15.35s/it]

Saved progress to /content/drive/MyDrive/Colab Notebooks/partial_preds.json (1600 samples)


Generating predictions: 100%|█████████▉| 205/206 [52:54<00:15, 15.44s/it]

Saved progress to /content/drive/MyDrive/Colab Notebooks/partial_preds.json (1641 samples)


Generating predictions: 100%|██████████| 206/206 [53:07<00:00, 15.47s/it]


Fine-tuned model predictions saved.


In [None]:
# ----------------------------
# 3.3 Generate predictions (base model)
# ----------------------------
base_predictions = generate_predictions(
    texts=questions,
    tokenizer=base_tokenizer,
    model=base_model,
    gen_config=gen_config,
    batch_size=8,
    save_every=100,
    save_path=str(relative_path / "base_partial_preds.json")
)
with open(str(relative_path / "base_final_predictions.json"), "w") as f:
    json.dump(base_predictions, f, ensure_ascii=False, indent=2)
print("Base model predictions saved.")

Generating predictions:  48%|████▊     | 99/206 [22:29<24:03, 13.49s/it]

Saved progress to /content/drive/MyDrive/Colab Notebooks/base_partial_preds.json (800 samples)


Generating predictions:  97%|█████████▋| 199/206 [45:06<01:34, 13.54s/it]

Saved progress to /content/drive/MyDrive/Colab Notebooks/base_partial_preds.json (1600 samples)


Generating predictions: 100%|█████████▉| 205/206 [46:27<00:13, 13.53s/it]

Saved progress to /content/drive/MyDrive/Colab Notebooks/base_partial_preds.json (1641 samples)


Generating predictions: 100%|██████████| 206/206 [46:38<00:00, 13.58s/it]


Base model predictions saved.


In [None]:
# ============================================================
# 4. LOAD PREDICTIONS (OPTIONAL)
# ============================================================

with open(str(relative_path / "final_predictions.json"), "r") as f:
    predictions = json.load(f)
with open(str(relative_path / "base_final_predictions.json"), "r") as f:
    base_predictions = json.load(f)
print(f"Loaded {len(predictions)} fine-tuned predictions")
print(f"Loaded {len(base_predictions)} base predictions")

Loaded 1641 fine-tuned predictions
Loaded 1641 base predictions


In [None]:
# ============================================================
# 5. EVALUATION
# ============================================================

# ----------------------------
# 5.1 Load metrics
# ----------------------------
squad_metric = evaluate.load("squad")
bleu_metric = evaluate.load("sacrebleu")
rouge_metric = evaluate.load("rouge")
meteor_metric = evaluate.load("meteor")

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [None]:
# ----------------------------
# 5.2 Compute metrics
# ----------------------------
def compute_metrics(preds, answers):
    formatted_preds = [{"id": str(i), "prediction_text": pred} for i, pred in enumerate(preds)]
    formatted_refs  = [{"id": str(i), "answers": {"text": [ans], "answer_start": [0]}} for i, ans in enumerate(answers)]
    squad_res = squad_metric.compute(predictions=formatted_preds, references=formatted_refs)
    bleu_res = bleu_metric.compute(predictions=preds, references=[[r] for r in answers])
    rouge_res = rouge_metric.compute(predictions=preds, references=answers)
    meteor_res = meteor_metric.compute(predictions=preds, references=answers)
    return {
        "EM": squad_res["exact_match"],
        "F1": squad_res["f1"],
        "BLEU": bleu_res["score"],
        "ROUGE-L": rouge_res["rougeL"],
        "METEOR": meteor_res["meteor"]
    }

baseline_metrics = compute_metrics(base_predictions, answers)
fine_tuned_metrics = compute_metrics(predictions, answers)

In [None]:
# ----------------------------
# 5.3 Compute Perplexity
# ----------------------------
def compute_perplexity(model, dataset, batch_size=4):

    model.to(device)
    model.eval()
    loader = DataLoader(dataset, batch_size=batch_size)

    total_loss = 0.0
    total_tokens = 0

    with torch.no_grad():
        for batch in loader:
            input_ids = batch["input_ids"].to(device)
            labels = batch["labels"].to(device)
            attention_mask=batch["attention_mask"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            num_tokens = (labels != -100).sum().item()
            total_loss += loss.item() * num_tokens
            total_tokens += num_tokens

    return math.exp(total_loss / total_tokens)

baseline_perplexity = compute_perplexity(base_model, tokenized_test)
fine_tuned_perplexity = compute_perplexity(model, tokenized_test)
baseline_metrics["Perplexity"] = baseline_perplexity
fine_tuned_metrics["Perplexity"] = fine_tuned_perplexity

In [None]:
# ----------------------------
# 5.4 Display evaluation comparison
# ----------------------------
df = pd.DataFrame([baseline_metrics, fine_tuned_metrics], index=["Baseline", "Fine-tuned"])
print(df)

             EM         F1      BLEU   ROUGE-L    METEOR  Perplexity
Baseline    0.0  19.606634  1.650502  0.123362  0.161470   49.479412
Fine-tuned  0.0  29.329821  4.217529  0.193458  0.231056    3.753385
