In [1]:
from huggingface_hub import login
from google.colab import userdata

login(token=userdata.get('HF_READ_TOKEN'))

In [2]:
!pip install -q sentence-transformers evaluate rouge_score

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m84.1/84.1 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


In [6]:
import pandas as pd
from dataclasses import dataclass
from typing import List, Dict, Any, Tuple
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer
from tqdm import tqdm # üëà Added for progress bars

# --- Paths ---
TRAIN_FILE_PATH = './webqsp_train_validated_2_hops.pkl'
# üî¥ SET THIS TO YOUR VALIDATION FILE
VAL_FILE_PATH = './webqsp_val_validated_2_hops.pkl'
BEST_CHECKPOINT_PATH = 'granite_2hop.pt'

"""
LightPROF-style adapter training (fixed):
- ... (same as before) ...
- ADDED: Validation loop and best model checkpointing.
"""

@dataclass
class Config:
    qwen_model: str = "ibm-granite/granite-4.0-1b"  # or "Qwen/Qwen3-0.6B"
    bge_model: str = "BAAI/bge-base-en-v1.5"
    lr: float = 2e-4
    batch_size: int = 4
    epochs: int = 10 # Note: You set this to 1, validation will run once.
    prompt_length: int = 8
    max_question_len: int = 128
    max_answer_len: int = 32
    hard_prompt: str = (
        "You are a precise KGQA assistant. Use the embedded knowledge to answer succinctly.\n"
        "Return ONLY the final answer text (no extra words).\n"
    )
    device: str = "cuda" if torch.cuda.is_available() else "cpu"

cfg = Config()

# --- UNCHANGED FUNCTIONS ---

def serialize_reasoning_graph(graph_obj: Any) -> str:
    if isinstance(graph_obj, str):
        return graph_obj
    parts = []
    try:
        for item in graph_obj:
            if isinstance(item, (list, tuple)):
                parts.append(" -> ".join([str(x) for x in item]))
            else:
                parts.append(str(item))
    except Exception:
        parts.append(str(graph_obj))
    return " || ".join(parts)

class KGQADataset(Dataset):
    def __init__(self, frame):
        self.df = frame.reset_index(drop=True)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        q = str(row['question'])
        rg = serialize_reasoning_graph(row['reasoning_graph'])
        ans = row['answer']
        if isinstance(ans, (list, tuple)):
            ans = ans[0]
        ans = str(ans)
        return {"question": q, "reasoning_graph_text": rg, "answer_text": ans}

print("Loading models‚Ä¶")
q_tokenizer = AutoTokenizer.from_pretrained(cfg.qwen_model)
if q_tokenizer.pad_token is None:
    q_tokenizer.pad_token = q_tokenizer.eos_token
q_model = AutoModelForCausalLM.from_pretrained(cfg.qwen_model)
q_model.config.pad_token_id = q_tokenizer.pad_token_id
q_model.eval()
for p in q_model.parameters():
    p.requires_grad = False

bge = SentenceTransformer(cfg.bge_model)
bge.eval()
for p in bge.parameters():
    p.requires_grad = False

hidden_size = getattr(q_model.config, 'hidden_size', getattr(q_model.config, 'n_embd', getattr(q_model.config, 'd_model', None)))
assert hidden_size is not None, "Could not infer Qwen hidden size from config."

class KnowledgeAdapter(nn.Module):
    def __init__(self, in_dim: int, hidden_out: int, prompt_length: int):
        super().__init__()
        self.prompt_length = prompt_length
        out_dim = hidden_out * prompt_length
        self.net = nn.Sequential(
            nn.Linear(in_dim, 2*in_dim),
            nn.GELU(),
            nn.Dropout(0.1),
            nn.Linear(2*in_dim, out_dim)
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        y = self.net(x)
        return y.view(x.size(0), self.prompt_length, -1)

adapter = KnowledgeAdapter(in_dim=768, hidden_out=hidden_size, prompt_length=cfg.prompt_length).to(cfg.device)
optimizer = torch.optim.AdamW(adapter.parameters(), lr=cfg.lr)

SEP = "\n\n"

def build_inputs(batch: List[Dict[str, str]]) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    rg_texts = [ex['reasoning_graph_text'] for ex in batch]
    with torch.no_grad():
        rg_emb = bge.encode(rg_texts, convert_to_tensor=True, device=cfg.device, normalize_embeddings=True)
    # Fix: SentenceTransformer may produce tensors under inference mode; clone to enable autograd usage downstream
    rg_emb = rg_emb.detach().clone()
    soft_prompts = adapter(rg_emb)

    inputs, labels = [], []
    for ex in batch:
        prompt_q = cfg.hard_prompt + SEP + "Question: " + ex['question'] + SEP + "Answer:"
        enc_q = q_tokenizer(prompt_q, return_tensors='pt', truncation=True, max_length=cfg.max_question_len)
        enc_a = q_tokenizer(ex['answer_text'], return_tensors='pt', truncation=True, max_length=cfg.max_answer_len)
        input_ids_q = enc_q['input_ids'][0]
        labels_q = torch.full_like(input_ids_q, -100)
        input_ids = torch.cat([input_ids_q, enc_a['input_ids'][0]], dim=0)
        labels_ids = torch.cat([labels_q, enc_a['input_ids'][0]], dim=0)
        inputs.append(input_ids)
        labels.append(labels_ids)

    inputs = nn.utils.rnn.pad_sequence(inputs, batch_first=True, padding_value=q_tokenizer.pad_token_id)
    labels = nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=-100)

    with torch.no_grad():
        word_emb = q_model.get_input_embeddings()(inputs.to(cfg.device))
    input_embeds = torch.cat([soft_prompts, word_emb], dim=1)

    pad_soft = torch.full((labels.size(0), soft_prompts.size(1)), -100, dtype=labels.dtype, device=cfg.device)
    labels = torch.cat([pad_soft, labels.to(cfg.device)], dim=1)

    return soft_prompts, input_embeds, labels

def collate_fn(batch):
    return batch

# --- MODIFIED: Load both datasets ---

print(f"Loading training data from {TRAIN_FILE_PATH}...")
train_df = pd.read_pickle(TRAIN_FILE_PATH)

try:
    print(f"Loading validation data from {VAL_FILE_PATH}...")
    val_df = pd.read_pickle(VAL_FILE_PATH)
    print(f"Data loaded: {len(train_df)} training samples, {len(val_df)} validation samples.")
except FileNotFoundError:
    print(f"FATAL ERROR: Validation file not found at {VAL_FILE_PATH}")
    print("Please set the VAL_FILE_PATH variable.")
    # Exiting or raising an error would be appropriate here
    # For now, it will crash on the next line if val_df is not defined.

train_dataset = KGQADataset(train_df)
val_dataset = KGQADataset(val_df)

train_loader = DataLoader(train_dataset, batch_size=cfg.batch_size, shuffle=True, drop_last=False, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=cfg.batch_size, shuffle=False, drop_last=False, collate_fn=collate_fn)


q_model.to(cfg.device)
# adapter.train() # This is now called inside the loop

# --- MODIFIED: Training loop with validation ---

best_val_loss = float('inf')
print("Starting training...")

for epoch in range(cfg.epochs):

    # --- TRAINING PHASE ---
    adapter.train() # Set model to training mode
    total_train_loss = 0.0

    train_pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{cfg.epochs} [TRAIN]")

    for step, batch in enumerate(train_pbar):
        optimizer.zero_grad(set_to_none=True)
        _, input_embeds, labels = build_inputs(batch)
        out = q_model(inputs_embeds=input_embeds, labels=labels)
        loss = out.loss
        loss.backward()
        optimizer.step()
        total_train_loss += loss.item()

        # Update progress bar description
        train_pbar.set_postfix({'loss': round(loss.item(), 4)})

    avg_train_loss = total_train_loss / max(1, len(train_loader))

    # --- VALIDATION PHASE ---
    adapter.eval() # Set model to evaluation mode
    total_val_loss = 0.0

    val_pbar = tqdm(val_loader, desc=f"Epoch {epoch+1}/{cfg.epochs} [VALIDATE]")

    with torch.no_grad(): # Disable gradient calculation
        for step, batch in enumerate(val_pbar):
            _, input_embeds, labels = build_inputs(batch)
            out = q_model(inputs_embeds=input_embeds, labels=labels)
            loss = out.loss
            total_val_loss += loss.item()
            val_pbar.set_postfix({'val_loss': round(loss.item(), 4)})

    avg_val_loss = total_val_loss / max(1, len(val_loader))

    print(f"\nEpoch {epoch+1} Summary: "
          f"Avg Train Loss: {avg_train_loss:.4f} | "
          f"Avg Val Loss: {avg_val_loss:.4f}")

    # --- CHECKPOINTING ---
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        print(f"  üéâ New best model! Saving checkpoint to {BEST_CHECKPOINT_PATH} (Val Loss: {best_val_loss:.4f})")

        ckpt = {'config': cfg.__dict__, 'state_dict': adapter.state_dict()}
        torch.save(ckpt, BEST_CHECKPOINT_PATH)
    else:
        print(f"  (Val loss did not improve from {best_val_loss:.4f})")

print(f"\nTraining complete. Best model saved to {BEST_CHECKPOINT_PATH} with val loss: {best_val_loss:.4f}")

# --- MODIFIED: Load best model for inference ---
print(f"Loading best model from {BEST_CHECKPOINT_PATH} for inference...")
try:
    ckpt = torch.load(BEST_CHECKPOINT_PATH, map_location=cfg.device)
    adapter.load_state_dict(ckpt['state_dict'])
    print("Best adapter weights loaded successfully.")
except FileNotFoundError:
    print(f"Warning: Could not find {BEST_CHECKPOINT_PATH}. "
          f"The adapter in memory (from the last epoch) will be used.")

adapter.eval() # Ensure adapter is in eval mode

# --- UNCHANGED INFERENCE FUNCTION ---

@torch.no_grad()
def infer_answer(question: str, reasoning_graph_obj: Any, max_new_tokens: int = 32) -> str:
    q_model.eval()
    rg_text = serialize_reasoning_graph(reasoning_graph_obj)
    rg_emb = bge.encode([rg_text], convert_to_tensor=True, device=cfg.device, normalize_embeddings=True)
    # Fix: clone out of inference mode so adapter can compute gradients if needed
    rg_emb = rg_emb.detach().clone()
    soft = adapter(rg_emb)
    prompt_q = cfg.hard_prompt + SEP + "Question: " + question + SEP + "Answer:"
    enc_q = q_tokenizer(prompt_q, return_tensors='pt', truncation=True, max_length=cfg.max_question_len)
    word_emb = q_model.get_input_embeddings()(enc_q['input_ids'].to(cfg.device))
    inputs_embeds = torch.cat([soft, word_emb], dim=1)
    gen_ids = q_model.generate(inputs_embeds=inputs_embeds, max_new_tokens=max_new_tokens, do_sample=False)
    text = q_tokenizer.decode(gen_ids[0], skip_special_tokens=True)
    return text.strip()

print(f"\nInference function is ready and uses the *best* model from {BEST_CHECKPOINT_PATH}.")

Loading models‚Ä¶
Loading training data from ./webqsp_train_validated_2_hops.pkl...
Loading validation data from ./webqsp_val_validated_2_hops.pkl...
Data loaded: 1505 training samples, 242 validation samples.
Starting training...


Epoch 1/10 [TRAIN]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 377/377 [01:09<00:00,  5.42it/s, loss=0.608]
Epoch 1/10 [VALIDATE]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 61/61 [00:06<00:00, 10.13it/s, val_loss=1.97]



Epoch 1 Summary: Avg Train Loss: 2.3520 | Avg Val Loss: 2.0843
  üéâ New best model! Saving checkpoint to granite_2hop.pt (Val Loss: 2.0843)


Epoch 2/10 [TRAIN]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 377/377 [01:08<00:00,  5.49it/s, loss=1.24]
Epoch 2/10 [VALIDATE]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 61/61 [00:05<00:00, 10.22it/s, val_loss=1.93]



Epoch 2 Summary: Avg Train Loss: 1.6124 | Avg Val Loss: 2.0347
  üéâ New best model! Saving checkpoint to granite_2hop.pt (Val Loss: 2.0347)


Epoch 3/10 [TRAIN]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 377/377 [01:08<00:00,  5.50it/s, loss=2.94]
Epoch 3/10 [VALIDATE]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 61/61 [00:05<00:00, 10.17it/s, val_loss=2.09]



Epoch 3 Summary: Avg Train Loss: 1.4654 | Avg Val Loss: 2.0216
  üéâ New best model! Saving checkpoint to granite_2hop.pt (Val Loss: 2.0216)


Epoch 4/10 [TRAIN]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 377/377 [01:08<00:00,  5.50it/s, loss=4.9]
Epoch 4/10 [VALIDATE]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 61/61 [00:05<00:00, 10.18it/s, val_loss=2.46]



Epoch 4 Summary: Avg Train Loss: 1.2594 | Avg Val Loss: 2.1638
  (Val loss did not improve from 2.0216)


Epoch 5/10 [TRAIN]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 377/377 [01:08<00:00,  5.52it/s, loss=1.98]
Epoch 5/10 [VALIDATE]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 61/61 [00:06<00:00, 10.14it/s, val_loss=2.64]



Epoch 5 Summary: Avg Train Loss: 1.0717 | Avg Val Loss: 2.2570
  (Val loss did not improve from 2.0216)


Epoch 6/10 [TRAIN]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 377/377 [01:08<00:00,  5.51it/s, loss=0.205]
Epoch 6/10 [VALIDATE]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 61/61 [00:06<00:00, 10.12it/s, val_loss=2.42]



Epoch 6 Summary: Avg Train Loss: 0.9622 | Avg Val Loss: 2.3774
  (Val loss did not improve from 2.0216)


Epoch 7/10 [TRAIN]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 377/377 [01:08<00:00,  5.50it/s, loss=0.0899]
Epoch 7/10 [VALIDATE]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 61/61 [00:06<00:00, 10.17it/s, val_loss=1.89]



Epoch 7 Summary: Avg Train Loss: 0.7862 | Avg Val Loss: 2.4407
  (Val loss did not improve from 2.0216)


Epoch 8/10 [TRAIN]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 377/377 [01:08<00:00,  5.50it/s, loss=1.04]
Epoch 8/10 [VALIDATE]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 61/61 [00:06<00:00, 10.16it/s, val_loss=3.78]



Epoch 8 Summary: Avg Train Loss: 0.9321 | Avg Val Loss: 2.5517
  (Val loss did not improve from 2.0216)


Epoch 9/10 [TRAIN]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 377/377 [01:08<00:00,  5.50it/s, loss=0.238]
Epoch 9/10 [VALIDATE]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 61/61 [00:06<00:00, 10.11it/s, val_loss=2.71]



Epoch 9 Summary: Avg Train Loss: 0.8239 | Avg Val Loss: 2.5673
  (Val loss did not improve from 2.0216)


Epoch 10/10 [TRAIN]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 377/377 [01:08<00:00,  5.51it/s, loss=0.19]
Epoch 10/10 [VALIDATE]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 61/61 [00:05<00:00, 10.20it/s, val_loss=2.08]



Epoch 10 Summary: Avg Train Loss: 0.7174 | Avg Val Loss: 2.2877
  (Val loss did not improve from 2.0216)

Training complete. Best model saved to granite_2hop.pt with val loss: 2.0216
Loading best model from granite_2hop.pt for inference...
Best adapter weights loaded successfully.

Inference function is ready and uses the *best* model from granite_2hop.pt.


In [7]:
import torch
import torch.nn as nn
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer
from dataclasses import dataclass
from typing import Any, List, Dict, Tuple
import evaluate  # üëà Import the evaluate library
from tqdm import tqdm # üëà For a nice progress bar

# --- 1. Configuration & Model Definitions ---
# (Copied from your script)

@dataclass
class Config:
    qwen_model: str = "ibm-granite/granite-4.0-1b"
    bge_model: str = "BAAI/bge-base-en-v1.5"
    prompt_length: int = 8
    max_question_len: int = 128
    hard_prompt: str = (
        "You are a precise KGQA assistant. Use the embedded knowledge to answer succinctly.\n"
        "Return ONLY the final answer text (no extra words).\n"
    )
    device: str = "cuda" if torch.cuda.is_available() else "cpu"

cfg = Config()
SEP = "\n\n"
ADAPTER_CHECKPOINT_PATH = 'granite_2hop.pt'

def serialize_reasoning_graph(graph_obj: Any) -> str:
    if isinstance(graph_obj, str):
        return graph_obj
    parts = []
    try:
        for item in graph_obj:
            if isinstance(item, (list, tuple)):
                parts.append(" -> ".join([str(x) for x in item]))
            else:
                parts.append(str(item))
    except Exception:
        parts.append(str(graph_obj))
    return " || ".join(parts)

class KnowledgeAdapter(nn.Module):
    def __init__(self, in_dim: int, hidden_out: int, prompt_length: int):
        super().__init__()
        self.prompt_length = prompt_length
        out_dim = hidden_out * prompt_length
        self.net = nn.Sequential(
            nn.Linear(in_dim, 2*in_dim),
            nn.GELU(),
            nn.Dropout(0.1),
            nn.Linear(2*in_dim, out_dim)
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        y = self.net(x)
        return y.view(x.size(0), self.prompt_length, -1)

# --- 2. Load All Models ---
print("Loading models for inference...")

q_tokenizer = AutoTokenizer.from_pretrained(cfg.qwen_model)
if q_tokenizer.pad_token is None:
    q_tokenizer.pad_token = q_tokenizer.eos_token

q_model = AutoModelForCausalLM.from_pretrained(cfg.qwen_model)
q_model.config.pad_token_id = q_tokenizer.pad_token_id
q_model.eval()
for p in q_model.parameters(): p.requires_grad = False
q_model.to(cfg.device)

bge = SentenceTransformer(cfg.bge_model)
bge.eval()
for p in bge.parameters(): p.requires_grad = False
bge.to(cfg.device)

hidden_size = getattr(q_model.config, 'hidden_size', getattr(q_model.config, 'n_embd', getattr(q_model.config, 'd_model', None)))
adapter = KnowledgeAdapter(
    in_dim=768, hidden_out=hidden_size, prompt_length=cfg.prompt_length
).to(cfg.device)

print(f"Loading trained adapter weights from {ADAPTER_CHECKPOINT_PATH}...")
ckpt = torch.load(ADAPTER_CHECKPOINT_PATH, map_location=cfg.device)
adapter.load_state_dict(ckpt['state_dict'])
adapter.eval()
print("All models loaded successfully.")


# --- 3. Improved Inference Function ---

@torch.no_grad()
def infer_answer_only(question: str, reasoning_graph_obj: Any, max_new_tokens: int = 32) -> str:
    """
    Runs inference and decodes ONLY the newly generated tokens.
    (FIXED: Removed incorrect slicing)
    """
    rg_text = serialize_reasoning_graph(reasoning_graph_obj)
    rg_emb = bge.encode([rg_text], convert_to_tensor=True, device=cfg.device, normalize_embeddings=True)
    soft = adapter(rg_emb)

    prompt_q = cfg.hard_prompt + SEP + "Question: " + question + SEP + "Answer:"
    enc_q = q_tokenizer(prompt_q, return_tensors='pt', truncation=True, max_length=cfg.max_question_len)
    word_emb = q_model.get_input_embeddings()(enc_q['input_ids'].to(cfg.device))

    inputs_embeds = torch.cat([soft, word_emb], dim=1)

    gen_ids = q_model.generate(
        inputs_embeds=inputs_embeds,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        pad_token_id=q_tokenizer.pad_token_id
    )

    # ‚≠ê **THE FIX**: Decode gen_ids[0] directly.
    # The slicing logic has been removed.
    text = q_tokenizer.decode(gen_ids[0], skip_special_tokens=True)
    return text.strip()

# --- 4. Run ROUGE Evaluation ---

# ‚ö†Ô∏è IMPORTANT: You should load a *separate* validation or test set here.
# For this example, I'm just re-using the 'data1' pickle and taking a sample.
# DO NOT evaluate on your training data for a real result.
try:
    # Load the data you want to evaluate on
    eval_df = pd.read_pickle('./webqsp_test_validated_2_hops.pkl')
    # For demonstration, we'll just use the first 50 samples.
    # Replace .head(50) with your full validation set.
    # eval_df = eval_df.head(50)
    print(f"Loaded evaluation data with {len(eval_df)} samples.")

except FileNotFoundError:
    print("Evaluation data file not found. Skipping ROUGE evaluation.")
    eval_df = None

if eval_df is not None:
    predictions = []
    references = []

    print("Generating predictions for ROUGE evaluation...")
    # Loop through the evaluation dataframe
    for _, row in tqdm(eval_df.iterrows(), total=len(eval_df)):
        question = str(row['question'])
        graph = row['reasoning_graph']

        # Get the ground-truth answer (handle list format)
        gt_answer = row['answer']
        if isinstance(gt_answer, (list, tuple)):
            gt_answer = gt_answer[0]
        gt_answer = str(gt_answer)

        # Get the model's predicted answer
        pred_answer = infer_answer_only(question, graph)

        predictions.append(pred_answer)
        references.append(gt_answer)

    print("\n--- Example Predictions ---")
    for i in range(min(3, len(predictions))):
        print(f"Ref {i+1}: {references[i]}")
        print(f"Pred {i+1}: {predictions[i]}\n")


Loading models for inference...
Loading trained adapter weights from granite_2hop.pt...
All models loaded successfully.
Loaded evaluation data with 807 samples.
Generating predictions for ROUGE evaluation...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 807/807 [19:59<00:00,  1.49s/it]


--- Example Predictions ---
Ref 1: Jamaican English
Pred 1: English Language, Jamaican Patois, Jamaican Creole Language, Jamaican English Language, Jamaican Standard Jamaican Language, Jamaican Standard Jamaican

Ref 2: Franklin stove
Pred 2: Franklin stove, Lightning rod, Bifocals, American dollar bill, Lightning rod, Glass armonica, Post Office, American System of weights and

Ref 3: Pat Nixon
Pred 3: Pat Nixon (n√©e Young) (1942‚Äì2007) - First lady of the United States, 1969‚Äì1974 and 1981






In [8]:
from rouge_score import rouge_scorer
import numpy as np


# Initialize the scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL', 'rougeLsum'], use_stemmer=True)

# Store results
all_scores = []

for pred, ref in zip(predictions, references):
    score = scorer.score(ref, pred)
    all_scores.append(score)

# --- üìà Print detailed results ---
print("\n--- üìà Per-sample ROUGE Scores (Precision | Recall | F1) ---")
for i, s in enumerate(all_scores):
    print(f"\nExample {i+1}:")
    for metric, val in s.items():
        print(f"{metric.upper():<10} ‚Üí Precision: {val.precision:.4f} | Recall: {val.recall:.4f} | F1: {val.fmeasure:.4f}")

# --- üìä Compute mean ROUGE scores ---
mean_scores = {}
for metric in ['rouge1', 'rouge2', 'rougeL', 'rougeLsum']:
    mean_scores[metric] = {
        'precision': np.mean([s[metric].precision for s in all_scores]),
        'recall': np.mean([s[metric].recall for s in all_scores]),
        'fmeasure': np.mean([s[metric].fmeasure for s in all_scores]),
    }

# --- üßæ Print mean results ---
print("\n--- üßæ Mean ROUGE Scores Across Dataset ---")
for metric, vals in mean_scores.items():
    print(f"{metric.upper():<10} ‚Üí Precision: {vals['precision']:.4f} | Recall: {vals['recall']:.4f} | F1: {vals['fmeasure']:.4f}")



--- üìà Per-sample ROUGE Scores (Precision | Recall | F1) ---

Example 1:
ROUGE1     ‚Üí Precision: 0.1176 | Recall: 1.0000 | F1: 0.2105
ROUGE2     ‚Üí Precision: 0.0625 | Recall: 1.0000 | F1: 0.1176
ROUGEL     ‚Üí Precision: 0.1176 | Recall: 1.0000 | F1: 0.2105
ROUGELSUM  ‚Üí Precision: 0.1176 | Recall: 1.0000 | F1: 0.2105

Example 2:
ROUGE1     ‚Üí Precision: 0.1053 | Recall: 1.0000 | F1: 0.1905
ROUGE2     ‚Üí Precision: 0.0556 | Recall: 1.0000 | F1: 0.1053
ROUGEL     ‚Üí Precision: 0.1053 | Recall: 1.0000 | F1: 0.1905
ROUGELSUM  ‚Üí Precision: 0.1053 | Recall: 1.0000 | F1: 0.1905

Example 3:
ROUGE1     ‚Üí Precision: 0.1176 | Recall: 1.0000 | F1: 0.2105
ROUGE2     ‚Üí Precision: 0.0625 | Recall: 1.0000 | F1: 0.1176
ROUGEL     ‚Üí Precision: 0.1176 | Recall: 1.0000 | F1: 0.2105
ROUGELSUM  ‚Üí Precision: 0.1176 | Recall: 1.0000 | F1: 0.2105

Example 4:
ROUGE1     ‚Üí Precision: 0.0833 | Recall: 1.0000 | F1: 0.1538
ROUGE2     ‚Üí Precision: 0.0000 | Recall: 0.0000 | F1: 0.0000
ROUGEL