In [None]:
# Install evaluation dependencies
# !pip install -q datasets langchain-google-genai rich pydantic tqdm pandas numpy

In [1]:
import json
import torch
from typing import List, Dict, Tuple, Optional
from pathlib import Path
from collections import defaultdict

from pydantic import BaseModel, Field
from datasets import load_dataset, Dataset
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
from transformers import AutoModelForCausalLM, AutoTokenizer
import pandas as pd
import numpy as np

In [2]:
# ==============================================================================
# CONFIGURATION - EDIT THESE FOR YOUR EVALUATION
# ==============================================================================

class Config:
    """Evaluation Configuration"""
    
    # Model to evaluate (change for before/after comparison)
    # MODEL_NAME = "meta-llama/Llama-3.2-3B-Instruct"  # Base model (before training)
    # MODEL_NAME = "Vishva007/Llama-3.2-3B-Instruct-RBI-QA"  # Fine-tuned (after training)
    # MODEL_NAME = "Qwen/Qwen2.5-3B-Instruct"  # Qwen base
    MODEL_NAME = "Vishva007/Qwen2.5-3B-Instruct-RBI-QA"  # Qwen fine-tuned
    
    MODEL_CACHE_DIR = "./Models"
    
    # Dataset
    DATASET_NAME = "Vishva007/RBI-Circular-QA-Dataset"
    EVAL_SPLIT = "eval"
    TRAIN_SPLIT = "train"
    
    # Evaluation size
    EVAL_SIZE = 1000  # Desired number of samples
    SEED = 42
    
    # Stratified sampling columns (in priority order)
    STRATIFY_COLUMNS = [
        'regulation_area',
        'applicable_to', 
        'category',
        'estimated_difficulty'
    ]
    
    # Generation parameters
    MAX_NEW_TOKENS = 1536
    TEMPERATURE = 0.7
    TOP_P = 0.95
    TOP_K = 40
    REPETITION_PENALTY = 1.2
    BATCH_SIZE = 50
    
    # Gemini evaluation
    GOOGLE_API_KEY = "AIzaSyA-MjLcjffR4yDtn-aOuy6CQ9Ze65zMsWY"  # Set your API key
    GOOGLE_MODEL = "gemini-2.0-flash"
    GOOGLE_TEMPERATURE = 0.01
    GOOGLE_MAX_TOKENS = 256
    MAX_CONCURRENCY = 50
    
    # Output
    OUTPUT_DIR = "./Eval"
    TIMESTAMP = None

# Set timestamp
from datetime import datetime
Config.TIMESTAMP = datetime.now().strftime("%Y%m%d_%H%M%S")

print("\n[bold cyan]Configuration Loaded[/bold cyan]")
print(f"Model: {Config.MODEL_NAME}")
print(f"Eval size: {Config.EVAL_SIZE}")
print(f"Stratify by: {Config.STRATIFY_COLUMNS}")



[bold cyan]Configuration Loaded[/bold cyan]
Model: Vishva007/Qwen2.5-3B-Instruct-RBI-QA
Eval size: 1000
Stratify by: ['regulation_area', 'applicable_to', 'category', 'estimated_difficulty']


In [3]:
# Verify GPU
print("\n[bold cyan]System Information[/bold cyan]")
print(f"PyTorch: {torch.__version__}")
print(f"CUDA Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")



[bold cyan]System Information[/bold cyan]
PyTorch: 2.7.1+cu126
CUDA Available: True
GPU: NVIDIA RTX A4500
VRAM: 19.7 GB


In [4]:
# ==============================================================================
# HELPER FUNCTIONS
# ==============================================================================

def map_difficulty_to_categorical(difficulty_score):
    """Map numerical difficulty to categorical labels"""
    if pd.isna(difficulty_score):
        return 'unknown'
    
    difficulty_score = int(difficulty_score)
    if difficulty_score in [1, 2]:
        return 'easy'
    elif difficulty_score in [3, 4, 5]:
        return 'medium'
    elif difficulty_score in [6, 7]:
        return 'hard'
    else:
        return 'unknown'


def create_stratification_key(df: pd.DataFrame, columns: List[str]) -> pd.Series:
    """Create combined stratification key from multiple columns"""
    key_parts = []
    for col in columns:
        if col in df.columns:
            # Convert to string and handle NaN
            key_parts.append(df[col].astype(str).fillna('unknown'))
        else:
            print(f"[yellow]Warning: Column '{col}' not found, skipping[/yellow]")
    
    if not key_parts:
        raise ValueError("No valid stratification columns found")
    
    # Combine with separator
    return key_parts[0].str.cat(key_parts[1:], sep="_")


print("[green]✓ Helper functions loaded[/green]")


[green]✓ Helper functions loaded[/green]


In [5]:
# ==============================================================================
# STRATIFIED DATASET SAMPLING
# ==============================================================================

def load_stratified_eval_dataset(
    dataset_name: str,
    eval_split: str,
    train_split: str,
    target_size: int,
    stratify_columns: List[str],
    seed: int
) -> Tuple[Dataset, str, Dict]:
    """
    Load evaluation dataset with stratified sampling.
    Uses your exact strategy: creates stratification key and samples proportionally.
    """
    print(f"\n[bold blue]Loading Dataset: {dataset_name}[/bold blue]")
    
    # Try to load eval split
    try:
        eval_dataset = load_dataset(dataset_name, split=eval_split)
        eval_df = eval_dataset.to_pandas()
        print(f"✓ Eval split found: {len(eval_df)} samples")
    except Exception as e:
        print(f"[yellow]⚠ Eval split not found: {e}[/yellow]")
        eval_df = pd.DataFrame()
    
    # Decide if we need to sample from train
    if len(eval_df) >= target_size:
        # Eval split is sufficient
        print(f"[green]✓ Eval split sufficient, using {target_size} samples[/green]")
        
        # Apply stratified sampling even on eval split for consistency
        df = eval_df
        source = "eval_split"
        
    else:
        # Need to use train split
        print(f"[yellow]⚠ Eval split insufficient ({len(eval_df)} < {target_size})[/yellow]")
        print("Sampling from train split with stratification")
        
        train_dataset = load_dataset(dataset_name, split=train_split)
        df = train_dataset.to_pandas()
        print(f"✓ Train split loaded: {len(df)} samples")
        source = "train_split"
    
    # Map difficulty to categorical
    if 'estimated_difficulty' in df.columns:
        print("Mapping difficulty scores to categories...")
        df['estimated_difficulty_mapped'] = df['estimated_difficulty'].apply(
            map_difficulty_to_categorical
        )
        
        # Use mapped difficulty for stratification
        stratify_cols_to_use = [
            'estimated_difficulty_mapped' if col == 'estimated_difficulty' else col
            for col in stratify_columns
        ]
    else:
        stratify_cols_to_use = stratify_columns
    
    # Create stratification key
    print(f"Creating stratification key from: {stratify_cols_to_use}")
    df['stratification_key'] = create_stratification_key(df, stratify_cols_to_use)
    
    # Count samples per group
    group_counts = df['stratification_key'].value_counts()
    total_groups = len(group_counts)
    print(f"Found {total_groups} unique stratification groups")
    
    # Sample proportionally from each group
    print(f"Sampling {target_size} examples proportionally...")
    eval_split_list = []
    
    for key in group_counts.index:
        group_df = df[df['stratification_key'] == key]
        
        # Calculate proportional samples for this group
        num_samples = int(np.ceil((group_counts[key] / len(df)) * target_size))
        num_samples = min(num_samples, len(group_df))
        
        if num_samples > 0:
            sampled = group_df.sample(n=num_samples, random_state=seed)
            eval_split_list.append(sampled)
    
    # Combine all groups
    if eval_split_list:
        sampled_df = pd.concat(eval_split_list).reset_index(drop=True)
        
        # Trim to exact target size if oversampled
        if len(sampled_df) > target_size:
            sampled_df = sampled_df.sample(n=target_size, random_state=seed).reset_index(drop=True)
    else:
        print("[red]Error: No samples selected[/red]")
        sampled_df = pd.DataFrame()
    
    # Clean up temporary columns
    columns_to_drop = ['stratification_key']
    if 'estimated_difficulty_mapped' in sampled_df.columns:
        columns_to_drop.append('estimated_difficulty_mapped')
    
    sampled_df = sampled_df.drop(columns=[c for c in columns_to_drop if c in sampled_df.columns])
    
    print(f"\n[green]✓ Sampled {len(sampled_df)} examples from {source}[/green]")
    
    # Show distribution
    distribution = {}
    for col in stratify_columns:
        if col in sampled_df.columns:
            dist = sampled_df[col].value_counts(normalize=True)
            distribution[col] = dist.to_dict()
            print(f"\nDistribution of '{col}':")
            for value, pct in dist.head(5).items():
                print(f"  {value}: {pct*100:.1f}%")
    
    # Convert to Dataset
    dataset = Dataset.from_pandas(sampled_df)
    
    return dataset, source, distribution


# Load evaluation dataset
eval_dataset, dataset_source, distribution = load_stratified_eval_dataset(
    dataset_name=Config.DATASET_NAME,
    eval_split=Config.EVAL_SPLIT,
    train_split=Config.TRAIN_SPLIT,
    target_size=Config.EVAL_SIZE,
    stratify_columns=Config.STRATIFY_COLUMNS,
    seed=Config.SEED
)

print("\n[bold green]Dataset loaded and ready for evaluation![/bold green]")



[bold blue]Loading Dataset: Vishva007/RBI-Circular-QA-Dataset[/bold blue]
✓ Eval split found: 1000 samples
[green]✓ Eval split sufficient, using 1000 samples[/green]
Mapping difficulty scores to categories...
Creating stratification key from: ['regulation_area', 'applicable_to', 'category', 'estimated_difficulty_mapped']
Found 366 unique stratification groups
Sampling 1000 examples proportionally...

[green]✓ Sampled 1000 examples from eval_split[/green]

Distribution of 'regulation_area':
  Banking Regulation: 10.0%
  Foreign Exchange Management: 8.6%
  Anti-Money Laundering: 7.4%
  Prudential Norms: 4.7%
  Payment and Settlement Systems: 4.2%

Distribution of 'applicable_to':
  Regulated Entities: 7.3%
  All Primary (Urban) Co-operative Banks: 4.4%
  Scheduled Commercial Banks (excluding Regional Rural Banks), All India Term Financial Institutions (NABARD, NHB, EXIM Bank, and SIDBI), Small Finance Banks, Systemically Important Non-Deposit taking Non-Banking Financial Companies (NBFC

In [7]:
# ==============================================================================
# PYDANTIC MODELS
# ==============================================================================

class EvaluationResult(BaseModel):
    """Single evaluation result from Gemini"""
    score: int = Field(
        description="1 if answer fully satisfies criteria, 0 otherwise",
        ge=0,
        le=1
    )
    reasoning: Optional[str] = Field(
        default=None,
        description="Brief explanation of the score (1-2 sentences)"
    )

print("[green]✓ Pydantic models defined[/green]")


[green]✓ Pydantic models defined[/green]


In [8]:
# ==============================================================================
# LOAD MODEL & TOKENIZER
# ==============================================================================

print(f"\n[bold blue]Loading Model: {Config.MODEL_NAME}[/bold blue]")

tokenizer = AutoTokenizer.from_pretrained(
    Config.MODEL_NAME,
    cache_dir=Config.MODEL_CACHE_DIR,
    trust_remote_code=True
)

# Configure padding
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print(f"Original padding side: {tokenizer.padding_side}")
tokenizer.padding_side = "left"
print(f"Updated padding side: {tokenizer.padding_side}")

print("✓ Tokenizer loaded")
print(f"  Pad token: {tokenizer.pad_token}")

# Load model
model = AutoModelForCausalLM.from_pretrained(
    Config.MODEL_NAME,
    torch_dtype=torch.float16,
    device_map="auto",
    cache_dir=Config.MODEL_CACHE_DIR,
    trust_remote_code=True,
    low_cpu_mem_usage=True,
    attn_implementation="flash_attention_2",
)

model.eval()

print("✓ Model loaded")
print(f"  Device: {model.device}")
print(f"  Dtype: {model.dtype}")



[bold blue]Loading Model: Vishva007/Qwen2.5-3B-Instruct-RBI-QA[/bold blue]


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

Original padding side: left
Updated padding side: left
✓ Tokenizer loaded
  Pad token: <|vision_pad|>


config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

✓ Model loaded
  Device: cuda:0
  Dtype: torch.float16


In [9]:
# ==============================================================================
# BATCH GENERATION FUNCTION
# ==============================================================================

def generate_answers_batch(
    model,
    tokenizer,
    prompts: List[str],
    max_new_tokens: int = 512,
    temperature: float = 0.7,
) -> List[str]:
    """Generate answers for batch of prompts"""
    
    system_prompt = """You are a highly knowledgeable AI assistant with expertise in Indian banking and financial regulations, particularly those outlined in Reserve Bank of India (RBI) circulars.

Provide accurate, specific answers including relevant dates, amounts, and institutional details. If you don't know, say so."""
    
    # Prepare messages
    batched_messages = [
        [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt}
        ]
        for prompt in prompts
    ]
    
    # Apply chat template
    batched_texts = [
        tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        for messages in batched_messages
    ]
    
    try:
        # Tokenize
        model_inputs = tokenizer(
            batched_texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=2048
        ).to(model.device)
        
        # Generate
        with torch.no_grad():
            generated_ids = model.generate(
                **model_inputs,
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
                use_cache=True,
                # repetition_penalty=1.2
            )
        
        # Decode only new tokens
        input_lengths = model_inputs.input_ids.shape[1]
        batch_generated_ids_only = [
            output_ids[input_lengths:] for output_ids in generated_ids
        ]
        
        responses = tokenizer.batch_decode(
            batch_generated_ids_only,
            skip_special_tokens=True
        )
        
        return responses
        
    except Exception as e:
        print(f"[red]Generation error: {e}[/red]")
        return ["ERROR: Generation failed"] * len(prompts)

print("✓ Generation function ready")


✓ Generation function ready


In [10]:
# ==============================================================================
# GENERATE ANSWERS FOR ALL SAMPLES
# ==============================================================================

from tqdm.auto import tqdm

print(f"\nGenerating Answers (Batch size: {Config.BATCH_SIZE})")

generated_answers = []
dataset_list = eval_dataset.to_list()

# Calculate total batches for progress bar
total_batches = (len(dataset_list) + Config.BATCH_SIZE - 1) // Config.BATCH_SIZE

# Create progress bar
with tqdm(total=total_batches, desc="Generating answers", unit="batch") as pbar:
    for i in range(0, len(dataset_list), Config.BATCH_SIZE):
        batch = dataset_list[i:i + Config.BATCH_SIZE]
        
        # Use rephrased_question if available, else question
        inputs = [
            item.get('rephrased_question') or item.get('question', '')
            for item in batch
        ]
        
        # Generate
        batch_responses = generate_answers_batch(
            model,
            tokenizer,
            inputs,
            max_new_tokens=Config.MAX_NEW_TOKENS,
            temperature=Config.TEMPERATURE,
        )
        
        # Store results
        for item, response in zip(batch, batch_responses):
            generated_answers.append({
                "question": item.get('rephrased_question') or item.get('question', ''),
                "generated_answer": response.strip(),
                "ground_truth": item.get('rephrased_answer') or item.get('answer', ''),
                "metadata": {
                    "regulation_area": item.get('regulation_area', 'Unknown'),
                    "category": item.get('category', 'Unknown'),
                    "difficulty": item.get('estimated_difficulty', 'Unknown'),
                    "applicable_to": item.get('applicable_to', 'Unknown'),
                }
            })
        
        # Update progress bar
        pbar.update(1)
        pbar.set_postfix({
            'samples': len(generated_answers),
            'batch_size': len(batch)
        })

print(f"[green]✓ Generated {len(generated_answers)} answers[/green]")

# Show sample
print("Sample Generated Answer:")
sample = generated_answers[0]

print(f"A: {sample['generated_answer'][:100]}...")



Generating Answers (Batch size: 50)


Generating answers:   0%|          | 0/20 [00:00<?, ?batch/s]

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignore

[green]✓ Generated 1000 answers[/green]
Sample Generated Answer:
A: Regulated entities must adhere to the procedure outlined in the Unlawful Activities (Prevention) Act...


In [11]:
# ==============================================================================
# INITIALIZE GEMINI FOR EVALUATION
# ==============================================================================

print("[bold blue]Initializing Gemini Evaluator[/bold blue]")

# Check API key
if Config.GOOGLE_API_KEY == "YOUR_GOOGLE_API_KEY_HERE":
    print("[red]ERROR: Set your Google API key in Config.GOOGLE_API_KEY[/red]")
    raise ValueError("Google API key not set")

llm = ChatGoogleGenerativeAI(
    model=Config.GOOGLE_MODEL,
    temperature=Config.GOOGLE_TEMPERATURE,
    max_tokens=Config.GOOGLE_MAX_TOKENS,
    api_key=Config.GOOGLE_API_KEY
)

print(f"✓ Gemini initialized: {Config.GOOGLE_MODEL}")


[bold blue]Initializing Gemini Evaluator[/bold blue]
✓ Gemini initialized: gemini-2.0-flash


In [12]:
# ==============================================================================
# EVALUATE WITH GEMINI - WITH TQDM PROGRESS
# ==============================================================================

from tqdm.auto import tqdm

async def evaluate_with_gemini_async(
    generated_answers: List[Dict],
    llm: ChatGoogleGenerativeAI,
    max_concurrency: int
) -> Tuple[List[Dict], Dict]:
    """Evaluate generated answers using Gemini with progress tracking"""
    
    SYSTEM_MESSAGE = """You are an expert evaluator for RBI banking regulations.

EVALUATION CRITERIA:
✓ Factual accuracy (dates, amounts, percentages)
✓ Correct bank/institution types mentioned
✓ Accurate regulatory timelines
✓ Completeness of key information

Score 1: Answer is factually accurate in ALL key details
Score 0: Any factual errors, wrong dates/amounts, or missing critical info

Provide short reasoning (1-2 sentences).
output should be a valud json object with 'score' and 'reasoning' fields."""
    
    PROMPT_TEMPLATE = """Question: {question}

Ground Truth: {ground_truth}

Model's Answer: {model_answer}

Evaluate the model's answer."""
    
    # Setup chain
    eval_prompt = ChatPromptTemplate.from_messages([
        ("system", SYSTEM_MESSAGE),
        ("human", PROMPT_TEMPLATE)
    ])
    
    eval_chain = eval_prompt | llm.with_structured_output(EvaluationResult)
    
    # Prepare inputs
    batch_inputs = [
        {
            "question": item['question'],
            "ground_truth": item['ground_truth'],
            "model_answer": item['generated_answer']
        }
        for item in generated_answers
    ]
    
    print(f"\nEvaluating {len(batch_inputs)} answers with Gemini...")
    
    # Create progress bar
    pbar = tqdm(total=len(batch_inputs), desc="Evaluating with Gemini", unit="answer")
    
    # Batch evaluate with async processing
    batch_results = []
    chunk_size = max_concurrency  # Process in chunks of max_concurrency
    
    for i in range(0, len(batch_inputs), chunk_size):
        chunk = batch_inputs[i:i + chunk_size]
        
        # Evaluate chunk
        chunk_results = await eval_chain.abatch(
            chunk,
            config={"max_concurrency": max_concurrency},
            return_exceptions=True
        )
        
        batch_results.extend(chunk_results)
        
        # Update progress bar
        pbar.update(len(chunk))
        pbar.set_postfix({
            'processed': min(i + chunk_size, len(batch_inputs)),
            'total': len(batch_inputs)
        })
    
    pbar.close()
    
    # Process results
    results = []
    total_score = 0
    failed = 0
    
    print("Processing evaluation results...")
    
    for item, result in tqdm(
        zip(generated_answers, batch_results),
        total=len(generated_answers),
        desc="Processing results",
        unit="result"
    ):
        # Case 1: An actual Python exception occurred (Network, API error, etc.)
        if isinstance(result, Exception):
            score = 0
            reasoning = f"Evaluation failed (Exception): {str(result)[:100]}"
            failed += 1
            
        # Case 2: The result is None (Safety filter triggered or Parsing failed)
        elif result is None:
            score = 0
            reasoning = "Evaluation failed: Model returned None (Likely Safety Filter or JSON Error)"
            failed += 1
            
        # Case 3: Success
        else:
            try:
                score = result.score
                reasoning = result.reasoning or "No reasoning provided"
                total_score += score
            except AttributeError:
                # Fallback if result is an object but missing attributes
                score = 0
                reasoning = f"Evaluation failed: Malformed result object - {type(result)}"
                failed += 1
        
        results.append({
            "question": item['question'],
            "ground_truth": item['ground_truth'],
            "generated_answer": item['generated_answer'],
            "evaluation_score": score,
            "reasoning": reasoning,
            "metadata": item.get('metadata', {})
        })
    
    # Calculate summary
    successful = len(batch_inputs) - failed
    pass_rate = (total_score / successful * 100) if successful > 0 else 0
    
    summary = {
        "model_name": Config.MODEL_NAME,
        "dataset_name": Config.DATASET_NAME,
        "dataset_source": dataset_source,
        "sampling_strategy": "stratified",
        "stratify_columns": Config.STRATIFY_COLUMNS,
        "total_evaluations": len(batch_inputs),
        "successful_evaluations": successful,
        "failed_evaluations": failed,
        "total_passed": total_score,
        "pass_rate_percentage": round(pass_rate, 2),
        "timestamp": Config.TIMESTAMP,
        "distribution": distribution,
    }
    
    return results, summary



In [13]:
print(generated_answers[0])

{'question': 'What are the mandatory compliance requirements for regulated entities under the Unlawful Activities (Prevention) Act, 1967?', 'generated_answer': 'Regulated entities must adhere to the procedure outlined in the Unlawful Activities (Prevention) Act, 1967, as amended, and ensure strict compliance with government orders related to the Act.', 'ground_truth': 'Regulated entities are mandated to rigorously follow the procedures stipulated in the Unlawful Activities (Prevention) Act, 1967, and ensure thorough adherence to all government orders issued in connection with this Act.', 'metadata': {'regulation_area': 'Anti-Money Laundering', 'category': 'fact-based', 'difficulty': 4, 'applicable_to': 'Regulated Entities'}}


In [14]:
# Run evaluation
print("\n[bold cyan]Starting Gemini Evaluation[/bold cyan]")
evaluation_results, evaluation_summary = await evaluate_with_gemini_async(
    generated_answers,  # Use full dataset, not [:20]
    llm,
    Config.MAX_CONCURRENCY
)

print(f"\n[green]✓ Evaluation complete![/green]")
print(f"[bold]Pass rate: {evaluation_summary['pass_rate_percentage']}%[/bold]")


[bold cyan]Starting Gemini Evaluation[/bold cyan]

Evaluating 1000 answers with Gemini...


Evaluating with Gemini:   0%|          | 0/1000 [00:00<?, ?answer/s]

Processing evaluation results...


Processing results:   0%|          | 0/1000 [00:00<?, ?result/s]


[green]✓ Evaluation complete![/green]
[bold]Pass rate: 57.6%[/bold]


In [15]:
# ==============================================================================
# DISPLAY EVALUATION RESULTS
# ==============================================================================


print("\n" + "="*70)
print("[bold magenta]EVALUATION SUMMARY[/bold magenta]")
print("="*70)

for key, value in evaluation_summary.items():
    if key != 'distribution':  # Skip detailed distribution
        print(f"    {key}: {value}")

# Show sample results
print("\n" + "="*70)
print("[bold cyan]SAMPLE RESULTS[/bold cyan]")
print("="*70)

for i in range(min(3, len(evaluation_results))):
    result = evaluation_results[i]
    print(f"\n[bold]Sample {i+1}:[/bold]")
    print(f"Q: {result['question'][:100]}...")
    print(f"A: {result['generated_answer'][:100]}...")
    print(f"Score: {result['evaluation_score']}")
    print(f"Reasoning: {result['reasoning']}")

# Breakdown by stratification columns
print("\n" + "="*70)
print("[bold cyan]BREAKDOWN BY CATEGORIES[/bold cyan]")
print("="*70)

for col in Config.STRATIFY_COLUMNS:
    print(f"\n[bold]{col}:[/bold]")
    
    category_scores = defaultdict(list)
    for result in evaluation_results:
        cat = result['metadata'].get(col, 'Unknown')
        category_scores[cat].append(result['evaluation_score'])
    
    for cat, scores in sorted(category_scores.items()):
        pass_rate = sum(scores) / len(scores) * 100
        print(f"    {cat}: {pass_rate:.1f}% ({sum(scores)}/{len(scores)})")



[bold magenta]EVALUATION SUMMARY[/bold magenta]
    model_name: Vishva007/Qwen2.5-3B-Instruct-RBI-QA
    dataset_name: Vishva007/RBI-Circular-QA-Dataset
    dataset_source: eval_split
    sampling_strategy: stratified
    stratify_columns: ['regulation_area', 'applicable_to', 'category', 'estimated_difficulty']
    total_evaluations: 1000
    successful_evaluations: 1000
    failed_evaluations: 0
    total_passed: 576
    pass_rate_percentage: 57.6
    timestamp: 20251125_162506

[bold cyan]SAMPLE RESULTS[/bold cyan]

[bold]Sample 1:[/bold]
Q: What are the mandatory compliance requirements for regulated entities under the Unlawful Activities ...
A: Regulated entities must adhere to the procedure outlined in the Unlawful Activities (Prevention) Act...
Score: 1
Reasoning: The model accurately states that regulated entities must adhere to the procedures outlined in the Unlawful Activities (Prevention) Act, 1967, and comply with government orders related to the Act.

[bold]Sample 2:[/bold

In [16]:
# ==============================================================================
# SAVE EVALUATION RESULTS
# ==============================================================================

# Create output directory
output_dir = Path(Config.OUTPUT_DIR)
output_dir.mkdir(parents=True, exist_ok=True)

# Create filename
model_short_name = Config.MODEL_NAME.split('/')[-1]
filename = f"eval_{model_short_name}_{Config.TIMESTAMP}.json"
filepath = output_dir / filename

# Prepare output
output_data = {
    "config": {
        "model": Config.MODEL_NAME,
        "dataset": Config.DATASET_NAME,
        "eval_size": Config.EVAL_SIZE,
        "stratify_columns": Config.STRATIFY_COLUMNS,
        "sampling_strategy": "stratified",
        "timestamp": Config.TIMESTAMP,
    },
    "summary": evaluation_summary,
    "detailed_results": evaluation_results
}

# Save JSON
with open(filepath, 'w', encoding='utf-8') as f:
    json.dump(output_data, f, indent=2, ensure_ascii=False)

print(f"\n[green]✓ Results saved to: {filepath}[/green]")

# Save CSV
csv_file = filepath.with_suffix('.csv')
df = pd.DataFrame(evaluation_results)
df.to_csv(csv_file, index=False)
    
print(f"✓ CSV saved to: {csv_file}")



[green]✓ Results saved to: Eval/eval_Qwen2.5-3B-Instruct-RBI-QA_20251125_162506.json[/green]
✓ CSV saved to: Eval/eval_Qwen2.5-3B-Instruct-RBI-QA_20251125_162506.csv


In [17]:
# ==============================================================================
# COMPARE BEFORE/AFTER RESULTS
# ==============================================================================

def compare_evaluation_results(before_file: str, after_file: str):
    """Compare two evaluation results"""
    
    with open(before_file) as f:
        before = json.load(f)
    with open(after_file) as f:
        after = json.load(f)
    
    print("\n" + "="*70)
    print("[bold magenta]BEFORE vs AFTER COMPARISON[/bold magenta]")
    
    
    before_rate = before['summary']['pass_rate_percentage']
    after_rate = after['summary']['pass_rate_percentage']
    improvement = after_rate - before_rate
    
    print(f"\nBefore (Base Model): {before_rate}%")
    print(f"After (Fine-tuned): {after_rate}%")
    print(f"Improvement: {improvement:+.2f}%")
    
    if improvement > 0:
        print(f"[green]✓ Model improved by {improvement:.2f}%![/green]")
    else:
        print(f"[red]✗ Model degraded by {abs(improvement):.2f}%[/red]")
    
    # Category-wise comparison
    print(" [bold]Category-wise Improvements:[/bold]")
    
    before_results = before['detailed_results']
    after_results = after['detailed_results']
    
    for col in before['config'].get('stratify_columns', []):
        print(f"\n[bold]{col}:[/bold]")
        
        # Calculate before scores by category
        before_cat = defaultdict(list)
        for r in before_results:
            cat = r['metadata'].get(col, 'Unknown')
            before_cat[cat].append(r['evaluation_score'])
        
        # Calculate after scores by category
        after_cat = defaultdict(list)
        for r in after_results:
            cat = r['metadata'].get(col, 'Unknown')
            after_cat[cat].append(r['evaluation_score'])
        
        # Compare
        all_cats = set(before_cat.keys()) | set(after_cat.keys())
        for cat in sorted(all_cats):
            before_score = sum(before_cat.get(cat, [])) / len(before_cat.get(cat, [1])) * 100
            after_score = sum(after_cat.get(cat, [])) / len(after_cat.get(cat, [1])) * 100
            improvement = after_score - before_score
            print(f"  {cat}: {before_score:.1f}% → {after_score:.1f}% ({improvement:+.1f}%)")


In [18]:
# Example usage (uncomment and update paths):
compare_evaluation_results(
    "Eval/eval_Qwen2.5-3B-Instruct_20251125_161858.json",
    "Eval/eval_Qwen2.5-3B-Instruct-RBI-QA_20251125_162506.json"
)

print("\n[bold green]✅ Evaluation pipeline complete![/bold green]")



[bold magenta]BEFORE vs AFTER COMPARISON[/bold magenta]

Before (Base Model): 7.0%
After (Fine-tuned): 57.6%
Improvement: +50.60%
[green]✓ Model improved by 50.60%![/green]
 [bold]Category-wise Improvements:[/bold]

[bold]regulation_area:[/bold]
  Account Aggregator: 0.0% → 100.0% (+100.0%)
  Account Aggregator Framework: 100.0% → 100.0% (+0.0%)
  Accounting Standards: 0.0% → 75.0% (+75.0%)
  Agriculture Credit: 0.0% → 100.0% (+100.0%)
  Anti-Money Laundering: 5.4% → 77.0% (+71.6%)
  Asset Classification and Income Recognition: 0.0% → 0.0% (+0.0%)
  Asset Reconstruction: 0.0% → 100.0% (+100.0%)
  Asset Reconstruction Companies: 14.3% → 50.0% (+35.7%)
  Auditing and Statutory Compliance: 0.0% → 16.7% (+16.7%)
  Bank Finance to Non-Banking Financial Companies (NBFCs): 7.7% → 38.5% (+30.8%)
  Banking Operations: 0.0% → 50.0% (+50.0%)
  Banking Regulation: 11.0% → 63.0% (+52.0%)
  Banking Regulations: 0.0% → 52.9% (+52.9%)
  Banknote Management: 0.0% → 100.0% (+100.0%)
  Basel III Capital