In [None]:
# !pip install -r requirements-eval.txt

In [None]:
# !MAX_JOBS=4 pip install flash-attn --no-build-isolation

In [1]:
import json
import torch
from typing import List, Optional
from pydantic import BaseModel, Field
from datasets import load_dataset,Dataset
from rich import print
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel, PeftConfig
from typing import Dict,Tuple,Any
import os

In [2]:
# Report the PyTorch/CUDA environment and whether the GPU favours BF16 or FP16.
print(f"PyTorch Version: {torch.__version__}")
print(f"CUDA Available: {torch.cuda.is_available()}")

if torch.cuda.is_available():
    # Query the capability once and reuse it; it is indexed as (major, minor) below.
    compute_cap = torch.cuda.get_device_capability(0)

    print(f"CUDA Version: {torch.version.cuda}")
    print(f"cuDNN Version: {torch.backends.cudnn.version()}")
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
    print(f"GPU Capability: {compute_cap}")

    # Add bfloat16 information
    bf16_supported = torch.cuda.is_bf16_supported()
    print(f"BFloat16 Supported: {bf16_supported}")

    # The float form is kept for display only. The threshold check compares the
    # (major, minor) pair directly, because folding the minor version into a
    # decimal fraction is lossy (a minor version of 10 would carry into the
    # major digit) and invites float rounding surprises.
    compute_version = compute_cap[0] + compute_cap[1] * 0.1
    native_bf16 = tuple(compute_cap) >= (8, 0)
    print(f"Compute Capability: {compute_version}")
    print(f"Native BF16 Support (>=8.0): {native_bf16}")

    # Show which precision will be used
    # NOTE(review): `will_use_bf16` is only printed here; no visible cell reads it
    # when choosing the dtype the model is loaded with.
    will_use_bf16 = bf16_supported and native_bf16
    print(f"Will Use: {'BF16' if will_use_bf16 else 'FP16'}")

else:
    print("CUDA not available - will use FP16 for CPU training")

In [3]:

# Environment switches for the Hugging Face hub client and the tokenizers library.
# NOTE(review): this runs after those libraries were imported in the first cell;
# confirm that they still pick the values up when set this late.
os.environ.update(
    {
        "HF_HUB_ENABLE_HF_TRANSFER": "1",
        "TOKENIZERS_PARALLELISM": "true",
    }
)


In [4]:
# --- Evaluation configuration -------------------------------------------------
# PARA is interpolated into every model / repository name below and is reported
# as the parameter count in billions by the next cell.
PARA = 3
MODEL_NAME = f"Qwen/Qwen2.5-{PARA}B-Instruct"
MODEL_PATH = "./Models"
DATASET_NAME = "Vishva007/RBI-Circular-QA-Dataset"
SEED = 42

# Credentials must never live in the notebook: the Gemini key is read from the
# GOOGLE_API_KEY environment variable and is None when that variable is unset.
# Any key that was previously committed in this cell should be treated as
# leaked and revoked.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
GOOGLE_MODEL_ID = "gemini-2.0-flash"
GOOGLE_TEMPERATURE = 0.01
GOOGLE_MAX_TOKENS = 128

# Number of trailing rows of the eval split that are actually evaluated.
EVAL_SET_SIZE = 20

MERGED_MODEL_OUTPUT_DIR = f"./Models/Qwen2.5-{PARA}B-Instruct-RBI-QA-Merged"
MODEL_FINETUNED_REPO_ID = f"Vishva007/Qwen2.5-{PARA}B-Instruct-RBI-QA-Finetuned"
MODEL_ADOPTOR_REPO_ID = f"Vishva007/Qwen2.5-{PARA}B-Instruct-RBI-QA-Adoptor"



In [5]:

# Echo the base checkpoint and the LoRA adapter repository taken from the configuration cell.
print(f"Using Model: {MODEL_NAME} with {PARA}B parameters")
print(f"Using LoRA Adapter: {MODEL_ADOPTOR_REPO_ID}")


In [6]:


import os

# Resolve MODEL_PATH (used as the Hugging Face cache directory further down)
# against the current working directory, then report both locations.
absolute_model_path = os.path.abspath(MODEL_PATH)
print(f"Current working directory: {os.getcwd()}")
print(f"Absolute model path: {absolute_model_path}")



In [7]:

# Announce the dataset repository that is about to be loaded.
print(f"Loading dataset: {DATASET_NAME}")
# NOTE(review): SEED is only reported here; no visible cell passes it to a
# random number generator, so sampling during generation is not seeded by it.
print(f"Using seed: {SEED}")


In [8]:


# Load the "eval" split of the dataset. On failure the hint messages are still
# printed, but the exception is re-raised: carrying on would only fail in the
# next cell with a confusing NameError on `dataset`.
try:
    dataset = load_dataset(DATASET_NAME, split="eval")
except Exception as e:
    print(f"Error loading dataset: {e}")
    print("Please ensure the dataset name is correct and you have an active internet connection.")
    raise
else:
    print("Dataset loaded successfully!")
    print(dataset)  # summary of the loaded split



In [9]:
# Evaluate only the last EVAL_SET_SIZE examples of the loaded split.
number_of_examples_for_eval = EVAL_SET_SIZE

# Total number of examples in the loaded split
num_examples = len(dataset)

# Start index of the trailing window. Clamped at 0 so that a split with fewer
# than EVAL_SET_SIZE rows yields the whole split instead of negative indices.
start_index = max(0, num_examples - number_of_examples_for_eval)

# Indices of the last EVAL_SET_SIZE examples (or of every example, if fewer exist)
indices_to_select = list(range(start_index, num_examples))

# Use the .select() method to create a new Dataset object containing only those indices
eval_dataset = dataset.select(indices_to_select)


In [10]:
  
# Structured verdict requested from the Gemini judge; it is handed to
# `llm.with_structured_output(...)` in the evaluation function further down.
# NOTE(review): documented with comments rather than a class docstring, because a
# docstring would presumably become part of the schema sent to the model — confirm
# before adding one.
class EvaluationResult(BaseModel):
    # Binary verdict: the ge/le bounds restrict the integer to 0 or 1.
    score: int = Field(
        ...,
        description="Score 1 if the answer fully satisfies ALL the specified criteria, 0 otherwise.",
        ge=0,
        le=1,
    )


In [11]:


# Load the base model
# Files are cached under MODEL_PATH (cache_dir) and device_map="auto" is requested.
# NOTE(review): bfloat16 is requested unconditionally, whatever the BF16/FP16
# verdict printed by the CUDA diagnostics cell — confirm this is intended on
# GPUs without native BF16 support.
# NOTE(review): trust_remote_code=True allows code shipped in the model
# repository to be executed locally — confirm this checkpoint actually needs it.
print("Loading base model...")
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
    cache_dir=MODEL_PATH,
)

# Load the tokenizer
# Taken from the same repository and cache directory as the base model.
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=MODEL_PATH)


config.json:   0%|          | 0.00/661 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [12]:

# Load the LoRA adapter and merge it with the base model
# On any failure the cell falls back to the plain base model instead of stopping.
# NOTE(review): after such a fallback the rest of the notebook evaluates the
# un-tuned base model, yet the report is later saved under a "finetuned" file
# name — check the printed messages before trusting the numbers.
print(f"Loading LoRA adapter from {MODEL_ADOPTOR_REPO_ID}...")
try:
    model = PeftModel.from_pretrained(base_model, MODEL_ADOPTOR_REPO_ID)
    print("LoRA adapter loaded successfully!")
    
    # Optional: Merge the adapter with the base model for faster inference
    print("Merging adapter with base model...")
    model = model.merge_and_unload()
    print("Model merged successfully!")
    
except Exception as e:
    print(f"Error loading LoRA adapter: {e}")
    print("Falling back to base model...")
    model = base_model


# Batched generation needs a pad token; reuse EOS when the tokenizer defines none.
if not tokenizer.pad_token:
    tokenizer.pad_token = tokenizer.eos_token
# Left padding keeps every prompt flush against its generated tokens, which the
# fixed-offset slicing in generate_responses_from_prompts relies on.
tokenizer.padding_side = "left"



In [13]:
# Sanity check: show the pad and EOS tokens the tokenizer ended up with after
# the padding setup in the previous cell.
print(tokenizer.pad_token)
print(tokenizer.eos_token)

In [18]:

import re
import sys


def generate_responses_from_prompts(
    model,
    tokenizer,
    prompts: list[str],
    max_new_tokens: int = 512,
) -> list[str]:
    """
    Generate one response per prompt with a causal language model, as a single batch.

    Every prompt is wrapped in a chat conversation (fixed system prompt + user
    prompt), rendered with the tokenizer's chat template, tokenized with padding
    and passed to ``model.generate`` with sampling enabled, so repeated calls can
    return different text.

    Args:
        model: The loaded causal language model. Must expose ``device`` and ``generate``.
        tokenizer: The tokenizer matching ``model``. The prompt is stripped from the
            output at one shared offset, which assumes left padding.
        prompts (list[str]): User prompts to answer.
        max_new_tokens (int): Upper bound on generated tokens per response.
            Defaults to 512, the value that used to be hard-coded.

    Returns:
        list[str]: One decoded response per prompt, in input order. An empty prompt
        list returns ``[]`` without touching the tokenizer or the model. If the model
        or tokenizer is ``None``, or generation raises, each entry is an error message.
    """
    if model is None or tokenizer is None:
        print("Model or tokenizer not loaded. Cannot generate responses.")
        return ["Error: Model not loaded." for _ in prompts]

    prompts = list(prompts)
    if not prompts:
        # Nothing to generate: previously the tokenizer was called on an empty
        # batch and the resulting exception was reported as a generation error.
        return []

    # The line break and the indentation inside this literal are part of the
    # prompt text and are deliberately left exactly as they were.
    sys_prompt = """You are an AI assistant that answers questions about RBI banking regulations based on
                provided document content. Always base your answers strictly on the given document text."""

    # One two-message conversation per prompt, rendered to plain text with the
    # generation prompt appended so the model continues as the assistant.
    batched_texts = [
        tokenizer.apply_chat_template(
            [
                {"role": "system", "content": sys_prompt},
                {"role": "user", "content": prompt},
            ],
            tokenize=False,
            add_generation_prompt=True,
        )
        for prompt in prompts
    ]

    try:
        # padding=True pads every sequence to the longest one in the batch.
        model_inputs = tokenizer(batched_texts, return_tensors="pt", padding=True).to(model.device)

        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            top_p=0.95,
            top_k=40,
            do_sample=True,
            repetition_penalty=1.2,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

        # All rows share the same padded prompt length, so the newly generated
        # tokens of every row start at the same offset.
        prompt_length = model_inputs.input_ids.shape[1]
        new_token_ids = [output_ids[prompt_length:] for output_ids in generated_ids]

        return tokenizer.batch_decode(new_token_ids, skip_special_tokens=True)

    except Exception as e:
        # Best effort: report the problem and keep the output aligned with the input.
        print(f"Error during generation: {e}")
        return ["Error generating response." for _ in prompts]


In [19]:

# Slicing a `datasets.Dataset` returns a dict of columns, and iterating a dict
# walks its keys, so the generator would be fed column names instead of
# questions. Select the question column to obtain a list of five prompt strings.
prompt_to_process = eval_dataset[:5]["rephrased_question"]
# print(f"Processing prompt: {prompt_to_process}")


In [20]:

# Smoke-test generation on the small preview batch before running the full evaluation set.
batch_responses = generate_responses_from_prompts(model, tokenizer, prompt_to_process)


In [21]:
# Show each preview response next to the dataset row that has the same position.
for position, answer_text in enumerate(batch_responses, start=1):
    row = eval_dataset[position - 1]  # single-row lookup by integer index
    print(f"Document {position}: {row['document']}")
    print(f"Rephrased Question: {row['rephrased_question']}")
    print(f"Generated Answer:\n{answer_text}")
    print(f"Expected Rephrased Answer:\n{row['rephrased_answer']}")
    print(f"{'-'*30}\n")


In [22]:
from tqdm.auto import tqdm

def generate_answers_with_transformers(
    model,
    eval_dataset: Dataset,
    batch_size: int = 10,
    tokenizer: Optional[Any] = None,
) -> List[Dict[str, str]]:
    """
    Generate an answer for every 'rephrased_question' of a dataset, batch by batch.

    Args:
        model: The loaded Hugging Face causal language model.
        eval_dataset (datasets.Dataset): Dataset whose rows carry a
            'rephrased_question' field.
        batch_size (int): Number of questions sent to the model per batch; must be >= 1.
        tokenizer: Tokenizer matching ``model``. When omitted, the notebook-level
            ``tokenizer`` global is used, which is what this function always
            relied on implicitly.

    Returns:
        List[Dict[str, str]]: One dictionary per dataset row, in dataset order, with
        the keys 'question' and 'generated_answer'.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # Imported locally under an alias: a later cell runs `import tqdm.asyncio`,
    # which rebinds the global name `tqdm` to the package. Re-running this
    # function after that cell would otherwise fail because a module is not callable.
    from tqdm.auto import tqdm as progress_bar

    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    if tokenizer is None:
        # Backward-compatible fallback to the tokenizer defined at notebook level.
        tokenizer = globals().get("tokenizer")

    generated_data = []
    total_rows = len(eval_dataset)

    print(f"Generating answers in batches of {batch_size}...")
    for i in progress_bar(range(0, total_rows, batch_size), desc="Generating answers"):
        batch_end = min(i + batch_size, total_rows)
        # Materialise the batch rows once and reuse them below.
        batch_rows = eval_dataset.select(list(range(i, batch_end))).to_list()

        inputs = [row['rephrased_question'] for row in batch_rows]

        try:
            batch_responses = generate_responses_from_prompts(model, tokenizer, inputs)
            batch_records = [
                {
                    "question": row['rephrased_question'],
                    "generated_answer": response,
                }
                for row, response in zip(batch_rows, batch_responses)
            ]
        except Exception as e:
            # Keep the output aligned with the dataset even when a batch fails.
            print(f"Error during transformers generation for batch starting at index {i}: {e}")
            batch_records = [
                {
                    "question": row['rephrased_question'],
                    "generated_answer": "\\ (Generation Failed)",
                }
                for row in batch_rows
            ]

        generated_data.extend(batch_records)

    print("transformers answer generation complete.")
    return generated_data

In [23]:

# Generate answers for the whole evaluation subset.
# batch_size=50 exceeds EVAL_SET_SIZE (20), so all questions go through the
# model as a single batch.
generated_answers = generate_answers_with_transformers(
    model,
    eval_dataset,
    batch_size=50,
)

Generating answers:   0%|          | 0/1 [00:00<?, ?it/s]

In [26]:
# Print every generated answer beside the reference answer of the dataset row
# that has the same position.
for row_index, answer_record in enumerate(generated_answers):
    reference_row = eval_dataset[row_index]
    model_answer = answer_record['generated_answer']
    print(f"Document {row_index+1}: {reference_row['document']}")
    print(f"Rephrased Question: {reference_row['rephrased_question']}")
    print(f"Generated Answer:\n{model_answer}")
    print(f"Expected Rephrased Answer:\n{reference_row['rephrased_answer']}")
    print(f"{'-'*30}\n")


In [27]:

# Initialize Gemini LLM
# This client is the judge passed to the evaluation function below; model id,
# temperature and token limit come from the configuration cell.
# NOTE(review): timeout=None presumably disables the client-side request
# timeout — confirm that an unbounded wait is acceptable here.
llm = ChatGoogleGenerativeAI(
        model=GOOGLE_MODEL_ID, 
        temperature=GOOGLE_TEMPERATURE,
        max_tokens=GOOGLE_MAX_TOKENS,
        timeout=None,
        api_key=GOOGLE_API_KEY,
    )


In [42]:
import tqdm.asyncio


async def evaluate_answers_with_gemini_batch_optimized(
    eval_dataset: Dataset,
    generated_answers_data: List[Dict[str, str]],
    llm: ChatGoogleGenerativeAI,
    max_concurrency: int
) -> Tuple[List[Dict[str, Any]], Dict[str, Any]]:
    """
    Grade generated answers against the reference answers with a Gemini judge.

    Dataset rows and generated answers are paired by position. Each pair is turned
    into a prompt and all prompts are sent through the chain's async batch call;
    the judge replies with an ``EvaluationResult`` whose ``score`` is 0 or 1.

    Args:
        eval_dataset: Dataset whose rows provide 'rephrased_question' and
            'rephrased_answer' (the reference answer).
        generated_answers_data: One dict per row with a 'generated_answer' key,
            in the same order as ``eval_dataset``.
        llm: The chat model used as judge.
        max_concurrency: Maximum number of judge requests in flight at once.

    Returns:
        A ``(results, summary)`` tuple. ``results`` has one dict per evaluated pair
        with the keys 'question', 'ground_truth_answer', 'generated_answer',
        'evaluation_score' and 'evaluation_error'. The score is 0 when the judge
        call failed; 'evaluation_error' is ``None`` on success and a short
        description otherwise, so a failed call can be told apart from a genuine 0.
        ``summary`` holds the aggregate counts, the pass rate over the successfully
        judged pairs and a one-paragraph description.
    """
    # The judge prompts are runtime text and are kept exactly as they were.
    SYSTEM_MESSAGE = """
        You are an expert evaluator for RBI banking regulations.
        Pay special attention to:
        - EXACT numerical values (amounts, percentages, dates)
        - Specific inclusions/exclusions of bank types
        - Precise regulatory timelines
        
        Score 1 ONLY if the answer is factually accurate in all key details.
        Score 0 if there are any factual errors, wrong dates, incorrect amounts, or misstatements about which institutions are covered.
        """

    PROMPT_TEMPLATE = """
    Question: {question}
    Correct answer: {evaluation_criteria}
    Model Answer: {model_answer}
    Provide the evaluation score in the specified JSON format.
    """.strip()

    gemini_eval_prompt = ChatPromptTemplate.from_messages([
        ("system", SYSTEM_MESSAGE),
        ("human", PROMPT_TEMPLATE),
    ])

    gemini_eval_chain = gemini_eval_prompt | llm.with_structured_output(EvaluationResult)

    eval_dataset_list = eval_dataset.to_list()

    # zip() stops at the shorter input; make a silent truncation visible.
    if len(eval_dataset_list) != len(generated_answers_data):
        print(
            f"Warning: {len(eval_dataset_list)} dataset rows but "
            f"{len(generated_answers_data)} generated answers; only the first "
            f"{min(len(eval_dataset_list), len(generated_answers_data))} pairs will be evaluated."
        )

    paired_items = list(zip(eval_dataset_list, generated_answers_data))

    # Prepare batch inputs
    batch_inputs = [
        {
            "question": original_item['rephrased_question'],
            "evaluation_criteria": original_item['rephrased_answer'],
            "model_answer": generated_item['generated_answer'],
        }
        for original_item, generated_item in paired_items
    ]

    print(f"Processing {len(batch_inputs)} evaluations in batch...")

    # return_exceptions=True: a failed request comes back as an exception object
    # in its slot instead of aborting the whole batch.
    batch_results = await gemini_eval_chain.abatch(
        batch_inputs,
        config={"max_concurrency": max_concurrency},  # Control concurrency
        return_exceptions=True
    )

    # Process results
    results = []
    total_score = 0
    failed_evaluations_due_to_error = 0

    for (original_item, generated_item), result in zip(paired_items, batch_results):
        score_val = 0
        error_text = None
        if isinstance(result, Exception):
            failed_evaluations_due_to_error += 1
            error_text = f"{type(result).__name__}: {result}"
        elif result is None:
            failed_evaluations_due_to_error += 1
            error_text = "No structured result returned"
        else:
            score_val = result.score
            total_score += score_val

        results.append({
            "question": original_item['rephrased_question'],
            "ground_truth_answer": original_item['rephrased_answer'],
            "generated_answer": generated_item['generated_answer'],
            "evaluation_score": score_val,
            "evaluation_error": error_text,
        })

    # Calculate summary; the pass rate is taken over successfully judged pairs only.
    successful_eval_count = len(batch_inputs) - failed_evaluations_due_to_error
    percentage_passed_criteria = (total_score / successful_eval_count) * 100 if successful_eval_count > 0 else 0

    summary = {
        "total_evaluations_attempted": len(batch_inputs),
        "successful_evaluations": successful_eval_count,
        "failed_evaluations_due_to_error": failed_evaluations_due_to_error,
        "total_passed_criteria": total_score,
        "percentage_passed_criteria": percentage_passed_criteria,
        "overall_evaluation_summary": (
            f"Out of {len(batch_inputs)} attempted evaluations, "
            f"{successful_eval_count} were successfully processed by Gemini. "
            f"{failed_evaluations_due_to_error} evaluations encountered an error. "
            f"Among the successfully evaluated answers, {total_score} met all criteria, "
            f"resulting in a {percentage_passed_criteria:.2f}% pass rate."
        )
    }

    print("Batch Gemini evaluation complete.")
    return results, summary

In [43]:

# Run the Gemini judge over every (dataset row, generated answer) pair, with at
# most 10 requests in flight.
# Top-level `await`: this relies on the notebook kernel running cells inside an event loop.
evaluation_results, evaluation_summary = await evaluate_answers_with_gemini_batch_optimized(
    eval_dataset=eval_dataset,
    generated_answers_data=generated_answers,
    llm=llm,
    max_concurrency=10
)


In [44]:
# Dump the per-question records (question, reference answer, generated answer, score).
print(evaluation_results)

In [45]:
# Print the aggregate figures returned by the evaluation function.
# The pass percentage is computed over successfully judged answers only, so read
# it together with the failure count.
print("\n--- Evaluation Summary ---")
summary_stats = evaluation_summary
print(f"Total Evaluations Attempted: {summary_stats['total_evaluations_attempted']}")
print(f"Evaluations Failed Due to Error: {summary_stats['failed_evaluations_due_to_error']}")
print(f"Total Answers Passed Criteria (Score 1): {summary_stats['total_passed_criteria']}")
print(f"Percentage of Successfully Evaluated Answers Passing Criteria: {summary_stats['percentage_passed_criteria']:.2f}%")
print(f"Overall Summary: {summary_stats['overall_evaluation_summary']}")

In [None]:

import os
import json
def save_evaluation_to_json(evaluation_results: list[dict], evaluation_summary: dict, output_dir: str, filename: str = "gemini_evaluation_report.json"):
    """
    Save evaluation results and summary to a JSON file in the specified directory.

    The report is serialized completely before the target file is opened, so data
    that cannot be encoded as JSON never truncates or overwrites an existing report.
    Problems are reported with a printed message instead of an exception.

    Args:
        evaluation_results (list[dict]): One dictionary per evaluated
            question-answer pair.
        evaluation_summary (dict): The overall summary of the evaluation.
        output_dir (str): Directory for the JSON file; created if it does not exist.
        filename (str, optional): Name of the JSON file.
            Defaults to "gemini_evaluation_report.json".

    Returns:
        None. The file content is
        {"evaluation_results": [...], "evaluation_summary": {...}}, indented by 4 spaces.
    """
    # Combine evaluation results and summary into a single dictionary
    output_data = {
        "evaluation_results": evaluation_results,
        "evaluation_summary": evaluation_summary
    }

    # Serialize first: json.dump would stream into an already truncated file and
    # leave a broken report behind if a value turned out not to be serializable.
    try:
        serialized_report = json.dumps(output_data, indent=4)
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return

    # Ensure the output directory exists
    try:
        os.makedirs(output_dir, exist_ok=True)
    except OSError as e:
        print(f"Error creating directory {output_dir}: {e}")
        return

    # Construct the full path for the JSON file
    file_path = os.path.join(output_dir, filename)

    # Write the already serialized report
    try:
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(serialized_report)
        print(f"Successfully wrote evaluation results and summary to {file_path}")
    except IOError as e:
        print(f"Error writing to file {file_path}: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")


In [None]:


# Persist the per-question results together with the summary.
# NOTE(review): the file name hard-codes "3B" and "finetuned", while the model
# size comes from PARA and the adapter cell can fall back to the base model —
# confirm the name matches what was actually evaluated.
save_evaluation_to_json(
    evaluation_results=evaluation_results,
    evaluation_summary=evaluation_summary,
    output_dir="./Evaluation_Results",
    filename="qwen2.5-3B-finetuned-eval.json"
)