In [None]:
!pip install transformers
!pip install torch
!pip install peft  # For adapter-based fine-tuning

In [None]:
!pip install faiss-cpu  # Use faiss-gpu if you have GPU
!pip install sentence-transformers
!pip install pandas numpy

In [None]:
!pip install nltk
!pip install bert-score
!pip install rouge-score
!pip install evaluate

In [None]:
!pip install accelerate

In [None]:
import pandas as pd
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
import torch
import json

In [None]:
import pandas as pd
import nltk
from nltk.translate.bleu_score import sentence_bleu
from bert_score import score
from rouge_score import rouge_scorer
import torch
import evaluate
nltk.download('punkt')

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel, PeftConfig
import os

In [None]:
access_token = "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"

In [None]:
pip install langchain_google_genai

In [None]:
def load_fine_tuned_model(adapter_path):
    try:
        
        # Load base model and tokenizer
        base_model_name = "google/gemma-2b"

        # Load tokenizer
        print("Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(base_model_name,use_auth_token=access_token)

        # Load base model
        print("Loading base model...")
        base_model = AutoModelForCausalLM.from_pretrained(
            base_model_name,
            torch_dtype=torch.float16,  # Use float16 for efficiency
            device_map="auto",
            use_auth_token=access_token# Automatically handle device placement
        )

        # Load the fine-tuned adapter
        print("Loading fine-tuned adapter...")
        model = PeftModel.from_pretrained(base_model, adapter_path)

        print("Model loaded successfully!")
        return model, tokenizer

    except Exception as e:
        print(f"Error loading model: {str(e)}")
        return None, None

In [None]:
def generate_question(model, tokenizer, passage, topic=None, subtopic=None, difficulty=None):
    try:
        # Construct prompt based on available information
        prompt_parts = ["Generate a question based on the following information:"]

        if topic:
            prompt_parts.append(f"Topic: {topic}")
        if subtopic:
            prompt_parts.append(f"Subtopic: {subtopic}")
        if difficulty:
            prompt_parts.append(f"Difficulty: {difficulty}")

        prompt_parts.append(f"Passage: {passage}")
        prompt_parts.append("Question:")

        prompt = "\n".join(prompt_parts)

        # Tokenize input
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        # Generate output
        outputs = model.generate(
            inputs["input_ids"],
            max_length=512,
            temperature=0.7,
            num_return_sequences=1,
            do_sample=True,
            top_p=0.9,
            top_k=50
        )

        # Decode and return the generated question
        question = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Extract just the generated question (remove the prompt)
        question = question[len(prompt):].strip()

        return question

    except Exception as e:
        print(f"Error generating question: {str(e)}")
        return None

In [None]:
import pandas as pd
import numpy as np
from nltk.translate.bleu_score import sentence_bleu
from bert_score import score
from rouge_score import rouge_scorer
import nltk
import warnings
warnings.filterwarnings('ignore')

def calculate_metrics(generated_question, reference_questions):
    """
    Calculate all metrics for a generated question against multiple reference questions
    """
    metrics = {}
    
    # Ensure reference_questions is a list of strings
    if isinstance(reference_questions, str):
        reference_questions = [reference_questions]
    elif isinstance(reference_questions, list):
        # Convert any non-string elements to strings
        reference_questions = [str(ref) for ref in reference_questions if ref is not None]
    else:
        print(f"Unexpected reference_questions type: {type(reference_questions)}")
        return get_zero_metrics()

    if not generated_question or not reference_questions:
        return get_zero_metrics()

    try:
        # 1. BLEU Scores
        bleu_scores = get_bleu_scores(generated_question, reference_questions)
        metrics.update(bleu_scores)

        # 2. BERTScore
        bert_scores = get_bert_scores(generated_question, reference_questions)
        metrics.update(bert_scores)

        # 3. ROUGE Scores
        rouge_scores = get_rouge_scores(generated_question, reference_questions)
        metrics.update(rouge_scores)

    except Exception as e:
        print(f"Error in calculate_metrics: {str(e)}")
        return get_zero_metrics()

    return metrics

def get_zero_metrics():
    """Return a dictionary with all metrics set to 0.0"""
    return {
        'BLEU-1': 0.0, 'BLEU-2': 0.0, 'BLEU-3': 0.0, 'BLEU-4': 0.0,
        'BERTScore-P': 0.0, 'BERTScore-R': 0.0, 'BERTScore-F1': 0.0,
        'ROUGE-1': 0.0, 'ROUGE-2': 0.0, 'ROUGE-L': 0.0
    }

def get_bleu_scores(generated, references):
    """Calculate BLEU scores with proper tokenization"""
    try:
        # Tokenize generated question
        generated_tokens = nltk.word_tokenize(str(generated).lower())
        
        # Tokenize all reference questions
        reference_tokens = [nltk.word_tokenize(str(ref).lower()) for ref in references]
        
        # Calculate BLEU scores with different weights
        weights = [
            (1.0, 0.0, 0.0, 0.0),  # BLEU-1
            (0.5, 0.5, 0.0, 0.0),  # BLEU-2
            (0.33, 0.33, 0.33, 0.0),  # BLEU-3
            (0.25, 0.25, 0.25, 0.25)  # BLEU-4
        ]
        
        bleu_scores = {}
        for i, weight in enumerate(weights, 1):
            score = sentence_bleu(reference_tokens, generated_tokens, weights=weight)
            bleu_scores[f'BLEU-{i}'] = score
            
        return bleu_scores
        
    except Exception as e:
        print(f"Error in BLEU calculation: {str(e)}")
        return {f'BLEU-{i}': 0.0 for i in range(1, 5)}

def get_bert_scores(generated, references):
    """Calculate BERTScore with proper input handling"""
    try:
        # Ensure inputs are strings
        generated = str(generated)
        references = [str(ref) for ref in references]
        
        # Calculate BERTScore
        P, R, F1 = score([generated], references, lang='en', verbose=False)
        
        return {
            'BERTScore-P': P.mean().item(),
            'BERTScore-R': R.mean().item(),
            'BERTScore-F1': F1.mean().item()
        }
        
    except Exception as e:
        print(f"Error in BERTScore calculation: {str(e)}")
        return {'BERTScore-P': 0.0, 'BERTScore-R': 0.0, 'BERTScore-F1': 0.0}

def get_rouge_scores(generated, references):
    """Calculate ROUGE scores with proper input handling"""
    try:
        scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
        
        # Ensure inputs are strings
        generated = str(generated)
        
        # Calculate ROUGE scores against all references and take the maximum
        max_scores = {
            'ROUGE-1': 0.0,
            'ROUGE-2': 0.0,
            'ROUGE-L': 0.0
        }
        
        for ref in references:
            ref = str(ref)
            scores = scorer.score(generated, ref)
            max_scores['ROUGE-1'] = max(max_scores['ROUGE-1'], scores['rouge1'].fmeasure)
            max_scores['ROUGE-2'] = max(max_scores['ROUGE-2'], scores['rouge2'].fmeasure)
            max_scores['ROUGE-L'] = max(max_scores['ROUGE-L'], scores['rougeL'].fmeasure)
            
        return max_scores
        
    except Exception as e:
        print(f"Error in ROUGE calculation: {str(e)}")
        return {'ROUGE-1': 0.0, 'ROUGE-2': 0.0, 'ROUGE-L': 0.0}

In [None]:
class PassageRetriever:
    def __init__(self, csv_path):
        # Load the dataset
        self.df = pd.read_csv(csv_path)
        
        # Clean and preprocess the data
        self.clean_data()
        
        # Initialize the embedding model
        self.embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
        
        # Create FAISS index
        self.setup_faiss()

    def clean_data(self):
        """Clean and preprocess the dataset"""
        # Convert all passages to string and clean them
        self.df['Passage'] = self.df['Passage'].astype(str)
        self.df['Passage'] = self.df['Passage'].apply(self.clean_text)
        
        # Remove rows with empty passages
        self.df = self.df[self.df['Passage'].str.strip() != '']
        
        # Reset index
        self.df = self.df.reset_index(drop=True)
        
        print(f"Total valid passages after cleaning: {len(self.df)}")

    def clean_text(self, text):
        """Clean individual text"""
        if pd.isna(text) or text == 'nan':
            return ''
        
        # Convert to string if not already
        text = str(text)
        
        # Remove excessive whitespace
        text = ' '.join(text.split())
        
        return text

    def setup_faiss(self):
        try:
            # Create embeddings for all passages
            print("Creating embeddings for passages...")
            passages = self.df['Passage'].tolist()
            
            print("\nFirst few passages:")
            for i, passage in enumerate(passages[:3]):
                print(f"Passage {i+1}: {passage[:100]}...")
            
            embeddings_list = []
            for i, passage in enumerate(passages):
                try:
                    embedding = self.embedder.encode(passage)
                    embeddings_list.append(embedding)
                except Exception as e:
                    print(f"Error encoding passage {i}: {str(e)}")
                    print(f"Problematic passage: {passage[:100]}...")
                    embedding = np.zeros(self.embedder.get_sentence_embedding_dimension())
                    embeddings_list.append(embedding)
            
            self.passage_embeddings = np.vstack(embeddings_list)
            
            embedding_dim = self.passage_embeddings.shape[1]
            self.index = faiss.IndexFlatL2(embedding_dim)
            
            self.index.add(self.passage_embeddings.astype('float32'))
            
            print(f"\nSuccessfully created FAISS index with {len(passages)} passages")
            
        except Exception as e:
            print(f"Error in setup_faiss: {str(e)}")
            raise

    def get_relevant_passage(self, topic, subtopic, difficulty, k=5):  # Increased k to get more candidates
        try:
            # Create query from topic and subtopic
            query = f"{topic} {subtopic}"
            
            # Get query embedding
            query_embedding = self.embedder.encode([query])
            
            # Search in FAISS with increased k
            distances, indices = self.index.search(query_embedding.astype('float32'), k)
            
            # Filter by difficulty and collect all matching passages
            relevant_passages = []
            seen_passages = set()  # To avoid duplicates
            
            for idx in indices[0]:
                passage_row = self.df.iloc[idx]
                
                # Check if this passage matches our criteria
                if (passage_row['Difficulty'].strip().lower() == difficulty.strip().lower() and
                    passage_row['Topic'].strip().lower() == topic.strip().lower() and
                    passage_row['Sub-Topic'].strip().lower() == subtopic.strip().lower()):
                    
                    # Create a unique key for this passage
                    passage_key = (passage_row['Passage'], passage_row['Question'])
                    
                    # Only add if we haven't seen this exact passage before
                    if passage_key not in seen_passages:
                        seen_passages.add(passage_key)
                        relevant_passages.append({
                            'passage': passage_row['Passage'],
                            'topic': passage_row['Topic'],
                            'subtopic': passage_row['Sub-Topic'],
                            'difficulty': passage_row['Difficulty'],
                            'reference_questions': passage_row['Question']
                        })
            
            if not relevant_passages:
                print(f"No passages found for {topic} - {subtopic} with difficulty {difficulty}")
                return None
            
            # Combine all reference questions for the same passage
            if len(relevant_passages) > 1:
                # Combine all reference questions while keeping the first passage's metadata
                combined_passage = relevant_passages[0]
                all_questions = [p['reference_questions'] for p in relevant_passages]
                combined_passage['reference_questions'] = all_questions
                return combined_passage
            
            return relevant_passages[0]
            
        except Exception as e:
            print(f"Error in get_relevant_passage: {str(e)}")
            return None

In [None]:
nltk.download('punkt')

In [None]:
def evaluate_question_generation(retriever, model, tokenizer, topics_list):
    """
    Evaluate question generation for given topics using RAG
    """
    results = []

    for topic_info in topics_list:
        try:
            # Get relevant passage using RAG
            retrieved_data = retriever.get_relevant_passage(
                topic_info['Topic'],
                topic_info['Sub-Topic'],
                topic_info['Difficulty']
            )
            
            if retrieved_data is None:
                print(f"No matching passage found for {topic_info}")
                continue

            # Generate question
            generated_question = generate_question(
                model,
                tokenizer,
                passage=retrieved_data['passage'],
                topic=retrieved_data['topic'],
                subtopic=retrieved_data['subtopic'],
                difficulty=retrieved_data['difficulty']
)           
            print("retrieved data is :",retrieved_data)
            print("GeneratedQuestion:",generated_question)
            if generated_question is None:
                print(f"Failed to generate question for {topic_info}")
                continue

            # Calculate metrics
            metrics = calculate_metrics(generated_question, [retrieved_data['reference_questions']])

            # Store results using the correct dictionary keys
            result = {
                'Topic': retrieved_data['topic'],  # Note the lowercase 'topic'
                'SubTopic': retrieved_data['subtopic'],  # Note the lowercase 'subtopic'
                'Difficulty': retrieved_data['difficulty'],
                'Retrieved_Passage': retrieved_data['passage'],
                'Generated_Question': generated_question,
                'Reference_Question': retrieved_data['reference_questions'],
                **metrics
            }
            results.append(result)

        except Exception as e:
            print(f"Error processing topic {topic_info}: {str(e)}")
            continue

    # Return empty DataFrame if no results
    if not results:
        return pd.DataFrame(columns=[
            'Topic', 'SubTopic', 'Difficulty', 'Retrieved_Passage',
            'Generated_Question', 'Reference_Question',
            'BLEU-1', 'BLEU-2', 'BLEU-3', 'BLEU-4',
            'BERTScore-P', 'BERTScore-R', 'BERTScore-F1',
            'ROUGE-1', 'ROUGE-2', 'ROUGE-L'
        ])

    return pd.DataFrame(results)


In [16]:
def main():
    try:
        # Paths
        csv_path = "/kaggle/input/nikahatmaam/pass_QA_topic_stopic_diff.csv"
        adapter_path = "/kaggle/input/config-files"

        # Initialize retriever
        print("Initializing passage retriever...")
        retriever = PassageRetriever(csv_path)
        print("Passage is:",retriever)
        # Load the question generation model
        print("Loading question generation model...")
        model, tokenizer = load_fine_tuned_model(adapter_path)

        if model is None or tokenizer is None:
            raise ValueError("Failed to load the question generation model")

        # Example topics to evaluate
        topics_to_evaluate = [
            {'Topic': 'Java Variables', 'Sub-Topic': 'Variable Declaration and Initialization', 'Difficulty': 'Easy'},
        ]

        # Run evaluation
        print("Generating and evaluating questions...")
        results_df = evaluate_question_generation(retriever, model, tokenizer, topics_to_evaluate)

        # Save results
        results_df.to_csv('evaluation_results_with_rag.csv', index=False)

        # Print summary statistics
        if not results_df.empty:
            print("\nAverage Scores:")
            metrics_cols = [
                'BLEU-1', 'BLEU-2', 'BLEU-3', 'BLEU-4',
                'BERTScore-P', 'BERTScore-R', 'BERTScore-F1',
                'ROUGE-1', 'ROUGE-2', 'ROUGE-L'
            ]

            for col in metrics_cols:
                print(f"{col}: {results_df[col].mean():.4f}")

            print("\nScores by Difficulty:")
            print(results_df.groupby('Difficulty')[metrics_cols].mean())
        else:
            print("No results generated.")

    except Exception as e:
        print(f"Error in main: {str(e)}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    main()

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Successfully created FAISS index with 1248 passages
Passage is: <__main__.PassageRetriever object at 0x7ff26a289840>
Loading question generation model...
Loading tokenizer...


tokenizer_config.json:   0%|          | 0.00/33.6k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

Loading base model...


config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

Loading fine-tuned adapter...
Model loaded successfully!
Generating and evaluating questions...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


retrieved data is : {'passage': 'In Java, variables are used to store data values that can be manipulated and accessed throughout the program. To use a variable, it must first be declared and initialized. Declaration involves specifying the type of the variable, such as int, double, or String, followed by the variable name. Initialization is the process of assigning an initial value to the variable. This can be done at the time of declaration or later in the program. For example, int age; declares a variable named age of type int, while int age = 25; declares and initializes the variable with a value of 25. It is important to initialize variables before using them to avoid unexpected behavior in the program.', 'topic': 'Java Variables', 'subtopic': 'Variable Declaration and Initialization', 'difficulty': 'Easy', 'reference_questions': ['What is the purpose of declaring a variable in Java?', 'What is initialization in Java variable declaration?', 'Why is it important to initialize varia

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Average Scores:
BLEU-1: 0.0495
BLEU-2: 0.0460
BLEU-3: 0.0419
BLEU-4: 0.0362
BERTScore-P: 0.9377
BERTScore-R: 0.8847
BERTScore-F1: 0.9104
ROUGE-1: 0.4737
ROUGE-2: 0.4444
ROUGE-L: 0.4737

Scores by Difficulty:
              BLEU-1    BLEU-2    BLEU-3    BLEU-4  BERTScore-P  BERTScore-R  \
Difficulty                                                                     
Easy        0.049521  0.046036  0.041886  0.036207     0.937716     0.884664   

            BERTScore-F1   ROUGE-1   ROUGE-2   ROUGE-L  
Difficulty                                              
Easy            0.910417  0.473684  0.444444  0.473684  


In [17]:
import pandas as pd
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
import torch
import json
import nltk
from nltk.translate.bleu_score import sentence_bleu
from bert_score import score
from rouge_score import rouge_scorer
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel, PeftConfig
import os
from datetime import datetime
import logging
from dataclasses import dataclass
from typing import List, Dict, Tuple, Optional
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

# Gemini integration
from langchain_google_genai import ChatGoogleGenerativeAI

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Download NLTK data
nltk.download('punkt')

@dataclass
class InterviewMetrics:
    overall_score: float
    topic_scores: Dict[str, float]
    difficulty_scores: Dict[str, float]
    topic_coverage: float
    difficulty_progression: float
    topic_progression: float
    response_time_metrics: Dict[str, float]
    confidence_scores: List[float]

@dataclass
class InterviewAlgorithm:
    topics: List[str]
    difficulties: List[str]
    
    def __init__(self, topics: List[str], difficulties: List[str]):
        self.topics = topics
        self.difficulties = difficulties
        self.topic_weights = {topic: 1.0 for topic in topics}
        self.performance_threshold = {
            'promotion': 0.75,
            'demotion': 0.45,
            'mastery': 0.85
        }
        
    def update_topic_weights(self, topic_performances: Dict[str, float]):
        for topic, performance in topic_performances.items():
            if performance < self.performance_threshold['demotion']:
                self.topic_weights[topic] *= 1.5
            elif performance > self.performance_threshold['mastery']:
                self.topic_weights[topic] *= 0.5
                
    def select_next_topic(self, covered_topics: set, topic_performances: Dict[str, float]) -> str:
        uncovered_topics = set(self.topics) - covered_topics
        
        if uncovered_topics:
            topic_weights = {
                t: self.topic_weights[t] * 2.0 if t in uncovered_topics else self.topic_weights[t]
                for t in self.topics
            }
        else:
            topic_weights = {t: 1.0 / (perf + 0.1) for t, perf in topic_performances.items()}
            
        topics = list(topic_weights.keys())
        weights = list(topic_weights.values())
        return np.random.choice(topics, p=np.array(weights)/sum(weights))
    
    def determine_difficulty(self, current_difficulty: str, avg_performance: float) -> str:
        diff_index = self.difficulties.index(current_difficulty)
        
        if avg_performance > self.performance_threshold['promotion'] and diff_index < len(self.difficulties) - 1:
            return self.difficulties[diff_index + 1]
        elif avg_performance < self.performance_threshold['demotion'] and diff_index > 0:
            return self.difficulties[diff_index - 1]
        return current_difficulty

class AdvancedInterviewSystem:
    def __init__(self, access_token: str = None, adapter_path: str = None, google_api_key: str = None):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        logger.info(f"Using device: {self.device}")
        
        self.access_token = access_token
        self.adapter_path = adapter_path
        
        # Gemini configuration
        os.environ["GOOGLE_API_KEY"] = google_api_key or os.environ.get('GOOGLE_API_KEY')
        self.gemini_llm = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0.7)
        
        # Initialize models
        self.model, self.tokenizer = self.load_fine_tuned_model()
        self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2').to(self.device)
        
        # Initialize evaluation metrics
        self.rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
        
        # Initialize data structures
        self.index = None
        self.passages = []
        self.df = None
        self.scaler = MinMaxScaler(feature_range=(0, 1))
        
        # Interview tracking
        self.asked_questions = []
        self.responses = []
        self.response_times = []
        self.topics_covered = set()
        self.difficulty_history = []
        self.topic_history = []
        self.performance_history = []
        self.confidence_scores = []
        self.reference_answers = []
        
        self.difficulty_levels = {'Easy': 1, 'Medium': 2, 'Hard': 3}
        self.interview_algorithm = None

    def load_fine_tuned_model(self):
        try:
            base_model_name = "google/gemma-2b"
            logger.info("Loading tokenizer...")
            tokenizer = AutoTokenizer.from_pretrained(base_model_name, use_auth_token=self.access_token)
            
            logger.info("Loading base model...")
            base_model = AutoModelForCausalLM.from_pretrained(
                base_model_name,
                torch_dtype=torch.float16,
                device_map="auto",
                use_auth_token=self.access_token
            )
            
            logger.info("Loading fine-tuned adapter...")
            model = PeftModel.from_pretrained(base_model, self.adapter_path)
            logger.info("Model loaded successfully!")
            return model, tokenizer
        except Exception as e:
            logger.error(f"Error loading model: {str(e)}")
            return None, None

    def generate_gemini_reference_answer(self, passage: str, question: str) -> str:
        """
        Generate a reference answer using Gemini
        """
        try:
            print("enters gemini")
            print("passage is",passage)
            print("question is:",question)
            prompt = f"""
            Based on the following passage:
            {passage}

            Please generate a comprehensive answer to the question:
            {question}

            Provide a detailed, well-structured response that directly addresses the question.
            """
            
            response = self.gemini_llm.invoke(prompt)
            print("response is:",response)
            return response.content
        except Exception as e:
            logger.error(f"Error generating reference answer with Gemini: {str(e)}")
            return passage  # Fallback to original passage if Gemini fails

    def load_dataset(self, csv_path: str):
        try:
            self.df = pd.read_csv(csv_path)
            self.passages = self.df['Passage'].fillna('').tolist()
            
            embeddings = self.embedding_model.encode(self.passages, convert_to_tensor=True).cpu().numpy()
            self.index = faiss.IndexFlatL2(embeddings.shape[1])
            self.index.add(embeddings)
            
            self.topics = self.df['Topic'].unique().tolist()
            self.subtopics = self.df['Sub-Topic'].unique().tolist()
            
            self.interview_algorithm = InterviewAlgorithm(
                topics=self.topics,
                difficulties=list(self.difficulty_levels.keys())
            )
            
            logger.info(f"Loaded {len(self.passages)} passages")
            logger.info(f"Found {len(self.topics)} topics and {len(self.subtopics)} subtopics")
            
        except Exception as e:
            logger.error(f"Error loading dataset: {str(e)}")
            raise

    # Other methods like generate_question, evaluate_answer, etc. remain the same as in the previous implementation
    def generate_question(self, passage: str, topic: str = None, subtopic: str = None, difficulty: str = None) -> str:
        try:
            prompt_parts = ["Generate a question based on the following information:"]
            if topic:
                prompt_parts.append(f"Topic: {topic}")
            if subtopic:
                prompt_parts.append(f"Subtopic: {subtopic}")
            if difficulty:
                prompt_parts.append(f"Difficulty: {difficulty}")
            prompt_parts.append(f"Passage: {passage}")
            prompt_parts.append("Question:")
            prompt = "\n".join(prompt_parts)
            
            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
            outputs = self.model.generate(
                inputs["input_ids"],
                max_length=512,
                temperature=0.7,
                num_return_sequences=1,
                do_sample=True,
                top_p=0.9,
                top_k=50
            )
            
            question = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            return question[len(prompt):].strip()
        except Exception as e:
            logger.error(f"Error generating question: {str(e)}")
            return None

    def evaluate_answer(self, question: str, user_answer: str, reference_answer: str, topic: str, difficulty: str) -> float:
        try:
            # Early detection of minimal or non-answers
            minimal_responses = ['i don\'t know', 'idk', 'dont know', 'not sure', '', 'no answer', 'none']
            normalized_answer = user_answer.lower().strip()

            if normalized_answer in minimal_responses:
                logger.info(f"Minimal response detected: {user_answer}")
                return 0.0

            if len(normalized_answer) < 10:
                logger.info(f"Very short response detected: {user_answer}")
                return 10.0

            # Using Gemini for evaluation
            evaluation_prompt = f"""
            Evaluate the following answer based on the given question and reference answer.

            Question: {question}
            Reference Answer: {reference_answer}
            User Answer: {user_answer}

            Score this answer from 0 to 100 based on:
            1. Accuracy and correctness (40%)
            2. Completeness of response (30%)
            3. Clarity and structure (20%)
            4. Technical depth appropriate for {difficulty} level (10%)

            Return only a number between 0 and 100. For example: 75
            """

            try:
                response = self.gemini_llm.invoke(evaluation_prompt)
                score_text = response.content.strip()
                # Extract numeric score using regex
                import re
                score_match = re.search(r'\d+(?:\.\d+)?', score_text)
                if score_match:
                    base_score = float(score_match.group())
                else:
                    base_score = 50.0  # Default score if parsing fails
            except Exception as e:
                logger.error(f"Gemini evaluation failed: {str(e)}")
                base_score = 50.0  # Default fallback score

            # Difficulty adjustment
            difficulty_multipliers = {
                'Easy': 1.0,
                'Medium': 1.2,
                'Hard': 1.5
            }

            # Apply difficulty multiplier
            adjusted_score = base_score * difficulty_multipliers.get(difficulty, 1.0)

            # Calculate confidence score
            confidence_score = self.calculate_confidence_score(user_answer)

            # Apply confidence adjustment (smaller impact)
            final_score = adjusted_score * (0.9 + 0.1 * confidence_score)

            # Ensure score is within 0-100 range
            final_score = max(0, min(final_score, 100))

            # Add confidence score to tracking
            self.confidence_scores.append(confidence_score)

            return final_score

        except Exception as e:
            logger.error(f"Error in answer evaluation: {str(e)}")
            return 50.0  # Return middle score instead of 0 for unexpected errors

           
    def _assess_content_relevance(self, question: str, user_answer: str, reference_answer: str) -> float:
        try:
            # For very short or non-answers
            if len(user_answer.strip()) < 10:
                return 0.0

            prompt = f"""
            Strictly evaluate the content relevance of the following answer to the question:

            Question: {question}
            Reference Answer: {reference_answer}
            User Answer: {user_answer}

            If the user's answer shows absolutely no understanding or is completely off-topic, provide a score of 0.
            If the user acknowledges not knowing or provides minimal information, provide a very low score between 0.05 and 0.2.
            Otherwise, provide a score from 0 to 1 indicating how well the answer addresses the question.

            Provide ONLY a numerical score between 0 and 1.
            """

            response = self.gemini_llm.invoke(prompt)
            relevance_score = float(response.content.strip())
            return relevance_score
        except Exception as e:
            logger.warning(f"Gemini relevance assessment failed: {e}")
            return 0.0  # Very low score if assessment fails

    def _assess_technical_depth(self, user_answer: str, reference_answer: str, difficulty: str) -> float:
        """
        Assess the technical depth of the answer based on difficulty level
        """
        try:
            prompt = f"""
            Evaluate the technical depth of the following answer:

            Reference Answer: {reference_answer}
            User Answer: {user_answer}
            Expected Difficulty: {difficulty}

            Assess the answer's technical depth considering:
            1. Complexity of explanation
            2. Use of technical terminology
            3. Depth of understanding demonstrated
            4. Alignment with difficulty level

            Provide a score from 0 to 1.
            """

            response = self.gemini_llm.invoke(prompt)
            depth_score = float(response.content.strip())
            return depth_score
        except Exception as e:
            logger.warning(f"Technical depth assessment failed: {e}")
            return 0.5

    def _evaluate_domain_knowledge(self, user_answer: str, topic: str) -> float:
        """
        Assess domain-specific knowledge
        """
        try:
            prompt = f"""
            Evaluate the domain knowledge in the following answer for the topic {topic}:

            User Answer: {user_answer}

            Assess based on:
            1. Accuracy of domain-specific information
            2. Depth of understanding
            3. Use of relevant terminology
            4. Demonstration of expertise

            Provide a score from 0 to 1.
            """

            response = self.gemini_llm.invoke(prompt)
            domain_score = float(response.content.strip())
            return domain_score
        except Exception as e:
            logger.warning(f"Domain knowledge assessment failed: {e}")
            return 0.5

    def _assess_communication_quality(self, user_answer: str) -> float:
        """
        Evaluate communication skills
        """
        try:
            # Check for:
            # 1. Clarity
            # 2. Conciseness
            # 3. Structure
            # 4. Grammar and language quality
            communication_metrics = {
                'clarity_score': self._assess_clarity(user_answer),
                'conciseness_score': self._check_conciseness(user_answer),
                'grammatical_score': self._evaluate_grammar(user_answer)
            }

            return np.mean(list(communication_metrics.values()))
        except Exception as e:
            logger.warning(f"Communication quality assessment failed: {e}")
            return 0.5

    def _evaluate_answer_structure(self, user_answer: str) -> float:
        """
        Check the structure and organization of the answer
        """
        try:
            prompt = f"""
            Evaluate the structure of the following answer:

            {user_answer}

            Assess based on:
            1. Logical flow
            2. Clear introduction
            3. Well-organized points
            4. Coherent conclusion

            Provide a score from 0 to 1.
            """

            response = self.gemini_llm.invoke(prompt)
            structure_score = float(response.content.strip())
            return structure_score
        except Exception as e:
            logger.warning(f"Answer structure evaluation failed: {e}")
            return 0.5

    def _check_contextual_understanding(self, question: str, user_answer: str) -> float:
        """
        Assess the contextual understanding of the question
        """
        try:
            prompt = f"""
            Evaluate the contextual understanding in this answer:

            Question: {question}
            Answer: {user_answer}

            Assess:
            1. Comprehension of question context
            2. Addressing implicit aspects of the question
            3. Showing deeper understanding beyond literal interpretation

            Provide a score from 0 to 1.
            """

            response = self.gemini_llm.invoke(prompt)
            context_score = float(response.content.strip())
            return context_score
        except Exception as e:
            logger.warning(f"Contextual understanding assessment failed: {e}")
            return 0.5

    def _calculate_advanced_nlp_metrics(self, user_answer: str, reference_answer: str) -> float:
        """
        Calculate advanced NLP metrics
        """
        try:
            # BLEU score
            bleu_score = sentence_bleu([reference_answer.split()], user_answer.split())

            # BERT score
            _, _, bert_f1 = score([user_answer], [reference_answer], lang='en', verbose=False)
            bert_score = bert_f1.mean().item()

            # ROUGE scores
            rouge_scores = self.rouge_scorer.score(user_answer, reference_answer)
            rouge_f1 = np.mean([
                rouge_scores['rouge1'].fmeasure,
                rouge_scores['rouge2'].fmeasure,
                rouge_scores['rougeL'].fmeasure
            ])

            # Combine NLP metrics
            nlp_score = (0.3 * bleu_score + 0.4 * bert_score + 0.3 * rouge_f1)
            return nlp_score
        except Exception as e:
            logger.warning(f"Advanced NLP metrics calculation failed: {e}")
            return 0.5

    def calculate_confidence_score(self, answer: str) -> float:
        hesitation_words = ['maybe', 'perhaps', 'probably', 'might', 'not sure']
        confident_words = ['definitely', 'certainly', 'clearly', 'indeed', 'absolutely']
        
        answer_lower = answer.lower()
        hesitation_count = sum(word in answer_lower for word in hesitation_words)
        confidence_count = sum(word in answer_lower for word in confident_words)
        
        confidence_score = 0.5 + (confidence_count * 0.1) - (hesitation_count * 0.1)
        return max(0, min(1, confidence_score))

    def select_next_question(self) -> Tuple[str, str, str, str]:
        if not self.asked_questions:
            # First question: Use initial sampling strategy
            row = self.df[self.df['Difficulty'] == 'Easy'].sample(n=1).iloc[0]
            print(row)
            return row['Topic'], row['Sub-Topic'], row['Passage'], 'Easy'
        
        # Calculate performance metrics
        recent_window = min(5, len(self.performance_history))
        print(recent_window)
        avg_performance = np.mean(self.performance_history[-recent_window:])
        print(avg_performance)
        topic_performances = self.calculate_topic_wise_performance()

        # Update topic weights in the interview algorithm
        self.interview_algorithm.update_topic_weights(topic_performances)

        # Select next topic using the interview algorithm
        next_topic = self.interview_algorithm.select_next_topic(
            self.topics_covered,
            topic_performances
        )

        # Determine next difficulty level
        current_difficulty = self.difficulty_history[-1] if self.difficulty_history else 'Easy'
        next_difficulty = self.interview_algorithm.determine_difficulty(
            current_difficulty,
            avg_performance
        )

        # Use last response or last question to perform semantic search
        if self.responses:
            last_response = self.responses[-1]
        else:
            last_response = self.asked_questions[-1]

        # Perform semantic search to find the most relevant passage
        query_embedding = self.embedding_model.encode(last_response, convert_to_tensor=True).cpu().numpy()

        # Search in the FAISS index
        distances, indices = self.index.search(query_embedding.reshape(1, -1), k=10)

        # Filter passages based on next_topic and next_difficulty
        relevant_passages = []
        for idx in indices[0]:
            row = self.df.iloc[idx]
            if row['Topic'] == next_topic and row['Difficulty'] == next_difficulty:
                relevant_passages.append(row)

        # If no passages match both topic and difficulty, try relaxing the constraints
        if not relevant_passages:
            for idx in indices[0]:
                row = self.df.iloc[idx]
                if row['Topic'] == next_topic:
                    relevant_passages.append(row)

        # If still no passages, fallback to sampling
        if not relevant_passages:
            mask = (self.df['Topic'] == next_topic)
            row = self.df[mask].sample(n=1).iloc[0]
        else:
            # Select a random passage from the relevant set
            row = relevant_passages[np.random.randint(len(relevant_passages))]

        return row['Topic'], row['Sub-Topic'], row['Passage'], row['Difficulty']

    def calculate_metrics(self) -> InterviewMetrics:
        try:
            print("hi")
            overall_score = np.mean(self.performance_history) if self.performance_history else 0
            print("overall score is",overall_score)
            topic_scores = self.calculate_topic_wise_performance()
            difficulty_scores = self.calculate_difficulty_wise_performance()
            
            topic_coverage = len(self.topics_covered) / len(self.topics)
            difficulty_progression = self._calculate_difficulty_progression()
            topic_progression = self._calculate_topic_progression()
            print("topic progression is:",topic_progression)
            response_time_metrics = {
                'average': np.mean(self.response_times),
                'min': np.min(self.response_times),
                'max': np.max(self.response_times),
                'std': np.std(self.response_times)
            }
            
            return InterviewMetrics(
                overall_score=overall_score,
                topic_scores=topic_scores,
                difficulty_scores=difficulty_scores,
                topic_coverage=topic_coverage,
                difficulty_progression=difficulty_progression,
                topic_progression=topic_progression,
                response_time_metrics=response_time_metrics,
                confidence_scores=self.confidence_scores
            )
        except Exception as e:
            logger.error(f"Error calculating metrics: {str(e)}")
            raise

    def calculate_topic_wise_performance(self) -> Dict[str, float]:
        topic_scores = {}
        for topic, score in zip(self.topic_history, self.performance_history):
            if topic not in topic_scores:
                topic_scores[topic] = []
            topic_scores[topic].append(score)
        
        return {
            topic: np.average(scores, weights=np.linspace(1, 2, len(scores)))
            for topic, scores in topic_scores.items()
        }

    def calculate_difficulty_wise_performance(self) -> Dict[str, float]:
        diff_scores = {}
        for diff, score in zip(self.difficulty_history, self.performance_history):
            if diff not in diff_scores:
                diff_scores[diff] = []
            diff_scores[diff].append(score)

        return {
            diff: np.mean(scores) / self.difficulty_levels[diff]
            for diff, scores in diff_scores.items()
        }

    def _calculate_difficulty_progression(self) -> float:
        if len(self.difficulty_history) < 2:
            return 1.0
        correct_progressions = 0
        total_transitions = len(self.difficulty_history) - 1
        
        for i in range(total_transitions):
            current_level = self.difficulty_levels[self.difficulty_history[i]]
            next_level = self.difficulty_levels[self.difficulty_history[i + 1]]
            current_performance = self.performance_history[i]
            
            if (current_performance > 0.8 and next_level >= current_level) or \
               (current_performance < 0.4 and next_level <= current_level) or \
               (0.4 <= current_performance <= 0.8 and next_level == current_level):
                correct_progressions += 1
                
        return correct_progressions / total_transitions

    def _calculate_topic_progression(self) -> float:
        if len(self.topic_history) < 2:
            return 1.0
        correct_progressions = 0
        total_transitions = len(self.topic_history) - 1
        covered_topics = set()
        
        for i in range(total_transitions):
            current_topic = self.topic_history[i]
            covered_topics.add(current_topic)
            if self.performance_history[i] > 0.7 and self.topic_history[i + 1] not in covered_topics:
                correct_progressions += 1
            elif self.performance_history[i] < 0.5 and self.topic_history[i + 1] in covered_topics:
                correct_progressions += 1
                
        return correct_progressions / total_transitions
    
    def save_interview_report(self, file_path: str = "/kaggle/working/interview_report.md"):
        print("enters saving mode")
        metrics = self.calculate_metrics()
        print("metrics are:",metrics)
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        
        with open(file_path, "w") as f:
            f.write(f"# Interview Report - {timestamp}\n\n")
            f.write("## Overall Performance\n")
            f.write(f"- Overall Score: {metrics.overall_score:.2f}%\n")
            f.write(f"- Topic Coverage: {metrics.topic_coverage:.2f}%\n")
            f.write(f"- Difficulty Progression Score: {metrics.difficulty_progression:.2f}\n")
            f.write(f"- Topic Progression Score: {metrics.topic_progression:.2f}\n\n")
            
            f.write("## Topic-wise Performance\n")
            for topic, score in metrics.topic_scores.items():
                f.write(f"- {topic}: {score:.2f}%\n")
            f.write("\n")
            
            f.write("## Difficulty-wise Performance\n")
            for diff, score in metrics.difficulty_scores.items():
                f.write(f"- {diff}: {score:.2f}%\n")
            f.write("\n")
            
            f.write("## Response Time Analysis\n")
            f.write(f"- Average Response Time: {metrics.response_time_metrics['average']:.2f} seconds\n")
            f.write(f"- Minimum Response Time: {metrics.response_time_metrics['min']:.2f} seconds\n")
            f.write(f"- Maximum Response Time: {metrics.response_time_metrics['max']:.2f} seconds\n")
            f.write(f"- Response Time Standard Deviation: {metrics.response_time_metrics['std']:.2f} seconds\n\n")
            
            f.write("## Confidence Analysis\n")
            f.write(f"- Average Confidence Score: {np.mean(metrics.confidence_scores):.2f}\n")
            f.write(f"- Confidence Score Trend: {self._calculate_confidence_trend(metrics.confidence_scores)}\n\n")
            
            f.write("## Question History\n")
            for i, (q, r, t, d, p, ref) in enumerate(zip(
                self.asked_questions,
                self.responses,
                self.topic_history,
                self.difficulty_history,
                self.performance_history,
                self.reference_answers
            ), 1):
                f.write(f"\n### Question {i}\n")
                f.write(f"- Topic: {t}\n")
                f.write(f"- Difficulty: {d}\n")
                f.write(f"- Question: {q}\n")
                f.write(f"- Response: {r}\n")
                f.write(f"- Reference Answer: {ref}\n")
                f.write(f"- Performance Score: {p:.2f}%\n")
            
            # Generate and save performance visualization
            self._generate_performance_plots()
            f.write("\n## Performance Visualizations\n")
            f.write("- Performance trends visualization has been saved as 'performance_trends.png'\n")
        
        logger.info(f"Interview report saved to {file_path}")

    def _calculate_confidence_trend(self, confidence_scores: List[float]) -> str:
        if len(confidence_scores) < 2:
            return "Insufficient data"
        
        slope = np.polyfit(range(len(confidence_scores)), confidence_scores, 1)[0]
        if slope > 0.05:
            return "Increasing"
        elif slope < -0.05:
            return "Decreasing"
        else:
            return "Stable"

    def _generate_performance_plots(self):
        plt.figure(figsize=(15, 10))
        
        # Performance over time
        plt.subplot(2, 2, 1)
        plt.plot(self.performance_history, marker='o')
        plt.title('Performance Over Time')
        plt.xlabel('Question Number')
        plt.ylabel('Score (%)')
        
        # Difficulty progression
        plt.subplot(2, 2, 2)
        difficulty_values = [self.difficulty_levels[d] for d in self.difficulty_history]
        plt.plot(difficulty_values, marker='s')
        plt.title('Difficulty Progression')
        plt.xlabel('Question Number')
        plt.ylabel('Difficulty Level')
        plt.yticks([1, 2, 3], ['Easy', 'Medium', 'Hard'])
        
        # Topic coverage
        plt.subplot(2, 2, 3)
        topic_counts = pd.Series(self.topic_history).value_counts()
        topic_counts.plot(kind='bar')
        plt.title('Topic Coverage')
        plt.xlabel('Topics')
        plt.ylabel('Number of Questions')
        plt.xticks(rotation=45)
        
        # Confidence trend
        plt.subplot(2, 2, 4)
        plt.plot(self.confidence_scores, marker='o')
        plt.title('Confidence Trend')
        plt.xlabel('Question Number')
        plt.ylabel('Confidence Score')
        
        plt.tight_layout()
        plt.savefig('/kaggle/working/performance_trends.png')
        plt.close()

def main():
    # Configure logging
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    # Configuration
    try:
        # Option 1: Use environment variables or default paths
        csv_path = os.environ.get('DATASET_PATH', '/kaggle/input/nikahatmaam/pass_QA_topic_stopic_diff.csv')
        adapter_path = os.environ.get('ADAPTER_PATH', '/kaggle/input/config-files')
        access_token = "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"
        google_api_key = "AIzaSyC29gObkycJDBjVkEWjhJoJO-HVB0pC00E"

        # Option 2: Prompt user for inputs or use defaults
        if not csv_path or not os.path.exists(csv_path):
            csv_path = input("Enter the path to your interview dataset CSV (default: './dataset.csv'): ") or './dataset.csv'
        
        if not adapter_path or not os.path.exists(adapter_path):
            adapter_path = input("Enter the path to your model adapter (default: './adapter'): ") or './adapter'
        
        if not access_token:
            access_token = input("Enter your Hugging Face access token (optional, press enter to skip): ") or None

        if not google_api_key:
            google_api_key = input("Enter your Google API key: ")

        # Initialize the interview system
        interview_system = AdvancedInterviewSystem(
            access_token=access_token, 
            adapter_path=adapter_path,
            google_api_key=google_api_key
        )
        
        # Load the dataset
        interview_system.load_dataset(csv_path)
        logger.info("Dataset loaded successfully")
        print("Dataset succesfully loaded")
        # Conduct interview
        num_questions = int(input("How many questions do you want in the interview? (default is 10): ") or 10)
        
        for _ in tqdm(range(num_questions), desc="Conducting interview"):
            # Select next question
            topic, subtopic, passage, difficulty = interview_system.select_next_question()
            
            # Generate question
            if interview_system.model and interview_system.tokenizer:
                question = interview_system.generate_question(passage, topic, subtopic, difficulty)
            else:
                # Fallback to a simple question generation if no model is available
                question = f"Tell me about {topic} in the context of {subtopic}"
            
            # Prompt user for response
            print(f"\nQuestion (Difficulty: {difficulty}, Topic: {topic}): {question}")
            user_response = input("Your response: ")
            
            # Evaluate response
            reference_answer = interview_system.generate_gemini_reference_answer(passage, question)
#             print("reference answer is:", reference_answer)
            performance = interview_system.evaluate_answer(
                question,          # Question
                user_response,     # User response
                reference_answer,  # Reference answer
                topic,             # Topic
                difficulty         # Difficulty
            )
            
            print("performance is:",performance)
            # Track interview progress
            interview_system.asked_questions.append(question)
            interview_system.responses.append(user_response)
            interview_system.reference_answers.append(reference_answer)
            interview_system.topic_history.append(topic)
            interview_system.difficulty_history.append(difficulty)
            interview_system.performance_history.append(performance)
        
        # Generate and save interview report
        interview_system.save_interview_report()
        logger.info("Interview completed. Report generated successfully.")
    
    except Exception as e:
        logger.error(f"An error occurred during the interview: {str(e)}")
        logger.exception(e)

if __name__ == "__main__":
    main()

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/40 [00:00<?, ?it/s]

Dataset succesfully loaded


How many questions do you want in the interview? (default is 10):  2


Conducting interview:   0%|          | 0/2 [00:00<?, ?it/s]

Unnamed: 0                                                  957
ID                                                          958
Topic                                       Java Multithreading
Sub-Topic                       Introduction to Multithreading:
Passage       Multithreading is a powerful concept in Java t...
Difficulty                                                 Easy
Question                        What is multithreading in Java?
Answer        Multithreading in Java allows concurrent execu...
Name: 957, dtype: object

Question (Difficulty: Easy, Topic: Java Multithreading): How can multithreading help in improving the performance of a program?


Your response:  Multithreading improves program performance by allowing multiple tasks to execute concurrently within a single process


enters gemini
passage is Multithreading is a powerful concept in Java that allows concurrent execution of multiple threads within a single program. It enables developers to write efficient and responsive applications by dividing tasks into smaller units of execution that can run simultaneously. By utilizing multiple threads, a program can make better use of available system resources and improve overall performance. In Java, multithreading is achieved by extending the Thread class or implementing the Runnable interface. Threads can be created, started, paused, resumed, and terminated, providing a high level of control over the execution flow. However, multithreading also introduces challenges such as thread synchronization, resource sharing, and deadlock prevention, which need to be carefully addressed to ensure the correctness and reliability of the application.
question is: How can multithreading help in improving the performance of a program?
response is: content='Multithreading sig

Conducting interview:  50%|█████     | 1/2 [00:28<00:28, 28.27s/it]

performance is: 66.5
1
66.5


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Question (Difficulty: Easy, Topic: Java Abstraction): How does encapsulation contribute to creating modular and maintainable code in Java?


Your response:  java is idiot


enters gemini
passage is Encapsulation is another important concept in Java that complements abstraction. It involves bundling data and methods together into a single unit called a class. Encapsulation provides data hiding and protects the internal state of an object from external access. By using encapsulation, we can control how the data is accessed and modified, ensuring data integrity and security. Abstraction, on the other hand, focuses on hiding unnecessary details and providing a simplified view of an object. Together, encapsulation and abstraction help in creating modular and maintainable code by separating the implementation details from the external interface.
question is: How does encapsulation contribute to creating modular and maintainable code in Java?
response is: content="**Encapsulation's Contribution to Modular and Maintainable Code in Java**\n\nEncapsulation, a fundamental concept in Java, plays a vital role in fostering the development of modular and maintainable co

Conducting interview: 100%|██████████| 2/2 [00:47<00:00, 23.95s/it]

performance is: 0
enters saving mode
hi
overall score is 33.25
topic progression is: 1.0



