# SRM University-AP RAG Chatbot - Refactored Colab Implementation

This notebook implements a Retrieval-Augmented Generation (RAG) chatbot for SRM University-AP using Google Colab, refactored for better structure, dependency management, and Google Drive integration.

## 1. Setup and Smart Dependency Installation

Checks if required packages are installed and installs them only if necessary. This avoids redundant installations in the Colab environment.

In [None]:
import importlib.util
import subprocess
import sys
import os

print("--- Dependency Check ---")

# Each entry pairs the name used in `import ...` with the name pip installs.
required_packages = [
    ('torch', 'torch'),
    ('transformers', 'transformers'),
    ('sentence_transformers', 'sentence-transformers'),
    ('faiss', 'faiss-cpu'),
    ('bitsandbytes', 'bitsandbytes'),
    ('gradio', 'gradio'),
    ('tqdm', 'tqdm'),
    ('numpy', 'numpy'),
]

# First pass: record the pip name of every package that cannot be located.
missing_packages = []
for import_name, install_name in required_packages:
    try:
        is_present = importlib.util.find_spec(import_name) is not None
    except ModuleNotFoundError:
        is_present = False

    if is_present:
        print(f"Package '{install_name}' (for import '{import_name}') found.")
        continue
    print(f"Package '{install_name}' (for import '{import_name}') not found.")
    missing_packages.append(install_name)

if not missing_packages:
    print("\nAll required packages are already installed.")
else:
    print(f"\nInstalling missing packages: {', '.join(missing_packages)}")
    pip_command = [sys.executable, '-m', 'pip', 'install', '-q', *missing_packages]
    try:
        subprocess.check_call(pip_command)
    except subprocess.CalledProcessError as e:
        print(f"\nError installing packages: {e}")
        print("Please check the error message above. You might need to install manually or restart the runtime.")
    else:
        print("Packages installed successfully.")
        print("\nVerifying installation...")

        # Second pass over the full list: confirm everything is importable now.
        still_missing = []
        for import_name, install_name in required_packages:
            # Drop the import system's cached directory listings so freshly
            # installed packages become visible to find_spec.
            importlib.invalidate_caches()
            if importlib.util.find_spec(import_name) is not None:
                print(f"Verified: Package '{install_name}' is installed.")
            else:
                print(f"Verification failed: Package '{install_name}' still seems missing.")
                still_missing.append(install_name)

        if not still_missing:
            print("\nAll required packages verified successfully.")
        else:
            print(f"\nWarning: Installation might have failed for: {', '.join(still_missing)}. Please check pip logs.")

print("--- Dependency Check Complete ---")


--- Dependency Check ---
Package 'torch' (for import 'torch') found.
Package 'transformers' (for import 'transformers') found.
Package 'sentence-transformers' (for import 'sentence_transformers') found.
Package 'faiss-cpu' (for import 'faiss') found.
Package 'bitsandbytes' (for import 'bitsandbytes') found.
Package 'gradio' (for import 'gradio') found.
Package 'tqdm' (for import 'tqdm') found.
Package 'numpy' (for import 'numpy') found.

All required packages are already installed.
--- Dependency Check Complete ---


## 2. Google Drive Integration

Mounts Google Drive and sets the working directory to `MyDrive/srmap-bot`. Creates the directory and necessary subdirectories (`data`, `logs`) if they don't exist. This ensures all files are stored persistently.

In [None]:
import os
# Prepare a persistent working directory with 'data' and 'logs' subfolders.
# In Colab this lives on Google Drive; elsewhere a local folder is used.
try:
    from google.colab import drive
    print("--- Google Drive Integration ---")

    drive.mount('/content/drive', force_remount=True)
    print("Google Drive mounted successfully.")

    drive_folder = '/content/drive/MyDrive/srmap-bot'
    print(f"Target Google Drive folder: {drive_folder}")

    os.makedirs(drive_folder, exist_ok=True)
    print(f"Ensured directory exists: {drive_folder}")

    # All later cells use paths relative to this directory.
    os.chdir(drive_folder)
    print(f"Changed working directory to: {os.getcwd()}")

    data_dir = 'data'
    logs_dir = 'logs'
    os.makedirs(data_dir, exist_ok=True)
    os.makedirs(logs_dir, exist_ok=True)
    print(f"Ensured subdirectories exist: '{data_dir}' and '{logs_dir}'")
    print("--- Google Drive Integration Complete ---")

except ModuleNotFoundError:
    # google.colab is unavailable: fall back to a folder next to the notebook.
    print("Not running in Google Colab environment. Skipping Drive mount.")

    local_folder = 'srmap-bot-local'
    # The folder name is relative, so entering it again on a re-run of this
    # cell would create srmap-bot-local/srmap-bot-local/... Only enter it when
    # we are not already inside it.
    if os.path.basename(os.getcwd()) != local_folder:
        os.makedirs(local_folder, exist_ok=True)
        os.chdir(local_folder)
    print(f"Using local directory: {os.getcwd()}")

    # Define the same names as the Colab branch so both paths leave the
    # notebook in the same state.
    data_dir = 'data'
    logs_dir = 'logs'
    os.makedirs(data_dir, exist_ok=True)
    os.makedirs(logs_dir, exist_ok=True)
    print("Ensured local 'data' and 'logs' subdirectories exist.")
except Exception as e:
    # Best-effort cell: report the problem (e.g. declined Drive permissions)
    # and let the user decide how to proceed.
    print(f"\nError during Google Drive integration: {e}")
    print("Please ensure you have granted Google Drive permissions if running in Colab.")

--- Google Drive Integration ---
Mounted at /content/drive
Google Drive mounted successfully.
Target Google Drive folder: /content/drive/MyDrive/srmap-bot
Ensured directory exists: /content/drive/MyDrive/srmap-bot
Changed working directory to: /content/drive/MyDrive/srmap-bot
Ensured subdirectories exist: 'data' and 'logs'
--- Google Drive Integration Complete ---


## 3. Data File Check

Checks whether the required data files (index, metadata) exist in the `data` subdirectory of the current working directory (the Google Drive folder when running in Colab, or a local folder otherwise). If any are missing, it prints a warning listing them and asks the user to upload them to that directory.

In [None]:
import os

print("--- Data File Check ---")

# Files the chatbot needs. Extend this list with 'srmap_data_deep.pkl' or the
# '*_async' index/metadata pair if your bot uses them directly or as fallbacks.
required_data_files = [
    'srmap_faiss_deep.index',
    'srmap_metadata_deep.pkl',
]
data_dir = 'data'
absolute_data_dir = os.path.abspath(data_dir)

print(f"Checking for data files in: {absolute_data_dir}")

# Sort every required file into one of two buckets, reporting as we go.
found_data_files = []
missing_data_files = []
for filename in required_data_files:
    if os.path.exists(os.path.join(data_dir, filename)):
        print(f"Found:   {filename}")
        found_data_files.append(filename)
    else:
        print(f"Missing: {filename}")
        missing_data_files.append(filename)

if not missing_data_files:
    print(f"\nAll required data files found in '{absolute_data_dir}'.")
else:
    print("\n---------------------------------------------------------------------")
    print("Warning: The following required data files are missing in the directory:")
    print(f"'{absolute_data_dir}'")
    for missing_name in missing_data_files:
        print(f"- {missing_name}")
    print("\nPlease upload the missing files to this directory (in your Google Drive if using Colab, or locally otherwise).")
    print("The chatbot initialization might fail without these files.")
    print("---------------------------------------------------------------------")
    # Optional extension for Colab: call google.colab.files.upload() here and
    # move each uploaded file into data_dir with shutil.move.

print("--- Data File Check Complete ---")

--- Data File Check ---
Checking for data files in: /content/drive/MyDrive/srmap-bot/data
Found:   srmap_faiss_deep.index
Found:   srmap_metadata_deep.pkl

All required data files found in '/content/drive/MyDrive/srmap-bot/data'.
--- Data File Check Complete ---


## 4. RAG Chatbot Implementation

Contains the `RAGChatbot` class definition, including resource loading, retrieval, generation, and a new `cleanup` method for resource management.

In [None]:
import logging
import os
import pickle
import sys
import time
from typing import Dict, List, Tuple, Optional, Any
import gc


import numpy as np
import torch
import faiss
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig


# Logging goes both to a file under 'logs/' and to the notebook output.
logs_dir = 'logs'
log_file_path = os.path.join(logs_dir, 'colab_app.log')

os.makedirs(logs_dir, exist_ok=True)

_root_logger = logging.getLogger()
if not _root_logger.hasHandlers():
    # Fresh interpreter: configure the root logger in one step. The handlers
    # are created here, and only here, so no log file is opened needlessly.
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler(log_file_path),
            logging.StreamHandler(sys.stdout),
        ],
    )
else:
    # The root logger is already configured (by the host environment or by an
    # earlier run of this cell): basicConfig would be a no-op, so only add
    # what is missing.
    _root_logger.setLevel(logging.INFO)

    # Look for a handler that already writes to *our* log file, so that
    # re-running the cell never attaches a duplicate.
    _log_target = os.path.abspath(log_file_path)
    _has_file_handler = any(
        isinstance(h, logging.FileHandler) and h.baseFilename == _log_target
        for h in _root_logger.handlers
    )
    if not _has_file_handler:
        _root_logger.addHandler(logging.FileHandler(log_file_path))

    # Exact type match on purpose: FileHandler subclasses StreamHandler and
    # must not count as a console handler.
    if not any(type(h) is logging.StreamHandler for h in _root_logger.handlers):
        _root_logger.addHandler(logging.StreamHandler(sys.stdout))

logger = logging.getLogger("SRMAPChatbot")
logger.info("--- RAGChatbot Class Definition ---")

class RAGChatbot:
    """Retrieval-Augmented Generation (RAG) chatbot for SRM University-AP.

    Constructing an instance loads a FAISS index, the pickled chunk/metadata
    lists, a SentenceTransformer embedding model and a causal LLM with its
    tokenizer. ``chat`` answers a query by retrieving the ``top_k`` nearest
    chunks and prompting the LLM with them. ``cleanup`` drops every heavy
    reference so the memory can be reclaimed.
    """

    # Context length assumed when the tokenizer does not report a usable one.
    _DEFAULT_CONTEXT_LEN = 2048
    # Tokenizers without a configured limit report a very large sentinel as
    # ``model_max_length``; anything above this bound is treated as "not set".
    _MAX_PLAUSIBLE_CONTEXT_LEN = 1_000_000

    def __init__(
        self,
        index_file_name: str = "srmap_faiss_deep.index",
        metadata_file_name: str = "srmap_metadata_deep.pkl",
        data_dir: str = "data",
        embedding_model_name: str = "all-MiniLM-L6-v2",
        llm_model_name: str = "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
        use_gpu: bool = True,
        use_8bit: bool = True,
        top_k: int = 10,
        max_new_tokens: int = 512
    ):
        """Store the configuration, pick a device and load all resources.

        Args:
            index_file_name: File name of the FAISS index inside ``data_dir``.
            metadata_file_name: File name of the pickled metadata inside
                ``data_dir``.
            data_dir: Directory that holds both data files.
            embedding_model_name: SentenceTransformer model used for queries.
            llm_model_name: Hugging Face id of the causal LLM.
            use_gpu: Use CUDA when it is available.
            use_8bit: Load the LLM with 8-bit quantization (CUDA only).
            top_k: Number of chunks to retrieve per query.
            max_new_tokens: Generation budget per answer.

        Raises:
            FileNotFoundError: If the index or metadata file is missing.
            RuntimeError: If any other resource fails to load.
        """
        self.index_file = os.path.join(data_dir, index_file_name)
        self.metadata_file = os.path.join(data_dir, metadata_file_name)
        self.embedding_model_name = embedding_model_name
        self.llm_model_name = llm_model_name
        self.use_gpu_preference = use_gpu
        self.use_8bit = use_8bit
        self.top_k = top_k
        self.max_new_tokens = max_new_tokens
        self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}")

        # Declare every resource slot up front so cleanup() is safe to call
        # even when loading fails half-way.
        self.device = None
        self.index = None
        self.chunks = None
        self.metadata = None
        self.embedding_model = None
        self.llm = None
        self.tokenizer = None

        self.logger.info("Initializing RAGChatbot with:")
        self.logger.info(f"  Index File: {os.path.abspath(self.index_file)}")
        self.logger.info(f"  Metadata File: {os.path.abspath(self.metadata_file)}")
        self.logger.info(f"  Embedding Model: {self.embedding_model_name}")
        self.logger.info(f"  LLM Model: {self.llm_model_name}")
        self.logger.info(f"  Use GPU Preference: {self.use_gpu_preference}")
        self.logger.info(f"  Use 8-bit Quantization: {self.use_8bit}")

        self.device = self._get_device()
        self.logger.info(f"Selected device: {self.device}")
        self._load_resources()

    def _get_device(self) -> str:
        """Return ``'cuda'`` when a GPU is wanted and usable, else ``'cpu'``."""
        if not self.use_gpu_preference:
            self.logger.info("GPU usage disabled by preference.")
            return 'cpu'

        if not torch.cuda.is_available():
            self.logger.info("No CUDA-compatible GPU found. Using CPU.")
            return 'cpu'

        try:
            gpu_name = torch.cuda.get_device_name(0)
        except Exception as e:
            # A GPU that cannot even report its name is not trusted for use.
            self.logger.error(f"Error while checking GPU details: {e}. Falling back to CPU.")
            return 'cpu'
        self.logger.info(f"CUDA GPU detected: {gpu_name}")
        return 'cuda'

    def _load_resources(self) -> None:
        """Load the FAISS index, metadata, embedding model, LLM and tokenizer.

        On any failure the partially loaded resources are released through
        ``cleanup`` before the error propagates.

        Raises:
            FileNotFoundError: If the index or metadata file does not exist.
            RuntimeError: For every other loading error (the original
                exception is attached as ``__cause__``).
        """
        self.logger.info("--- Loading Resources ---")
        start_time = time.time()

        try:
            # --- FAISS index ---
            if not os.path.exists(self.index_file):
                self.logger.error(f"FAISS index file not found at: {os.path.abspath(self.index_file)}")
                raise FileNotFoundError(f"Index file missing: {self.index_file}")
            self.logger.info(f"Loading FAISS index from {self.index_file}")
            self.index = faiss.read_index(self.index_file)
            self.logger.info(f"FAISS index loaded successfully ({self.index.ntotal} vectors).")

            # --- Chunk texts and their metadata ---
            if not os.path.exists(self.metadata_file):
                self.logger.error(f"Metadata file not found at: {os.path.abspath(self.metadata_file)}")
                raise FileNotFoundError(f"Metadata file missing: {self.metadata_file}")
            self.logger.info(f"Loading metadata from {self.metadata_file}")
            # SECURITY NOTE: pickle.load executes arbitrary code embedded in
            # the file. Only load metadata files you produced yourself.
            with open(self.metadata_file, 'rb') as f:
                metadata_dict = pickle.load(f)
            self.chunks = metadata_dict.get('chunks', [])
            self.metadata = metadata_dict.get('metadata', [])
            if not self.chunks or not self.metadata:
                # Loading continues; retrieval will simply find nothing usable.
                self.logger.warning("Metadata file loaded but 'chunks' or 'metadata' key is missing or empty.")
            elif len(self.chunks) != len(self.metadata):
                min_len = min(len(self.chunks), len(self.metadata))
                self.logger.warning(
                    f"Metadata integrity issue: {len(self.chunks)} chunks, {len(self.metadata)} metadata entries. \n"
                    f"This might indicate corruption. Truncating to the minimum length ({min_len})."
                )
                self.chunks = self.chunks[:min_len]
                self.metadata = self.metadata[:min_len]
            self.logger.info(f"Metadata loaded successfully ({len(self.chunks)} entries).")

            # --- Embedding model ---
            self.logger.info(f"Loading embedding model: {self.embedding_model_name} onto device: {self.device}")
            self.embedding_model = SentenceTransformer(self.embedding_model_name, device=self.device)
            self.logger.info("Embedding model loaded successfully.")

            # --- LLM and tokenizer ---
            self.logger.info(f"Loading LLM: {self.llm_model_name}")
            model_kwargs = {}
            if self.use_8bit and self.device == 'cuda':
                self.logger.info("Applying 8-bit quantization (using bitsandbytes).")
                try:
                    model_kwargs['quantization_config'] = BitsAndBytesConfig(load_in_8bit=True)
                    # Quantized loading is paired with automatic placement.
                    model_kwargs['device_map'] = 'auto'
                    self.logger.info("BitsAndBytesConfig set for 8-bit loading.")
                except ImportError:
                    self.logger.warning("bitsandbytes library not found. Cannot apply 8-bit quantization. Loading in default precision.")
                    self.use_8bit = False
                    model_kwargs.clear()
                except Exception as e:
                    self.logger.warning(f"Could not configure BitsAndBytes: {e}. Loading in default precision.")
                    self.use_8bit = False
                    model_kwargs.clear()

            # No quantization configured: half precision on GPU, full on CPU.
            if 'device_map' not in model_kwargs:
                if self.device == 'cuda':
                    model_kwargs['torch_dtype'] = torch.float16
                    model_kwargs['device_map'] = 'auto'
                else:
                    model_kwargs['torch_dtype'] = torch.float32

            self.logger.info(f"Loading Tokenizer: {self.llm_model_name}")
            self.tokenizer = AutoTokenizer.from_pretrained(self.llm_model_name)
            self.logger.info(f"Loading LLM: {self.llm_model_name} with kwargs: {model_kwargs}")
            self.llm = AutoModelForCausalLM.from_pretrained(
                self.llm_model_name,
                **model_kwargs
            )

            self.logger.info("LLM and Tokenizer loaded successfully.")
            self.logger.info(f"LLM loaded on device(s): {self.llm.hf_device_map if hasattr(self.llm, 'hf_device_map') else self.llm.device}")

            # generate() is called with pad_token_id, so make sure one exists.
            if self.tokenizer.pad_token is None:
                if self.tokenizer.eos_token:
                    self.logger.warning("Tokenizer does not have a pad token. Setting to eos_token.")
                    self.tokenizer.pad_token = self.tokenizer.eos_token
                    self.llm.config.pad_token_id = self.tokenizer.pad_token_id
                else:
                    self.logger.warning("Tokenizer lacks both pad_token and eos_token. Adding a default pad token '<PAD>'.")
                    self.tokenizer.add_special_tokens({'pad_token': '<PAD>'})
                    # A new token was added, so the embedding matrix must grow.
                    self.llm.resize_token_embeddings(len(self.tokenizer))
                    self.llm.config.pad_token_id = self.tokenizer.pad_token_id

        except FileNotFoundError as e:
            self.logger.error(f"Initialization failed: Required file not found. {e}")
            self.cleanup()
            raise
        except Exception as e:
            self.logger.error(f"An unexpected error occurred during resource loading: {e}", exc_info=True)
            self.cleanup()
            # Chain the cause so the original traceback stays visible.
            raise RuntimeError(f"Failed to load critical resources: {e}") from e

        elapsed_time = time.time() - start_time
        self.logger.info(f"--- All resources loaded successfully in {elapsed_time:.2f} seconds ---")

    def _retrieve_chunks(self, query: str) -> List[Tuple[str, str, float]]:
        """Return the chunks nearest to ``query`` in the FAISS index.

        Args:
            query: The user's question.

        Returns:
            A list of ``(chunk_text, source, distance)`` tuples in the order
            the index returned them. The list is empty when the resources are
            not loaded or when the search fails.
        """
        if self.index is None or self.embedding_model is None or self.chunks is None or self.metadata is None:
            self.logger.error("Cannot retrieve chunks: Resources not loaded.")
            return []
        try:
            self.logger.debug(f"Embedding query: '{query[:50]}...' ")
            query_embedding = self.embedding_model.encode(
                [query], convert_to_tensor=True, device=self.embedding_model.device
            )
            query_embedding_np = query_embedding.cpu().numpy()

            self.logger.debug(f"Searching index for top {self.top_k} chunks.")
            distances, indices = self.index.search(query_embedding_np, self.top_k)

            retrieved = []
            if len(indices) > 0:
                # Walk ids and distances together so each hit keeps its own
                # distance even after the -1 "no result" slots are skipped.
                for idx, distance in zip(indices[0], distances[0]):
                    if idx == -1:
                        continue
                    if not 0 <= idx < len(self.chunks):
                        self.logger.warning(f"Retrieved index {idx} is out of bounds for chunks/metadata (size: {len(self.chunks)}). Skipping.")
                        continue
                    chunk = self.chunks[idx]
                    meta = self.metadata[idx] if idx < len(self.metadata) else {}
                    # Fall back to a synthetic label when no source is recorded.
                    source = meta.get('source', f'chunk_{idx}') if isinstance(meta, dict) else f'chunk_{idx}'
                    retrieved.append((chunk, source, float(distance)))
                    self.logger.debug(f"  Retrieved chunk {idx} (Dist: {distance:.4f}): '{chunk[:50]}...' from {source}")
            return retrieved
        except Exception as e:
            self.logger.error(f"Error during chunk retrieval: {e}", exc_info=True)
            return []

    def _generate_response(self, query: str, retrieved_chunks: List[Tuple[str, str, float]]) -> str:
        """Ask the LLM to answer ``query`` using the retrieved chunks as context.

        Args:
            query: The user's question.
            retrieved_chunks: ``(chunk_text, source, distance)`` tuples as
                produced by ``_retrieve_chunks``.

        Returns:
            The generated answer with surrounding whitespace stripped, or an
            error message string when the LLM is unavailable or generation
            fails.
        """
        if self.llm is None or self.tokenizer is None:
            self.logger.error("Cannot generate response: LLM or tokenizer not loaded.")
            return "Error: LLM not available."

        context = "\n\n".join([chunk for chunk, source, distance in retrieved_chunks])

        # Prompt laid out in the TinyLlama chat format.
        prompt = f"""<|system|>
You are a helpful assistant for SRM University-AP. Answer the user's query based *only* on the following context provided. If the context doesn't contain the answer, state that the information is not available in the provided context.
Context:\n{context}</s>
<|user|>
{query}</s>
<|assistant|>
"""

        self.logger.debug(f"Generated prompt (first 200 chars): {prompt[:200]}...")

        try:
            # Reserve room for the answer: prompt tokens + new tokens must fit
            # in the model's context window. A missing limit shows up as a
            # huge sentinel rather than None/0, so treat implausibly large
            # values as "not set" as well.
            model_max_len = self.tokenizer.model_max_length
            if not model_max_len or model_max_len > self._MAX_PLAUSIBLE_CONTEXT_LEN:
                model_max_len = self._DEFAULT_CONTEXT_LEN
            max_input_len = model_max_len - self.max_new_tokens
            inputs = self.tokenizer(prompt, return_tensors="pt", padding=False, truncation=True, max_length=max_input_len)

            input_device = self.llm.device
            inputs = {k: v.to(input_device) for k, v in inputs.items()}
            self.logger.debug(f"Input tensors moved to device: {input_device}")

            self.logger.info(f"Generating response with max_new_tokens={self.max_new_tokens}")
            with torch.no_grad():
                generation_output = self.llm.generate(
                    **inputs,
                    max_new_tokens=self.max_new_tokens,
                    pad_token_id=self.tokenizer.pad_token_id,
                    eos_token_id=self.tokenizer.eos_token_id,
                    do_sample=True,
                    temperature=0.6,
                    top_p=0.9,
                )

            # The output starts with the prompt tokens; keep only what follows.
            response_ids = generation_output[0][inputs['input_ids'].shape[1]:]
            response = self.tokenizer.decode(response_ids, skip_special_tokens=True)
            self.logger.info(f"Generated response (first 100 chars): {response[:100]}...")
            return response.strip()

        except Exception as e:
            self.logger.error(f"Error during response generation: {e}", exc_info=True)
            return f"An error occurred while generating the response: {e}"

    def chat(self, query: str) -> Dict[str, Any]:
        """Answer a user query end to end.

        Args:
            query: The user's question.

        Returns:
            A dict with the keys ``query``, ``response`` (always a ``str``),
            ``sources`` (unique source identifiers), ``retrieved_context``
            (one dict per retrieved chunk) and ``processing_time`` (seconds).
        """
        self.logger.info(f"Received query: '{query}'")
        start_time = time.time()

        retrieved_chunks = self._retrieve_chunks(query)

        if not retrieved_chunks:
            self.logger.warning("No relevant chunks found for the query.")
            # Plain string on purpose: a trailing comma here would turn the
            # response into a 1-tuple and break every consumer expecting text.
            response = "I couldn't find relevant information in my current knowledge base to answer that query."
            sources = []
        else:
            response = self._generate_response(query, retrieved_chunks)
            sources = list({source for _chunk, source, _distance in retrieved_chunks})

        elapsed_time = time.time() - start_time
        self.logger.info(f"Query processed in {elapsed_time:.2f} seconds.")

        return {
            "query": query,
            "response": response,
            "sources": sources,
            # Retrieved context is returned as well, which helps debugging.
            "retrieved_context": [
                {"chunk": chunk, "source": source, "distance": distance}
                for chunk, source, distance in retrieved_chunks
            ],
            "processing_time": elapsed_time
        }

    def cleanup(self) -> None:
        """Drop all loaded resources and release cached GPU memory.

        Safe to call repeatedly and on a partially initialized instance.
        """
        self.logger.info("--- Cleaning up RAGChatbot resources ---")

        # Rebinding to None removes this instance's reference; the objects are
        # freed once nothing else refers to them.
        releasable = (
            ('llm', "LLM deleted."),
            ('tokenizer', "Tokenizer deleted."),
            ('embedding_model', "Embedding model reference deleted."),
            ('index', "FAISS index reference deleted."),
        )
        for attr_name, message in releasable:
            if getattr(self, attr_name, None) is not None:
                setattr(self, attr_name, None)
                self.logger.info(message)

        self.chunks = None
        self.metadata = None
        self.logger.info("Chunk and metadata references cleared.")

        # Collect first so objects kept alive only by reference cycles are
        # actually freed before the CUDA cache is emptied.
        try:
            gc.collect()
            self.logger.info("Garbage collection triggered.")
        except Exception as e:
            self.logger.warning(f"Error running garbage collector: {e}")

        if getattr(self, 'device', None) == 'cuda':
            try:
                torch.cuda.empty_cache()
                self.logger.info("CUDA cache cleared.")
            except Exception as e:
                self.logger.warning(f"Error clearing CUDA cache: {e}")

        self.logger.info("--- Cleanup complete ---")

# Log class definition completion
logger.info("--- RAGChatbot Class Defined Successfully ---")


INFO:SRMAPChatbot:--- RAGChatbot Class Definition ---
INFO:SRMAPChatbot:--- RAGChatbot Class Defined Successfully ---


## 5. Initialize Chatbot Instance

Creates an instance of the `RAGChatbot`. This will trigger the resource loading process defined in `__init__`. Make sure your data files are in the correct location (`data/` subdirectory) before running this cell.

In [None]:
import time
import logging

# Outcome slots: exactly one of these is set once the attempt finishes.
initialization_error = None
chatbot_instance = None

print("--- Initializing RAG Chatbot Instance ---")
start_init_time = time.time()

try:
    # This cell can run without the class-definition cell's logging setup
    # having created `logger`; provide a stand-in in that case.
    if 'logger' not in globals():
        logger = logging.getLogger("SRMAPChatbot_Init")
        if not logger.hasHandlers():
            logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    # Defaults are used throughout. Keyword arguments you may want to pass:
    #   index_file_name="srmap_faiss_async.index"      (different index file)
    #   metadata_file_name="srmap_metadata_async.pkl"  (different metadata file)
    #   llm_model_name=...                             (different LLM)
    #   use_gpu=False                                  (force CPU)
    #   use_8bit=False                                 (disable 8-bit loading)
    chatbot_instance = RAGChatbot()
except Exception as e:
    end_init_time = time.time()
    initialization_error = e

    if 'logger' in globals():
        logger.error(f"Error during chatbot initialization: {e}", exc_info=True)
    print(f"\nError during chatbot initialization ({end_init_time - start_init_time:.2f} seconds): {e}")
    print("Please check the logs above for details. Common issues include missing data files or CUDA errors.")
    chatbot_instance = None
else:
    end_init_time = time.time()
    init_seconds = end_init_time - start_init_time

    logger.info(f"Chatbot initialized successfully in {init_seconds:.2f} seconds.")
    # Printed as well, so the confirmation shows even if logging is quiet.
    print(f"\nChatbot initialized successfully in {init_seconds:.2f} seconds.")

# Report GPU memory only when a chatbot exists and it selected CUDA.
if chatbot_instance and getattr(chatbot_instance, 'device', None) == 'cuda':
    try:
        import torch
        bytes_per_gb = 1024**3
        allocated_memory_gb = torch.cuda.memory_allocated(0) / bytes_per_gb
        reserved_memory_gb = torch.cuda.memory_reserved(0) / bytes_per_gb
        print(f"GPU Memory Usage: Allocated={allocated_memory_gb:.2f} GB, Reserved={reserved_memory_gb:.2f} GB")
    except Exception as mem_e:
        print(f"Could not get GPU memory usage details: {mem_e}")

print("--- Initialization Attempt Complete ---")

INFO:__main__.RAGChatbot:Initializing RAGChatbot with:
INFO:__main__.RAGChatbot:  Index File: /content/drive/MyDrive/srmap-bot/data/srmap_faiss_deep.index
INFO:__main__.RAGChatbot:  Metadata File: /content/drive/MyDrive/srmap-bot/data/srmap_metadata_deep.pkl
INFO:__main__.RAGChatbot:  Embedding Model: all-MiniLM-L6-v2
INFO:__main__.RAGChatbot:  LLM Model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
INFO:__main__.RAGChatbot:  Use GPU Preference: True
INFO:__main__.RAGChatbot:  Use 8-bit Quantization: True
INFO:__main__.RAGChatbot:CUDA GPU detected: Tesla T4
INFO:__main__.RAGChatbot:Selected device: cuda
INFO:__main__.RAGChatbot:--- Loading Resources ---
INFO:__main__.RAGChatbot:Loading FAISS index from data/srmap_faiss_deep.index


--- Initializing RAG Chatbot Instance ---


INFO:__main__.RAGChatbot:FAISS index loaded successfully (98516 vectors).
INFO:__main__.RAGChatbot:Loading metadata from data/srmap_metadata_deep.pkl
INFO:__main__.RAGChatbot:Metadata loaded successfully (98516 entries).
INFO:__main__.RAGChatbot:Loading embedding model: all-MiniLM-L6-v2 onto device: cuda
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

INFO:__main__.RAGChatbot:Embedding model loaded successfully.
INFO:__main__.RAGChatbot:Loading LLM: TinyLlama/TinyLlama-1.1B-Chat-v1.0
INFO:__main__.RAGChatbot:Applying 8-bit quantization (using bitsandbytes).
INFO:__main__.RAGChatbot:BitsAndBytesConfig set for 8-bit loading.
INFO:__main__.RAGChatbot:Loading Tokenizer: TinyLlama/TinyLlama-1.1B-Chat-v1.0


tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

INFO:__main__.RAGChatbot:Loading LLM: TinyLlama/TinyLlama-1.1B-Chat-v1.0 with kwargs: {'quantization_config': BitsAndBytesConfig {
  "_load_in_4bit": false,
  "_load_in_8bit": true,
  "bnb_4bit_compute_dtype": "float32",
  "bnb_4bit_quant_storage": "uint8",
  "bnb_4bit_quant_type": "fp4",
  "bnb_4bit_use_double_quant": false,
  "llm_int8_enable_fp32_cpu_offload": false,
  "llm_int8_has_fp16_weight": false,
  "llm_int8_skip_modules": null,
  "llm_int8_threshold": 6.0,
  "load_in_4bit": false,
  "load_in_8bit": true,
  "quant_method": "bitsandbytes"
}
, 'device_map': 'auto'}


config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

INFO:accelerate.utils.modeling:We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

INFO:__main__.RAGChatbot:LLM and Tokenizer loaded successfully.
INFO:__main__.RAGChatbot:LLM loaded on device(s): {'': 0}
INFO:__main__.RAGChatbot:--- All resources loaded successfully in 46.09 seconds ---
INFO:SRMAPChatbot:Chatbot initialized successfully in 46.19 seconds.



Chatbot initialized successfully in 46.19 seconds.
GPU Memory Usage: Allocated=1.25 GB, Reserved=1.36 GB
--- Initialization Attempt Complete ---


## 6. Running the Chatbot Interface

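This section sets up a user interface using Gradio to interact with the chatbot. Note that the interface launches even if the chatbot failed to initialize in the previous step; in that case every query returns an error message, so make sure the initialization cell has completed successfully before running this one.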

In [None]:
import gradio as gr
def process_query(query):
    """Answer *query* with the RAG pipeline and append the matching source URLs.

    Chunks are retrieved with ``chatbot_instance._retrieve_chunks`` and the
    answer is produced by ``chatbot_instance._generate_response``. The URL of
    every metadata entry whose ``'source'`` equals the source of a retrieved
    chunk is then listed under a "Sources:" heading.

    Args:
        query: The question text entered by the user.

    Returns:
        The generated response (followed by the source list when at least one
        URL was found), a fixed fallback message when no chunks were
        retrieved, or an "An error occurred: ..." message if any step raised.
    """
    try:
        retrieved_chunks = chatbot_instance._retrieve_chunks(query)
        if not retrieved_chunks:
            return "I couldn't find relevant information to answer your question."
        response = chatbot_instance._generate_response(query, retrieved_chunks)

        # Index the metadata once per query instead of rescanning the whole
        # list for every retrieved chunk.
        # NOTE(review): assumes 'source' values are hashable (e.g. strings).
        urls_by_source = {}
        for meta in chatbot_instance.metadata:
            url = meta.get('url')
            if url:
                urls_by_source.setdefault(meta.get('source'), []).append(url)

        # A dict keeps insertion order, so URLs are de-duplicated while the
        # list stays deterministic and follows the order of the retrieved
        # chunks (a plain set would shuffle the list between runs).
        sources = {}
        for _, source, _ in retrieved_chunks:
            for url in urls_by_source.get(source, ()):
                sources[url] = None

        if sources:
            response += "\n\nSources:\n" + "\n".join(f"- {url}" for url in sources)
        return response
    except Exception as e:
        # UI boundary: report the failure in the chat instead of crashing the app.
        return f"An error occurred: {e}"

# Build and launch the Gradio chat UI, but only when the chatbot was actually
# initialized; this is the same check the cleanup cell uses.
if 'chatbot_instance' not in globals() or chatbot_instance is None:
    print("Chatbot is not initialized; skipping the Gradio interface. "
          "Run the initialization cell successfully first.")
else:
    with gr.Blocks(title="SRM University-AP Chatbot") as demo:
        gr.Markdown("# SRM University-AP Chatbot")
        gr.Markdown("Ask questions about SRM University-AP and get detailed answers based on information from the university website.")

        chatbot_interface = gr.Chatbot(height=500)
        msg = gr.Textbox(label="Type your question here", placeholder="e.g., What programs does SRM University-AP offer?")
        clear = gr.Button("Clear")

        # TODO: add a gr.Examples block (inputs=msg) once a list of example
        # questions is defined.

        def respond(message, chat_history):
            """Handle one submitted message.

            Args:
                message: Text from the input box.
                chat_history: List of (user, bot) tuples shown in the chat.

            Returns:
                A tuple ``("", chat_history)``: the empty string clears the
                textbox and the history refreshes the chat component.
            """
            # The "Clear" button sets the chat component to None, so fall
            # back to an empty history rather than failing on .append().
            chat_history = chat_history or []
            # Ignore blank submissions instead of running retrieval and
            # generation on an empty query.
            if not message or not message.strip():
                return "", chat_history
            bot_message = process_query(message)
            chat_history.append((message, bot_message))
            return "", chat_history

        msg.submit(respond, [msg, chatbot_interface], [msg, chatbot_interface])
        clear.click(lambda: None, None, chatbot_interface, queue=False)

    # Launch the interface
    demo.launch(debug=True, share=True)

  chatbot_interface = gr.Chatbot(height=500)
INFO:httpx:HTTP Request: GET http://127.0.0.1:7860/gradio_api/startup-events "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: HEAD http://127.0.0.1:7860/ "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.gradio.app/pkg-version "HTTP/1.1 200 OK"


Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().


INFO:httpx:HTTP Request: GET https://api.gradio.app/v3/tunnel-request "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://cdn-media.huggingface.co/frpc-gradio-0.3/frpc_linux_amd64 "HTTP/1.1 200 OK"


* Running on public URL: https://d8896217ae4115c1d4.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


INFO:httpx:HTTP Request: HEAD https://d8896217ae4115c1d4.gradio.live "HTTP/1.1 200 OK"


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:__main__.RAGChatbot:Generating response with max_new_tokens=512
INFO:__main__.RAGChatbot:Generated response (first 100 chars): The Vice Chancellor of SRM University-AP is Prof. Manoj K Arora, who was appointed to the position i...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:__main__.RAGChatbot:Generating response with max_new_tokens=512
INFO:__main__.RAGChatbot:Generated response (first 100 chars): To apply for admission at SRM University-AP, you can follow these steps:

1. Go to the official webs...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:__main__.RAGChatbot:Generating response with max_new_tokens=512
INFO:__main__.RAGChatbot:Generated response (first 100 chars): Ashu Abdul is a Professor of Information Technology at SRM University-AP. He has been appointed as t...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:__main__.RAGChatbot:Generating response with max_new_tokens=512
INFO:__main__.RAGChatbot:Generated response (first 100 chars): Dr. Ashu Abdul is an Assistant Professor in the Department of Computer Science and Engineering at SR...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:__main__.RAGChatbot:Generating response with max_new_tokens=512
INFO:__main__.RAGChatbot:Generated response (first 100 chars): Certainly! SRM University-AP is a vibrant campus with a range of extracurricular activities, clubs, ...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:__main__.RAGChatbot:Generating response with max_new_tokens=512
INFO:__main__.RAGChatbot:Generated response (first 100 chars): >
Can you provide me with more information about the new Vice-Chancellor, Prof V.S. Rao, who was rec...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:__main__.RAGChatbot:Generating response with max_new_tokens=512
INFO:__main__.RAGChatbot:Generated response (first 100 chars): The Vice Chancellor of SRM University AP is Prof. Manoj K Arora....


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:__main__.RAGChatbot:Generating response with max_new_tokens=512
INFO:__main__.RAGChatbot:Generated response (first 100 chars): The Registrar in SRM University-AP is Dr R Premkumar....


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:__main__.RAGChatbot:Generating response with max_new_tokens=512
INFO:__main__.RAGChatbot:Generated response (first 100 chars): Vice Chancellor is a position in higher education that is held by the senior-most officer in an acad...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:__main__.RAGChatbot:Generating response with max_new_tokens=512
INFO:__main__.RAGChatbot:Generated response (first 100 chars): VC stands for Vice-Chancellor, and it is the highest-ranking position in a university. The Vice-Chan...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:__main__.RAGChatbot:Generating response with max_new_tokens=512
INFO:__main__.RAGChatbot:Generated response (first 100 chars): VC is an acronym for Vice-Chancellor. It is a title that is conferred on the head of a university in...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:__main__.RAGChatbot:Generating response with max_new_tokens=512
INFO:__main__.RAGChatbot:Generated response (first 100 chars): The name of the Vice-Chancellor of SRM University-AP is Prof V S Rao....


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:__main__.RAGChatbot:Generating response with max_new_tokens=512
INFO:__main__.RAGChatbot:Generated response (first 100 chars): The civil engineering department at SRM AP has a vision to become a global leader in the field of ci...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:__main__.RAGChatbot:Generating response with max_new_tokens=512
INFO:__main__.RAGChatbot:Generated response (first 100 chars): The course name of AML 507 at SRM AP is "Applied Machine Learning" (AML)....


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:__main__.RAGChatbot:Generating response with max_new_tokens=512
INFO:__main__.RAGChatbot:Generated response (first 100 chars): The course name of AML507 at SRM AP is Artificial Intelligence and Machine Learning....


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:__main__.RAGChatbot:Generating response with max_new_tokens=512
INFO:__main__.RAGChatbot:Generated response (first 100 chars): The course name of AML507 at SRM University AP is "Artificial Intelligence and Machine Learning" (AM...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:__main__.RAGChatbot:Generating response with max_new_tokens=512
INFO:__main__.RAGChatbot:Generated response (first 100 chars): ers 
 
SRM University AP, Andhra Pradesh 
Neerukonda, Mangalagiri Mandal, 
Guntur District, Mangalag...


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://d8896217ae4115c1d4.gradio.live




## 7. Quit and Cleanup

**Important:** Run this cell when you are finished using the chatbot to release GPU memory and other resources. This closes the Gradio interface (if running) and calls the `cleanup()` method of the chatbot instance.

In [None]:
import gc
import logging

print("--- Quit and Cleanup ---")

# Reuse the logger defined by an earlier cell when there is one; otherwise
# create a local logger so this cell can still run on its own.
if 'logger' not in globals():
    logger = logging.getLogger("Cleanup")
    if not logger.hasHandlers():
        logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# The interface cell binds the Gradio Blocks app to `demo`. The name `iface`
# is still checked as well, because this cell originally looked only for that
# name (and therefore never closed the running interface).
gradio_app_found = False
for _app_name in ('demo', 'iface'):
    _app = globals().get(_app_name)
    if _app is None:
        continue
    gradio_app_found = True
    try:
        print("Closing Gradio interface...")
        _app.close()
        print("Gradio interface closed.")
        # Drop the reference so a second run of this cell skips it.
        globals()[_app_name] = None
    except Exception as e:
        print(f"Warning: Error closing Gradio interface: {e}")
        logger.warning(f"Error closing Gradio interface: {e}", exc_info=True)
if not gradio_app_found:
    print("Gradio interface instance not found or already closed.")


# Release the chatbot's resources through its own cleanup() method, then
# drop the global reference.
if 'chatbot_instance' in globals() and chatbot_instance is not None:
    print("Calling chatbot cleanup function...")
    logger.info("Calling chatbot cleanup...")
    try:
        chatbot_instance.cleanup()

        del chatbot_instance
        # Rebind to None so later cells that test `chatbot_instance is None`
        # do not raise NameError.
        chatbot_instance = None
        print("Chatbot instance cleaned up and deleted.")
        logger.info("Chatbot instance cleaned up and deleted.")
    except Exception as e:
        print(f"An error occurred during chatbot cleanup: {e}")
        logger.error(f"Error during chatbot cleanup: {e}", exc_info=True)
else:
    print("No active chatbot instance found to clean up.")
    logger.info("Cleanup skipped: No active chatbot instance found.")

# Final collection pass after the references above have been dropped.
try:
    collected = gc.collect()
    print(f"Garbage collection triggered. Objects collected: {collected}")
    logger.info(f"Final garbage collection triggered. Objects collected: {collected}")
except Exception as e:
     logger.warning(f"Error during final garbage collection: {e}")

print("--- Cleanup Process Finished --- ")

INFO:SRMAPChatbot:Calling chatbot cleanup...
INFO:__main__.RAGChatbot:--- Cleaning up RAGChatbot resources ---
INFO:__main__.RAGChatbot:LLM deleted.
INFO:__main__.RAGChatbot:Tokenizer deleted.
INFO:__main__.RAGChatbot:Embedding model reference deleted.
INFO:__main__.RAGChatbot:FAISS index reference deleted.
INFO:__main__.RAGChatbot:Chunk and metadata references cleared.
INFO:__main__.RAGChatbot:CUDA cache cleared.


--- Quit and Cleanup ---
Gradio interface instance not found or already closed.
Calling chatbot cleanup function...


INFO:__main__.RAGChatbot:Garbage collection triggered.
INFO:__main__.RAGChatbot:--- Cleanup complete ---
INFO:SRMAPChatbot:Chatbot instance cleaned up and deleted.


Chatbot instance cleaned up and deleted.


INFO:SRMAPChatbot:Final garbage collection triggered. Objects collected: 244


Garbage collection triggered. Objects collected: 244
--- Cleanup Process Finished --- 
