# CTSE - Assignment 2 - Lecture notes chatbot

# Install all the dependencies


In [1]:
%pip install google-generativeai langchain langchain_community pypdf nltk sentence-transformers faiss-cpu python-dotenv python-pptx unstructured

^C
Note: you may need to restart the kernel to use updated packages.


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-intel 2.14.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3, but you have protobuf 5.29.4 which is incompatible.

[notice] A new release of pip available: 22.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting google-generativeai
  Downloading google_generativeai-0.8.5-py3-none-any.whl (155 kB)
     ------------------------------------ 155.4/155.4 kB 319.8 kB/s eta 0:00:00
Collecting langchain
  Downloading langchain-0.3.25-py3-none-any.whl (1.0 MB)
     ---------------------------------------- 1.0/1.0 MB 1.5 MB/s eta 0:00:00
Collecting langchain_community
  Downloading langchain_community-0.3.23-py3-none-any.whl (2.5 MB)
     ---------------------------------------- 2.5/2.5 MB 3.2 MB/s eta 0:00:00
Collecting pypdf
  Downloading pypdf-5.5.0-py3-none-any.whl (303 kB)
     -------------------------------------- 303.4/303.4 kB 3.1 MB/s eta 0:00:00
Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
     ---------------------------------------- 1.5/1.5 MB 2.8 MB/s eta 0:00:00
Collecting sentence-transformers
  Downloading sentence_transformers-4.1.0-py3-none-any.whl (345 kB)
     -------------------------------------- 345.7/345.7 kB 3.6 MB/s eta 0:00:00
Collecting faiss

# Import necessary libraries

In [None]:
import os
import glob
import nltk
import google.generativeai as genai
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from IPython.display import Markdown, display
from dotenv import load_dotenv
from langchain_community.document_loaders import UnstructuredPowerPointLoader

# Load environment variables from .env file

In [3]:
# Load variables from a .env file into the environment. As the last expression
# in the cell, the call's return value is what the notebook displays below.
load_dotenv()

False

# Download NLTK resources

In [None]:
# Download NLTK's 'punkt' resource.
# NOTE(review): nothing else in the visible notebook calls nltk directly —
# presumably one of the document loaders needs it; confirm it is still required.
nltk.download('punkt')

# Get API key from environment variable

In [None]:
# Pull the Gemini key out of the process environment.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")

# A missing or empty key is only reported here; nothing is raised.
if GEMINI_API_KEY is None or len(GEMINI_API_KEY) == 0:
    print("API key not found. Please make sure your .env file contains the GEMINI_API_KEY.")

# Set up Gemini model

In [None]:
def setup_gemini(api_key, model_name="gemini-1.5-pro", generation_config=None):
    """Configure the Gemini SDK and return a generative model.

    Args:
        api_key: API key passed to ``genai.configure``.
        model_name: Name of the Gemini model to instantiate. Defaults to
            ``"gemini-1.5-pro"``.
        generation_config: Optional mapping of generation settings. Keys given
            here override the defaults (``temperature`` 0.2, ``top_p`` 0.95,
            ``max_output_tokens`` 1024); keys that are omitted keep them.

    Returns:
        The ``genai.GenerativeModel`` created with the resolved settings.
    """
    genai.configure(api_key=api_key)

    # Notebook defaults; caller-supplied keys take precedence.
    resolved_config = {
        "temperature": 0.2,
        "top_p": 0.95,
        "max_output_tokens": 1024,
    }
    if generation_config:
        resolved_config.update(generation_config)

    return genai.GenerativeModel(
        model_name=model_name,
        generation_config=resolved_config,
    )

# Load documents from datasets folder

In [None]:
def _load_files(paths, loader_cls, label, documents):
    """Load every path with ``loader_cls`` and append the results to ``documents``."""
    for path in paths:
        try:
            documents.extend(loader_cls(path).load())
            print(f"Loaded {label}: {path}")
        except Exception as e:
            # Best effort: a single unreadable file is reported and skipped.
            print(f"Error loading {label} {path}: {e}")


def load_documents(data_folder='datasets'):
    """Load CTSE lecture notes (PDF, PPT, PPTX) from a folder tree.

    Args:
        data_folder: Root folder searched recursively for ``*.pdf``, ``*.ppt``
            and ``*.pptx`` files. Defaults to ``'datasets'``.

    Returns:
        A list with everything the loaders returned: PDFs first, then
        PowerPoint files. Empty when the folder does not exist or no file
        could be loaded. Files that raise while loading are skipped.
    """
    documents = []

    # Check if folder exists
    if not os.path.exists(data_folder):
        print(f"Warning: The '{data_folder}' folder does not exist.")
        return documents

    def find(extension):
        # Sorted so the load order does not depend on directory listing order.
        pattern = os.path.join(data_folder, '**', f'*.{extension}')
        return sorted(glob.glob(pattern, recursive=True))

    _load_files(find('pdf'), PyPDFLoader, "PDF", documents)

    # Older .ppt files first, then .pptx, both through the same loader.
    _load_files(
        find('ppt') + find('pptx'),
        UnstructuredPowerPointLoader,
        "PowerPoint",
        documents,
    )

    return documents

# Process documents

In [None]:
def create_vector_store(
    documents,
    chunk_size=1000,
    chunk_overlap=200,
    embedding_model="sentence-transformers/all-MiniLM-L6-v2",
):
    """Create a searchable FAISS vector store from loaded documents.

    Args:
        documents: Documents to split and index.
        chunk_size: Maximum chunk length passed to the text splitter
            (measured with ``len``). Defaults to 1000.
        chunk_overlap: Overlap between consecutive chunks. Defaults to 200.
        embedding_model: Name of the HuggingFace embedding model. Defaults to
            ``"sentence-transformers/all-MiniLM-L6-v2"``.

    Returns:
        The FAISS vector store, or ``None`` when there are no documents or the
        documents produce no chunks.
    """
    if not documents:
        print("No documents to process.")
        return None

    # Split documents into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split into {len(chunks)} chunks.")

    # Documents without any text split into zero chunks; there is nothing to
    # embed, so stop here instead of handing FAISS an empty list.
    if not chunks:
        print("No text could be extracted from the documents; nothing to index.")
        return None

    # Create embeddings (run on CPU)
    embeddings = HuggingFaceEmbeddings(
        model_name=embedding_model,
        model_kwargs={'device': 'cpu'},
    )

    # Create vector store
    return FAISS.from_documents(chunks, embeddings)

# Generate response using RAG

In [None]:
def answer_question(question, vector_store, model, k=5):
    """Generate an answer using Retrieval-Augmented Generation.

    Args:
        question: The user's question.
        vector_store: Object exposing ``similarity_search(query, k=...)`` whose
            results carry a ``page_content`` attribute.
        model: Object exposing ``generate_content(prompt)`` whose result has a
            ``text`` attribute.
        k: Number of chunks to retrieve as context. Defaults to 5.

    Returns:
        The model's answer text. A blank question returns a short prompt to
        enter one, without calling the vector store or the model. If the model
        call raises, returns a string starting with
        ``"Error generating response: "``.
    """
    # Nothing to retrieve or answer for blank input; skip the model call.
    if not question or not question.strip():
        return "Please enter a question."

    # Retrieve relevant documents
    docs = vector_store.similarity_search(question, k=k)

    # Create context from retrieved documents
    context = "\n\n".join(doc.page_content for doc in docs)

    # Generate prompt for Gemini
    prompt = f"""
    You are a teaching assistant helping students with Computer Science and Technology for Software Engineering (CTSE) concepts.
    Answer the following question based ONLY on the provided context from CTSE lecture notes.
    If you cannot find the answer in the context, state that you don't have that information.
    
    Context:
    {context}
    
    Question: {question}
    
    Answer:
    """

    # Generate response; failures (including reading response.text) are
    # returned as text rather than raised.
    try:
        response = model.generate_content(prompt)
        return response.text
    except Exception as e:
        return f"Error generating response: {str(e)}"

# Main execution

In [None]:
# Entry point: build the model and the vector store, then run an interactive
# question/answer loop until the user asks to leave.
print("=== CTSE Lecture Notes Chatbot ===")

# Setup model
print("Setting up the Gemini model...")
model = setup_gemini(GEMINI_API_KEY)

# Load documents
print("\nLoading documents from the 'datasets' folder...")
documents = load_documents()

if not documents:
    print("No documents found. Please make sure your CTSE lecture notes are in the 'datasets' folder.")
else:
    # Process documents
    print("\nProcessing documents and creating vector store...")
    vector_store = create_vector_store(documents)

    # create_vector_store signals failure with None; compare explicitly rather
    # than relying on the store object's truthiness.
    if vector_store is not None:
        print("\nChatbot ready! You can now ask questions about your CTSE lecture notes.")
        print("Type 'exit' to end the conversation.")

        # Chat loop
        while True:
            try:
                question = input("\nYour question: ")
            except (EOFError, KeyboardInterrupt):
                # Interrupting the prompt or closing stdin ends the chat
                # cleanly instead of leaving a traceback in the output.
                print("\nGoodbye!")
                break

            # Normalise once so " Exit " is recognised and blank input is skipped.
            command = question.strip().lower()

            if command in ('exit', 'quit', 'bye'):
                print("Goodbye!")
                break

            if not command:
                continue

            # Answer the question
            print("Generating answer...")
            answer = answer_question(question, vector_store, model)
            display(Markdown(f"**Answer:**\n{answer}"))