# CTSE - Assignment 2 - Lecture notes chatbot

# Install all the dependencies


In [5]:
%pip install google-generativeai langchain langchain_community pypdf nltk sentence-transformers faiss-cpu python-dotenv python-pptx unstructured

Collecting python-pptx
  Downloading python_pptx-1.0.2-py3-none-any.whl.metadata (2.5 kB)
Collecting unstructured
  Downloading unstructured-0.17.2-py3-none-any.whl.metadata (24 kB)
Collecting XlsxWriter>=0.5.7 (from python-pptx)
  Downloading XlsxWriter-3.2.3-py3-none-any.whl.metadata (2.7 kB)
Collecting filetype (from unstructured)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting python-magic (from unstructured)
  Downloading python_magic-0.4.27-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting emoji (from unstructured)
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Collecting python-iso639 (from unstructured)
  Downloading python_iso639-2025.2.18-py3-none-any.whl.metadata (14 kB)
Collecting langdetect (from unstructured)
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25

# Import necessary libraries

In [6]:
import os
import glob
import nltk
import google.generativeai as genai
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from IPython.display import Markdown, display
from dotenv import load_dotenv
from langchain_community.document_loaders import UnstructuredPowerPointLoader

# Load environment variables from .env file

In [7]:
load_dotenv()

False

# Download NLTK resources

In [8]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Get API key from environment variable

In [None]:
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

if not GEMINI_API_KEY:
    #print("API key not found. Please make sure your .env file contains the GEMINI_API_KEY.")
    GEMINI_API_KEY = "GEMINI_API_KEY"

# Set up Gemini model

In [10]:
def setup_gemini(api_key):
    """Configure and initialize the Gemini model."""
    genai.configure(api_key=api_key)

    generation_config = {
        "temperature": 0.2,
        "top_p": 0.95,
        "max_output_tokens": 1024,
    }

    model = genai.GenerativeModel(
        model_name="gemini-1.5-pro",
        generation_config=generation_config
    )

    return model

# Load documents from datasets folder

In [17]:
def load_documents():
    """Load CTSE lecture notes from the datasets folder."""
    documents = []
    data_folder = 'datasets'

    # Check if folder exists
    if not os.path.exists(data_folder):
        print(f"Warning: The '{data_folder}' folder does not exist.")
        return documents

    # Load PDF files
    pdf_files = glob.glob(os.path.join(data_folder, '**', '*.pdf'), recursive=True)
    for pdf_file in pdf_files:
        try:
            loader = PyPDFLoader(pdf_file)
            documents.extend(loader.load())
            print(f"Loaded PDF: {pdf_file}")
        except Exception as e:
            print(f"Error loading PDF {pdf_file}: {e}")

    # Load PowerPoint files (PPT and PPTX)
    # PPT files (older format)
    ppt_files = glob.glob(os.path.join(data_folder, '**', '*.ppt'), recursive=True)
    # PPTX files (newer format)
    pptx_files = glob.glob(os.path.join(data_folder, '**', '*.pptx'), recursive=True)

    # Combine both lists
    all_ppt_files = ppt_files + pptx_files

    for ppt_file in all_ppt_files:
        try:
            loader = UnstructuredPowerPointLoader(ppt_file)
            documents.extend(loader.load())
            print(f"Loaded PowerPoint: {ppt_file}")
        except Exception as e:
            print(f"Error loading PowerPoint {ppt_file}: {e}")

    return documents

# Process documents

In [12]:
def create_vector_store(documents):
    """Create a searchable vector database from documents."""
    if not documents:
        print("No documents to process.")
        return None

    # Split documents into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split into {len(chunks)} chunks.")

    # Create embeddings
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={'device': 'cpu'}
    )

    # Create vector store
    vector_store = FAISS.from_documents(chunks, embeddings)
    return vector_store

# Generate response using RAG

In [13]:
def answer_question(question, vector_store, model):
    """Generate an answer using Retrieval-Augmented Generation."""
    # Retrieve relevant documents
    docs = vector_store.similarity_search(question, k=5)

    # Create context from retrieved documents
    context = "\n\n".join([doc.page_content for doc in docs])

    # Generate prompt for Gemini
    prompt = f"""
    You are a teaching assistant helping students with Computer Science and Technology for Software Engineering (CTSE) concepts.
    Answer the following question based ONLY on the provided context from CTSE lecture notes.
    If you cannot find the answer in the context, state that you don't have that information.

    Context:
    {context}

    Question: {question}

    Answer:
    """

    # Generate response
    try:
        response = model.generate_content(prompt)
        return response.text
    except Exception as e:
        return f"Error generating response: {str(e)}"

# Main execution

In [18]:
print("=== CTSE Lecture Notes Chatbot ===")

# Setup model
print("Setting up the Gemini model...")
model = setup_gemini(GEMINI_API_KEY)

# Load documents
print("\nLoading documents from the 'datasets' folder...")
documents = load_documents()

if not documents:
    print("No documents found. Please make sure your CTSE lecture notes are in the 'datasets' folder.")
else:
    # Process documents
    print("\nProcessing documents and creating vector store...")
    vector_store = create_vector_store(documents)

    if vector_store:
        print("\nChatbot ready! You can now ask questions about your CTSE lecture notes.")
        print("Type 'exit' to end the conversation.")

        # Chat loop
        while True:
            question = input("\nYour question: ")

            if question.lower() in ['exit', 'quit', 'bye']:
                print("Goodbye!")
                break

            # Answer the question
            print("Generating answer...")
            answer = answer_question(question, vector_store, model)
            display(Markdown(f"**Answer:**\n{answer}"))

=== CTSE Lecture Notes Chatbot ===
Setting up the Gemini model...

Loading documents from the 'datasets' folder...
Loaded PDF: datasets/ML Lec 2 - Part 2 LLM.pdf
Loaded PDF: datasets/cloud-computing-concepts-technology-amp-architecture-by-thomas-erl.pdf
Loaded PDF: datasets/ML Lec 2 - Part 1.pdf
Loaded PowerPoint: datasets/AWS User Groups Colombo - Introduction to AWS Cloud Platform.pptx
Loaded PowerPoint: datasets/Lecture 01-Introduction to AI ML - Updated(2025).pptx
Loaded PowerPoint: datasets/Microservice Design Patterns.pptx
Loaded PowerPoint: datasets/Lecture 2 - Part 1.pptx
Loaded PowerPoint: datasets/Cloud Computing 101.pptx
Loaded PowerPoint: datasets/Introduction to Microservices.pptx
Loaded PowerPoint: datasets/Cloud Design Patterns - 1.pptx
Loaded PowerPoint: datasets/Cloud Design Patterns - 2.pptx
Loaded PowerPoint: datasets/Lecture 2 - Part 2.pptx
Loaded PowerPoint: datasets/CAP Theorem.pptx
Loaded PowerPoint: datasets/Intro to DevOps and Beyond.pptx
Loaded PowerPoint: dat

**Answer:**
Cloud computing is a specialized form of distributed computing that introduces utilization models for remotely provisioning scalable and measured resources.  A cloud refers to a distinct IT environment designed for remotely provisioning scalable and measured IT resources.  The term originated as a metaphor for the Internet, a network of networks providing remote access to decentralized IT resources.



Your question: exit
Goodbye!
