# CTSE Lecture Notes Chatbot using LangChain + HuggingFace
This notebook demonstrates a simple chatbot that answers questions based on CTSE lecture notes using a Retrieval-Augmented Generation (RAG) approach.

In [None]:
!pip install langchain langchain-community langchain-huggingface chromadb transformers unstructured sentence-transformers

In [2]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline
from langchain_community.vectorstores import Chroma
from langchain.chains import RetrievalQA
from transformers import pipeline

# Step 1: Load the Lecture Notes PDF
We'll use LangChain's `PyPDFLoader` to extract text from the CTSE lecture notes.

In [None]:
# Single source of truth for the PDF location, so the loader and the status
# message below cannot drift apart.
pdf_path = "data/ctse_notes.pdf"

# Load the PDF into a list of LangChain documents (PyPDFLoader is expected to
# yield one document per page; the count is reported as pages below).
loader = PyPDFLoader(pdf_path)
documents = loader.load()

# Emoji are written as \U escapes so the source stays pure ASCII and cannot be
# garbled by a wrong file encoding (the previous literals were mojibake).
print("="*45)
print("\U0001F4C4 CTSE Lecture Notes Loaded Successfully")  # page-facing-up emoji
print("="*45)
print(f"\U0001F4DA Total Pages Loaded : {len(documents)}")  # books emoji
print(f"\u2705 Source File        : {pdf_path}")            # check-mark emoji
print("="*45)

# Step 2: Split Documents into Chunks
We use `RecursiveCharacterTextSplitter` to chunk the content into overlapping sections.

In [None]:
# Split the loaded documents into 500-character chunks; consecutive chunks
# share 50 characters of overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
docs = text_splitter.split_documents(documents)

# Emoji are written as \U escapes so the source stays pure ASCII and cannot be
# garbled by a wrong file encoding (the previous literals were mojibake).
print("="*45)
print("\U0001F9E9 Document Chunking Completed")            # puzzle-piece emoji
print("="*45)
print(f"\U0001F539 Total Chunks Created : {len(docs)}")    # small-blue-diamond emoji
print("="*45)

# Step 3: Generate Embeddings
We'll use HuggingFace Sentence Transformers for vector representation.

In [5]:
# Sentence-transformer checkpoint used to embed the chunks for the vector store.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

# Step 4: Store Chunks in Chroma Vector DB
We'll persist the chunks and embeddings to local vector storage.

In [6]:
persist_directory = "./ctse_db"

# Open whatever is already persisted on disk first. Calling
# Chroma.from_documents unconditionally would append the same chunks again on
# every re-run of this cell, and the retriever would then return duplicates.
vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding_model)

# Only embed and store the chunks when the persisted collection is empty.
# NOTE: delete the ./ctse_db directory to force a rebuild after the PDF changes.
if not vectordb.get(limit=1)["ids"]:
    vectordb = Chroma.from_documents(
        documents=docs,
        embedding=embedding_model,
        persist_directory=persist_directory,
    )

# Step 5: Load a Local LLM (FLAN-T5)
We'll run the open-source `google/flan-t5-base` model locally through a HuggingFace `transformers` text2text-generation pipeline.

In [None]:
# Build a local FLAN-T5 text2text-generation pipeline with sampling turned off,
# then wrap it so LangChain can use it as an LLM.
hf_pipeline = pipeline(
    task="text2text-generation",
    model="google/flan-t5-base",
    tokenizer="google/flan-t5-base",
    do_sample=False,
    max_length=512,
)
llm = HuggingFacePipeline(pipeline=hf_pipeline)

# Step 6: Setup the QA Chain
We combine the retriever and the LLM into a single question-answering chain using LangChain's `RetrievalQA`.

In [8]:
# Similarity-search retriever over the vector store, configured with k=3.
retriever = vectordb.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

# Wire the retriever and the local LLM into a question-answering chain.
qa_chain = RetrievalQA.from_chain_type(
    retriever=retriever,
    llm=llm,
)

# Step 7: Ask Questions (Notebook-Friendly)
You can ask any question based on the CTSE lecture notes.  
Type `'exit'` to end the interaction.

In [None]:
def interactive_ctse_bot(chain=None, input_fn=None):
    """Run a question/answer loop against the lecture-notes QA chain.

    Prompts for questions until the user types 'exit' (case-insensitive,
    surrounding whitespace ignored) or the input stream ends / is interrupted.
    Blank input is skipped instead of being sent to the chain.

    Args:
        chain: Object with an ``invoke({"query": str})`` method returning a
            mapping that contains a ``"result"`` key. Defaults to the
            notebook-level ``qa_chain``.
        input_fn: Callable taking the prompt string and returning the user's
            reply. Defaults to the built-in ``input``.

    Returns:
        None. All interaction happens through ``input_fn`` and ``print``.
    """
    # Resolve defaults at call time so re-running the cell that rebuilds
    # `qa_chain` is picked up without redefining this function.
    if chain is None:
        chain = qa_chain
    if input_fn is None:
        input_fn = input

    print("Welcome to the Interactive Chatbot! Ask questions based on the loaded PDF document.\n")

    while True:
        # Interrupting the kernel or closing stdin while waiting at the prompt
        # should end the session cleanly rather than dump a traceback.
        try:
            raw_query = input_fn("\nAsk a question (or type 'exit' to quit): ")
        except (EOFError, KeyboardInterrupt):
            print("\nExiting the chatbot session. Goodbye!")
            break

        query = raw_query.strip()

        if query.lower() == "exit":
            print("Exiting the chatbot session. Goodbye!")
            break

        # Nothing to retrieve for an empty question; ask again.
        if not query:
            print("Please enter a question.\n")
            continue

        try:
            answer = chain.invoke({"query": query})

            print("\n" + "-" * 50)
            print("Your Question: ")
            print(f"{query}\n")

            print("-" * 50)
            print("Answer: ")
            print(f"{answer['result']}\n")
            print("-" * 50 + "\n")

        # Deliberately broad: one failed query (retrieval, generation, or a
        # missing 'result' key) should not end the interactive session.
        except Exception as e:
            print(f"Error while processing the query: {e}")
            print("Please try asking again.\n")

# Step 8: Start the chatbot interaction.
# This call blocks, prompting for questions until the user types 'exit'.
interactive_ctse_bot()