In [1]:
# Install required packages
# Run this cell only once if not already installed
# The '!' is used to run shell commands in Jupyter/Colab

!pip install langchain-openai langchain-community huggingface_hub PyPDF2 langchain-huggingface faiss-cpu

# langchain-openai: Provides OpenAI-specific integrations for LangChain (e.g., ChatOpenAI, OpenAIEmbeddings)
# langchain-community:  Community-contributed integrations for LangChain like FAISS, HuggingFace, etc.
# huggingface_hub: Python client to interact with the Hugging Face Hub (download models, datasets)
# PyPDF2: Used for reading and extracting text from PDF files
# langchain-huggingface: Official LangChain support for HuggingFace embedding models (replaces older APIs)
# faiss-cpu: Facebook AI Similarity Search (CPU version) — used for efficient vector similarity search

Collecting langchain-openai
  Downloading langchain_openai-0.3.24-py3-none-any.whl.metadata (2.3 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.25-py3-none-any.whl.metadata (2.9 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.9.1-py3-none-any.whl.metadata (3.8 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting python-dotenv>=0.21.0 (from pydantic-settings<3

In [3]:
# Load environment variables from a `.env` file into the environment
# Useful for storing API keys and configuration settings securely
from dotenv import load_dotenv

# PDF reading library — allows you to open and extract text from PDF files
from PyPDF2 import PdfReader

# Utility from LangChain to split long text into smaller chunks based on character count
# Important for chunking documents before embedding them
from langchain.text_splitter import CharacterTextSplitter

# Embedding model wrapper for HuggingFace models (like sentence-transformers)
# Used to convert text chunks into vector embeddings
from langchain_huggingface import HuggingFaceEmbeddings

# Wrapper for OpenAI’s GPT models (e.g., GPT-3.5, GPT-4) for use with LangChain
# Enables natural language generation
from langchain_openai import ChatOpenAI

# FAISS is a vector store used to store and retrieve vector embeddings efficiently
# langchain.vectorstores.FAISS integrates FAISS with LangChain
from langchain.vectorstores import FAISS

# Conversation memory buffer that stores past user-assistant interactions
# Helps the model maintain context across multiple turns in a conversation
from langchain.memory import ConversationBufferMemory

# Combines a retriever (e.g., FAISS) with a language model for answering questions based on retrieved context
# Ideal for building conversational retrieval-based QA systems
from langchain.chains import ConversationalRetrievalChain

# Used to authenticate to Hugging Face Hub (for downloading private models or using APIs)
from huggingface_hub import login

# Python’s built-in module for interacting with the operating system
# Used to access environment variables, file paths, etc.
import os


###Log-In to HuggingFace Hub

Create a .env file and add your API keys in it


```
HUGGINGFACEHUB_API_TOKEN='your_actual_token_here'
OPENAI_API_KEY='your_actual_token_here'
```



Go to the left sidebar → Files tab (📁 icon) → Click "Upload" → Choose your `.env` file

In [12]:
# Load environment variables and login to Hugging Face

load_dotenv()  # Loads .env file containing API keys

# Login to Hugging Face Hub using your token from https://huggingface.co/settings/tokens
login(os.getenv("HUGGINGFACEHUB_API_TOKEN"))

###Upload your PDF (KB)

In [4]:
from google.colab import files

# Opens a file picker dialog in Colab to allow the user to upload local files
# You can hold Ctrl (or Cmd on Mac) and select multiple PDF files at once.
# The uploaded files will be stored in the current Colab runtime
# Upload your PDF here
uploaded = files.upload()

Saving Managing_Diabetes.pdf to Managing_Diabetes.pdf


In [5]:
# Create a list of file paths pointing to the uploaded PDF files
# `uploaded.keys()` returns a dict_keys object with filenames as keys (e.g., {'example.pdf': ...})
pdf_paths = [f"/content/{filename}" for filename in uploaded.keys()]
# Print the list of selected PDF paths to confirm successful upload
print(f"Selected PDFs: {pdf_paths}")

Selected PDFs: ['/content/Managing_Diabetes.pdf']


In [6]:
def get_pdf_text(pdf_docs):
    '''
    loop over all the pdf files and concatenate
    the content in a single string
    '''
    text = ""    # Initialize an empty string to accumulate text from all PDFs
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)  # Create a PdfReader object for the current PDF file
        for page in pdf_reader.pages:  # Iterate through all pages in the PDF
            page_text = page.extract_text()  # Extract text from the current page
            if page_text:
                text += page_text  # Append the extracted text to the result string
    return text  # Return the combined text from all PDF pages


# Call the function with the list of uploaded PDF paths
raw_text = get_pdf_text(pdf_paths)  # Extract text from all uploaded PDFs
# Print the length of the extracted text to verify it worked
print("✅ Extracted text length:", len(raw_text))

✅ Extracted text length: 4064


In [7]:
def get_text_chunks(raw_text):
    '''
    takes a single string of text and returns a list
    of text strings that can be fed to vector database
    '''
    # CharacterTextSplitter is a LangChain utility that splits text into chunks
    # based on characters — useful for preparing text for embeddings and retrieval
    text_splitter = CharacterTextSplitter(
        separator='\n',        # Split text primarily at newline characters (paragraph breaks)
        chunk_size=1000,       # Each chunk will be up to 1000 characters long
        chunk_overlap=200,     # Overlap of 200 characters between chunks to preserve context across splits
        length_function=len    # Function to measure chunk length (here, just using the built-in len())
    )
    # Split the raw text into chunks using the specified rules
    chunks = text_splitter.split_text(raw_text)
    return chunks  # Return the list of text chunks

# Call the function and print number of resulting chunks
text_chunks = get_text_chunks(raw_text)  # Split the PDF text into manageable chunks
print("✅ Number of text chunks:", len(text_chunks))  # Verify how many chunks were created

✅ Number of text chunks: 5


In [15]:
def get_vectorstore(text_chunks):
    '''
    Creates a FAISS vector store from text chunks using embeddings
    '''
    # Initialize HuggingFaceEmbeddings with an instruction-tuned transformer model
    embeddings = HuggingFaceEmbeddings(
        model_name="hkunlp/instructor-xl",   # This model supports instruction-based embeddings (semantic understanding)
        model_kwargs={"device": "cuda"}      # Run on GPU if available (requires Colab with GPU runtime like T4 or A100)
    )

    # FAISS is used to store and retrieve embeddings efficiently
    # `from_texts` computes embeddings for each chunk and stores them in the FAISS index
    vectorstore = FAISS.from_texts(
        texts=text_chunks,       # List of text chunks from the PDF
        embedding=embeddings     # Embedding model used to convert text to vectors
    )

    return vectorstore  # Return the FAISS vector store

# Create the vector store from the previously created text chunks
vectorstore = get_vectorstore(text_chunks)
print("✅ Vectorstore created")

✅ Vectorstore created


In [16]:
def get_conversation_chain(vectorstore):
    '''
    Creates a Conversational Retrieval Chain using OpenAI's Chat mode
    '''
    # Initialize the OpenAI chat model (LLM)
    llm = ChatOpenAI(
        model_name="gpt-4o-mini",      # "gpt-4o-mini" is lightweight and fast, but has token limit 128 (use "gpt-3.5-turbo" on free tier)
        temperature=0.7,               # Controls randomness in responses (0 = deterministic, 1 = creative)
        request_timeout=30             # Max time (in seconds) to wait for a response from the model
    )

    # Memory to store and recall past user-AI messages across turns in the conversation
    memory = ConversationBufferMemory(
        memory_key='chat_history',     # Key used internally by LangChain to track message history
        return_messages=True           # Returns history as a list of message objects (needed by chat models)
    )

    # Create the full conversational retrieval chain
    # Combines:
    #  - the LLM (`llm`)
    #  - a retriever (built on top of FAISS vector store)
    #  - and a memory buffer for chat history
    return ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(),  # Retrieves relevant chunks from FAISS based on the current user query
        memory=memory                          # Maintains context across interactions
    )

# Initialize the conversational chain using the vector store built earlier
conversation = get_conversation_chain(vectorstore)


  memory = ConversationBufferMemory(


In [17]:
# Function to ask questions about the uploaded PDFs

def chat_with_pdf(question):
    # Call the LangChain ConversationalRetrievalChain with the user's question
    # This internally:
    #  - Retrieves relevant PDF chunks from the vectorstore using semantic similarity
    #  - Sends those chunks as context to the OpenAI chat model
    #  - Returns a response while maintaining conversation history
    response = conversation({'question': question})

    # Loop through the entire chat history to display it (alternating User/Bot)
    # The history includes both the user's previous questions and the bot's responses
    for i, msg in enumerate(response['chat_history']):
        sender = "User" if i % 2 == 0 else "Bot"  # Even index = User, Odd index = Bot
        print(f"{sender}: {msg.content}")         # Print the message content

In [18]:
chat_with_pdf("Give me a summary of the document.")

  response = conversation({'question': question})


User: Give me a summary of the document.
Bot: The document provides a comprehensive overview of managing diabetes, a chronic condition that affects blood sugar processing. It outlines the two main types of diabetes: Type 1, which requires insulin, and Type 2, which may be managed with medications. Effective diabetes management is crucial to prevent serious complications and involves a combination of medical treatment, lifestyle changes, monitoring, and education.

Key components of management include maintaining a balanced diet rich in whole grains, lean proteins, vegetables, and healthy fats, as well as engaging in regular physical activity—aiming for at least 150 minutes of moderate exercise per week. Monitoring blood sugar levels is essential to understand how various factors affect glucose levels, and medications may be necessary for effective management.

The document emphasizes the importance of education, support groups, and a proactive approach to overall well-being in managing

In [19]:
chat_with_pdf("How does diet help in managing diabetes?") # change your question according to your own document

User: Give me a summary of the document.
Bot: The document provides a comprehensive overview of managing diabetes, a chronic condition that affects blood sugar processing. It outlines the two main types of diabetes: Type 1, which requires insulin, and Type 2, which may be managed with medications. Effective diabetes management is crucial to prevent serious complications and involves a combination of medical treatment, lifestyle changes, monitoring, and education.

Key components of management include maintaining a balanced diet rich in whole grains, lean proteins, vegetables, and healthy fats, as well as engaging in regular physical activity—aiming for at least 150 minutes of moderate exercise per week. Monitoring blood sugar levels is essential to understand how various factors affect glucose levels, and medications may be necessary for effective management.

The document emphasizes the importance of education, support groups, and a proactive approach to overall well-being in managing

In [20]:
chat_with_pdf("How does meditation help in diabetes?")

User: Give me a summary of the document.
Bot: The document provides a comprehensive overview of managing diabetes, a chronic condition that affects blood sugar processing. It outlines the two main types of diabetes: Type 1, which requires insulin, and Type 2, which may be managed with medications. Effective diabetes management is crucial to prevent serious complications and involves a combination of medical treatment, lifestyle changes, monitoring, and education.

Key components of management include maintaining a balanced diet rich in whole grains, lean proteins, vegetables, and healthy fats, as well as engaging in regular physical activity—aiming for at least 150 minutes of moderate exercise per week. Monitoring blood sugar levels is essential to understand how various factors affect glucose levels, and medications may be necessary for effective management.

The document emphasizes the importance of education, support groups, and a proactive approach to overall well-being in managing