# Install Required libraries

#### langchina, langchain_openai, langchain_community : core langchain libraries for buidling RAG based application that includes document loader, text splitter, create embeddings, create and store vector db, initiate and connect LLM, provide document chain and rag chain .

#### openAI : official python library for openAI . used by langchain for interacting with openAI LLMs

#### pinecone-client : official pinecone library for interacting with vector db



In [None]:
!pip install --quiet langchain langchain-openai langchain-community openai  pinecone-client pypdf python-dotenv "unstructured[md,txt,pdf]"

In [19]:
!pip install gradio_client==0.2.10
!pip install gradio==3.38.0

Collecting gradio_client==0.2.10
  Downloading gradio_client-0.2.10-py3-none-any.whl.metadata (7.1 kB)
Collecting websockets<12.0,>=10.0 (from gradio_client==0.2.10)
  Downloading websockets-11.0.3-py3-none-any.whl.metadata (6.6 kB)
Downloading gradio_client-0.2.10-py3-none-any.whl (288 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.0/289.0 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading websockets-11.0.3-py3-none-any.whl (118 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.1/118.1 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: websockets, gradio_client
  Attempting uninstall: websockets
    Found existing installation: websockets 15.0.1
    Uninstalling websockets-15.0.1:
      Successfully uninstalled websockets-15.0.1
  Attempting uninstall: gradio_client
    Found existing installation: gradio_client 1.13.3
    Uninstalling gradio_client-1.13.3:
      Successfully uninstalled gradio_cli

In [4]:
!pip install --quiet langchain-pinecone

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/587.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m583.7/587.6 kB[0m [31m32.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.6/587.6 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/259.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m259.3/259.3 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/65.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.5/65.5 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [5]:
!pip install --quiet pinecone

# Load Environment Variables and Initialize core components

In [10]:
import os
import getpass
from langchain_openai import ChatOpenAI,OpenAIEmbeddings
from langchain_community.document_loaders import PyPDFLoader , TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Pinecone as langChainPineCone
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.prompts import PromptTemplate

from pinecone import Pinecone, ServerlessSpec

OPENAI_API_KEY = getpass.getpass('OpenAI API Key:')
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

PINECONE_API_KEY = getpass.getpass('Pinecone API Key:')
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY



text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,      # Max characters per chunk
    chunk_overlap=200,    # Characters of overlap between chunks
    length_function=len   # How to measure chunk size (using len() for characters)
)

# Initialize the pineconeClient
pc = Pinecone(api_key=PINECONE_API_KEY)

# Your index name
INDEX_NAME = "smart-study-buddy-index"

# 2. Get the embedding dimension
# OpenAI provides: small = 1536, large = 3072
#OpenAI text-embedding-3-small → 1,536-dimensional embeddings
#OpenAI text-embedding-3-large → 3,072-dimensional embeddings

EMBEDDING_DIMENSION = 1536
METRIC = "cosine"

embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# 3. Create the index (your snippet)
try:
     # Check if the index already exists
    existing_indexes = pc.list_indexes()
    print(f"Existing Indexes: {existing_indexes}")

    if INDEX_NAME not in [index.name for index in existing_indexes]:
        print(f"Index '{INDEX_NAME}' does not exist. Creating new index...")
        pc.create_index(
          name=INDEX_NAME,
          dimension=EMBEDDING_DIMENSION,
          metric=METRIC,
          spec=ServerlessSpec(
                cloud="aws",        # or "gcp"
                region="us-east-1"  # pick the same region as your Pinecone project
            )
       )
        print(f"Successfully created new index: '{INDEX_NAME}' with dimension {EMBEDDING_DIMENSION} and metric '{METRIC}'.")
    else:
        print(f"Using existing index: '{INDEX_NAME}'")
    # Connect to the index (this is more for direct operations, LangChain will also connect)
    index = pc.Index(INDEX_NAME)
    print(f"Successfully connected to index '{INDEX_NAME}'.")
    print(f"Index stats: {index.describe_index_stats()}")

except Exception as e:
   print(f"Error creating/connecting to Pinecone index '{INDEX_NAME}': {str(e)}")





OpenAI API Key:··········
Pinecone API Key:··········
Existing Indexes: [{
    "name": "my-embedding-index",
    "metric": "cosine",
    "host": "my-embedding-index-czeoi6f.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 1536,
    "deletion_protection": "disabled",
    "tags": null
}]
Index 'smart-study-buddy-index' does not exist. Creating new index...
Successfully created new index: 'smart-study-buddy-index' with dimension 1536 and metric 'cosine'.
Successfully connected to index 'smart-study-buddy-index'.
Index stats: {'dimension': 1536,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {},
 'total_vector_count': 0,
 'vector_type': 'dense'}


## Load Documents and Chunking

In [11]:
import os

# Load Documents and Chunk them
#4. Data Ingestion & Processing:

SMART_STUDY_BUDDY_DIR = "./smart_study_buddy_dir"

# Create directory if it doesn't exist (and inform the user)
if not os.path.exists(SMART_STUDY_BUDDY_DIR):
    os.makedirs(SMART_STUDY_BUDDY_DIR)
    print(f"Created input directory: {SMART_STUDY_BUDDY_DIR}. Please add your sample lecture note files (.txt, .pdf) to this folder.")
elif not os.listdir(SMART_STUDY_BUDDY_DIR):
    print(f"Input directory {SMART_STUDY_BUDDY_DIR} is empty. Please add your sample lecture notes to this folder for the Study Buddy to work.")


all_documents = []
if(os.path.exists(SMART_STUDY_BUDDY_DIR) and os.listdir(SMART_STUDY_BUDDY_DIR)):
  for filename in os.listdir(SMART_STUDY_BUDDY_DIR):
    print(f"loading the document {filename}")
    file_path = os.path.join(SMART_STUDY_BUDDY_DIR, filename)
    try:
      if(filename.endswith(".pdf")):
        loader = PyPDFLoader(file_path)
        loader_docs = loader.load()
        print(f"Loaded PDF : {filename} , with number of pages : {len(loader_docs)}")
        all_documents.extend(loader_docs)
      elif(filename.endswith(".txt")):
        loader = TextLoader(file_path)
        loader_docs = loader.load()
        print(f"Loaded TXT : {filename} , with number of pages : {len(loader_docs)}")
        all_documents.extend(loader_docs)
      else:
        print(f"-- Skipped unsupported file: {filename}")
        continue
    except Exception as e:
            print(f"Error loading file {filename}: {e}")

if all_documents:
  print(f"Total number of documents loaded : {len(all_documents)}")
else:
  print("No processable documents were found. ")


# Chunking : Split the documents into chunking

chunks = []

if all_documents:
  chunks = text_splitter.split_documents(all_documents)
  print(f"Total number of chunks : {len(chunks)}")
else:
  print("No processable documents were found. ")



loading the document Software_Design_Patterns_for_AI-Systems.pdf
Loaded PDF : Software_Design_Patterns_for_AI-Systems.pdf , with number of pages : 7
loading the document nosqldb.pdf
Loaded PDF : nosqldb.pdf , with number of pages : 29
loading the document history_lecture_1.txt
Loaded TXT : history_lecture_1.txt , with number of pages : 1
Total number of documents loaded : 37
Total number of chunks : 52


# Initialize Vector Store

In [13]:
from langchain_pinecone import PineconeVectorStore
import time

vectorstore = None

if chunks:
  print(f"Generating Embeddings {len(chunks)} chunks and storing them at  pinecone Index: {INDEX_NAME}")
  try:
    vectorstore = PineconeVectorStore.from_documents(
        documents=chunks,
        embedding=embeddings,
        index_name=INDEX_NAME,
        text_key="text"
    )
    print(f"Successfully stored embeddings in Pinecone index '{INDEX_NAME}'.")
    # It might take a few moments for Pinecone's stats to update after upserting.
    time.sleep(10) # Give Pinecone a moment to update stats
    stats = pc.Index(INDEX_NAME).describe_index_stats()
    print(f"\nUpdated Index Statistics for '{INDEX_NAME}':")
    print(f"Total vectors: {stats.total_vector_count}")
    print(f"Namespaces: {stats.namespaces}")

  except Exception as e:
   print(f"Error creating/connecting to Pinecone index '{INDEX_NAME}': {str(e)}")
else:
    print("\nNo chunks were created from documents. Trying to connect to an existing Pinecone index for Q&A...")
    try:
      vectorstore = PineconeVectorStore.from_existing_index(
        embedding=embeddings,
        index_name=INDEX)
    except Exception as e:
      print(f"Error creating/connecting to Pinecone index '{INDEX_NAME}': {str(e)}")



Generating Embeddings 52 chunks and storing them at  pinecone Index: smart-study-buddy-index
Successfully stored embeddings in Pinecone index 'smart-study-buddy-index'.

Updated Index Statistics for 'smart-study-buddy-index':
Total vectors: 104
Namespaces: {'': {'vector_count': 104}}


# Building Brain : Question and Answering with Session Memory

In [15]:
qa_chain = None

if vectorstore:
  # initialize LLM
  llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

  print(f"llm initialized with model name : {llm.model_name}")

  #Create Retriever from pinecode vector store
  retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 3})

  print(f"retriver created for pinecone index : {INDEX_NAME}. And retrieveing top 3 chunks")

  prompt_template_str = """
    You are a helpful AI Smart Study Buddy. Use the following pieces of context from lecture notes and the chat history to answer the question at the end.
    Your goal is to answer the user's question based *only* on the provided lecture notes context.
    Do not use any external knowledge or make up information.
    If the answer to the question cannot be found in the provided context, clearly state "I'm sorry, but I couldn't find information about that in your lecture notes."
    If the context is empty or irrelevant to the question, also state that you cannot find the answer in the notes.

    Context from lecture notes:
    {context}

    Chat History:
    {chat_history}

    Question: {question}
    Helpful Answer from your lecture notes:
    """
  qa_prompt = PromptTemplate(
        input_variables=["context", "chat_history", "question"],
        template=prompt_template_str)

  print("prompt template defined")

  #Initialize Conversation Memory

  memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True,output_key='answer')

  #Conversation Memory Initialized
  print("Conversation Memory Initialized")

  qa_chain = ConversationalRetrievalChain.from_llm(llm=llm,
                                                    retriever=retriever,
                                                    memory=memory,
                                                    combine_docs_chain_kwargs={"prompt": qa_prompt},
                                                    verbose=True,
                                                   return_source_documents=True)

  print("Conversation QA Chain Initialized")
else:
  print("Vectore store for pinecone not available. skipping conversation chain setup")



llm initialized with model name : gpt-3.5-turbo
retriver created for pinecone index : smart-study-buddy-index. And retrieveing top 3 chunks
prompt template defined
Conversation Memory Initialized
Conversation QA Chain Initialized


  memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True,output_key='answer')


#Chating with your Smart Studdy Buddy

In [2]:
def chat_with_buddy(query:str):
  global chat_history  # uses the chat history managed by memory
  if qa_chain:

    try:
      print(f" User question : {query}")
      result = qa_chain.invoke({"question": query}) # ChatHistory implicitly managed by memory object
      answer = result['answer']
      print(f"Smart Study Buddy Answer : {answer}")

      #Soruce Documents
      print("\nSource Documents:")
      if result.get('source_documents'):
        for i,doc in enumerate(result['source_documents']):
          source_name = doc.metadata.get('source', 'Unknown source')
           # Truncate page_content for display
          content_preview = doc.page_content.replace('\n', ' ').strip()[:150]
          print(f"  {i+1}. Source: {source_name}\n     Content Preview: '{content_preview}...'")
      else:
          print("  No specific source documents were heavily relied upon or returned by the retriever.")

      return answer
    except Exception as e:
      print(f"Error in chat_with_buddy: {e}")
  else:
     print("QA chain is not initialized. Cannot process query. Please check previous steps.")
     return "Error: QA chain not set up."


In [None]:
# --- Test Scenarios ---

if qa_chain:
  #starting chat session with smart studdy buddy
  question1 = "What are common themes in Shakespearean tragedies?"
  response1 = chat_with_buddy(question1)
  print("\n")

  if response1 and "Error:" not in response1  and "I am sorry" not in response1:
    print(response1)
    print("\n")
    question2 = "Can you name a key character in Hamlet?"
    response2 = chat_with_buddy(question2)

    print("Question 2: \n")
    print(response2)
    print("\n")
  else:
     print("\nSkipping chat example as the QA chain is not set up. Please check previous steps.")

In [None]:
import gradio as gr

demo = gr.ChatInterface(chat_with_buddy,
                        chatbot=gr.Chatbot(height=200),
                        textbox=gr.Textbox(placeholder="Hi I am your Smart Study Buddy, How I can help you today?", container=False, scale=7),
                        title="Smart Study Buddy",
                        theme="soft",
                        examples=["What are common themes in Shakespearean tragedies?"],
                        retry_btn=None,
                        undo_btn="Delete Previous")

demo.launch(share=True, debug=True)



IMPORTANT: You are using gradio version 3.38.0, however version 4.44.1 is available, please upgrade.
--------
Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://faefad54b0c12dfc07.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)
