In [None]:
# Show the kernel's current working directory.
%pwd

In [None]:
# Move into the sibling Data directory.
%cd ../Data

In [None]:
# Confirm that the directory change took effect.
%pwd

In [None]:
# Step back up to the parent directory, so the relative path 'Data' used by
# later cells resolves from here.
%cd ..


In [None]:
#pypdf loader:
# Used to load and extract text from a single PDF file. It converts each page of the PDF into a LangChain Document object with text and metadata.
#Directoryloader:
# load multiple files from a directory. 
# In this case, it scans a folder and applies PyPDFLoader to every PDF file found.
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader

# This class is used to split large text into smaller chunks.(essential in RAG (Retrieval-Augmented Generation) systems because LLMs have context length limits.)
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [None]:
#extract text from pdf files
# This function takes Data, which represents the path to a directory containing PDF files.
def load_pdf_files(Data):
    """Load the PDF files found in a directory as LangChain documents.

    Args:
        Data: Path of the directory to scan for PDF files.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for that directory: the
        documents produced by running ``PyPDFLoader`` on each matching file.
    """
    # Only entries matching "*.pdf" are picked up; each one is parsed by PyPDFLoader.
    pdf_loader = DirectoryLoader(Data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [None]:
# 'Data' is a relative path, so it is resolved against the kernel's current working directory.
extracted_data= load_pdf_files('Data')

In [None]:
# Preview only the first few loaded documents instead of printing every page.
extracted_data[:3]

In [None]:
# Number of Document objects returned by the loader.
len(extracted_data)

In [None]:
from typing import List #List: Used for type hints to indicate a list of items
from langchain_core.documents import Document #Document: LangChain’s standard object for storing text and metadata

#Takes a list of Document objects as input
#Returns a new list of Document objects
#The returned documents contain minimal metadata

def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    """
    Build a stripped-down copy of every document in ``docs``.

    Each returned Document keeps the original ``page_content`` and carries a
    metadata dict with the single key ``'source'`` (``None`` when the input
    document's metadata has no such key). The input documents are left
    untouched; new Document objects are created.
    """
    return [
        Document(
            page_content=original.page_content,
            # .get() yields None rather than raising when 'source' is absent.
            metadata={'source': original.metadata.get("source")},
        )
        for original in docs
    ]

In [None]:
# Keep only the page text and the 'source' metadata of each loaded document.
minimal_docs= filter_to_minimal_docs(extracted_data)

In [None]:
# Preview only the first few simplified documents instead of printing all of them.
minimal_docs[:3]

In [None]:
#chunking- split the docs into smaller chunks

def text_split(minimal_docs, chunk_size=500, chunk_overlap=20):
    """Split documents into smaller, slightly overlapping chunks.

    Args:
        minimal_docs: Documents to split; passed straight to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Upper bound on the size of each chunk, as measured by the
            splitter's length function (characters by default). Defaults to 500.
        chunk_overlap: Amount of text shared between consecutive chunks so
            that context carries across chunk boundaries. Defaults to 20.

    Returns:
        The list of chunked Document objects produced by the splitter.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

    texts_chunk = text_splitter.split_documents(minimal_docs)
    return texts_chunk

In [None]:
# Chunk the simplified documents and report how many chunks were produced.
texts_chunk= text_split(minimal_docs)
print(f'Number of chunks : {len(texts_chunk)}')

In [None]:
# Preview only the first few chunks instead of printing the whole list.
texts_chunk[:3]

In [None]:
#Embedding model - converts text into numeric vectors so that chunks can be compared by similarity in the vector store
from langchain_community.embeddings import HuggingFaceEmbeddings


def download_embedding(model_name='sentence-transformers/all-MiniLM-L6-v2'):
    '''
    Create and return a HuggingFace embeddings model.

    Args:
        model_name: Identifier of the embedding model to load. Defaults to
            'sentence-transformers/all-MiniLM-L6-v2'. NOTE: the vector index
            created later in this notebook uses a fixed dimension, so a model
            with a different output size needs a matching index dimension.

    Returns:
        The ``HuggingFaceEmbeddings`` instance configured with ``model_name``.
    '''
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    return embeddings

# Instantiate the embedding model once; later cells reuse this object.
embedding= download_embedding()

In [None]:
# Display the embedding object's representation.
embedding

In [None]:
# Embed a sample string to inspect what the model returns.
vector=embedding.embed_query("Hello world")
vector


In [None]:
# The vector length is the embedding dimension; the index dimension configured later must match it.
print("Vector length", len(vector))

In [None]:
#Now we will store vector embeddings to pinecone vector DB
#we will load env file where we stored API keys
from dotenv import load_dotenv
import os
# Load the variables defined in the .env file so they can be read with os.getenv below.
load_dotenv()


In [None]:
#To access this credentials:
def _require_env(name):
    """Return the value of environment variable ``name``.

    Raises:
        RuntimeError: if the variable is unset or empty, so a missing key is
            reported by name instead of failing later with an opaque error.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(
            f"Environment variable {name} is not set; add it to your .env file "
            "or export it before running this notebook."
        )
    return value


# Values returned by os.getenv already live in os.environ, so libraries that
# look these variables up themselves can see them without further assignment.
PINECONE_API_KEY = _require_env("PINECONE_API_KEY")
OPENAI_API_KEY = _require_env("OPENAI_API_KEY")


In [None]:
#Now we will import pinecone

from pinecone import Pinecone
# Alias for the API key value held in PINECONE_API_KEY.
Pinecone_api_key=PINECONE_API_KEY

# Create the Pinecone client, authenticating with the API key.
pc= Pinecone(api_key=Pinecone_api_key)


In [None]:
# Display the client object to confirm it was created.
pc

In [None]:
# create a Index/DB
from pinecone import Pinecone, ServerlessSpec #tells which cloud to use

# initialize client
# No api_key argument is passed here; presumably the client falls back to the
# PINECONE_API_KEY environment variable — TODO confirm against the Pinecone docs.
pc = Pinecone()

index_name = "medical-chatbot"

# Create the index only when it does not exist yet; otherwise reuse it as is.
if not pc.has_index(index_name):
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=index_name,
        dimension=384,  # must equal the length of the embedding vectors
        metric="cosine",
        spec=serverless_spec,
    )

# Handle to the index, whether it was just created or already present.
index = pc.Index(index_name)



In [None]:
#Now we will store our vectors
from langchain_pinecone import PineconeVectorStore

# Embedding and uploading every chunk on each run would store the same text
# again under new IDs, so retrieval would return duplicates. Upload only when
# the index is still empty; otherwise attach to the vectors already stored.
# To re-ingest changed PDFs, clear or delete the index first.
index_stats = index.describe_index_stats()
if index_stats.total_vector_count == 0:
    docsearch = PineconeVectorStore.from_documents(
        documents=texts_chunk,
        embedding=embedding,
        index_name=index_name,
    )
else:
    docsearch = PineconeVectorStore.from_existing_index(
        index_name=index_name,
        embedding=embedding,
    )

In [None]:
# Build one extra Document to add to the existing Pinecone index.
dswith=Document(
    page_content='learning a full chatbot project',
    metadata={'source':'youtube'}
)

In [None]:
# A fixed id makes this an overwrite of the same record on every run, instead
# of inserting one more copy of the demo document each time the cell executes.
docsearch.add_documents(documents=[dswith], ids=["demo-youtube-note"])

In [None]:
# Wrap the vector store in a retriever that performs similarity search.
retriever = docsearch.as_retriever(
    search_type='similarity',
    search_kwargs={'k': 3} # return the 3 most similar chunks per query
)


In [None]:
# Smoke-test the retriever on a sample question and inspect what comes back.
retrieved_docs= retriever.invoke("What is acne?")
retrieved_docs

In [None]:
#connect LLM
from langchain_openai import ChatOpenAI

# Chat model used to generate answers; only the model name is configured here.
chatmodel=ChatOpenAI(
    model='gpt-4o'

)


In [None]:
#import some necessary library
from langchain.chains import create_retrieval_chain #Connects Retriever → LLM
from langchain.chains.combine_documents import create_stuff_documents_chain #Combines retrieved documents into one prompt
from langchain_core.prompts import ChatPromptTemplate #how context + question are given to the LLM

In [None]:
#systemprompt--Defines rules for the LLM
#Tells the model:Use only retrieved context,Don’t hallucinate,Be concise
# Adjacent string literals are concatenated with nothing in between, so each
# fragment ends with its own trailing space to keep the words apart.
system_prompt=(
    'You are a Medical assistant for question-answering tasks. '
    'Use the following pieces of retrieved context to answer '
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the answer concise."
    '\n\n'
    "{context}"  # placeholder filled with the retrieved chunks
)


# Two-message chat template: the system message carries the rules plus the
# {context} placeholder, the human message carries the {input} placeholder.
prompt= ChatPromptTemplate.from_messages(
    [
        ("system",system_prompt),# rules + context
        ("human","{input}") # user's question
    ]
)

#User Question
    #  ↓
# Retriever (Pinecone)
    #  ↓
# Top-K Relevant Chunks
    #  ↓
# Prompt (system + context + question)
    #  ↓
# LLM Answer


In [None]:
# Chain that feeds documents and the prompt to the chat model.
question_answer_chain=create_stuff_documents_chain(chatmodel,prompt)  # chatmodel: LLM, prompt: system + human template
# Full pipeline: the retriever supplies documents to the question-answer chain.
rag_chain=create_retrieval_chain(retriever,question_answer_chain)

In [None]:
# Ask a question through the chain; the result is indexed by "answer" to print the reply.
response=rag_chain.invoke({"input":"what is Acromegaly and gigantism?"})
print(response["answer"])