In [92]:
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader


In [99]:
%pwd

'c:\\Medibot\\Medibot'

In [108]:
import os
# NOTE(review): this is a relative hop, so where it lands depends on the kernel's
# working directory at the time it runs. The execution counts show the later
# `os.chdir("../")` cell (In [104]) was run before this one (In [108]), so a
# top-to-bottom run may not end up in the directory the recorded %pwd output
# shows — TODO confirm and replace with a fixed project path.
os.chdir('../Medibot/Medibot')

In [109]:
%pwd

'c:\\Medibot\\Medibot'

In [104]:
# Move the working directory up one level.
# NOTE(review): this cell carries an earlier execution count (In [104]) than the
# cells above it, so in file order it runs after them and shifts the directory
# that the relative "data" path below resolves against — TODO confirm it is still needed.
os.chdir("../")

In [110]:
def get_file(data):
    """Load the PDF files found in a directory.

    Args:
        data: Directory path handed to ``DirectoryLoader``; files are matched
            with the ``*.pdf`` glob and each one is read with ``PyPDFLoader``.

    Returns:
        Whatever ``DirectoryLoader.load()`` produces for the matched files.
    """
    return DirectoryLoader(data, loader_cls=PyPDFLoader, glob='*.pdf').load()


In [111]:
# "data" is a relative path, so it resolves against the current working
# directory (which the os.chdir cells above change).
extracted_data = get_file("data")

In [112]:
from typing import List
from langchain.schema import Document

def filter_data(docs: List[Document]) -> List[Document]:
    """Return copies of ``docs`` whose metadata holds only the ``source`` key.

    Args:
        docs: Documents to copy.

    Returns:
        A new list, in the same order, where each ``Document`` keeps the
        original ``page_content`` and a metadata dict of the form
        ``{"source": ...}``. The value is ``None`` when the input document
        has no ``"source"`` entry, because it is read with ``dict.get``.
    """
    return [
        Document(
            page_content=page.page_content,
            metadata={"source": page.metadata.get("source")},
        )
        for page in docs
    ]



In [113]:
# Reduce every loaded document's metadata to just its "source" entry.
filtered_data = filter_data(extracted_data)

In [114]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

def chunk_docs(
    filtered_data: List[Document],
    chunk_size: int = 1000,
    chunk_overlap: int = 100,
) -> List[Document]:
    """Split documents into overlapping chunks.

    Args:
        filtered_data: Documents to split.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``. The default of 1000 is the value this function
            previously hard-coded.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``. The default of 100 is the value this function
            previously hard-coded.

    Returns:
        The result of ``splitter.split_documents(filtered_data)``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(filtered_data)

In [115]:
# Split the source-only documents using chunk_docs' built-in size/overlap settings.
chunked_docs = chunk_docs(filtered_data)

In [116]:
from dotenv import load_dotenv
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_huggingface import HuggingFaceEndpointEmbeddings
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
import os

# load_dotenv must be imported in this cell: the only other import of it sits in
# a later cell of the notebook, so a top-to-bottom run on a fresh kernel would
# raise NameError here without it.
load_dotenv()

True

In [117]:
# Preview only the first few chunks; displaying the whole list floods the
# notebook output and bloats the saved file.
chunked_docs[:3]

[Document(metadata={'source': 'data\\Medical_book.pdf'}, page_content='The GALE\nENCYCLOPEDIA\nof MEDICINE\nSECOND EDITION'),
 Document(metadata={'source': 'data\\Medical_book.pdf'}, page_content='The GALE\nENCYCLOPEDIA\nof MEDICINE\nSECOND EDITION\nJACQUELINE L. LONGE, EDITOR\nDEIRDRE S. BLANCHFIELD, ASSOCIATE EDITOR\nVOLUME\nA-B\n1'),
 Document(metadata={'source': 'data\\Medical_book.pdf'}, page_content='STAFF\nJacqueline L. Longe, Project Editor\nDeirdre S. Blanchfield, Associate Editor\nChristine B. Jeryan, Managing Editor\nDonna Olendorf, Senior Editor\nStacey Blachford, Associate Editor\nKate Kretschmann, Melissa C. McDade, Ryan\nThomason, Assistant Editors\nMark Springer, Technical Specialist\nAndrea Lopeman, Programmer/Analyst\nBarbara J. Yarrow,Manager, Imaging and Multimedia\nContent\nRobyn V . Young,Project Manager, Imaging and\nMultimedia Content\nDean Dauphinais, Senior Editor, Imaging and\nMultimedia Content\nKelly A. Quin, Editor, Imaging and Multimedia Content\nLeitha E

In [121]:
def download_embedding():
    """Build the embeddings object used by the rest of the notebook.

    Constructs a ``HuggingFaceEndpointEmbeddings`` for the
    ``sentence-transformers/all-MiniLM-L6-v2`` model, passing the value of the
    ``HUGGINGFACE_KEY`` environment variable as the API token
    (``os.getenv`` yields ``None`` when that variable is unset).

    NOTE(review): despite the function name, nothing in this body downloads a
    model; the endpoint class presumably calls a hosted service — confirm.

    Returns:
        The configured ``HuggingFaceEndpointEmbeddings`` instance.
    """
    return HuggingFaceEndpointEmbeddings(
        huggingfacehub_api_token=os.getenv("HUGGINGFACE_KEY"),
        model="sentence-transformers/all-MiniLM-L6-v2",
    )

In [122]:
# Create the shared embeddings object; it is reused below for the smoke test
# and for populating the vector store.
embedding_model = download_embedding()

In [123]:
# Smoke test: embed a single short string and display the resulting vector.
# NOTE(review): the vector length presumably has to equal the `dimension`
# given when the Pinecone index is created (384) — confirm.
embedding_model.embed_query("hello")

[-0.06277177482843399,
 0.05495882034301758,
 0.05216483026742935,
 0.08579004555940628,
 -0.08274892717599869,
 -0.07457299530506134,
 0.06855473667383194,
 0.01839638315141201,
 -0.08201133459806442,
 -0.03738482668995857,
 0.012124920263886452,
 0.0035183574073016644,
 -0.004134318325668573,
 -0.04378440976142883,
 0.021807337179780006,
 -0.0051026977598667145,
 0.01954660564661026,
 -0.042348772287368774,
 -0.11035966128110886,
 0.00542446319013834,
 -0.055734872817993164,
 0.028052417561411858,
 -0.023158689960837364,
 0.028481412678956985,
 -0.05370965600013733,
 -0.052601635456085205,
 0.033939313143491745,
 0.04538863152265549,
 0.02371848188340664,
 -0.07312082499265671,
 0.05477772653102875,
 0.01704731211066246,
 0.0813603550195694,
 -0.0028627251740545034,
 0.011958086863160133,
 0.07355855405330658,
 -0.09423749893903732,
 -0.0813620388507843,
 0.04001541808247566,
 0.0006921631866134703,
 -0.013393236324191093,
 -0.05453800782561302,
 0.0051514627411961555,
 -0.0261397957

In [14]:
from dotenv import load_dotenv
import os
load_dotenv()


True

In [124]:
# Read the Pinecone key from the process environment; os.getenv returns None
# when the variable is unset. Presumably populated from a .env file by the
# load_dotenv() call — verify.
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')

In [125]:
from pinecone import Pinecone
from pinecone import ServerlessSpec

# Name of the index; reused by the cells that open the index and upload documents.
index_name = 'medical-chatbot'

# Client authenticated with the key read from the environment.
pc = Pinecone(api_key=PINECONE_API_KEY)

# Create the index only when it does not exist yet, so re-running this cell
# does not attempt to create it a second time.
if not pc.has_index(index_name):
    pc.create_index(
        name = index_name,
        # NOTE(review): 384 presumably matches the output size of the
        # sentence-transformers/all-MiniLM-L6-v2 embeddings used above — confirm
        # before switching embedding models.
        dimension=384,
        metric='cosine',
        spec=ServerlessSpec(cloud='aws',region='us-east-1')
    )



In [126]:
# Handle to the index named above. No later cell visible in this notebook reads
# `index`; the vector store below is addressed by index_name instead.
index = pc.Index(name=index_name)


In [127]:
from langchain_pinecone import PineconeVectorStore

# Build the vector store from every chunk, using embedding_model and targeting
# the index called index_name.
# NOTE(review): this presumably embeds and uploads all chunks each time the cell
# runs — verify whether re-running adds duplicate vectors, and consider
# connecting to the existing index instead once it is populated.
docsearch = PineconeVectorStore.from_documents(documents=chunked_docs,index_name=index_name,embedding=embedding_model)

In [22]:
# Similarity-search retriever over the vector store, configured with k=5.
retriever = docsearch.as_retriever(search_type='similarity',search_kwargs={"k":5})

In [23]:
# Manual check of the retriever with a sample question before wiring it into a chain.
retrieved_docs = retriever.invoke("what is acne?")

In [24]:
retrieved_docs

[Document(id='10b12e34-e25b-47fb-b796-e7f9aee379fd', metadata={'source': 'data\\Medical_book.pdf'}, page_content='Isotretinoin (Accutane) is prescribed only for very\nsevere, disfiguring acne.\nAcne is a skin condition that occurs when pores or\nhair follicles become blocked. This allows a waxy\nmaterial, sebum, to collect inside the pores or follicles.\nNormally, sebum flows out onto the skin and hair to\nform a protective coating, but when it cannot get out,\nsmall swellings develop on the skin surface. Bacteria\nand dead skin cells can also collect that can cause\ninflammation. Swellings that are small and not\ninflamed are whiteheads or blackheads. When they\nbecome inflamed, they turn into pimples. Pimples that\nfill with pus are called pustules.\nAcne cannot be cured, but acne drugs can help clear\nthe skin. Benzoyl peroxide and tretinoin work by mildly\nirritating the skin. This encourages skin cells to slough\noff, which helps open blocked pores. Benzoyl peroxide\nalso kills ba

In [28]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [29]:
from langchain_core.prompts import ChatPromptTemplate

# System instructions that restrict the assistant to the supplied context.
# {context} is a template placeholder left unformatted here.
# Note: both `system_prompt` and `prompt` are assigned again by a later cell in
# this notebook, so this version is replaced before the final chain is built.
system_prompt = (
    "You are a helpful medical assistant. Use ONLY the following context to answer. "
    "If the answer is not in the context, say \"I don't know based on the provided data.\""
    "\n\nContext:\n{context}\n\n"
    "Answer in a clear, medically accurate way."
)

# Two-message chat template: the system instructions above, then the user's
# question supplied through the {input} placeholder.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)


In [30]:
import os
from langchain_google_genai import ChatGoogleGenerativeAI

# `chatModel` was used here without being assigned in any earlier cell, so this
# cell raised NameError on a fresh kernel. Build it here with the same model
# name the later model cell uses.
# NOTE(review): the environment variable name GEMINI_API_KEY is assumed from the
# variable name used in the later model cell — confirm against your .env file.
chatModel = ChatGoogleGenerativeAI(model='gemini-2.5-pro', api_key=os.getenv('GEMINI_API_KEY'))

# Combine the retrieved documents into the prompt, then put the retriever in front.
question_answer_chain = create_stuff_documents_chain(chatModel, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [46]:
from langchain_google_genai import ChatGoogleGenerativeAI

In [50]:
# GEMINI_API_KEY is not assigned in any earlier cell of this notebook, so read it
# from the environment the same way PINECONE_API_KEY is read.
# NOTE(review): the environment variable name is assumed to match the Python
# variable name — confirm against your .env file.
GEMINI_API_KEY = os.getenv('GEMINI_API_KEY')
model = ChatGoogleGenerativeAI(model='gemini-2.5-pro',api_key=GEMINI_API_KEY)

In [51]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [52]:
# System instructions for the question-answering chain: answer from the
# retrieved context, admit when the answer is unknown, and stay within three
# sentences. {context} is a template placeholder left unformatted here.
system_prompt = (
    "You are an Medical assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)


# Two-message chat template: the system instructions above, then the user's
# question supplied through the {input} placeholder.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [53]:
# Chain that feeds documents plus the prompt to `model`.
question_answer_chain = create_stuff_documents_chain(model, prompt)
# Full pipeline: `retriever` first, then the question-answer chain above.
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [54]:
# Ask a sample question; the chain takes the question under the "input" key.
response = rag_chain.invoke({"input": "what is Acromegaly and gigantism?"})
# Only the generated text, stored under "answer", is printed.
print(response["answer"])

Acromegaly is a disorder caused by the abnormal release of growth hormone from the pituitary gland after bone growth has stopped, leading to increased growth in bone and soft tissue. When this same hormonal abnormality occurs in children whose bones are still growing, it is called gigantism, which causes unusual height. Both men and women can be affected by these conditions, which are often diagnosed in middle age due to their gradual onset.
