In [None]:
import os
# Move the working directory up one level so that later relative paths
# (e.g. the "data" folder passed to load_pdf_files) resolve against the parent folder.
# NOTE: the path is relative to the current directory, so re-running this cell
# moves up one more level each time; run it once per kernel session.
os.chdir("../")

In [None]:
# Display the current working directory to confirm the chdir above took effect.
%pwd

In [None]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
def load_pdf_files(data):
    """Load the PDF files found under the `data` directory.

    A DirectoryLoader is configured with the "*.pdf" glob and PyPDFLoader as
    the per-file loader class; the result of its ``load()`` call is returned
    unchanged.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [None]:
# Load the PDFs from the "data" folder, resolved relative to the current working directory.
extracted_data = load_pdf_files("data")

In [None]:
from typing import List
from langchain.schema import Document

def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    """Return new documents that keep only the text and the "source" metadata.

    Each input document is rebuilt with the same ``page_content`` and a
    metadata dict containing a single "source" key. Documents whose metadata
    has no "source" entry end up with ``{"source": None}``.
    """
    return [
        Document(
            page_content=original.page_content,
            metadata={"source": original.metadata.get("source")},
        )
        for original in docs
    ]

In [None]:
# Strip every metadata field except "source" from the loaded documents.
minimal_docs = filter_to_minimal_docs(extracted_data)

In [None]:
# Split the documents into smaller chunks
def text_split(minimal_docs, chunk_size=500, chunk_overlap=20):
    """Split documents into smaller, slightly overlapping chunks.

    Args:
        minimal_docs: Documents to split; passed straight to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
            Defaults to 500, the value this notebook originally hard-coded.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.
            Defaults to 20, the value this notebook originally hard-coded.

    Returns:
        The chunks returned by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(minimal_docs)

In [None]:
# Chunk the trimmed documents; these chunks are what gets embedded and indexed later.
texts_chunk = text_split(minimal_docs)


In [None]:
from langchain.embeddings import HuggingFaceEmbeddings

def download_embeddings():
    """
    Build and return the HuggingFace embeddings object for the
    "sentence-transformers/all-MiniLM-L6-v2" model.
    """
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

# Create the embeddings object once; it is reused for both indexing and querying below.
embedding = download_embeddings()

In [24]:
from dotenv import load_dotenv
import os
# Load environment variables (the API keys read in the next cell) via python-dotenv.
# NOTE(review): presumably reads a .env file from the working directory or a parent — confirm its location.
load_dotenv()

True

In [26]:
# Read the API keys from the process environment.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

# Fail fast with a readable message when a key is absent. Without this check,
# assigning None into os.environ raises "TypeError: str expected, not NoneType",
# which does not say which variable is missing.
_missing_keys = [
    key_name
    for key_name, key_value in (
        ("PINECONE_API_KEY", PINECONE_API_KEY),
        ("GOOGLE_API_KEY", GOOGLE_API_KEY),
    )
    if not key_value
]
if _missing_keys:
    raise EnvironmentError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_keys)
        + ". Add them to your .env file or export them before running this cell."
    )

# Re-export the keys so libraries that read them from the environment can find them.
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY


In [27]:
from pinecone import Pinecone
# Alias of the key read from the environment in the previous cell.
pinecone_api_key = PINECONE_API_KEY

# Pinecone client used below to check for, create and open the index.
pc = Pinecone(api_key=pinecone_api_key)

In [None]:
from pinecone import ServerlessSpec 

# Name of the Pinecone index used throughout the rest of the notebook.
index_name = "medical-chatbot"

# Create the index only when it does not exist yet, so re-running this cell
# does not attempt a second creation.
if not pc.has_index(index_name):
    pc.create_index(
        name = index_name,
        dimension=384,  # Dimension of the embeddings; NOTE(review): must equal the embedding model's output size — confirm if the model changes
        metric= "cosine",  # Cosine similarity
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )


# Handle to the index; not referenced again in the visible cells.
index = pc.Index(index_name)

In [None]:
from langchain_pinecone import PineconeVectorStore

# Embed the chunks with `embedding` and store them in the index named `index_name`.
# NOTE(review): re-running this cell presumably uploads the same chunks again —
# verify whether that creates duplicate vectors before executing it twice.
docsearch = PineconeVectorStore.from_documents(
    documents=texts_chunk,
    embedding=embedding,
    index_name=index_name
)

In [None]:
from langchain_pinecone import PineconeVectorStore
# Open the existing index as a vector store without passing any documents.
# This rebinds `docsearch`, replacing the value created by the from_documents cell.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embedding
)

In [None]:
# Similarity-search retriever configured with k=3 (three results per query).
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [None]:
# Sanity check: run one sample query through the retriever and display what comes back.
retrieved_docs = retriever.invoke("What is acne?")
retrieved_docs

In [29]:
from langchain_google_genai import ChatGoogleGenerativeAI

# Gemini chat model used to generate answers; the API key is read from the
# GOOGLE_API_KEY environment variable.
chatModel = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    google_api_key=os.getenv("GOOGLE_API_KEY")
)

In [31]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [None]:
# System prompt for the medical assistant. "{context}" is a template
# placeholder filled in by the prompt template; keep it as the only brace
# expression in this string.
# The guidelines are numbered consecutively and the fallback sentence in
# guideline 2 is set off with a colon so the model can quote it verbatim.
system_prompt = (
    "You are a helpful medical assistant providing accurate information based on medical documents. "
    "Your role is to answer health-related questions using the context provided from medical literature.\n\n"
    "Guidelines:\n"
    "1. Answer questions accurately based ONLY on the provided context\n"
    "2. If the information is not in the context, reply with exactly this and nothing else: 'I don't have this information.'\n"
    "3. Provide clear, easy-to-understand explanations\n"
    "4. Include relevant medical terms but explain them in simple language\n"
    "5. Never provide emergency medical advice - direct users to seek immediate medical attention for emergencies\n"
    "6. Be empathetic and professional in your responses\n"
    "7. Do not answer in more than 3 sentences\n\n"
    "Context: {context}\n\n"
)

# Two-message chat prompt: the system message carries the guidelines and the
# {context} placeholder, the human message carries the user's question ({input}).
prompt = ChatPromptTemplate.from_messages(
    [
        ("system",system_prompt),
        ("human", "{input}"),
    ]
)

In [33]:
# Build the answering chain from the chat model and the prompt, then wrap it
# with the retriever so a single call performs retrieval followed by generation.
question_answer_chain = create_stuff_documents_chain(chatModel,prompt)
rag_chain = create_retrieval_chain(retriever,question_answer_chain)

In [36]:
# Run the full chain on a sample question and print only the "answer" field of the result.
response = rag_chain.invoke({"input": "what is Acromegaly and gigantism?"})
print(response["answer"])

Acromegaly and gigantism are disorders caused by the abnormal release of a chemical from the pituitary gland in the brain, leading to increased growth in bone and soft tissue, among other bodily disturbances.

Here's the distinction between the two:
*   **Gigantism:** Occurs when this abnormality happens *before* bone growth stops, causing an individual to grow to an unusual height.
*   **Acromegaly:** Occurs when this abnormality happens *after* bone growth stops.

Acromegaly is a relatively rare disorder, affecting about 50 out of every one million people. It affects both men and women, and due to the gradual onset of symptoms, diagnosis is often delayed until middle age.

**Please remember to consult a healthcare professional for diagnosis and treatment of any medical condition.**
