In [7]:
%pwd

'f:\\ML\\SAMPLEPROJECTS\\GEN AI PROJECTS\\Medical Chatbot\\Gen-AI-Medical-Chatbot\\research'

In [8]:
import os

# The notebook starts inside the `research/` sub-folder (see the %pwd output),
# while relative paths such as 'Data/' are resolved from the project root.
# Only step up when we are still inside `research/`, so that re-running this
# cell does not climb one directory higher on every execution.
if os.path.basename(os.getcwd()).lower() == "research":
    os.chdir("../")

In [49]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from PyPDF2 import PdfReader
from langchain.schema import Document

In [60]:
#Extract Data From the PDF File
def load_pdf_file(data):
    """Load the files matching ``*.pdf`` found in the ``data`` directory.

    Args:
        data: Directory path handed to ``DirectoryLoader``.

    Returns:
        The documents produced by ``DirectoryLoader.load()``, with each
        matching file read through ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [61]:
# Load the PDFs from the `Data/` folder; the path is relative to the current
# working directory, which the earlier os.chdir cell moves out of `research/`.
extracted_data = load_pdf_file('Data/')

In [62]:
# Number of Document objects returned by the loader.
print(len(extracted_data))

67


In [63]:
# Preview only the first few documents: echoing the whole list would write the
# full text of every loaded document into the notebook output.
extracted_data[:3]

[Document(metadata={'source': 'Data\\Chapter-01.indd.pdf', 'page': 0}, page_content='CHAPTER   1\nCOMMON DISEASES\nACUTE FEVER\nThe overall mean oral temperature for healthy adult individuals is 36.8 + 0.4ºC, with \na nadir at 6 AM and a peak at 4-6 PM. A morning temperature of greater than 37.2ºC \nand an evening temperature of greater than 37.7ºC is often considered as fever. Fever \nmay be continuous, intermittent or remittent. However, with frequent self-medication \nwith antipyretics, classic patterns are not generally seen.\nDiagnosis\nIt is important to work towards ﬁ  nding the cause of fever. A meticulous history of \nchronology of symptoms, any associated focal symptom(s), exposure to infectious \nagents and occupational history may be useful. A thorough physical examination \nrepeated on a regular basis may provide potentially diagnostic clues such as rash, \nlymphadenopathy, hepatomegaly, splenomegaly, abdominal tenderness, altered \nsensorium, neck stiffness, lung crepts, 

In [13]:
#Split the Data into Text Chunks
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller, slightly overlapping text chunks.

    Args:
        extracted_data: Documents to split (in this notebook, the list
            returned by ``load_pdf_file``).
        chunk_size: Maximum size of a chunk, forwarded to
            ``RecursiveCharacterTextSplitter``. Defaults to 500, the value
            that used to be hard-coded here.
        chunk_overlap: Overlap between consecutive chunks, forwarded to the
            splitter. Defaults to 20, the previously hard-coded value.

    Returns:
        The chunk documents produced by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [14]:
# Chunk the loaded documents and report how many chunks were produced.
text_chunks = text_split(extracted_data)
print(f"Length of Text Chunks {len(text_chunks)}")

Length of Text Chunks 5860


In [15]:
from langchain.embeddings import HuggingFaceEmbeddings

In [16]:
#Download the Embeddings from Hugging Face
def download_hugging_face_embeddings():
    """Build the embedding model used by the rest of the notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured for the
        ``sentence-transformers/all-MiniLM-L6-v2`` model.
    """
    model_id = 'sentence-transformers/all-MiniLM-L6-v2'
    return HuggingFaceEmbeddings(model_name=model_id)

In [17]:
# Instantiate the embedding model once; later cells reuse this object for both
# indexing the chunks and building the retriever's vector store.
embeddings = download_hugging_face_embeddings()

  embeddings=HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
  from .autonotebook import tqdm as notebook_tqdm


In [21]:
# Embed a short probe string to find out the vector size the model produces;
# the index created later has to be configured with this same dimension.
query_result = embeddings.embed_query("Hello world")
print(f"Length {len(query_result)}")

Length 384


In [22]:
#query_result

In [23]:
from dotenv import load_dotenv
# Load environment variables (presumably from a local .env file - confirm its
# location) so the API keys read in the next cell are available.
load_dotenv()

True

In [24]:
# Read the service credentials from the process environment.
pinecone_key = os.getenv("PINECONE_API_KEY")
huggingface_key = os.getenv("HUGGINGFACEHUB_API_TOKEN")

# os.getenv returns None for an unset variable. Without this check the problem
# only shows up later as an opaque TypeError when the values are written back
# to os.environ, or as an authentication failure inside a client library.
_missing_keys = [
    name
    for name, value in (
        ("PINECONE_API_KEY", pinecone_key),
        ("HUGGINGFACEHUB_API_TOKEN", huggingface_key),
    )
    if not value
]
if _missing_keys:
    raise EnvironmentError(
        "Missing required environment variable(s): " + ", ".join(_missing_keys)
    )

In [24]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
import os

# Client for the Pinecone control plane, authenticated with the key read earlier.
pc = Pinecone(api_key=pinecone_key)

index_name = "medicalbot"

# Create the index only when it does not exist yet: an unconditional
# create_index call fails once the index is already there, which made this
# cell impossible to re-run.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        # Must match the embedding size; the "Hello world" probe above
        # returned vectors of length 384 for all-MiniLM-L6-v2.
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )

In [27]:
import os
# Publish both credentials as process environment variables in a single call
# (same keys, same order as assigning them one by one).
os.environ.update(
    {
        "PINECONE_API_KEY": pinecone_key,
        "HUGGINGFACEHUB_API_TOKEN": huggingface_key,
    }
)

In [26]:
# Embed each chunk and upsert the embeddings into your Pinecone index.
# NOTE(review): no explicit ids are passed, and the ids seen in the retrieval
# output further down look like random UUIDs, so re-running this cell probably
# inserts the chunks a second time instead of overwriting them - confirm
# before re-executing against a populated index.
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    index_name=index_name,
    embedding=embeddings, 
)

In [29]:
# Same index name as in the index-creation cell; presumably repeated so the
# "load existing index" cell below can run without re-executing that cell.
index_name = "medicalbot"

In [30]:
# Load the existing index: reuse what is already stored in Pinecone rather
# than calling from_documents again.

from langchain_pinecone import PineconeVectorStore
# Attach a vector store to the index named by `index_name`; the embedding model
# is passed along, presumably so that incoming queries are embedded with it.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings
)

In [31]:
# Similarity-search retriever configured with k=3 results per query.
retriever = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

In [32]:
# Smoke-test the retriever on a sample question and display the matching
# chunks (their ids, source metadata and text) as the cell output.
retrieved_docs = retriever.invoke("What is Acne?")
retrieved_docs

[Document(id='91b29f86-812f-47e9-891a-ed71c169503c', metadata={'page': 39.0, 'source': 'Data\\Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='888753d7-3d45-45bc-93c6-22e2e8e929e1', metadata={'page': 38.0, 'source': 'Data\\Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 2 25\nAcne\nAcne vulgaris affecting a woman’s face. Acne is the general\nname given to a skin disorder in which the sebaceous\nglands become inflamed.(Photograph by Biophoto Associ-\nates, Photo Researchers, Inc. Reproduced by permission.)\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 25'),
 Document(id='71dd87bc-deba-499d-a0a0-ab43ccef1c5f', metadata={'page': 37.0, 'source': 'Data\\Medical_book.pdf'}, page_content='Acidosis see Respiratory acidosis; Renal\ntubular acidosis; Metabolic acidosis\nAcne\nDefinition\nAcne is a common skin disease characterized by\npimples on the face, chest, and back. It occurs when

In [40]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain import HuggingFaceHub

In [37]:
# Language model used to generate answers: google/flan-t5-large accessed
# through the HuggingFaceHub wrapper.
# NOTE(review): both "max_length" and "max_new_tokens" are supplied; which of
# the two the endpoint honours is not visible from this notebook - confirm.
llm = HuggingFaceHub(
    repo_id="google/flan-t5-large",
    model_kwargs={
        "temperature": 0,
        "max_length": 180,
        "max_new_tokens": 120,
        "top_k": 10,
        "top_p": 0.95,
        "repetition_penalty": 1.03,
    },
)

# System prompt
# Instructions sent as the system message. The adjacent string literals are
# concatenated into one prompt, so every fragment must carry its own sentence
# terminator and trailing space; the "at least 2 lines" sentence was missing
# its period and ran straight into the next instruction.
# The "{context}" placeholder receives the retrieved chunks.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. Your answer must be at least 2 lines. "
    "If you don't know the answer, say that you don't know. "
    "Keep the answers informative."
    "\n\n"
    "{context}"
)


# Two-message chat template: the system instructions (which carry the
# "{context}" slot) followed by the user's question in the "{input}" slot.
prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("human", "{input}")]
)

In [38]:
# Chain that feeds documents plus the prompt to the LLM.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
# Full RAG pipeline: the retriever supplies documents to the chain above.
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [39]:
# Run the pipeline end to end; the chain is called with the question under the
# "input" key and the generated text is read from the "answer" key.
# NOTE(review): the recorded run printed only "acne", far short of the
# "at least 2 lines" the system prompt asks for - worth revisiting the model
# choice or generation settings.
response = rag_chain.invoke({"input": "What is Acne?"})
print(response["answer"])

acne
