In [3]:
import os

# Step up one directory so later relative paths (e.g. "Data/") resolve from
# the parent folder. Note: the path is relative, so running this cell a
# second time in the same kernel moves up one more level.
os.chdir(os.pardir)

In [4]:
# Show the current working directory to confirm the chdir above took effect.
%pwd

'e:\\Projects\\AI Projects\\End-to-End-AI-Medical-Chatbot'

In [5]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [18]:
from langchain_huggingface import HuggingFaceEmbeddings

In [None]:
def load_pdf_file(data):
    """Load the PDF files in a directory as documents.

    Args:
        data: Directory path handed to ``DirectoryLoader``; files are
            selected with the ``*.pdf`` glob.

    Returns:
        The result of ``DirectoryLoader.load()``, with each matching file
        read through ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(data, loader_cls=PyPDFLoader, glob="*.pdf")
    return pdf_loader.load()

In [7]:
# Read every PDF under the project's "Data/" folder (relative to the current
# working directory).
extracted_data = load_pdf_file("Data/")

In [9]:
# extracted_data

In [10]:
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into overlapping text chunks.

    Args:
        extracted_data: Documents to split, as accepted by
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Maximum chunk size passed to the splitter. Defaults to
            500, the value that was previously hard-coded.
        chunk_overlap: Overlap between consecutive chunks passed to the
            splitter. Defaults to 20, the value that was previously
            hard-coded.

    Returns:
        The chunks produced by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [11]:
# Chunk the loaded documents and report how many chunks were produced.
text_chunks = text_split(extracted_data=extracted_data)
print("Length of Text Chunks", len(text_chunks))

Length of Text Chunks 6973


In [12]:
def download_hugging_face_embeddings():
    """Build the embedding model used by the rest of the notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model.
    """
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

In [20]:
# Instantiate the embedding model once; it is reused for the sanity check,
# the Pinecone upload and the retriever below.
embeddings = download_hugging_face_embeddings()

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [21]:
# Sanity check: embed a short string and print the vector length. The
# recorded output is 384, which is the dimension the Pinecone index created
# further down is configured with.
query_result = embeddings.embed_query("hello world")
print("Length", len(query_result))

Length 384


In [23]:
# query_result

In [44]:
# Load variables from a .env file into the process environment so the API
# keys can be read from os.environ in the next cell.
from dotenv import load_dotenv
load_dotenv()

True

In [45]:
# Read both service credentials from the environment; each is None when the
# corresponding variable is not set.
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

In [None]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec

pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "medical-chatbot"

# Only create the index when it does not exist yet, so this cell can be
# re-run safely.
if not pc.has_index(index_name):
    # create_index() builds a plain vector index from dimension/metric/spec.
    # create_index_for_model() is the API for indexes backed by a
    # Pinecone-hosted embedding model and does not accept these arguments;
    # this notebook computes its embeddings locally, so the plain index is
    # the one that is needed.
    pc.create_index(
        name=index_name,
        dimension=384,  # vector length printed by the embed_query check above
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )

In [46]:
import os

# Re-export the keys so client libraries that look up credentials in the
# process environment can find them. The Pinecone variable must be spelled
# PINECONE_API_KEY; the earlier "PINECON_API_KEY" spelling only created an
# unused entry.
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [36]:
# Embed each chunk and upsert the embeddings into the Pinecone index.
# NOTE(review): re-running this cell presumably uploads the chunks again --
# confirm before re-executing against a populated index.
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(
    text_chunks,
    embedding=embeddings,
    index_name=index_name,
)

In [37]:
# Load existing index

from langchain_pinecone import PineconeVectorStore
# Attach to the index by name with the same embedding model; this call is
# given no documents, it only builds the vector-store handle (rebinding
# `docsearch`).
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings
)

In [38]:
# Display the vector-store object to confirm it was created.
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x1e5169d1e70>

In [39]:
# Wrap the vector store as a retriever: similarity search, k=3 per query.
retriever = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

In [40]:
# Smoke-test retrieval on its own, before any LLM is involved.
retrieved_docs = retriever.invoke("What is acne?")

In [41]:
# Inspect the retrieved documents (metadata and page_content) for the query.
retrieved_docs

[Document(id='aab64761-30d3-45fa-86fe-7d137bcf2ef1', metadata={'author': '', 'creationdate': '2017-05-01T10:37:35-07:00', 'creator': '', 'keywords': '', 'moddate': '2017-05-01T10:37:35-07:00', 'page': 425.0, 'page_label': '426', 'producer': 'GPL Ghostscript 9.10', 'source': 'Data\\Medical_book.pdf', 'subject': '', 'title': '', 'total_pages': 759.0}, page_content='Corticosteriod —A group of synthetic hormones\nthat are used to prevent or reduce inflammation.\nToxic effects may result from rapid withdrawal after\nprolonged use or from continued use of large doses.\nPatch test—A skin test that is done to identify aller-\ngens. A suspected substance is applied to the skin.\nAfter 24–48 hours, if the area is red and swollen,\nthe test is positive for that substance. If no reaction\noccurs, another substance is applied. This is con-'),
 Document(id='6faa6217-de36-4e9d-a9fc-4a5996ab8bf8', metadata={'author': '', 'creationdate': '2017-05-01T10:37:35-07:00', 'creator': '', 'keywords': '', 'modd

In [47]:
from langchain_openai import OpenAI

# Model used to generate the final answers: responses capped at 500 tokens,
# sampling temperature 0.4.
llm = OpenAI(max_tokens=500, temperature=0.4)

In [48]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# System instructions for the answering step. "{context}" is the template
# slot for the retrieved text; "{input}" in the human message is the slot
# for the user's question.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, say that you don't know. "
    "Use three sentences maximum and keep the answer concise. "
    "\n\n{context}"
)

prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("human", "{input}")]
)

In [49]:
# First build the chain that feeds the prompt (with documents) to the LLM,
# then put the retriever in front of it to form the full RAG pipeline.
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
rag_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=question_answer_chain,
)

In [50]:
# Ask a question end to end: the chain takes the question under the "input"
# key, and the generated text is read from the "answer" key of the result.
response = rag_chain.invoke({"input": "What is acne?"})
print(response["answer"])



Acne is a skin condition that is characterized by the appearance of pimples, blackheads, and whiteheads on the face, neck, chest, and back. It is caused by an overproduction of sebum (oil) in the skin, which clogs pores and leads to inflammation. It is often treated with corticosteroids, which are anti-inflammatory substances, and can also be prevented by avoiding irritants and maintaining good hygiene.


In [52]:
# Second end-to-end query through the same chain; `response` is rebound to
# the new result and its "answer" entry is printed.
response = rag_chain.invoke({"input": "What is Acromegaly and gigantism?"})
print(response["answer"])



Acromegaly and gigantism are both disorders caused by an overproduction of growth hormone, which is produced by the pituitary gland. Acromegaly occurs in adults and causes abnormal growth of bones and tissues, while gigantism occurs in children and causes excessive growth and height. Both disorders can lead to serious health complications if left untreated.
