In [1]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import os

os.chdir("../")  # Change directory to parent folder

In [None]:
def load_pdf_files(data):
    loader = DirectoryLoader(
        data,
        glob="*.pdf",
        loader_cls=PyPDFLoader
    )
    documents = loader.load()
    return documents


In [5]:
extracted_docs = load_pdf_files("data")

In [9]:
from typing import List
from langchain.schema import Document


def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    """Filter documents to only those with at least min_length characters."""
    
    minimal_docs = []
    for doc in docs:
        src = doc.metadata.get("source")
        minimal_docs.append(
            Document(page_content=doc.page_content, metadata={"source": src})
        )
    return minimal_docs

In [10]:
minimal_docs = filter_to_minimal_docs(extracted_docs)

In [None]:
minimal_docs

In [15]:
def text_split(minimal_docs):
	text_splitter = RecursiveCharacterTextSplitter(
		chunk_size=500,
		chunk_overlap=20,
		length_function=len
	)
	text_chunks = text_splitter.split_documents(minimal_docs)
	return text_chunks

In [16]:
text_cunks = text_split(minimal_docs)
len(text_cunks)

5859

In [18]:
from langchain.embeddings import HuggingFaceBgeEmbeddings


def download_embeddings():
	model_name = "sentence-transformers/all-MiniLM-L6-v2"
	embeddings = HuggingFaceBgeEmbeddings(model_name=model_name)
	return embeddings

embeddings = download_embeddings()

  embeddings = HuggingFaceBgeEmbeddings(model_name=model_name)


In [19]:
embeddings

HuggingFaceBgeEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, query_instruction='Represent this question for searching relevant passages: ', embed_instruction='', show_progress=False)

In [21]:
vector = embeddings.embed_query("Hello world")
vector

[-0.010300830006599426,
 0.18307925760746002,
 0.03081122227013111,
 0.004452867433428764,
 -0.027336133643984795,
 -0.0335625559091568,
 0.037631526589393616,
 -0.03157331794500351,
 -0.0033910172060132027,
 -0.008950826711952686,
 0.03803616762161255,
 -0.051291026175022125,
 0.0003682953247334808,
 -0.023727085441350937,
 0.0927102193236351,
 -0.02779579535126686,
 -0.03515257313847542,
 -0.0032242247834801674,
 -0.07681785523891449,
 -0.057612113654613495,
 0.07257593423128128,
 0.11128546297550201,
 0.016058500856161118,
 0.015908459201455116,
 -0.0823269858956337,
 0.007007301319390535,
 0.0290131364017725,
 0.0011387335835024714,
 0.1167173832654953,
 -0.032327428460121155,
 -0.0322716049849987,
 -0.0012590617407113314,
 0.10591619461774826,
 0.023600803688168526,
 0.009664924815297127,
 0.09834086894989014,
 0.042936380952596664,
 -0.019547685980796814,
 0.01926787942647934,
 -0.06417101621627808,
 0.02392350323498249,
 -0.052880022674798965,
 -0.02646952122449875,
 0.005548785

In [22]:
len(vector)

384

In [27]:
from dotenv import load_dotenv
import os
load_dotenv()  # take environment variables from .env.

True

'/Users/utkarsh/Blogs and Learning/medical-chatbot'

In [61]:
PINECONE_API_KEY=os.environ.get("PINECONE_API_KEY")
OPENAI_API_KEY=os.environ.get("OPENAI_API_KEY")
OPEN_AI_MODEL=os.environ.get("OPENAI_MODEL")


In [63]:
PINECONE_API_KEY

'pcsk_ffucx_MXpkAQPqjo1FrVT7jmU9bwtTmQk3351Vh3vD9LodxgeNUm7zU3wvuZfAMkxKMsB'

In [62]:

os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
os.environ["OPENAI_MODEL"] = OPEN_AI_MODEL

In [31]:
from pinecone import Pinecone
pinecone_api_key = PINECONE_API_KEY
pc = Pinecone(api_key=pinecone_api_key)

In [32]:
pc

<pinecone.pinecone.Pinecone at 0x1153edfa0>

In [None]:
from pinecone import ServerlessSpec
index_name = "medical-chatbot"

if not pc.has_index(index_name):
	pc.create_index(
		name=index_name,
		dimension=384,
		metric="cosine",
		spec=ServerlessSpec(cloud='aws', region='us-east-1')
	)

index = pc.Index(index_name)

In [40]:
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(
	documents=text_cunks,
	embedding=embeddings,
	index_name=index_name,
)


In [None]:
from  langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_existing_index(
	embedding=embeddings,
	index_name=index_name,
)	

# Adding more docuemtts

In [41]:
dswith = Document(
	page_content="This is a test document", metadata={"source": "test.pdf"}
)

In [42]:
docsearch.add_documents(documents=[dswith])

['a268a436-344b-474c-9c68-2966e66f1888']

In [43]:
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [45]:
retrieved_docs = retriever.invoke("What is Hyderocephalus")
retrieved_docs

[Document(id='ae4dde5b-40b1-4989-b36b-32faa77599ff', metadata={'source': 'data/Medical_book.pdf'}, page_content='sue which serves as a base on which bone is built.\nForamen magnum—The opening at the base of the\nskull, through which the spinal cord and the brain-\nstem pass.\nHydrocephalus —An abnormal accumulation of\nfluid within the brain. This accumulation can be\ndestructive by pressing on brain structures, and\ndamaging them.\nMutation—A new, permanent change in the struc-\nture of a gene, which can result in abnormal struc-\nture or function somewhere in the body.'),
 Document(id='35e95742-e0a6-4c51-8430-52c578bafec3', metadata={'source': 'data/Medical_book.pdf'}, page_content='Adult_brain_tumor_Patient.html (May 2001).\nBrain Tumor, Primary. Nidus Information Services,Well Con-\nnected,2001.\n“Brain Tumors,” Goldman: Cecil Textbook of Medicine, 21st\nEd., Copyright (c) 2000 W. B. Saunders Company\n“Brain Tumors,” Abeloff: Clinical Oncology, 2nd Ed., Copy-\nright (c) 2000 Church

In [49]:

from langchain_openai import ChatOpenAI

chatmodel = ChatOpenAI(model_name=OPEN_AI_MODEL, temperature=0)

In [50]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [57]:
system_prompt = (
    "You are an Medical assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

prompt = ChatPromptTemplate.from_messages(
	[
		("system", system_prompt),
		("user", "Answer the question: {input}"),
	]
)

In [58]:
question_answer_chain = create_stuff_documents_chain(chatmodel,prompt=prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)


In [59]:
response = rag_chain.invoke({"input": "What is Hydrocephalus"})
print(response["answer"])

Hydrocephalus is an abnormal accumulation of fluid within the brain. This fluid accumulation can be destructive by pressing on brain structures and damaging them. It can lead to swelling of the brain.


In [60]:
response = rag_chain.invoke({"input": "What is Thrombosis"})
print(response["answer"])

Thrombosis is a blockage in a blood vessel that remains in one place, potentially leading to decreased blood flow and oxygen supply to tissues beyond the blockage. It can be caused by a blood clot, fat cells, or an air bubble in an artery. Thrombosis can result in serious damage to tissues due to the lack of normal blood flow and oxygen supply.
