In [62]:
import os

# Move up one directory (out of 'research'), but only when the kernel is
# actually inside it. An unconditional chdir("..") climbs one more level on
# every re-run of this cell and can end up at the drive root.
if os.path.basename(os.getcwd()).lower() == "research":
    os.chdir("..")

# Print the current directory to verify
print("Current Directory:", os.getcwd())

Current Directory: d:\


In [2]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [3]:
# Load the pdf sample data

def load_pdf_file(data):
    """Load the PDF files found in a directory.

    Args:
        data: Directory path handed to ``DirectoryLoader``.

    Returns:
        The result of ``DirectoryLoader.load()`` for files matching the
        ``*.pdf`` glob, each read with ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(data, glob='*.pdf', loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [4]:
# Load the documents from the PDFs in the relative 'Data/' folder
extracted_data = load_pdf_file(data='Data/')

In [5]:
# Splitting the data into Text Chunks

def text_split(extracted_data):
    """Split loaded documents into smaller text chunks.

    Args:
        extracted_data: Documents passed to ``split_documents``.

    Returns:
        The chunks produced by a ``RecursiveCharacterTextSplitter``
        configured with ``chunk_size=500`` and ``chunk_overlap=20``.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_overlap=20, chunk_size=500)
    return splitter.split_documents(extracted_data)

In [6]:
# Chunk the loaded documents and report how many chunks were produced
text_chunks = text_split(extracted_data)
print("Length of the text Chunks = ", len(text_chunks))

Length of the text Chunks =  39994


In [7]:
# text_chunks
# !pip install sentence-transformers

In [50]:
# from langchain_google_genai import GoogleGenerativeAIEmbeddings

# embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
# vector = embeddings.embed_query("Medical chatbot AI model")
# print(len(vector))  # Returns an embedding vector

In [8]:
# Download an Embedding model from Hugging Face

from langchain.embeddings import HuggingFaceBgeEmbeddings

def download_hugging_face_embeddings():
    """Build the embedding model used for both indexing and querying.

    Returns:
        A ``HuggingFaceEmbeddings`` instance wrapping
        ``sentence-transformers/all-MiniLM-L6-v2``.
    """
    # all-MiniLM-L6-v2 is not a BGE model. HuggingFaceBgeEmbeddings prepends a
    # BGE-specific retrieval instruction to every query by default, so queries
    # would be embedded differently from the indexed documents. The plain
    # sentence-transformers wrapper embeds both sides the same way.
    from langchain.embeddings import HuggingFaceEmbeddings

    embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
    return embeddings


In [16]:
# Instantiate the embedding model once; later cells reuse this object
embeddings = download_hugging_face_embeddings()

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [15]:
# pip install sentence-transformers
# pip install --upgrade sentence-transformers huggingface_hub langchain-community


In [17]:
# query_result=embeddings.embed_query("Hellow World")
# print(len(query_result))

384


In [11]:
# pip install pinecone

In [65]:
from dotenv import load_dotenv
# Load variables via python-dotenv before reading them from the environment
load_dotenv()

# Either value is None when the corresponding variable is not set
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')

In [64]:
from pinecone import Pinecone, ServerlessSpec

# Pinecone client authenticated with the key read from the environment
pc = Pinecone(
    api_key=PINECONE_API_KEY,
)

# https://app.pinecone.io/organizations/-OHvsYbej_lKGye7b00Y/projects/c3d5c14f-a5cd-43b8-8486-d49d3b96f5f8/indexes
# pip install --upgrade sentence-transformers huggingface_hub langchain-community

In [25]:
index_name = "medicalbot"

# Create the index only when it does not exist yet, so re-running this cell
# does not fail on an index that was already created.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # must match the embedding model's vector size
        metric="cosine",  # similarity metric used by the index
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )

In [43]:
import os

# Re-export the API keys into the process environment.
# os.environ only accepts strings, so a missing key (None) would raise an
# opaque TypeError here; fail with a message that names the missing variable.
for _key_name, _key_value in (
    ("PINECONE_API_KEY", PINECONE_API_KEY),
    ("GOOGLE_API_KEY", GOOGLE_API_KEY),
):
    if _key_value is None:
        raise EnvironmentError(
            f"{_key_name} is not set; define it in the environment or in the .env file."
        )
    os.environ[_key_name] = _key_value

In [26]:
# Embed each Chunks and upsert the embeddings into your pinecone index

from langchain_pinecone import PineconeVectorStore

# Builds the vector store from `text_chunks`, using `embeddings` and the
# index named by `index_name`
docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    embedding=embeddings,
    index_name=index_name,
)

In [28]:
# Load Existing Index

from langchain_pinecone import PineconeVectorStore

# Attach to the index named by `index_name` without passing any documents
docsearch = PineconeVectorStore.from_existing_index(
    embedding=embeddings,
    index_name=index_name,
)

In [29]:
# Bare expression: the notebook shows the vector store's repr as the cell output
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x1f8a28eacc0>

In [35]:
# Retriever over the vector store: similarity search with k=3
retriever = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

In [36]:
# Try the retriever on a sample question
retriever_docs = retriever.invoke("What is Acne ?")

In [37]:
# Bare expression: the notebook shows the retrieved documents as the cell output
retriever_docs

[Document(id='e7a3cc6f-8a53-47d3-bac3-672e4ea1f678', metadata={'page': 55.0, 'page_label': '26', 'source': 'Data\\The-Gale-Encyclopedia-of-Medicine-3rd-Edition-staibabussalamsula.ac_.id_.pdf'}, page_content='Researchers, Inc. Reproduced by permission.)\n26 GALE ENCYCLOPEDIA OF MEDICINE\nAcne'),
 Document(id='c460bf0f-cbc7-4265-9275-ba453df8ffd7', metadata={'page': 54.0, 'page_label': '25', 'source': 'Data\\The-Gale-Encyclopedia-of-Medicine-3rd-Edition-staibabussalamsula.ac_.id_.pdf'}, page_content='occurs when new skin cells are laid down to replace\ndamaged cells.\nThe most common sites of acne are the face, chest,\nshoulders, and back since these are the parts of the\nbody where the most sebaceous follicles are found.\nCauses and symptoms\nThe exact cause of acne is unknown. Several risk\nfactors have been identified:\n/C15Age. Due to the hormonal changes they experience,\nteenagers are more likely to develop acne.\n/C15Gender. Boys have more severe acne and develop it\nmore often th

In [None]:
# pip install google-generativeai langchain langchain-google-genai
# https://aistudio.google.com/prompts/new_chat

In [None]:
# from langchain_google_genai import GoogleGenerativeAIEmbeddings

# embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
# vector = embeddings.embed_query("Medical chatbot AI model")
# print(vector)  # Returns an embedding vector

In [44]:
# from langchain_openai import OpenAI
# llm=OpenAI(temperature=0.4,max_tokens=500)


from langchain_google_genai import ChatGoogleGenerativeAI
import os

# Sampling settings for the Gemini model
generation_config = {
    "temperature": 1,
    "top_p": 0.95,
    "top_k": 40,
    "max_output_tokens": 1000,
    "response_mime_type": "text/plain",
}

# Initialize the Gemini model.
# The sampling settings are passed as the constructor's own fields. A
# `generation_config=` keyword is not a constructor field of
# ChatGoogleGenerativeAI, so the settings above would not reach the model.
# NOTE(review): response_mime_type is not forwarded here; confirm whether the
# installed langchain-google-genai version exposes it as a model field.
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-pro",
    temperature=generation_config["temperature"],
    top_p=generation_config["top_p"],
    top_k=generation_config["top_k"],
    max_output_tokens=generation_config["max_output_tokens"],
)

In [45]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# System instructions for the question-answering prompt.
# Adjacent string literals are concatenated with no separator, so every
# fragment except the last of a paragraph ends with an explicit space.
# "{context}" is a template placeholder left for the prompt to fill in.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

# Two-message chat template: the system instructions, then the user's
# question supplied through the "{input}" placeholder
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}"),
])

In [46]:
# Combine the LLM and prompt into the answering chain, then put the
# retriever in front of it
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [61]:
# Ask a sample question and print only the "answer" entry of the result
response = rag_chain.invoke(
    {"input": "What is gigantism?"}
)
print(response["answer"])

Gigantism is a variant of acromegaly, characterized by excessive growth of long bones in children before the closure of growth plates. This accelerated growth is caused by an overproduction of growth hormone (GH) from the pituitary gland, leading to unusual height.  Unlike acromegaly, which occurs after bone growth has stopped, gigantism affects children and leads to exceptional height.
