In [1]:
print('hi')

hi


In [1]:
from langchain_core.prompts import PromptTemplate
from langchain_huggingface import HuggingFaceEmbeddings

import os
from pinecone import Pinecone
from langchain_pinecone import PineconeVectorStore
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.llms import CTransformers


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")


In [4]:
index_name = 'medical-chatbot'

In [5]:
# Extract data from the pdf
def load_pdf(data):
    loader = DirectoryLoader(data, glob='*.pdf', loader_cls=PyPDFLoader)
    documents = loader.load()
    return documents


In [6]:
extracterd_data = load_pdf('data/')


# Create splitter and create chunks

In [7]:
def text_split(extracted_data):
    text_splitter= RecursiveCharacterTextSplitter(
        chunk_size = 500,
        chunk_overlap = 20
    )

    text_chunks = text_splitter.split_documents(extracted_data)
    return text_chunks

In [8]:
text_chunks = text_split(extracterd_data)

In [9]:
len(text_chunks)

6973

In [10]:
text_chunks[0]

Document(metadata={'producer': 'GPL Ghostscript 9.10', 'creator': '', 'creationdate': '2017-05-01T10:37:35-07:00', 'moddate': '2017-05-01T10:37:35-07:00', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'source': 'data\\The_GALE_ENCYCLOPEDIA_of_MEDICINE_SECOND.pdf', 'total_pages': 759, 'page': 0, 'page_label': '1'}, page_content='The GALE\nENCYCLOPEDIA\nof MEDICINE\nSECOND EDITION')

# Download embedding model from huggingface

In [11]:
def download_hugging_face_embedding():
    embeddings_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return embeddings_model


In [12]:
embeddings_model = download_hugging_face_embedding()

In [13]:
embeddings_model


HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, query_encode_kwargs={}, multi_process=False, show_progress=False)

In [14]:
query_result = embeddings_model.embed_query("Hello, how are you?")
print(len(query_result))


384


In [16]:
print(query_result)

[0.019096748903393745, 0.03446517512202263, 0.09162798523902893, 0.07016526907682419, -0.029946597293019295, -0.08419137448072433, 0.04581356421113014, 0.004958590492606163, -0.09189331531524658, 0.01740063913166523, -0.00881615187972784, -0.0006614578305743635, -0.02855696901679039, -0.021949712187051773, 0.05516669154167175, -0.049836501479148865, 0.08988095074892044, -0.08895706385374069, -0.11235623806715012, 0.03900053724646568, -0.06607074290513992, 0.02609514445066452, 0.03653070330619812, 0.06139037013053894, -0.05712487921118736, -0.05463935807347298, 0.03036552667617798, 0.03238753601908684, 0.012644710019230843, -0.1056857705116272, -0.05834552273154259, 0.06732939928770065, -0.04075591266155243, 0.006439837161451578, 0.005698689725250006, 0.05285317078232765, -0.0397753082215786, -0.11855248361825943, 0.0021161921322345734, -0.016692863777279854, 0.0283381175249815, -0.03743794187903404, -0.021371405571699142, -0.04147521033883095, 0.08497177809476852, -0.06869424879550934,

# initialize pinecone

In [17]:
pinecone_client = Pinecone(api_key = PINECONE_API_KEY)

In [None]:
doc_search = PineconeVectorStore.from_texts(
    [t.page_content for t in text_chunks],
    embeddings_model,
    index_name = index_name
)
# Note: this takes the embedding model, the text chunks, and the index name
# this creates the entries in the Pincone index.
# wen need to the PineconeVectorStore from langchain to do this not the Pinecone class directly.

In [20]:
doc_search = PineconeVectorStore.from_existing_index(
    index_name = index_name,
    embedding = embeddings_model
)


In [21]:
query = "What are Allergies?"
docs = doc_search.similarity_search(query, k = 3)
print(docs)

[Document(id='1ce2c2ba-45f0-442f-b851-dceb048427a0', metadata={}, page_content='ORGANIZATIONS\nAmerican Academy of Ophthalmology. 655 Beach Street, PO\nBox 7424, San Francisco, CA 94120-7424. <http://www.\neyenet.org>.\nKEY TERMS\nAllergen —A substance capable of inducing an\nallergic response.\nAllergic reaction—An immune system reaction to\na substance in the environment; symptoms\ninclude rash, inflammation, sneezing, itchy watery\neyes, and runny nose.\nConjunctiva—The mucous membrane that covers\nthe white part of the eyes and lines the eyelids.'), Document(id='4121468f-4f84-4ca4-8076-9e60169f3fc8', metadata={}, page_content='Although environmental medicine is gaining more\nrespect within conventional medicine, detoxification\nKEY TERMS\nAllergen —A foreign substance, such as mites in\nhouse dust or animal dander, that when\ninhaled,causes the airways to narrow and pro-\nduces symptoms of asthma.\nAntibody —A protein, also called immunoglobu-\nlin, produced by immune system cells 

In [22]:
for doc in docs:
    print(doc.page_content)

ORGANIZATIONS
American Academy of Ophthalmology. 655 Beach Street, PO
Box 7424, San Francisco, CA 94120-7424. <http://www.
eyenet.org>.
KEY TERMS
Allergen —A substance capable of inducing an
allergic response.
Allergic reaction—An immune system reaction to
a substance in the environment; symptoms
include rash, inflammation, sneezing, itchy watery
eyes, and runny nose.
Conjunctiva—The mucous membrane that covers
the white part of the eyes and lines the eyelids.
Although environmental medicine is gaining more
respect within conventional medicine, detoxification
KEY TERMS
Allergen —A foreign substance, such as mites in
house dust or animal dander, that when
inhaled,causes the airways to narrow and pro-
duces symptoms of asthma.
Antibody —A protein, also called immunoglobu-
lin, produced by immune system cells to remove
antigens (the foreign substances that trigger the
immune response).
Fibromyalgia —A condition of debilitating pain,
among other symptoms, in the muscles and the
myofascia (

# hooking the Pinecone results to llm

In [24]:
prompt_template = """
use the following pieces of information to answer the user's question.
If you dont know the answer, just say that you dont know.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

In [27]:
PROMPT = PromptTemplate.from_template(
    template = prompt_template,
)



chain_type_kwargs = {"prompt": PROMPT}

In [37]:
llm = CTransformers(
    model = "model/llama-2-7b-chat.ggmlv3.q4_0.bin",
    model_type = "llama",
    config = {
        "max_new_tokens": 512,
        "temperature": 0.8,
        "context_length": 4096
    }
)

In [38]:
# Create chain using LCEL

from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

retriever = doc_search.as_retriever(search_kwargs={"k": 2})
rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | PROMPT
    | llm
    | StrOutputParser()
)

In [39]:
# Using the chain
query = "What is acne?"
answer = rag_chain.invoke(query)
print(answer)

Acne is a common skin condition characterized by inflamed red bumps, blackheads, or whiteheads on the face, chest, back, and other areas of the body. It can be caused by a combination of genetic, hormonal, environmental, and lifestyle factors, such as excess oil production, clogged pores, and stress. Acne can lead to emotional distress, social isolation, and low self-esteem, especially if it persists over time or is severe. Treatment options include topical creams, oral medications, and lifestyle changes, such as regular exercise, a balanced diet, and stress management techniques. Preventing acne involves maintaining good hygiene habits, using non-comedogenic products, and avoiding picking or squeezing blemishes, as this can lead to further inflammation and scarring.
