In [1]:
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
import pinecone
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers

  from tqdm.autonotebook import tqdm


In [2]:
PINECONE_API_KEY = "XXXXX"
PINECONE_API_ENV = "XXXXX"

In [3]:
# Extract data from the PDF
def load_pdf(data):
    loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)

    documents = loader.load()

    return documents

In [4]:
extracted_data = load_pdf("../data/")

In [5]:
len(extracted_data)

15164

In [6]:
# Create text chunks
def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
    text_chunks = text_splitter.split_documents(extracted_data)

    return text_chunks

In [7]:
text_chunks = text_split(extracted_data)
print("length of my chunk:", len(text_chunks))

length of my chunk: 60459


In [48]:
type(text_chunks)

list

In [9]:
# download embedding model
def download_hugging_face_embeddings():
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    return embeddings

In [10]:
embeddings = download_hugging_face_embeddings()

In [11]:
query_result = embeddings.embed_query("Hello world")
print("Length", len(query_result))

Length 384


In [12]:
query_result

[-0.03447727486491203,
 0.031023165211081505,
 0.006735039874911308,
 0.026108969002962112,
 -0.03936201333999634,
 -0.16030253469944,
 0.06692396104335785,
 -0.006441398523747921,
 -0.04745050147175789,
 0.014758873730897903,
 0.07087531685829163,
 0.05552751198410988,
 0.019193362444639206,
 -0.026251358911395073,
 -0.01010953076183796,
 -0.026940522715449333,
 0.022307388484477997,
 -0.022226607427001,
 -0.1496926248073578,
 -0.017493044957518578,
 0.007676299195736647,
 0.0543522909283638,
 0.0032544569112360477,
 0.031725868582725525,
 -0.08462143689393997,
 -0.029406050220131874,
 0.051595620810985565,
 0.04812408238649368,
 -0.0033148261718451977,
 -0.05827917158603668,
 0.041969314217567444,
 0.022210661321878433,
 0.12818877398967743,
 -0.022338900715112686,
 -0.011656191200017929,
 0.06292837858200073,
 -0.03287629038095474,
 -0.09122606366872787,
 -0.03117535635828972,
 0.052699536085128784,
 0.04703478142619133,
 -0.0842030793428421,
 -0.03005615621805191,
 -0.0207447949796

In [17]:
text_chunks[0].page_content

"21\nst\nEdition\n^\nHARRISON\n'\nS\nP R I N C I P L E S\nO\nF\nI N T E R N A L\nM E D I C I N E\nLOSCALZO\nFAUCI\nKASPER\nHAUSER\nLONGO\nV O L U M E\n1\nJAMESON"

In [26]:
# Initializing the Pinecone
pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_API_ENV)

index_name = "medical-chatbot"

# Creating Embeddings for Each of The Text Chunks & storing
docsearch = Pinecone.from_texts(
    [t.page_content for t in text_chunks], embeddings, index_name=index_name
)

In [27]:
# If we already have an index we can load it like this
docsearch = Pinecone.from_existing_index(index_name, embeddings)

query = "What are Allergies?"

docs = docsearch.similarity_search(query, k=3)

print("Result", docs)

Result [Document(page_content='hobbies, recreational inhalants) exposures. Allergen-sensitized\npatients may complain of symptoms on exposure to known allergens\nsuch as animals and may complain of increased symptoms during\nspecific pollen seasons. Up to two-thirds of patients with asthma will\nbe atopic (as opposed to half of the U.S. population), and almost half\nwill have a history of rhinitis, with many complaining of intermittent\nsinusitis. In patients with adult-onset asthma, a careful occupational'), Document(page_content='eustachian tubes precipitates secondary infections of the sinuses\nand middle ear , respectively . A growing number of patients with\nseasonal allergic rhinitis demonstrate pollen-associated food\nallergen syndrome characterized by oropharyngeal pruritus and/or\nmild swelling following the ingestion of plant-based foods in the\nsame plant family as a tree, grass, or weed, which contain cross-\nreacting allergens.\nThe nose presents a large mucosal surface ar

In [28]:
prompt_template = """
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

In [41]:
PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)
chain_type_kwargs = {"prompt": PROMPT}

In [43]:
llm = CTransformers(
    model="../models/llama-2-7b-chat.ggmlv3.q4_0.bin",
    model_type="llama",
    config={"max_new_tokens": 512, "temperature": 0.8},
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [44]:
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=docsearch.as_retriever(search_kwargs={"k": 2}),
    return_source_documents=True,
    chain_type_kwargs=chain_type_kwargs,
)

In [46]:
while True:
    user_input = input(f"Input Prompt:")
    result = qa({"query": user_input})
    print("Response : ", result["result"])

  warn_deprecated(
  warn_deprecated(
  warn_deprecated(
  warn_deprecated(


Response :  ECZEMA is a type of dermatitis, which is a reaction pattern that presents with variable clinical findings and the common histologic finding of spongiosis (intercellular edema of the epidermis).


  warn_deprecated(
  warn_deprecated(
  warn_deprecated(
  warn_deprecated(
Number of tokens (513) exceeded maximum context length (512).
Number of tokens (514) exceeded maximum context length (512).
Number of tokens (515) exceeded maximum context length (512).
Number of tokens (516) exceeded maximum context length (512).
Number of tokens (517) exceeded maximum context length (512).
Number of tokens (518) exceeded maximum context length (512).
Number of tokens (519) exceeded maximum context length (512).
Number of tokens (520) exceeded maximum context length (512).
Number of tokens (521) exceeded maximum context length (512).
Number of tokens (522) exceeded maximum context length (512).
Number of tokens (523) exceeded maximum context length (512).
Number of tokens (524) exceeded maximum context length (512).
Number of tokens (525) exceeded maximum context length (512).
Number of tokens (526) exceeded maximum context length (512).
Number of tokens (527) exceeded maximum context length (

KeyboardInterrupt: 