In [3]:
pip install langchain

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
from urllib.request import urlretrieve
import numpy as np
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.llms import HuggingFacePipeline
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

In [5]:
loader = PyPDFDirectoryLoader("./medical_documents/")

docs_before_split = loader.load()

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 700,
    chunk_overlap = 50,
)
docs_after_split = text_splitter.split_documents(docs_before_split)
print(docs_after_split[0])

page_content='Acne  : For mild acne without a prescription, consider antiseptic washes containing benzoyl peroxide \n(Acne Clear) or salicylic acid (Acnevir), or creams or gels containing benzoyl peroxide (Benzac AC), \nazelaic acid (Azelex), resorcinol (R A Acne), dapsone (Aczone) , or products containing tea tree oil.  \n \nPrescription options for mild acne include antibiotics (topical clindamycin [Cleocin T] or topical \nerythromycin [Erygel]) with or without benzoyl peroxide (such as Neuac or Benzamycin) or azelaic \nacid to prevent resistance. Topical prescription retinoids l ike tretinoin (Renova) or adapalene' metadata={'source': 'medical_documents\\midical_advisory.pdf', 'page': 0}


In [6]:
pip install pyPdf

Note: you may need to restart the kernel to use updated packages.


In [7]:
avg_doc_length = lambda docs : sum([len(doc.page_content) for doc in docs]) // len(docs)
avg_char_before_split = avg_doc_length(docs_before_split)
avg_char_after_split = avg_doc_length(docs_after_split)

print(f"Before split, there were {len(docs_before_split)} documents loaded, with average characters equal to {avg_char_before_split}")
print(f"After split, there were {len(docs_after_split)} documents loaded, with average characters equal to {avg_char_after_split}")

Before split, there were 22 documents loaded, with average characters equal to 2020
After split, there were 78 documents loaded, with average characters equal to 572


In [3]:
huggingface_embeddings = HuggingFaceBgeEmbeddings(
    model_name = "BAAI/bge-small-en-v1.5",
    model_kwargs={"device":"cpu"},
    encode_kwargs={"normalize_embeddings":True}
)

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
pip install sentence_transformers

Note: you may need to restart the kernel to use updated packages.


In [10]:
sample_embedding=np.array(huggingface_embeddings.embed_query(docs_after_split[0].page_content))
print("Samples embedding of a document chunk:", sample_embedding)
print("Size of the embedding:",sample_embedding.shape)

Samples embedding of a document chunk: [-2.69559212e-02  1.50481211e-02 -2.09556171e-03  5.81145249e-02
  2.20124461e-02  2.04199515e-02 -2.58001629e-02  2.35745236e-02
 -3.81397642e-02 -3.45515236e-02  2.62100622e-03 -4.37596701e-02
  3.87745984e-02  2.85798442e-02 -1.01080607e-03 -6.38345703e-02
  6.97247013e-02  3.48531194e-02  5.19095408e-03  1.10781258e-02
 -3.90515625e-02  5.56733906e-02 -4.86274064e-02 -6.24861903e-02
  3.52127440e-02 -1.49436633e-03  2.22416557e-02  4.06447574e-02
 -4.48314585e-02 -1.33029938e-01  2.18236391e-02  3.64907831e-03
 -5.06653786e-02 -4.76348773e-02 -1.30508456e-03 -3.13157961e-02
 -3.64684910e-02  2.64485832e-02 -2.45662336e-03  2.60198233e-03
  4.87685800e-02  1.73722636e-02 -2.01952942e-02 -3.97068821e-03
  6.03658035e-02  5.07592335e-02  3.06990035e-02  4.35447646e-03
  1.36733323e-01  2.70399116e-02  4.76301014e-02  1.48191722e-02
  3.39168236e-02 -2.13691615e-03 -1.44032435e-02 -1.95391271e-02
 -2.45672483e-02 -3.64376009e-02 -3.45774144e-02  4

In [11]:
vectorstore=FAISS.from_documents(docs_after_split, huggingface_embeddings)

In [16]:
vectorstore.index

<faiss.swigfaiss_avx2.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x000002067D2220A0> >

In [12]:
from faiss import write_index, read_index

In [14]:
type(vectorstore)

langchain_community.vectorstores.faiss.FAISS

In [17]:
write_index(vectorstore.index, "large.index")

In [20]:
index = read_index("large.index")

In [21]:
vectorstore.save_local("medical_vectorestore")

In [4]:
new_vectorstore = FAISS.load_local("medical_vectorestore", huggingface_embeddings, allow_dangerous_deserialization=True)

In [5]:
new_vectorstore.similarity_search(query=query)

NameError: name 'query' is not defined

In [None]:
pip install faiss-cpu




In [6]:
query="""what are the different treatments for weight loss"""
relevant_documents=new_vectorstore.similarity_search(query)
print(f"There are {len(relevant_documents)} documents which are relevant to the query. First one:")
print(relevant_documents[0].page_content)

There are 4 documents which are relevant to the query. First one:
Exercise regularly and lose weight if you are overweight.  
 
 
Hair Loss : Treatments for alopecia aim to grow back missing hair and include topical products such as 
minoxidil, corticosteroids, anthralin, or topical immunotherapy, and systemic medications, such as 
oral corticosteroids, JAK inhibitors, and other immunosuppressants .


In [13]:
retriever = new_vectorstore.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [10]:
os.environ["HUGGINGFACEHUB_API_TOKEN"] = "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"

In [11]:
from langchain_community.llms import HuggingFaceHub

hf=HuggingFaceHub(
    repo_id="mistralai/Mistral-7B-v0.1",
    model_kwargs={"temperature":0.1, "max_length":500}
)
query="""what are the medications for hair loss"""
llm=hf
llm.invoke(query)

'what are the medications for hair loss\n\n# What are the medications for hair loss?\n\nThere are a number of medications that can be used to treat hair loss, including minoxidil, finasteride, and dutasteride. Minoxidil is a topical medication that is applied to the scalp. It is available over-the-counter and is generally well tolerated. Finasteride and dutasteride are oral medications that are used to treat hair loss. They are both prescription medications and are'

In [7]:
prompt_template = """Use the following pieces of context to answer the question at the end. Please follow the following rules:
1. If you don't know the answer, don't try to make up an answer. Just say "I can't find the final answer but you may want to check the following links".
2. If you find the answer, write the answer in a concise way with five sentences maximum.


{context}


Question: {question}

Helpful Answer:"""
PROMPT=PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)

In [14]:
retrievalQA=RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever = retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt":PROMPT}
)

In [15]:
query="""What are the medications for anxiety."""

In [20]:
result=retrievalQA.invoke({"query":query})
print(result['result'])

Use the following pieces of context to answer the question at the end. Please follow the following rules:
1. If you don't know the answer, don't try to make up an answer. Just say "I can't find the final answer but you may want to check the following links".
2. If you find the answer, write the answer in a concise way with five sentences maximum.


Anxiety : Two classes of medications are commonly used to treat anxiety: SSRIs and benzodiazepines.  
 
SSRIs (selective serotonin reuptake inhibitors) are usually the first medications doctors will prescribe 
to help manage anxiety before considering other options such as benzodiazepines. Common SSRIs

line options for generalized anxiety disorders (GAD) and may be considered when panic attacks or 
severe muscle tension are prominent. They enhance the inhibit ory neurotransmitter GABA, providing 
quick relief but for a short duration. They are effective in reducing physiological symptoms like 
tension and insomnia. While generally well -tol

In [None]:
pip install weaviate-cli

In [None]:
import weaviate

In [None]:
client = weaviate.Client(
    url="https://medical-knowlegde-cluster-8hrw7zhc.weaviate.network",
    auth_client_secret=weaviate.auth.AuthApiKey(api_key="XrtPANuGUQV7ihAjGZ1NSzRPPdl3uzwD7fnb")
)
print(client.is_ready())

True


In [None]:
with client.batch as batch:
    for doc,i in enumerate(docs_after_split):
        batch.add_data_object(data_object={i:doc}, class_name=i)

TypeError: unhashable type: 'Document'