# **Installing required libraries**

In [None]:
!pip install langchain #LLM Library
!pip install chromadb # Vector Storage
!pip install pypdf    # Loading PDFs
!pip install pytest   # Unit Testing



In [None]:
!pip install langchain-community



# **Importing PyPDFLoader for processing PDF documents**

In [None]:
from langchain.document_loaders import PyPDFLoader

def load_document(pdf_path="/content/The_GALE_ENCYCLOPEDIA_of_MEDICINE_SECOND (4).pdf"):
    """Load a PDF and return its pages.

    Args:
        pdf_path: Location of the PDF to read. The default is the Colab
            upload path this notebook was written against, so existing
            ``load_document()`` calls keep working unchanged.

    Returns:
        The result of ``PyPDFLoader(pdf_path).load()``; the cells that
        consume it iterate over the items and read ``page_content``.
    """
    loader = PyPDFLoader(pdf_path)
    return loader.load()

In [None]:
document = load_document()

# Preview only the first few pages. Printing every page of the PDF floods the
# cell output (Colab truncates the stream) and hides the rest of the notebook.
PREVIEW_PAGES = 3
for i, page in enumerate(document[:PREVIEW_PAGES]):
    print(f"Page {i+1}:")
    print(page.page_content)  # page_content contains the text of the page
    print("\n" + "="*80 + "\n")
print(f"Showing {min(PREVIEW_PAGES, len(document))} of {len(document)} pages.")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Description
Breast fibroadenomas, abnormal growths of glandu-
lar and fibrous tissues, are most common between the
ages of 15 and 30, and are found in 10% of all women
(20% of African-American women). They are found
rarely in postmenopausal women.
Described as feeling like marbles, these firm, round,
movable, and “rubbery” lumps range from 1–5 cm in size.
Giant fibroadenomas are larger, lemon-sized lumps. Usually
single, from 10–15% of women have more than one.
While some types of breast lumps come and go dur-
ing the menstrual cycle, fibroadenomas typically do not
disappear after a woman’s period, and should be checked
by a doctor.
Causes and symptoms
The cause of breast fibroadenomas is unknown.
They may be dependent upon estrogen, because they are
common in premenopausal women, can be found in post-
menopausal women taking estrogen, and because they
grow larger in pregnant women.
Fibroadenomas usually cause no symptoms

# **Importing RecursiveCharacterTextSplitter for splitting text into Chunks**

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document

In [None]:
def split_document(document, chunk_size=500, chunk_overlap=200):
    """Split every page of a loaded document into text chunks.

    Args:
        document: Iterable of page objects exposing a ``page_content`` string.
        chunk_size: Passed to the splitter as ``chunk_size``; lengths are
            measured with ``len``, i.e. in characters.
        chunk_overlap: Passed to the splitter as ``chunk_overlap``.

    Returns:
        A flat list of chunk strings, in page order. Page boundaries are not
        preserved in the result.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )
    chunks = []
    for page in document:
        chunks.extend(text_splitter.split_text(page.page_content))
    return chunks

In [None]:
chunks = split_document(document)

# Show a small sample of the result rather than every chunk.
separator = "\n" + "="*80 + "\n"
for number, chunk in enumerate(chunks[:5], start=1):
    print(f"Chunk {number}:")
    print(chunk)
    print(separator)

Chunk 1:
The GALE
ENCYCLOPEDIA
of MEDICINE
SECOND EDITION


Chunk 2:
The G ALE
ENCYCLOPEDIA
of M EDICINE
SECOND EDITION
JACQUELINE L. LONGE, EDITOR
DEIRDRE S. BLANCHFIELD, ASSOCIATE EDITOR
VOLUME
C-F
2


Chunk 3:
STAFF
Jacqueline L. Longe,Project Editor
Deirdre S. Blanchfield, Associate Editor
Christine B. Jeryan, Managing Editor
Donna Olendorf, Senior Editor
Stacey Blachford, Associate Editor
Kate Kretschmann, Melissa C. McDade, Ryan
Thomason, Assistant Editors
Mark Springer, Technical Specialist
Andrea Lopeman, Programmer/Analyst
Barbara J. Yarrow,Manager, Imaging and Multimedia
Content
Robyn V . Young,Project Manager, Imaging and
Multimedia Content
Dean Dauphinais, Senior Editor, Imaging and
Multimedia Content
Kelly A. Quin, Editor, Imaging and Multimedia Content
Leitha Etheridge-Sims, Mary K. Grimes, Dave Oblender,
Image Catalogers
Pamela A. Reed, Imaging Coordinator
Randy Bassett, Imaging Supervisor
Robert Duncan, Senior Imaging Specialist
Dan Newell, Imaging Specialist
Christine 

In [None]:
!pip install openai



# **Converting chunks into sentence embeddings**

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings

In [None]:
def download_hugging_face_embeddings():
    """Create the embedding model used by this notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance constructed with
        ``model_name="sentence-transformers/all-MiniLM-L6-v2"``.
    """
    return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [None]:
!pip install sentence-transformers



In [None]:
# Build the embedding model once; the same object is reused for the
# sanity-check query and when populating the vector store.
embeddings = download_hugging_face_embeddings()

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Display the object's repr to confirm which model and settings were loaded.
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [None]:
# Sanity check: embed a short string and report the length of the returned vector.
query_result = embeddings.embed_query("Hello world")
print("Length", len(query_result))

Length 384


In [None]:
# Show only the first few components; the full vector is hundreds of floats
# and adds nothing to the narrative.
query_result[:5]

[-0.034477315843105316,
 0.031023172661662102,
 0.006734910886734724,
 0.02610892429947853,
 -0.03936195746064186,
 -0.1603025197982788,
 0.06692396104335785,
 -0.006441440898925066,
 -0.04745054617524147,
 0.014758836477994919,
 0.07087532430887222,
 0.055527545511722565,
 0.01919332519173622,
 -0.026251299306750298,
 -0.01010951679199934,
 -0.026940451934933662,
 0.022307397797703743,
 -0.022226639091968536,
 -0.1496926248073578,
 -0.01749303936958313,
 0.007676327601075172,
 0.054352276027202606,
 0.0032544792629778385,
 0.03172592446208,
 -0.08462144434452057,
 -0.029405953362584114,
 0.05159562826156616,
 0.048124104738235474,
 -0.003314818488433957,
 -0.05827919766306877,
 0.04196928068995476,
 0.02221069671213627,
 0.12818878889083862,
 -0.02233896404504776,
 -0.011656257323920727,
 0.06292840093374252,
 -0.03287629410624504,
 -0.09122602641582489,
 -0.031175386160612106,
 0.05269954726099968,
 0.047034841030836105,
 -0.08420310169458389,
 -0.030056146904826164,
 -0.020744822919

# **Storing the embeddings in a vector store (ChromaDB)**

In [None]:

from langchain.docstore.document import Document
from langchain.vectorstores import Chroma

# `chunks` is a list of plain strings, while Chroma.from_documents is given
# Document objects, so wrap each chunk before indexing.
documents = [Document(page_content=chunk) for chunk in chunks]

vectorstore = Chroma.from_documents(
    documents=documents,
    collection_name="rag-chroma",
    embedding=embeddings,
)

# Retriever view over the same store; this is what the QA chain consumes.
retriever = vectorstore.as_retriever()

In [None]:
# Query the vector store directly (no LLM involved) to inspect what is retrieved.
question = "What are the major risk factors for cancer?"
docs = vectorstore.similarity_search(question, k=3)
print(len(docs), docs, sep="\n")

3
[Document(metadata={}, page_content='they also have prolonged exposure to intensive sunlight.\nThere are several different types of cancers:\n• Carcinomas are cancers that arise in the epithelium (the\nlayers of cells covering the body’s surface and lining the\ninternal organs and various glands). Ninety percent of\nhuman cancers fall into this category. Carcinomas can be\nsubdivided into two types: adenocarcinomas and squa-\nmous cell carcinomas. Adenocarcinomas are cancers that\ndevelop in an organ or a gland, while squamous cell car-\ncinomas refer to cancers that originate in the skin.\n• Melanomas also originate in the skin, usually in the\npigment cells (melanocytes).\n• Sarcomas are cancers of the supporting tissues of the\nbody, such as bone, muscle and blood vessels.\n• Cancers of the blood and lymph glands are called\nleukemias and lymphomas respectively.\n• Gliomas are cancers of the nerve tissue.\nCauses and symptoms\nThe major risk factors for cancer are: tobacco, alco-\

In [None]:
# Prompt text for the QA chain. {context} and {question} are the two
# placeholders declared as input_variables when the PromptTemplate is built.
prompt_template = """Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context:{context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

In [None]:
# Import from the `langchain.prompts` subpackage; importing PromptTemplate
# from the package root is deprecated.
from langchain.prompts import PromptTemplate

PROMPT = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"],
)

# Passed to RetrievalQA.from_chain_type as chain_type_kwargs so the chain
# uses PROMPT.
chain_type_kwargs = {"prompt": PROMPT}

In [None]:
!pip install ctransformers



# **Building the RetrievalQA chain: a retriever first fetches relevant passages from the document database (the vector store), then a large language model (LLM) served through CTransformers generates the answer from the retrieved context**

In [None]:
# from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.llms import CTransformers

In [None]:
# Mount Google Drive (Colab only); the model file loaded by CTransformers is
# read from a path under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Pretrained model used:**
https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML

In [None]:
# Request an explicit context window. The "stuff" chain places several
# retrieved 500-character chunks plus the instructions into one prompt, and up
# to 512 tokens are generated on top of that; without `context_length` the
# library's default window applies, and overflowing it yields incoherent text.
# NOTE(review): 2048 is sized for prompt + max_new_tokens — confirm it fits the
# runtime's memory budget.
llm = CTransformers(
    model="/content/drive/MyDrive/Model/llama-2-7b-chat.ggmlv3.q4_0.bin",
    model_type="llama",
    config={
        'max_new_tokens': 512,
        'temperature': 0.8,
        'context_length': 2048,
    },
)

In [None]:
# Assemble the question-answering chain from the pieces built above: the
# local LLM, the Chroma retriever and the custom prompt. Source documents are
# returned alongside the answer so retrieval can be inspected.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs=chain_type_kwargs,
    return_source_documents=True,
)


In [None]:
# Interactive question loop. Type "exit" or "quit" (or interrupt the cell /
# send EOF) to stop; without an exit path the cell could only be ended by
# killing the kernel.
EXIT_COMMANDS = {"exit", "quit"}
while True:
    try:
        user_input = input("Input Prompt:").strip()
    except (EOFError, KeyboardInterrupt):
        break
    if user_input.lower() in EXIT_COMMANDS:
        break
    if not user_input:
        continue  # nothing to ask; prompt again instead of querying the chain
    result = qa.invoke({"query": user_input})
    print("Response: ",result["result"])




Response:  The major risk factors that'Thank you should consider that'I don'Of course,You have prolongeducdifferent types of course, thank you don'sorry,
I don'I apologize).
Sunfortunately,The type=
The information on the user20
Baseda combination of the skin cancer is it'Cancestimate a)
You'
Thank you cancers.
Unfortunately,
The risk factors
Although they are there are more than  I cannot tell themosteasyous types of the causes and squamong>From the user10
Sunfortunately,these are most cancers). The user5
They cancers)
Thank you may be sure,There are there are there is it'
The major risk factors that'
I don'Thank you don'Thank you have prolongeducancer cancers).
Sorry,Skin, Thank you will definitely possible causes and more thanx
Sunfortunately, depending oncolore than the user80- The main risk factor. There are there are sarcombin/knowingestion, based on average person-
According tobrown,To answer this is there are more information on average of cancer in a type of course of cancer c