In [2]:
# load text books 
# extract vectors
# store the vectors in chromadb
# langchain chain for ollama call

In [13]:
import PyPDF2
import os
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.docstore.document import Document

# Function to extract text from a PDF file
def extract_text_from_pdf(file_path):
    text = ""
    with open(file_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        for page in pdf_reader.pages:
            text += page.extract_text()
    return text

# List of local PDF file paths for the three textbooks
pdf_files = ["TB/DavidsonMedicine24th.pdf"]

# Container to store all extracted text chunks
all_chunks = []

# Define the text splitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=7500, chunk_overlap=100)

# Process each PDF file
for file_path in pdf_files:
    extracted_text = extract_text_from_pdf(file_path)
    print(f"Extracted text from {file_path}: {extracted_text[:500]}")  # Print first 500 characters

    # Split the extracted text into chunks
    chunks = text_splitter.split_text(extracted_text)
    print(f"Number of chunks from {file_path}: {len(chunks)}")
    
    # Append the chunks to the all_chunks list
    all_chunks.extend(chunks)

# Create Document objects from the chunks
documents = [Document(page_content=chunk) for chunk in all_chunks]

# Specify the directory to store the Chroma vector database
persist_directory = "chroma_db_davidson"

# Ensure the directory exists
os.makedirs(persist_directory, exist_ok=True)

# Initialize the vector database using Ollama embeddings
vector_db = Chroma.from_documents(
    documents=documents,
    embedding=OllamaEmbeddings(model="nomic-embed-text", show_progress=True),
    persist_directory=persist_directory,
    collection_name="local-rag"
)

# Output the number of documents in the vector database
print(f"Total number of documents in Chroma: {vector_db._collection.count()}")

unknown widths : 
[0, IndirectObject(58768, 0, 1862178531280)]
unknown widths : 
[0, IndirectObject(58772, 0, 1862178531280)]
unknown widths : 
[0, IndirectObject(58776, 0, 1862178531280)]
unknown widths : 
[0, IndirectObject(58780, 0, 1862178531280)]
unknown widths : 
[0, IndirectObject(58784, 0, 1862178531280)]
unknown widths : 
[0, IndirectObject(58788, 0, 1862178531280)]
unknown widths : 
[0, IndirectObject(58792, 0, 1862178531280)]
unknown widths : 
[0, IndirectObject(58796, 0, 1862178531280)]


Extracted text from TB/DavidsonMedicine24th.pdf: 24th Edition 
,�. . 
ELSLVlER -Edited by 
Ian D. Penman 
Stuart H. Ralston 
Mark W. J. Strachan 
Richard P. Hobson 
II MedicineDavidson’s
Principles and Practice of
                    P DF  Collected  By:
        Dr. Nazmul Alam FarukiSir Stanley Davidson (1894–1981)
This famous textbook was the brainchild of one of the great Professors of 
Medicine of the 20th century . Stanley Davidson was born in Sri Lanka and 
began his medical undergraduate training at Trinity College, Cambridge; 
this was
Number of chunks from TB/DavidsonMedicine24th.pdf: 1010


OllamaEmbeddings: 100%|██████████| 1010/1010 [1:58:35<00:00,  7.05s/it]


Total number of documents in Chroma: 1010


In [9]:
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever

In [10]:
local_model = "llama2"
llm = ChatOllama(model=local_model)

QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are an AI language model assistant. Your task is to generate five
    different versions of the given user question to retrieve relevant documents from
    a vector database. By generating multiple perspectives on the user question, your
    goal is to help the user overcome some of the limitations of the distance-based
    similarity search. Provide these alternative questions separated by newlines.
    Original question: {question}""",
)

retriever = MultiQueryRetriever.from_llm(
    retriever=vector_db.as_retriever(search_kwargs={"k": 2}),
    llm=llm,
    prompt=QUERY_PROMPT
)

# # RAG prompt
# template = """Answer the question based ONLY on the following context:
# {context}
# Question: {question}
# Make sure the answer is SHORT, CRISP and generated FAST. Try to keep it in one line.
# """

# RAG prompt
template = """Answer the question based ONLY on the following context:
{context}
Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

  llm = ChatOllama(model=local_model)


In [11]:
chain.invoke("What is the document about?")

OllamaEmbeddings: 100%|██████████| 1/1 [00:05<00:00,  5.01s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.07s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.15s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.13s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.15s/it]


'The document is a clinical practice guideline for the prevention and management of thromboembolic disease in intensive care unit (ICU) patients. It provides recommendations for pharmacological and mechanical prophylaxis against deep vein thrombosis (DVT) and pulmonary embolism (PE), as well as evaluation and management of hypercoagulability states. The document covers various conditions associated with decreased platelets and hypercoagulability, including atrial fibrillation, ventilation-perfusion (VA/V/Q) ratio, activated partial thromboplastin time (aPTT), and heparin-induced thrombocytopenia (HIT). It also provides guidance on the use of alternative anticoagulants and hypercoagulability evaluation.'