In [1]:
import os

In [2]:
# !pip install torch
!pip install -U torch==2.3.1
!pip install torchvision==0.17.2
!pip install PyMuPDF tqdm sentence-transformers accelerate bitsandbytes flash-attn --no-build-isolation

Collecting torchvision==0.17.2
  Using cached torchvision-0.17.2-cp311-cp311-manylinux1_x86_64.whl.metadata (6.6 kB)
Collecting torch==2.2.2 (from torchvision==0.17.2)
  Using cached torch-2.2.2-cp311-cp311-manylinux1_x86_64.whl.metadata (25 kB)
Collecting nvidia-nccl-cu12==2.19.3 (from torch==2.2.2->torchvision==0.17.2)
  Using cached nvidia_nccl_cu12-2.19.3-py3-none-manylinux1_x86_64.whl.metadata (1.8 kB)
Collecting triton==2.2.0 (from torch==2.2.2->torchvision==0.17.2)
  Using cached triton-2.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Using cached torchvision-0.17.2-cp311-cp311-manylinux1_x86_64.whl (6.9 MB)
Downloading torch-2.2.2-cp311-cp311-manylinux1_x86_64.whl (755.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m755.6/755.6 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading nvidia_nccl_cu12-2.19.3-py3-none-manylinux1_x86_64.whl (166.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m166.0/16

In [3]:
# Download PDF file

import os
import requests

filename = "nutrition.pdf"

response = requests.get("https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf")

if response.status_code == 200:
    with open(filename, "wb") as file:
        file.write(response.content)
else:
    print(f"Failed to download the file. Status code: {response.status_code}")

In [4]:
import fitz
from tqdm import tqdm

def text_formatter(text):
  # do some basic text formatting
  return text.replace("\n", " ").strip() # remove trailing whitespace / line breaks

# load text from pdf
def read_pdf(filepath):
  # open a pdf, read its text by page and collect some statistics
  doc = fitz.open(filepath)
  pages_text = []
  for page_num, page in tqdm(enumerate(doc)):
    text = page.get_text()
    cleaned_text = text_formatter(text)
    pages_text.append({
        "page_num": page_num - 41, # actual content starts from page 41
        "page_char_count": len(cleaned_text),
        "page_word_count": len(cleaned_text.split(" ")),
        "page_sentence_count": len(cleaned_text.split(". ")),
        "page_token_count": len(cleaned_text) / 4, # 1 token ~ 4 chars
        "text": cleaned_text
    })
  return pages_text

pages_text = read_pdf(filename)
pages_text[100]


1208it [00:01, 673.39it/s]


{'page_num': 59,
 'page_char_count': 629,
 'page_word_count': 109,
 'page_sentence_count': 4,
 'page_token_count': 157.25,
 'text': 'Digestive  system  without  labels by  Mariana  Ruiz / Public  Domain  Knowing how to maintain the balance of friendly bacteria in your  intestines through proper diet can promote overall health. Recent  scientific studies have shown that probiotic supplements positively  affect intestinal microbial flora, which in turn positively affect  immune system function. As good nutrition is known to influence  immunity, there is great interest in using probiotic foods and other  immune-system-friendly foods as a way to prevent illness. In this  chapter we will explore not only immune system function, but also  Introduction  |  59'}

In [5]:
# EDA over text usign pandas
import pandas as pd

df = pd.DataFrame(pages_text)
df.head()

Unnamed: 0,page_num,page_char_count,page_word_count,page_sentence_count,page_token_count,text
0,-41,29,4,1,7.25,Human Nutrition: 2020 Edition
1,-40,0,1,1,0.0,
2,-39,320,54,1,80.0,Human Nutrition: 2020 Edition UNIVERSITY OF ...
3,-38,212,32,1,53.0,Human Nutrition: 2020 Edition by University of...
4,-37,797,145,2,199.25,Contents Preface University of Hawai‘i at Mā...


In [6]:
df.describe().round(2)

Unnamed: 0,page_num,page_char_count,page_word_count,page_sentence_count,page_token_count
count,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,198.3,9.97,287.0
std,348.86,560.38,95.76,6.19,140.1
min,-41.0,0.0,1.0,1.0,0.0
25%,260.75,762.0,134.0,4.0,190.5
50%,562.5,1231.5,214.5,10.0,307.88
75%,864.25,1603.5,271.0,14.0,400.88
max,1166.0,2308.0,429.0,32.0,577.0


In [7]:
# using spacy to split text into sentences
from spacy.lang.en import English

nlp = English()
nlp.add_pipe("sentencizer")

for page in tqdm(pages_text):
  page["sentences"] = list(nlp(page["text"]).sents)
  page["sentences"] = [str(sen) for sen in page["sentences"]]
  page["sentence_count_spacy"] = len(page["sentences"])

pages_text[100]

100%|██████████| 1208/1208 [00:03<00:00, 372.97it/s]


{'page_num': 59,
 'page_char_count': 629,
 'page_word_count': 109,
 'page_sentence_count': 4,
 'page_token_count': 157.25,
 'text': 'Digestive  system  without  labels by  Mariana  Ruiz / Public  Domain  Knowing how to maintain the balance of friendly bacteria in your  intestines through proper diet can promote overall health. Recent  scientific studies have shown that probiotic supplements positively  affect intestinal microbial flora, which in turn positively affect  immune system function. As good nutrition is known to influence  immunity, there is great interest in using probiotic foods and other  immune-system-friendly foods as a way to prevent illness. In this  chapter we will explore not only immune system function, but also  Introduction  |  59',
 'sentences': ['Digestive  system  without  labels by  Mariana  Ruiz / Public  Domain  Knowing how to maintain the balance of friendly bacteria in your  intestines through proper diet can promote overall health.',
  'Recent  scientific

In [8]:
df = pd.DataFrame(pages_text)
df.describe()

Unnamed: 0,page_num,page_char_count,page_word_count,page_sentence_count,page_token_count,sentence_count_spacy
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.004139,198.299669,9.972682,287.001035,10.319536
std,348.86387,560.382275,95.759336,6.187226,140.095569,6.300843
min,-41.0,0.0,1.0,1.0,0.0,0.0
25%,260.75,762.0,134.0,4.0,190.5,5.0
50%,562.5,1231.5,214.5,10.0,307.875,10.0
75%,864.25,1603.5,271.0,14.0,400.875,15.0
max,1166.0,2308.0,429.0,32.0,577.0,28.0


In [9]:
# split the senteences into chunks of 10 that can fit in the context window of embedding model
chunk_size = 10

def generate_sentence_chunks(sentences, chunk_size):
  chunks = [sentences[i:i+chunk_size] for i in range(0, len(sentences), chunk_size)]
  return chunks

for page in pages_text:
  page["sentence_chunks"] = generate_sentence_chunks(page["sentences"], chunk_size)
  page["num_sentence_chunks"] = len(page["sentence_chunks"])

pages_text[100]

{'page_num': 59,
 'page_char_count': 629,
 'page_word_count': 109,
 'page_sentence_count': 4,
 'page_token_count': 157.25,
 'text': 'Digestive  system  without  labels by  Mariana  Ruiz / Public  Domain  Knowing how to maintain the balance of friendly bacteria in your  intestines through proper diet can promote overall health. Recent  scientific studies have shown that probiotic supplements positively  affect intestinal microbial flora, which in turn positively affect  immune system function. As good nutrition is known to influence  immunity, there is great interest in using probiotic foods and other  immune-system-friendly foods as a way to prevent illness. In this  chapter we will explore not only immune system function, but also  Introduction  |  59',
 'sentences': ['Digestive  system  without  labels by  Mariana  Ruiz / Public  Domain  Knowing how to maintain the balance of friendly bacteria in your  intestines through proper diet can promote overall health.',
  'Recent  scientific

In [10]:
df = pd.DataFrame(pages_text)
df.describe()

Unnamed: 0,page_num,page_char_count,page_word_count,page_sentence_count,page_token_count,sentence_count_spacy,num_sentence_chunks
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.004139,198.299669,9.972682,287.001035,10.319536,1.525662
std,348.86387,560.382275,95.759336,6.187226,140.095569,6.300843,0.644397
min,-41.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,260.75,762.0,134.0,4.0,190.5,5.0,1.0
50%,562.5,1231.5,214.5,10.0,307.875,10.0,1.0
75%,864.25,1603.5,271.0,14.0,400.875,15.0,2.0
max,1166.0,2308.0,429.0,32.0,577.0,28.0,3.0


In [11]:
# create dict for indvidual chunk and its releated information
import re

pages_chunks = []
for page in tqdm(pages_text):
  for chunk in page["sentence_chunks"]:
    chunk_dict = {}
    chunk_dict["page_num"] = page["page_num"]
    # join the sentences in a chunk into a paragraph
    joined_sentence_chunk = "".join(chunk).replace("  ", " ").strip()
    joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk) # ".A" -> ". A" for any full-stop/capital letter combo
    chunk_dict["sentence_chunk"] = joined_sentence_chunk

    # Get stats about the chunk
    chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
    chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
    chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4 # 1 token = ~4 characters

    pages_chunks.append(chunk_dict)

pages_chunks[100]


100%|██████████| 1208/1208 [00:00<00:00, 28143.75it/s]


{'page_num': 40,
 'sentence_chunk': 'http://pressbooks.oer.hawaii.edu/ humannutrition2/?p=61 An interactive or media element has been excluded from this version of the text. You can view it online here: http://pressbooks.oer.hawaii.edu/ humannutrition2/?p=61 An interactive or media element has been excluded from this version of the text. You can view it online here: http://pressbooks.oer.hawaii.edu/ humannutrition2/?p=61 \xa0 40 | Research and the Scientific Method',
 'chunk_char_count': 429,
 'chunk_word_count': 54,
 'chunk_token_count': 107.25}

In [12]:
df = pd.DataFrame(pages_chunks)
df.describe()

Unnamed: 0,page_num,chunk_char_count,chunk_word_count,chunk_token_count
count,1843.0,1843.0,1843.0,1843.0
mean,583.381443,734.442756,112.333152,183.610689
std,347.78867,447.541546,71.220313,111.885387
min,-41.0,12.0,3.0,3.0
25%,280.5,315.0,44.0,78.75
50%,586.0,746.0,114.0,186.5
75%,890.0,1118.5,173.0,279.625
max,1166.0,1831.0,297.0,457.75


In [13]:
# Show random chunks with under 30 tokens in length
min_token_length = 30
for row in df[df["chunk_token_count"] <= min_token_length].sample(5).iterrows():
    print(f'Chunk token count: {row[1]["chunk_token_count"]} | Text: {row[1]["sentence_chunk"]}')

Chunk token count: 24.25 | Text: There are several lecithin supplements on the market Nonessential and Essential Fatty Acids | 315
Chunk token count: 4.25 | Text: Introduction | 61
Chunk token count: 24.5 | Text: view it online here: http://pressbooks.oer.hawaii.edu/ humannutrition2/?p=130   Introduction | 149
Chunk token count: 13.25 | Text: 1036 | Efforts on the Consumer Level: What You Can Do
Chunk token count: 20.25 | Text: Honor your health – gentle nutrition       Calories In Versus Calories Out | 1075


In [14]:
pages_chunks_over_min_token_len = df[df["chunk_token_count"] > min_token_length].to_dict(orient="records")
pages_chunks_over_min_token_len[:2]

[{'page_num': -39,
  'sentence_chunk': 'Human Nutrition: 2020 Edition UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN NUTRITION PROGRAM ALAN TITCHENAL, SKYLAR HARA, NOEMI ARCEO CAACBAY, WILLIAM MEINKE-LAU, YA-YUN YANG, MARIE KAINOA FIALKOWSKI REVILLA, JENNIFER DRAPER, GEMADY LANGFELDER, CHERYL GIBBY, CHYNA NICOLE CHUN, AND ALLISON CALABRESE',
  'chunk_char_count': 308,
  'chunk_word_count': 42,
  'chunk_token_count': 77.0},
 {'page_num': -38,
  'sentence_chunk': 'Human Nutrition: 2020 Edition by University of Hawai‘i at Mānoa Food Science and Human Nutrition Program is licensed under a Creative Commons Attribution 4.0 International License, except where otherwise noted.',
  'chunk_char_count': 210,
  'chunk_word_count': 30,
  'chunk_token_count': 52.5}]

In [15]:
# generate sentence embeddings using sentence_transformer lib

from sentence_transformers import SentenceTransformer, util
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2",
                                      device="cuda")

%%time
for chunk in tqdm(pages_chunks_over_min_token_len):
    chunk["embedding"] = embedding_model.encode(chunk["sentence_chunk"])

RuntimeError: Failed to import transformers.integrations.integration_utils because of the following error (look up to see its traceback):
Failed to import transformers.modeling_utils because of the following error (look up to see its traceback):
/usr/local/lib/python3.11/dist-packages/flash_attn_2_cuda.cpython-311-x86_64-linux-gnu.so: undefined symbol: _ZNK3c105Error4whatEv

In [None]:
# save embedding to a file
text_chunks_and_embeddings_df = pd.DataFrame(pages_chunks_over_min_token_len)
embeddings_df_save_path = "text_chunks_and_embeddings_df.csv"
text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path, index=False)

In [None]:
text_chunks_and_embedding_df_load = pd.read_csv(embeddings_df_save_path)
text_chunks_and_embedding_df_load.head()

In [None]:
import random

import torch
import numpy as np
import pandas as pd

device = "cuda" if torch.cuda.is_available() else "cpu"

text_chunks_and_embedding_df = pd.read_csv("text_chunks_and_embeddings_df.csv")

# Convert embedding column back to np.array (it got converted to string when it got saved to CSV)
text_chunks_and_embedding_df["embedding"] = text_chunks_and_embedding_df["embedding"].apply(lambda x: np.fromstring(x.strip("[]"), sep=" "))

pages_and_chunks = text_chunks_and_embedding_df.to_dict(orient="records")

# Convert embeddings to torch tensor and send to device
embeddings = torch.tensor(np.array(text_chunks_and_embedding_df["embedding"].tolist()), dtype=torch.float32).to(device)
embeddings.shape

In [None]:
text_chunks_and_embedding_df.head()

In [None]:
# find the embeddings similar to some query usign dot product / cosine similarity
query = "macronutrients functions"

query_embedding = embedding_model.encode(query, convert_to_tensor=True)

dot_scores = util.dot_score(a=query_embedding, b=embeddings)[0]

# 4. Get the top-k results
top_results_dot_product = torch.topk(dot_scores, k=5)
top_results_dot_product

In [None]:
# Define helper function to print wrapped text
import textwrap

def print_wrapped(text, wrap_length=80):
    wrapped_text = textwrap.fill(text, wrap_length)
    print(wrapped_text)

In [None]:
print(f"Query: '{query}'\n")
print("Results:")
# Loop through zipped together scores and indicies from torch.topk
for score, idx in zip(top_results_dot_product[0], top_results_dot_product[1]):
    print(f"Score: {score:.4f}")
    # Print relevant sentence chunk (since the scores are in descending order, the most relevant chunk will be first)
    print("Text:")
    print_wrapped(pages_and_chunks[idx]["sentence_chunk"])
    # Print the page number too so we can reference the textbook further (and check the results)
    print(f"Page number: {pages_and_chunks[idx]['page_number']}")
    print("\n")

In [None]:
# retreival pipeline

# returns the top k results similar to the query
def retrieve_relevant_resources(query, embeddings, model=embedding_model, top_k=5):
    query_embedding = model.encode(query, convert_to_tensor=True)
    # Get dot product scores on embeddings
    dot_scores = util.dot_score(query_embedding, embeddings)[0]
    # get teh top k results
    scores, indices = torch.topk(input=dot_scores, k=top_k)
    return scores, indices

def print_top_results_and_scores(query, embeddings: torch.tensor,
                                 pages_chunks = pages_chunks, top_k = 5):
    # finds the top k senteces similar to the query and print them
    scores, indices = retrieve_relevant_resources(query=query,
                                                  embeddings=embeddings,
                                                  top_k=top_k)

    print(f"Query: {query}\n")
    print("Results:")
    # Loop through zipped together scores and indicies
    for score, index in zip(scores, indices):
        print(f"Score: {score:.4f}")
        # Print relevant sentence chunk (since the scores are in descending order, the most relevant chunk will be first)
        print_wrapped(pages_chunks[index]["sentence_chunk"])
        # Print the page number too so we can reference the textbook further and check the results
        print(f"Page number: {pages_chunks[index]['page_number']}")
        print("\n")

In [None]:
query = "symptoms of pellagra"

# Get just the scores and indices of top related results
scores, indices = retrieve_relevant_resources(query=query,
                                              embeddings=embeddings)
scores, indices

In [None]:
print_top_results_and_scores(query=query,
                             embeddings=embeddings)

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.utils import is_flash_attn_2_available

from transformers import BitsAndBytesConfig
quantization_config = BitsAndBytesConfig(load_in_4bit=True,
                                         bnb_4bit_compute_dtype=torch.float16)

if (is_flash_attn_2_available()) and (torch.cuda.get_device_capability(0)[0] >= 8):
  attn_implementation = "flash_attention_2"
else:
  attn_implementation = "sdpa"

model_id = "google/gemma-7b-it"

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_id)

llm_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_id,
                                                 torch_dtype=torch.float16, # datatype to use, we want float16
                                                 quantization_config=quantization_config,
                                                 low_cpu_mem_usage=False, # use full memory
                                                 attn_implementation=attn_implementation) # which attention version to use


In [None]:
def prompt_formatter(query: str,
                     context_items: list[dict]) -> str:
    """
    Augments query with text-based context from context_items.
    """
    # Join context items into one dotted paragraph
    context = "- " + "\n- ".join([item["sentence_chunk"] for item in context_items])

    # Create a base prompt with examples to help the model
    # Note: this is very customizable, I've chosen to use 3 examples of the answer style we'd like.
    # We could also write this in a txt file and import it in if we wanted.
    base_prompt = """Based on the following context items, please answer the query.
Give yourself room to think by extracting relevant passages from the context before answering the query.
Don't return the thinking, only return the answer.
Make sure your answers are as explanatory as possible.
Use the following examples as reference for the ideal answer style.
\nExample 1:
Query: What are the fat-soluble vitamins?
Answer: The fat-soluble vitamins include Vitamin A, Vitamin D, Vitamin E, and Vitamin K. These vitamins are absorbed along with fats in the diet and can be stored in the body's fatty tissue and liver for later use. Vitamin A is important for vision, immune function, and skin health. Vitamin D plays a critical role in calcium absorption and bone health. Vitamin E acts as an antioxidant, protecting cells from damage. Vitamin K is essential for blood clotting and bone metabolism.
\nExample 2:
Query: What are the causes of type 2 diabetes?
Answer: Type 2 diabetes is often associated with overnutrition, particularly the overconsumption of calories leading to obesity. Factors include a diet high in refined sugars and saturated fats, which can lead to insulin resistance, a condition where the body's cells do not respond effectively to insulin. Over time, the pancreas cannot produce enough insulin to manage blood sugar levels, resulting in type 2 diabetes. Additionally, excessive caloric intake without sufficient physical activity exacerbates the risk by promoting weight gain and fat accumulation, particularly around the abdomen, further contributing to insulin resistance.
\nExample 3:
Query: What is the importance of hydration for physical performance?
Answer: Hydration is crucial for physical performance because water plays key roles in maintaining blood volume, regulating body temperature, and ensuring the transport of nutrients and oxygen to cells. Adequate hydration is essential for optimal muscle function, endurance, and recovery. Dehydration can lead to decreased performance, fatigue, and increased risk of heat-related illnesses, such as heat stroke. Drinking sufficient water before, during, and after exercise helps ensure peak physical performance and recovery.
\nNow use the following context items to answer the user query:
{context}
\nRelevant passages: <extract relevant passages from the context here>
User query: {query}
Answer:"""

    # Update base prompt with context items and query
    base_prompt = base_prompt.format(context=context, query=query)

    # Create prompt template for instruction-tuned model
    dialogue_template = [
        {"role": "user",
        "content": base_prompt}
    ]

    # Apply the chat template
    prompt = tokenizer.apply_chat_template(conversation=dialogue_template,
                                          tokenize=False,
                                          add_generation_prompt=True)
    return prompt

In [16]:
query_list = [
    "How often should infants be breastfed?",
    "What are symptoms of pellagra?",
    "How does saliva help with digestion?",
    "What is the RDI for protein per day?",
    "water soluble vitamins"
]

In [17]:
query = random.choice(query_list)
print(f"Query: {query}")

# Get relevant resources
scores, indices = retrieve_relevant_resources(query=query,
                                              embeddings=embeddings)

# Create a list of context items
context_items = [pages_and_chunks[i] for i in indices]

# Format prompt with context items
prompt = prompt_formatter(query=query,
                          context_items=context_items)
print(prompt)

NameError: name 'random' is not defined

In [None]:
%%time

input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")

# Generate an output of tokens
outputs = llm_model.generate(**input_ids,
                             temperature=0.7, # lower temperature = more deterministic outputs, higher temperature = more creative outputs
                             do_sample=True, # whether or not to use sampling, see https://huyenchip.com/2024/01/16/sampling.html for more
                             max_new_tokens=256) # how many new tokens to generate from prompt

# Turn the output tokens into text
output_text = tokenizer.decode(outputs[0])

print(f"Query: {query}")
print(f"RAG answer:\n{output_text.replace(prompt, '')}")

In [None]:
def ask(query,
        temperature=0.7,
        max_new_tokens=512,
        format_answer_text=True,
        return_answer_only=True):
    """
    Takes a query, finds relevant resources/context and generates an answer to the query based on the relevant resources.
    """

    # Get just the scores and indices of top related results
    scores, indices = retrieve_relevant_resources(query=query,
                                                  embeddings=embeddings)

    # Create a list of context items
    context_items = [pages_and_chunks[i] for i in indices]

    # Add score to context item
    for i, item in enumerate(context_items):
        item["score"] = scores[i].cpu() # return score back to CPU

    # Format the prompt with context items
    prompt = prompt_formatter(query=query,
                              context_items=context_items)

    # Tokenize the prompt
    input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")

    # Generate an output of tokens
    outputs = llm_model.generate(**input_ids,
                                 temperature=temperature,
                                 do_sample=True,
                                 max_new_tokens=max_new_tokens)

    # Turn the output tokens into text
    output_text = tokenizer.decode(outputs[0])

    if format_answer_text:
        # Replace special tokens and unnecessary help message
        output_text = output_text.replace(prompt, "").replace("<bos>", "").replace("<eos>", "").replace("Sure, here is the answer to the user query:\n\n", "")

    # Only return the answer without the context items
    if return_answer_only:
        return output_text

    return output_text, context_items

In [None]:
query = random.choice(query_list)
print(f"Query: {query}")

# Answer query with context and return context
answer, context_items = ask(query=query,
                            temperature=0.7,
                            max_new_tokens=512,
                            return_answer_only=False)

print(f"Answer:\n")
print_wrapped(answer)
print(f"Context items:")
context_items