In [16]:
# What is RAG

In [17]:
# Download PDF file
import os
import requests

# Get PDF document
pdf_path = "human-nutrition-text.pdf"

# Download PDF if it doesn't already exist
if not os.path.exists(pdf_path):
    print("File doesn't exist, downloading...")

    # The URL of the PDF you want to download
    url = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"

    # The local filename to save the downloaded file
    filename = pdf_path

    # Send a GET request to the URL.
    # requests applies no timeout by default, so without one a stalled server
    # would hang this cell forever; with it a requests timeout error is raised.
    response = requests.get(url, timeout=30)

    # Check if the request was successful
    if response.status_code == 200:
        # Open a file in binary write mode and save the content to it
        with open(filename, "wb") as file:
            file.write(response.content)
        print(f"The file has been downloaded and saved as {filename}")
    else:
        # Best-effort: report the failure and leave no file on disk
        print(f"Failed to download the file. Status code: {response.status_code}")
else:
    print(f"File {pdf_path} exists.")

File human-nutrition-text.pdf exists.


Opening the PDF file

In [4]:
import fitz
from tqdm.auto import tqdm

def text_formatter(text: str) -> str:
    """Apply light cleanup to a piece of text.

    Each newline character is swapped for a single space, then leading and
    trailing whitespace is stripped from the result.
    """
    # Additional formatting steps could be chained onto this expression later.
    return text.replace("\n", " ").strip()

def open_and_read_pdf(pdf_path: str, page_number_offset: int = 41) -> list[dict]:
    """
    Opens a PDF file, reads its text content page by page, and collects statistics.

    Parameters:
        pdf_path (str): The file path to the PDF document to be opened and read.
        page_number_offset (int): Value subtracted from the zero-based page index
            to produce the reported page number. Defaults to 41 because this
            notebook's PDF starts its numbered content on page 42.

    Returns:
        list[dict]: One dictionary per page with the keys "page_number"
        (index minus the offset), "page_char_count", "page_word_count",
        "page_sentence_count_raw", "page_token_count" and "text".
    """
    pages_and_texts = []
    # Open the document as a context manager so it is closed again even if
    # text extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        # Supplying the total lets tqdm render a real progress bar, since it
        # cannot infer a length from the enumerate() iterator.
        for page_number, page in tqdm(enumerate(doc), total=len(doc)):
            text = text_formatter(page.get_text())  # plain text of the page, cleaned up
            pages_and_texts.append({
                "page_number": page_number - page_number_offset,
                "page_char_count": len(text),
                # Raw counts: splitting on a literal " " / ". " means an empty
                # page still reports 1 word and 1 sentence.
                "page_word_count": len(text.split(" ")),
                "page_sentence_count_raw": len(text.split(". ")),
                "page_token_count": len(text) / 4,  # 1 token = ~4 chars, see: https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
                "text": text,
            })
    return pages_and_texts

pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)
pages_and_texts[:2]

  from .autonotebook import tqdm as notebook_tqdm
1208it [00:01, 649.95it/s]


[{'page_number': -41,
  'page_char_count': 29,
  'page_word_count': 4,
  'page_sentence_count_raw': 1,
  'page_token_count': 7.25,
  'text': 'Human Nutrition: 2020 Edition'},
 {'page_number': -40,
  'page_char_count': 0,
  'page_word_count': 1,
  'page_sentence_count_raw': 1,
  'page_token_count': 0.0,
  'text': ''}]

In [5]:
import random
random.sample(pages_and_texts, k=3)

[{'page_number': 70,
  'page_char_count': 799,
  'page_word_count': 140,
  'page_sentence_count_raw': 6,
  'page_token_count': 199.75,
  'text': 'Image by  Gabriel Lee  / CC  BY-NC-SA  Everyday Connection  There has been significant talk about pre- and probiotic  foods in the mainstream media. The World Health  Organization defines probiotics as live bacteria that confer  beneficial health effects on their host. They are sometimes  called “friendly bacteria.” The most common bacteria  labeled as probiotic is lactic acid bacteria (lactobacilli). They  are added as live cultures to certain fermented foods such  as yogurt. Prebiotics are indigestible foods, primarily  soluble fibers, that stimulate the growth of certain strains  of bacteria in the large intestine and provide health benefits  to the host. A review article in the June 2008 issue of the  Journal of Nutrition concludes that there is scientific  70  |  The Digestive System'},
 {'page_number': 461,
  'page_char_count': 1339,
  

In [6]:
import pandas as pd

df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,-41,29,4,1,7.25,Human Nutrition: 2020 Edition
1,-40,0,1,1,0.0,
2,-39,320,54,1,80.0,Human Nutrition: 2020 Edition UNIVERSITY OF ...
3,-38,212,32,1,53.0,Human Nutrition: 2020 Edition by University of...
4,-37,797,145,2,199.25,Contents Preface University of Hawai‘i at Mā...


In [7]:
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.59,198.89,9.97,287.15
std,348.86,560.44,95.75,6.19,140.11
min,-41.0,0.0,1.0,1.0,0.0
25%,260.75,762.75,134.0,4.0,190.69
50%,562.5,1232.5,215.0,10.0,308.12
75%,864.25,1605.25,271.25,14.0,401.31
max,1166.0,2308.0,429.0,32.0,577.0


Token count is important here because:
Embedding models and LLMs don't deal with infinite tokens.
For example, an embedding model might have been trained to embed sequences of 384 tokens into numerical space.
LLMs also cannot accept infinite tokens in their context windows.

### Further Text Processing
Splitting pages into sentences.
There are two ways to do this:
1) Split sentences based on `"."` characters.
2) Using NLP libraries like spaCy and nltk.

In [8]:
from spacy.lang.en import English
# Instantiate the English language class
nlp = English()

# Register the "sentencizer" component on the pipeline
nlp.add_pipe("sentencizer")

# Run the pipeline on a small example that should contain three sentences
doc = nlp("This is a sentence. This is another sentence. I like elephants")
assert sum(1 for _sentence in doc.sents) == 3

# Display the sentences the pipeline found
list(doc.sents)

  return torch._C._cuda_getDeviceCount() if nvml_count < 0 else nvml_count


[This is a sentence., This is another sentence., I like elephants]

In [9]:
for item in tqdm(pages_and_texts):
    # Split the page text into sentences and store each one as a plain string
    # (the default type is a spaCy datatype)
    item["sentences"] = [str(sentence) for sentence in nlp(item["text"]).sents]

    # Record how many sentences were found on this page
    item["page_sentence_count_spacy"] = len(item["sentences"])

100%|██████████| 1208/1208 [00:02<00:00, 544.33it/s]


In [10]:
random.sample(pages_and_texts, k=1)

[{'page_number': 212,
  'page_char_count': 1325,
  'page_word_count': 226,
  'page_sentence_count_raw': 12,
  'page_token_count': 331.25,
  'text': 'The Beverage Panel recommends an even lower intake of calories  from beverages than IOM—10 percent or less of total caloric intake.  Table 3.10 Recommendations of the Beverage Panel  Beverage  Servings per day*  Water  ≥ 4 (women), ≥ 6 (men)  Unsweetened coffee and tea  ≤ 8 for tea, ≤ 4 for coffee  Nonfat and low-fat milk; fortified soy drinks ≤ 2  Diet beverages with sugar substitutes  ≤ 4  100 percent fruit juices, whole milk, sports  drinks  ≤ 1  Calorie-rich beverages without nutrients  ≤ 1, less if trying to lose  weight  *One serving is eight ounces.  Source: Beverage Panel Recommendations and Analysis. University  of North Carolina, Chapel Hill. US Beverage Guidance Council.  http:/ /www.cpc.unc.edu/projects/nutrans/policy/beverage/us- beverage-panel. Accessed November 6, 2012.  Sources of Drinking Water  The Beverage Panel recommen

In [11]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.59,198.89,9.97,287.15,10.32
std,348.86,560.44,95.75,6.19,140.11,6.3
min,-41.0,0.0,1.0,1.0,0.0,0.0
25%,260.75,762.75,134.0,4.0,190.69,5.0
50%,562.5,1232.5,215.0,10.0,308.12,10.0
75%,864.25,1605.25,271.25,14.0,401.31,15.0
max,1166.0,2308.0,429.0,32.0,577.0,28.0


### Chunking our sentences together

The concept of splitting larger pieces of text into smaller ones is often referred to as text splitting or chunking.

There is no 100% correct way.

We'll keep it simple and split into groups of 10 sentences.

Why do we do chunking?

- So our texts are easier to filter.
- So our text chunks can fit into our embedding model context window.
- So our contexts passed into the LLM can be more specific and focused.

In [12]:
# Define split size to turn groups of sentences into chunks
num_sentence_chunk_size = 10


def split_list(input_list: list,
               slice_size: int = num_sentence_chunk_size) -> list[list]:
    """Split a list into consecutive sub-lists of at most `slice_size` items.

    e.g. 20 items -> [10, 10] or 25 items -> [10, 10, 5]

    Parameters:
        input_list (list): The items to split; order is preserved.
        slice_size (int): Maximum number of items per sub-list. Defaults to
            the value of `num_sentence_chunk_size` at definition time.

    Returns:
        list[list]: The sub-lists in order; empty if `input_list` is empty.

    Raises:
        ValueError: If `slice_size` is less than 1.
    """
    # A negative step would make range() empty and silently drop every item,
    # so reject non-positive sizes up front with a clear message.
    if slice_size < 1:
        raise ValueError(f"slice_size must be a positive integer, got {slice_size}")
    return [input_list[i:i + slice_size] for i in range(0, len(input_list), slice_size)]

test_list = list(range(25))
split_list(test_list)


[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
 [20, 21, 22, 23, 24]]

In [13]:
# Group each page's sentences into chunks and record how many chunks there are
for item in tqdm(pages_and_texts):
    item.update(sentence_chunks=split_list(item["sentences"], num_sentence_chunk_size))
    item["num_chunks"] = len(item["sentence_chunks"])
    

100%|██████████| 1208/1208 [00:00<00:00, 252088.13it/s]


In [14]:
random.sample(pages_and_texts,k=1)

[{'page_number': 981,
  'page_char_count': 1474,
  'page_word_count': 246,
  'page_sentence_count_raw': 14,
  'page_token_count': 368.5,
  'text': 'health and cannot be duplicated with a pill or a regimen of  supplements. Therefore, vitamins and other dietary supplements  should never be a substitute for food. Nutrients should always be  derived from food first.  Food: The Best Medicine  Poor dietary choices and a sedentary lifestyle account for about  300–600 thousand deaths every year according to the US  Department of Health and Human Services. That number is thirteen  times higher than the deaths due to gun violence.4 The typical  North American diet is too high in saturated fat, sodium, and sugar,  and too low in fiber in the form of whole fruits, vegetables, and  whole grains to keep people healthy. With so many threats to  optimal health it is vital to address those factors that are under your  control, namely dietary and lifestyle choices. A diet that supplies  your body with t

In [15]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy,num_chunks
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.59,198.89,9.97,287.15,10.32,1.53
std,348.86,560.44,95.75,6.19,140.11,6.3,0.64
min,-41.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,260.75,762.75,134.0,4.0,190.69,5.0,1.0
50%,562.5,1232.5,215.0,10.0,308.12,10.0,1.0
75%,864.25,1605.25,271.25,14.0,401.31,15.0,2.0
max,1166.0,2308.0,429.0,32.0,577.0,28.0,3.0


### Splitting each chunk into its own item

We'd like to embed each chunk of sentences into its own numerical representation.

This will provide more granularity.

Meaning, we can dive specifically into the text sample that was used in our model.

In [20]:
import re

# Flatten the pages so that every sentence chunk becomes its own record
pages_and_chunks = []

for item in tqdm(pages_and_texts):
    for sentence_chunk in item["sentence_chunks"]:
        # Merge the chunk's sentences into one paragraph-like string,
        # replacing double spaces with single ones and trimming the ends
        joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
        # Insert a space after a period directly followed by a capital letter: ".A" => ". A"
        # NOTE(review): this also rewrites in-sentence abbreviations such as "U.S" — confirm that is acceptable
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk)

        # Chunk text plus some basic stats about it
        chunk_dict = {
            "page_number": item["page_number"],
            "sentence_chunk": joined_sentence_chunk,
            "chunk_char_count": len(joined_sentence_chunk),
            "chunk_word_count": len(joined_sentence_chunk.split(" ")),
            "chunk_token_count": len(joined_sentence_chunk) / 4,  # 1 token = ~4 chars
        }
        pages_and_chunks.append(chunk_dict)

len(pages_and_chunks)

  0%|          | 0/1208 [00:00<?, ?it/s]

100%|██████████| 1208/1208 [00:00<00:00, 24227.14it/s]


1843

In [23]:
random.sample(pages_and_chunks, k=1)

[{'page_number': 757,
  'sentence_chunk': 'Updated March 2010. Accessed November 22, 2017. MyPlate Planner | 757',
  'chunk_char_count': 69,
  'chunk_word_count': 11,
  'chunk_token_count': 17.25}]

In [24]:
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,1843.0,1843.0,1843.0,1843.0
mean,583.38,734.83,112.72,183.71
std,347.79,447.43,71.07,111.86
min,-41.0,12.0,3.0,3.0
25%,280.5,315.0,45.0,78.75
50%,586.0,746.0,114.0,186.5
75%,890.0,1118.5,173.0,279.62
max,1166.0,1831.0,297.0,457.75


### Filtering Shorter Chunks without much useful information

In [31]:
# Print five randomly sampled chunks whose token count is at most the threshold
min_token_length = 30

for row in df.loc[df["chunk_token_count"].le(min_token_length)].sample(5).iterrows():
    # iterrows() yields (index, Series) pairs; the Series holds the chunk fields
    chunk = row[1]
    print(f"Chunk token count: {chunk['chunk_token_count']} | Text: {chunk['sentence_chunk']}")

Chunk token count: 20.25 | Text: Published 2002. Accessed December 2, 2017. Pacific Based Dietary Guidelines | 761
Chunk token count: 20.75 | Text: http:/ /pressbooks.oer.hawaii.edu/ humannutrition2/?p=118   132 | The Immune System
Chunk token count: 21.75 | Text: http:/ /www.health.gov.fj/?page_id=1406. Accessed November 12, 2017. 652 | Introduction
Chunk token count: 24.25 | Text: biological, chemicals, or physical) and identify preventative 1014 | Protecting the Public Health
Chunk token count: 29.25 | Text: You can view it online here: http:/ /pressbooks.oer.hawaii.edu/ humannutrition2/?p=55   28 | Lifestyles and Nutrition


In [None]:
# Filter our DataFrame for rows with under 30 tokens
pages_and_chunks_over_min_token_len