In [4]:
# What is RAG

In [5]:
# Download PDF file
import os
import requests

# Get PDF document
pdf_path = "human-nutrition-text.pdf"

# Download PDF if it doesn't already exist
if not os.path.exists(pdf_path):
  print("File doesn't exist, downloading...")

  # The URL of the PDF you want to download
  url = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"

  # The local filename to save the downloaded file
  filename = pdf_path

  # Send a GET request to the URL
  response = requests.get(url)

  # Check if the request was successful
  if response.status_code == 200:
      # Open a file in binary write mode and save the content to it
      with open(filename, "wb") as file:
          file.write(response.content)
      print(f"The file has been downloaded and saved as {filename}")
  else:
      print(f"Failed to download the file. Status code: {response.status_code}")
else:
  print(f"File {pdf_path} exists.")

File human-nutrition-text.pdf exists.


Opening the PDF file

In [6]:
import fitz
from tqdm.auto import tqdm

def text_formatter(text: str) -> str:
    """Performs minor formatting on text."""
    cleaned_text = text.replace('\n', " ").strip()

# Potentially more text formatting functions can go here...
    return cleaned_text

def open_and_read_pdf(pdf_path: str) -> list[dict]:
    """
    Opens a PDF file, reads its text content page by page, and collects statistics.

    Parameters:
        pdf_path (str): The file path to the PDF document to be opened and read.

    Returns:
        list[dict]: A list of dictionaries, each containing the page number
        (adjusted), character count, word count, sentence count, token count, and the extracted text
        for each page.
    """
    doc = fitz.open(pdf_path)  # open a document
    pages_and_texts = []
    for page_number, page in tqdm(enumerate(doc)):  # iterate the document pages
        text = page.get_text()  # get plain text encoded as UTF-8
        text = text_formatter(text)
        pages_and_texts.append({"page_number": page_number - 41,  # adjust page numbers since our PDF starts on page 42
                                "page_char_count": len(text),
                                "page_word_count": len(text.split(" ")),
                                "page_sentence_count_raw": len(text.split(". ")),
                                "page_token_count": len(text) / 4,  # 1 token = ~4 chars, see: https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
                                "text": text})
    return pages_and_texts

pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)
pages_and_texts[:2]

  from .autonotebook import tqdm as notebook_tqdm
1208it [00:01, 700.34it/s]


[{'page_number': -41,
  'page_char_count': 29,
  'page_word_count': 4,
  'page_sentence_count_raw': 1,
  'page_token_count': 7.25,
  'text': 'Human Nutrition: 2020 Edition'},
 {'page_number': -40,
  'page_char_count': 0,
  'page_word_count': 1,
  'page_sentence_count_raw': 1,
  'page_token_count': 0.0,
  'text': ''}]

In [7]:
import random
random.sample(pages_and_texts, k=3)

[{'page_number': 540,
  'page_char_count': 1421,
  'page_word_count': 232,
  'page_sentence_count_raw': 11,
  'page_token_count': 355.25,
  'text': 'However, scientists do believe vitamin E in combination with other  antioxidants such as zinc and copper may slow the progression of  macular degeneration in people with early-stage disease.  Dementia  The brain’s high glucose consumption makes it more vulnerable  than other organs to oxidative stress. Oxidative stress has been  implicated as a major contributing factor to dementia and  Alzheimer’s disease. Some studies suggest vitamin E supplements  delay the progression of Alzheimer’s disease and cognitive decline,  but again, not all of the studies confirm the relationship. A recent  study with over five thousand participants published in the July 2010  issue of the Archives of Neurology demonstrated that people with  the highest intakes of dietary vitamin E were 25 percent less likely  to develop dementia than those with the lowest int

In [8]:
import pandas as pd

df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,-41,29,4,1,7.25,Human Nutrition: 2020 Edition
1,-40,0,1,1,0.0,
2,-39,320,54,1,80.0,Human Nutrition: 2020 Edition UNIVERSITY OF ...
3,-38,212,32,1,53.0,Human Nutrition: 2020 Edition by University of...
4,-37,797,145,2,199.25,Contents Preface University of Hawai‘i at Mā...


In [9]:
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.59,198.89,9.97,287.15
std,348.86,560.44,95.75,6.19,140.11
min,-41.0,0.0,1.0,1.0,0.0
25%,260.75,762.75,134.0,4.0,190.69
50%,562.5,1232.5,215.0,10.0,308.12
75%,864.25,1605.25,271.25,14.0,401.31
max,1166.0,2308.0,429.0,32.0,577.0


Token count is important here because:
Embedding models and  LLMs dont deal with infinite tokens.
For Example: An embedding model might have been trained to embed sequences of 384 tokens into numerical space.
LLMs also cannot accept infinite tokens in their context windows.

### Further Text Processing
Splitting pages into sentences.
There are two ways to do this:
1) Split sentences based on `"."` characters.
2) Using NLP libraries like spaCy and nltk.

In [10]:
from spacy.lang.en import English
nlp = English()

# Add a sentencizer pipeline
nlp.add_pipe("sentencizer")

# Create document instance as an example
doc = nlp("This is a sentence. This is another sentence. I like elephants")
assert len(list(doc.sents)) == 3

# print out our sentences split
list(doc.sents)

[This is a sentence., This is another sentence., I like elephants]

In [11]:
for item in tqdm(pages_and_texts):
    item["sentences"] = list(nlp(item["text"]).sents)

    # Make sure all sentences are strings (the default type is a spaCy datatype)
    item["sentences"] = [str(sentence) for sentence in item["sentences"]]

    # count the sentences
    item["page_sentence_count_spacy"] = len(item["sentences"])

100%|██████████| 1208/1208 [00:02<00:00, 556.70it/s]


In [14]:
random.sample(pages_and_texts, k=1)

[{'page_number': 1004,
  'page_char_count': 1375,
  'page_word_count': 250,
  'page_sentence_count_raw': 15,
  'page_token_count': 343.75,
  'text': 'Moldy  nectarines by  Roger  McLassus  1951 / CC  BY-SA 3.0  Warm, humid, or damp conditions encourage mold to grow on food.  Molds are microscopic fungi that live on animals and plants. No  one knows how many species of fungi exist, but estimates range  from ten- to three-hundred thousand. Unlike single-celled bacteria,  molds are multicellular, and under a microscope look like slender  mushrooms. They have stalks with spores that form at the ends. The  spores give molds their color and can be transported by air, water,  or insects. Spores also enable mold to reproduce. Additionally,  molds have root-like threads that may grow deep into food and  be difficult to see. The threads are very deep when a food shows  heavy mold growth. Foods that contain mold may also have bacteria  growing alongside it.  Some molds, like the kind found in blu

In [15]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.59,198.89,9.97,287.15,10.32
std,348.86,560.44,95.75,6.19,140.11,6.3
min,-41.0,0.0,1.0,1.0,0.0,0.0
25%,260.75,762.75,134.0,4.0,190.69,5.0
50%,562.5,1232.5,215.0,10.0,308.12,10.0
75%,864.25,1605.25,271.25,14.0,401.31,15.0
max,1166.0,2308.0,429.0,32.0,577.0,28.0


### Chunking out sentences together

The concept of splitting larger pieces of text into smaller ones is often referred to as text splitting or chunking.

There is no 100% correct way.

We'll keep it simple and split into groups of 10 sentences.

Why we do chunking?

- So our texts are easier to filter.
- So our text chunks can fit into our embedding model context window.
- So our contexts passed into the LLM can be more specific and focused.

In [16]:
# Define split size to turn groups of sentences into chunks
num_sentence_chunk_size = 10

# Create a function to split lists of text recursively into chunk size.
# e.g. [20] -> [10, 10] or [25] -> [10, 10, 5]

def split_list(input_list: list[str],
               slice_size: int=num_sentence_chunk_size) -> list[list[str]]:
    return [input_list[i:i+slice_size] for i in range(0, len(input_list), slice_size)]

test_list = list(range(25))
split_list(test_list)


[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
 [20, 21, 22, 23, 24]]