In [42]:
import os
import requests

pdf_path = r"data\human-nutrition-text.pdf"

if not os.path.exists(pdf_path):
  print("File doesn't exist, downloading...")

  url = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"

  filename = pdf_path
  response = requests.get(url)

  if response.status_code == 200:
      with open(filename, "wb") as file:
          file.write(response.content)
      print(f"The file has been downloaded and saved as {filename}")
  else:
      print(f"Failed to download the file. Status code: {response.status_code}")
else:
  print(f"File {pdf_path} exists.")

File data\human-nutrition-text.pdf exists.


In [43]:
import fitz 
from tqdm.auto import tqdm

def text_formatter(text: str) -> str:
    """Performs minor formatting on text."""
    cleaned_text = text.replace("\n", " ").strip()
    return cleaned_text

def open_and_read_pdf(pdf_path: str) -> list[dict]:
    doc = fitz.open(pdf_path)
    pages_and_texts = []
    for page_number, page in tqdm(enumerate(doc)):
        text = page.get_text()
        text = text_formatter(text)
        pages_and_texts.append({"page_number": page_number + 1,
                                "page_char_count": len(text),
                                "page_word_count": len(text.split(" ")),
                                "page_sentence_count_raw": len(text.split(". ")),
                                "page_token_count": len(text) / 4,
                                "text": text})
    return pages_and_texts

pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)
pages_and_texts[:2]

0it [00:00, ?it/s]

1208it [00:04, 296.39it/s]


[{'page_number': 1,
  'page_char_count': 29,
  'page_word_count': 4,
  'page_sentence_count_raw': 1,
  'page_token_count': 7.25,
  'text': 'Human Nutrition: 2020 Edition'},
 {'page_number': 2,
  'page_char_count': 0,
  'page_word_count': 1,
  'page_sentence_count_raw': 1,
  'page_token_count': 0.0,
  'text': ''}]

In [44]:
import random

random.sample(pages_and_texts, k=3)

[{'page_number': 456,
  'page_char_count': 1808,
  'page_word_count': 333,
  'page_sentence_count_raw': 14,
  'page_token_count': 452.0,
  'text': 'Additionally, a person should consume 8 ounces of cooked seafood  every week (typically as two 4-ounce servings) to assure they are  getting the healthy omega-3 fatty acids that have been linked to a  lower risk for heart disease. Another tip is choosing to eat dry beans,  peas, or soy products as a main dish. Some of the menu choices  include chili with kidney and pinto beans, hummus on pita bread,  and black bean enchiladas. You could also enjoy nuts in a variety of  ways. You can put them on a salad, in a stir-fry, or use them as a  topping for steamed vegetables in place of meat or cheese. If you do  not eat meat, the USDA has much more information on how to get  all the protein you need from a plant-based diet. When choosing the  best protein-rich foods to eat, pay attention to the whole nutrient  package and remember to select from a 

In [45]:
import pandas as pd

df = pd.DataFrame(pages_and_texts)
df.head()  

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,1,29,4,1,7.25,Human Nutrition: 2020 Edition
1,2,0,1,1,0.0,
2,3,320,54,1,80.0,Human Nutrition: 2020 Edition UNIVERSITY OF ...
3,4,212,32,1,53.0,Human Nutrition: 2020 Edition by University of...
4,5,797,145,2,199.25,Contents Preface University of Hawai‘i at Mā...


In [46]:
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,1208.0,1208.0,1208.0,1208.0,1208.0
mean,604.5,1148.0,198.3,9.97,287.0
std,348.86,560.38,95.76,6.19,140.1
min,1.0,0.0,1.0,1.0,0.0
25%,302.75,762.0,134.0,4.0,190.5
50%,604.5,1231.5,214.5,10.0,307.88
75%,906.25,1603.5,271.0,14.0,400.88
max,1208.0,2308.0,429.0,32.0,577.0


## Further text processing

In [47]:
from spacy.lang.en import English

nlp = English()
nlp.add_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x74f22204dcc0>

In [48]:
for item in tqdm(pages_and_texts):
    item["sentences"] = list(nlp(item["text"]).sents)
    item["sentences"] = [str(sentence) for sentence in item["sentences"]]
    item["page_sentence_count"] = len(item["sentences"])

100%|██████████| 1208/1208 [00:04<00:00, 261.51it/s]


In [49]:
random.sample(pages_and_texts, k=1)

[{'page_number': 654,
  'page_char_count': 993,
  'page_word_count': 158,
  'page_sentence_count_raw': 5,
  'page_token_count': 248.25,
  'text': 'gastrointestinal disorders and diseases, such as Crohn’s disease and  kidney disease, as well as the aging process, impair mineral  absorption, putting people with malabsorption conditions and the  elderly at higher risk for mineral deficiencies.  Learning Activities  Technology Note: The second edition of the Human  Nutrition Open Educational Resource (OER) textbook  features interactive learning activities.\xa0 These activities are  available in the web-based textbook and not available in the  downloadable versions (EPUB, Digital PDF, Print_PDF, or  Open Document).  Learning activities may be used across various mobile  devices, however, for the best user experience it is strongly  recommended that users complete these activities using a  desktop or laptop computer and in Google Chrome.  \xa0 An interactive or media element has been  exclu

In [50]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,604.5,1148.0,198.3,9.97,287.0,10.32
std,348.86,560.38,95.76,6.19,140.1,6.3
min,1.0,0.0,1.0,1.0,0.0,0.0
25%,302.75,762.0,134.0,4.0,190.5,5.0
50%,604.5,1231.5,214.5,10.0,307.88,10.0
75%,906.25,1603.5,271.0,14.0,400.88,15.0
max,1208.0,2308.0,429.0,32.0,577.0,28.0


In [51]:
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text,sentences,page_sentence_count
0,1,29,4,1,7.25,Human Nutrition: 2020 Edition,[Human Nutrition: 2020 Edition],1
1,2,0,1,1,0.0,,[],0
2,3,320,54,1,80.0,Human Nutrition: 2020 Edition UNIVERSITY OF ...,[Human Nutrition: 2020 Edition UNIVERSITY OF...,1
3,4,212,32,1,53.0,Human Nutrition: 2020 Edition by University of...,[Human Nutrition: 2020 Edition by University o...,1
4,5,797,145,2,199.25,Contents Preface University of Hawai‘i at Mā...,[Contents Preface University of Hawai‘i at M...,2


In [52]:
num_sentence_chunk_size = 10

def split_lists(input_list: list[str], slice_size: int=num_sentence_chunk_size) -> list[list[str]]:
    new_list = []
    for item in range(0, len(input_list), slice_size):
        new_list.append(input_list[item:item + slice_size])
        
    return new_list

In [53]:
test_list = list(range(25))
split_lists(input_list=test_list)

[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
 [20, 21, 22, 23, 24]]

In [54]:
for item in tqdm(pages_and_texts):
    item["sentence_chunks"] = split_lists(input_list=item["sentences"])
    item["sentence_chunk_count"] = len(item["sentence_chunks"])

100%|██████████| 1208/1208 [00:00<00:00, 423745.02it/s]


In [55]:
random.sample(pages_and_texts, k=1)

[{'page_number': 104,
  'page_char_count': 1693,
  'page_word_count': 290,
  'page_sentence_count_raw': 13,
  'page_token_count': 423.25,
  'text': 'Basic Biology, Anatomy, and  Physiology  UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN  NUTRITION PROGRAM AND HUMAN NUTRITION PROGRAM  The Basic Structural and Functional Unit of Life:  The Cell  What distinguishes a living\xa0organism from an inanimate object? A  living organism conducts self-sustaining biological processes. A cell  is the smallest and most basic form of life.  The cell theory incorporates three principles:  Cells are the most basic building units of life.\xa0All living things  are composed of cells. New cells are made from preexisting cells,  which divide in two. Who you are has been determined because  of two cells that came together inside your mother’s womb. The  two cells containing all of your genetic information (DNA) united to  begin making new life. Cells divided and differentiated into other  cells with 