In [1]:
import os
import requests

# Local destination of the textbook PDF. os.path.join keeps the path portable:
# a literal backslash is only treated as a directory separator on Windows.
pdf_path = os.path.join("data", "human-nutrition-text.pdf")

if not os.path.exists(pdf_path):
    print("File doesn't exist, downloading...")

    url = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"

    filename = pdf_path

    # open(..., "wb") does not create missing directories, so make sure the
    # parent folder exists before the download is written.
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Without an explicit timeout, requests can wait forever on a stalled server.
    response = requests.get(url, timeout=60)

    if response.status_code == 200:
        # The whole response body is held in memory and written in one go.
        with open(filename, "wb") as file:
            file.write(response.content)
        print(f"The file has been downloaded and saved as {filename}")
    else:
        print(f"Failed to download the file. Status code: {response.status_code}")
else:
    print(f"File {pdf_path} exists.")

File doesn't exist, downloading...
The file has been downloaded and saved as data\human-nutrition-text.pdf


In [2]:
import fitz 
from tqdm.auto import tqdm

def text_formatter(text: str) -> str:
    """Flatten a block of text onto a single line.

    Each newline character is swapped for one space, then leading and
    trailing whitespace is trimmed from the result.
    """
    single_line = " ".join(text.split("\n"))
    return single_line.strip()

def open_and_read_pdf(pdf_path: str) -> list[dict]:
    """Read a PDF page by page and collect the text plus simple size statistics.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        One dict per page, in document order, with the keys:
        "page_number" (1-based), "page_char_count", "page_word_count",
        "page_sentence_count_raw", "page_token_count" and "text"
        (the page text after `text_formatter`).
    """
    pages_and_texts = []
    # The context manager closes the document even if extraction raises.
    with fitz.open(pdf_path) as doc:
        # enumerate() hides the length from tqdm, so pass the page count explicitly.
        for page_number, page in tqdm(enumerate(doc, start=1), total=len(doc)):
            text = text_formatter(page.get_text())
            pages_and_texts.append({
                "page_number": page_number,
                "page_char_count": len(text),
                # Splitting on a single space is a rough heuristic: "".split(" ")
                # is [""], so an empty page still counts as 1 word / 1 sentence.
                "page_word_count": len(text.split(" ")),
                "page_sentence_count_raw": len(text.split(". ")),
                # Rough estimate: one token per 4 characters.
                "page_token_count": len(text) / 4,
                "text": text,
            })
    return pages_and_texts

# Extract every page of the PDF once; the cells below reuse this list.
pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)
# Preview the first two page records.
pages_and_texts[:2]

  from .autonotebook import tqdm as notebook_tqdm
1208it [00:01, 979.00it/s]


[{'page_number': 1,
  'page_char_count': 29,
  'page_word_count': 4,
  'page_sentence_count_raw': 1,
  'page_token_count': 7.25,
  'text': 'Human Nutrition: 2020 Edition'},
 {'page_number': 2,
  'page_char_count': 0,
  'page_word_count': 1,
  'page_sentence_count_raw': 1,
  'page_token_count': 0.0,
  'text': ''}]

In [7]:
import random

# Spot-check three page records. A dedicated, seeded generator returns the same
# pages on every run and leaves the global `random` state untouched.
random.Random(42).sample(pages_and_texts, k=3)

[{'page_number': 964,
  'page_char_count': 1596,
  'page_word_count': 281,
  'page_sentence_count_raw': 13,
  'page_token_count': 399.0,
  'text': 'should be lean, and healthy fats, such as omega-3 fatty acids, are  part of any good diet.  Micronutrients  An increase in certain micronutrients can help maintain health  during this life stage. The recommendations for calcium increase to  1,200 milligrams per day for both men and women to slow bone loss.  Also to help protect bones, vitamin D recommendations increase  to 10–15 micrograms per day for men and women. Vitamin B6  recommendations rise to 1.7 milligrams per day for older men and  1.5 milligrams per day for older women to help lower levels of  homocysteine and protect against cardiovascular disease. As adults  age, the production of stomach acid can decrease and lead to an  overgrowth of bacteria in the small intestine. This can affect the  absorption of vitamin B12\xa0and cause a deficiency. As a result, older  adults need more

In [8]:
import pandas as pd

# One row per page record; the dict keys become the columns.
df = pd.DataFrame(pages_and_texts)
# Show the first rows only rather than the whole frame.
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,1,29,4,1,7.25,Human Nutrition: 2020 Edition
1,2,0,1,1,0.0,
2,3,320,54,1,80.0,Human Nutrition: 2020 Edition UNIVERSITY OF ...
3,4,212,32,1,53.0,Human Nutrition: 2020 Edition by University of...
4,5,797,145,2,199.25,Contents Preface University of Hawai‘i at Mā...


In [9]:
# Summary statistics of the numeric per-page columns, rounded to 2 decimals.
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,1208.0,1208.0,1208.0,1208.0,1208.0
mean,604.5,1148.0,198.3,9.97,287.0
std,348.86,560.38,95.76,6.19,140.1
min,1.0,0.0,1.0,1.0,0.0
25%,302.75,762.0,134.0,4.0,190.5
50%,604.5,1231.5,214.5,10.0,307.88
75%,906.25,1603.5,271.0,14.0,400.88
max,1208.0,2308.0,429.0,32.0,577.0
