In [6]:
import os 

data_folder_path = "../data/"
file_name = "human-nutrition-text.pdf"
file_path = os.path.join(data_folder_path, file_name)

In [7]:
import pymupdf
from tqdm.auto import tqdm

pdf_document = pymupdf.open(file_path)

for page_no, page in tqdm(enumerate(pdf_document[:5])):
    print(page.get_text())


0it [00:00, ?it/s]

Human Nutrition: 2020 Edition 


Human Nutrition: 2020 
Edition 
UNIVERSITY OF HAWAI‘I AT MĀNOA 
FOOD SCIENCE AND HUMAN 
NUTRITION PROGRAM 
ALAN TITCHENAL, SKYLAR HARA, 
NOEMI ARCEO CAACBAY, WILLIAM 
MEINKE-LAU, YA-YUN YANG, MARIE 
KAINOA FIALKOWSKI REVILLA, 
JENNIFER DRAPER, GEMADY 
LANGFELDER, CHERYL GIBBY, CHYNA 
NICOLE CHUN, AND ALLISON 
CALABRESE 

Human Nutrition: 2020 Edition by University of Hawai‘i at Mānoa Food Science and 
Human Nutrition Program is licensed under a Creative Commons Attribution 4.0 
International License, except where otherwise noted. 

Contents 
Preface 
University of Hawai‘i at Mānoa Food Science and 
Human Nutrition Program and Human Nutrition 
Program 
xxv 
About the Contributors 
University of Hawai‘i at Mānoa Food Science and 
Human Nutrition Program and Human Nutrition 
Program 
xxvi 
Acknowledgements 
University of Hawai‘i at Mānoa Food Science and 
Human Nutrition Program and Human Nutrition 
Program 
xl 
Part I. Chapter 1. Basic Concepts in Nutriti

In [8]:
import pymupdf
from tqdm.auto import tqdm

def text_formatter(text: str) -> str:
    cleaned_text = text.replace("\n", " ").strip()
    return cleaned_text

def read_pdf(pdf_path: str) -> list[dict]:
    document = pymupdf.open(pdf_path)
    pages = []
    for page_number, page in tqdm(enumerate(document)):
        text = page.get_text()
        text = text_formatter(text)
        pages.append(
            {
                "page_number": page_number - 41, #actual pdf content starts from page 41
                "text": text,
                "page_char_count": len(text),
                "page_word_count": len(text.split(" ")),
                "sentence_count": len(text.split(". ")),
                "page_token_count": len(text)/4
            }
        )

    return pages

pages = read_pdf(file_path)

0it [00:00, ?it/s]

In [9]:
import pandas as pd

df = pd.DataFrame(pages)
df.head()

Unnamed: 0,page_number,text,page_char_count,page_word_count,sentence_count,page_token_count
0,-41,Human Nutrition: 2020 Edition,29,4,1,7.25
1,-40,,0,1,1,0.0
2,-39,Human Nutrition: 2020 Edition UNIVERSITY OF ...,320,54,1,80.0
3,-38,Human Nutrition: 2020 Edition by University of...,212,32,1,53.0
4,-37,Contents Preface University of Hawai‘i at Mā...,797,145,2,199.25


## Text Processing

In [10]:
from spacy.lang.en import English

nlp = English()

nlp.add_pipe("sentencizer")
doc = nlp("This is a sentence. This is another sentence. This is the last sentence.")
for sent in doc.sents:
    print(sent.text)

This is a sentence.
This is another sentence.
This is the last sentence.


In [11]:
for item in tqdm(pages):
    item["sentences"] = list(nlp(item["text"]).sents)
    item["sentences"] = [str(sent) for sent in item["sentences"]]
    item["page_sentence_count"] = len(item["sentences"])


  0%|          | 0/1208 [00:00<?, ?it/s]

### split text

In [12]:

def split_list(input_list: list[str], slice_size: int) -> list[list[str]]:
    return [input_list[i:i+slice_size] for i in range(0, len(input_list), slice_size)]


for item in tqdm(pages):
    item["sentence_chunks"] = split_list(item["sentences"], 10)
    item["num_chunks"] = len(item["sentence_chunks"])


  0%|          | 0/1208 [00:00<?, ?it/s]

In [13]:
df = pd.DataFrame(pages)
df.head()

Unnamed: 0,page_number,text,page_char_count,page_word_count,sentence_count,page_token_count,sentences,page_sentence_count,sentence_chunks,num_chunks
0,-41,Human Nutrition: 2020 Edition,29,4,1,7.25,[Human Nutrition: 2020 Edition],1,[[Human Nutrition: 2020 Edition]],1
1,-40,,0,1,1,0.0,[],0,[],0
2,-39,Human Nutrition: 2020 Edition UNIVERSITY OF ...,320,54,1,80.0,[Human Nutrition: 2020 Edition UNIVERSITY OF...,1,[[Human Nutrition: 2020 Edition UNIVERSITY O...,1
3,-38,Human Nutrition: 2020 Edition by University of...,212,32,1,53.0,[Human Nutrition: 2020 Edition by University o...,1,[[Human Nutrition: 2020 Edition by University ...,1
4,-37,Contents Preface University of Hawai‘i at Mā...,797,145,2,199.25,[Contents Preface University of Hawai‘i at M...,2,[[Contents Preface University of Hawai‘i at ...,1


In [15]:
import re

chunks = []
for item in tqdm(pages):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]
        chunk_dict["chunk"] = " ".join(sentence_chunk).strip()
        chunk_dict["chunk_char_count"] = len(chunk_dict["chunk"])
        chunk_dict["chunk_word_count"] = len(chunk_dict["chunk"].split(" "))
        chunk_dict["chunk_token_count"] = len(chunk_dict["chunk"])/4

        chunks.append(chunk_dict)


  0%|          | 0/1208 [00:00<?, ?it/s]

In [19]:
chunks[560]

{'page_number': 345,
 'chunk': 'Lipids and Disease  UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN  NUTRITION PROGRAM AND HUMAN NUTRITION PROGRAM  Because heart disease, cancer, and stroke are the three leading  causes of death in the United States, it is critical to address dietary  and lifestyle choices that will ultimately decrease risk factors for  these diseases. According to the US Department of Health and  Human Services (HHS), the following risk factors are controllable:  high blood pressure, high cholesterol, cigarette smoking, diabetes,  poor diet, physical inactivity, being overweight, and obesity.  In light of that, we present the following informational tips to help  you define, evaluate, and implement healthy dietary choices to last  a lifetime. The amount and the type of fat that composes a person’s  dietary profile will have a profound effect upon the way fat and  cholesterol is metabolized in the body.  Watch Out for Saturated Fat and  Cholesterol  In proper amo

In [20]:
df = pd.DataFrame(chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,1843.0,1843.0,1843.0,1843.0
mean,583.38,752.84,130.73,188.21
std,347.79,456.18,80.4,114.05
min,-41.0,14.0,4.0,3.5
25%,280.5,323.0,54.0,80.75
50%,586.0,766.0,134.0,191.5
75%,890.0,1140.0,198.0,285.0
max,1166.0,1871.0,413.0,467.75


In [21]:
#remove unnecessary chunks

min_token_length = 20

for row in df[df["chunk_token_count"] < min_token_length].iterrows():
    df.drop(row[0], inplace=True)

chunks = df.to_dict(orient="records")


In [23]:
len(chunks)

1755