### Imports and Directory

In [3]:
import os
from pypdf import PdfReader
import re

In [2]:
!pip install pypdf

Collecting pypdf
  Downloading pypdf-5.4.0-py3-none-any.whl.metadata (7.3 kB)
Downloading pypdf-5.4.0-py3-none-any.whl (302 kB)
Installing collected packages: pypdf
Successfully installed pypdf-5.4.0


In [4]:
# Ensure the folder that receives processed chunks is present.
# exist_ok=True makes the cell safe to re-run when the folder already exists.
os.makedirs(name='processed_chunks', exist_ok=True)

In [5]:
# Relative path of the advisory-report PDF that the extraction step reads.
pdf_path = (
    "Advisory-report-Dutch-dietary-guidelines-for-people-with-type-2-diabetes.pdf"
)

### Text Extraction

In [7]:
# Window of pages to extract, as 0-indexed positions with an exclusive end:
# pages 0-1 and pages 43 onward are skipped.
# NOTE(review): presumably these are front matter and references/appendices -- confirm.
FIRST_BODY_PAGE = 2
END_BODY_PAGE = 43

print(f"Extracting text from {pdf_path}...")
reader = PdfReader(pdf_path)

# Clamp the upper bound to the real page count so a shorter PDF is handled
# the same way the original filter handled it (no out-of-range access).
last_page = min(END_BODY_PAGE, len(reader.pages))

# Collect the per-page text first and join once, instead of growing a string
# with repeated `+=`. Every page is followed by a blank line as a separator.
page_texts = [
    reader.pages[page_num].extract_text()
    for page_num in range(FIRST_BODY_PAGE, last_page)
]
all_text = "".join(page_text + "\n\n" for page_text in page_texts)

print("Text extraction complete")

Extracting text from Advisory-report-Dutch-dietary-guidelines-for-people-with-type-2-diabetes.pdf...
Text extraction complete


### Cleaning

In [8]:
def clean_basic_text(text):
    """Remove the report's repeated header and page-indicator strings from ``text``.

    Two patterns are deleted wherever they occur, in this order:

    1. the header ``Health Council of the Netherlands | No. 2021/41e``
    2. the page indicator
       ``Dutch dietary guidelines for people with type 2 diabetes | page <n> of <m>``
       where ``<n>`` and ``<m>`` are runs of digits.

    Args:
        text: The raw extracted text.

    Returns:
        The text with both patterns removed and leading/trailing whitespace
        stripped. Whitespace left behind inside the text is not collapsed.
    """
    boilerplate_patterns = (
        r'Health Council of the Netherlands \| No\. 2021\/41e',
        r'Dutch dietary guidelines for people with type 2 diabetes \| page \d+ of \d+',
    )

    cleaned = text
    for pattern in boilerplate_patterns:
        cleaned = re.sub(pattern, '', cleaned)

    return cleaned.strip()

In [9]:
# Run the header/page-indicator cleanup over the full extracted text.
clean_text = clean_basic_text(text=all_text)

In [10]:
output_path = "processed_data/cleaned_dietary_guidelines.txt"

# The visible cells only create 'processed_chunks', so on a fresh checkout the
# 'processed_data' folder may be missing and open() would raise FileNotFoundError.
# Create the parent folder first; exist_ok=True keeps the cell re-runnable.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Persist the cleaned text as UTF-8, overwriting any previous run's file.
with open(output_path, "w", encoding="utf-8") as f:
    f.write(clean_text)