In [1]:
import os
from pdfminer.high_level import extract_text
import re
import shutil

In [2]:
# PROCESS FILES
# Extract the text of every PDF in PDF_DIR and save it as a UTF-8 .txt file
# in PROCESSED_DIR, named "PROCESSED_<pdf name without its extension>.txt".
PDF_DIR = "../data/pdfs"
PROCESSED_DIR = "../data/processed"

os.makedirs(PROCESSED_DIR, exist_ok=True)


# sorted() gives a stable processing/log order; os.listdir order is arbitrary.
for filename in sorted(os.listdir(PDF_DIR)):
    # Split off the real extension so the filter and the rename agree:
    # ".PDF" is accepted as well, and a ".pdf" in the middle of a name
    # (e.g. "a.pdf.notes.pdf") is left untouched.
    stem, ext = os.path.splitext(filename)
    if ext.lower() != ".pdf":
        continue

    pdf_path = os.path.join(PDF_DIR, filename)
    print(f"Processing: {filename}")

    text = extract_text(pdf_path)  # pdfminer does the actual extraction

    txt_filename = "PROCESSED_" + stem + ".txt"
    out_path = os.path.join(PROCESSED_DIR, txt_filename)

    with open(out_path, "w", encoding="utf-8") as f:
        f.write(text)

    print(f"Saved: {txt_filename}")


Processing: cultivating_kindness.docx.pdf
Saved: PROCESSED_cultivating_kindness.docx.txt
Processing: cureus-0015-00000040650.pdf
Saved: PROCESSED_cureus-0015-00000040650.txt
Processing: guided_meditation_befriending_yourself.docx.pdf
Saved: PROCESSED_guided_meditation_befriending_yourself.docx.txt
Processing: guided_meditation_connecting_to_community.docx.pdf
Saved: PROCESSED_guided_meditation_connecting_to_community.docx.txt
Processing: guided_meditation_connecting_with_values.docx.pdf
Saved: PROCESSED_guided_meditation_connecting_with_values.docx.txt
Processing: guided_meditation_no_agenda.docx.pdf
Saved: PROCESSED_guided_meditation_no_agenda.docx.txt
Processing: guided_meditation_self-compassion.docx.pdf
Saved: PROCESSED_guided_meditation_self-compassion.docx.txt
Processing: guided_meditation_thoughts_and_emotions.docx.pdf
Saved: PROCESSED_guided_meditation_thoughts_and_emotions.docx.txt
Processing: introduction_to_no_agenda_practice.docx.pdf
Saved: PROCESSED_introduction_to_no_agen

In [3]:
# cleaning function

def clean_text(text: str) -> str:
    """Strip transcript noise from ``text`` and return it on a single line.

    Steps, in order: form feed characters are dropped; ``(...)`` and
    ``[...]`` spans are deleted together with their content; ``XX:XX:``
    timestamps are removed; finally every run of whitespace becomes one
    space and leading/trailing whitespace is trimmed.
    """
    without_form_feeds = text.replace("\x0c", "")

    noise_patterns = (
        r"\([^)]*\)",            # parenthesised spans, content included
        r"\[[^\]]*\]",           # bracketed spans, content included
        r"\b\d{1,2}:\d{2}:\s*",  # XX:XX: timestamps plus trailing whitespace
    )
    stripped = without_form_feeds
    for pattern in noise_patterns:
        stripped = re.sub(pattern, "", stripped)

    # Newlines count as whitespace here, so the result is one line.
    return re.sub(r"\s+", " ", stripped).strip()


In [4]:
# CLEAN FILES
# Run clean_text over every processed .txt file (except the scientific
# article) and save the result in CLEANED_DIR as
# "CLEAN_<original name without extension>_cleaned.txt".
CLEANED_DIR = "../data/cleaned"

os.makedirs(CLEANED_DIR, exist_ok=True)

# we want to clean guided meditations differently than the scientific article
# so we skip it for now
SCIENCE_FILE = "PROCESSED_cureus-0015-00000040650.txt"

# loop over all text files (sorted: os.listdir order is arbitrary)
for filename in sorted(os.listdir(PROCESSED_DIR)):
    # Split off the real extension so the filter and the rename agree:
    # ".TXT" is handled too, and only the trailing extension is rewritten.
    stem, ext = os.path.splitext(filename)
    if ext.lower() != ".txt" or filename == SCIENCE_FILE:
        continue

    file_path = os.path.join(PROCESSED_DIR, filename)

    print(f"Loading: {filename}")

    # read current text file
    with open(file_path, "r", encoding="utf-8") as f:
        raw_text = f.read()

    cleaned_text = clean_text(raw_text)

    # save output
    cleaned_filename = "CLEAN_" + stem + "_cleaned.txt"
    out_path = os.path.join(CLEANED_DIR, cleaned_filename)

    with open(out_path, "w", encoding="utf-8") as f:
        f.write(cleaned_text)

    print(f"Saved cleaned version: {cleaned_filename}")

Loading: PROCESSED_cultivating_kindness.docx.txt
Saved cleaned version: CLEAN_PROCESSED_cultivating_kindness.docx_cleaned.txt
Loading: PROCESSED_guided_meditation_befriending_yourself.docx.txt
Saved cleaned version: CLEAN_PROCESSED_guided_meditation_befriending_yourself.docx_cleaned.txt
Loading: PROCESSED_guided_meditation_connecting_to_community.docx.txt
Saved cleaned version: CLEAN_PROCESSED_guided_meditation_connecting_to_community.docx_cleaned.txt
Loading: PROCESSED_guided_meditation_connecting_with_values.docx.txt
Saved cleaned version: CLEAN_PROCESSED_guided_meditation_connecting_with_values.docx_cleaned.txt
Loading: PROCESSED_guided_meditation_no_agenda.docx.txt
Saved cleaned version: CLEAN_PROCESSED_guided_meditation_no_agenda.docx_cleaned.txt
Loading: PROCESSED_guided_meditation_self-compassion.docx.txt
Saved cleaned version: CLEAN_PROCESSED_guided_meditation_self-compassion.docx_cleaned.txt
Loading: PROCESSED_guided_meditation_thoughts_and_emotions.docx.txt
Saved cleaned vers

NOTE: The files were cleaned manually afterwards and placed in the finalized folder. Only minimal changes were made, such as removing page numbers and cleaning the scientific paper.

In [8]:
# clean scientific article
def clean_science_article(text: str) -> str:
    """Return ``text`` as one line with every ``[...]`` span removed.

    Bracketed spans (citations, figure references, etc.) are deleted with
    their content, line breaks become spaces, and runs of whitespace are
    collapsed to a single space with the ends trimmed.
    """
    # Drop the bracketed spans first so the gaps they leave get collapsed below.
    without_brackets = re.sub(r"\[[^\]]*\]", "", text)

    # Join the lines with a space between them.
    joined = " ".join(without_brackets.split("\n"))

    # Squeeze whitespace runs and trim the ends.
    return re.sub(r"\s+", " ", joined).strip()

# Clean the processed scientific article and write it to the finalized folder.
SCIENCE_FILE = "../data/processed/PROCESSED_cureus-0015-00000040650.txt"
OUTPUT_FILE = "../data/finalized/cureus_cleaned.txt"

# Create the output folder first; open(..., "w") does not create missing
# parent directories and would raise FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)

with open(SCIENCE_FILE, "r", encoding="utf-8") as f:
    raw_text = f.read()

cleaned_text = clean_science_article(raw_text)

with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    f.write(cleaned_text)

print("Saved cleaned scientific article.")


Saved cleaned scientific article.


In [None]:
# rename files after manually cleaning and finalizing
#
# Copies every "CLEAN_PROCESSED_<name>.txt" in CLEANED_DIR to "<name>.txt"
# in the same directory; the prefixed original is kept.
# NOTE(review): the copy lands in CLEANED_DIR, not in a separate finalized
# folder - confirm that this is the intended destination.

# Prefix accumulated by the processing ("PROCESSED_") and cleaning ("CLEAN_") steps.
RENAME_PREFIX = "CLEAN_PROCESSED_"

os.makedirs(CLEANED_DIR, exist_ok=True)

# loop over all text files
for filename in os.listdir(CLEANED_DIR):
    if not filename.lower().endswith(".txt"):
        continue

    # Only strip the prefix when it is really there. The copies are written
    # into this same directory, so on a re-run the already-renamed files are
    # listed too; slicing them blindly would chop off part of their names.
    if not filename.startswith(RENAME_PREFIX):
        continue

    old_path = os.path.join(CLEANED_DIR, filename)

    new_name = filename[len(RENAME_PREFIX):]

    new_path = os.path.join(CLEANED_DIR, new_name)

    # copy the file under its new name, next to the original
    shutil.copy(old_path, new_path)

    print(f"Renamed and saved: {new_name}")


Renamed and saved: cultivating_kindness.docx_cleaned.txt
Renamed and saved: guided_meditation_befriending_yourself.docx_cleaned.txt
Renamed and saved: guided_meditation_connecting_to_community.docx_cleaned.txt
Renamed and saved: guided_meditation_connecting_with_values.docx_cleaned.txt
Renamed and saved: guided_meditation_no_agenda.docx_cleaned.txt
Renamed and saved: guided_meditation_self-compassion.docx_cleaned.txt
Renamed and saved: guided_meditation_thoughts_and_emotions.docx_cleaned.txt
Renamed and saved: introduction_to_no_agenda_practice.docx_cleaned.txt
Renamed and saved: just_like_me_female_voice.docx_cleaned.txt
Renamed and saved: week_five_just_like_me_male_voice.docx_cleaned.txt
Renamed and saved: week_four_thoughts_and_emotions.docx_cleaned.txt
Renamed and saved: week_four_working_with_thoughts.docx_1_cleaned.txt
Renamed and saved: week_one_awareness_of_breath_and_body_8_minutes.docx_cleaned.txt
Renamed and saved: week_one_grounding_meditation_5_minutes.docx_cleaned.txt
Re