In [100]:
import re
import os
import pdfplumber
from bs4 import BeautifulSoup
import csv

In [101]:
def extract_html_text(html_path, selectors=None):
    """Extract readable text from an HTML document.

    Args:
        html_path: The HTML markup to parse. Despite the name, this value is
            handed directly to ``BeautifulSoup`` as its markup argument; the
            function itself does not open any file.
        selectors: Optional iterable of class filters. Each entry is passed to
            ``soup.find_all(class_=...)``, so entries are class names or
            compiled regular expressions, not CSS selector expressions.

    Returns:
        str: The text of every matching element, separated by blank lines.
        If no selector yields any text, the text of the whole document is
        returned instead. In both cases runs of blank lines are collapsed to
        a single blank line and runs of spaces/tabs to a single space.
    """
    soup = BeautifulSoup(html_path, "html.parser")

    # If no selectors provided, fallback to whole body text
    selectors = selectors or []

    texts = []
    # An element can match several selectors (e.g. it carries two of the listed
    # classes). Track identity so its text is emitted only once.
    # Note: a matched element nested inside another matched element is still
    # emitted separately.
    seen_elements = set()
    for sel in selectors:
        for panel in soup.find_all(class_=sel):
            if id(panel) in seen_elements:
                continue
            seen_elements.add(id(panel))

            panel_text = panel.get_text(separator="\n", strip=True)
            # Skip empty panels so that a page whose only matches are empty
            # containers still falls through to the whole-page fallback.
            if panel_text:
                texts.append(panel_text)

    # If nothing found from selectors, fallback to entire page text
    if not texts:
        texts = [soup.get_text(separator="\n", strip=True)]

    full_text = "\n\n".join(texts)

    # Clean extra whitespace
    cleaned_text = re.sub(r'\n\s*\n+', '\n\n', full_text)  # multiple blank lines → 2 newlines
    cleaned_text = re.sub(r'[ \t]+', ' ', cleaned_text)    # multiple spaces → 1 space

    return cleaned_text

In [102]:
def extract_pdf_text_and_tables(pdf_path):
    """Return the text of each page of a PDF as a list of strings.

    Args:
        pdf_path: Path of the PDF, passed to ``pdfplumber.open``.

    Returns:
        list[str]: One entry per page, in page order. Each entry is the page
        text prefixed with two newlines; a page for which ``extract_text()``
        returns a falsy value contributes just the two newlines.

    Note:
        Despite the function name, only ``extract_text()`` is called; no
        separate table extraction is performed here.
    """
    with pdfplumber.open(pdf_path) as document:
        # Normalise pages without extractable text to an empty string.
        page_texts = [page.extract_text() or "" for page in document.pages]

    return [f"\n\n{text}" for text in page_texts]


In [103]:
# Common class filters to try for extracting main content.
# Each entry is used as soup.find_all(class_=entry) in extract_html_text, so
# entries must be class names or compiled regexes, NOT CSS selector syntax.
# (The former "div.content" entry was CSS syntax and could only match a class
# literally named "div.content"; the plain "content" entry already covers it.)
selectors = [
    "accordion__panel",
    "main-content",
    "article-body",
    "content",
    "post-content",
    "entry-content",
    "page-content",
    "body-content",
    "text-content",
    "article",
    "post",
    # Regex filter: intended to catch class values that start with 'grid column'
    re.compile(r"^grid column"),
    "main",
]

In [104]:
# Paths
# Input directories scanned by the processing loop (HTML pages, then PDFs).
raw_folders = [
    "../cmu_oie_scrape/html",
    "../cmu_oie_scrape/pdfs",
]
# Destination directory for the cleaned .txt files.
output_folder = "../cmu_oie_scrape/cleaned"

# Create the destination up front; exist_ok makes re-running the cell harmless.
os.makedirs(output_folder, exist_ok=True)

In [105]:
# Convert every raw .html / .pdf file into a cleaned .txt file in output_folder,
# collecting failures so they can be written to a CSV log at the end.
error_list = []  # one {"file": ..., "error": ...} dict per failed input

# Loop through both HTML and PDF folders
for raw_folder in raw_folders:
    try:
        # os.listdir returns entries in arbitrary order; sort so the processing
        # order and the printed log are the same on every run.
        filenames = sorted(os.listdir(raw_folder))
    except OSError as e:
        # A missing or unreadable input folder is recorded like any other
        # failure instead of aborting the run before the error log is written.
        print(f"Error processing {raw_folder}: {e}")
        error_list.append({"file": raw_folder, "error": str(e)})
        continue

    for filename in filenames:
        file_path = os.path.join(raw_folder, filename)
        name, ext = os.path.splitext(filename)
        ext = ext.lower()

        # Only .html and .pdf inputs are handled; anything else is skipped silently.
        if ext == ".html":
            prefix, label = "html_", "HTML"
        elif ext == ".pdf":
            prefix, label = "pdf_", "PDF"
        else:
            continue

        output_path = os.path.join(output_folder, f"{prefix}{name.replace(' ', '_').lower()}.txt")

        try:
            if ext == ".html":
                with open(file_path, "r", encoding="utf-8") as f:
                    html = f.read()
                cleaned_text = extract_html_text(html, selectors)
            else:
                # The PDF extractor returns one string per page; join them into
                # a single document before writing.
                cleaned_text = "".join(extract_pdf_text_and_tables(file_path))

            # The output file is opened only after extraction succeeded, so a
            # failing input never leaves a truncated output file behind.
            with open(output_path, "w", encoding="utf-8") as out_file:
                out_file.write(cleaned_text)

            print(f"Saved cleaned {label}: {output_path}")

        except Exception as e:
            # Best effort: record the failure and carry on with the next file.
            print(f"Error processing {file_path}: {e}")
            error_list.append({"file": file_path, "error": str(e)})

# Save error list to CSV
if error_list:
    error_csv_path = os.path.join(output_folder, "error_log.csv")
    with open(error_csv_path, "w", newline="", encoding="utf-8") as csvfile:
        fieldnames = ["file", "error"]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(error_list)

    print(f"\nErrors logged in: {error_csv_path}")
else:
    print("\nAll files processed successfully — no errors.")

Saved cleaned HTML: ../cmu_oie_scrape/cleaned\html_oie_employment_f1-students_curricular-practical-training.txt
Saved cleaned HTML: ../cmu_oie_scrape/cleaned\html_oie_employment_f1-students_index.txt
Saved cleaned HTML: ../cmu_oie_scrape/cleaned\html_oie_employment_f1-students_on-campus-employment.txt
Saved cleaned HTML: ../cmu_oie_scrape/cleaned\html_oie_employment_f1-students_opt-stem-opt-extension_h1b-cap-gap-extension.txt
Saved cleaned HTML: ../cmu_oie_scrape/cleaned\html_oie_employment_f1-students_opt-stem-opt-extension_i765-instructions.txt
Saved cleaned HTML: ../cmu_oie_scrape/cleaned\html_oie_employment_f1-students_opt-stem-opt-extension_index.txt
Saved cleaned HTML: ../cmu_oie_scrape/cleaned\html_oie_employment_f1-students_on-campus-employment.txt
Saved cleaned HTML: ../cmu_oie_scrape/cleaned\html_oie_employment_f1-students_opt-stem-opt-extension_h1b-cap-gap-extension.txt
Saved cleaned HTML: ../cmu_oie_scrape/cleaned\html_oie_employment_f1-students_opt-stem-opt-extension_i765-

Cannot set gray non-stroke color because /'Paint10' is an invalid float value
Cannot set gray non-stroke color because /'Paint14' is an invalid float value
Cannot set gray non-stroke color because /'Paint14' is an invalid float value


Saved cleaned PDF: ../cmu_oie_scrape/cleaned\pdf_oie_docs_power-points_employment-authorization-overview.txt
Saved cleaned PDF: ../cmu_oie_scrape/cleaned\pdf_oie_docs_power-points_opt-pre.txt
Saved cleaned PDF: ../cmu_oie_scrape/cleaned\pdf_oie_docs_power-points_opt-pre.txt
Saved cleaned PDF: ../cmu_oie_scrape/cleaned\pdf_oie_docs_power-points_stem-opt-slides.txt
Saved cleaned PDF: ../cmu_oie_scrape/cleaned\pdf_oie_docs_power-points_stem-opt-slides.txt
Saved cleaned PDF: ../cmu_oie_scrape/cleaned\pdf_oie_docs_pre-opt-academic-advisor-recommendation-form.txt
Saved cleaned PDF: ../cmu_oie_scrape/cleaned\pdf_oie_docs_pre-opt-academic-advisor-recommendation-form.txt
Saved cleaned PDF: ../cmu_oie_scrape/cleaned\pdf_oie_docs_program-extension.txt
Saved cleaned PDF: ../cmu_oie_scrape/cleaned\pdf_oie_docs_program-extension.txt
Saved cleaned PDF: ../cmu_oie_scrape/cleaned\pdf_oie_docs_reduced.txt
Saved cleaned PDF: ../cmu_oie_scrape/cleaned\pdf_oie_docs_reduced.txt
Saved cleaned PDF: ../cmu_oie