In [5]:
import os, re, glob
import pandas as pd
from bs4 import BeautifulSoup
from PyPDF2 import PdfReader

In [8]:
# Directory whose entries are globbed and passed to extract_text().
# Relative path: resolved against the current working directory.
RAW_DIR = "../data/raw"
# Destination CSV for the chunk table (one row per chunk: "source", "text").
OUT_PATH = "../data/processed/chunks.csv"

def extract_text(file_path):
    """Return the raw text of a PDF or HTML file, or "" for any other file.

    The file type is chosen from the extension, compared case-insensitively
    so that names such as ``REPORT.PDF`` or ``index.HTML`` are not skipped.

    Args:
        file_path: Path of the file to read.

    Returns:
        For a PDF, the extracted text of every page in page order, each page
        followed by a newline. For HTML, the ``get_text()`` of the document
        parsed with ``html.parser``. For anything else, an empty string
        (after printing a "Skipping unsupported file" notice).
    """
    lowered = file_path.lower()

    if lowered.endswith(".pdf"):
        reader = PdfReader(file_path)
        # Join once instead of growing a string with += for every page.
        # `or ""` is defensive: a page whose extraction yields None must not
        # abort the whole document with a TypeError.
        return "".join(
            (page.extract_text() or "") + "\n" for page in reader.pages
        )

    if lowered.endswith((".html", ".htm")):
        with open(file_path, encoding="utf-8") as f:
            html = f.read()
        return BeautifulSoup(html, "html.parser").get_text()

    print(f"Skipping unsupported file: {file_path}")
    return ""

def clean_text(text):
    """Collapse each run of whitespace into one space and trim both ends."""
    collapsed = re.compile(r"\s+").sub(" ", text)
    return collapsed.strip()

def chunk_text(text, n_words=500):
    """Split *text* into consecutive pieces of at most *n_words* words.

    Words are whitespace-separated tokens; each piece is re-joined with
    single spaces. The final piece may be shorter than *n_words*.
    """
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), n_words):
        window = tokens[start:start + n_words]
        pieces.append(" ".join(window))
    return pieces

# Build the chunk table: extract, normalise and split every entry in RAW_DIR.
all_chunks = []
# Sorted so the row order of the CSV is reproducible; the order returned by
# glob.glob() depends on the file system.
for path in sorted(glob.glob(os.path.join(RAW_DIR, "*"))):
    raw = extract_text(path)
    clean = clean_text(raw)
    chunks = chunk_text(clean)
    source = os.path.basename(path)
    all_chunks.extend({"source": source, "text": c} for c in chunks)

# Explicit columns keep the "source,text" header even when no chunk was
# produced (an empty list would otherwise yield a column-less frame).
df = pd.DataFrame(all_chunks, columns=["source", "text"])

# Derive the directory from OUT_PATH so the two cannot drift apart.
out_dir = os.path.dirname(OUT_PATH)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)
df.to_csv(OUT_PATH, index=False)
print(f"Saved {len(df)} chunks to {OUT_PATH}")


Saved 471 chunks to ../data/processed/chunks.csv
