In [None]:
import fitz
import re
import pandas as pd
import google.generativeai as genai
import os

In [None]:
# Step 1: Extract text from PDF
def extract_pdf_text(path: str) -> str:
    """Return the text of every page of the PDF at *path*, concatenated.

    Pages are read in document order and their text is joined without any
    separator.

    Args:
        path: Filesystem path handed to ``fitz.open``.

    Returns:
        A single string holding the text of all pages (empty if the
        document has no pages).
    """
    # Open the document as a context manager so the underlying file handle
    # is released even if text extraction fails part-way through.
    with fitz.open(path) as doc:
        # join() avoids repeatedly re-copying a growing string per page.
        return "".join(page.get_text() for page in doc)

# Step 2: Parse text into structured chunks with title
def parse_into_chunks(full_text: str):
    """Break section-structured text into one row per numbered point.

    The text is cut in front of every ``§ <number>. `` marker.  For each
    piece that starts with such a header, the rest of the header line is
    taken as the title and the remaining body is cut in front of every line
    that starts with a point label (digits, an optional single letter, then
    a dot -- e.g. ``1.``, ``2.``, ``3a.``).

    Pieces that do not start with a ``§`` header, and body fragments that do
    not start with a point label, produce no rows.

    Args:
        full_text: The raw text to parse.

    Returns:
        A ``pandas.DataFrame`` with the columns ``reference`` (formatted as
        ``§<section>(<point>)``), ``title`` and ``content``.
    """
    section_boundary = re.compile(r'(?=§\s*\d+\.\s+)')
    section_header = re.compile(r'§\s*(\d+)\.\s*(.+)')
    point_boundary = re.compile(r'\n(?=\d+[a-zA-Z]?\.)')
    point_label = re.compile(r'(\d+[a-zA-Z]?)\.')

    rows = []
    for raw_section in section_boundary.split(full_text):
        block = raw_section.strip()
        # Empty pieces and pieces without a leading "§ N." header are ignored.
        header = section_header.match(block) if block else None
        if header is None:
            continue

        number = header.group(1)
        heading = header.group(2).strip()

        # Everything after the matched header text is the section body.
        remainder = block[header.end():].strip()

        for raw_point in point_boundary.split(remainder):
            item = raw_point.strip()
            label = point_label.match(item)
            if label is None:
                continue
            # label.end() sits just past the dot that terminates the label.
            rows.append((
                f"§{number}({label.group(1)})",
                heading,
                item[label.end():].strip(),
            ))

    return pd.DataFrame(rows, columns=["reference", "title", "content"])

In [None]:
# Path of the PDF to process.
pdf_path = "data/2023_10_01_RS_ENG.pdf"

# Read the whole document as one string, then split it into a DataFrame
# with one row per numbered point (columns: reference, title, content).
text = extract_pdf_text(pdf_path)
df = parse_into_chunks(text)

In [None]:
# Drop points whose content is empty.  The explicit .copy() makes the result
# an independent frame, so adding columns to it later does not trigger
# pandas' SettingWithCopyWarning.
df = df[df['content'] != ''].copy()

In [None]:
# Build the "<reference> - <title>: <content>" string for every row.
# Column-wise concatenation is used instead of a row-wise apply(): it avoids
# calling a Python function per row and still yields a Series when the frame
# is empty (apply(axis=1) on an empty frame may return a DataFrame, which
# cannot be assigned to a single column).  The three columns hold plain
# strings, as produced by parse_into_chunks.
df["chunk"] = df["reference"] + " - " + df["title"] + ": " + df["content"]

In [None]:
# Bare expression as the last statement of a notebook cell: displays df.
df