In [None]:
# Libraries

from pypdf import PdfReader, PdfWriter
import camelot
import pandas as pd
import os

In [None]:
# Define a function to extract specific pages from a PDF file

def extract_pdf(source_path, new_pdf, start_page, end_page):
    """
    Copies an inclusive, 1-based page range from a PDF into a new PDF file.

    The page range is validated before anything is written, so an invalid
    range never produces an empty or wrong output file. The output directory
    is created if it does not exist.

    Parameters:
        source_path (str): Path to the input PDF file.
        new_pdf (str): Path to save the output PDF file.
        start_page (int): Starting page number (inclusive, 1-based).
        end_page (int): Ending page number (inclusive, 1-based).

    Raises:
        ValueError: If start_page is less than 1, or end_page is less than
            start_page.
        IndexError: If end_page is beyond the last page of the source PDF.
    """
    # Validate first: start_page=0 would otherwise turn into index -1 and
    # silently copy the *last* page of the document.
    if start_page < 1:
        raise ValueError(f"start_page must be >= 1 (1-based), got {start_page}")
    if end_page < start_page:
        raise ValueError(
            f"end_page ({end_page}) must be >= start_page ({start_page})"
        )

    reader = PdfReader(source_path)
    page_count = len(reader.pages)
    if end_page > page_count:
        raise IndexError(
            f"end_page ({end_page}) exceeds the {page_count} pages in {source_path}"
        )

    writer = PdfWriter()
    # Convert the 1-based inclusive range to 0-based indices.
    for page_num in range(start_page - 1, end_page):
        writer.add_page(reader.pages[page_num])

    # os.path.dirname() returns "" for a bare filename and os.makedirs("")
    # raises, so only create the directory when there is one to create.
    output_dir = os.path.dirname(new_pdf)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(new_pdf, "wb") as f:
        writer.write(f)

    print(f"Saved new PDF to {new_pdf} with pages {start_page}-{end_page}")

# Define a function to crop odd and even pages to their own respective dimensions. This is useful for eliminating extraneous text that would interfere with table extraction.
# Cropping can be delicate, so it is better to start with loose coordinates and then adjust them by hand.

def split_and_crop(input_path, output_path, odd_crop, even_crop):
    """
    Crops odd and even pages of a PDF with different margins and saves the result.

    "Odd" and "even" refer to the 1-based position of a page inside
    input_path: the first page (index 0) uses odd_crop, the second uses
    even_crop, and so on. Each crop value is the amount trimmed inward from
    that edge of the page's mediabox, in PDF user-space units (typically points).

    Parameters:
        input_path (str): Path to the input PDF file.
        output_path (str): Full path to save the output PDF file.
        odd_crop (dict): Crop amounts for odd pages, with keys
            "left", "bottom", "right" and "top".
        even_crop (dict): Crop amounts for even pages, same keys as odd_crop.

    Raises:
        KeyError: If either crop dictionary is missing one of the required keys.
    """
    # Fail fast on a malformed crop dictionary instead of discovering it
    # part-way through the page loop, after directories have been created.
    required_keys = ("left", "bottom", "right", "top")
    for label, crop in (("odd_crop", odd_crop), ("even_crop", even_crop)):
        missing = [key for key in required_keys if key not in crop]
        if missing:
            raise KeyError(f"{label} is missing crop keys: {missing}")

    reader = PdfReader(input_path)
    writer = PdfWriter()

    for i, page in enumerate(reader.pages):
        mediabox = page.mediabox
        # Index 0 is page 1, so even indices are the odd-numbered pages.
        crop = odd_crop if i % 2 == 0 else even_crop
        # Move the lower-left corner up/right and the upper-right corner
        # down/left to shrink the visible page area.
        mediabox.lower_left = (mediabox.lower_left[0] + crop["left"], mediabox.lower_left[1] + crop["bottom"])
        mediabox.upper_right = (mediabox.upper_right[0] - crop["right"], mediabox.upper_right[1] - crop["top"])
        writer.add_page(page)

    # os.path.dirname() returns "" for a bare filename and os.makedirs("")
    # raises, so only create the directory when there is one to create.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_path, "wb") as f:
        writer.write(f)

    print(f"Cropped PDF saved to {output_path}")

# Define a function to extract tables.
# Camelot returns each detected table as a DataFrame (Camelot's usual output), then each DataFrame is saved as a CSV.
# This flow allows for the hand cleaning step of data wrangling.

def extract_tables_to_csv(input_pdf, output_dir, flavor='stream', strip_text='\n'):
    """
    Extracts tables from a PDF using Camelot and saves each table as a separate CSV file.
    Creates the output directory if it does not exist.

    Files are named table_1.csv, table_2.csv, ... in the order Camelot
    returns the tables, and are written without the DataFrame index.

    Parameters:
        input_pdf (str): Path to the input PDF file.
        output_dir (str): Directory to save the CSV files.
        flavor (str): Camelot extraction flavor ('stream' or 'lattice').
        strip_text (str): Characters to strip from extracted text.
    """

    os.makedirs(output_dir, exist_ok=True)  # Ensure the directory exists

    tables = camelot.read_pdf(input_pdf, pages='all', flavor=flavor, strip_text=strip_text)
    print(f"Found {tables.n} tables")
    for i, table in enumerate(tables):
        # Build the path with os.path.join rather than a hard-coded "\\":
        # on non-Windows systems a backslash is an ordinary filename
        # character, so the CSV would land outside output_dir.
        output_csv = os.path.join(output_dir, f"table_{i+1}.csv")
        table.df.to_csv(output_csv, index=False)
        print(f"Saved table {i+1} to {output_csv}")

In [None]:
# Define your sections as a list of dictionaries.
# Each entry describes one section of the source PDF:
#   name        - section label; also used for the output folder and cropped-PDF name
#   start_page  - first page to extract (1-based, inclusive)
#   end_page    - last page to extract (1-based, inclusive)
#   output_pdf  - filename for the extracted (uncropped) PDF; must be unique per section
#   odd_crop    - margins trimmed from the 1st, 3rd, ... page of the extracted PDF
#   even_crop   - margins trimmed from the 2nd, 4th, ... page of the extracted PDF
sections = [
    {
        "name": "manufacturing",
        "start_page": 50,
        "end_page": 55,
        "output_pdf": "bls_1312-10_1976_manufac.pdf",
        "odd_crop": {"left": 0, "bottom": 82, "right": 0, "top": 34},
        "even_crop": {"left": 0, "bottom": 90, "right": 0, "top": 31}
    },
    {
        "name": "durable goods",
        "start_page": 55,
        "end_page": 58,
        "output_pdf": "bls_1312-10_1976_durgoods.pdf",
        "odd_crop": {"left": 0, "bottom": 87, "right": 0, "top": 31},
        "even_crop": {"left": 0, "bottom": 90, "right": 0, "top": 28}
    },
    {
        "name": "ordnance",
        "start_page": 58,
        "end_page": 62,
        "output_pdf": "bls_1312-10_1976_ordnance.pdf",
        "odd_crop": {"left": 0, "bottom": 87, "right": 0, "top": 31},
        "even_crop": {"left": 0, "bottom": 90, "right": 0, "top": 28}
    },
    {
        "name": "blast furnaces and steel mills",
        "start_page": 134,
        "end_page": 136,
        "output_pdf": "bls_1312-10_1976_steel_mills.pdf",
        "odd_crop": {"left": 0, "bottom": 82, "right": 0, "top": 30},
        "even_crop": {"left": 0, "bottom": 82, "right": 0, "top": 28}
    },
    # The sections below previously all reused the steel_mills filename
    # (copy-paste); each now gets a filename that matches its content.
    {
        "name": "iron and steel foundries",
        "start_page": 136,
        "end_page": 138,
        "output_pdf": "bls_1312-10_1976_foundries.pdf",
        "odd_crop": {"left": 0, "bottom": 80, "right": 0, "top": 30},
        "even_crop": {"left": 0, "bottom": 82, "right": 0, "top": 28}
    },
    {
        "name": "iron and steel forgings",
        "start_page": 163,
        "end_page": 165,
        "output_pdf": "bls_1312-10_1976_forgings.pdf",
        "odd_crop": {"left": 0, "bottom": 80, "right": 0, "top": 30},
        "even_crop": {"left": 0, "bottom": 82, "right": 0, "top": 32}
    },
    {
        "name": "communication",
        "start_page": 295,
        "end_page": 297,
        "output_pdf": "bls_1312-10_1976_communication.pdf",
        "odd_crop": {"left": 0, "bottom": 80, "right": 0, "top": 34},
        "even_crop": {"left": 0, "bottom": 82, "right": 0, "top": 31}
    },
    {
        "name": "motor vehicles and equipment",
        "start_page": 314,
        "end_page": 319,
        "output_pdf": "bls_1312-10_1976_motor_vehicles.pdf",
        "odd_crop": {"left": 0, "bottom": 80, "right": 0, "top": 34},
        "even_crop": {"left": 0, "bottom": 82, "right": 0, "top": 31}
    },
    {
        "name": "aircraft and parts",
        "start_page": 325,
        "end_page": 328,
        "output_pdf": "bls_1312-10_1976_aircraft.pdf",
        "odd_crop": {"left": 0, "bottom": 85, "right": 0, "top": 30},
        "even_crop": {"left": 0, "bottom": 85, "right": 0, "top": 28}
    },
    {
        "name": "ship building and repairing",
        "start_page": 338,
        "end_page": 340,
        "output_pdf": "bls_1312-10_1976_shipbuilding.pdf",
        "odd_crop": {"left": 0, "bottom": 85, "right": 0, "top": 30},
        "even_crop": {"left": 0, "bottom": 85, "right": 0, "top": 28}
    },
    {
        "name": "other transportation equipment",
        "start_page": 344,
        "end_page": 346,
        "output_pdf": "bls_1312-10_1976_other_transport.pdf",
        "odd_crop": {"left": 0, "bottom": 85, "right": 0, "top": 32},
        "even_crop": {"left": 0, "bottom": 85, "right": 0, "top": 32}
    },
    {
        "name": "electronic components and accessories",
        "start_page": 301,
        "end_page": 303,
        "output_pdf": "bls_1312-10_1976_electronics.pdf",
        "odd_crop": {"left": 0, "bottom": 87, "right": 0, "top": 31},
        "even_crop": {"left": 0, "bottom": 90, "right": 0, "top": 28}
    },
    {
        "name": "furniture and fixtures",
        "start_page": 85,
        "end_page": 88,
        "output_pdf": "bls_1312-10_1976_furniture.pdf",
        "odd_crop": {"left": 0, "bottom": 87, "right": 0, "top": 31},
        "even_crop": {"left": 0, "bottom": 90, "right": 0, "top": 28}
    },
    {
        "name": "household appliances",
        "start_page": 277,
        "end_page": 279,
        "output_pdf": "bls_1312-10_1976_appliances.pdf",
        "odd_crop": {"left": 0, "bottom": 87, "right": 0, "top": 31},
        "even_crop": {"left": 0, "bottom": 90, "right": 0, "top": 28}
    }
    # Add more sections as needed
]

# Input PDF and output root. The defaults are the original local paths; set
# BLS_SOURCE_PDF / BLS_OUTPUT_DIR in the environment to run on another machine
# without editing the notebook.
source_path = os.environ.get(
    "BLS_SOURCE_PDF",
    r"C:\Users\alex\coding_projects\bls_data\data\source\bls_1312-10_1976.pdf",
)
output_dir = os.environ.get(
    "BLS_OUTPUT_DIR",
    r"C:\Users\alex\coding_projects\bls_data\data\output",
)

for idx, section in enumerate(sections, start=1):
    # Every section gets its own numbered folder (01_<name>, 02_<name>, ...)
    # under output_dir; all artifacts for that section are written there.
    section_dir = os.path.join(output_dir, f"{idx:02d}_{section['name']}")
    os.makedirs(section_dir, exist_ok=True)

    # 1) Pull this section's page range out of the source PDF.
    extracted_pdf_path = os.path.join(section_dir, section['output_pdf'])
    extract_pdf(
        source_path=source_path,
        new_pdf=extracted_pdf_path,
        start_page=section["start_page"],
        end_page=section["end_page"],
    )

    # 2) Trim the margins, using separate settings for odd and even pages.
    cropped_pdf_filename = f"{section['name']}_cropped.pdf"
    cropped_pdf_path = os.path.join(section_dir, cropped_pdf_filename)
    split_and_crop(
        input_path=extracted_pdf_path,
        output_path=cropped_pdf_path,
        odd_crop=section["odd_crop"],
        even_crop=section["even_crop"],
    )

    # 3) Run table extraction on the cropped PDF; the CSVs are written
    #    next to the PDFs in section_dir.
    extract_tables_to_csv(input_pdf=cropped_pdf_path, output_dir=section_dir)

Saved new PDF to C:\Users\alex\coding_projects\bls_data\data\output\01_manufacturing\bls_1312-10_1976_manufac.pdf with pages 50-55
Cropped PDF saved to C:\Users\alex\coding_projects\bls_data\data\output\01_manufacturing\manufacturing_cropped.pdf


  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)


Found 7 tables
Saved table 1 to C:\Users\alex\coding_projects\bls_data\data\output\01_manufacturing\table_1.csv
Saved table 2 to C:\Users\alex\coding_projects\bls_data\data\output\01_manufacturing\table_2.csv
Saved table 3 to C:\Users\alex\coding_projects\bls_data\data\output\01_manufacturing\table_3.csv
Saved table 4 to C:\Users\alex\coding_projects\bls_data\data\output\01_manufacturing\table_4.csv
Saved table 5 to C:\Users\alex\coding_projects\bls_data\data\output\01_manufacturing\table_5.csv
Saved table 6 to C:\Users\alex\coding_projects\bls_data\data\output\01_manufacturing\table_6.csv
Saved table 7 to C:\Users\alex\coding_projects\bls_data\data\output\01_manufacturing\table_7.csv
Saved new PDF to C:\Users\alex\coding_projects\bls_data\data\output\02_durable goods\bls_1312-10_1976_durgoods.pdf with pages 55-58
Cropped PDF saved to C:\Users\alex\coding_projects\bls_data\data\output\02_durable goods\durable goods_cropped.pdf
Found 4 tables
Saved table 1 to C:\Users\alex\coding_projec