In [1]:
# Libraries
from pypdf import PdfReader, PdfWriter
import camelot
import pandas as pd
import os
import glob

In [2]:
# Define a function to extract specific pages from a PDF file
def extract_pdf(source_path, new_pdf, start_page, end_page):
    """
    Write pages ``start_page``..``end_page`` of a PDF to a new PDF file.

    The page range is validated before anything is created on disk, so an
    invalid range never leaves an empty or partial output file behind.
    The output directory is created if it does not exist.

    Parameters:
        source_path (str): Path to the input PDF file.
        new_pdf (str): Path to save the output PDF file.
        start_page (int): Starting page number (inclusive, 1-based).
        end_page (int): Ending page number (inclusive, 1-based).

    Raises:
        ValueError: If start_page is less than 1, or end_page is less than
            start_page.
        IndexError: If end_page is beyond the last page of the source PDF.
    """
    # Without these checks start_page=0 would index pages[-1] and silently
    # copy the LAST page, and a reversed range would write an empty PDF.
    if start_page < 1:
        raise ValueError(f"start_page must be >= 1 (1-based), got {start_page}")
    if end_page < start_page:
        raise ValueError(
            f"end_page ({end_page}) must be >= start_page ({start_page})"
        )

    reader = PdfReader(source_path)
    total_pages = len(reader.pages)
    if end_page > total_pages:
        raise IndexError(
            f"end_page {end_page} is out of range: {source_path} has "
            f"{total_pages} pages"
        )

    # os.path.dirname() returns "" for a bare filename and os.makedirs("")
    # raises FileNotFoundError, so only create a directory when there is one.
    output_dir = os.path.dirname(new_pdf)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    writer = PdfWriter()
    # Convert the 1-based inclusive range to 0-based indices.
    for page_num in range(start_page - 1, end_page):
        writer.add_page(reader.pages[page_num])

    with open(new_pdf, "wb") as f:
        writer.write(f)

    print(f"Saved new PDF to {new_pdf} with pages {start_page}-{end_page}")

# Define a function to crop odd and even pages to specific respective dimensions.
def split_and_crop(input_path, output_path, odd_crop, even_crop):
    """
    Shrink the media box of every page of a PDF and save the result.

    Odd and even pages get separate crop margins. Parity is decided by a
    page's position in ``input_path`` (the first page is page 1, i.e. odd),
    not by any page number printed in the document. The output directory
    is created if it does not exist.

    Parameters:
        input_path (str): Path to the input PDF file.
        output_path (str): Full path to save the output PDF file.
        odd_crop (dict): Margins to remove from odd pages, with keys
            "left", "bottom", "right" and "top", expressed in the same
            units as the page's media box coordinates.
        even_crop (dict): Margins to remove from even pages (same keys).
    """
    # os.path.dirname() returns "" for a bare filename and os.makedirs("")
    # raises FileNotFoundError, so only create a directory when there is one.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    reader = PdfReader(input_path)
    writer = PdfWriter()

    for i, page in enumerate(reader.pages):
        # enumerate() is 0-based, so an even index is an odd-numbered page.
        crop = odd_crop if i % 2 == 0 else even_crop

        mediabox = page.mediabox
        left, bottom = mediabox.lower_left
        right, top = mediabox.upper_right

        # Move each edge inwards by the requested margin.
        mediabox.lower_left = (left + crop["left"], bottom + crop["bottom"])
        mediabox.upper_right = (right - crop["right"], top - crop["top"])
        writer.add_page(page)

    with open(output_path, "wb") as f:
        writer.write(f)

    print(f"Cropped PDF saved to {output_path}")

# Define a function to extract tables.
def extract_tables_to_csv(input_pdf, output_dir, flavor='stream', strip_text='\n'):
    """
    Run Camelot over every page of a PDF and write each table it returns
    to its own CSV file.

    Files are named ``table_1.csv``, ``table_2.csv``, ... following the order
    in which Camelot yields the tables. The output directory is created
    when it is missing.

    Parameters:
        input_pdf (str): Path to the input PDF file.
        output_dir (str): Directory to save the CSV files.
        flavor (str): Camelot extraction flavor ('stream' or 'lattice').
        strip_text (str): Characters to strip from extracted text.
    """
    os.makedirs(output_dir, exist_ok=True)

    found_tables = camelot.read_pdf(
        input_pdf,
        pages='all',
        flavor=flavor,
        strip_text=strip_text,
    )
    print(f"Found {found_tables.n} tables")

    # Number the files from 1 so they line up with the progress messages.
    for table_number, table in enumerate(found_tables, start=1):
        csv_path = os.path.join(output_dir, f"table_{table_number}.csv")
        table.df.to_csv(csv_path, index=False)  # keep the DataFrame index out of the file
        print(f"Saved table {table_number} to {csv_path}")

In [4]:

# Input and output locations for the whole notebook.
source_path = r"C:\Users\alex\coding_projects\bls_data\data\source\workstoppage"
output_dir = r"C:\Users\alex\coding_projects\bls_data\data\output"

# List every PDF in the source directory. The listing is sorted so that the
# numbers printed below stay stable between runs; pdf_list holds the bare
# file names in that same order.
pdf_files = sorted(glob.glob(os.path.join(source_path, "*.pdf")))
pdf_list = [os.path.basename(pdf_path) for pdf_path in pdf_files]

divider = "-" * 50
print("PDF files found in source directory:")
print(divider)
for i, pdf_name in enumerate(pdf_list, start=1):
    print(f"{i:2d}. {pdf_name}")

print(divider)
print(f"Total PDFs found: {len(pdf_files)}")
print("\nYou can now reference these PDFs by their number in the pdf_configs list below.\n")

PDF files found in source directory:
--------------------------------------------------
 1. work-stoppages-1950.pdf
 2. work-stoppages-1951.pdf
 3. work-stoppages-1952.pdf
 4. work-stoppages-1953.pdf
 5. work-stoppages-1954.pdf
 6. work-stoppages-1955.pdf
 7. work-stoppages-1956.pdf
 8. work-stoppages-1957.pdf
 9. work-stoppages-1958.pdf
10. work-stoppages-1959.pdf
11. work-stoppages-1960.pdf
12. work-stoppages-1961.pdf
13. work-stoppages-1962.pdf
14. work-stoppages-1963.pdf
15. work-stoppages-1964.pdf
16. work-stoppages-1965.pdf
17. work-stoppages-1966.pdf
18. work-stoppages-1967.pdf
19. work-stoppages-1968.pdf
20. work-stoppages-1969.pdf
21. work-stoppages-1970.pdf
22. work-stoppages-1971.pdf
23. work-stoppages-1972.pdf
24. work-stoppages-1973.pdf
25. work-stoppages-1974.pdf
--------------------------------------------------
Total PDFs found: 25

You can now reference these PDFs by their number in the pdf_configs list below.



In [None]:
# Define your PDF configurations using the numbers from the list above
def _pdf_config(pdf_number, output_name, start_page, end_page, odd_crop=None, even_crop=None):
    """
    Build one entry of ``pdf_configs``.

    Parameters:
        pdf_number (int): Number of the PDF in the listing printed above.
        output_name (str): Name used for this PDF's output files.
        start_page (int): First page to extract.
        end_page (int): Last page to extract.
        odd_crop (dict): Optional crop margins for odd pages; sides that are
            not given default to 0.
        even_crop (dict): Optional crop margins for even pages; sides that
            are not given default to 0.
    """
    def _crop(overrides):
        # A fresh dict per call, so editing one entry never affects another.
        margins = {"left": 0, "bottom": 0, "right": 0, "top": 0}
        margins.update(overrides or {})
        return margins

    return {
        "pdf_number": pdf_number,
        "output_name": output_name,
        "start_page": start_page,
        "end_page": end_page,
        "odd_crop": _crop(odd_crop),
        "even_crop": _crop(even_crop),
    }


pdf_configs = [
    #           pdf_number, output_name, start_page, end_page
    _pdf_config(1, "1950", 11, 12),
    _pdf_config(2, "1951", 15, 16),
    _pdf_config(3, "1952", 12, 14),
    _pdf_config(4, "1953", 8, 10),
    _pdf_config(5, "1954", 15, 17),
    _pdf_config(6, "1955", 10, 12),
    _pdf_config(7, "1956", 14, 17),
    _pdf_config(8, "1957", 14, 16),
    _pdf_config(9, "1958", 12, 14),
    _pdf_config(10, "1959", 13, 15),
    _pdf_config(11, "1960", 11, 13),
    # NOTE(review): every entry from 1961 onwards uses pages 8-17; these look
    # like placeholder values - confirm the real page ranges.
    _pdf_config(12, "1961", 8, 17),
    _pdf_config(13, "1962", 8, 17),
    _pdf_config(14, "1963", 8, 17),
    _pdf_config(15, "1964", 8, 17),
    _pdf_config(16, "1965", 8, 17),
    _pdf_config(17, "1966", 8, 17),
    _pdf_config(18, "1967", 8, 17),
    _pdf_config(19, "1968", 8, 17),
    _pdf_config(20, "1969", 8, 17),
    _pdf_config(21, "1970", 8, 17),
    _pdf_config(22, "1971", 8, 17),
    _pdf_config(23, "1972", 8, 17),
    _pdf_config(24, "1973", 8, 17),
    _pdf_config(25, "1974", 8, 17),
    # Add more configurations as needed
]

In [None]:
# Process each PDF configuration
# Every entry is handled independently: a failure is reported and the loop
# moves on, so one bad PDF does not abort the whole batch.
failed_configs = []

for idx, config in enumerate(pdf_configs, start=1):
    # Fallback label so the error handlers always have something to print,
    # even when the config entry itself is what is malformed.
    pdf_label = f"config #{idx} ({config.get('output_name', 'unnamed')})"
    try:
        # Resolve the input PDF. Configs reference a PDF by its 1-based
        # number in the sorted `pdf_files` listing; an explicit "input_pdf"
        # file name (relative to the source directory) is also accepted.
        if "input_pdf" in config:
            input_pdf_path = os.path.join(source_path, config["input_pdf"])
        else:
            pdf_number = config["pdf_number"]
            if not 1 <= pdf_number <= len(pdf_files):
                raise IndexError(
                    f"pdf_number {pdf_number} is out of range; "
                    f"{len(pdf_files)} PDFs were found in {source_path}"
                )
            input_pdf_path = pdf_files[pdf_number - 1]
        pdf_label = os.path.basename(input_pdf_path)

        # One output folder per config, prefixed with its position in the list
        pdf_output_dir = os.path.join(output_dir, f"{idx:02d}_{config['output_name']}")
        os.makedirs(pdf_output_dir, exist_ok=True)

        print(f"\nProcessing {pdf_label} (pages {config['start_page']}-{config['end_page']})...")

        # Step 1: Extract the specific pages
        extracted_pdf_path = os.path.join(pdf_output_dir, f"{config['output_name']}_extracted.pdf")
        extract_pdf(
            input_pdf_path,
            extracted_pdf_path,
            config["start_page"],
            config["end_page"]
        )

        # Step 2: Crop the pages
        cropped_pdf_path = os.path.join(pdf_output_dir, f"{config['output_name']}_cropped.pdf")
        split_and_crop(
            extracted_pdf_path,
            cropped_pdf_path,
            config["odd_crop"],
            config["even_crop"]
        )

        # Step 3: Extract tables to CSV
        extract_tables_to_csv(
            cropped_pdf_path,
            pdf_output_dir  # Save CSVs in the same directory
        )

        print(f"Successfully processed {pdf_label}\n")

    except FileNotFoundError as e:
        print(f"ERROR: Could not find file {pdf_label}: {e}")
        failed_configs.append(pdf_label)
    except Exception as e:
        # Deliberately broad: this is a best-effort batch run, so report the
        # problem and continue with the next PDF.
        print(f"ERROR: Failed to process {pdf_label}. Reason: {e}")
        failed_configs.append(pdf_label)

# Errors scroll past quickly in a long run, so repeat the failures at the end.
if failed_configs:
    print(f"{len(failed_configs)} of {len(pdf_configs)} PDFs failed: {', '.join(failed_configs)}")