In [1]:
import camelot

In [2]:
print(f"Using camelot v{camelot.__version__}.")

Using camelot v1.0.0.


In [3]:
import logging
import pandas as pd

from pathlib import Path
from pypdf import PdfReader
from IPython.display import display

# Set up logging
logging.getLogger("camelot").setLevel(logging.INFO)
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)


def process_pdf(pdf_file, output_dir):
    print(f"Processing {pdf_file.name}")
    logging.info(f"Processing {pdf_file.name}")

    # Verify PDF can be opened with PdfReader before processing
    try:
        reader = PdfReader(str(pdf_file))
        if len(reader.pages) == 0:
            raise ValueError(f"No pages found in PDF {pdf_file.name}")
    except Exception as e:
        print(f"Failed to open PDF {pdf_file.name} with PdfReader: {e}")
        logging.error(f"Failed to open PDF {pdf_file.name} with PdfReader: {e}")
        return

    # Read tables from the PDF using camelot
    try:
        tables = camelot.read_pdf(str(pdf_file))
    except Exception as e:
        print(f"Failed to read PDF {pdf_file.name}: {e}")
        logging.error(f"Failed to read PDF {pdf_file.name}: {e}")
        return

    if len(tables) == 0:
        print(f"No tables detected in {pdf_file.name}")
        logging.warning(f"No tables detected in {pdf_file.name}")
        return

    # Create a subdirectory for this PDF's output
    pdf_output_dir = output_dir / pdf_file.stem
    pdf_output_dir.mkdir(exist_ok=True)

    # Process individual tables
    for i, table in enumerate(tables):
        try:
            # Convert table to pandas DataFrame
            df = table.df

            # Display the DataFrame
            print(f"\nTable {i+1} from {pdf_file.name}:")
            display(df)

            # Save individual table to CSV
            csv_path = pdf_output_dir / f"{pdf_file.stem}_table_{i+1}.csv"
            df.to_csv(csv_path, index=False)
            print(f"Saved to {csv_path}")

            # Log parsing report for each table
            print(f"\nTable {i+1} Parsing Report:")
            logging.info(f"Table {i+1} Parsing Report:")
            print(table.parsing_report)
            logging.info(table.parsing_report)
        except Exception as e:
            print(f"Failed to process or save table {i+1} from {pdf_file.name}: {e}")
            logging.error(
                f"Failed to process or save table {i+1} from {pdf_file.name}: {e}"
            )

In [4]:
# Define input_dir and output_dir
input_dir = Path("./data/")
output_dir = Path("./data/output")
# Ensure output directory exists
output_dir.mkdir(exist_ok=True)

print(f"Input directory: {input_dir}")
print(f"Output directory: {output_dir}")

Input directory: data
Output directory: data/output


In [5]:
# Process each PDF in the input directory
pdf_files = list(input_dir.glob("*.pdf"))
print(f"Found {len(pdf_files)} PDF files")

if len(pdf_files) == 0:
    print("No PDF files found in the input directory.")
    logging.warning("No PDF files found in the input directory.")
else:
    for pdf_file in pdf_files:
        process_pdf(pdf_file, output_dir)

    print("Processing complete. Check the 'output' folder for results.")
    logging.info("Processing complete. Check the 'output' folder for results.")

print("Script execution finished.")

2025-02-22 15:57:40,036 - INFO - Processing 20AKSAE_A017.pdf


Found 4 PDF files
Processing 20AKSAE_A017.pdf


2025-02-22 15:57:40,576 - INFO - Processing sample_pdf.pdf


No tables detected in 20AKSAE_A017.pdf
Processing sample_pdf.pdf


2025-02-22 15:57:41,204 - INFO - Processing 2502.11946v2.pdf


No tables detected in sample_pdf.pdf
Processing 2502.11946v2.pdf


2025-02-22 15:57:41,858 - INFO - Processing foo.pdf


No tables detected in 2502.11946v2.pdf
Processing foo.pdf

Table 1 from foo.pdf:


Unnamed: 0,0,1,2,3,4,5,6
0,Cycle \nName,KI \n(1/km),Distance \n(mi),Percent Fuel Savings,,,
1,,,,Improved \nSpeed,Decreased \nAccel,Eliminate \nStops,Decreased \nIdle
2,2012_2,3.30,1.3,5.9%,9.5%,29.2%,17.4%
3,2145_1,0.68,11.2,2.4%,0.1%,9.5%,2.7%
4,4234_1,0.59,58.7,8.5%,1.3%,8.5%,3.3%
5,2032_2,0.17,57.8,21.7%,0.3%,2.7%,1.2%
6,4171_1,0.07,173.9,58.1%,1.6%,2.1%,0.5%


2025-02-22 15:57:42,441 - INFO - Table 1 Parsing Report:
2025-02-22 15:57:42,441 - INFO - {'accuracy': 99.02, 'whitespace': 12.24, 'order': 1, 'page': 1}
2025-02-22 15:57:42,442 - INFO - Processing complete. Check the 'output' folder for results.


Saved to data/output/foo/foo_table_1.csv

Table 1 Parsing Report:
{'accuracy': 99.02, 'whitespace': 12.24, 'order': 1, 'page': 1}
Processing complete. Check the 'output' folder for results.
Script execution finished.


In [6]:
pdf_files

[PosixPath('data/20AKSAE_A017.pdf'),
 PosixPath('data/sample_pdf.pdf'),
 PosixPath('data/2502.11946v2.pdf'),
 PosixPath('data/foo.pdf')]