# 2011 - 2017 NCAU PDF Extraction
This notebook is part of the workflow for creating the NCAU dataset based on the Summary Report Data, covering 2004–2024. It extracts the tables from the 2011–2017 summary PDFs and saves them as Excel workbooks.

In [None]:
import PyPDF2
import pdfplumber
import pandas as pd
import os

def pdf_table_to_excel(pdf_path, excel_path):
    """
    Extract at most one table per PDF page and save them to an Excel workbook.

    Every page of the PDF is passed to ``page.extract_table()``; pages for
    which that call returns nothing are skipped. Each extracted table is
    written to its own sheet named ``Page_<n>``, where ``<n>`` is the 1-based
    position of the page in the PDF, with no index column and no header row.

    Parameters
    ----------
    pdf_path : str
        Path of the PDF file to read.
    excel_path : str
        Path of the Excel file to write. The file is only created when at
        least one table was extracted.

    Returns
    -------
    None
        The outcome is reported with ``print``.
    """
    # Keep (page number, table) pairs so each sheet is labelled with the page
    # it really came from, even when earlier pages contained no table.
    page_tables = []

    with pdfplumber.open(pdf_path) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            extracted_table = page.extract_table()
            if extracted_table:
                page_tables.append((page_number, pd.DataFrame(extracted_table)))

    # Guard clause: do not create an empty workbook.
    if not page_tables:
        print("No tables found in the PDF.")
        return

    with pd.ExcelWriter(excel_path) as writer:
        for page_number, df in page_tables:
            df.to_excel(writer, sheet_name=f'Page_{page_number}', index=False, header=False)
    print(f"Excel file saved at: {excel_path}")


In [None]:
# Convert every PDF in input_dir into an Excel workbook of the same base name
# inside output_dir.
input_dir = 'Data/pdf_summary/2011-2017'
output_dir = 'Data/pdf_summary/2011-2017_tables'

# exist_ok=True creates the folder when it is missing and does nothing
# otherwise, so no separate existence check is needed.
os.makedirs(output_dir, exist_ok=True)

# os.listdir returns entries in arbitrary order; sorting keeps the processing
# order (and the printed log) identical from run to run.
for filename in sorted(os.listdir(input_dir)):
    # Case-insensitive match so files ending in ".PDF" are included too.
    if not filename.lower().endswith(".pdf"):
        continue

    input_pdf_path = os.path.join(input_dir, filename)
    # Same base name as the PDF, with the extension replaced by .xlsx.
    output_excel_path = os.path.join(output_dir, f"{os.path.splitext(filename)[0]}.xlsx")

    pdf_table_to_excel(input_pdf_path, output_excel_path)
    print(f'Processed {filename}')

Excel file saved at: Data/pdf_summary/2015-2017/tables/paca-facts-2016-12 (dragged).xlsx
Processed paca-facts-2016-12 (dragged).pdf
Excel file saved at: Data/pdf_summary/2015-2017/tables/paca-facts-2017-q4 (dragged).xlsx
Processed paca-facts-2017-q4 (dragged).pdf
Excel file saved at: Data/pdf_summary/2015-2017/tables/PACA-Facts-2015-12 (dragged).xlsx
Processed PACA-Facts-2015-12 (dragged).pdf
