In [None]:
# import sys

In [None]:
#!pip install pandas
#!{sys.executable} -m pip install --user --upgrade tabula-py==2.9.0

In [None]:
import boto3, os
import pandas as pd
import tabula
from io import BytesIO
from pypdf import PdfReader

# Show which boto3 version the running kernel has imported.
print(f"boto3 version {boto3.__version__}")

In [None]:
# --- S3 connection settings -------------------------------------------------
# Credentials and the endpoint URL are taken from environment variables;
# os.getenv yields None for any variable that is not set.
BUCKET_NAME = "scivias-medreports"
VERIFY_HOST = True  # forwarded as `verify` when the S3 resource is created

session = boto3.Session(
    aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'),
    aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'),
)
s3 = session.resource(
    's3',
    endpoint_url=os.getenv('S3_ENDPOINT'),
    verify=VERIFY_HOST,
)
bucket = s3.Bucket(BUCKET_NAME)

In [None]:
def get_s3_pdf_bytesio(s3, bucket_name, key: str):
    """
    Download one object from S3 and return its full content.

    Parameters:
    - s3: Object exposing ``Object(bucket_name, key)`` (the notebook passes a boto3 S3 resource).
    - bucket_name: Name of the bucket holding the object.
    - key: Key of the object inside the bucket.

    Returns:
    - Whatever ``read()`` on the response's 'Body' entry returns. Despite the
      function name this is NOT wrapped in a BytesIO; callers in this notebook
      wrap the result themselves with ``BytesIO(...)``.
    """
    response = s3.Object(bucket_name, key).get()
    body = response['Body']
    return body.read()

In [None]:
# Sample report keys used by the cells below.
# Alternative for file1 (contains a table, but the file is not available yet):
# file1 = 'KK-SCIVIAS-00003^0055176166^2020-12-28^KIITCH.pdf'
file1 = 'KK-SCIVIAS-00004^0051726752^2015-12-17^KIIS1.pdf'   # report that contains a table
file2 = 'KK-SCIVIAS-00004^0054584394^2021-01-17^KIITCH.pdf'  # report without a table

In [None]:
# Fetch the content of file1 once; later cells wrap `fs` in a fresh BytesIO per call.
fs = get_s3_pdf_bytesio(
    s3=s3,
    bucket_name=BUCKET_NAME,
    key=file1,
)

### tabula-py
* https://pypi.org/project/tabula-py/
* https://community.dataiku.com/t5/Using-Dataiku/read-pdf-with-tabula-on-S3/m-p/38023

In [None]:
def extract_tables(pdf_bytesio, page_numbers='all'):
    """
    Run tabula over a PDF and return what it extracts, untouched.

    Parameters:
    - pdf_bytesio: PDF source passed straight to tabula.read_pdf (the notebook passes a BytesIO).
    - page_numbers: Pages to extract tables from ('all' for all pages).

    Returns:
    - The value returned by tabula.read_pdf; with multiple_tables=True this is
      expected to be a list of pandas DataFrames, one per detected table.
      No header row is inferred (pandas_options={'header': None}).
    """
    return tabula.read_pdf(
        pdf_bytesio,
        pages=page_numbers,
        multiple_tables=True,
        pandas_options={'header': None},
    )

def contains_table(pdf_bytesio):
    """
    Tell whether at least one table is extracted from the PDF.

    Parameters:
    - pdf_bytesio: PDF source forwarded to extract_tables (all pages are scanned).

    Returns:
    - True when extract_tables returns a non-empty result, False when it
      returns None or an empty collection.
    """
    tables = extract_tables(pdf_bytesio, page_numbers='all')
    if tables is None:
        return False
    return len(tables) > 0
    

In [None]:
# Check the downloaded PDF for tables and report the outcome.
if contains_table(BytesIO(fs)):
    output = "contains table"
else:
    output = "no table"
print(output)

In [None]:
# Extract every table from the downloaded PDF and print each resulting DataFrame.
df_list = extract_tables(pdf_bytesio=BytesIO(fs))

for df in df_list:
    print(df)

In [None]:
def extract_columns_to_dict(pdf_bytesio, page_numbers='all'):
    """
    Extract tables from a PDF and regroup their rows into one dictionary per value column.

    Every table row whose first cell is not NaN is treated as a labelled record:
    the first cell is the label and the remaining cells are values named
    "col1", "col2", ... by position. The result holds one dictionary per value
    column, each mapping row label -> that row's value in the column.

    Parameters:
    - pdf_bytesio: PDF source passed straight to tabula.read_pdf (the notebook passes a BytesIO).
    - page_numbers: Pages to extract tables from ('all' for all pages).

    Returns:
    - A tuple ``(combined_list, tables)``:
      * combined_list: list of dictionaries, one per value column, combining
        the labelled rows of all tables.
      * tables: the list of DataFrames returned by tabula.read_pdf.
      When no labelled row exists, "No tables found" is printed and
      combined_list is empty; the tuple shape is the same, so callers can
      always unpack two values.

    Notes:
    - The number of value columns is taken from the widest labelled row; rows
      with fewer cells contribute None for the columns they lack.
    - If the same label occurs more than once, the later row overwrites the
      earlier one in every dictionary.
    """
    # Extract all tables from the specified pages of the PDF.
    # `tables` is a list of pandas DataFrames without an inferred header row.
    tables = tabula.read_pdf(pdf_bytesio, pages=page_numbers, multiple_tables=True, pandas_options={'header': None})

    # (label, {"colN": value}) pairs in the order the rows were encountered.
    labelled_rows = []
    for table in tables:
        # A frame without columns has no label cell to read.
        if table.shape[1] == 0:
            continue
        for _, row in table.iterrows():
            # Positional access: does not depend on how the columns are labelled.
            label = row.iloc[0]
            # Skip rows where the first column is NaN
            if pd.isna(label):
                continue
            values = {f"col{pos}": row.iloc[pos] for pos in range(1, len(row))}
            labelled_rows.append((label, values))

    if not labelled_rows:
        print("No tables found")
        # Keep the two-element shape so `combined, tables = ...` never fails.
        return [], tables

    # Tables may differ in width; cover every value column that occurs anywhere.
    width = max(len(values) for _, values in labelled_rows)

    # One dictionary per value column; .get() yields None for narrower rows.
    combined_list = [
        {label: values.get(f"col{pos}") for label, values in labelled_rows}
        for pos in range(1, width + 1)
    ]

    return combined_list, tables


In [None]:
# Build the per-column dictionaries and keep the raw DataFrames; only the DataFrames are printed.
combined_list, df_list = extract_columns_to_dict(pdf_bytesio=BytesIO(fs))

for df in df_list:
    print(df)