In [1]:
#!pip install pandas
#!{sys.executable} -m pip install --user --upgrade tabula-py==2.9.0

In [2]:
import boto3, os
import pandas as pd
import tabula
from io import BytesIO
from pypdf import PdfReader

print(f"boto3 version {boto3.__version__}")

boto3 version 1.34.50


In [3]:
from applyllm.io import (
    S3AccessConf,
    S3BucketHelper,
)

In [4]:
BUCKET_NAME="scivias-medreports"
VERIFY_HOST=True
pdf_report_prefix="KK-SCIVIAS"
LIMIT_COUNT = -1
# LIMIT_COUNT = 3

In [5]:
s3_conf = S3AccessConf(
    access_key_id = os.environ.get('AWS_ACCESS_KEY_ID'),
    secret_access_key = os.environ.get('AWS_SECRET_ACCESS_KEY'),
    endpoint = os.environ.get('S3_ENDPOINT'),
    bucket_name = BUCKET_NAME,
    verify_host = VERIFY_HOST,
)
s3_text_reports_helper = S3BucketHelper(conf=s3_conf, file_prefix=pdf_report_prefix)

In [6]:
session = boto3.session.Session(
    aws_access_key_id = os.environ.get('AWS_ACCESS_KEY_ID'),
    aws_secret_access_key = os.environ.get('AWS_SECRET_ACCESS_KEY')
)
s3 = session.resource('s3', endpoint_url = os.environ.get('S3_ENDPOINT'), verify=VERIFY_HOST)
bucket = s3.Bucket(BUCKET_NAME)

In [7]:
# file1 = 'KK-SCIVIAS-00004^0051726752^2015-12-17^KIIS1.pdf' # Contains table
# file2 = 'KK-SCIVIAS-00004^0054584394^2021-01-17^KIITCH.pdf' # No table

In [8]:
def get_s3_pdf_bytesio(s3, bucket_name, key: str):
    obj = s3.Object(bucket_name, key)
    return obj.get()['Body'].read()

In [9]:
# fs = get_s3_pdf_bytesio(s3=s3, bucket_name=BUCKET_NAME, key=file1)

### tabula-py
* https://pypi.org/project/tabula-py/
* https://community.dataiku.com/t5/Using-Dataiku/read-pdf-with-tabula-on-S3/m-p/38023

`tabula-py` has fallback subprocess now, if there is no jpype module to interact with java
* https://stackoverflow.com/questions/77077943/pyspark-tabula-py-read-pdf-error-no-module-named-org-apache-commons/77161052#77161052

In [10]:
def extract_tables(pdf_bytesio, page_numbers='all', stream=True):
    """
    Extract tables from a PDF and organize the data into a list of combined dictionaries.
    
    Parameters:
    - pdf_path: Path to the PDF file.
    - page_numbers: Pages to extract tables from ('all' for all pages).
    
    Returns:
    - A list of dictionaries, each representing combined data from the same column across all tables.
    """
    # Extract all tables from the specified pages of the PDF
    # tables is a list of pandas DataFrame
    df_list = tabula.read_pdf(pdf_bytesio, pages=page_numbers, multiple_tables=True, pandas_options={'header': None}, stream=stream)
    return df_list

def contains_table(pdf_bytesio):
    df_list = extract_tables(pdf_bytesio, page_numbers='all')
    return df_list is not None and len(df_list) > 0
    

In [11]:
contains_table_map = map(
    lambda x: {
        "key": str(x),
        "contains_table": contains_table(BytesIO(get_s3_pdf_bytesio(s3, BUCKET_NAME, x)))
    }, s3_text_reports_helper.get_object_keys(limit_count=LIMIT_COUNT))

### ignore warning
* https://stackoverflow.com/questions/14463277/how-to-disable-python-warnings

In [None]:
from applyllm.utils import time_func 
import warnings
warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", message="ICC profile")
warnings.filterwarnings("ignore", message="org.apache.pdfbox")

@time_func
def get_results():
    return list(contains_table_map)

contains_table_dict_list = get_results()

# with warnings.catch_warnings():
#    warnings.simplefilter("ignore")
#    contains_table_dict_list = list(contains_table_map)

Error importing jpype dependencies. Fallback to subprocess.
No module named 'jpype'
Got stderr: Feb 27, 2024 10:28:27 PM org.apache.pdfbox.pdmodel.graphics.color.PDICCBased ensureDisplayProfile
Feb 27, 2024 10:28:28 PM org.apache.pdfbox.pdmodel.graphics.color.PDICCBased ensureDisplayProfile
Feb 27, 2024 10:28:28 PM org.apache.pdfbox.pdmodel.graphics.color.PDICCBased ensureDisplayProfile
Feb 27, 2024 10:28:28 PM org.apache.pdfbox.pdmodel.graphics.color.PDICCBased ensureDisplayProfile
Feb 27, 2024 10:28:28 PM org.apache.pdfbox.pdmodel.graphics.color.PDICCBased ensureDisplayProfile

Got stderr: Feb 27, 2024 10:28:29 PM org.apache.pdfbox.pdmodel.graphics.color.PDICCBased ensureDisplayProfile
Feb 27, 2024 10:28:29 PM org.apache.pdfbox.pdmodel.graphics.color.PDICCBased ensureDisplayProfile
Feb 27, 2024 10:28:30 PM org.apache.pdfbox.pdmodel.graphics.color.PDICCBased ensureDisplayProfile
Feb 27, 2024 10:28:30 PM org.apache.pdfbox.pdmodel.graphics.color.PDICCBased ensureDisplayProfile
Feb 27, 2

In [None]:
# contains_table_dict_list

In [None]:
key_list = []
has_table_list = []
for dict in contains_table_dict_list:
    key_list.append(dict['key'])
    has_table_list.append(dict['contains_table'])

In [None]:
import pandas as pd
data = {'key': key_list,'contains_table': has_table_list}
result_df = pd.DataFrame.from_dict(data)

In [None]:
result_df.to_csv("./contains_tables.csv", index=False)