In [2]:
%pip install -Uqq duckdb
%pip install -Uqq duckdb-engine

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [4]:
import duckdb

# open the on-disk database file; DuckDB creates it when it is not there yet
conn = duckdb.connect(database="loan_data.duckdb")

In [7]:
# DuckDB can run SQL straight against a CSV file, without loading it first
preview_sql = "SELECT * FROM 'data/ln_large.csv' LIMIT 5"
sample_df = conn.execute(preview_sql).df()
sample_df.head()

Unnamed: 0,TI_CU_CUSTOMER_ID,TI_CU_DATE_OF_BIRTH,TI_CU_DATE_FIRST_REL,TI_CU_CUST_TYPE,TI_CU_NUM_CURR_ACCT,TI_CU_NUM_REV_ACCT,TI_CU_NUM_MTGE_ACCT,TI_CU_NUM_LOAN_ACCT,TI_CU_NUM_DEP_ACCT,TI_LN_ACCOUNT_ID,...,TI_LN_REMAINING_TERM,TI_LN_BLOCK_CODE,TI_LN_BALANCE,TI_LN_INSTALLMENT_DUE,TI_LN_VAL_PAYMENTS,TI_LN_VAL_ARREARS,TI_LN_VAL_INTEREST,TI_LN_VAL_TOTAL_FEES,TI_LN_NUM_MTHS_IN_ARREARS,TI_LN_FINAL_CHARGE_CYCLE
0,C_1,1947-10-31,1985-05-22,310,11,4,5,3,4,C_1_LN_1,...,36,0,412,265,16,0,212,121,0,0
1,C_1,1947-10-31,1985-05-22,310,11,4,5,3,4,C_1_LN_1,...,35,0,943,279,87,200,218,73,1,0
2,C_1,1947-10-31,1985-05-22,310,11,4,5,3,4,C_1_LN_1,...,34,0,270,292,69,400,276,25,2,0
3,C_1,1947-10-31,1985-05-22,310,11,4,5,3,4,C_1_LN_1,...,33,0,2339,104,36,600,134,127,3,0
4,C_1,1947-10-31,1985-05-22,310,11,4,5,3,4,C_1_LN_1,...,32,0,287,97,283,800,177,9,4,0


In [9]:
# inspect the pandas dtypes of the sample to check how well the column types
# were inferred from the CSV file
sample_df.dtypes

TI_CU_CUSTOMER_ID                       object
TI_CU_DATE_OF_BIRTH             datetime64[us]
TI_CU_DATE_FIRST_REL            datetime64[us]
TI_CU_CUST_TYPE                          int64
TI_CU_NUM_CURR_ACCT                      int64
TI_CU_NUM_REV_ACCT                       int64
TI_CU_NUM_MTGE_ACCT                      int64
TI_CU_NUM_LOAN_ACCT                      int64
TI_CU_NUM_DEP_ACCT                       int64
TI_LN_ACCOUNT_ID                        object
TI_LN_DATE_OPEN                 datetime64[us]
TI_LN_DATE_CLOSED                       object
TI_LN_WRITE_OFF_DATE                    object
TI_LN_REASON_CLOSED                      int64
TI_LN_NUM_PARTIES                        int64
TI_LN_ACCOUNT_TYPE                       int64
TI_LN_PURPOSE                           object
TI_LN_ORIGINAL_TERM                      int64
TI_LN_ORIGINAL_LOAN_AMOUNT               int64
TI_LN_DATE_FIRST_INSTALLMENT    datetime64[us]
TI_LN_PAYMENT_FREQUENCY                  int64
TI_LN_PAYMENT

In [12]:
# for better performance, we can ingest the CSV file into a table within the database
# (`if not exists` keeps this cell safe to re-run once the table is there)
conn.execute("create table if not exists loan_data as select * from 'data/ln_large.csv'")

<duckdb.duckdb.DuckDBPyConnection at 0x7f083a94bc30>

In [14]:
# validate that the table was created by listing the tables in the database
conn.execute("show tables").df()

Unnamed: 0,name
0,loan_data


In [21]:
# we can now query the data from the table; here we count its rows
conn.execute("select count(*) from loan_data").df()

Unnamed: 0,count_star()
0,2522277


In [28]:
# S3 location of the source CSV. It is assigned here because this cell sits
# before the later cell that also assigns `s3_csv_data`; without the assignment
# a fresh "Restart Kernel -> Run All" stops at this cell with a NameError.
s3_csv_data = (
    "s3://sagemaker-us-east-1-152804913371/fico_ml_workshop/data/csv/ln_large.csv"
)
s3_csv_data

's3://sagemaker-us-east-1-152804913371/fico_ml_workshop/data/csv/ln_large.csv'

In [15]:
# let's try a more complex query to profile the numeric columns:
# min, quartiles (p25 / p50 / p75) and max for each column listed below
NUMERIC_PROFILE_COLUMNS = [
    "ti_ln_remaining_term",
    "ti_ln_balance",
    "ti_ln_installment_due",
    "ti_ln_val_payments",
    "ti_ln_val_interest",
    "ti_ln_val_total_fees",
    "ti_ln_final_charge_cycle",
]


def _profile_select(column: str) -> str:
    """Return the SELECT that summarises one numeric column of ``loan_data``."""
    return (
        "    SELECT\n"
        f"        '{column}' AS column_name,\n"
        f"        MIN({column}) AS min_value,\n"
        f"        PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY {column}) AS p25,\n"
        f"        PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY {column}) AS p50,\n"
        f"        PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY {column}) AS p75,\n"
        f"        MAX({column}) AS max_value\n"
        "    FROM loan_data\n"
    )


# one SELECT per column, stitched together with UNION ALL inside a single CTE;
# adding a column to the profile only requires extending the list above
profile_numeric_sql = (
    "\nWITH percentiles AS (\n"
    + "    UNION ALL\n".join(_profile_select(col) for col in NUMERIC_PROFILE_COLUMNS)
    + ")\nSELECT * FROM percentiles;\n"
)

In [16]:
# run the profiling query and display the per-column summary as a DataFrame
conn.execute(profile_numeric_sql).df()

Unnamed: 0,column_name,min_value,p25,p50,p75,max_value
0,ti_ln_remaining_term,1,13.0,25.0,39.0,60
1,ti_ln_balance,0,749.0,1498.0,2250.0,2999
2,ti_ln_installment_due,0,75.0,150.0,224.0,299
3,ti_ln_val_payments,0,75.0,149.0,225.0,299
4,ti_ln_val_interest,0,74.0,149.0,225.0,299
5,ti_ln_val_total_fees,0,75.0,149.0,224.0,299
6,ti_ln_final_charge_cycle,0,0.0,0.0,0.0,199


In [17]:
# DuckDB can also export the table as Parquet for better performance and
# interoperability; the output is partitioned by the year and month of TI_LN_DATE_OPEN
parquet_export_sql = """copy (select *, 
    year(TI_LN_DATE_OPEN) as TI_LN_DATE_OPEN_YEAR, 
    month(ti_ln_date_open) as TI_LN_DATE_OPEN_MONTH 
    from loan_data) 
    to 'parquet_output' 
    (FORMAT PARQUET, PARTITION_BY (TI_LN_DATE_OPEN_YEAR, TI_LN_DATE_OPEN_MONTH), OVERWRITE_OR_IGNORE true)"""
conn.execute(parquet_export_sql)

<duckdb.duckdb.DuckDBPyConnection at 0x7f083a94bc30>

In [20]:
import boto3
import sagemaker
from pathlib import Path
from sagemaker.pytorch.processing import PyTorchProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput

role = sagemaker.get_execution_role()  # execution role passed to the SageMaker jobs created below
sess = sagemaker.session.Session()  # sagemaker session for interacting with different AWS APIs
# region name of the session, read through the public `boto_region_name`
# property instead of the private `_region_name` attribute
region = sess.boto_region_name
bucket = sess.default_bucket()  # default bucket name
account_id = sess.account_id()  # account id reported by the session

In [21]:
# processor used to run the conversion script as a SageMaker processing job
# (PyTorch framework image 2.2 / Python 3.10, a single ml.m5.xlarge instance)
processor = PyTorchProcessor(
    role=role,
    base_job_name="processing-job",
    framework_version="2.2",
    py_version="py310",
    instance_count=1,
    instance_type="ml.m5.xlarge",
)

In [23]:
# S3 locations: the CSV that feeds the job and the prefix that receives its results
s3_csv_data = "s3://sagemaker-us-east-1-152804913371/fico_ml_workshop/data/csv/ln_large.csv"
s3_output_location = "s3://sagemaker-us-east-1-152804913371/fico_ml_workshop/data/processing_output"

# the data is read from `source` in S3 and copied into the `destination`
# folder inside the processing instance
csv_input = ProcessingInput(
    input_name="data",
    source=s3_csv_data,
    destination="/opt/ml/processing/input",
)
job_inputs = [csv_input]

# whatever the script writes to the `source` folder inside the processing
# instance is stored at `destination` in S3
parquet_output = ProcessingOutput(
    output_name="data_structured",
    source="/opt/ml/processing/output",
    destination=s3_output_location,
)
job_outputs = [parquet_output]

In [26]:
# command-line arguments handed to the script: the folders inside the
# processing instance to read the input from and write the output to
script_args = [
    "--input_dir",
    "/opt/ml/processing/input",
    "--output_dir",
    "/opt/ml/processing/output",
]

# launch the processing job with the script found in `processing_script/`
job = processor.run(
    source_dir="processing_script",
    code="convert_to_parquet.py",
    arguments=script_args,
    inputs=job_inputs,
    outputs=job_outputs,
)

INFO:sagemaker:Creating processing-job with name processing-job-2024-09-09-15-28-46-164


Collecting duckdb (from -r requirements.txt (line 1))
  Downloading duckdb-1.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (762 bytes)
Collecting duckdb-engine (from -r requirements.txt (line 2))
  Downloading duckdb_engine-0.13.2-py3-none-any.whl.metadata (7.9 kB)
Collecting sqlalchemy>=1.3.22 (from duckdb-engine->-r requirements.txt (line 2))
  Downloading SQLAlchemy-2.0.34-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Downloading duckdb-1.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (20.1 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 20.1/20.1 MB 58.0 MB/s eta 0:00:00
Downloading duckdb_engine-0.13.2-py3-none-any.whl (47 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 47.4/47.4 kB 7.8 MB/s eta 0:00:00
Downloading SQLAlchemy-2.0.34-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.1/3.1 MB 106.3 MB/s eta 0:00:00
Installing collected packages: sq

In [27]:
!aws s3 ls $s3_output_location/ --recursive

2024-09-09 15:31:27    2375015 fico_ml_workshop/data/processing_output/TI_LN_DATE_OPEN_YEAR=2019/TI_LN_DATE_OPEN_MONTH=1/data_0.parquet
2024-09-09 15:31:27    2058040 fico_ml_workshop/data/processing_output/TI_LN_DATE_OPEN_YEAR=2019/TI_LN_DATE_OPEN_MONTH=10/data_0.parquet
2024-09-09 15:31:27    1944341 fico_ml_workshop/data/processing_output/TI_LN_DATE_OPEN_YEAR=2019/TI_LN_DATE_OPEN_MONTH=11/data_0.parquet
2024-09-09 15:31:27    2035199 fico_ml_workshop/data/processing_output/TI_LN_DATE_OPEN_YEAR=2019/TI_LN_DATE_OPEN_MONTH=12/data_0.parquet
2024-09-09 15:31:27    2110977 fico_ml_workshop/data/processing_output/TI_LN_DATE_OPEN_YEAR=2019/TI_LN_DATE_OPEN_MONTH=2/data_0.parquet
2024-09-09 15:31:27    2267208 fico_ml_workshop/data/processing_output/TI_LN_DATE_OPEN_YEAR=2019/TI_LN_DATE_OPEN_MONTH=3/data_0.parquet
2024-09-09 15:31:27    2168262 fico_ml_workshop/data/processing_output/TI_LN_DATE_OPEN_YEAR=2019/TI_LN_DATE_OPEN_MONTH=4/data_0.parquet
2024-09-09 15:31:27    2119934 fico_ml_worksh

In [48]:
from sagemaker.remote_function import remote
from sagemaker import image_uris

# look up the URI of the PyTorch 2.3 / Python 3.11 training image for the
# current region and instance type; it is used as the remote function's image
image_uri = image_uris.retrieve(
    framework="pytorch",
    image_scope="training",
    region=region,
    version="2.3",
    py_version="py311",
    instance_type="ml.m5.xlarge",
)


@remote(
    instance_type="ml.m5.xlarge",
    dependencies="processing_script/requirements.txt",
    image_uri=image_uri,
)
def convert_to_parquet(input_s3_path: str, output_s3_path: str) -> str:
    """Convert the data at ``input_s3_path`` into a partitioned Parquet dataset.

    The input is first loaded into a table of a local DuckDB database file
    (``temp_data.duckdb``) and then copied out in Parquet format, partitioned
    by the year and month of ``TI_LN_DATE_OPEN``.

    Args:
        input_s3_path: Location of the source data. It is interpolated into the
            SQL as a quoted path, so it must not contain single quotes.
        output_s3_path: Destination of the Parquet dataset. It is interpolated
            into the SQL the same way.

    Returns:
        ``output_s3_path``, unchanged.
    """
    # the context manager closes the connection even when a statement fails
    with duckdb.connect("temp_data.duckdb") as conn:
        # configure S3 access
        conn.execute(
            """CREATE SECRET s3_access (
           TYPE S3,
           PROVIDER CREDENTIAL_CHAIN
        );"""
        )

        # create a temporary table from data in S3; OR REPLACE keeps the call
        # re-runnable when the database file already holds the table
        conn.execute(
            f"CREATE OR REPLACE TABLE temp_table AS SELECT * FROM '{input_s3_path}'"
        )

        # convert the data to parquet format
        conn.execute(
            f"""copy (select *, 
    year(TI_LN_DATE_OPEN) as TI_LN_DATE_OPEN_YEAR, 
    month(ti_ln_date_open) as TI_LN_DATE_OPEN_MONTH 
    from temp_table) 
    to '{output_s3_path}' 
    (FORMAT PARQUET, PARTITION_BY (TI_LN_DATE_OPEN_YEAR, TI_LN_DATE_OPEN_MONTH), OVERWRITE_OR_IGNORE true)"""
        )

    return output_s3_path

In [49]:
# run the conversion as a remote function; the Parquet files go under this prefix
remote_output_s3_path = "s3://sagemaker-us-east-1-152804913371/fico_ml_workshop/data/remote_func_output"
convert_to_parquet(s3_csv_data, remote_output_s3_path)

2024-09-09 16:46:28,931 sagemaker.remote_function INFO     Serializing function code to s3://sagemaker-us-east-1-152804913371/convert-to-parquet-2024-09-09-16-46-28-930/function
2024-09-09 16:46:29,054 sagemaker.remote_function INFO     Serializing function arguments to s3://sagemaker-us-east-1-152804913371/convert-to-parquet-2024-09-09-16-46-28-930/arguments
2024-09-09 16:46:29,352 sagemaker.remote_function INFO     Copied dependencies file at 'processing_script/requirements.txt' to '/tmp/tmpakwarjfl/temp_workspace/sagemaker_remote_function_workspace/requirements.txt'
2024-09-09 16:46:29,353 sagemaker.remote_function INFO     Successfully created workdir archive at '/tmp/tmpakwarjfl/workspace.zip'
2024-09-09 16:46:29,385 sagemaker.remote_function INFO     Successfully uploaded workdir to 's3://sagemaker-us-east-1-152804913371/convert-to-parquet-2024-09-09-16-46-28-930/sm_rf_user_ws/workspace.zip'
2024-09-09 16:46:29,386 sagemaker.remote_function INFO     Creating job: convert-to-parqu

2024-09-09 16:46:29 Starting - Starting the training job...
2024-09-09 16:46:44 Starting - Preparing the instances for training...
2024-09-09 16:47:07 Downloading - Downloading input data...
2024-09-09 16:47:33 Downloading - Downloading the training image.....INFO: CONDA_PKGS_DIRS is set to '/opt/ml/sagemaker/warmpoolcache/sm_remotefunction_user_dependencies_cache/conda/pkgs'
INFO: PIP_CACHE_DIR is set to '/opt/ml/sagemaker/warmpoolcache/sm_remotefunction_user_dependencies_cache/pip'
INFO: Bootstraping runtime environment.
2024-09-09 16:48:42,999 sagemaker.remote_function INFO     Successfully unpacked workspace archive at '/'.
2024-09-09 16:48:42,999 sagemaker.remote_function INFO     '/sagemaker_remote_function_workspace/pre_exec.sh' does not exist. Assuming no pre-execution commands to run
2024-09-09 16:48:43,000 sagemaker.remote_function INFO     Running command: '/opt/conda/bin/python -m pip install -r /sagemaker_remote_function_workspace/requirements.txt -U' in the dir: '/' 
2024

's3://sagemaker-us-east-1-152804913371/fico_ml_workshop/data/remote_func_output'

In [51]:
# list the top-level entries under the prefix the remote function was asked to write to
!aws s3 ls s3://sagemaker-us-east-1-152804913371/fico_ml_workshop/data/remote_func_output/

                           PRE TI_LN_DATE_OPEN_YEAR=2019/
                           PRE TI_LN_DATE_OPEN_YEAR=2020/
                           PRE TI_LN_DATE_OPEN_YEAR=2021/
                           PRE TI_LN_DATE_OPEN_YEAR=2022/
