In [1]:
import boto3
import sagemaker
from pathlib import Path
from sagemaker.pytorch import PyTorch

# IAM role that the SageMaker training job launched below will run under.
role = sagemaker.get_execution_role()
# SageMaker session used to resolve the region, default bucket and account.
sess = sagemaker.session.Session()
# Region of the current session; read through the public `boto_region_name`
# property instead of the private `_region_name` attribute.
region = sess.boto_region_name
# Default SageMaker bucket for this account/region; job output is written here.
bucket = sess.default_bucket()
# AWS account ID the session is authenticated against.
account_id = sess.account_id()

In [30]:
# CSV read by the aggregation job.
# NOTE(review): this URI pins a specific bucket, whereas the output prefix below
# follows the session's default bucket — confirm the data is reachable from
# other accounts before sharing the notebook.
input_data_location = (
    "s3://sagemaker-us-east-1-152804913371/fico_ml_workshop/data/csv/ln_large.csv"
)

# S3 prefix (under the session's default bucket) that receives the job results.
output_location = f"s3://{bucket}/fico_ml_workshop/aggregation-job/output"

In [37]:
# Switch between the multi-node Ray variant of the job and a single-node run.
USE_RAY = True

# Estimator that runs ray_script/compute_aggregations.py in the PyTorch 2.2 /
# Python 3.10 container.
job = PyTorch(
    role=role,
    # What to run and in which container.
    entry_point="compute_aggregations.py",
    source_dir="ray_script",
    framework_version="2.2",
    py_version="py310",
    # Cluster shape: three instances when Ray is enabled, otherwise one.
    instance_type="ml.m5.xlarge",
    instance_count=(3 if USE_RAY else 1),
    # Runtime limit and keep-alive window after the job ends (both in seconds
    # per the SageMaker estimator documentation).
    max_run=1000,
    keep_alive_period_in_seconds=300,
    # The flag is passed to the container as the string "True" / "False".
    environment={"USE_RAY": str(USE_RAY)},
    # Input and output S3 locations handed to the entry-point script.
    hyperparameters=dict(
        input_data_location=input_data_location,
        output_data_location=output_location,
    ),
)

In [38]:
# Submit the training job defined above. No input channels are passed: the
# script receives its S3 locations through the estimator's hyperparameters.
# With the SDK defaults this call presumably blocks and streams the job logs
# (as seen in the output below) — confirm against the SageMaker SDK docs.
job.fit()

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: pytorch-training-2024-09-11-18-24-14-866


2024-09-11 18:24:20 Starting - Found matching resource for reuse..bash: cannot set terminal process group (-1): Inappropriate ioctl for device
bash: no job control in this shell
2024-09-11 18:24:33,141 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training
2024-09-11 18:24:33,142 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-09-11 18:24:33,143 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)
2024-09-11 18:24:33,153 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.
2024-09-11 18:24:33,155 sagemaker_pytorch_container.training INFO     Invoking user training script.
bash: cannot set terminal process group (-1): Inappropriate ioctl for device
bash: no job control in this shell
2024-09-11 18:24:33,165 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training
2024-09-11 18:24:33,166 sagemaker-training-to

In [12]:
import awswrangler as wr
# import modin.pandas as pd
import pandas as pd

2024-09-11 17:27:46,008	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2024-09-11 17:27:46,141	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [13]:
# Select the distributed backend for awswrangler: the "ray" execution engine
# with the "modin" memory format.
# To run locally instead, switch the two values to "python" and "pandas".
wr.engine.set("ray")
wr.memory_format.set("modin")


In [14]:
# Load the same CSV that is handed to the SageMaker job. Reusing
# `input_data_location` (defined with the other S3 locations above) avoids a
# second hard-coded copy of the URI that could drift out of sync.
# With the engine/memory-format settings above, the result is presumably a
# Modin DataFrame rather than a plain pandas one.
df = wr.s3.read_csv(input_data_location)

INFO:awswrangler.distributed.ray._core:Initializing a Ray instance
2024-09-11 17:27:54,115	INFO worker.py:1598 -- Connecting to existing Ray cluster at address: 172.31.42.252:9339...
2024-09-11 17:27:54,121	INFO worker.py:1774 -- Connected to Ray cluster. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m
2024-09-11 17:27:55,354	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-09-09_14-33-33_297169_76/logs/ray-data
2024-09-11 17:27:55,355	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadArrowCSV]
                                                                                                       
✔️  Dataset execution finished in 16.60 seconds: 100%|██████████| 2.52M/2.52M [00:16<00:00, 152k row/s]         

- ReadArrowCSV->SplitBlocks(64): 0 active, 0 queued, [cpu: 0.0, objects: 5.0MB]: : 2.52M row [00:16, 152k row/s]
2024-09-11 17:28:11,973	INFO streamin

In [15]:
# One row per (customer, account): mean balance, mean payment value and the
# summed months-in-arrears counter.
account_aggregation = (
    df.groupby(by=["TI_CU_CUSTOMER_ID", "TI_LN_ACCOUNT_ID"])
    .agg(
        **{
            "AVERAGE_ACCOUNT_BALANCE": ("TI_LN_BALANCE", "mean"),
            "AVERAGE_MIN_PAYMENT": ("TI_LN_VAL_PAYMENTS", "mean"),
            "LATE_PAYMENTS": ("TI_LN_NUM_MTHS_IN_ARREARS", "sum"),
        }
    )
    # Turn the group keys back into ordinary columns.
    .reset_index()
)

In [16]:
def _payment_date(row):
    """Return the row's open date shifted forward by its elapsed months."""
    return row["TI_LN_DATE_OPEN"] + pd.DateOffset(months=row["months_elapsed"])


# Parse the open date, derive how many months of the term have elapsed, and
# from those compute the date each record refers to.
df["TI_LN_DATE_OPEN"] = pd.to_datetime(df["TI_LN_DATE_OPEN"])
df["months_elapsed"] = df["TI_LN_ORIGINAL_TERM"].sub(df["TI_LN_REMAINING_TERM"])
# Row-wise apply: one DateOffset is built per row, so this is the slow step of
# the cell on large inputs.
df["payment_date"] = df.apply(_payment_date, axis=1)

# Per customer and calendar month: summed balance, summed arrears counter and
# the number of account records.
# NOTE(review): "TOTAL_MOPNTHLY_BALANCE" looks like a typo for "MONTHLY"; it is
# left unchanged because the column name ends up in the CSV written later.
monthly_balances = (
    df.groupby(by=["TI_CU_CUSTOMER_ID", df["payment_date"].dt.to_period("M")])
    .agg(
        **{
            "TOTAL_MOPNTHLY_BALANCE": ("TI_LN_BALANCE", "sum"),
            "TOTAL_ARREARS": ("TI_LN_NUM_MTHS_IN_ARREARS", "sum"),
            "NUM_ACCOUNTS": ("TI_LN_ACCOUNT_ID", "count"),
        }
    )
    .reset_index()
)

INFO:modin.logger.default:Using sequential splitting in '.from_pandas()' because of some of the conditions are False: enough_elements=False; all_numeric_types=False; async_mode_on=False
Please refer to https://modin.readthedocs.io/en/stable/supported_apis/defaulting_to_pandas.html for explanation.
INFO:modin.logger.default:Using sequential splitting in '.from_pandas()' because of some of the conditions are False: enough_elements=False; all_numeric_types=True; async_mode_on=False


In [23]:
# The groupby keyed on `payment_date` as a monthly Period; convert that column
# to timestamps before the frame is written out.
# NOTE(review): presumably not idempotent — re-running this cell without
# re-running the aggregation would call `.dt.to_timestamp()` on a column that
# is already datetime. Confirm, and re-run the aggregation cell first.
monthly_balances["payment_date"] = monthly_balances["payment_date"].dt.to_timestamp()

In [25]:
# Write both aggregates under the `output_location` prefix defined with the
# other S3 locations, instead of a hard-coded bucket name, so the notebook
# follows whichever default bucket the session resolves to.
# The bare last expression displays the paths returned by the second write.
wr.s3.to_csv(account_aggregation, f"{output_location}/account_aggregation.csv")
wr.s3.to_csv(monthly_balances, f"{output_location}/monthly_balances.csv")

2024-09-11 17:30:35,225	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-09-09_14-33-33_297169_76/logs/ray-data
2024-09-11 17:30:35,226	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> AllToAllOperator[Repartition] -> TaskPoolMapOperator[Write]
Running 0: 0.00 row [00:00, ? row/s]
[A

[A[A

[A[A

                                                                                          
[A                                                                                                                                     

[A[A                                                            


✔️  Dataset execution finished in 1.41 seconds: : 1.00 row [00:01, 1.41s/ row]        

[A

[A[A
[A                                                                                                                                   

[A[A                                                            


-

{'paths': ['s3://sagemaker-us-east-1-152804913371/fico_ml_workshop/aggregation-job/output/monthly_balances.csv'],
 'partitions_values': {}}