In [44]:
from scripts.functions import *

import os
import pandas as pd
import numpy as np
import boto3
from io import StringIO
from urllib.parse import urlparse

In [45]:
# Load the project configuration used by every cell below.
# `read_settings` is not defined in this notebook; presumably it arrives through the
# star import from scripts.functions — TODO confirm.
print("Settings:")
settings = read_settings('scripts/settings.json')
# NOTE(review): echoing the whole settings object writes bucket names and the MLflow
# tracking-server ARN (which embeds the AWS account id) into the saved notebook output.
print(settings)

Settings:
{'project_name': '01-churn', 'bucket_name': 'sagemaker-bucket-ds', 'project_path_s3': '01-churn/v1-prod', 'mlflow_arn': 'arn:aws:sagemaker:eu-west-1:211125740051:mlflow-tracking-server/Sample-server', 'mlflow_experiment_name': '01-churn', 'mlflow_model_name': '01-churn', 'preprocessing_job_name': '01-churn-preprocessing', 'training_job_name': '01-churn', 'transform_job_name': '01-churn', 'preprocessing_output_train': '/opt/ml/processing/output/train', 'preprocessing_output_valid': '/opt/ml/processing/output/valid', 'preprocessing_output_test': '/opt/ml/processing/output/test', 'preprocessing_output_inference_train': '/opt/ml/processing/output/inference_train', 'preprocessing_step_name': '01-churn-preprocessing', 'training_step_name': '01-churn-training', 'modelcreate_step_name': '01-churn-model-create', 'transformer_step_name': '01-churn-transformer', 'reference_inference_train': 's3://sagemaker-bucket-ds/01-churn/v1/output/base_pred/train.csv', 'max_diff_pred_train_accept': 

In [46]:
def read_csv_from_s3(s3_path, **read_csv_kwargs):
    """Download a CSV object from S3 and load it into a pandas DataFrame.

    Args:
        s3_path: URI of the form ``s3://<bucket>/<key>``.
        **read_csv_kwargs: Optional keyword arguments forwarded unchanged to
            ``pd.read_csv`` (for example ``header=None`` for a header-less file).
            When none are given, pandas' defaults apply.

    Returns:
        pandas.DataFrame parsed from the object's UTF-8 decoded content.

    Raises:
        ValueError: If ``s3_path`` is not an ``s3://`` URI containing both a
            bucket and a key.
    """
    print(f"Reading dataframe from following S3 path: {s3_path}")

    # Split the URI: the netloc is the bucket, the path (minus leading '/') is the key
    parsed_url = urlparse(s3_path)
    bucket_name = parsed_url.netloc
    file_key = parsed_url.path.lstrip('/')

    # Fail fast with a readable message before any AWS call is attempted
    if parsed_url.scheme != 's3' or not bucket_name or not file_key:
        raise ValueError(
            f"Expected an S3 URI of the form 's3://bucket/key', got: {s3_path!r}"
        )

    s3_client = boto3.client('s3')

    # The whole object is read into memory and decoded as UTF-8 before parsing
    response = s3_client.get_object(Bucket=bucket_name, Key=file_key)
    csv_content = response['Body'].read().decode('utf-8')

    return pd.read_csv(StringIO(csv_content), **read_csv_kwargs)

# NOTE(review): `df` is not referenced again in the cells visible here, and the next cell
# downloads the same S3 object again as `df_base`. Looks like a leftover smoke test of the
# helper — confirm nothing else relies on `df` before removing it.
df = read_csv_from_s3(settings["reference_inference_train"])

Reading dataframe from following S3 path: s3://sagemaker-bucket-ds/01-churn/v1/output/base_pred/train.csv


In [47]:
# Load the reference predictions that the pipeline output is compared against.
df_base = read_csv_from_s3(settings["reference_inference_train"])
# Replaces whatever column labels read_csv produced; fails if the file does not have
# exactly two columns.
# NOTE(review): read_csv_from_s3 calls pd.read_csv without a `header` argument, so the
# first line of the file is taken as the header. If this CSV has no header row, its first
# record is silently dropped here — confirm the file layout.
df_base.columns = ['pred_base', 'target']

print("DataFrame base head:\n", df_base.head())

Reading dataframe from following S3 path: s3://sagemaker-bucket-ds/01-churn/v1/output/base_pred/train.csv
DataFrame base head:
    pred_base  target
0    0.99987       1
1    0.99387       1
2    0.07881       0
3    0.01232       0
4    0.57458       1


In [48]:
# Build the S3 URI of the pipeline's prediction file with explicit '/' separators.
# os.path.join is meant for local filesystem paths: it uses the host OS separator
# (backslashes on Windows) and discards earlier parts when a later part starts with '/',
# neither of which is valid for an S3 URI.
pipeline_prediction_path = "s3://" + "/".join(
    part.strip("/")
    for part in (
        settings['bucket_name'],
        settings['project_path_s3'],
        'output',
        'inference_train',
        'inference_train.csv.out',
    )
)

df_pipeline = read_csv_from_s3(pipeline_prediction_path)
# Replaces whatever column label read_csv produced; fails unless the file has exactly one column.
# NOTE(review): the file is parsed with pd.read_csv's default header handling, so if it has
# no header row the first prediction is consumed as the column name — confirm the file layout.
df_pipeline.columns = ['pred_pipeline']
print("DataFrame pipeline head:\n", df_pipeline.head())

Reading dataframe from following S3 path: s3://sagemaker-bucket-ds/01-churn/v1-prod/output/inference_train/inference_train.csv.out
DataFrame pipeline head:
    pred_pipeline
0       0.999872
1       0.993868
2       0.078811
3       0.012315
4       0.574609


In [49]:
# Put reference and pipeline predictions side by side and compute the per-row absolute gap.
# pd.concat(axis=1) aligns on the index and pads the shorter frame with NaN instead of
# failing, and NaN differences are skipped by a later .max(); so a row-count mismatch must
# be rejected explicitly or it would go unnoticed.
if len(df_base) != len(df_pipeline):
    raise ValueError(
        f"Row count mismatch: base has {len(df_base)} rows, "
        f"pipeline has {len(df_pipeline)} rows"
    )

df_all = pd.concat([df_base, df_pipeline], axis = 1)
df_all['abs_diff'] = (df_all['pred_base'] - df_all['pred_pipeline']).abs()
print("DataFrame all head:\n", df_all.head())

DataFrame all head:
    pred_base  target  pred_pipeline      abs_diff
0    0.99987       1       0.999872  1.709674e-06
1    0.99387       1       0.993868  1.866839e-06
2    0.07881       0       0.078811  6.618378e-07
3    0.01232       0       0.012315  5.314762e-06
4    0.57458       1       0.574609  2.897779e-05


In [50]:
# Gate: fail the notebook when pipeline predictions drift from the reference predictions
# by more than the configured tolerance.

# Series.max() skips NaN and returns NaN for an empty/all-NaN column, and `NaN > threshold`
# is False, so missing differences would otherwise be reported as a pass.
if df_all['abs_diff'].empty or df_all['abs_diff'].isna().any():
    raise ValueError(
        "abs_diff is empty or contains NaN; predictions cannot be compared row by row"
    )

max_diff = df_all['abs_diff'].max()
print(f"Maximum difference in abs_diff column: {max_diff}")

threshold = settings["max_diff_pred_train_accept"]
print(f"Maximum threshold for difference in predictions is: {threshold}")

if max_diff > threshold:
    raise ValueError(f"Maximum difference {max_diff} exceeds the acceptable threshold of {threshold}")
else:
    print("Maximum difference is within the acceptable threshold.")

Maximum difference in abs_diff column: 0.00014225056765776456
Maximum threshold for difference in predictions is: 0.0002
Maximum difference is within the acceptable threshold.
