# Train

Based on [the example](https://github.com/aws/amazon-sagemaker-examples/blob/master/sagemaker-python-sdk/scikit_learn_inference_pipeline/Inference%20Pipeline%20with%20Scikit-learn%20and%20Linear%20Learner.ipynb).

In [1]:
import sagemaker
from sagemaker import get_execution_role

sagemaker_session = sagemaker.Session()

# Get a SageMaker-compatible role used by this Notebook Instance.
role = get_execution_role()

# S3 prefix
bucket = 'task-bucket-2021'
prefix = 'sagemaker-sklearn-linearlearner'

# Prepare the train, test, and submit data

In [69]:
import pandas as pd
from sklearn.model_selection import train_test_split

data_df = pd.read_csv('../data/dataset.csv', sep=';', na_values='NA', )

label = 'default'
variables = [x for x in data_df.columns if x != 'uuid' and x != label]
numeric_cols = [
    'account_amount_added_12_24m',
    'account_days_in_dc_12_24m',
    'account_days_in_rem_12_24m',
    'account_days_in_term_12_24m',
    'account_incoming_debt_vs_paid_0_24m',
    'age',
    'avg_payment_span_0_12m',
    'avg_payment_span_0_3m',
    'max_paid_inv_0_12m',
    'max_paid_inv_0_24m',
    'num_active_div_by_paid_inv_0_12m',
    'num_active_inv',
    'num_arch_dc_0_12m',
    'num_arch_dc_12_24m',
    'num_arch_ok_0_12m',
    'num_arch_ok_12_24m',
    'num_arch_rem_0_12m',
    'num_arch_written_off_0_12m',
    'num_arch_written_off_12_24m',
    'num_unpaid_bills',
    'recovery_debt',
    'sum_capital_paid_account_0_12m',
    'sum_capital_paid_account_12_24m',
    'sum_paid_inv_0_12m',
    'time_hours'    
]
categorical_cols = [x for x in variables if x not in numeric_cols]

for col in categorical_cols:
    data_df[col] = pd.Categorical(data_df[col]).codes

train_test_df = data_df[data_df[label].notna()][[label] + variables]
train, test = train_test_split(train_test_df, test_size=0.2,  random_state=42)

class_0 = train[train[label] == 0]
class_1 = train[train[label] == 1]
down_class_0 = class_0.sample(4*len(class_1), random_state=42) # class_0 : class_1 = 4 : 1

train_down = pd.concat([down_class_0, class_1]).sample(frac=1, random_state=42)
    
submit_df = data_df[data_df[label].isna()][['uuid'] + variables]

train.to_csv('../data/train.csv', index=False)
test.to_csv('../data/test.csv', index=False)
train_down.to_csv('../data/train_down.csv', index=False)
submit_df.to_csv('../data/submit.csv', index=False)

### Upload the full train data. Later, we will try to use the down sampled train data.

In [3]:
WORK_DIRECTORY = "../data"

train_input = sagemaker_session.upload_data(
    path="{}/{}".format(WORK_DIRECTORY, "train.csv"),
    bucket=bucket,
    key_prefix="{}/{}".format(prefix, "train"),
)

# Create SageMaker Scikit Estimator for preprocessing 

In [92]:
from sagemaker.sklearn.estimator import SKLearn

FRAMEWORK_VERSION = "0.23-1"
script_path = "../preprocess/preprocess.py"

sklearn_preprocessor = SKLearn(
    entry_point=script_path,
    role=role,
    framework_version=FRAMEWORK_VERSION,
    instance_type="ml.m4.xlarge",
    sagemaker_session=sagemaker_session,
)

In [93]:
sklearn_preprocessor.fit({"train": train_input})

2021-07-20 10:32:40 Starting - Starting the training job...
2021-07-20 10:33:02 Starting - Launching requested ML instancesProfilerReport-1626777159: InProgress
......
2021-07-20 10:34:03 Starting - Preparing the instances for training.........
2021-07-20 10:35:31 Downloading - Downloading input data
2021-07-20 10:35:31 Training - Downloading the training image...
2021-07-20 10:36:06 Training - Training image download completed. Training in progress..[34m2021-07-20 10:36:07,323 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2021-07-20 10:36:07,325 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-07-20 10:36:07,336 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2021-07-20 10:36:07,640 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-07-20 10:36:07,654 sagemaker-training-toolkit INFO     No GPUs detected (n

In [94]:
# Define a SKLearn Transformer from the trained SKLearn Estimator
transformer = sklearn_preprocessor.transformer(
    instance_count=1, instance_type="ml.m4.xlarge", assemble_with="Line", accept="text/csv",
    strategy='SingleRecord', max_payload=100
)

In [95]:
# Preprocess training input
transformer.transform(train_input, content_type="text/csv")
print("Waiting for transform job: " + transformer.latest_transform_job.job_name)
transformer.wait()
preprocessed_train = transformer.output_path

.............................[34m2021-07-20 10:41:33,408 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[34m2021-07-20 10:41:33,411 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[34m2021-07-20 10:41:33,412 INFO - sagemaker-containers - nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;
[0m
[34mworker_rlimit_nofile 4096;
[0m
[34mevents {
  worker_connections 2048;[0m
[34m}
[0m
[34mhttp {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log /dev/stdout combined;

  upstream gunicorn {
    server unix:/tmp/gunicorn.sock;
  }

  server {
    listen 8080 deferred;
    client_max_body_size 0;

    keepalive_timeout 3;

    location ~ ^/(ping|invocations|execution-parameters) {
      proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
      proxy_set_header Host $http_host;
      proxy_redirect

# Fit a LinearLearner Model with the preprocessed data

In [63]:
import boto3
from sagemaker.image_uris import retrieve

ll_image = retrieve("linear-learner", boto3.Session().region_name)

In [44]:
s3_ll_output_key_prefix = "ll_training_output"
s3_ll_output_location = "s3://{}/{}/{}/{}".format(
    bucket, prefix, s3_ll_output_key_prefix, "ll_model"
)

ll_estimator = sagemaker.estimator.Estimator(
    ll_image,
    role,
    instance_count=1,
    instance_type="ml.m4.xlarge",
    volume_size=5,
    max_run=3600,
    input_mode="File",
    output_path=s3_ll_output_location,
    sagemaker_session=sagemaker_session,
)

ll_estimator.set_hyperparameters(predictor_type="binary_classifier", feature_dim=171, wd=1.0) # To reduce overfitting due to multicollinearity, set L2 regularization.

ll_train_data = sagemaker.inputs.TrainingInput(
    preprocessed_train,
    distribution="FullyReplicated",
    content_type="text/csv",
    s3_data_type="S3Prefix",
)

data_channels = {"train": ll_train_data}
ll_estimator.fit(inputs=data_channels, logs=True)

2021-07-19 09:45:23 Starting - Starting the training job...
2021-07-19 09:45:46 Starting - Launching requested ML instancesProfilerReport-1626687923: InProgress
...
2021-07-19 09:46:12 Starting - Preparing the instances for training............
2021-07-19 09:48:16 Downloading - Downloading input data...
2021-07-19 09:48:48 Training - Training image download completed. Training in progress..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[07/19/2021 09:48:54 INFO 139937165485888] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-input.json: {'mini_batch_size': '1000', 'epochs': '15', 'feature_dim': 'auto', 'use_bias': 'true', 'binary_classifier_model_selection_criteria': 'accuracy', 'f_beta': '1.0', 'target_recall': '0.8', 'target_precision': '0.8', 'num_models': 'auto', 'num_calibration_samples': '10000000', 'init_method': 'uniform', 'init_scale': '0.07', 'init_

# Create an Inference Pipeline with Scikit preprocessor and Linear Learner

In [45]:
from sagemaker.model import Model
from sagemaker.pipeline import PipelineModel
import boto3
from time import gmtime, strftime

timestamp_prefix = strftime("%Y-%m-%d-%H-%M-%S", gmtime())

scikit_learn_inferencee_model = sklearn_preprocessor.create_model()
linear_learner_model = ll_estimator.create_model()

model_name = "inference-pipeline-" + timestamp_prefix
endpoint_name = "full-train-ep-" + timestamp_prefix
sm_model = PipelineModel(
    name=model_name, role=role, models=[scikit_learn_inferencee_model, linear_learner_model]
)

sm_model.deploy(initial_instance_count=1, instance_type="ml.m4.xlarge", endpoint_name=endpoint_name)

---------------!

# Test the endpoint

In [82]:
import numpy as np
from sagemaker.predictor import Predictor
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer

predictor = Predictor(
    endpoint_name=endpoint_name, sagemaker_session=sagemaker_session, serializer=CSVSerializer(), deserializer=JSONDeserializer()
)


test = pd.read_csv("{}/{}".format(WORK_DIRECTORY, "test.csv"))
label = 'default'
variables = [x for x in test.columns if x != 'uuid' and x != label]
test_X, test_y = test[variables], test[label].values
test_X.to_csv('../data/test_X.csv', index=False)

number_lines = len(test_X)

In [None]:
import csv
import json

predictions = []

with open('../data/test_X.csv', 'r') as f:
    reader = csv.reader(f)
    header = ','.join(next(reader)) + '\n'
    
    batch_size = 100
    data = ''
    for i, row in enumerate(reader, start=1):
        data += ','.join(row) + '\n'
        if i%batch_size == 0:
            data = header + data
            result = predictor.predict(data)
            predictions += [r["predicted_label"] for r in result["predictions"]]
            data = ''
        elif i == number_lines: 
            data = header + data
            result = predictor.predict(data)
            predictions += [r["predicted_label"] for r in result["predictions"]]

predictions = np.array(predictions)

In [77]:
from sklearn.metrics import classification_report

print(classification_report(test_y, predictions))

              precision    recall  f1-score   support

         0.0       0.99      1.00      0.99     17730
         1.0       1.00      0.01      0.01       266

    accuracy                           0.99     17996
   macro avg       0.99      0.50      0.50     17996
weighted avg       0.99      0.99      0.98     17996



### As expected, recall of default==1 prediction is very low. 

### In practice, we don't want to miss out positive cases. It is better to have high recall even if the model has high false positive rate.

# Delete the endpoint

In [79]:
sm_client = sagemaker_session.boto_session.client("sagemaker")
sm_client.delete_endpoint(EndpointName=endpoint_name)

{'ResponseMetadata': {'RequestId': 'a5b53e28-3ea8-4037-a7d3-64e17e87fa54',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'a5b53e28-3ea8-4037-a7d3-64e17e87fa54',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '0',
   'date': 'Mon, 19 Jul 2021 10:43:23 GMT'},
  'RetryAttempts': 0}}

# Set the target recall

In [None]:
s3_ll_output_key_prefix = "ll_training_output"
s3_ll_output_location = "s3://{}/{}/{}/{}".format(
    bucket, prefix, s3_ll_output_key_prefix, "ll_model"
)

ll_estimator = sagemaker.estimator.Estimator(
    ll_image,
    role,
    instance_count=1,
    instance_type="ml.m4.xlarge",
    volume_size=5,
    max_run=3600,
    input_mode="File",
    output_path=s3_ll_output_location,
    sagemaker_session=sagemaker_session,
)

ll_estimator.set_hyperparameters(predictor_type="binary_classifier", feature_dim=171, wd=1.0,
                                binary_classifier_model_selection_criteria='precision_at_target_recall', target_recall=0.9, # Set the target recall.
                                ) 

ll_train_data = sagemaker.inputs.TrainingInput(
    preprocessed_train,
    distribution="FullyReplicated",
    content_type="text/csv",
    s3_data_type="S3Prefix",
)

data_channels = {"train": ll_train_data}
ll_estimator.fit(inputs=data_channels, logs=False)

In [None]:
timestamp_prefix = strftime("%Y-%m-%d-%H-%M-%S", gmtime())

scikit_learn_inferencee_model = sklearn_preprocessor.create_model()
linear_learner_model = ll_estimator.create_model()

model_name = "inference-pipeline-" + timestamp_prefix
endpoint_name = "full-train-ep-" + timestamp_prefix
sm_model = PipelineModel(
    name=model_name, role=role, models=[scikit_learn_inferencee_model, linear_learner_model]
)

sm_model.deploy(initial_instance_count=1, instance_type="ml.m4.xlarge", endpoint_name=endpoint_name)

In [88]:
predictor = Predictor(
    endpoint_name=endpoint_name, sagemaker_session=sagemaker_session, serializer=CSVSerializer(), deserializer=JSONDeserializer()
)

predictions = []

with open('../data/test_X.csv', 'r') as f:
    reader = csv.reader(f)
    header = ','.join(next(reader)) + '\n'
    
    batch_size = 100
    data = ''
    for i, row in enumerate(reader, start=1):
        data += ','.join(row) + '\n'
        if i%batch_size == 0:
            data = header + data
            result = predictor.predict(data)
            predictions += [r["predicted_label"] for r in result["predictions"]]
            data = ''
        elif i == number_lines: 
            data = header + data
            result = predictor.predict(data)
            predictions += [r["predicted_label"] for r in result["predictions"]]

predictions = np.array(predictions)

print(classification_report(test_y, predictions))

              precision    recall  f1-score   support

         0.0       1.00      0.72      0.84     17730
         1.0       0.05      0.92      0.09       266

    accuracy                           0.72     17996
   macro avg       0.52      0.82      0.46     17996
weighted avg       0.98      0.72      0.82     17996



# Taking into account class weight

In [96]:
import boto3
from sagemaker.image_uris import retrieve

ll_image = retrieve("linear-learner", boto3.Session().region_name)

s3_ll_output_key_prefix = "ll_training_output"
s3_ll_output_location = "s3://{}/{}/{}/{}".format(
    bucket, prefix, s3_ll_output_key_prefix, "ll_model"
)

ll_estimator = sagemaker.estimator.Estimator(
    ll_image,
    role,
    instance_count=1,
    instance_type="ml.m4.xlarge",
    volume_size=5,
    max_run=3600,
    input_mode="File",
    output_path=s3_ll_output_location,
    sagemaker_session=sagemaker_session,
)

ll_estimator.set_hyperparameters(predictor_type="binary_classifier", feature_dim=171, wd=1.0,
                                binary_classifier_model_selection_criteria='precision_at_target_recall', target_recall=0.9, # Set the target recall.
                                 positive_example_weight_mult='balanced',
                                ) 

ll_train_data = sagemaker.inputs.TrainingInput(
    preprocessed_train,
    distribution="FullyReplicated",
    content_type="text/csv",
    s3_data_type="S3Prefix",
)

data_channels = {"train": ll_train_data}
ll_estimator.fit(inputs=data_channels, logs=False)


2021-07-20 10:42:08 Starting - Starting the training job
2021-07-20 10:42:09 Starting - Launching requested ML instances........
2021-07-20 10:42:57 Starting - Preparing the instances for training................
2021-07-20 10:44:23 Downloading - Downloading input data...
2021-07-20 10:44:41 Training - Downloading the training image......
2021-07-20 10:45:16 Training - Training image download completed. Training in progress....................
2021-07-20 10:46:58 Uploading - Uploading generated training model.
2021-07-20 10:47:07 Completed - Training job completed


In [97]:
from sagemaker.model import Model
from sagemaker.pipeline import PipelineModel
import boto3
from time import gmtime, strftime

timestamp_prefix = strftime("%Y-%m-%d-%H-%M-%S", gmtime())

scikit_learn_inferencee_model = sklearn_preprocessor.create_model()
linear_learner_model = ll_estimator.create_model()

model_name = "inference-pipeline-" + timestamp_prefix
endpoint_name = "full-train-balanced-ep-" + timestamp_prefix
sm_model = PipelineModel(
    name=model_name, role=role, models=[scikit_learn_inferencee_model, linear_learner_model]
)

sm_model.deploy(initial_instance_count=1, instance_type="ml.m4.xlarge", endpoint_name=endpoint_name)

---------------!

In [98]:
import numpy as np
import csv
from sagemaker.predictor import Predictor
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer
from sklearn.metrics import classification_report

predictor = Predictor(
    endpoint_name=endpoint_name, sagemaker_session=sagemaker_session, serializer=CSVSerializer(), deserializer=JSONDeserializer()
)

test = pd.read_csv("{}/{}".format(WORK_DIRECTORY, "test.csv"))
label = 'default'
variables = [x for x in test.columns if x != 'uuid' and x != label]
test_X, test_y = test[variables], test[label].values
test_X.to_csv('../data/test_X.csv', index=False)

number_lines = len(test_X)

predictions = []

with open('../data/test_X.csv', 'r') as f:
    reader = csv.reader(f)
    header = ','.join(next(reader)) + '\n'
    
    batch_size = 100
    data = ''
    for i, row in enumerate(reader, start=1):
        data += ','.join(row) + '\n'
        if i%batch_size == 0:
            data = header + data
            result = predictor.predict(data)
            predictions += [r["predicted_label"] for r in result["predictions"]]
            data = ''
        elif i == number_lines: 
            data = header + data
            result = predictor.predict(data)
            predictions += [r["predicted_label"] for r in result["predictions"]]

predictions = np.array(predictions)

print(classification_report(test_y, predictions))

              precision    recall  f1-score   support

         0.0       1.00      0.76      0.86     17730
         1.0       0.05      0.89      0.10       266

    accuracy                           0.76     17996
   macro avg       0.53      0.82      0.48     17996
weighted avg       0.98      0.76      0.85     17996



# Test the endpoint in json format

In [99]:
import json
from sagemaker.serializers import JSONSerializer

json_predictor = Predictor(
    endpoint_name=endpoint_name, sagemaker_session=sagemaker_session, serializer=JSONSerializer(), deserializer=JSONDeserializer(),
    content_type="application/json"
)

data = []
for row in test_X.iterrows():
    data.append({'features': [str(x) for x in row[1].values]})
    
json_data = json.dumps(data)
length = len(json_data)
batch_size = 100

content_type is a no-op in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [100]:
from io import StringIO

predictions = []
list_data = json.loads(json_data)

for i in range(0, length, batch_size):
    data = json.dumps(list_data[i:i+batch_size])
    if data == '[]':
        break
    result = json_predictor.predict(StringIO(data))
    predictions += [r["predicted_label"] for r in result["predictions"]]

json_predictions = np.array(predictions)

print(sum(np.equal(json_predictions, predictions)))
print(classification_report(test_y, json_predictions))

17996
              precision    recall  f1-score   support

         0.0       1.00      0.45      0.62     17730
         1.0       0.03      0.96      0.05       266

    accuracy                           0.46     17996
   macro avg       0.51      0.71      0.34     17996
weighted avg       0.98      0.46      0.62     17996



### Test data for the REST API

In [101]:
tmp = []
for features in list_data[0:5]:
    tmp.append(','.join(features['features']))
    
string_data = ';'.join(tmp)
string_data

'0.0,0.0,0.0,0.0,0.261935118288709,1.0,1.0,-1.0,-1.0,-1.0,19.0,15.666666666666698,16.0,22.0,4.0,1.0,11270.0,11270.0,0.0,0.666666666666667,2.0,0.0,0.0,3.0,1.0,0.0,0.0,0.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,4567.0,0.0,24287.0,21.466944444444398,0.0;72461.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22.0,18.875,34.0,22.0,4.0,1.0,10640.0,31895.0,1.0,0.125,1.0,0.0,0.0,8.0,9.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,7760.0,54235.0,46955.0,22.358611111111106,0.0;0.0,0.0,0.0,0.0,nan,-1.0,-1.0,-1.0,-1.0,-1.0,28.0,nan,nan,56.0,2.0,0.0,0.0,0.0,1.0,nan,0.0,0.0,0.0,0.0,0.0,0.0,nan,nan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.77416666666667,-1.0;0.0,0.0,0.0,0.0,nan,-1.0,-1.0,-1.0,-1.0,-1.0,23.0,16.4,13.0,56.0,2.0,1.0,4280.0,4280.0,1.0,0.0,0.0,0.0,0.0,5.0,8.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,12855.0,19.1136111111111,-1.0;0.0,0.0,0.0,0.0,nan,-1.0,-1.0,-1.0,-1.0,-1.0,62.0,13.0,nan,10.0,11.0,1.0,9290.0,9290.0,7.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0

# Use the down sampled train data

In [92]:
WORK_DIRECTORY = "../data"

train_input = sagemaker_session.upload_data(
    path="{}/{}".format(WORK_DIRECTORY, "train_down.csv"),
    bucket=bucket,
    key_prefix="{}/{}".format(prefix, "train"),
)

sklearn_preprocessor = SKLearn(
    entry_point=script_path,
    role=role,
    framework_version=FRAMEWORK_VERSION,
    instance_type="ml.m4.xlarge",
    sagemaker_session=sagemaker_session,
)

sklearn_preprocessor.fit({"train": train_input})

# Define a SKLearn Transformer from the trained SKLearn Estimator
transformer = sklearn_preprocessor.transformer(
    instance_count=1, instance_type="ml.m4.xlarge", assemble_with="Line", accept="text/csv",
    strategy='SingleRecord', max_payload=100
)

# Preprocess training input
transformer.transform(train_input, content_type="text/csv")
print("Waiting for transform job: " + transformer.latest_transform_job.job_name)
transformer.wait()
preprocessed_train = transformer.output_path

2021-07-19 11:37:21 Starting - Starting the training job...
2021-07-19 11:37:23 Starting - Launching requested ML instancesProfilerReport-1626694641: InProgress
......
2021-07-19 11:38:36 Starting - Preparing the instances for training.........
2021-07-19 11:40:16 Downloading - Downloading input data
2021-07-19 11:40:16 Training - Downloading the training image...
2021-07-19 11:40:53 Uploading - Uploading generated training model[34m2021-07-19 11:40:43,737 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2021-07-19 11:40:43,739 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-07-19 11:40:43,756 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2021-07-19 11:40:44,146 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-07-19 11:40:47,192 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus insta

In [94]:
s3_ll_output_key_prefix = "ll_training_output"
s3_ll_output_location = "s3://{}/{}/{}/{}".format(
    bucket, prefix, s3_ll_output_key_prefix, "ll_model"
)

ll_estimator = sagemaker.estimator.Estimator(
    ll_image,
    role,
    instance_count=1,
    instance_type="ml.m4.xlarge",
    volume_size=5,
    max_run=3600,
    input_mode="File",
    output_path=s3_ll_output_location,
    sagemaker_session=sagemaker_session,
)

ll_estimator.set_hyperparameters(predictor_type="binary_classifier", feature_dim=168, wd=1.0, # Different feature_dim due to smaller size of train data.
                                binary_classifier_model_selection_criteria='precision_at_target_recall', target_recall=0.9, # Set the target recall.
                                 positive_example_weight_mult='balanced',
                                ) 

ll_train_data = sagemaker.inputs.TrainingInput(
    preprocessed_train,
    distribution="FullyReplicated",
    content_type="text/csv",
    s3_data_type="S3Prefix",
)

data_channels = {"train": ll_train_data}
ll_estimator.fit(inputs=data_channels, logs=False)


2021-07-19 11:53:07 Starting - Starting the training job
2021-07-19 11:53:08 Starting - Launching requested ML instances............
2021-07-19 11:54:15 Starting - Preparing the instances for training.................
2021-07-19 11:55:47 Downloading - Downloading input data.....
2021-07-19 11:56:14 Training - Downloading the training image...
2021-07-19 11:56:36 Training - Training image download completed. Training in progress...
2021-07-19 11:56:51 Uploading - Uploading generated training model.
2021-07-19 11:57:00 Completed - Training job completed


In [96]:
timestamp_prefix = strftime("%Y-%m-%d-%H-%M-%S", gmtime())

scikit_learn_inferencee_model = sklearn_preprocessor.create_model()
linear_learner_model = ll_estimator.create_model()

model_name = "inference-pipeline-" + timestamp_prefix
endpoint_name = "downsample-balanced-ep-" + timestamp_prefix
sm_model = PipelineModel(
    name=model_name, role=role, models=[scikit_learn_inferencee_model, linear_learner_model]
)

sm_model.deploy(initial_instance_count=1, instance_type="ml.m4.xlarge", endpoint_name=endpoint_name)

---------------!

In [97]:
predictor = Predictor(
    endpoint_name=endpoint_name, sagemaker_session=sagemaker_session, serializer=CSVSerializer(), deserializer=JSONDeserializer()
)

predictions = []

with open('../data/test_X.csv', 'r') as f:
    reader = csv.reader(f)
    header = ','.join(next(reader)) + '\n'
    
    batch_size = 100
    data = ''
    for i, row in enumerate(reader, start=1):
        data += ','.join(row) + '\n'
        if i%batch_size == 0:
            data = header + data
            result = predictor.predict(data)
            predictions += [r["predicted_label"] for r in result["predictions"]]
            data = ''
        elif i == number_lines: 
            data = header + data
            result = predictor.predict(data)
            predictions += [r["predicted_label"] for r in result["predictions"]]

predictions = np.array(predictions)

print(classification_report(test_y, predictions))

              precision    recall  f1-score   support

         0.0       1.00      0.76      0.86     17730
         1.0       0.05      0.88      0.10       266

    accuracy                           0.76     17996
   macro avg       0.52      0.82      0.48     17996
weighted avg       0.98      0.76      0.85     17996



### No big differnece. 


### We will use the model trained on the full train data and the linear learner hyperparmaeters below.

```
predictor_type="binary_classifier", feature_dim=171, wd=1.0,
binary_classifier_model_selection_criteria='precision_at_target_recall', target_recall=0.9,
positive_example_weight_mult='balanced',
```

# Create the csv data for submission

In [70]:
submit_df.head()

Unnamed: 0,uuid,account_amount_added_12_24m,account_days_in_dc_12_24m,account_days_in_rem_12_24m,account_days_in_term_12_24m,account_incoming_debt_vs_paid_0_24m,account_status,account_worst_status_0_3m,account_worst_status_12_24m,account_worst_status_3_6m,...,status_3rd_last_archived_0_24m,status_max_archived_0_6_months,status_max_archived_0_12_months,status_max_archived_0_24_months,recovery_debt,sum_capital_paid_account_0_12m,sum_capital_paid_account_12_24m,sum_paid_inv_0_12m,time_hours,worst_status_active_inv
89976,6f6e6c6a-2081-4e6b-8eb3-4fd89b54b2d7,0,0.0,0.0,0.0,0.009135,0,0,-1,0,...,1,1,1,1,0,8815,0,27157,19.895556,-1
89977,f6f6d9f3-ef2b-4329-a388-c6a687f27e70,0,0.0,0.0,0.0,,-1,-1,-1,-1,...,0,0,0,0,0,0,0,0,0.236667,-1
89978,e9c39869-1bc5-4375-b627-a2df70b445ea,50956,0.0,77.0,0.0,0.0,0,0,1,2,...,2,1,1,3,0,36163,39846,93760,20.332778,-1
89979,6beb88a3-9641-4381-beb6-c9a208664dd0,35054,0.0,0.0,0.0,0.0,0,0,0,0,...,0,2,2,2,0,62585,0,1790,6.201111,-1
89980,bb89b735-72fe-42a4-ba06-d63be0f4ca36,0,0.0,0.0,0.0,0.0,0,1,-1,1,...,0,0,0,0,0,14295,0,0,8.451111,-1


In [71]:
submit_df.shape

(10000, 42)

In [72]:
uuid = []
data = []
for row in submit_df.iterrows():
    uuid.append(row[1]['uuid'])
    data.append({'features': [str(x) for x in row[1][variables].values]})
    
json_data = json.dumps(data)
length = len(json_data)
batch_size = 100

In [77]:
from io import StringIO

predictions = []
list_data = json.loads(json_data)

for i in range(0, length, batch_size):
    data = json.dumps(list_data[i:i+batch_size])
    if data == '[]':
        break
    result = json_predictor.predict(StringIO(data))
    predictions += [r["score"] for r in result["predictions"]]

json_predictions = np.array(predictions)

In [78]:
solution_df = pd.DataFrame({'uuid': uuid, 'pd': json_predictions})

solution_df.head(10)

Unnamed: 0,uuid,pd
0,6f6e6c6a-2081-4e6b-8eb3-4fd89b54b2d7,0.395438
1,f6f6d9f3-ef2b-4329-a388-c6a687f27e70,0.543285
2,e9c39869-1bc5-4375-b627-a2df70b445ea,0.06416
3,6beb88a3-9641-4381-beb6-c9a208664dd0,0.669068
4,bb89b735-72fe-42a4-ba06-d63be0f4ca36,0.678461
5,e4eede99-76a3-4437-a540-3059a1eff67c,0.674228
6,a2af8d9e-9f81-4185-8fff-b2ec49d681a6,0.087495
7,ec910486-1e66-402a-80f2-08c6f04a9a1b,0.565805
8,08973cf0-646a-4fa7-9f1f-d03f76ffd59c,0.485657
9,0591fb4e-5b48-4bac-bce7-f2d5d141e976,0.039967


In [79]:
solution_df.to_csv('../data/solution.csv', index=False)