In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
  
%matplotlib inline


from supervised_utils import *

import boto3
import sagemaker
from sagemaker import get_execution_role


# Open a SageMaker session and look up the IAM role this notebook runs under.
sagemaker_session = sagemaker.Session()
role = get_execution_role()

# The session's default S3 bucket is used below to build the training output path.
bucket = sagemaker_session.default_bucket()

In [2]:
# Load the imputed MAILOUT table, keyed by the LNR identifier.
mailout_joint_dataset_imputed_df = pd.read_csv(
    "../arvato_data_processed/MAILOUTimputed.csv",
    index_col='LNR',
)

from sklearn.preprocessing import MinMaxScaler

# Min-max scale every column, then rebuild the frame with the original
# index and column labels (fit_transform returns a bare array).
# NOTE(review): the scaler is fitted on all rows at once and also rescales the
# `dataset` and `RESPONSE` columns that later cells filter on with `== 1` /
# `== 0` -- confirm those columns are already 0/1 so the filters stay valid.
MMS = MinMaxScaler()
scaled_values = MMS.fit_transform(mailout_joint_dataset_imputed_df)
mailout_joint_dataset_imputed_df = pd.DataFrame(
    scaled_values,
    index=mailout_joint_dataset_imputed_df.index,
    columns=mailout_joint_dataset_imputed_df.columns,
)
mailout_joint_dataset_imputed_df

Unnamed: 0_level_0,ANZ_HAUSHALTE_AKTIV,ANZ_HH_TITEL,ANZ_PERSONEN,ANZ_STATISTISCHE_HAUSHALTE,ANZ_TITEL,BALLRAUM,CAMEO_DEUG_2015,CAMEO_DEUINTL_2015,D19_BANKEN_DIREKT_RZ,D19_BANKEN_GROSS_RZ,...,ALTERSKATEGORIE_FEIN_16.0,ALTERSKATEGORIE_FEIN_17.0,ALTERSKATEGORIE_FEIN_18.0,ALTERSKATEGORIE_FEIN_19.0,ALTERSKATEGORIE_FEIN_20.0,ALTERSKATEGORIE_FEIN_21.0,ALTERSKATEGORIE_FEIN_22.0,ALTERSKATEGORIE_FEIN_23.0,ALTERSKATEGORIE_FEIN_24.0,ALTERSKATEGORIE_FEIN_25.0
LNR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1763,0.038622,0.011397,0.048008,0.038811,0.009514,0.666667,0.444444,0.523810,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1771,0.006804,0.011397,0.089399,0.006949,0.009514,0.666667,0.444444,0.428571,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1776,0.004531,0.012199,0.006617,0.006949,0.009514,0.000000,0.111111,0.095238,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1460,0.013622,0.011397,0.089399,0.014914,0.009514,0.166667,0.111111,0.095238,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1783,0.124987,0.011397,0.048008,0.121123,0.009514,0.500000,0.666667,0.619048,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67615,0.010804,0.009730,0.034631,0.011001,0.009510,0.833333,0.444444,0.476190,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
67938,0.009076,0.011397,0.048008,0.009604,0.009514,0.833333,0.555556,0.380952,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
67942,0.006804,0.011397,0.130790,0.006949,0.009514,0.833333,0.333333,0.285714,0.857143,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
67949,0.006804,0.011397,0.048008,0.006949,0.009514,0.833333,0.000000,0.095238,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


---
# Model 1: Linear Learner with `precision_at_target_recall` model selection criteria

## train 

In [4]:
from sagemaker import LinearLearner

# S3 location handed to the estimator as its output path.
prefix = 'LinearLearner'
output_path = 's3://{}/{}'.format(bucket, prefix)

# Binary LinearLearner whose model selection criterion is
# precision at a fixed target recall.
linear_recall = LinearLearner(
    # execution context
    role=role,
    sagemaker_session=sagemaker_session,
    output_path=output_path,
    # training infrastructure
    train_instance_count=1,
    train_instance_type='ml.c4.xlarge',
    # learning task and model-selection rule
    predictor_type='binary_classifier',
    epochs=15,
    binary_classifier_model_selection_criteria='precision_at_target_recall',
    target_recall=0.9,  # 90% recall
)

In [5]:
# Training rows are the ones flagged dataset == 1 in the joint table.
train = mailout_joint_dataset_imputed_df.loc[mailout_joint_dataset_imputed_df['dataset'] == 1]

# Pull the target out first, then strip the flag and target from the features.
train_labels = train['RESPONSE']
train = train.drop(columns=['dataset', 'RESPONSE'])

# Wrap features and labels (both cast to float32) in the record set
# that is later passed to the estimators' fit() calls.
formatted_train_data = linear_recall.record_set(
    train.to_numpy().astype('float32'),
    labels=train_labels.to_numpy().astype('float32'),
)

# `train` and `train_labels` are intentionally kept: the evaluation cells reuse them.

In [6]:
%%time 
# Train the recall-targeted estimator on the record set produced by
# `linear_recall.record_set(...)`. This starts a SageMaker training job whose
# log is streamed into the cell output; `%%time` reports the cell's duration.
linear_recall.fit(formatted_train_data)

'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.


2020-06-22 17:20:25 Starting - Starting the training job...
2020-06-22 17:20:27 Starting - Launching requested ML instances......
2020-06-22 17:21:35 Starting - Preparing the instances for training...
2020-06-22 17:22:25 Downloading - Downloading input data......
2020-06-22 17:23:26 Training - Training image download completed. Training in progress..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[06/22/2020 17:23:28 INFO 140409988704064] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-input.json: {u'loss_insensitivity': u'0.01', u'epochs': u'15', u'feature_dim': u'auto', u'init_bias': u'0.0', u'lr_scheduler_factor': u'auto', u'num_calibration_samples': u'10000000', u'accuracy_top_k': u'3', u'_num_kv_servers': u'auto', u'use_bias': u'true', u'num_point_for_scaler': u'10000', u'_log_level': u'info', u'quantile': u'0.5', u'bias_lr_mult': u'auto', u'lr_scheduler_

In [7]:

%%time 
# deploy and create a predictor
recall_predictor = linear_recall.deploy(initial_instance_count=1, instance_type='ml.t2.medium')

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.


-----------------!CPU times: user 289 ms, sys: 15.5 ms, total: 304 ms
Wall time: 8min 32s


In [8]:

print('Metrics for tuned (recall), LinearLearner.\n')

# Evaluate the deployed predictor on the rows it was trained on.
# NOTE(review): these are training-set metrics, not a held-out estimate.
train_features_f32 = train.to_numpy().astype('float32')
train_labels_int = train_labels.to_numpy().astype('int')

metrics = evaluate(
    recall_predictor,
    train_features_f32,
    train_labels_int,
    verbose=True,
)

Metrics for tuned (recall), LinearLearner.

prediction (col)    0.0   1.0
actual (row)                 
0                 25645  9013
1                    42   394

Recall:     0.904
Precision:  0.042
Accuracy:   0.742



## Predict test cases

In [9]:
# Rows flagged dataset == 0 are used as the test set. The flag and RESPONSE
# columns are removed so the feature layout matches the training frame.
test = (
    mailout_joint_dataset_imputed_df
    .loc[mailout_joint_dataset_imputed_df['dataset'] == 0]
    .drop(columns=['dataset', 'RESPONSE'])
)
test

Unnamed: 0_level_0,ANZ_HAUSHALTE_AKTIV,ANZ_HH_TITEL,ANZ_PERSONEN,ANZ_STATISTISCHE_HAUSHALTE,ANZ_TITEL,BALLRAUM,CAMEO_DEUG_2015,CAMEO_DEUINTL_2015,D19_BANKEN_DIREKT_RZ,D19_BANKEN_GROSS_RZ,...,ALTERSKATEGORIE_FEIN_16.0,ALTERSKATEGORIE_FEIN_17.0,ALTERSKATEGORIE_FEIN_18.0,ALTERSKATEGORIE_FEIN_19.0,ALTERSKATEGORIE_FEIN_20.0,ALTERSKATEGORIE_FEIN_21.0,ALTERSKATEGORIE_FEIN_22.0,ALTERSKATEGORIE_FEIN_23.0,ALTERSKATEGORIE_FEIN_24.0,ALTERSKATEGORIE_FEIN_25.0
LNR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1754,0.009076,0.011397,0.089399,0.009604,0.009514,0.833333,0.111111,0.047619,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1770,0.049986,0.011397,0.048008,0.060053,0.009514,1.000000,0.444444,0.380952,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1465,0.009076,0.011397,0.172181,0.009604,0.009514,0.000000,0.666667,0.619048,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1470,0.006804,0.011397,0.006617,0.006949,0.009514,0.000000,0.111111,0.047619,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1478,0.006804,0.011397,0.172181,0.006949,0.009514,0.833333,0.444444,0.380952,0.285714,0.333333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67615,0.010804,0.009730,0.034631,0.011001,0.009510,0.833333,0.444444,0.476190,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
67938,0.009076,0.011397,0.048008,0.009604,0.009514,0.833333,0.555556,0.380952,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
67942,0.006804,0.011397,0.130790,0.006949,0.009514,0.833333,0.333333,0.285714,0.857143,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
67949,0.006804,0.011397,0.048008,0.006949,0.009514,0.833333,0.000000,0.095238,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Send the test features to the endpoint in 100 chunks instead of one request.
test_features_f32 = test.to_numpy().astype('float32')
prediction_batches = [
    recall_predictor.predict(chunk)
    for chunk in np.array_split(test_features_f32, 100)
]

# Each returned record carries a `predicted_label`; collect it for every
# row, preserving batch order so it lines up with `test.index`.
# NOTE(review): this submits hard 0/1 labels. If the leaderboard metric is
# ranking-based, the per-record `score` field may be the better choice -- confirm.
test_preds = np.array([
    record.label['predicted_label'].float32_tensor.values[0]
    for batch in prediction_batches
    for record in batch
])

# Write the submission file: one row per LNR with its predicted RESPONSE.
llearner_recall = pd.DataFrame({'LNR': test.index, 'RESPONSE': test_preds})
llearner_recall.to_csv("../submissions/llearner_recall.csv", sep=",", index=False)

In [11]:
# Tear down the endpoint behind `recall_predictor` now that predictions are saved.
# `delete_endpoint` is not defined in this notebook; presumably it comes from the
# `supervised_utils` star import -- verify.
delete_endpoint(recall_predictor)

Deleted linear-learner-2020-06-22-17-20-25-487


## TEST SCORE on Kaggle: `0.67178`

Evaluated on `Kaggle`

--- 
# Model 2:  Linear Learner with `precision_at_target_recall` model selection criteria **with BALANCED class weights**

We want the algorithm to choose a weight so that errors in classifying negative vs. positive examples have equal impact on the training loss, so we set `positive_example_weight_mult='balanced'`.

In [12]:
# Same recall-targeted binary classifier as Model 1, plus balanced
# weighting of positive examples.
linear_balanced = LinearLearner(
    # execution context
    role=role,
    sagemaker_session=sagemaker_session,
    output_path=output_path,
    # training infrastructure
    train_instance_count=1,
    train_instance_type='ml.c4.xlarge',
    # learning task and model-selection rule
    predictor_type='binary_classifier',
    epochs=15,
    binary_classifier_model_selection_criteria='precision_at_target_recall',
    target_recall=0.9,
    # class-imbalance handling
    positive_example_weight_mult='balanced',
)

In [13]:
%%time 
# Train the balanced-weight estimator on the same record set used for Model 1.
# This starts a SageMaker training job whose log is streamed into the cell
# output; `%%time` reports the cell's duration.
linear_balanced.fit(formatted_train_data)

'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.


2020-06-22 17:34:32 Starting - Starting the training job...
2020-06-22 17:34:34 Starting - Launching requested ML instances.........
2020-06-22 17:36:12 Starting - Preparing the instances for training......
2020-06-22 17:37:24 Downloading - Downloading input data...
2020-06-22 17:38:02 Training - Training image download completed. Training in progress..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[06/22/2020 17:38:05 INFO 140424515323712] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-input.json: {u'loss_insensitivity': u'0.01', u'epochs': u'15', u'feature_dim': u'auto', u'init_bias': u'0.0', u'lr_scheduler_factor': u'auto', u'num_calibration_samples': u'10000000', u'accuracy_top_k': u'3', u'_num_kv_servers': u'auto', u'use_bias': u'true', u'num_point_for_scaler': u'10000', u'_log_level': u'info', u'quantile': u'0.5', u'bias_lr_mult': u'auto', u'lr_schedul

In [14]:

%%time 
# deploy and create a predictor
linear_balanced = linear_balanced.deploy(initial_instance_count=1, instance_type='ml.t2.medium')

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.


-----------------!CPU times: user 290 ms, sys: 15.5 ms, total: 306 ms
Wall time: 8min 36s


In [15]:

# NOTE(review): the banner text is identical to Model 1's; consider mentioning
# "balanced" so the two result blocks can be told apart.
print('Metrics for tuned (recall), LinearLearner.\n')

# Evaluate the deployed balanced predictor on the rows it was trained on.
# NOTE(review): these are training-set metrics, not a held-out estimate.
train_features_f32 = train.to_numpy().astype('float32')
train_labels_int = train_labels.to_numpy().astype('int')

metrics = evaluate(
    linear_balanced,
    train_features_f32,
    train_labels_int,
    verbose=True,
)

Metrics for tuned (recall), LinearLearner.

prediction (col)    0.0   1.0
actual (row)                 
0                 29804  4854
1                    39   397

Recall:     0.911
Precision:  0.076
Accuracy:   0.861



In [16]:

# Send the test features to the balanced model's endpoint in 100 chunks.
test_features_f32 = test.to_numpy().astype('float32')
prediction_batches = [
    linear_balanced.predict(chunk)
    for chunk in np.array_split(test_features_f32, 100)
]

# Each returned record carries a `predicted_label`; collect it for every
# row, preserving batch order so it lines up with `test.index`.
# NOTE(review): this submits hard 0/1 labels. If the leaderboard metric is
# ranking-based, the per-record `score` field may be the better choice -- confirm.
test_preds = np.array([
    record.label['predicted_label'].float32_tensor.values[0]
    for batch in prediction_batches
    for record in batch
])

# Write the balanced-model submission (this reuses the `llearner_recall` name
# from the Model 1 cell).
llearner_recall = pd.DataFrame({'LNR': test.index, 'RESPONSE': test_preds})
llearner_recall.to_csv("../submissions/llearner_recall_balanced.csv", sep=",", index=False)

## Kaggle score: `0.62746`

In [17]:
# Tear down the balanced model's endpoint now that predictions are saved.
# At this point `linear_balanced` refers to the predictor returned by deploy(),
# not the estimator. `delete_endpoint` is not defined in this notebook;
# presumably it comes from the `supervised_utils` star import -- verify.
delete_endpoint(linear_balanced)

Deleted linear-learner-2020-06-22-17-34-32-355
