In [None]:
!pip install sklearn

In [428]:
# S3 bucket that the training-data and output locations below are built from.
bucket = 'ml-lab-pyspark'
# Key prefix inside the bucket under which the training output path is placed.
prefix = 'sagemaker/linear_learner'
 
# Imports for the AWS session and IAM execution role that are set up in the next cell
import boto3
import re
from sagemaker import get_execution_role



In [429]:
# Build a boto3 session from the named local AWS profile and wrap it in a
# SageMaker session, so later SDK calls that receive sagemaker_session use it.
boto3_session=boto3.Session(profile_name='ml-lab')

from sagemaker.session import Session

sagemaker_session=Session(boto3_session)

# Resolve the IAM role for SageMaker; when the lookup fails, fall back to a
# fixed role ARN (presumably the case when running outside a SageMaker
# notebook instance -- confirm).
# `except Exception` rather than a bare `except:` so that KeyboardInterrupt
# and SystemExit are not silently swallowed by the fallback.
try:
    role = get_execution_role()
except Exception:
    role="arn:aws:iam::814948925568:role/service-role/AmazonSageMaker-ExecutionRole-20200310T154729"

In [430]:
# S3 URI for training output: s3://<bucket>/<prefix>/output
output_location = '/'.join(['s3:/', bucket, prefix, 'output'])
# Echo the URI as the cell output.
output_location

's3://ml-lab-pyspark/sagemaker/linear_learner/output'

In [None]:
from sagemaker.inputs import s3_input
# Describe the training channel: the 'training_data.io' key at the root of
# the bucket, addressed as an S3 prefix with the default input mode.
training_data = s3_input(
    s3_data='s3://' + bucket + '/' + 'training_data.io',
    s3_data_type='S3Prefix',
    input_mode=None,
)
# Show the input description as the cell output.
training_data

In [None]:
from sagemaker.amazon.amazon_estimator import get_image_uri
# Look up the linear-learner training image for the region of the same
# profile-bound boto3 session used by the rest of the notebook. A fresh
# default boto3.Session() may have no region configured, or a different one.
container = get_image_uri(boto3_session.region_name, 'linear-learner')

In [None]:
import boto3
import sagemaker

# Configure a single-instance training job for the linear-learner container;
# model artifacts are written under output_location.
linear = sagemaker.estimator.Estimator(
    container,
    role,
    sagemaker_session=sagemaker_session,
    output_path=output_location,
    train_instance_type='ml.c4.xlarge',
    train_instance_count=1,
)

# Binary classifier over 72 input features, with feature normalisation on
# and label normalisation off.
# NOTE(review): the evaluation cells further down send 605 features per row
# to an already-deployed endpoint -- confirm feature_dim=72 still matches
# the contents of training_data.
linear.set_hyperparameters(
    feature_dim=72,
    predictor_type='binary_classifier',
    mini_batch_size=200,
    normalize_data=True,
    normalize_label=False,
)

# Start training with training_data as the 'train' channel.
linear.fit({'train': training_data})

In [431]:
from sagemaker.predictor import RealTimePredictor
# Attach to the named, already-existing endpoint through the profile-bound
# SageMaker session.
linear_predictor = RealTimePredictor(
    'linear-learner-absolute-price-endpoint',
    sagemaker_session=sagemaker_session,
)

In [432]:
from sagemaker.predictor import csv_serializer, json_deserializer

# Requests are sent to the endpoint as CSV text ...
linear_predictor.content_type = 'text/csv'
linear_predictor.serializer = csv_serializer
# ... and responses are parsed from JSON (see the dict outputs further down).
linear_predictor.deserializer = json_deserializer

In [435]:
import pandas as pd
# Load the evaluation set. header=None means the file has no header row, so
# the columns are labelled by integer position (0, 1, 2, ...).
evaluation_data_df=pd.read_csv('../data/evaluation_data.csv',header=None)
# evaluation_data_df.loc[0].tolist()[0:72]
# Display the loaded frame as the cell output.
evaluation_data_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,596,597,598,599,600,601,602,603,604,605
0,1.24979,1.24996,1.25699,1.26256,1.24976,1.251452,1.251481,1.251491,1.251475,1.251499,...,1.249929,1.249947,1.249934,1.249927,1.249927,1.249885,1.249958,1.249899,1.249879,1
1,1.24981,1.24998,1.25699,1.26256,1.24976,1.251481,1.251491,1.251475,1.251499,1.251560,...,1.249947,1.249934,1.249927,1.249927,1.249885,1.249958,1.249899,1.249879,1.249895,1
2,1.24976,1.24993,1.25699,1.26256,1.24976,1.251481,1.251491,1.251475,1.251499,1.251560,...,1.249947,1.249934,1.249927,1.249927,1.249885,1.249958,1.249899,1.249879,1.249895,1
3,1.24978,1.24995,1.25699,1.26256,1.24976,1.251481,1.251491,1.251475,1.251499,1.251560,...,1.249947,1.249934,1.249927,1.249927,1.249885,1.249958,1.249899,1.249879,1.249895,1
4,1.24977,1.24997,1.25699,1.26256,1.24976,1.251481,1.251491,1.251475,1.251499,1.251560,...,1.249947,1.249934,1.249927,1.249927,1.249885,1.249958,1.249899,1.249879,1.249895,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1071,1.24964,1.24981,1.25699,1.26256,1.24884,1.252425,1.252413,1.252412,1.252416,1.252200,...,1.249825,1.249830,1.249850,1.249835,1.249805,1.249739,1.249710,1.249701,1.249690,1
1072,1.24963,1.24980,1.25699,1.26256,1.24884,1.252425,1.252413,1.252412,1.252416,1.252200,...,1.249825,1.249830,1.249850,1.249835,1.249805,1.249739,1.249710,1.249701,1.249690,1
1073,1.24964,1.24981,1.25699,1.26256,1.24884,1.252425,1.252413,1.252412,1.252416,1.252200,...,1.249825,1.249830,1.249850,1.249835,1.249805,1.249739,1.249710,1.249701,1.249690,1
1074,1.24964,1.24981,1.25699,1.26256,1.24884,1.252425,1.252413,1.252412,1.252416,1.252200,...,1.249825,1.249830,1.249850,1.249835,1.249805,1.249739,1.249710,1.249701,1.249690,1


In [444]:
# Smoke test: serialise the first 605 values of row 0 as one CSV line and
# send it to the endpoint; the response is shown as the cell output.
data = ','.join(map(str, evaluation_data_df.loc[0].tolist()[:605]))
linear_predictor.predict(data)

{'predictions': [{'score': 2.0741686217434445e-30, 'predicted_label': 0.0}]}

In [445]:
def eval_func(row, n_features=605):
    """Send one row's leading values to the endpoint and return its response.

    Args:
        row: An object exposing ``tolist()`` (e.g. a pandas Series passed by
            ``DataFrame.apply(..., axis=1)``) holding one evaluation record.
        n_features: Number of leading values of ``row`` to send. Defaults to
            605, the value previously hard-coded here.

    Returns:
        Whatever ``linear_predictor.predict`` returns for the CSV payload.
    """
    # Convert each value with str() so the function works for numeric rows
    # as well as rows the caller has already cast to str.
    payload = ','.join(str(value) for value in row.tolist()[:n_features])
    return linear_predictor.predict(payload)
# Call the endpoint once per row (every value cast to str first) and collect
# the per-row responses.
results = evaluation_data_df.astype(str).apply(eval_func, axis='columns')

In [446]:
results[0]

{'predictions': [{'score': 2.0741686217434445e-30, 'predicted_label': 0.0}]}

In [447]:
# 1 when the endpoint's first prediction for a row is labelled 1.0, else 0.
predicted_labels = [
    int(response['predictions'][0]['predicted_label'] == 1.0)
    for response in results
]

In [448]:
# Score of the first prediction in each endpoint response.
score = [
    first_prediction['score']
    for first_prediction in (response['predictions'][0] for response in results)
]

In [449]:
# Attach the endpoint's label and score to every evaluation row.
evaluation_data_df = evaluation_data_df.assign(
    predicted_labels=predicted_labels,
    score=score,
)
# Display the augmented frame as the cell output.
evaluation_data_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,598,599,600,601,602,603,604,605,predicted_labels,score
0,1.24979,1.24996,1.25699,1.26256,1.24976,1.251452,1.251481,1.251491,1.251475,1.251499,...,1.249934,1.249927,1.249927,1.249885,1.249958,1.249899,1.249879,1,0,2.074169e-30
1,1.24981,1.24998,1.25699,1.26256,1.24976,1.251481,1.251491,1.251475,1.251499,1.251560,...,1.249927,1.249927,1.249885,1.249958,1.249899,1.249879,1.249895,1,0,2.074438e-30
2,1.24976,1.24993,1.25699,1.26256,1.24976,1.251481,1.251491,1.251475,1.251499,1.251560,...,1.249927,1.249927,1.249885,1.249958,1.249899,1.249879,1.249895,1,0,2.074454e-30
3,1.24978,1.24995,1.25699,1.26256,1.24976,1.251481,1.251491,1.251475,1.251499,1.251560,...,1.249927,1.249927,1.249885,1.249958,1.249899,1.249879,1.249895,1,0,2.074454e-30
4,1.24977,1.24997,1.25699,1.26256,1.24976,1.251481,1.251491,1.251475,1.251499,1.251560,...,1.249927,1.249927,1.249885,1.249958,1.249899,1.249879,1.249895,1,0,2.074454e-30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1071,1.24964,1.24981,1.25699,1.26256,1.24884,1.252425,1.252413,1.252412,1.252416,1.252200,...,1.249850,1.249835,1.249805,1.249739,1.249710,1.249701,1.249690,1,0,2.163370e-30
1072,1.24963,1.24980,1.25699,1.26256,1.24884,1.252425,1.252413,1.252412,1.252416,1.252200,...,1.249850,1.249835,1.249805,1.249739,1.249710,1.249701,1.249690,1,0,2.163370e-30
1073,1.24964,1.24981,1.25699,1.26256,1.24884,1.252425,1.252413,1.252412,1.252416,1.252200,...,1.249850,1.249835,1.249805,1.249739,1.249710,1.249701,1.249690,1,0,2.163370e-30
1074,1.24964,1.24981,1.25699,1.26256,1.24884,1.252425,1.252413,1.252412,1.252416,1.252200,...,1.249850,1.249835,1.249805,1.249739,1.249710,1.249701,1.249690,1,0,2.163370e-30


In [None]:
# evaluation_data_df.to_csv('../data/100_precision_025_recall.csv')
# evaluation_data_df=pd.read_csv('../data/044precision_high_recall.csv')

In [454]:
# Keep only the rows the endpoint labelled positive (predicted_labels > 0).
most_certain = evaluation_data_df[evaluation_data_df['predicted_labels'] > 0]
most_certain

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,598,599,600,601,602,603,604,605,predicted_labels,score


In [None]:
# Importing the metrics package from sklearn library
from sklearn import metrics

# Creating the confusion matrix.
# Ground truth is taken from column 605: the evaluation frame has columns
# 0-605 and the cells above send columns 0-604 to the endpoint as features,
# so column 72 is a continuous feature, not a label.
# labels=[0, 1] pins the matrix to 2x2 so it always matches the row/column
# names assigned below, even when only one class is present.
# NOTE(review): most_certain only holds rows with predicted_labels > 0, so
# the 'Predicted Negative' column will always be zero here -- confirm
# whether the full evaluation_data_df was intended instead.
cm = metrics.confusion_matrix(
    most_certain[605],
    most_certain['predicted_labels'],
    labels=[0, 1],
)
# Assigning columns names
cm_df = pd.DataFrame(cm, 
            columns = ['Predicted Negative', 'Predicted Positive'],
            index = ['Actual Negative', 'Actual Positive'])
# Showing the confusion matrix
cm_df

In [None]:
# Summary statistics for the rows predicted positive.
most_certain[most_certain['predicted_labels'] == 1].describe()

In [None]:
# Summary statistics for the rows predicted negative.
most_certain[most_certain['predicted_labels'] == 0].describe()