# Imports

In [1]:
import logging
import boto3
from botocore.exceptions import ClientError
import pandas as pd

from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.session import s3_input
from sagemaker import get_execution_role
from sagemaker import Session
from sagemaker.estimator import Estimator

In [2]:
bucket_name = 'ye-1468'
region = 'eu-central-1'

In [3]:
session = boto3.Session(
    region_name=region
)

In [4]:
# Now we can create low-level clients or resource clients from our custom session
s3_client = boto3.client('s3')
s3 = boto3.resource('s3')

# Training

## Data

In [5]:
train_data = 's3://aida-project/niy/train.csv'
test_data = 's3://aida-project/niy/test.csv'

# create s3 paths for the objects
s3_input_train = s3_input(train_data, content_type='text/csv')
s3_input_test = s3_input(test_data, content_type='text/csv')

data_channels = {'train': s3_input_train, 'validation': s3_input_test}

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


In [6]:
# get execution role
role = get_execution_role()
# create session
#sess = Session()

## Linear Learner (Linear Regression)

In [7]:
# Use a previously-built, AWS Linear Learner model for training


container = get_image_uri(region_name=region,
                          repo_name='linear-learner')

'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.


In [8]:
# create an Linear Learner Estimator


import boto3
import sagemaker




sess = sagemaker.Session()

linear = Estimator(container,
                   role,
                   train_instance_count=1,
                   train_instance_type='ml.c4.xlarge',
                   output_path='s3://aida-project/niy/output',
                   sagemaker_session=sess,
                  base_job_name = 'lin-nj'
                  )


Parameter image_name will be renamed to image_uri in SageMaker Python SDK v2.


In [9]:
linear.set_hyperparameters(predictor_type='regressor',
                           normalize_data=False,
                           mini_batch_size=1000)                       
                        


In [10]:

linear.fit({"train": s3_input_train, "validation": s3_input_test}, wait=True)

2020-11-19 17:52:48 Starting - Starting the training job...
2020-11-19 17:52:50 Starting - Launching requested ML instances......
2020-11-19 17:53:54 Starting - Preparing the instances for training......
2020-11-19 17:54:52 Downloading - Downloading input data...
2020-11-19 17:55:49 Training - Training image download completed. Training in progress..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[11/19/2020 17:55:53 INFO 140151905036096] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-input.json: {u'loss_insensitivity': u'0.01', u'epochs': u'15', u'feature_dim': u'auto', u'init_bias': u'0.0', u'lr_scheduler_factor': u'auto', u'num_calibration_samples': u'10000000', u'accuracy_top_k': u'3', u'_num_kv_servers': u'auto', u'use_bias': u'true', u'num_point_for_scaler': u'10000', u'_log_level': u'info', u'quantile': u'0.5', u'bias_lr_mult': u'auto', u'lr_scheduler_

In [16]:
from sagemaker.analytics import TrainingJobAnalytics

training_job_name = 'lin-nj-2020-11-19-17-52-48-664'
metrics_df = TrainingJobAnalytics(training_job_name=training_job_name).dataframe()
metrics_df



Unnamed: 0,timestamp,metric_name,value
0,0.0,train:progress,6.0
1,60.0,train:progress,56.357143
2,0.0,train:objective_loss,1157.616619
3,60.0,train:objective_loss,2.585761
4,0.0,validation:objective_loss,381.250121
5,60.0,validation:objective_loss,2.183769
6,0.0,validation:objective_loss:final,1.808702
7,0.0,validation:mse,1.808702
8,0.0,train:throughput,15173.542111
9,60.0,train:throughput,15832.954479


In [31]:
import math
print("rmse is", math.sqrt(1.808702))

rmse is 1.3448799202902837


# Hyperparameter Tuning 

In [17]:
import sagemaker
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner
from sagemaker.pytorch import PyTorch

hyperparameter_ranges = {'learning_rate': ContinuousParameter(0.001, 0.08)}


objective_metric_name = 'test:objective_loss'
objective_type = 'Minimize'
strategy='Bayesian'


tuner = HyperparameterTuner(estimator=linear,
                            objective_metric_name=objective_metric_name,
                            hyperparameter_ranges=hyperparameter_ranges,
                            strategy=strategy,
                            objective_type=objective_type,
                            max_jobs=10,
                            max_parallel_jobs=5,
                            base_tuning_job_name='linreg-nj',
                            early_stopping_type='Auto'
                           )

                    


In [18]:
tuner.fit({"train": s3_input_train, "test": s3_input_test}, wait=True)

In [19]:
from sagemaker.analytics import TrainingJobAnalytics

training_job_name = 'linreg-nj-201119-1801-010-fca1c10e'
metrics_df = TrainingJobAnalytics(training_job_name=training_job_name).dataframe()
metrics_df



Unnamed: 0,timestamp,metric_name,value
0,0.0,train:progress,46.307692
1,60.0,train:progress,96.5
2,0.0,train:objective_loss,84.994376
3,60.0,train:objective_loss,0.945287
4,0.0,test:mse,1.849445
5,0.0,test:absolute_loss,1.042327
6,0.0,train:mse,1.830633
7,0.0,train:objective_loss:final,1.830633
8,0.0,test:objective_loss,1.849445
9,0.0,train:absolute_loss,1.04697


In [30]:
import math
print("rmse is", math.sqrt(1.830633))

rmse is 1.3530088691505315


#  ---> Without Hyperparameter Tuning is the rmse slightly better!

# Deployment

In [None]:
## TO DO

In [None]:
'''
# Deploy your model to an endpoint to perform predictions
xgb_predictor = xgb_model.deploy(
    initial_instance_count = 1, 
    instance_type = 'ml.t2.medium')  
'''
print()

In [None]:
'''
# Configure the predictor's serializer and deserializer

from sagemaker.predictor import csv_serializer, csv_deserializer
xgb_predictor.content_type = 'text/csv' # set the data type for an inference
xgb_predictor.serializer = csv_serializer # set the serializer type
xgb_predictor.deserializer = csv_deserializer
'''
print()

## Prediction

In [None]:
## TO DO

In [None]:
# xgb_predictor.predict(df_test.values)