In [None]:
from datasource_helper import read_from_s3
from datasource_helper import read_from_redshift
from datasource_helper import read_from_vertica
from datasource_helper import write_df_to_s3_csv
import getpass
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sagemaker.sklearn.processing import SKLearnProcessor
from sklearn.preprocessing import StandardScaler
import os
from sagemaker.processing import ProcessingInput, ProcessingOutput
import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.sklearn.processing import SKLearnProcessor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.tuner import HyperparameterTuner, IntegerParameter, CategoricalParameter, ContinuousParameter

### Read data from S3, Vertica or Redshift

In [None]:
# Read a small sample from Vertica.
SQL_QUERY = "select * from MDA.on_training1 limit 10"
# A bare getpass() labels every input "Password: ", which makes the two
# prompts indistinguishable; name each one explicitly. Input stays hidden.
username = getpass.getpass(prompt="Vertica username: ")
password = getpass.getpass(prompt="Vertica password: ")
df = read_from_vertica(username=username, password=password, sql_query=SQL_QUERY)
df.head(10)

In [None]:
# Read a small sample from Redshift.
SQL_QUERY = "select * from MDA.on_training1 limit 10"
# A bare getpass() labels every input "Password: ", which makes the two
# prompts indistinguishable; name each one explicitly. Input stays hidden.
username = getpass.getpass(prompt="Redshift username: ")
password = getpass.getpass(prompt="Redshift password: ")
df = read_from_redshift(username=username, password=password, sql_query=SQL_QUERY)
df.head(10)

In [None]:
# Load the housing dataset from S3 into a DataFrame.
HOUSING_BUCKET = "choice-mlflow-input"
HOUSING_KEY = "demo/datasets/housing.csv"

s3Obj = read_from_s3(HOUSING_BUCKET, HOUSING_KEY)
# The 'Body' entry of the returned object is handed to pandas as the CSV source.
df = pd.read_csv(s3Obj['Body'])
df.head(10)

#### Preprocessing Stage
1. Missing Values
2. Categorical Variables
3. Prepare Datasets for train/test/cross-validation
4. Feature Scaling on train dataset
    
    

In [None]:
# Summary statistics of df, used to eyeball value ranges and spot columns
# with missing values (a lower count) before preprocessing.
df.describe()

#### Running on Notebook

In [None]:
# Local output locations for the preprocessed train/test splits.
TRAIN_OUTPUT_DIR = '/home/ec2-user/demo_datasets/train'
TEST_OUTPUT_DIR = '/home/ec2-user/demo_datasets/test'

# Features and labels are stored as separate CSV files per split.
train_features_output_path = os.path.join(TRAIN_OUTPUT_DIR, 'train_features.csv')
train_labels_output_path = os.path.join(TRAIN_OUTPUT_DIR, 'train_labels.csv')
test_features_output_path = os.path.join(TEST_OUTPUT_DIR, 'test_features.csv')
test_labels_output_path = os.path.join(TEST_OUTPUT_DIR, 'test_labels.csv')

In [None]:
# Bucket median income into coarse strata, used only to stratify the split.
# The capped column is assigned back explicitly: calling .where(..., inplace=True)
# on a column selection does not reliably write back to df (it is a no-op under
# pandas Copy-on-Write), which would leave the strata uncapped.
df['income_category'] = np.ceil(df['median_income'] / 1.5)
df['income_category'] = df['income_category'].where(df['median_income'] < 5, 5.0)
train_set, test_set = train_test_split(df, test_size=0.2, random_state=42, shuffle=True,
                                       stratify=df['income_category'])

# Remove the helper column again so both splits are back to the original schema.
train_set = train_set.drop("income_category", axis=1)
test_set = test_set.drop("income_category", axis=1)

X_train = train_set.drop(columns="median_house_value")
X_test = test_set.drop(columns="median_house_value")

y_train = train_set["median_house_value"].copy()
y_test = test_set["median_house_value"].copy()

# Handle missing values. Only non-default options are passed: the previously
# spelled-out defaults (e.g. `verbose`) are not accepted by newer scikit-learn.
simple_imputer = SimpleImputer(strategy='mean')
simple_imputer_categorical = SimpleImputer(strategy='most_frequent')

# Handle categorical variables (the default sparse output is kept; `sparse=` was
# dropped because the keyword was renamed in newer scikit-learn releases).
one_hot_encoder = OneHotEncoder(categories='auto', dtype=np.float64, handle_unknown='error')

# Feature scaling
std_scalar = StandardScaler(copy=True, with_mean=True, with_std=True)

numeric_features = ['longitude', 'latitude', 'housing_median_age', 'total_rooms',
                    'total_bedrooms', 'population', 'households', 'median_income']

categorical_features = ['ocean_proximity']

# Numeric columns: impute, then standardise. Categorical columns: impute, then one-hot encode.
numeric_transformer = Pipeline(steps=[('imputer', simple_imputer), ('scaler', std_scalar)])
categorical_transformer = Pipeline(steps=[('imputer', simple_imputer_categorical), ('onehot', one_hot_encoder)])

preprocessor = ColumnTransformer(transformers=[
                    ('numerical', numeric_transformer, numeric_features),
                    ('categorical', categorical_transformer, categorical_features)])

# Fit on the training split only and re-use the fitted statistics for the test
# split. Re-fitting on the test data leaks test statistics into the features and
# can yield a different one-hot column layout than the model was trained on.
train_features = preprocessor.fit_transform(X_train)
test_features = preprocessor.transform(X_test)

print('Train data shape after preprocessing: {}'.format(train_features.shape))
print('Test data shape after preprocessing: {}'.format(test_features.shape))

# Make sure the target directories exist before writing (fresh machines have none).
for output_path in (train_features_output_path, train_labels_output_path,
                    test_features_output_path, test_labels_output_path):
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

print('Saving training features to {}'.format(train_features_output_path))
pd.DataFrame(train_features).to_csv(train_features_output_path, header=False, index=False)

print('Saving test features to {}'.format(test_features_output_path))
pd.DataFrame(test_features).to_csv(test_features_output_path, header=False, index=False)

print('Saving training labels to {}'.format(train_labels_output_path))
y_train.to_csv(train_labels_output_path, header=False, index=False)

print('Saving test labels to {}'.format(test_labels_output_path))
y_test.to_csv(test_labels_output_path, header=False, index=False)


#### Amazon SageMaker Processing Jobs

In [None]:
%%writefile preprocessing.py
import argparse
import os
import warnings
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelBinarizer, KBinsDiscretizer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import os

from sklearn.exceptions import DataConversionWarning

if __name__=='__main__':
    # Entry point of the SageMaker Processing job: reads housing.csv from the
    # processing input directory, splits it, preprocesses the features and
    # writes features/labels for both splits to the processing output directories.
    train_features_output_path = os.path.join('/opt/ml/processing/train', 'train_features.csv')
    train_labels_output_path = os.path.join('/opt/ml/processing/train', 'train_labels.csv')
    test_features_output_path = os.path.join('/opt/ml/processing/test', 'test_features.csv')
    test_labels_output_path = os.path.join('/opt/ml/processing/test', 'test_labels.csv')

    parser = argparse.ArgumentParser()
    # The default matches the 0.2 that used to be hard-coded in the split below,
    # so runs that do not pass the flag behave as before.
    parser.add_argument('--train-test-split-ratio', type=float, default=0.2)
    args, _ = parser.parse_known_args()

    print('Received arguments {}'.format(args))
    split_ratio = args.train_test_split_ratio

    input_data_path = os.path.join('/opt/ml/processing/input', 'housing.csv')
    print('Reading input data from {}'.format(input_data_path))
    df = pd.read_csv(input_data_path)

    # Bucket median income into coarse strata, used only to stratify the split.
    # The capped column is assigned back explicitly: .where(..., inplace=True) on a
    # column selection does not reliably write back to df.
    df['income_category'] = np.ceil(df['median_income'] / 1.5)
    df['income_category'] = df['income_category'].where(df['median_income'] < 5, 5.0)

    # Honour the requested split ratio (previously parsed but ignored).
    train_set, test_set = train_test_split(df, test_size=split_ratio, random_state=42, shuffle=True,
                                           stratify=df['income_category'])

    # Remove the helper column again so both splits are back to the original schema.
    train_set = train_set.drop("income_category", axis=1)
    test_set = test_set.drop("income_category", axis=1)

    X_train = train_set.drop(columns="median_house_value")
    X_test = test_set.drop(columns="median_house_value")

    y_train = train_set["median_house_value"].copy()
    y_test = test_set["median_house_value"].copy()

    # Handle missing values. Only non-default options are passed so the script
    # does not depend on keywords that differ between scikit-learn releases.
    simple_imputer = SimpleImputer(strategy='mean')
    simple_imputer_categorical = SimpleImputer(strategy='most_frequent')

    # Handle categorical variables (default sparse output is kept).
    one_hot_encoder = OneHotEncoder(categories='auto', dtype=np.float64, handle_unknown='error')

    # Feature scaling
    std_scalar = StandardScaler(copy=True, with_mean=True, with_std=True)

    numeric_features = ['longitude', 'latitude', 'housing_median_age', 'total_rooms',
                        'total_bedrooms', 'population', 'households', 'median_income']

    categorical_features = ['ocean_proximity']

    # Numeric columns: impute, then standardise. Categorical columns: impute, then one-hot encode.
    numeric_transformer = Pipeline(steps=[('imputer', simple_imputer), ('scaler', std_scalar)])
    categorical_transformer = Pipeline(steps=[('imputer', simple_imputer_categorical), ('onehot', one_hot_encoder)])

    preprocessor = ColumnTransformer(transformers=[
                        ('numerical', numeric_transformer, numeric_features),
                        ('categorical', categorical_transformer, categorical_features)])

    # Fit on the training split only; the test split must be transformed with the
    # statistics learned from training data, not re-fitted (data leakage, and a
    # potentially different one-hot column layout).
    train_features = preprocessor.fit_transform(X_train)
    test_features = preprocessor.transform(X_test)

    print('Train data shape after preprocessing: {}'.format(train_features.shape))
    print('Test data shape after preprocessing: {}'.format(test_features.shape))

    print('Saving training features to {}'.format(train_features_output_path))
    pd.DataFrame(train_features).to_csv(train_features_output_path, header=False, index=False)

    print('Saving test features to {}'.format(test_features_output_path))
    pd.DataFrame(test_features).to_csv(test_features_output_path, header=False, index=False)

    print('Saving training labels to {}'.format(train_labels_output_path))
    y_train.to_csv(train_labels_output_path, header=False, index=False)

    print('Saving test labels to {}'.format(test_labels_output_path))
    y_test.to_csv(test_labels_output_path, header=False, index=False)


In [None]:
# Region of the current boto3 session and the notebook's execution role.
region = boto3.session.Session().region_name
role = get_execution_role()

# Managed scikit-learn container that will execute preprocessing.py.
sklearn_processor = SKLearnProcessor(
    framework_version='0.20.0',
    role=role,
    instance_type='ml.m5.xlarge',
    instance_count=1,
)

# The raw dataset is mounted into the container's input directory ...
processing_inputs = [
    ProcessingInput(source='s3://choice-mlflow-input/demo/datasets/housing.csv',
                    destination='/opt/ml/processing/input'),
]
# ... and whatever the script writes to these directories is uploaded to S3.
processing_outputs = [
    ProcessingOutput(output_name='train_data',
                     source='/opt/ml/processing/train',
                     destination= 's3://choice-mlflow-input/demo/datasets/train/'),
    ProcessingOutput(output_name='test_data',
                     source='/opt/ml/processing/test',
                     destination='s3://choice-mlflow-input/demo/datasets/test/'),
]

sklearn_processor.run(code='preprocessing.py',
                      inputs=processing_inputs,
                      outputs=processing_outputs,
                      arguments=['--train-test-split-ratio', '0.2'])

# Look up where the most recent job stored each named output.
preprocessing_job_description = sklearn_processor.jobs[-1].describe()
output_config = preprocessing_job_description['ProcessingOutputConfig']

for output in output_config['Outputs']:
    output_name = output['OutputName']
    if output_name == 'train_data':
        preprocessed_training_data = output['S3Output']['S3Uri']
    elif output_name == 'test_data':
        preprocessed_test_data = output['S3Output']['S3Uri']
        

#### Train Model

In [None]:
# Pull the preprocessed splits back from S3. The files were written without a
# header row, hence header=None on every read.

# --- training split ---
train_features_s3Obj = read_from_s3("choice-mlflow-input", "demo/datasets/train/train_features.csv")
train_features = pd.read_csv(train_features_s3Obj['Body'], header=None)

train_labels_s3Obj = read_from_s3("choice-mlflow-input", "demo/datasets/train/train_labels.csv")
train_labels = pd.read_csv(train_labels_s3Obj['Body'], header=None)

# --- test split ---
test_features_s3Obj = read_from_s3("choice-mlflow-input", "demo/datasets/test/test_features.csv")
test_features = pd.read_csv(test_features_s3Obj['Body'], header=None)

test_labels_s3Obj = read_from_s3("choice-mlflow-input", "demo/datasets/test/test_labels.csv")
test_labels = pd.read_csv(test_labels_s3Obj['Body'], header=None)

In [None]:
# Fit a ridge regressor on the preprocessed training split and predict the test split.
ridge = Ridge(alpha=0.3, max_iter=1000)
ridge.fit(train_features, train_labels)

housing_predictions = ridge.predict(test_features)

# Evaluate the predictions with both error metrics.
mse, mae = (
    metric(y_true=test_labels, y_pred=housing_predictions)
    for metric in (mean_squared_error, mean_absolute_error)
)

print("MSE : {:.3f} ".format(mse))
print("MAE : {:.3f} ".format(mae))

#### Train Model On Sagemaker

In [None]:
%%writefile train.py
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge
import argparse
import pandas as pd
import os
from sklearn import tree
from sklearn.externals import joblib
from sklearn.metrics import mean_squared_error

def define_hyperparameters(parser):
    """Register the model hyperparameters on ``parser``.

    Only ``--alpha`` (float, default 0.5) is exposed in this simple example;
    further hyperparameters can be added as rows of the table below.
    """
    # (flag, type, default) for every tunable hyperparameter.
    hyperparameter_specs = (
        ('--alpha', float, 0.5),
    )
    for flag, value_type, default_value in hyperparameter_specs:
        parser.add_argument(flag, type=value_type, default=default_value)

    
def define_data_directories(parser):
    """Register the data/model directory arguments on ``parser``.

    Each argument defaults to the matching SageMaker environment variable.
    The variables are looked up with ``os.environ.get`` so that building the
    parser no longer raises ``KeyError`` outside a SageMaker container; the
    script can then be run locally by passing the paths on the command line.
    Arguments that ``train`` reads become required when their environment
    variable is absent, so a missing path is reported by argparse up front.
    """
    def _add_directory(flag, env_var, needed):
        # Fall back to the SageMaker-provided location when it is available.
        fallback = os.environ.get(env_var)
        parser.add_argument(flag, type=str, default=fallback,
                            required=needed and fallback is None)

    #A string representing the path to the directory to write model artifacts to.
    #Any artifacts saved in this folder are uploaded to S3 for model hosting after the training job completes.
    _add_directory('--model-dir', 'SM_MODEL_DIR', needed=True)

    #A string representing the filesystem path to write output artifacts to.
    #Output artifacts may include checkpoints, graphs, and other files to save, not including model artifacts.
    #These artifacts are compressed and uploaded to S3 to the same S3 prefix as the model artifacts.
    _add_directory('--output-data-dir', 'SM_OUTPUT_DATA_DIR', needed=False)

    #A string representing the path to the directory containing data in the 'train' channel
    _add_directory('--train', 'SM_CHANNEL_TRAIN', needed=True)

    #A string representing the path to the directory containing data in the 'test' channel
    _add_directory('--test', 'SM_CHANNEL_TEST', needed=True)

    #A string representing the path to the directory containing data in the 'validation' channel
    #parser.add_argument('--validation', type=str, default=os.environ['SM_CHANNEL_VALIDATION'])

    
def train(args):
    """Fit a Ridge model on the train channel, report test MSE and save the model.

    Reads ``train_features.csv``/``train_labels.csv`` from ``args.train`` and
    ``test_features.csv``/``test_labels.csv`` from ``args.test`` (all without a
    header row), fits ``Ridge`` with ``args.alpha`` and writes the fitted model
    to ``model.joblib`` inside ``args.model_dir``.
    """
    def _load(directory, filename):
        # Every preprocessed file is a header-less CSV.
        return pd.read_csv(os.path.join(directory, filename), header=None)

    print('Reading input data from {}'.format(args.train))
    features_train = _load(args.train, 'train_features.csv')
    labels_train = _load(args.train, 'train_labels.csv')
    features_test = _load(args.test, 'test_features.csv')
    labels_test = _load(args.test, 'test_labels.csv')

    print('Fitting the model to data')
    model = Ridge(alpha=args.alpha, max_iter=100, solver='sag')
    model.fit(features_train, labels_train)

    # Keep the exact "MSE: <value>" output format: the hyperparameter tuner's
    # metric regex in the notebook matches on this printed line.
    test_mse = mean_squared_error(labels_test, model.predict(features_test))
    print("MSE: %.4f" % test_mse)

    # Persist the fitted model into the model directory.
    model_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, model_path)
    

def model_fn(model_dir):
    """Deserialize and return the fitted model stored in ``model_dir``.

    The file name must match the one used when the model is saved in ``train``
    ("model.joblib").
    """
    # This function loads the model, so the log line says so (it previously
    # claimed to be writing artifacts).
    print('Loading model artifacts from {}'.format(model_dir))
    clf = joblib.load(os.path.join(model_dir, "model.joblib"))
    return clf


if __name__ == '__main__':
    # Build the command line from the hyperparameter and directory arguments,
    # then run training with the parsed values.
    cli = argparse.ArgumentParser()
    for register_arguments in (define_hyperparameters, define_data_directories):
        register_arguments(parser=cli)
    train(args=cli.parse_args())

In [None]:
role = get_execution_role()
sagemaker_session = sagemaker.Session()
script_path = 'train.py'

# Settings of the managed scikit-learn training job that runs train.py.
estimator_settings = dict(
    entry_point=script_path,
    train_instance_type="ml.m4.4xlarge",
    source_dir='/home/ec2-user/SageMaker/umasrivenkat_kannikanti/Demo',
    output_path= 's3://choice-mlflow-input/demo/output',
    role=role,
    sagemaker_session=sagemaker_session,
    hyperparameters={'alpha': 0.7},
)
sklearn = SKLearn(**estimator_settings)

# One S3 prefix per input channel; train.py receives them as --train / --test.
training_channels = {
    'train': 's3://choice-mlflow-input/demo/datasets/train/',
    'test': 's3://choice-mlflow-input/demo/datasets/test/',
}
sklearn.fit(training_channels)

#### Hyperparameter Tuning on Sagemaker

In [None]:
# Estimator used as the template for every tuning job.
sklearn_estimator = SKLearn(entry_point=script_path, 
                            train_instance_type="ml.m4.4xlarge",
                            source_dir='/home/ec2-user/SageMaker/umasrivenkat_kannikanti/Demo',
                            output_path= 's3://choice-mlflow-input/demo/output',
                            role=role,
                            sagemaker_session=sagemaker_session, 
                            hyperparameters={'alpha': 0.7})

# Search space: the regularisation strength of the Ridge model.
hyperparameter_ranges = {'alpha': ContinuousParameter(0.5, 1.2)}

# The objective metric is scraped from the training job's log output, so the
# regex must match the "MSE: <value>" line printed by train.py.
objective_metric_name = 'MSE'
metric_definitions = [{'Name': 'MSE',
                       'Regex': 'MSE: ([0-9\\.]+)'}]

# MSE is an error, so it has to be minimised; the tuner's default objective
# type is 'Maximize', which would steer the search towards the worst models.
tuner = HyperparameterTuner(sklearn_estimator,
                            objective_metric_name,
                            hyperparameter_ranges,
                            metric_definitions,
                            objective_type='Minimize',
                            max_jobs=9,
                            max_parallel_jobs=3)

tuner.fit({'train': 's3://choice-mlflow-input/demo/datasets/train/', 'test': 's3://choice-mlflow-input/demo/datasets/test/'})

#### Deploy Model & Inference Using Endpoint
*Deploying the model to SageMaker hosting just requires a deploy call on the fitted model. This call takes an instance count and an instance type.*

In [None]:
# Host the fitted estimator behind a real-time endpoint (one ml.t2.medium
# instance); the returned predictor is used for inference in the next cell.
predictor = sklearn.deploy(initial_instance_count=1, instance_type="ml.t2.medium")

In [None]:
# A single hand-written sample with 13 values, sent to the endpoint as a 1x13 array.
# NOTE(review): looks like one already-preprocessed row (8 scaled numeric
# features followed by 5 one-hot columns) - confirm it matches the training layout.
test_point = np.array([[0.591651, -0.691139, 1.384765, -0.486401, -0.621162, -0.731684, -0.567076, 0.655892, 1.0, 0.0, 0.0, 0.0, 0.0]])
print(predictor.predict(test_point))

In [None]:
# Endpoint cleanup
# Delete through the predictor returned by deploy(): the estimator-level
# delete_endpoint() is deprecated in the SageMaker SDK, while the predictor
# handle is the supported way to tear down the endpoint it talks to.
predictor.delete_endpoint()

#### Batch Transform
*We can also use the trained model for asynchronous batch inference on S3 data using SageMaker Batch Transform.*

In [None]:
# NOTE(review): confirm that ml.t2.medium is an instance type offered for batch transform jobs.
transformer = sklearn.transformer(instance_count=1, instance_type='ml.t2.medium')

# Score the preprocessed test features that the processing job already stored
# in S3. The previous version referenced an undefined `project_name`, uploaded
# the raw (un-encoded) X_test and pointed the job at an unrelated bucket. The
# file is addressed directly because the same prefix also holds test_labels.csv.
batch_input = 's3://choice-mlflow-input/demo/datasets/test/test_features.csv'

# split_type='Line' sends the CSV record-wise instead of as one large payload.
transformer.transform(batch_input, content_type='text/csv', split_type='Line')
print('Waiting for transform job: ' + transformer.latest_transform_job.job_name)
transformer.wait()

In [None]:
# Show the row at position 9 of the raw training features as a one-row DataFrame.
# NOTE(review): presumably used to hand-pick a sample for inference - confirm.
X_train.iloc[9:10]