In [4]:
import os

import pandas as pd
import numpy as np

import boto3
import sagemaker
import joblib

from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import Binarizer, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import seaborn as sns
from pandas_profiling import ProfileReport

from sagemaker.amazon.amazon_estimator import RecordSet # could be used if data fits in mem
import io
import sagemaker.amazon.common as smac

pd.set_option('display.max_columns', None)

In [10]:
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = "wyatt-datalake"
prefix = "terraform-aws-project-1"

In [6]:
df_dtypes = {
    "price": np.float32,
    "year": np.float16,
    "manufacturer": "category",
    "model": "category",
    "condition": "category",
    "cylinders": "category",
    "fuel": "category",
    "title_status": "category",
    "size": "category",
    "odometer": "category",
    "transmission": "category",
    "drive": "category",
    "type": "category",
    "state": "category",
    "paint_color": "category"
}

df = pd.read_csv("s3://wyatt-datalake/data/terraform-aws-project-1/vehicles.csv", dtype=df_dtypes, usecols=list(df_dtypes.keys()))

In [21]:
features = df.drop("price", axis=1).values
labels = df["price"].values
np.random.seed(0)

numeric_transformer = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler())

categorical_transformer = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='missing'),
    OneHotEncoder(handle_unknown='ignore'))

preprocessor = ColumnTransformer(transformers=[
        ("num", numeric_transformer, make_column_selector(dtype_exclude="category")),
        ("cat", categorical_transformer, make_column_selector(dtype_include="category"))])
features = preprocessor.fit_transform(df.drop("price", axis=1))
features = features.astype(np.float32)

X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2
)
X_val, X_test, y_val, y_test = train_test_split(
    X_test, y_test, test_size=0.5
)

In [23]:
# for split in ["train", "val", "test"]:
###Uploading training data
buf = io.BytesIO()
smac.write_spmatrix_to_sparse_tensor(buf, X_train, y_train)
buf.seek(0)

#Filename for training data we are uploading to S3 
key = 'linear-train-data'
#Upload training data to S3
boto3.resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train', key)).upload_fileobj(buf)
s3_train_data = 's3://{}/{}/train/{}'.format(bucket, prefix, key)
print('uploaded training data location: {}'.format(s3_train_data))

###Uploading test data
buf = io.BytesIO() # create an in-memory byte array (buf is a buffer I will be writing to)
smac.write_spmatrix_to_sparse_tensor(buf, X_test, y_test)
buf.seek(0)

#Sub-folder for test data
key = 'linear-test-data'
boto3.resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'test', key)).upload_fileobj(buf)
s3_test_data = 's3://{}/{}/test/{}'.format(bucket, prefix, key)
print('uploaded data location: {}'.format(s3_test_data))

###Uploading val data
buf = io.BytesIO() # create an in-memory byte array (buf is a buffer I will be writing to)
smac.write_spmatrix_to_sparse_tensor(buf, X_val, y_val)
buf.seek(0)

#Sub-folder for val data
key = 'linear-val-data'
boto3.resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'val', key)).upload_fileobj(buf)
s3_val_data = 's3://{}/{}/test/{}'.format(bucket, prefix, key)
print('uploaded data location: {}'.format(s3_val_data))

###Model Artifacts
output_location = 's3://{}/{}/output'.format(bucket, prefix)
print('Model Artifacts will be uploaded to: {}'.format(output_location))

uploaded training data location: s3://wyatt-datalake/terraform-aws-project-1/train/linear-train-data
uploaded training data location: s3://wyatt-datalake/terraform-aws-project-1/test/linear-test-data
uploaded training data location: s3://wyatt-datalake/terraform-aws-project-1/test/linear-val-data
Training artifacts will be uploaded to: s3://wyatt-datalake/terraform-aws-project-1/output


In [24]:
from sagemaker.image_uris import retrieve

ll_image = retrieve("linear-learner", boto3.Session().region_name)

ll_estimator = sagemaker.estimator.Estimator(
    ll_image,
    role,
    instance_count=1,
    instance_type="ml.m4.2xlarge",
    volume_size=20,
    max_run=3600,
    input_mode="Pipe",
    output_path=output_location,
    sagemaker_session=sagemaker_session,
)

ll_estimator.set_hyperparameters(predictor_type="regressor", mini_batch_size=32)
ll_estimator.fit(inputs={"train": s3_train_data}, logs=True)

2021-10-26 20:32:32 Starting - Starting the training job...
2021-10-26 20:32:55 Starting - Launching requested ML instancesProfilerReport-1635280352: InProgress
......
2021-10-26 20:33:58 Starting - Preparing the instances for training.........
2021-10-26 20:35:31 Downloading - Downloading input data......
2021-10-26 20:36:24 Training - Training image download completed. Training in progress.[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[10/26/2021 20:36:30 INFO 139853911131968] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-input.json: {'mini_batch_size': '1000', 'epochs': '15', 'feature_dim': 'auto', 'use_bias': 'true', 'binary_classifier_model_selection_criteria': 'accuracy', 'f_beta': '1.0', 'target_recall': '0.8', 'target_precision': '0.8', 'num_models': 'auto', 'num_calibration_samples': '10000000', 'init_method': 'uniform', 'init_scale': '0.07', 'ini