In [2]:
import os

import pandas as pd
import numpy as np

import boto3
import sagemaker
import joblib

from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import Binarizer, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import seaborn as sns

from sagemaker.amazon.amazon_estimator import RecordSet # could be used if data fits in mem
import io
import sagemaker.amazon.common as smac

pd.set_option('display.max_columns', None)

In [5]:
!pwd

/home/ec2-user/SageMaker/terraform-aws-project-1/notebooks


In [15]:
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = "wyatt-datalake"
prefix = "terraform-aws-project-1"
# BASE_DIR = os.path.dirname(os.path.realpath(__file__))
BASE_DIR = "/home/ec2-user/SageMaker/terraform-aws-project-1/notebooks/"

In [16]:
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.workflow.parameters import ParameterInteger, ParameterString
from sagemaker.workflow.steps import ProcessingStep, TrainingStep
from sagemaker.processing import ProcessingInput, ProcessingOutput, ScriptProcessor

processing_instance_type = ParameterString(
    name="ProcessingInstanceType", default_value="ml.m5.xlarge"
)

processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1)
sklearn_processor = SKLearnProcessor(
    framework_version="0.23-1",
    instance_type=processing_instance_type,
    instance_count=processing_instance_count,
    base_job_name=f"{prefix}/sklearn_preprocessor",
    sagemaker_session=sagemaker_session,
    role=role,
)
step_process = ProcessingStep(
    name="terraform-aws-project-1",
    processor=sklearn_processor,
    outputs=[
        ProcessingOutput(output_name="train", source="/opt/ml/processing/train"),
        ProcessingOutput(output_name="validation", source="/opt/ml/processing/validation"),
        ProcessingOutput(output_name="test", source="/opt/ml/processing/test"),
    ],
    code=os.path.join(BASE_DIR, "sklearn_preprocess.py"),
    job_arguments=["--input-data", input_data],
)

NameError: name 'input_data' is not defined

In [None]:
features = df.drop("price", axis=1).values
labels = df["price"].values
np.random.seed(0)

X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2
)
X_val, X_test, y_val, y_test = train_test_split(
    X_test, y_test, test_size=0.5
)

In [5]:
def foo(split, features, labels):
    numeric_transformer = make_pipeline(
        SimpleImputer(strategy='median'),
        StandardScaler())

    categorical_transformer = make_pipeline(
        SimpleImputer(strategy='constant', fill_value='missing'),
        OneHotEncoder(handle_unknown='ignore'))

    preprocessor = ColumnTransformer(transformers=[
            ("num", numeric_transformer, make_column_selector(dtype_exclude="category")),
            ("cat", categorical_transformer, make_column_selector(dtype_include="category"))])
    features = preprocessor.fit_transform(df.drop("price", axis=1))
    features = features.astype(np.float32)

    buf = io.BytesIO()
    smac.write_spmatrix_to_sparse_tensor(buf, features, target)
    buf.seek(0)

    #Filename for training data we are uploading to S3 
    key = 'linear-data'
    #Upload training data to S3
    boto3.resource('s3').Bucket(bucket).Object(os.path.join(prefix, split, key)).upload_fileobj(buf)
    s3_split_data = 's3://{}/{}/{}/{}'.format(bucket, prefix, split, key)
    print('uploaded {} data location: {}'.format(split, s3_split_data))

foo('train', X_train, y_train)
foo('test', X_test, y_test)
foo('val', X_val, y_val)

###Model Artifacts
output_location = 's3://{}/{}/output'.format(bucket, prefix)
print('Model Artifacts will be uploaded to: {}'.format(output_location))

In [6]:
###Uploading training data
buf = io.BytesIO()
smac.write_spmatrix_to_sparse_tensor(buf, X_train, y_train)
buf.seek(0)

#Filename for training data we are uploading to S3 
key = 'linear-train-data'
#Upload training data to S3
boto3.resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train', key)).upload_fileobj(buf)
s3_train_data = 's3://{}/{}/train/{}'.format(bucket, prefix, key)
print('uploaded training data location: {}'.format(s3_train_data))

###Uploading test data
buf = io.BytesIO() # create an in-memory byte array (buf is a buffer I will be writing to)
smac.write_spmatrix_to_sparse_tensor(buf, X_test, y_test)
buf.seek(0)

#Sub-folder for test data
key = 'linear-test-data'
boto3.resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'test', key)).upload_fileobj(buf)
s3_test_data = 's3://{}/{}/test/{}'.format(bucket, prefix, key)
print('uploaded data location: {}'.format(s3_test_data))

###Uploading val data
buf = io.BytesIO() # create an in-memory byte array (buf is a buffer I will be writing to)
smac.write_spmatrix_to_sparse_tensor(buf, X_val, y_val)
buf.seek(0)

#Sub-folder for val data
key = 'linear-val-data'
boto3.resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'val', key)).upload_fileobj(buf)
s3_val_data = 's3://{}/{}/test/{}'.format(bucket, prefix, key)
print('uploaded data location: {}'.format(s3_val_data))

###Model Artifacts
output_location = 's3://{}/{}/output'.format(bucket, prefix)
print('Model Artifacts will be uploaded to: {}'.format(output_location))

uploaded training data location: s3://wyatt-datalake/terraform-aws-project-1/train/linear-train-data
uploaded data location: s3://wyatt-datalake/terraform-aws-project-1/test/linear-test-data
uploaded data location: s3://wyatt-datalake/terraform-aws-project-1/test/linear-val-data
Model Artifacts will be uploaded to: s3://wyatt-datalake/terraform-aws-project-1/output


In [None]:
from sagemaker.image_uris import retrieve

ll_image = retrieve("linear-learner", boto3.Session().region_name)

ll_estimator = sagemaker.estimator.Estimator(
    ll_image,
    role,
    instance_count=1,
    instance_type="ml.m4.2xlarge",
    volume_size=20,
    max_run=3600,
    input_mode="Pipe",
    output_path=output_location,
    sagemaker_session=sagemaker_session,
)

ll_estimator.set_hyperparameters(predictor_type="regressor", mini_batch_size=32)
ll_estimator.fit(inputs={"train": s3_train_data}, logs=True)


In [None]:
modelpackage_inference_specification =  {
    "InferenceSpecification": {
      "Containers": [
         {
            "Image": '257758044811.dkr.ecr.us-east-2.amazonaws.com/sagemaker-xgboost:1.2-1',
         }
      ],
      "SupportedContentTypes": [ "text/csv" ],
      "SupportedResponseMIMETypes": [ "text/csv" ],
   }
 }
# Specify the model source
model_url = "s3://your-bucket-name/model.tar.gz"

# Specify the model data
modelpackage_inference_specification["InferenceSpecification"]["Containers"][0]["ModelDataUrl"]=model_url

create_model_package_input_dict = {
    "ModelPackageGroupName" : model_package_group_name,
    "ModelPackageDescription" : "Model to detect 3 different types of irises (Setosa, Versicolour, and Virginica)",
    "ModelApprovalStatus" : "PendingManualApproval"
}
create_model_package_input_dict.update(modelpackage_inference_specification)