In [2]:
import datetime
import time
import tarfile

import boto3
import pandas as pd
import numpy as np
from sagemaker import get_execution_role
import sagemaker
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston


# SageMaker SDK session plus a low-level boto3 SageMaker client used later in the notebook.
sess = sagemaker.Session()
sm_boto3 = boto3.client("sagemaker")

# Region of the underlying boto session.
region = sess.boto_session.region_name

# Session default bucket; a hard-coded bucket name would work here as well.
bucket = sess.default_bucket()

print(f"Using bucket {bucket}")



Using bucket sagemaker-us-east-1-827713284860


In [3]:
# we use the Boston housing dataset
# `data` exposes `.data`, `.target` and `.feature_names`, which the cells below consume.
# NOTE(review): `load_boston` is reportedly removed in newer scikit-learn releases (1.2+);
# confirm the kernel's scikit-learn version still ships it.
data = load_boston()

In [4]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data.data,
    data.target,
    test_size=0.25,
    random_state=42,
)

# Wrap each split in a DataFrame with named feature columns and append the label
# as a trailing "target" column.
trainX = pd.DataFrame(X_train, columns=data.feature_names).assign(target=y_train)
testX = pd.DataFrame(X_test, columns=data.feature_names).assign(target=y_test)



In [5]:
# Persist both splits locally as CSV.
# The DataFrame index is written too (it shows up as "Unnamed: 0" when the file is
# read back); later cells slice the columns with [1:-1] and rely on that layout.
trainX.to_csv("boston_train.csv")
testX.to_csv("boston_test.csv")

# send data to S3. SageMaker will take training data from s3
S3_KEY_PREFIX = "sagemaker/sklearncontainer"

trainpath = sess.upload_data(
    path="boston_train.csv", bucket=bucket, key_prefix=S3_KEY_PREFIX
)

testpath = sess.upload_data(
    path="boston_test.csv", bucket=bucket, key_prefix=S3_KEY_PREFIX
)



In [6]:
%%writefile script.py

import argparse
import joblib
import os

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor


# inference functions ---------------
def model_fn(model_dir):
    """Inference hook: load and return the model stored as ``model.joblib`` in ``model_dir``."""
    model_path = os.path.join(model_dir, "model.joblib")
    return joblib.load(model_path)


if __name__ == "__main__":
    # Training entry point: parse hyperparameters and paths from the command line,
    # fit a RandomForestRegressor, print absolute-error percentiles on the test set,
    # and persist the fitted model as model.joblib.

    print("extracting arguments")
    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script.
    # to simplify the demo we don't use all sklearn RandomForest hyperparameters
    parser.add_argument("--n-estimators", type=int, default=10)
    parser.add_argument("--min-samples-leaf", type=int, default=3)

    # Data, model, and output directories; defaults come from the SM_* environment
    # variables and are None when those variables are unset.
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
    parser.add_argument("--train-file", type=str, default="boston_train.csv")
    parser.add_argument("--test-file", type=str, default="boston_test.csv")

    # The user must explicitly name the features (whitespace-separated) and the target.
    # Marking them required makes argparse report a clear usage error instead of the
    # script failing later with AttributeError on `None.split()`.
    parser.add_argument("--features", type=str, required=True)
    parser.add_argument("--target", type=str, required=True)

    # parse_known_args: unrecognised extra arguments are ignored rather than rejected.
    args, _ = parser.parse_known_args()

    print("reading data")
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))

    print("building training and testing datasets")
    feature_names = args.features.split()  # split once, reuse for both frames
    X_train = train_df[feature_names]
    X_test = test_df[feature_names]
    y_train = train_df[args.target]
    y_test = test_df[args.target]

    # train
    print("training model")
    model = RandomForestRegressor(
        n_estimators=args.n_estimators, min_samples_leaf=args.min_samples_leaf, n_jobs=-1
    )

    model.fit(X_train, y_train)

    # absolute error of each test prediction
    print("validating model")
    abs_err = np.abs(model.predict(X_test) - y_test)

    # print couple perf metrics
    # NOTE: the notebook's metric_definitions regex matches the
    # "AE-at-50th-percentile: " line, so keep this output format unchanged.
    for q in [10, 50, 90]:
        print("AE-at-" + str(q) + "th-percentile: " + str(np.percentile(a=abs_err, q=q)))

    # persist model
    path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, path)
    print("model persisted at " + path)
    # echo the min-samples-leaf value that was actually used
    print(args.min_samples_leaf)



Writing script.py


In [7]:
# Run the training script locally as a smoke test: the CSVs are read from, and the
# model is written to, the current directory. --features is a single quoted,
# whitespace-separated list of column names.
! python script.py --n-estimators 100 \
                   --min-samples-leaf 2 \
                   --model-dir ./ \
                   --train ./ \
                   --test ./ \
                   --features 'CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT' \
                   --target target

extracting arguments
reading data
building training and testing datasets
training model
validating model
AE-at-10th-percentile: 0.27305738095238324
AE-at-50th-percentile: 1.5763249999999989
AE-at-90th-percentile: 4.497606904761905
model persisted at ./model.joblib
2


In [9]:
# We use the Estimator from the SageMaker Python SDK
from sagemaker.sklearn.estimator import SKLearn

FRAMEWORK_VERSION = "0.23-1"

sklearn_estimator = SKLearn(
    entry_point="script.py",
    role=get_execution_role(),
    instance_count=1,
    instance_type="ml.m4.xlarge",
    framework_version=FRAMEWORK_VERSION,
    base_job_name="rf-scikit",
    # The regex captures the number printed by script.py on its
    # "AE-at-50th-percentile: " line and reports it as the "median-AE" metric.
    metric_definitions=[{"Name": "median-AE", "Regex": "AE-at-50th-percentile: ([0-9.]+).*$"}],
    # Keys match the command-line options declared in script.py.
    hyperparameters={
        "n-estimators": 100,
        "min-samples-leaf": 3,
        "features": "CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT",
        "target": "target",
    },
)

# `time` is already imported in the notebook's first cell, so it is not re-imported here.
tic = time.perf_counter()

# launch training job, with asynchronous call
sklearn_estimator.fit({"train": trainpath, "test": testpath}, wait=False)

# With wait=False this measures only how long the submission call took,
# not the duration of the training job itself.
toc = time.perf_counter()
print(toc - tic)

0.6390290900001219


In [10]:
# Block until the asynchronously launched training job has finished.
training_job = sklearn_estimator.latest_training_job
training_job.wait(logs="None")

# Look up where the trained model archive was stored in S3.
job_description = sm_boto3.describe_training_job(TrainingJobName=training_job.name)
artifact = job_description["ModelArtifacts"]["S3ModelArtifacts"]

print("Model artifact persisted at " + artifact)


2023-02-05 17:40:52 Starting - Preparing the instances for training...............
2023-02-05 17:42:19 Downloading - Downloading input data.....
2023-02-05 17:42:49 Training - Downloading the training image......
2023-02-05 17:43:25 Training - Training image download completed. Training in progress.....
2023-02-05 17:43:45 Uploading - Uploading generated training model.
2023-02-05 17:43:57 Completed - Training job completed
Model artifact persisted at s3://sagemaker-us-east-1-827713284860/rf-scikit-2023-02-05-17-40-26-075/output/model.tar.gz


In [11]:
from sagemaker.sklearn.model import SKLearnModel

# Build a deployable model from the training artifact; script.py is reused as the
# entry point because it also defines model_fn.
model = SKLearnModel(
    entry_point="script.py",
    framework_version=FRAMEWORK_VERSION,
    model_data=artifact,
    role=get_execution_role(),
)

# Deploy the model to an endpoint backed by a single ml.c5.large instance.
predictor = model.deploy(initial_instance_count=1, instance_type="ml.c5.large")



----!

In [13]:
# Reload the held-out split from the local CSV written earlier in the notebook.
df_test = pd.read_csv("boston_test.csv")

In [17]:
# Inspect columns and dtypes of the reloaded test set. The first column
# ("Unnamed: 0") is the DataFrame index that to_csv wrote into the file.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 127 entries, 0 to 126
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  127 non-null    int64  
 1   CRIM        127 non-null    float64
 2   ZN          127 non-null    float64
 3   INDUS       127 non-null    float64
 4   CHAS        127 non-null    float64
 5   NOX         127 non-null    float64
 6   RM          127 non-null    float64
 7   AGE         127 non-null    float64
 8   DIS         127 non-null    float64
 9   RAD         127 non-null    float64
 10  TAX         127 non-null    float64
 11  PTRATIO     127 non-null    float64
 12  B           127 non-null    float64
 13  LSTAT       127 non-null    float64
 14  target      127 non-null    float64
dtypes: float64(14), int64(1)
memory usage: 15.0 KB


In [23]:
# Feature columns only: skip the first column (the saved index, "Unnamed: 0")
# and the last column ("target").
df_test.keys()[1:-1]

Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT'],
      dtype='object')

In [25]:
# Show the feature values of the first test row (index and target columns excluded).
feature_columns = df_test.keys()[1:-1]
df_test[feature_columns].iloc[0]

CRIM         0.09178
ZN           0.00000
INDUS        4.05000
CHAS         0.00000
NOX          0.51000
RM           6.41600
AGE         84.10000
DIS          2.64630
RAD          5.00000
TAX        296.00000
PTRATIO     16.60000
B          395.50000
LSTAT        9.04000
Name: 0, dtype: float64

In [42]:
# Draw 10 random row positions (with replacement) from the test set.
# randint's lower bound is inclusive and its upper bound exclusive, so start at 0;
# starting at 1 would mean row 0 could never be sampled.
random_index = np.random.randint(0, df_test.shape[0], 10)

In [45]:
# Build the request payload: the feature columns of the sampled rows as a NumPy array.
feature_columns = df_test.keys()[1:-1]
sampled_rows = df_test[feature_columns].iloc[random_index]
batch_sample = sampled_rows.to_numpy()

In [46]:
# Sanity check of the payload dimensions: (sampled rows, feature columns).
batch_sample.shape

(10, 13)

In [47]:
# Send the sampled batch to the deployed endpoint and display the returned predictions.
predictor.predict(batch_sample)

array([22.18369968, 12.14249722, 29.42746436, 45.4434333 , 20.31611522,
       28.86040675, 27.09973929, 17.77245902, 30.27263081, 32.23756905])

In [49]:
# Send 400 small requests (2 random rows each) to the endpoint.
# The feature selection does not change between iterations, so it is done once.
feature_frame = df_test[df_test.keys()[1:-1]]

for i in range(400):
    # Lower bound 0 (inclusive) so that the first row can be sampled as well;
    # the upper bound is exclusive.
    random_index = np.random.randint(0, df_test.shape[0], 2)
    batch_sample = feature_frame.iloc[random_index].to_numpy()
    print(f"Prediction{predictor.predict(batch_sample)}")

Prediction[24.06448526 15.40668283]
Prediction[23.5769833 16.2932645]
Prediction[25.62441865 15.7027446 ]
Prediction[23.35320999 24.36397392]
Prediction[21.99052288 25.60333254]
Prediction[19.05345076 28.86040675]
Prediction[20.52581587 20.52581587]
Prediction[18.41432796 30.75194614]
Prediction[22.27977343 12.68736136]
Prediction[22.18369968 20.13417781]
Prediction[20.43824491 21.18750458]
Prediction[ 7.60912381 15.7027446 ]
Prediction[30.75194614 14.34389892]
Prediction[30.75194614 25.15943734]
Prediction[17.77245902 17.77245902]
Prediction[23.37949197 21.72177995]
Prediction[44.91840827 21.13879416]
Prediction[20.43824491 19.62208279]
Prediction[19.70493712 17.4381631 ]
Prediction[30.75194614 14.8017823 ]
Prediction[12.68736136 16.58506383]
Prediction[26.83569722 13.89277627]
Prediction[19.49479748 19.97697817]
Prediction[21.12890945 23.54087433]
Prediction[23.61554571 10.11048334]
Prediction[ 9.09670833 19.97697817]
Prediction[21.12854841 21.18750458]
Prediction[ 8.51434063 19.9623

In [50]:
# Tear down the endpoint created by model.deploy.
endpoint_name = predictor.endpoint_name
sm_boto3.delete_endpoint(EndpointName=endpoint_name)

{'ResponseMetadata': {'RequestId': '77342828-4ac7-4698-86c5-48bb13f17cf0',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '77342828-4ac7-4698-86c5-48bb13f17cf0',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '0',
   'date': 'Sun, 05 Feb 2023 18:18:00 GMT'},
  'RetryAttempts': 0}}