In [None]:
import sagemaker
from sklearn.model_selection import train_test_split
import boto3
import pandas as pd

# Low-level SageMaker API client; used further down for describe_training_job / delete_endpoint.
sm_boto3 = boto3.client('sagemaker', region_name='####')

# High-level SageMaker SDK session, bound to an explicitly-regioned boto3 session.
boto_sess = boto3.Session(region_name='######')
sess = sagemaker.Session(boto_session=boto_sess)

# S3 bucket that receives the uploaded train/test CSVs.
bucket = '#####'
print(f"Using bucket {bucket}")

In [20]:
# Load the raw dataset from a CSV file in the notebook's working directory.
df = pd.read_csv("bodyfat.csv")

In [21]:
# Preview the first rows to sanity-check the columns and value ranges.
df.head()

Unnamed: 0,Wrist,Age,Weight,Height,Neck,Chest,Abdomen,Hip,Thigh,Knee,Ankle,Biceps,Forearm,BodyFat
0,17.1,23,154.25,67.75,36.2,93.1,85.2,94.5,59.0,37.3,21.9,32.0,27.4,12.3
1,18.2,22,173.25,72.25,38.5,93.6,83.0,98.7,58.7,37.3,23.4,30.5,28.9,6.1
2,16.6,22,154.0,66.25,34.0,95.8,87.9,99.2,59.6,38.9,24.0,28.8,25.2,25.3
3,18.2,26,184.75,72.25,37.4,101.8,86.4,101.2,60.1,37.3,22.8,32.4,29.4,10.4
4,17.7,24,184.25,71.25,34.4,97.3,100.0,101.9,63.2,42.2,24.0,32.2,27.7,28.7


In [22]:
# (rows, columns) of the full dataset.
df.shape

(252, 14)

In [23]:
# Percentage of missing values per column (mean of the null mask, scaled to 0-100).
df.isnull().mean() * 100

Wrist      0.0
Age        0.0
Weight     0.0
Height     0.0
Neck       0.0
Chest      0.0
Abdomen    0.0
Hip        0.0
Thigh      0.0
Knee       0.0
Ankle      0.0
Biceps     0.0
Forearm    0.0
BodyFat    0.0
dtype: float64

In [24]:
# Column names in file order; the last column is split off as the prediction
# target and the remaining ones are used as model features.
features = df.columns.tolist()
label = features.pop()

In [25]:
# Feature matrix and target vector selected by the column names computed above.
x, y = df[features], df[label]

In [26]:
# Preview the feature matrix (target column excluded).
x.head()

Unnamed: 0,Wrist,Age,Weight,Height,Neck,Chest,Abdomen,Hip,Thigh,Knee,Ankle,Biceps,Forearm
0,17.1,23,154.25,67.75,36.2,93.1,85.2,94.5,59.0,37.3,21.9,32.0,27.4
1,18.2,22,173.25,72.25,38.5,93.6,83.0,98.7,58.7,37.3,23.4,30.5,28.9
2,16.6,22,154.0,66.25,34.0,95.8,87.9,99.2,59.6,38.9,24.0,28.8,25.2
3,18.2,26,184.75,72.25,37.4,101.8,86.4,101.2,60.1,37.3,22.8,32.4,29.4
4,17.7,24,184.25,71.25,34.4,97.3,100.0,101.9,63.2,42.2,24.0,32.2,27.7


In [27]:
# Preview the target values.
y.head()

0    12.3
1     6.1
2    25.3
3    10.4
4    28.7
Name: BodyFat, dtype: float64

In [28]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical across notebook re-runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [29]:
# Shapes of the four split pieces, in the order: x_train, x_test, y_train, y_test.
for split_part in (x_train, x_test, y_train, y_test):
    print(split_part.shape)

(201, 13)
(51, 13)
(201,)
(51,)


In [30]:
# Re-attach the target as the last column so each split can be written out as a
# single CSV (the training script treats the last column as the label).
#
# DataFrame.assign returns a new frame, so x_train / x_test are left untouched
# and re-running this cell always yields the same result. The earlier
# pd.DataFrame(x_train) + column assignment did not copy the data and could
# modify the split frames in place.
trainX = x_train.assign(**{label: y_train})

testX = x_test.assign(**{label: y_test})

In [31]:
# Each combined frame should have one more column than its feature-only split.
for combined in (trainX, testX):
    print(combined.shape)

(201, 14)
(51, 14)


Start of AWS Section

In [32]:
# Write both combined splits to local CSV files (row index omitted) so they can
# be uploaded to S3 in the next step.
for split_df, csv_name in ((trainX, "train-v-1.csv"), (testX, "test-v-1.csv")):
    split_df.to_csv(csv_name, index=False)

In [None]:
# Upload both CSVs to the bucket under one shared key prefix. The values
# returned by upload_data are printed here and later passed to fit() as the
# "train" / "test" input channels.
sk_prefix = "#######"

trainpath = sess.upload_data(path="train-v-1.csv", bucket=bucket, key_prefix=sk_prefix)
testpath = sess.upload_data(path="test-v-1.csv", bucket=bucket, key_prefix=sk_prefix)

for uploaded_location in (trainpath, testpath):
    print(uploaded_location)

In [40]:
%%writefile script.py

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import sklearn
import joblib
import boto3
import pathlib
from io import StringIO
import argparse
import os
import numpy as np
import pandas as pd

def model_fn(model_dir):
    """Load and return the persisted model stored in ``model_dir``.

    Reads the file ``model.joblib`` inside ``model_dir`` -- the same file name
    the training entry point in this script writes -- and returns the
    deserialized object.

    NOTE(review): presumably this is the hook the SageMaker scikit-learn
    serving container calls to load the model; confirm against the SDK docs.
    """
    model_file = os.path.join(model_dir, "model.joblib")
    return joblib.load(model_file)

if __name__ == "__main__":
    # Training entry point: parse arguments, load the train/test CSVs, fit a
    # random forest regressor, persist it to the model directory, and report the
    # mean squared error on the test split.

    print("[INFO] Extracting arguments")
    parser = argparse.ArgumentParser()

    # data, model, and other output directories
    # (defaults are taken from the SM_* environment variables when present)
    parser.add_argument("--model-dir", type=str, default=os.environ.get('SM_MODEL_DIR'))
    parser.add_argument("--train", type=str, default=os.environ.get('SM_CHANNEL_TRAIN'))
    parser.add_argument("--test", type=str, default=os.environ.get('SM_CHANNEL_TEST'))
    parser.add_argument("--train-file", type=str, default="train-v-1.csv")
    parser.add_argument("--test-file", type=str, default="test-v-1.csv")

    # hyperparameters; the fixed default seed makes repeated training runs reproducible
    parser.add_argument("--n-estimators", type=int, default=100)
    parser.add_argument("--random-state", type=int, default=0)

    # parse_known_args: ignore any extra arguments this script does not declare
    args, _ = parser.parse_known_args()

    # Fail early with a clear message instead of an opaque TypeError from
    # os.path.join when a directory was neither passed nor set in the environment.
    for flag, value in (
        ("--model-dir", args.model_dir),
        ("--train", args.train),
        ("--test", args.test),
    ):
        if not value:
            parser.error(f"{flag} is required (or set the corresponding SM_* environment variable)")

    print("SKLearn Version: ", sklearn.__version__)
    print("Joblib Version: ", joblib.__version__)

    print("[INFO] Reading Data")
    print()
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))

    # The last column of the training CSV is the label; all others are features.
    features = list(train_df.columns)
    label = features.pop(-1)

    print("[INFO] Building Datasets")
    print()

    x_train = train_df[features]
    x_test = test_df[features]
    y_train = train_df[label]
    y_test = test_df[label]

    print("[INFO] Displaying Columns")
    print(features)
    print()

    print("[INFO] Displaying Data Shape")
    print()
    print("----- TRAIN -----")
    print(x_train.shape)
    print(y_train.shape)
    print()
    print("------ TEST -----")
    print(x_test.shape)
    print(y_test.shape)
    print()

    print("[INFO] Training Model")
    print()
    model = RandomForestRegressor(
        n_estimators=args.n_estimators,
        random_state=args.random_state,
    )
    model.fit(x_train, y_train)
    print()

    # Persist the fitted model under the file name that model_fn() loads.
    model_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, model_path)
    print("Model persisted at " + model_path)
    print()

    # Evaluate on the held-out test split (regression error, not accuracy).
    print()
    print("----- METRICS -----")
    print()
    y_pred_test = model.predict(x_test)
    mse = mean_squared_error(y_test, y_pred_test)
    print(f"Mean Squared Error (MSE): {mse}")

Overwriting script.py


In [41]:
from sagemaker.sklearn.estimator import SKLearn

# scikit-learn framework version; reused below when the model is re-created for hosting.
FRAMEWORK_VERSION = "0.23-1"

# Training job definition: runs script.py on a single ml.m5.large spot instance.
# NOTE(review): role is an empty string here -- presumably redacted; confirm a
# valid IAM role is supplied before running.
sklearn_estimator = SKLearn(
    # what to run
    entry_point="script.py",
    framework_version=FRAMEWORK_VERSION,
    # who runs it / how the job is named
    role="",
    base_job_name="########",
    # where it runs
    instance_type="ml.m5.large",
    instance_count=1,
    # spot training with its time limits (seconds)
    use_spot_instances=True,
    max_run=3600,
    max_wait=7200,
)

In [None]:
# Launch the training job with the uploaded CSVs as the "train" and "test" channels.
# wait=True makes this call block until the job has finished (it is not asynchronous).
sklearn_estimator.fit({"train": trainpath, "test": testpath}, wait=True)

In [None]:
# Make sure the most recent training job has finished, then ask the SageMaker
# API where the trained model artifact was stored.
training_job = sklearn_estimator.latest_training_job
training_job.wait(logs="None")

job_description = sm_boto3.describe_training_job(TrainingJobName=training_job.name)
artifact = job_description["ModelArtifacts"]["S3ModelArtifacts"]

print("[INFO] Model Artifact Persisted: " + artifact)

In [None]:
# Show the model artifact location retrieved above.
artifact

In [45]:
from sagemaker.sklearn.model import SKLearnModel
from time import gmtime, strftime

# Timestamped name for the hosted model. "%S" (zero-padded seconds) replaces the
# original "%s", which is not a standard strftime directive and behaves
# differently (or fails) depending on the platform.
model_name = "Custom-SKLearn-Model-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())

# Wrap the trained artifact as a deployable model; script.py supplies model_fn
# for loading it, and the framework version matches the one used for training.
model = SKLearnModel(
    name = model_name,
    model_data=artifact,
    role="####",
    entry_point="script.py",
    framework_version=FRAMEWORK_VERSION,
)

In [None]:
# Show the generated model name.
model_name

In [None]:
# Timestamped endpoint name. "%S" (zero-padded seconds) replaces the original
# "%s", which is not a standard strftime directive and behaves differently
# (or fails) depending on the platform.
endpoint_name = "Custom-SKLearn-Model-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
print("EndpointName={}".format(endpoint_name))

# Deploy the model behind a real-time endpoint on a single ml.m4.xlarge instance.
predictor = model.deploy(
    initial_instance_count=1,
    instance_type="ml.m4.xlarge",
    endpoint_name=endpoint_name,
)

In [None]:
# Show the generated endpoint name.
endpoint_name

In [50]:
# Smoke-test the endpoint: send the first two test rows (feature columns only,
# as a nested list) and print the returned predictions.
sample_rows = testX[features].iloc[:2]
print(predictor.predict(sample_rows.to_numpy().tolist()))

[ 7.062 21.392]


In [None]:
# Clean up: delete the deployed endpoint by name.
# NOTE(review): this call targets only the endpoint itself -- confirm whether the
# endpoint configuration and model created by deploy() also need to be removed.
sm_boto3.delete_endpoint(EndpointName=endpoint_name)