In [2]:
import os
import time
import boto3
import numpy as np
import pandas as pd
import sagemaker
from sagemaker import get_execution_role
from sagemaker.workflow.pipeline_context import PipelineSession

In [3]:
# A single boto3 session backs the SageMaker client, the SageMaker SDK session
# and the region lookup below.
sess = boto3.Session()
sm = sess.client("sagemaker")
role = get_execution_role()
sagemaker_session = sagemaker.Session(boto_session=sess)
bucket = sagemaker_session.default_bucket()
# Read the region from the shared session instead of constructing a second,
# throwaway boto3.Session() just for this attribute.
region = sess.region_name

# print("bucket name: ", bucket, "\nregion: ", region)

# PipelineSession constructed with its defaults.
pipeline_session = PipelineSession()

model_package_group_name = "PipelineModelPackageGroup"
prefix = "pipeline-model-example"  # leading part of the S3 key prefix for uploaded data
pipeline_name = 'serial-inference-pipeline'   # SageMaker pipeline name


### Download California Housing dataset and upload to Amazon S3
Dataset source: <https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html>

In [4]:
# Resolve both working directories against the notebook's current directory,
# then create whichever of them does not exist yet.
cwd = os.getcwd()
data_dir = os.path.join(cwd, "data")
raw_dir = os.path.join(cwd, "data/raw")

for directory in (data_dir, raw_dir):
    os.makedirs(directory, exist_ok=True)


In [None]:
# This works, but it is far too slow -- the download hangs.
# Left disabled: the data-loading cell reads CaliforniaHousing/cal_housing.data
# from local disk instead of fetching the archive from S3.
# s3 = boto3.client("s3")
# s3.download_file(
#     f"sagemaker-example-files-prod-{region}",
#     "datasets/tabular/california_housing/cal_housing.tgz",
#     "cal_housing2.tgz",
# )

In [5]:
# Column names assigned to the header-less cal_housing.data file.
columns = [
  "longitude",
  "latitude",
  "housingMedianAge",
  "totalRooms",
  "totalBedrooms",
  "population",
  "households",
  "medianIncome",
  "medianHouseValue",
]

# Divisor applied to the target column (named instead of an inline magic number).
TARGET_SCALE = 500000

# Read the local copy of the dataset; the file has no header row, so the
# names above are supplied explicitly.
cal_housing_df = pd.read_csv("CaliforniaHousing/cal_housing.data", names=columns, header=None)

# Scaling target down to avoid overcomplicating the example
cal_housing_df["medianHouseValue"] /= TARGET_SCALE

# Write the full raw dataset into raw_dir (the data/raw directory created in
# an earlier cell) rather than repeating the path as a string literal.
raw_data_path = os.path.join(raw_dir, "raw_data_all.csv")
cal_housing_df.to_csv(raw_data_path, header=True, index=False)

# Upload the whole raw directory to the session's default bucket under
# "<prefix>/data/raw" and show the resulting S3 URI.
rawdata_s3_prefix = f"{prefix}/data/raw"
raw_s3 = sagemaker_session.upload_data(path=raw_dir, key_prefix=rawdata_s3_prefix)
print(raw_s3)


s3://sagemaker-ap-northeast-2-532805286864/pipeline-model-example/data/raw


### Define Parameters to Parametrize Pipeline Execution

Define Pipeline parameters that you can use to parameterize the pipeline. Parameters enable custom pipeline execution and schedules without having to modify the Pipeline Definition.

The supported parameter types include:
- ParameterString - represents a str Python type
- ParameterInteger - represents an int Python type
- ParameterFloat - represents a float Python type

In [6]:
from sagemaker.workflow.parameters import (
    ParameterFloat,
    ParameterInteger,
    ParameterString,
)

# --- Input data ---
# Raw input data location; defaults to raw_s3.
input_data = ParameterString(name="InputData", default_value=raw_s3)

# --- Model registry ---
# Status of the newly trained model in the registry.
model_approval_status = ParameterString(
    name="ModelApprovalStatus",
    default_value="Approved",
)

# --- Processing step ---
processing_instance_type = ParameterString(
    name="ProcessingInstanceType",
    default_value="ml.m5.xlarge",
)
processing_instance_count = ParameterInteger(
    name="ProcessingInstanceCount",
    default_value=1,
)

# --- Training step ---
training_instance_type = ParameterString(
    name="TrainingInstanceType",
    default_value="ml.m5.xlarge",
)
# Note: the epoch count is declared as a string parameter, not an integer.
training_epochs = ParameterString(
    name="TrainingEpochs",
    default_value="100",
)

# --- Model performance step ---
accuracy_mse_threshold = ParameterFloat(
    name="AccuracyMseThreshold",
    default_value=0.75,
)

### Define a Processing Step for Feature Engineering
The preprocessing script below, in addition to creating a scaler, contains the necessary functions for it to be deployed as part of a pipeline model.

In [7]:
# Create the local "code" directory; a no-op if it already exists.
# Uses os.makedirs (as the data directories do) instead of shelling out to
# mkdir, so the cell does not depend on a POSIX shell being available.
os.makedirs("code", exist_ok=True)

In [None]:
#%%writefile code/preprocessing.py
# NOTE(review): the cell magic above is commented out, so this cell writes
# nothing. To use it, uncomment it and keep it as the very first line of the
# cell that holds the script body -- presumably the preprocessing code; confirm.

In [None]:
# Select a subset of cal_housing_df columns into X.
# Fix: the last name was spelt "medianHouseVAlue", which is not a column of
# cal_housing_df and raised KeyError.
# NOTE(review): "totalRooms" is not selected while the target column
# "medianHouseValue" is -- confirm that both choices are intended.
X = cal_housing_df[[
  "longitude",
  "latitude",
  "housingMedianAge",
  "totalBedrooms",
  "population",
  "households",
  "medianIncome",
  "medianHouseValue",
]]