In [1]:
import os

import boto3
import sagemaker
import sagemaker.session
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.workflow.steps import TrainingStep

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [2]:
# Session handed to the estimator further down, so that its fit() call yields
# step arguments for the pipeline.
pipeline_session = PipelineSession()

# Regular SageMaker session, used here to look up the default bucket.
sagemaker_session = sagemaker.Session()
default_bucket = sagemaker_session.default_bucket()

# Execution role and region of the current environment.
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

In [3]:
# Folder that will hold the training code; a no-op when it already exists.
os.makedirs(name="02_simple_training_pipeline", exist_ok=True)

In [4]:
%%writefile 02_simple_training_pipeline/train.py

"""Training entry point: fits a decision tree on the CSV found in the train channel.

Reads the model output folder from SM_MODEL_DIR and the training data folder
from SM_CHANNEL_TRAIN, fits a DecisionTreeClassifier and stores it as
``model.joblib`` inside the model folder.
"""

import argparse
import joblib
import os
import pandas as pd

from sklearn import tree


def _parse_args():
    """Parse hyperparameters from the command line.

    SageMaker script mode forwards estimator hyperparameters to the script as
    ``--name value`` arguments. Every option has a default, so the script still
    runs when no hyperparameters are supplied.

    Returns:
        argparse.Namespace with ``max_leaf_nodes`` and ``random_state``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--max_leaf_nodes", type=int, default=30)
    # A fixed seed makes repeated runs on the same data produce the same tree.
    parser.add_argument("--random_state", type=int, default=0)
    # Tolerate arguments this script does not know about instead of aborting.
    args, _unknown = parser.parse_known_args()
    return args


def _find_train_file(train_dir):
    """Return the path of the training file inside ``train_dir``.

    Only regular files are considered, and the alphabetically first one is
    chosen so the selection does not depend on directory listing order.

    Raises:
        FileNotFoundError: if ``train_dir`` contains no regular file.
    """
    file_names = sorted(
        name for name in os.listdir(train_dir)
        if os.path.isfile(os.path.join(train_dir, name))
    )
    if not file_names:
        raise FileNotFoundError(f"No training file found in {train_dir!r}")
    return os.path.join(train_dir, file_names[0])


if __name__ == '__main__':
    args = _parse_args()

    model_dir = os.environ['SM_MODEL_DIR']  # Folder where model must be saved
    train_dir = os.environ['SM_CHANNEL_TRAIN']  # Folder where train data is stored

    # Let's assume there is only one training file
    train_file_path = _find_train_file(train_dir)

    # The file has no header row.
    # NOTE(review): engine="python" is kept from the original; confirm it is needed.
    train_data = pd.read_csv(train_file_path, header=None, engine="python")

    # Labels are in the first column, features in the remaining ones
    train_y = train_data.iloc[:, 0]
    train_X = train_data.iloc[:, 1:]

    # Train the model
    clf = tree.DecisionTreeClassifier(
        max_leaf_nodes=args.max_leaf_nodes,
        random_state=args.random_state,
    )
    clf = clf.fit(train_X, train_y)

    # Save model object
    joblib.dump(clf, os.path.join(model_dir, "model.joblib"))

Overwriting 02_simple_training_pipeline/train.py


In [5]:
# Estimator describing the training job; it is bound to the pipeline session.
sklearn = SKLearn(
    entry_point='train.py',  # The file with the training code
    source_dir='02_simple_training_pipeline',  # The folder with the training code
    framework_version='1.2-1',  # Version of SKLearn which will be used
    instance_type='ml.m5.large',  # Instance type that will be used
    role=role,  # Role that will be used during execution
    sagemaker_session=pipeline_session,
    # Base name for the training job. SageMaker job names may only contain
    # letters, digits and hyphens, so the base name must not use underscores.
    base_job_name='02-simple-training-pipeline',
)

In [6]:
# Point the "train" channel at its S3 prefix and keep what fit() returns as the
# arguments for the training step.
train_args = sklearn.fit(
    inputs={"train": "s3://sagemaker-bucket-ds/training-jobs/data/train/"},
)



In [7]:
# Pipeline step built from the arguments returned by the fit() call.
step_train = TrainingStep(name="SimpleTrain", step_args=train_args)

In [8]:
# Assemble the pipeline from its single training step.
pipeline_name = "02-simple-training-pipeline"
pipeline = Pipeline(
    name=pipeline_name,
    steps=[step_train],
)

In [9]:
# Create the pipeline in SageMaker, or update it when one with this name already
# exists. The service response is shown as the cell output.
pipeline.upsert(role_arn=role)



{'PipelineArn': 'arn:aws:sagemaker:eu-west-1:211125740051:pipeline/02-simple-training-pipeline',
 'ResponseMetadata': {'RequestId': '173ffb17-71e6-462f-bfd5-3513b686523f',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '173ffb17-71e6-462f-bfd5-3513b686523f',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '95',
   'date': 'Thu, 11 Jul 2024 12:53:58 GMT'},
  'RetryAttempts': 0}}

In [10]:
# Start a run of the pipeline and keep the returned handle in `execution`.
execution = pipeline.start()