In [1]:
from functions import *

import os
import pandas as pd
import numpy as np
import sagemaker

from sagemaker.sklearn.estimator import SKLearn
from sagemaker import get_execution_role

# Load the project settings used by the rest of the notebook: the S3 bucket
# and project path, the MLflow identifiers and the training job base name.
# NOTE(review): read_settings is not imported by name - presumably it comes
# from the star-import of `functions`; confirm.
settings = read_settings()

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [2]:
# Create the folder for the training code; the %%writefile cells write
# requirements.txt and train.py into it. exist_ok=True keeps the cell
# re-runnable when the folder already exists.
os.makedirs("training_script", exist_ok=True) # Create folder for training code

In [3]:
%%writefile training_script/requirements.txt

# Pinned extra packages for the training job; the job log shows them being installed via `pip install -r requirements.txt`.
mlflow==2.13.2
sagemaker-mlflow==0.1.0

Overwriting training_script/requirements.txt


In [4]:
%%writefile training_script/train.py

from __future__ import print_function

import argparse
import joblib
import os
import pandas as pd
import tarfile
from sklearn.linear_model import LogisticRegression
import mlflow
import shutil

def select_training_file(train_dir):
    """Return the path of the training file stored in ``train_dir``.

    The channel is expected to hold a single training file. To stay
    deterministic when that assumption is violated, hidden entries and
    sub-directories are ignored and the alphabetically first remaining
    file is used.

    Args:
        train_dir: Directory in which to look for the training file.

    Returns:
        Path (``train_dir`` joined with the file name) of the selected file.

    Raises:
        FileNotFoundError: If ``train_dir`` contains no regular, non-hidden file.
    """
    candidates = sorted(
        name
        for name in os.listdir(train_dir)
        if not name.startswith('.')
        and os.path.isfile(os.path.join(train_dir, name))
    )
    if not candidates:
        raise FileNotFoundError(
            "No training file found in {}".format(train_dir)
        )
    return os.path.join(train_dir, candidates[0])


if __name__ == '__main__':
    model_dir = os.environ['SM_MODEL_DIR'] # Folder where model must be saved
    train_dir = os.environ['SM_CHANNEL_TRAIN'] # Folder where train data is stored
    output_dir = os.environ['SM_OUTPUT_DATA_DIR'] # Used only as scratch space for the tarball

    # Only one training file is expected; the helper picks it deterministically.
    train_file_path = select_training_file(train_dir)
    train_data = pd.read_csv(train_file_path, engine="python")

    # Labels are in the first column, features in the remaining ones.
    train_y = train_data.iloc[:, 0]
    train_X = train_data.iloc[:, 1:]

    mlflow.set_tracking_uri(os.environ['mlflow_arn'])
    mlflow.set_experiment(os.environ['mlflow_experiment_name'])

    # Keep a handle on the run so its id can be used for registration below
    # without relying on mlflow.last_active_run().
    with mlflow.start_run(run_name=os.environ['mlflow_final_model_name']) as run:
        mlflow.set_tag("training_job_name", os.environ['TRAINING_JOB_NAME'])

        clf = LogisticRegression(max_iter=100)
        clf = clf.fit(train_X, train_y)

        mlflow.sklearn.log_model(clf, "model")

        # Persist the fitted model into the SageMaker model directory.
        model_path = os.path.join(model_dir, "model.pkl")
        joblib.dump(clf, model_path)

        # Pack model.pkl into model.tar.gz and log it as an MLflow artifact.
        # The tarball is temporary: remove it even when packing or logging
        # fails so it is never left behind in the output directory.
        tar_path = os.path.join(output_dir, "model.tar.gz")
        try:
            with tarfile.open(tar_path, "w:gz") as tar:
                tar.add(model_path, arcname=os.path.basename(model_path))
            mlflow.log_artifact(tar_path, artifact_path="model")
        finally:
            if os.path.exists(tar_path):
                os.remove(tar_path)

    # Register the logged model (artifact path "model" of this run) with MLflow.
    artifact_path = "model"
    model_uri = "runs:/{run_id}/{artifact_path}".format(
        run_id=run.info.run_id, artifact_path=artifact_path
    )
    mlflow.register_model(model_uri=model_uri, name=os.environ['mlflow_model_name'])

Overwriting training_script/train.py


In [5]:
# Session object handed to the estimator, and the execution role the
# training job will run under.
sagemaker_session = sagemaker.Session()
role = get_execution_role()

In [6]:
# Environment variables handed to the training job; train.py reads them via
# os.environ to configure MLflow tracking, the run name and the registered
# model name.
environment = dict(
    mlflow_arn=settings['mlflow_arn'],
    mlflow_experiment_name=settings['mlflow_experiment_name'],
    mlflow_final_model_name='final-model2',
    mlflow_model_name=settings['mlflow_model_name'],
)

# Estimator describing the SageMaker training job.
sklearn = SKLearn(
    # Training code: train.py inside the training_script folder.
    source_dir='training_script',
    entry_point='train.py',
    # Version of SKLearn and the instance type that will be used.
    framework_version='1.2-1',
    instance_type='ml.m5.large',
    # Role and session used during execution.
    role=role,
    sagemaker_session=sagemaker_session,
    # Base name of the training job; a timestamp is added as a suffix.
    base_job_name=settings['training_job_name'],
    environment=environment,
)

In [7]:
# Read the training set from S3, keep only the columns used for training and
# write the trimmed copy under data/train_job/ for the training job.
train_s3_key = "{}/data/train/train.csv".format(settings['project_path_s3'])
train_df = read_csv_from_s3(settings['bucket_name'], train_s3_key)

# 'target' goes first because train.py treats the first column as the label.
train_columns = ['target', "col_1", "col_2", "col_3", "col_6", "col_7", "col_8"]
train_df = train_df[train_columns]

s3_key = "{}/data/train_job/train.csv".format(settings['project_path_s3'])
save_df_to_s3(
    df=train_df,
    bucket_name=settings['bucket_name'],
    s3_key=s3_key,
    decimal_places=5,
)

In [8]:
# Launch the training job. The "train" channel points at the S3 prefix that
# holds the trimmed train.csv; train.py reads the channel's local folder from
# SM_CHANNEL_TRAIN.
train_channel_uri = "s3://{bucket}/{prefix}/data/train_job/".format(
    bucket=settings['bucket_name'],
    prefix=settings['project_path_s3'],
)
sklearn.fit({"train": train_channel_uri})

INFO:sagemaker:Creating training-job with name: 01-churn-2024-07-12-14-06-35-035


2024-07-12 14:06:35 Starting - Starting the training job...
2024-07-12 14:06:53 Starting - Preparing the instances for training...
2024-07-12 14:07:18 Downloading - Downloading input data...
2024-07-12 14:07:43 Downloading - Downloading the training image......
2024-07-12 14:08:43 Training - Training image download completed. Training in progress.[34m2024-07-12 14:08:48,162 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2024-07-12 14:08:48,166 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2024-07-12 14:08:48,170 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2024-07-12 14:08:48,260 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2024-07-12 14:08:48,499 sagemaker-training-toolkit INFO     Installing dependencies from requirements.txt:[0m
[34m/miniconda3/bin/python -m pip install -r requirements.txt[0m
