In [1]:
import sagemaker
import boto3
import os

In [2]:
# S3 bucket that holds the prepared datasets and the tuning artefacts.
BUCKET = 'capstone-khoivn'

# Feature-engineered train / test CSVs, plus the model artefact location.
input_train = f"s3://{BUCKET}/data/df_train_feature_engineering_smote_selection.csv"
input_test = f"s3://{BUCKET}/data/df_test_feature_engineering_selection.csv"
model_output_dir = f"s3://{BUCKET}/model/hp-tuning/model.tar.gz"

# Export the locations under the SM_* variable names in this kernel's
# environment.
# NOTE(review): nothing in the visible cells reads these variables back, and
# the fit() calls pass the S3 URIs explicitly via `inputs=`. Presumably they
# exist so the training script can be run locally -- confirm, otherwise they
# are dead configuration.
os.environ.update(
    {
        "SM_MODEL_DIR": model_output_dir,
        "SM_CHANNEL_TRAIN": input_train,
        "SM_CHANNEL_TEST": input_test,
    }
)

In [3]:
from sagemaker.tuner import (
    IntegerParameter,
    CategoricalParameter,
    ContinuousParameter,
    HyperparameterTuner,
)

# --- Tuning objective -------------------------------------------------------
# The tuner maximises the metric named below. Its value is scraped from the
# training job's log output with the regex, which captures the number that
# follows "CV F1-score: ".
# NOTE(review): presumably training/train_model_rf.py prints a line in exactly
# that format -- confirm against the script.
objective_metric_name = "cv f1-score"
objective_type = "Maximize"
metric_definitions = [
    {
        "Name": objective_metric_name,
        "Regex": "CV F1-score: ([0-9\\.]+)",
    },
]

# --- Search space -----------------------------------------------------------
# Each hyperparameter is declared as a categorical choice over a fixed list of
# values. The tuner reports the chosen values back as quoted strings
# (e.g. '"50"'), as the best_estimator.hyperparameters() output shows.
hyperparameter_ranges = {
    "n_estimators": CategoricalParameter([100, 200, 300]),
    "max_depth": CategoricalParameter([10, 20, 30, 40, 50]),
    "min_samples_split": CategoricalParameter([2, 10, 20]),
}

# Execution role passed to the estimator that runs the training jobs.
role = sagemaker.get_execution_role()

In [29]:
from sagemaker.sklearn.estimator import SKLearn

# Base scikit-learn estimator: every training job (stand-alone or launched by
# the tuner) runs training/train_model_rf.py on a single ml.c4.xlarge instance.
estimator = SKLearn(
    entry_point="training/train_model_rf.py",
    framework_version="1.2-1",
    py_version='py3',
    role=role,
    instance_type="ml.c4.xlarge",
    instance_count=1,
)

# Tuner wrapping the estimator above. It searches `hyperparameter_ranges` for
# the configuration with the best `objective_metric_name`, using a budget of
# 4 training jobs in total with 2 running concurrently.
tuner = HyperparameterTuner(
    estimator=estimator,
    hyperparameter_ranges=hyperparameter_ranges,
    objective_metric_name=objective_metric_name,
    objective_type=objective_type,
    metric_definitions=metric_definitions,
    max_jobs=4,
    max_parallel_jobs=2,
)



In [25]:
# Launch one stand-alone training job with the train/test channels and block
# until it finishes (the container log is streamed into the cell output).
estimator.fit(
    inputs={"train": input_train, "test": input_test},
    wait=True,
)

INFO:sagemaker:Creating training-job with name: sagemaker-scikit-learn-2023-09-23-12-38-36-117


Using provided s3_resource
2023-09-23 12:38:36 Starting - Starting the training job...
2023-09-23 12:38:50 Starting - Preparing the instances for training.........
2023-09-23 12:40:20 Downloading - Downloading input data...
2023-09-23 12:40:40 Training - Downloading the training image..
2023-09-23 12:41:17,137 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training
2023-09-23 12:41:17,140 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2023-09-23 12:41:17,149 sagemaker_sklearn_container.training INFO     Invoking user training script.
2023-09-23 12:41:17,355 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2023-09-23 12:41:17,366 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2023-09-23 12:41:17,378 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
[34m2023-09-23 12:4

In [30]:
# Start the hyperparameter tuning job on the same train/test channels and
# block until the whole tuning job has completed.
tuner.fit(
    inputs={"train": input_train, "test": input_test},
    wait=True,
)

INFO:sagemaker:Creating hyperparameter tuning job with name: sagemaker-scikit-lea-230923-1308


Using provided s3_resource
...............................................!


In [31]:
# Show the name of the training job the tuner selected as best.
tuner.best_training_job()

'sagemaker-scikit-lea-230923-1308-003-1aa0f709'

In [32]:
# Get an estimator for the tuner's best training job, then display the
# hyperparameters recorded for it (the tuned values plus the sagemaker_*
# bookkeeping entries).
best_estimator = tuner.best_estimator()
best_estimator.hyperparameters()




2023-09-23 13:12:56 Starting - Found matching resource for reuse
2023-09-23 13:12:56 Downloading - Downloading input data
2023-09-23 13:12:56 Training - Training image download completed. Training in progress.
2023-09-23 13:12:56 Uploading - Uploading generated training model
2023-09-23 13:12:56 Completed - Resource retained for reuse


{'_tuning_objective_metric': '"cv f1-score"',
 'max_depth': '"50"',
 'min_samples_split': '"10"',
 'n_estimators': '"200"',
 'sagemaker_container_log_level': '20',
 'sagemaker_estimator_class_name': '"SKLearn"',
 'sagemaker_estimator_module': '"sagemaker.sklearn.estimator"',
 'sagemaker_job_name': '"sagemaker-scikit-learn-2023-09-23-13-08-57-541"',
 'sagemaker_program': '"train_model_rf.py"',
 'sagemaker_region': '"us-east-1"',
 'sagemaker_submit_directory': '"s3://sagemaker-us-east-1-700736504781/sagemaker-scikit-learn-2023-09-23-13-08-57-541/source/sourcedir.tar.gz"'}

In [33]:
# Keep the best job's hyperparameter dict in a variable and display it.
# Note that the tuned values come back as quoted strings (e.g. '"50"'),
# not as ints.
hyperparameters = best_estimator.hyperparameters()
hyperparameters

{'_tuning_objective_metric': '"cv f1-score"',
 'max_depth': '"50"',
 'min_samples_split': '"10"',
 'n_estimators': '"200"',
 'sagemaker_container_log_level': '20',
 'sagemaker_estimator_class_name': '"SKLearn"',
 'sagemaker_estimator_module': '"sagemaker.sklearn.estimator"',
 'sagemaker_job_name': '"sagemaker-scikit-learn-2023-09-23-13-08-57-541"',
 'sagemaker_program': '"train_model_rf.py"',
 'sagemaker_region': '"us-east-1"',
 'sagemaker_submit_directory': '"s3://sagemaker-us-east-1-700736504781/sagemaker-scikit-learn-2023-09-23-13-08-57-541/source/sourcedir.tar.gz"'}