# Amazon SageMaker AI: Dry Bean Classification Problem
You will learn
 - How to use SageMaker AI to train a model from within a notebook.
 - AWS -> SageMaker AI -> Create a Domain -> Open Studio -> Create a JupyterLab space with a machine
 - In the JupyterLab space: create a notebook -> load the dataset -> train the model in the notebook itself

references: https://www.datacamp.com/tutorial/aws-sagemaker-tutorial

references: https://sagemaker-examples.readthedocs.io/en/latest/sagemaker-python-sdk/scikit_learn_iris/scikit_learn_estimator_example_with_batch_transform_outputs.html#upload_data

## Issues Encountered
### Couldn't access/read files from user-created (new) buckets
The reason was that S3 read access was not allowed for the role that executes the code.
- Role: A role is a kind of hat that a user assumes to execute a task. It contains all the necessary permissions to the resources needed to execute the task.
- Policy: Each role has a policy attached as a JSON document. You can edit it to grant the necessary permissions, or use one of the pre-defined (AWS managed) policies.

To fix this issue, display the role using `get_execution_role(sagemaker_session=sagemaker.Session())` and attach the AWS managed policy "AmazonS3FullAccess" to that role. This gives access to all the buckets for the current user. If you want to give access to a selected bucket only, either edit the policy attached to the role or create a custom managed policy, for example:


```json
{
	"Version": "2012-10-17",
	"Statement": [
		{
			"Action": [
				"s3:ListBucket"
			],
			"Effect": "Allow",
			"Resource": [
				"arn:aws:s3:::SageMaker",
				"*"
			]
		},
		{
			"Action": [
				"s3:GetObject",
				"s3:PutObject",
				"s3:DeleteObject"
			],
			"Effect": "Allow",
			"Resource": [
				"arn:aws:s3:::SageMaker/*",
				"arn:aws:s3:::{bucket_name}/*"
			]
		}
	]
}
```

Replace `{bucket_name}` with the name of the bucket you want to grant access to.


In [1]:
! pip install --upgrade sagemaker

Collecting sagemaker
  Downloading sagemaker-2.243.0-py3-none-any.whl.metadata (16 kB)
Downloading sagemaker-2.243.0-py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m86.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sagemaker
  Attempting uninstall: sagemaker
    Found existing installation: sagemaker 2.240.0
    Uninstalling sagemaker-2.240.0:
      Successfully uninstalled sagemaker-2.240.0
Successfully installed sagemaker-2.243.0


In [4]:
import pandas as pd 
import numpy as np 
import boto3      # aws python sdk to access other qws services. 
import sagemaker  # sagemaker python sdk to access other sagemaker services

In [7]:
# Session: manages interactions with the Amazon SageMaker APIs and any other AWS services needed.
sess = sagemaker.Session()  # SageMaker Python SDK entry point for talking to the SageMaker service

# get_execution_role is reached through the sagemaker package: the notebook only
# does "import sagemaker", so the bare name get_execution_role would raise NameError.
ROLE = sagemaker.get_execution_role(sagemaker_session=sess)
BUCKET = sess.default_bucket()          # default bucket of this session
REGION = sess.boto_session.region_name  # region of the underlying boto3 session

print(ROLE)
print(REGION)
print(BUCKET)

arn:aws:iam::205930620783:role/service-role/AmazonSageMaker-ExecutionRole-20250401T145997
us-east-1
sagemaker-us-east-1-205930620783


In [2]:
# Low-level boto3 (AWS Python SDK) client for the SageMaker service API.
sagemaker_client = boto3.client(service_name="sagemaker")

arn:aws:iam::205930620783:role/service-role/AmazonSageMaker-ExecutionRole-20250401T145997
us-east-1


In [3]:
# Bucket and prefix names used by the rest of the notebook.
# The default SageMaker bucket could be used instead:
#BUCKET_NAME = "sagemaker-us-east-1-205930620783"
BUCKET_NAME = "dry-bean-classification-problem-usa-east1"
BUCKET_URI = f"s3://{BUCKET_NAME}"

DATASET_PREFIX = "dataset"
MODELS_PREFIX = "models"

TARGET_VAR = "Class"

# Without editing the execution role's policy, only the default SageMaker bucket
# can be read and written; listing the user-created bucket needs the extra
# S3 permissions described at the top of the notebook.
print(sess.list_s3_files("sagemaker-us-east-1-205930620783", "dry-bean-classification-problem/"))
print("Bucket Items")
print(sess.list_s3_files(BUCKET_NAME, f"{DATASET_PREFIX}/"))

['dry-bean-classification-problem/', 'dry-bean-classification-problem/dataset/', 'dry-bean-classification-problem/dataset/Dry_Bean_Dataset.csv', 'dry-bean-classification-problem/dataset/dry-bean-test.csv', 'dry-bean-classification-problem/dataset/dry-bean-train.csv', 'dry-bean-classification-problem/dataset/models/sagemaker-scikit-learn-2025-04-02-03-42-46-014/debug-output/training_job_end.ts', 'dry-bean-classification-problem/dataset/models/sagemaker-scikit-learn-2025-04-02-03-42-46-014/profiler-output/framework/training_job_end.ts', 'dry-bean-classification-problem/dataset/models/sagemaker-scikit-learn-2025-04-02-03-42-46-014/profiler-output/system/incremental/2025040203/1743565380.algo-1.json', 'dry-bean-classification-problem/dataset/models/sagemaker-scikit-learn-2025-04-02-03-42-46-014/profiler-output/system/incremental/2025040203/1743565440.algo-1.json', 'dry-bean-classification-problem/dataset/models/sagemaker-scikit-learn-2025-04-02-03-42-46-014/profiler-output/system/training_

### Read train and test data from S3

In [None]:
# Load the train and test splits straight from the project bucket.
train_uri = f'{BUCKET_URI}/{DATASET_PREFIX}/dry-bean-train.csv'
test_uri = f'{BUCKET_URI}/{DATASET_PREFIX}/dry-bean-test.csv'
df_train = pd.read_csv(train_uri)
df_test = pd.read_csv(test_uri)

# You can also write a file to S3 directly instead of uploading the data:
#df_test.to_csv(f'{BUCKET_URI}/{DATASET_PREFIX}/dry-bean-test2.csv')

# Split features from the label; TARGET_VAR is the label column name declared
# with the other notebook constants, so it is not repeated as a literal here.
X_train, y_train = df_train.drop(columns=[TARGET_VAR]).values, df_train[TARGET_VAR].values
X_test, y_test = df_test.drop(columns=[TARGET_VAR]).values, df_test[TARGET_VAR].values

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
print(df_train.columns)

### Write a Python script in the current directory so it can be run as a training job

In [None]:
%%writefile train.py

import pandas as pd
import argparse
import joblib
import os
import boto3
import tempfile
import numpy as np
# Training entry point for the Dry Bean random-forest classifier.
#
# The script takes its hyperparameters and data locations from the command line,
# trains the model, prints the balanced accuracy on the train and test splits,
# and stores the fitted model.

# File name used for the model inside a SageMaker model directory.
MODEL_ARTIFACT_NAME = "model.joblib"


def model_fn(model_dir):
    """Load the fitted model from ``model_dir`` for inference.

    This hook is needed when the script is used as the ``entry_point`` of a
    deployed SageMaker scikit-learn model, as the notebook does later on.

    Args:
        model_dir: Directory that contains ``model.joblib``.

    Returns:
        The estimator object stored by the training run.
    """
    return joblib.load(os.path.join(model_dir, MODEL_ARTIFACT_NAME))


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # Hyperparameters of the random forest.
    parser.add_argument("--n-estimators", type=int, default=10)
    parser.add_argument("--max-depth", type=int, default=7)
    parser.add_argument("--min-samples-leaf", type=int, default=10)
    # --model-dir is the name of the S3 bucket and --model-filename the object
    # key the model is uploaded to; the upload is skipped when either is missing.
    parser.add_argument("--model-dir", type=str)
    parser.add_argument("--model-filename", type=str)
    # Data locations default to the SageMaker channel directories, if set.
    parser.add_argument("--train-dir", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--test-dir", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
    parser.add_argument("--train-file", type=str, default="dry-bean-train.csv")
    parser.add_argument("--test-file", type=str, default="dry-bean-test.csv")
    # --features is a whitespace-separated list of column names.
    parser.add_argument("--features", type=str, required=True)
    parser.add_argument("--target", type=str, required=True)

    args = parser.parse_args()

    # Fail with a clear message instead of a TypeError inside os.path.join.
    if not args.train_dir or not args.test_dir:
        parser.error(
            "--train-dir and --test-dir are required when "
            "SM_CHANNEL_TRAIN / SM_CHANNEL_TEST are not set"
        )

    print('reading data')
    train_data_path = os.path.join(args.train_dir, args.train_file)
    print('train_data_path:', train_data_path)
    test_data_path = os.path.join(args.test_dir, args.test_file)
    print('test_data_path:', test_data_path)

    df_train = pd.read_csv(train_data_path)
    df_test = pd.read_csv(test_data_path)

    feature_columns = args.features.split()
    X_train, y_train = df_train[feature_columns], df_train[args.target]
    X_test, y_test = df_test[feature_columns], df_test[args.target]

    from sklearn.ensemble import RandomForestClassifier

    model_rf = RandomForestClassifier(
        n_estimators=args.n_estimators,
        max_depth=args.max_depth,
        min_samples_leaf=args.min_samples_leaf,
        random_state=0,
        n_jobs=-1
    )
    model_rf.fit(X_train, y_train)

    y_test_pred = model_rf.predict(X_test)
    y_train_pred = model_rf.predict(X_train)

    from sklearn.metrics import balanced_accuracy_score
    bal_acc_test = balanced_accuracy_score(y_test, y_test_pred)
    bal_acc_train = balanced_accuracy_score(y_train, y_train_pred)
    # The hyperparameter tuner extracts its objective from the
    # "Test balanced accuracy" line, so keep this output format stable.
    print(f"Train balanced accuracy: {bal_acc_train:.3f}")
    print(f"Test balanced accuracy: {bal_acc_test:.3f}")

    # Persist the model where SageMaker collects training artifacts, so the
    # job's model artifact actually contains the file that model_fn loads.
    sm_model_dir = os.environ.get("SM_MODEL_DIR")
    if sm_model_dir:
        local_model_path = os.path.join(sm_model_dir, MODEL_ARTIFACT_NAME)
        joblib.dump(model_rf, local_model_path)
        print(f'model saved to {local_model_path}')

    # Additionally upload the model to the requested S3 bucket. joblib cannot
    # dump straight to an S3 location, so the model is serialized into a
    # temporary file first and then uploaded with boto3.
    if args.model_dir and args.model_filename:
        s3_client = boto3.client("s3")  # AWS Python SDK to interact with AWS services
        with tempfile.TemporaryFile() as fp:
            joblib.dump(model_rf, fp)
            fp.seek(0)
            s3_client.put_object(Body=fp.read(), Bucket=args.model_dir, Key=args.model_filename)
        # Report success only after the upload has completed.
        print(f'model saved to s3 bucket {args.model_dir} at {args.model_filename}')
        

In [None]:
# Smoke-test train.py from the notebook before submitting it as a training job.
# Train and test CSVs are read from the project bucket's dataset/ prefix, and the
# script uploads the fitted model to the bucket given by --model-dir under the
# key given by --model-filename.
! python train.py --n-estimators 2 \
--min-samples-leaf 100 \
--model-dir "dry-bean-classification-problem-usa-east1" \
--train-dir "s3://dry-bean-classification-problem-usa-east1/dataset/" \
--test-dir  "s3://dry-bean-classification-problem-usa-east1/dataset/" \
--train-file "dry-bean-train.csv" \
--test-file "dry-bean-test.csv" \
--features "Area Perimeter MajorAxisLength MinorAxisLength AspectRation Eccentricity ConvexArea EquivDiameter Extent Solidity roundness Compactness ShapeFactor1 ShapeFactor2 ShapeFactor3 ShapeFactor4" \
--target "Class" \
--model-filename "models/model.joblib"


### Launching a training job with the Python SDK

In [None]:
# Print the version of the `python` executable found on the notebook's shell PATH.
!python --version

### Custom Training Job

### Spot Training
Using on-demand machines can be very expensive. To cut down costs, Amazon offers Spot Training instances. With those instances, you can choose high-powered computing resources for a low price with a single caveat — the training may not start immediately. Instead, SageMaker waits until demand is low and the machine you requested is available.

To enable Spot Training, uncomment the spot parameters (`use_spot_instances`, `max_wait`, `max_run`) at the end of the estimator definition below, making sure the `hyperparameters` argument is followed by a comma:

In [None]:
from sagemaker.sklearn.estimator import SKLearn

# Estimator that runs train.py as a SageMaker training job.
sklearn_estimator = SKLearn(
    entry_point='train.py',
    #py_version="py311",
    # Reached through the sagemaker package: the notebook only does
    # "import sagemaker", so the bare name get_execution_role is undefined.
    role=sagemaker.get_execution_role(),
    instance_count=1,
    instance_type="ml.c5.xlarge",
    # Check which scikit-learn / Python versions the pre-built SageMaker containers support.
    framework_version="1.2-1",
    # Passed to train.py as command-line arguments (--n-estimators, --features, ...).
    hyperparameters={
        "n-estimators": 2,
        "min-samples-leaf": 100,
        "max-depth": 5,
        # Whitespace-separated column names; train.py splits this string.
        "features": "Area Perimeter MajorAxisLength MinorAxisLength AspectRation Eccentricity ConvexArea EquivDiameter Extent Solidity roundness Compactness ShapeFactor1 ShapeFactor2 ShapeFactor3 ShapeFactor4",
        "target": "Class",
        "train-file": "dry-bean-train.csv",
        "test-file": "dry-bean-test.csv",
        # S3 bucket and object key that train.py uploads the fitted model to.
        "model-dir": "dry-bean-classification-problem-usa-east1",
        "model-filename": "models/model.joblib",
    },
    # Uncomment the parameters below to use spot training.
    #use_spot_instances=True,
    #max_wait=7200,
    #max_run=3600,
)


In [None]:
# Launch the training job. wait=True is passed, so the call blocks until the job
# has finished (it is not asynchronous). The same S3 prefix feeds both the
# 'train' and the 'test' channel; train.py picks the file for each one.
s3_train_dir="s3://sagemaker-us-east-1-205930620783/dry-bean-classification-problem/dataset/"
sklearn_estimator.fit({'train': s3_train_dir, 'test': s3_train_dir}, wait=True)

In [None]:
###

In [None]:
# Build the hyperparameter tuner on top of the estimator defined above.
from sagemaker.tuner import IntegerParameter

# Search space explored by the tuner.
hyperparameter_ranges = {
    "n-estimators": IntegerParameter(2, 10),
    "min-samples-leaf": IntegerParameter(10, 30),
}

# The training script prints the test accuracy; the tuning job captures it from
# the logs with this regex and uses it to decide which estimator is best.
objective_metric = "balanced-accuracy"
tracked_metrics = [
    {"Name": objective_metric, "Regex": "Test balanced accuracy: ([0-9.]+).*$"},
]

Optimizer = sagemaker.tuner.HyperparameterTuner(
    estimator=sklearn_estimator,
    objective_metric_name=objective_metric,
    objective_type="Maximize",
    hyperparameter_ranges=hyperparameter_ranges,
    metric_definitions=tracked_metrics,
    base_tuning_job_name="RF-tuner",
    max_jobs=10,
    max_parallel_jobs=2,
)

In [None]:
# Launch the hyperparameter tuning job. wait=True is passed, so the call blocks
# until tuning has finished (it is not asynchronous). The same S3 prefix feeds
# both the 'train' and the 'test' channel.
s3_train_dir="s3://sagemaker-us-east-1-205930620783/dry-bean-classification-problem/dataset/"
Optimizer.fit({'train': s3_train_dir, 'test': s3_train_dir}, wait=True)

In [None]:
# Get the tuner results as a DataFrame.
# time is imported here because the polling loop needs it and no earlier cell
# imports it (without this, an empty first result raises NameError).
import time

results = Optimizer.analytics().dataframe()

# Poll once per second until the analytics DataFrame has rows.
while results.empty:
    time.sleep(1)
    results = Optimizer.analytics().dataframe()

results

In [None]:
# Pick the best estimator found by the tuner and look up where its model
# artifact was stored.
best_estimator = Optimizer.best_estimator()
best_job_name = best_estimator.latest_training_job.name
print(best_job_name)

best_job_description = sagemaker_client.describe_training_job(TrainingJobName=best_job_name)
artifact_path = best_job_description["ModelArtifacts"]["S3ModelArtifacts"]

print(f"Model artifact persisted at {artifact_path}")

In [None]:
from sagemaker.sklearn.model import SKLearnModel

# Wrap the best training job's artifact in a deployable model object.
model = SKLearnModel(
    model_data=artifact_path,
    # Reached through the sagemaker package: the notebook only does
    # "import sagemaker", so the bare name get_execution_role is undefined.
    role=sagemaker.get_execution_role(),
    entry_point="train.py",
    framework_version="1.2-1",
)
model

In [None]:
import time

# Endpoint name: fixed prefix followed by the current UTC timestamp.
deploy_timestamp = time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())
sk_endpoint_name = f"sklearn-rf-model{deploy_timestamp}"

# Deploy the best estimator found by the tuner behind a single-instance endpoint.
sk_predictor = best_estimator.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.large",
    endpoint_name=sk_endpoint_name,
)