# Amazon Sagemaker AI : Dry Bean Classification Problem
You will learn
 - Using Sagemaker AI to train the model in notebook itself.
 - AWS -> Sagemaker AI ->  Create a Domain -> Open Studio -> Creating a Jupyter Lab Space with a machine
 - In Jupyter Lab Space create a notebook -> load dataset -> and train model in notebook itself

reference: https://www.datacamp.com/tutorial/aws-sagemaker-tutorial

In [18]:
import pandas as pd 
import numpy as np 
import boto3 
import sagemaker 
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sagemaker import get_execution_role

In [19]:
sm_boto3 = boto3.client("sagemaker")
sess = sagemaker.Session()
REGION = sess.boto_session.region_name
print(REGION)
print(get_execution_role(sagemaker_session=sess))



us-east-1
arn:aws:iam::205930620783:role/service-role/AmazonSageMaker-ExecutionRole-20250401T145997


In [13]:
# Only able to read and write in the default created bucket
#BUCKET_NAME = "sagemaker-us-east-1-205930620783"
#DATASET_PREFIX = "dry-bean-classification-problem/dataset"

BUCKET_NAME = "dry-bean-classification-problem-usa-east1"
BUCKET_URI = f"s3://{BUCKET_NAME}"
DATASET_PREFIX = "dataset"
print(BUCKET_URI)
DATASET_FILE = "Dry_Bean_Dataset.csv"
TARGET_VAR = "Class"

s3://dry-bean-classification-problem-usa-east1


In [14]:
data = pd.read_csv(f"{BUCKET_URI}/{DATASET_PREFIX}/{DATASET_FILE}")
data['Class_Raw'] = data['Class']

In [15]:
#print(data.info())
print(data['Class'].value_counts())
# Target variable is a categorical factor.
le = preprocessing.LabelEncoder()
data[TARGET_VAR] = le.fit_transform(data[TARGET_VAR]) # It will just map the categories to numbers in range 0, len(categories). This is an orfinal mapping i.e. so I don't think its a correect way to do that.
print(data[TARGET_VAR].shape)
print(data[TARGET_VAR][0:10])
print(data[TARGET_VAR].value_counts())
class_weights = data[TARGET_VAR].value_counts(normalize=True).to_dict()
print(class_weights)

Class
DERMASON    3546
SIRA        2636
SEKER       2027
HOROZ       1928
CALI        1630
BARBUNYA    1322
BOMBAY       522
Name: count, dtype: int64
(13611,)
0    5
1    5
2    5
3    5
4    5
5    5
6    5
7    5
8    5
9    5
Name: Class, dtype: int64
Class
3    3546
6    2636
5    2027
4    1928
2    1630
0    1322
1     522
Name: count, dtype: int64
{3: 0.2605245757108221, 6: 0.19366688707662919, 5: 0.148923664682977, 4: 0.1416501359194769, 2: 0.11975607964146646, 0: 0.09712732348835501, 1: 0.03835133348027331}


In [16]:
df_train, df_test = train_test_split(data.drop('Class_Raw', axis=1), test_size=.2, random_state=0)
print(df_train.shape, df_train.index[0:10])
print(df_test.shape, df_test.index[0:10])
df_train.to_csv('dry-bean-train.csv')
df_test.to_csv('dry-bean-test.csv')

(10888, 17) Index([2301, 5659, 8597, 6157, 4544, 749, 9556, 6877, 8403, 9485], dtype='int64')
(2723, 17) Index([13027, 11035, 13205, 7578, 1961, 3885, 1094, 12187, 12526, 9540], dtype='int64')


In [17]:
trainpath = sess.upload_data(
   path="dry-bean-train.csv",
   bucket=BUCKET_NAME,
   key_prefix=DATASET_PREFIX, # Subdir
)

testpath = sess.upload_data(
   path="dry-bean-test.csv",
   bucket=BUCKET_NAME,
   key_prefix=DATASET_PREFIX,
)

In [24]:
X_train, y_train = df_train.drop(['Class'], axis=1).values, df_train['Class'].values
X_test, y_test = df_test.drop(['Class'], axis=1).values, df_test['Class'].values
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(10888, 16) (10888,) (2723, 16) (2723,)


In [25]:
from sklearn.ensemble import RandomForestClassifier
model_rf = RandomForestClassifier(n_estimators=100, min_samples_leaf=20, random_state=0, n_jobs=-1, class_weight=class_weights)
model_rf.fit(X_train, y_train)
y_test_pred = model_rf.predict(X_test)
y_train_pred = model_rf.predict(X_train)


In [26]:
print(y_test[0:10])
print(y_test_pred[0:10])

from sklearn.metrics import balanced_accuracy_score

acc_test = balanced_accuracy_score(y_test, y_test_pred)
acc_train = balanced_accuracy_score(y_train, y_train_pred)
print(acc_test, acc_train)

[3 3 3 6 5 2 5 3 3 6]
[3 3 3 3 6 4 5 3 3 6]
0.9234898448914083 0.9311536024911987
