## Libraries and Data Setup

In [55]:
import sagemaker
import boto3

import joblib
import pathlib
from io import StringIO
import argparse
import os

import pandas as pd
import numpy as np

import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, f1_score
from sklearn.model_selection import train_test_split

In [56]:

# --- Configuration ---------------------------------------------------------
# AWS region used for every client created below, and the S3 bucket this
# notebook uploads to. Replace the region with your desired one.
region = "us-west-2"
bucket = "ml-mobile-price-classification-sagemaker"

# --- AWS handles -----------------------------------------------------------
# A single boto3 session pinned to `region` backs both the low-level
# SageMaker client and the SageMaker SDK session.
boto_session = boto3.Session(region_name=region)
sm_boto3 = boto_session.client("sagemaker")
session = sagemaker.Session(boto_session=boto_session)

In [57]:
# Load the dataset from train.csv (path is relative to the working directory).
df = pd.read_csv('train.csv')

## Exploratory Analysis

In [58]:
# (rows, columns) of the loaded frame.
df.shape

(2000, 21)

In [59]:
# Preview the first rows of the frame.
df.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [60]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
df.describe()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,...,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,1238.5185,0.495,1.52225,0.5095,4.3095,0.5215,32.0465,0.50175,140.249,4.5205,...,645.108,1251.5155,2124.213,12.3065,5.767,11.011,0.7615,0.503,0.507,1.5
std,439.418206,0.5001,0.816004,0.500035,4.341444,0.499662,18.145715,0.288416,35.399655,2.287837,...,443.780811,432.199447,1084.732044,4.213245,4.356398,5.463955,0.426273,0.500116,0.500076,1.118314
min,501.0,0.0,0.5,0.0,0.0,0.0,2.0,0.1,80.0,1.0,...,0.0,500.0,256.0,5.0,0.0,2.0,0.0,0.0,0.0,0.0
25%,851.75,0.0,0.7,0.0,1.0,0.0,16.0,0.2,109.0,3.0,...,282.75,874.75,1207.5,9.0,2.0,6.0,1.0,0.0,0.0,0.75
50%,1226.0,0.0,1.5,1.0,3.0,1.0,32.0,0.5,141.0,4.0,...,564.0,1247.0,2146.5,12.0,5.0,11.0,1.0,1.0,1.0,1.5
75%,1615.25,1.0,2.2,1.0,7.0,1.0,48.0,0.8,170.0,7.0,...,947.25,1633.0,3064.5,16.0,9.0,16.0,1.0,1.0,1.0,2.25
max,1998.0,1.0,3.0,1.0,19.0,1.0,64.0,1.0,200.0,8.0,...,1960.0,1998.0,3998.0,19.0,18.0,20.0,1.0,1.0,1.0,3.0


In [61]:
# List the column names of the frame.
df.columns

Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi', 'price_range'],
      dtype='object')

In [62]:
# Class balance of price_range, reported as a fraction of all rows.
df['price_range'].value_counts(normalize=True)

price_range
1    0.25
2    0.25
3    0.25
0    0.25
Name: proportion, dtype: float64

In [63]:
# Share of missing entries in each column, expressed in percent
df.isna().mean().mul(100)

battery_power    0.0
blue             0.0
clock_speed      0.0
dual_sim         0.0
fc               0.0
four_g           0.0
int_memory       0.0
m_dep            0.0
mobile_wt        0.0
n_cores          0.0
pc               0.0
px_height        0.0
px_width         0.0
ram              0.0
sc_h             0.0
sc_w             0.0
talk_time        0.0
three_g          0.0
touch_screen     0.0
wifi             0.0
price_range      0.0
dtype: float64

In [64]:
# Column names as a plain list; the final entry is split off as the target
# column and every remaining name is treated as a feature.
features = df.columns.tolist()
lables = features.pop()
x = df[features]
y = df[lables]

In [65]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [66]:
# Report the dimensions of each split. The loop emits the same four lines,
# in the same order, as four separate print calls.
for _name, _part in (
    ('X_train', X_train),
    ('X_test', X_test),
    ('Y_train', y_train),
    ('Y_test', y_test),
):
    print(f'{_name} shape:  ', _part.shape)

X_train shape:   (1600, 20)
X_test shape:   (400, 20)
Y_train shape:   (1600,)
Y_test shape:   (400,)


In [67]:
# Make sure the output directory exists: DataFrame.to_csv does not create
# missing parent directories, so this cell would otherwise fail on a fresh
# checkout / Restart & Run All.
os.makedirs('split_data', exist_ok=True)

# Persist the feature splits without the index column.
# NOTE(review): only the feature frames are written here; y_train / y_test are
# not saved by this cell -- confirm the training script gets its labels elsewhere.
X_train.to_csv('split_data/x_train.csv', index=False)
X_test.to_csv('split_data/x_test.csv', index=False)

### Push Split Data To S3 Bucket

In [68]:
# Key prefix under which both CSVs are stored in the bucket.
# NOTE(review): the prefix embeds the bucket name again, so the resulting
# object keys repeat it -- confirm that is intended.
sk_prefix = f'sagemaker/{bucket}/sklearncontainer'

# Upload the train file first, then the test file; each call returns the
# S3 URI of the uploaded object.
train_path, test_path = (
    session.upload_data(path=local_csv, bucket=bucket, key_prefix=sk_prefix)
    for local_csv in ('split_data/x_train.csv', 'split_data/x_test.csv')
)

print(train_path, test_path, sep='\n')

s3://ml-mobile-price-classification-sagemaker/sagemaker/ml-mobile-price-classification-sagemaker/sklearncontainer/x_train.csv
s3://ml-mobile-price-classification-sagemaker/sagemaker/ml-mobile-price-classification-sagemaker/sklearncontainer/x_test.csv


In [74]:
%%writefile script.py
import sagemaker
import boto3

import joblib
import pathlib
from io import StringIO
import argparse
import os

import pandas as pd
import numpy as np

import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, f1_score
from sklearn.model_selection import train_test_split

def model_fn(model_dir):
    """Load the fitted classifier for inference.

    The SageMaker scikit-learn serving container looks up a function named
    ``model_fn`` in the entry-point script, so the hook must carry exactly
    this name to be picked up.

    Args:
        model_dir: Directory that contains the serialized ``model.joblib``.

    Returns:
        The object deserialized by ``joblib.load`` from
        ``<model_dir>/model.joblib``.
    """
    clf = joblib.load(os.path.join(model_dir, 'model.joblib'))
    return clf


# Backward-compatible alias for the original (misspelled) hook name.
model_fxn = model_fn

def parse_args(argv=None):
    """Parse the command-line arguments of the training script.

    Args:
        argv: Optional list of argument strings. When ``None`` the arguments
            are taken from ``sys.argv`` (argparse default behaviour).

    Returns:
        argparse.Namespace with ``n_estimators``, ``random_state``,
        ``model_dir``, ``train``, ``test``, ``train_file`` and ``test_file``.
        Unrecognised arguments are ignored rather than treated as errors.
    """
    parser = argparse.ArgumentParser()

    # Hyperparameters sent by the client are passed as command line
    # arguments to the script.
    parser.add_argument("--n_estimators", type=int, default=100)
    parser.add_argument("--random_state", type=int, default=0)

    # Data, model and output directories. The defaults are read from the
    # SM_* environment variables (presumably provided by the SageMaker
    # training container) and are None when a variable is not set.
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
    # File names inside the train/test channels: these are literal names,
    # not environment variable lookups.
    parser.add_argument("--train-file", type=str, default="x_train.csv")
    parser.add_argument("--test-file", type=str, default="x_test.csv")

    args, _ = parser.parse_known_args(argv)
    return args


if __name__ == '__main__':
    print('[INFO] Extracting arguments')
    args = parse_args()

    print("SKLearn Version:  ", sklearn.__version__)
    print("Joblib Version:  ", joblib.__version__)

Overwriting script.py
