<a href="https://clarusway.com/contact-us/"><img align="center" src="https://i.ibb.co/B43qn24/officially-licensed-logo.png" alt="Open in Clarusway LMS" width="110" height="200" title="This notebook is licensed by Clarusway IT training school. Please contact the authorized persons about the conditions under which you can use or share."></a>

# TRAINING AND MODEL/ENDPOINT CREATION FROM SAGEMAKER CONSOLE

In [1]:
import pandas as pd      
import numpy as np 
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

In [2]:
train_data = pd.read_csv("train_data.csv")
test_data = pd.read_csv("test_data.csv")

In [3]:
train_data.shape

(12731, 6)

In [4]:
test_data.shape

(3183, 6)

In [5]:
train_data.head()

Unnamed: 0,price,age,hp_kW,km,Gearing_Type,make_model
0,26379,0.0,100.0,5900.0,Manual,Opel Insignia
1,18990,0.0,66.0,1900.0,Manual,Opel Astra
2,12480,2.0,81.0,39792.0,Manual,Opel Astra
3,34490,0.0,154.0,10.0,Automatic,Opel Insignia
4,15888,2.0,60.0,11903.0,Manual,Audi A1


In [6]:
test_data.head()

Unnamed: 0,price,age,hp_kW,km,Gearing_Type,make_model
0,14500,2.0,141.0,80000.0,Automatic,Audi A1
1,16790,3.0,66.0,16200.0,Automatic,Audi A1
2,15090,3.0,85.0,63668.0,Automatic,Audi A1
3,17990,2.0,70.0,16103.0,Automatic,Audi A1
4,17990,3.0,92.0,26415.0,Automatic,Audi A1


In [7]:
train_data.describe()

Unnamed: 0,price,age,hp_kW,km
count,12731.0,12731.0,12731.0,12731.0
mean,17996.286702,1.387244,88.382374,32002.454174
std,7349.138624,1.121765,26.693078,37081.360187
min,4950.0,0.0,40.0,0.0
25%,12850.0,0.0,66.0,1699.5
50%,16890.0,1.0,85.0,20321.0
75%,21910.0,2.0,101.0,46375.0
max,74600.0,3.0,294.0,291800.0


In [8]:
test_data.describe()

Unnamed: 0,price,age,hp_kW,km
count,3183.0,3183.0,3183.0,3183.0
mean,18133.550424,1.399623,88.968269,32441.995475
std,7509.562135,1.119744,26.602421,36567.08854
min,5450.0,0.0,51.0,0.0
25%,12880.0,0.0,66.0,2970.5
50%,16990.0,1.0,85.0,20900.0
75%,21900.0,2.0,103.0,48000.0
max,68320.0,3.0,294.0,317000.0


In [9]:
train_data.make_model.value_counts()

Audi A3           2488
Audi A1           2111
Opel Insignia     2044
Opel Astra        1995
Opel Corsa        1791
Renault Clio      1488
Renault Espace     786
Renault Duster      28
Name: make_model, dtype: int64

In [10]:
test_data.make_model.value_counts()

Audi A3           609
Opel Insignia     554
Opel Astra        530
Audi A1           503
Opel Corsa        425
Renault Clio      351
Renault Espace    205
Renault Duster      6
Name: make_model, dtype: int64

In [11]:
train_data.Gearing_Type.value_counts()

Manual            6496
Automatic         5861
Semi-automatic     374
Name: Gearing_Type, dtype: int64

# Split train_data into train and validation

In [13]:
# Separate the target column from the predictors.
y = train_data['price']
X = train_data.drop(columns=["price"])

In [14]:
# Hold out 25% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

In [15]:
X_train.head()

Unnamed: 0,age,hp_kW,km,Gearing_Type,make_model
8699,2.0,51.0,40000.0,Manual,Opel Corsa
5137,3.0,70.0,97976.0,Manual,Audi A1
536,1.0,66.0,14500.0,Manual,Audi A1
876,3.0,55.0,25329.0,Manual,Opel Corsa
8690,2.0,125.0,37125.0,Automatic,Opel Insignia


In [16]:
y_train.head()

8699     8900
5137    11490
536     16800
876      7899
8690    19499
Name: price, dtype: int64

In [17]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OrdinalEncoder

In [18]:
# Names of the string-typed (categorical) feature columns.
cat = X_train.select_dtypes(include="object").columns
cat

Index(['Gearing_Type', 'make_model'], dtype='object')

In [25]:
# Ordinal encoding is applied to the categorical columns only; it suits tree-based
# algorithms. For linear / logistic regression prefer a one-hot encoder instead.
ord_enc = OrdinalEncoder()

# remainder='passthrough' keeps every non-categorical column in its original form.
# To scale those remaining columns instead, pass a scaler estimator (for example a
# MinMaxScaler instance) as `remainder` in place of 'passthrough'.
column_trans = make_column_transformer(
    (ord_enc, cat),
    remainder='passthrough',
    verbose_feature_names_out=False,
)

# Return pandas DataFrames (with column names) from transform / fit_transform.
column_trans = column_trans.set_output(transform="pandas")

In [28]:
# Learn the category -> integer mapping from the training split only,
# then apply that same fitted mapping to both splits.
column_trans.fit(X_train)
train = column_trans.transform(X_train)
validation = column_trans.transform(X_val)

In [30]:
train.head()

Unnamed: 0,Gearing_Type,make_model,age,hp_kW,km
8699,1.0,3.0,2.0,51.0,40000.0
5137,1.0,0.0,3.0,70.0,97976.0
536,1.0,0.0,1.0,66.0,14500.0
876,1.0,3.0,3.0,55.0,25329.0
8690,0.0,4.0,2.0,125.0,37125.0


In [31]:
# Put the target in the first column, followed by the encoded features.
# Any "price" column already present in `train` / `validation` is dropped first, so
# re-running this cell cannot leave a second copy of the target among the features
# (which would leak the label into the exported training data).
train = pd.concat(
    [y_train.astype(int).rename("price"), train.drop(columns="price", errors="ignore")],
    axis=1,
)
validation = pd.concat(
    [y_val.astype(int).rename("price"), validation.drop(columns="price", errors="ignore")],
    axis=1,
)

In [32]:
train.head()

Unnamed: 0,price,Gearing_Type,make_model,age,hp_kW,km
8699,8900,1.0,3.0,2.0,51.0,40000.0
5137,11490,1.0,0.0,3.0,70.0,97976.0
536,16800,1.0,0.0,1.0,66.0,14500.0
876,7899,1.0,3.0,3.0,55.0,25329.0
8690,19499,0.0,4.0,2.0,125.0,37125.0


In [33]:
validation.head()

Unnamed: 0,price,Gearing_Type,make_model,age,hp_kW,km
3132,15480,0.0,2.0,3.0,100.0,56587.0
8123,24900,1.0,7.0,1.0,96.0,20000.0
10948,27400,0.0,0.0,0.0,85.0,10.0
1986,11975,1.0,2.0,2.0,74.0,38500.0
7487,12450,1.0,4.0,3.0,103.0,45000.0


In [34]:
# Write both splits as CSV without the index and without a header row
# (the target is the first column, followed by the features).
for frame, csv_name in ((train, 'train.csv'), (validation, 'validation.csv')):
    frame.to_csv(csv_name, index=False, header=False)

* Amazon SageMaker Python SDK is an open source library for training and deploying machine-learned models on Amazon SageMaker (https://sagemaker.readthedocs.io/en/stable/).
* Boto3 is the AWS SDK for Python; it makes it easy to integrate your Python application, library, or script with AWS services including Amazon S3, Amazon EC2, Amazon DynamoDB, and more (https://aws.amazon.com/sdk-for-python/).

In [35]:
# sagemaker: the SageMaker Python SDK, an open source library for training and
#            deploying machine learning models on Amazon SageMaker from Python.
# boto3:     the AWS SDK for Python; it lets scripts create, update and delete
#            AWS resources directly.
import boto3
import sagemaker

# S3 location used by this notebook: s3://<bucket>/<prefix>/...
bucket = 'hillary-eu13'
prefix = 'sagemaker-autoscout'

In [38]:
# Checking the sagemaker execution role
role = sagemaker.get_execution_role()
role

'arn:aws:iam::046402772087:role/service-role/AmazonSageMaker-ExecutionRole-20230504T102133'

In [36]:
# Upload the train and validation CSV files to <prefix>/data/ in the S3 bucket.
s3_bucket = boto3.Session().resource('s3').Bucket(bucket)
for csv_name in ('train.csv', 'validation.csv'):
    s3_bucket.Object(prefix + '/data/' + csv_name).upload_file(csv_name)

In [37]:
# Checking the lists of objects in bucket
# We can check them also from AWS S3 console
! aws s3 ls {bucket}/{prefix}/data --recursive

2023-05-04 08:45:55     298744 sagemaker-autoscout/data/train.csv
2023-05-04 08:45:56      99546 sagemaker-autoscout/data/validation.csv


In [None]:
## Now go back to the SageMaker console and complete these steps: training job, model creation, endpoint creation.
## After creating the endpoint, come back to this notebook.

# Prepare Test Data

In [39]:
test_data = pd.read_csv("test_data.csv")

In [40]:
test_data.head()

Unnamed: 0,price,age,hp_kW,km,Gearing_Type,make_model
0,14500,2.0,141.0,80000.0,Automatic,Audi A1
1,16790,3.0,66.0,16200.0,Automatic,Audi A1
2,15090,3.0,85.0,63668.0,Automatic,Audi A1
3,17990,2.0,70.0,16103.0,Automatic,Audi A1
4,17990,3.0,92.0,26415.0,Automatic,Audi A1


In [41]:
# Split the test set into features and target.
# NOTE: this rebinds `X` and `y`, which held the training features/target earlier on.
y = test_data['price']
X = test_data.drop(columns=["price"])

In [44]:
test=column_trans.transform(X)

In [46]:
test.head()

Unnamed: 0,Gearing_Type,make_model,age,hp_kW,km
0,0.0,0.0,2.0,141.0,80000.0
1,0.0,0.0,3.0,66.0,16200.0
2,0.0,0.0,3.0,85.0,63668.0
3,0.0,0.0,2.0,70.0,16103.0
4,0.0,0.0,3.0,92.0,26415.0


In [47]:
test.shape

(3183, 5)

In [48]:
test.to_csv('test.csv', index=False, header=False)

# Model Deployment using endpoint name

In [49]:
endpoint_name = "hillary-eu13"   # copy the name from sagemaker console
endpoint_name

'hillary-eu13'

In [50]:
# Region of the current SageMaker session and the execution role this notebook runs under.
region = sagemaker.Session().boto_region_name
role = sagemaker.get_execution_role()

print(f"AWS Region: {region}")
print(f"RoleArn:{role}")

AWS Region: us-east-1
RoleArn:arn:aws:iam::046402772087:role/service-role/AmazonSageMaker-ExecutionRole-20230504T102133


In [51]:
# Reading and preparing the test data as payload for prediction
with open('test.csv', 'r') as f:
    payload = f.read().strip('\n')

In [53]:
# checking the content of payload
# payload

In [54]:

# Runtime client used to send inference requests to a deployed endpoint.
sagemaker_runtime = boto3.client(
    "sagemaker-runtime",
    region_name=boto3.Session().region_name,
)

# Send the whole CSV payload in one request. `endpoint_name` must be the name of the
# endpoint created in the console (endpoint names are unique per region and account).
response = sagemaker_runtime.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType='text/csv',  # type of data in Body
    Body=payload,  # prediction data, replace with your own data.
)

# Optional - to see prediction result.
# NOTE: the response body is a stream; reading it here would consume it, so the
# later cell that reads response['Body'] would then get nothing back.
# print(response['Body'].read().decode('utf-8'))

In [56]:
# The prediction result is in json format and "Body" key contains the prediction values.
response

In [57]:
# Raw prediction payload returned by the endpoint, as bytes.
# NOTE: response['Body'] is a stream that can be consumed only once; re-running this
# cell without re-invoking the endpoint is expected to yield an empty result.
results = response['Body'].read()

# Alternative: decode the payload to text right away.
# results = response['Body'].read().decode('utf-8')
results

In [58]:
# prediction results as numpy array
# Parse only while `results` still holds the raw newline-separated text. Once it has
# been converted, re-running this cell leaves the array untouched instead of failing.
if isinstance(results, (bytes, str)):
    results = np.fromstring(results, sep='\n')

In [59]:
results

array([18229.4140625 , 15893.91308594, 16086.77734375, ...,
       36081.609375  , 33964.19921875, 43396.0078125 ])

## Check the Model Performance

In [60]:
y=test_data['price']

In [61]:
def eval_metrics(actual, pred):
    """Print R2, MAE, MSE and RMSE for a set of predictions.

    Parameters
    ----------
    actual : array-like
        Ground-truth target values.
    pred : array-like
        Predicted values, aligned element-wise with ``actual``.

    Returns
    -------
    None
        The metrics are only written to stdout.
    """
    score = r2_score(actual, pred)
    mae = mean_absolute_error(actual, pred)
    mse = mean_squared_error(actual, pred)
    # RMSE is derived from the MSE computed above rather than scoring the data twice.
    rmse = np.sqrt(mse)
    print(" r2_score:", score, "\n","mae:", mae, "\n","mse:",mse, "\n","rmse:",rmse)

In [62]:
eval_metrics(y, results)

 r2_score: 0.9215310890462863 
 mae: 1247.0648232860608 
 mse: 4423748.129496977 
 rmse: 2103.270816965085


In [63]:
test_data.head()

Unnamed: 0,price,age,hp_kW,km,Gearing_Type,make_model
0,14500,2.0,141.0,80000.0,Automatic,Audi A1
1,16790,3.0,66.0,16200.0,Automatic,Audi A1
2,15090,3.0,85.0,63668.0,Automatic,Audi A1
3,17990,2.0,70.0,16103.0,Automatic,Audi A1
4,17990,3.0,92.0,26415.0,Automatic,Audi A1


In [64]:
# Attach the endpoint's predictions next to the actual prices for side-by-side inspection.
test_data = test_data.assign(predicted_price=results)

In [65]:
test_data.sample(10, random_state=41)

Unnamed: 0,price,age,hp_kW,km,Gearing_Type,make_model,predicted_price
1846,13990,1.0,66.0,12099.0,Automatic,Opel Corsa,13252.246094
1671,12500,3.0,66.0,52000.0,Automatic,Opel Corsa,11372.838867
1497,13900,1.0,81.0,13149.0,Manual,Opel Astra,13796.97168
149,15950,3.0,66.0,53900.0,Semi-automatic,Audi A1,16672.304688
2352,16800,1.0,103.0,23401.0,Manual,Opel Insignia,18808.583984
3007,25490,2.0,147.0,49606.0,Automatic,Renault Espace,25784.541016
2114,15980,3.0,125.0,88346.0,Automatic,Opel Insignia,15745.564453
1805,8850,2.0,70.0,89000.0,Manual,Opel Corsa,7742.413086
1304,17400,2.0,110.0,62000.0,Automatic,Opel Astra,15051.368164
1256,10980,3.0,100.0,107791.0,Manual,Opel Astra,10309.319336
