<a href="https://clarusway.com/contact-us/"><img align="center" src="https://i.ibb.co/B43qn24/officially-licensed-logo.png" alt="Open in Clarusway LMS" width="110" height="200" title="This notebook is licensed by Clarusway IT training school. Please contact the authorized persons about the conditions under which you can use or share."></a>

# TRAINING AND MODEL/ENDPOINT CREATION FROM NOTEBOOK

In [1]:
import pandas as pd      
import numpy as np 
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

In [2]:
def eval_metrics(actual, pred):
    rmse = np.sqrt(mean_squared_error(actual, pred))
    mae = mean_absolute_error(actual, pred)
    mse = mean_squared_error(actual, pred)
    score = r2_score(actual, pred)
    return print(" r2_score:", score, "\n","mae:", mae, "\n","mse:",mse, "\n","rmse:",rmse)

In [3]:
train_data = pd.read_csv("train_data.csv")
test_data = pd.read_csv("test_data.csv")

In [4]:
train_data.shape

(12731, 6)

In [5]:
test_data.shape

(3183, 6)

In [6]:
train_data.head()

Unnamed: 0,price,age,hp_kW,km,Gearing_Type,make_model
0,26379,0.0,100.0,5900.0,Manual,Opel Insignia
1,18990,0.0,66.0,1900.0,Manual,Opel Astra
2,12480,2.0,81.0,39792.0,Manual,Opel Astra
3,34490,0.0,154.0,10.0,Automatic,Opel Insignia
4,15888,2.0,60.0,11903.0,Manual,Audi A1


In [7]:
test_data.head()

Unnamed: 0,price,age,hp_kW,km,Gearing_Type,make_model
0,14500,2.0,141.0,80000.0,Automatic,Audi A1
1,16790,3.0,66.0,16200.0,Automatic,Audi A1
2,15090,3.0,85.0,63668.0,Automatic,Audi A1
3,17990,2.0,70.0,16103.0,Automatic,Audi A1
4,17990,3.0,92.0,26415.0,Automatic,Audi A1


In [8]:
train_data.describe()

Unnamed: 0,price,age,hp_kW,km
count,12731.0,12731.0,12731.0,12731.0
mean,17996.286702,1.387244,88.382374,32002.454174
std,7349.138624,1.121765,26.693078,37081.360187
min,4950.0,0.0,40.0,0.0
25%,12850.0,0.0,66.0,1699.5
50%,16890.0,1.0,85.0,20321.0
75%,21910.0,2.0,101.0,46375.0
max,74600.0,3.0,294.0,291800.0


In [9]:
test_data.describe()

Unnamed: 0,price,age,hp_kW,km
count,3183.0,3183.0,3183.0,3183.0
mean,18133.550424,1.399623,88.968269,32441.995475
std,7509.562135,1.119744,26.602421,36567.08854
min,5450.0,0.0,51.0,0.0
25%,12880.0,0.0,66.0,2970.5
50%,16990.0,1.0,85.0,20900.0
75%,21900.0,2.0,103.0,48000.0
max,68320.0,3.0,294.0,317000.0


In [10]:
train_data.make_model.value_counts()

Audi A3           2488
Audi A1           2111
Opel Insignia     2044
Opel Astra        1995
Opel Corsa        1791
Renault Clio      1488
Renault Espace     786
Renault Duster      28
Name: make_model, dtype: int64

In [11]:
test_data.make_model.value_counts()

Audi A3           609
Opel Insignia     554
Opel Astra        530
Audi A1           503
Opel Corsa        425
Renault Clio      351
Renault Espace    205
Renault Duster      6
Name: make_model, dtype: int64

In [12]:
train_data.Gearing_Type.value_counts()

Manual            6496
Automatic         5861
Semi-automatic     374
Name: Gearing_Type, dtype: int64

# Split train_data into train and validation

In [7]:
X=train_data.drop(["price"], axis = 1)
y=train_data['price']

In [8]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=1)

In [15]:
X_train.head()

Unnamed: 0,age,hp_kW,km,Gearing_Type,make_model
8699,2.0,51.0,40000.0,Manual,Opel Corsa
5137,3.0,70.0,97976.0,Manual,Audi A1
536,1.0,66.0,14500.0,Manual,Audi A1
876,3.0,55.0,25329.0,Manual,Opel Corsa
8690,2.0,125.0,37125.0,Automatic,Opel Insignia


In [16]:
y_train.head()

8699     8900
5137    11490
536     16800
876      7899
8690    19499
Name: price, dtype: int64

In [9]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OrdinalEncoder

In [10]:
cat = X_train.select_dtypes("object").columns
cat

Index(['Gearing_Type', 'make_model'], dtype='object')

In [11]:
ord_enc = OrdinalEncoder()
column_trans = make_column_transformer((ord_enc, cat), remainder='passthrough', verbose_feature_names_out=False).set_output(transform="pandas") # remainder: # Keep other columns rather than cat in original form.
                                                                                # remainder: # if you want to scale rest of cat columns use "minmax" istead passthroug
    
    # Apply ord_enc to categoric variables and keep the rest in original form.
    # ord_enc for DT based algorithms
    # For Linear reg, log reg vs. use one hot encoder 

In [12]:
train = column_trans.fit_transform(X_train)
validation = column_trans.transform(X_val)

In [13]:
train.head()

Unnamed: 0,Gearing_Type,make_model,age,hp_kW,km
8699,1.0,3.0,2.0,51.0,40000.0
5137,1.0,0.0,3.0,70.0,97976.0
536,1.0,0.0,1.0,66.0,14500.0
876,1.0,3.0,3.0,55.0,25329.0
8690,0.0,4.0,2.0,125.0,37125.0


In [14]:
# in sagmaker we don't tain X and y separately, thus we concat price
train = pd.concat([pd.Series(y_train, index = X_train.index, name="price", dtype=int), train], axis=1)
validation = pd.concat([pd.Series(y_val, index = X_val.index, name="price", dtype=int), validation], axis=1)

In [25]:
train.head()

Unnamed: 0,price,Gearing_Type,make_model,age,hp_kW,km
8699,8900,1.0,3.0,2.0,51.0,40000.0
5137,11490,1.0,0.0,3.0,70.0,97976.0
536,16800,1.0,0.0,1.0,66.0,14500.0
876,7899,1.0,3.0,3.0,55.0,25329.0
8690,19499,0.0,4.0,2.0,125.0,37125.0


In [26]:
validation.head()

Unnamed: 0,price,Gearing_Type,make_model,age,hp_kW,km
3132,15480,0.0,2.0,3.0,100.0,56587.0
8123,24900,1.0,7.0,1.0,96.0,20000.0
10948,27400,0.0,0.0,0.0,85.0,10.0
1986,11975,1.0,2.0,2.0,74.0,38500.0
7487,12450,1.0,4.0,3.0,103.0,45000.0


In [17]:
train.to_csv('train.csv', index=False, header=False)
validation.to_csv('validation.csv', index=False, header=False)

In [15]:
# Import sagemaker and boto3 libraries.
import sagemaker, boto3
bucket = 'eu13notebook' # our bucket name
prefix = 'sagemaker-autoscout'

In [18]:
# Loading train and validation datasets into S3 bucket.
boto3.Session().resource('s3').Bucket(bucket).Object(prefix + '/data/train.csv').upload_file('train.csv')
boto3.Session().resource('s3').Bucket(bucket).Object(prefix + '/data/validation.csv').upload_file('validation.csv')

In [19]:
# Checking the lists of objects in bucket
# We can check them also from AWS S3 console

! aws s3 ls {bucket}/{prefix}/data --recursive

2023-05-06 08:40:31     298744 sagemaker-autoscout/data/train.csv
2023-05-06 08:40:31      99546 sagemaker-autoscout/data/validation.csv


# Train Model

In [None]:
# %pip install --upgrade boto3  # if the following code raise an error, run this before then restart the kernel or define region name manually

In [20]:
# Defining the region and role variables which will be used for the connection to the AWS.
region = sagemaker.Session().boto_region_name  # region = boto3.Session().region_name
print("AWS Region: {}".format(region))

role = sagemaker.get_execution_role()
print("RoleArn:{}".format(role))

AWS Region: us-east-1
RoleArn:arn:aws:iam::506049460673:role/service-role/AmazonSageMaker-ExecutionRole-20230504T193303


In [21]:
sagemaker.__version__

'2.145.0'

In [22]:
from sagemaker.debugger import Rule, rule_configs

In [23]:
# Defining output path in which outputs will be loaded during model training.
s3_output_location = 's3://{}/{}/{}'.format(bucket, prefix, 'autoscout_model')

In [24]:
# Defining built-in algorithm name and version which we will use for model creation.
container = sagemaker.image_uris.retrieve("xgboost",region, version="1.2-2")
print(container)

683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.2-2


In [54]:
# for a latest version 
# container = sagemaker.image_uris.retrieve("xgboost", region, version="latest")
# print(container)

In [55]:
#help(sagemaker.image_uris.retrieve)

In [25]:
# Model creation using sagemaker estimator function

xgb_model = sagemaker.estimator.Estimator(
    container,  
    role,
    instance_count=1,  
    instance_type='ml.m4.xlarge', 
    volume_size = 1,
    output_path = s3_output_location,
    sagemaker_session=sagemaker.Session(),
    rules = [Rule.sagemaker(rule_configs.create_xgboost_report())]
    )

In [26]:
#help(xgb_model.set_hyperparameters)

In [27]:
# Setting the model hyperparameters

xgb_model.set_hyperparameters(max_depth=4,
                              eta=0.3, # learning rate
                              num_round=200,
                              objective = "reg:squarederror", # cost function for optimization
                              early_stopping_rounds=10)  #objective = "reg:squarederror", "reg:linear" for latest xgboost!
                                                               

In [28]:
# Import training input method which will be used to prepare train and validation data before model training
from sagemaker.session import TrainingInput

# Preparing train and validation data for model training
train_input = TrainingInput(
's3://{}/{}/{}'.format(bucket, prefix, 'data/train.csv'), content_type='csv'
)

validation_input = TrainingInput(
's3://{}/{}/{}'.format(bucket, prefix, 'data/validation.csv'), content_type='csv'
)

In [29]:
# Convert the data we prepared into dictionary format for model training 
data_channels = {'train': train_input, 'validation': validation_input}

In [30]:
# Model training with data_channels. At this step training jobs will be created.
xgb_model.fit(data_channels)

# Let's check whether training job is created or not from AWS console.

# Let's check whether the output(model.tar.gz) occurs when training job is completed.

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2023-05-06-09-12-56-421


2023-05-06 09:12:58 Starting - Starting the training job...
2023-05-06 09:13:30 Starting - Preparing the instances for trainingCreateXgboostReport: InProgress
.........
2023-05-06 09:15:00 Downloading - Downloading input data...
2023-05-06 09:15:20 Training - Downloading the training image......
2023-05-06 09:16:36 Uploading - Uploading generated training model[34m[2023-05-06 09:16:27.904 ip-10-0-254-93.ec2.internal:7 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2023-05-06:09:16:27:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2023-05-06:09:16:27:INFO] Failed to parse hyperparameter objective value reg:squarederror to Json.[0m
[34mReturning the value itself[0m
[34m[2023-05-06:09:16:27:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2023-05-06:09:16:27:INFO] Running XGBoost Sagemaker in algorithm mode[0m
[34m[2023-05-06:09:16:27:INFO] Determined delimiter of CSV input is ','[0m
[34m[2023-05-06:09:16:27:INFO] Determined

# Create Endpoint

In [31]:
# CSVSerializer : Serialize data of various formats to a CSV-formatted string
              
from sagemaker.serializers import CSVSerializer

# Deploy the trained model into an endpoint
# At this step an endpoint will be created
predictor = xgb_model.deploy(initial_instance_count=1,  # the number of EC2 for endpoint
                             instance_type='ml.m5.xlarge',  # type of EC2
                            serializer = CSVSerializer())  # data format for prediction

# Let's check whether endpoint is created.

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2023-05-06-09-23-12-136
INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2023-05-06-09-23-12-136
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2023-05-06-09-23-12-136


----!

In [32]:
# The name of the  endpoint we created
predictor.endpoint_name

'sagemaker-xgboost-2023-05-06-09-23-12-136'

# Prepare the test data

In [33]:
test_data.head()

Unnamed: 0,price,age,hp_kW,km,Gearing_Type,make_model
0,14500,2.0,141.0,80000.0,Automatic,Audi A1
1,16790,3.0,66.0,16200.0,Automatic,Audi A1
2,15090,3.0,85.0,63668.0,Automatic,Audi A1
3,17990,2.0,70.0,16103.0,Automatic,Audi A1
4,17990,3.0,92.0,26415.0,Automatic,Audi A1


In [34]:
X=test_data.drop(["price"], axis = 1)
y=test_data['price']

In [37]:
test=column_trans.transform(X)

In [38]:
test.head()

Unnamed: 0,Gearing_Type,make_model,age,hp_kW,km
0,0.0,0.0,2.0,141.0,80000.0
1,0.0,0.0,3.0,66.0,16200.0
2,0.0,0.0,3.0,85.0,63668.0
3,0.0,0.0,2.0,70.0,16103.0
4,0.0,0.0,3.0,92.0,26415.0


In [39]:
test.shape

(3183, 5)

In [40]:
test.to_csv('test.csv', index=False, header=False)

# Prediction using predictor (Method-1)

In [41]:
results = predictor.predict(test.to_numpy())

In [42]:
#results

In [43]:
results = np.fromstring(results, sep='\n')   #sep='\n' or sep=','

In [44]:
results

array([15970.21582031, 15887.58984375, 16116.35449219, ...,
       35987.32421875, 34214.22265625, 42065.0078125 ])

In [45]:
eval_metrics(y, results)

 r2_score: 0.9226945994208223 
 mae: 1257.1303233854658 
 mse: 4358154.293919976 
 rmse: 2087.6192885485552


In [46]:
test_data["predicted_price"] = results

In [47]:
test_data.sample(10, random_state=41)

Unnamed: 0,price,age,hp_kW,km,Gearing_Type,make_model,predicted_price
1846,13990,1.0,66.0,12099.0,Automatic,Opel Corsa,13578.894531
1671,12500,3.0,66.0,52000.0,Automatic,Opel Corsa,10880.681641
1497,13900,1.0,81.0,13149.0,Manual,Opel Astra,14091.145508
149,15950,3.0,66.0,53900.0,Semi-automatic,Audi A1,16133.251953
2352,16800,1.0,103.0,23401.0,Manual,Opel Insignia,18953.675781
3007,25490,2.0,147.0,49606.0,Automatic,Renault Espace,26104.083984
2114,15980,3.0,125.0,88346.0,Automatic,Opel Insignia,16086.40625
1805,8850,2.0,70.0,89000.0,Manual,Opel Corsa,8797.889648
1304,17400,2.0,110.0,62000.0,Automatic,Opel Astra,15223.293945
1256,10980,3.0,100.0,107791.0,Manual,Opel Astra,10500.507812


# Prediction using Endpoint Name (Method-2):

In [48]:
endpoint_name = predictor.endpoint_name
endpoint_name

'sagemaker-xgboost-2023-05-06-09-23-12-136'

In [50]:
# Reading and preparing the test data as payload for prediction
with open('test.csv', 'r') as f:
    payload = f.read().strip('\n')

In [51]:
# checking the content of payload
payload

'0.0,0.0,2.0,141.0,80000.0\n0.0,0.0,3.0,66.0,16200.0\n0.0,0.0,3.0,85.0,63668.0\n0.0,0.0,2.0,70.0,16103.0\n0.0,0.0,3.0,92.0,26415.0\n0.0,0.0,3.0,112.0,45764.0\n0.0,0.0,1.0,85.0,9752.0\n0.0,0.0,3.0,92.0,15850.0\n1.0,0.0,3.0,60.0,43120.0\n1.0,0.0,3.0,66.0,30500.0\n1.0,0.0,2.0,70.0,69500.0\n1.0,0.0,3.0,66.0,99000.0\n1.0,0.0,3.0,66.0,130000.0\n0.0,0.0,2.0,70.0,5108.0\n0.0,0.0,1.0,70.0,6396.0\n0.0,0.0,3.0,92.0,30952.0\n0.0,0.0,3.0,92.0,33991.0\n0.0,0.0,3.0,92.0,41900.0\n0.0,0.0,2.0,71.0,51484.0\n0.0,0.0,1.0,85.0,16500.0\n0.0,0.0,1.0,85.0,10013.0\n0.0,0.0,1.0,70.0,6695.0\n0.0,0.0,3.0,92.0,21275.0\n0.0,0.0,2.0,70.0,19800.0\n0.0,0.0,3.0,85.0,52000.0\n0.0,0.0,2.0,66.0,65000.0\n0.0,0.0,1.0,70.0,4800.0\n0.0,0.0,1.0,70.0,5500.0\n0.0,0.0,2.0,92.0,40000.0\n0.0,0.0,1.0,85.0,15000.0\n0.0,0.0,3.0,92.0,72624.0\n0.0,0.0,3.0,66.0,13960.0\n0.0,0.0,2.0,70.0,22980.0\n0.0,0.0,3.0,66.0,57926.0\n0.0,0.0,3.0,85.0,29800.0\n0.0,0.0,1.0,70.0,8600.0\n0.0,0.0,1.0,70.0,9463.0\n0.0,0.0,3.0,85.0,36709.0\n0.0,0.0,2.0,70.0

In [52]:
# payload="0.0,0.0,2.0,141.0,80000.0"     # by this one you can try a single value to be predicted

In [53]:
# Create sagemaker runtime object to send request to endpoint and get response
sagemaker_runtime = boto3.client("sagemaker-runtime", region_name=boto3.Session().region_name)

# Invoke endpoint to get prediction results
response = sagemaker_runtime.invoke_endpoint(
                            EndpointName=endpoint_name,  # endpoint name we created (should be unique)
                            Body=payload, # prediction data
                            ContentType = 'text/csv')  # type of data

# Optional - to see prediction result
# print(response['Body'].read().decode('utf-8'))

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


In [54]:
# The prediction result is in json format and "Body" key contains the prediction values.
 
results2 = response['Body'].read()   # results2 = response['Body'].read().decode('utf-8')

In [55]:
#results2

In [56]:
# prediction results(probability values) as numpy array
results2 = np.fromstring(results2, sep='\n')

In [57]:
results2

array([15970.21582031, 15887.58984375, 16116.35449219, ...,
       35987.32421875, 34214.22265625, 42065.0078125 ])

In [58]:
results2.shape

(3183,)

In [59]:
y.shape

(3183,)

In [60]:
eval_metrics(y, results2)

 r2_score: 0.9226945994208223 
 mae: 1257.1303233854658 
 mse: 4358154.293919976 
 rmse: 2087.6192885485552


In [61]:
test_data["predicted_price2"] = results2

In [62]:
test_data.sample(10, random_state=41)

Unnamed: 0,price,age,hp_kW,km,Gearing_Type,make_model,predicted_price,predicted_price2
1846,13990,1.0,66.0,12099.0,Automatic,Opel Corsa,13578.894531,13578.894531
1671,12500,3.0,66.0,52000.0,Automatic,Opel Corsa,10880.681641,10880.681641
1497,13900,1.0,81.0,13149.0,Manual,Opel Astra,14091.145508,14091.145508
149,15950,3.0,66.0,53900.0,Semi-automatic,Audi A1,16133.251953,16133.251953
2352,16800,1.0,103.0,23401.0,Manual,Opel Insignia,18953.675781,18953.675781
3007,25490,2.0,147.0,49606.0,Automatic,Renault Espace,26104.083984,26104.083984
2114,15980,3.0,125.0,88346.0,Automatic,Opel Insignia,16086.40625,16086.40625
1805,8850,2.0,70.0,89000.0,Manual,Opel Corsa,8797.889648,8797.889648
1304,17400,2.0,110.0,62000.0,Automatic,Opel Astra,15223.293945,15223.293945
1256,10980,3.0,100.0,107791.0,Manual,Opel Astra,10500.507812,10500.507812


# json info:

In [63]:
import json


event = {
  "data": "0.0,0.0,2.0,141.0,80000.0"
}

data = json.loads(json.dumps(event))

payload = data["data"]
payload

'0.0,0.0,2.0,141.0,80000.0'

In [64]:
type(event)

dict

In [65]:
type(payload)

str

In [66]:
my_dict = {
    "age": 2,
    "hp_kW": 141,
    "km": 80000,
    'Gearing_Type':'Automatic',
    "make_model": 'Audi A1'}

In [67]:
df = pd.DataFrame.from_dict([my_dict])

In [68]:
df2 = column_trans.transform(df)

In [69]:
df2

Unnamed: 0,Gearing_Type,make_model,age,hp_kW,km
0,0.0,0.0,2,141,80000


In [70]:
df2.to_csv('df2.csv', index=False, header=False)

In [71]:
with open('df2.csv', 'r') as f:
    payload = f.read().strip('\n')

In [72]:
payload

'0.0,0.0,2,141,80000'

In [73]:
event = {
  "data": payload
  
}

In [74]:
event

{'data': '0.0,0.0,2,141,80000'}