# Format Dataset

In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [34]:
df_encoded = pd.read_csv("./Hotel_Reservations.csv")

In [35]:
df_encoded.drop('Booking_ID', axis=1, inplace=True)

In [36]:
def def_label(price):
    if price <= 85:
        return 1
    elif price >85 and price <= 115:
        return 2
    elif price > 115:
        return 3

In [37]:
df_encoded['label_avg_price_per_room'] = df_encoded['avg_price_per_room'].apply(def_label)

In [38]:
df_encoded.drop('avg_price_per_room', axis=1, inplace=True)

In [39]:
df_encoded = pd.get_dummies(df_encoded, columns=['type_of_meal_plan', 'room_type_reserved', 'market_segment_type','booking_status'])

In [40]:
df_encoded.columns

Index(['no_of_adults', 'no_of_children', 'no_of_weekend_nights',
       'no_of_week_nights', 'required_car_parking_space', 'lead_time',
       'arrival_year', 'arrival_month', 'arrival_date', 'repeated_guest',
       'no_of_previous_cancellations', 'no_of_previous_bookings_not_canceled',
       'no_of_special_requests', 'label_avg_price_per_room',
       'type_of_meal_plan_Meal Plan 1', 'type_of_meal_plan_Meal Plan 2',
       'type_of_meal_plan_Meal Plan 3', 'type_of_meal_plan_Not Selected',
       'room_type_reserved_Room_Type 1', 'room_type_reserved_Room_Type 2',
       'room_type_reserved_Room_Type 3', 'room_type_reserved_Room_Type 4',
       'room_type_reserved_Room_Type 5', 'room_type_reserved_Room_Type 6',
       'room_type_reserved_Room_Type 7', 'market_segment_type_Aviation',
       'market_segment_type_Complementary', 'market_segment_type_Corporate',
       'market_segment_type_Offline', 'market_segment_type_Online',
       'booking_status_Canceled', 'booking_status_Not_Cancel

In [41]:
X= df_encoded.drop('label_avg_price_per_room', axis= 1)

In [42]:
Y= df_encoded['label_avg_price_per_room']

In [43]:
X = np.array(X).astype('int')
Y = np.array(Y).astype('int')

In [44]:
from sklearn.model_selection import train_test_split

In [45]:
x_train, x_test, y_train, y_test = train_test_split(X,Y, test_size= 0.3, random_state=True)

In [46]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.fit_transform(y_test)

In [47]:
train_base = np.hstack(( y_train.reshape(-1, 1),x_train))
test_base = np.hstack(( y_test.reshape(-1, 1),x_test))

In [48]:
np.savetxt('train_base.csv', train_base, delimiter=',')
np.savetxt('test_base.csv', test_base, delimiter=',')

In [49]:
train_base

array([[1, 1, 0, ..., 0, 1, 0],
       [0, 2, 0, ..., 1, 0, 1],
       [0, 2, 0, ..., 0, 0, 1],
       ...,
       [2, 1, 0, ..., 1, 1, 0],
       [2, 2, 2, ..., 1, 0, 1],
       [1, 2, 0, ..., 1, 1, 0]])

# Config AWS SageMaker

In [50]:
import sagemaker
import boto3
from sagemaker import Session
import sagemaker.amazon.common as smac
import io
import os

In [51]:
session       = sagemaker.Session()
bucket        = 'sprint05-project'
layer_model   = 'models/xgboost'
layer_dataset = 'datasets/xgboost'
key_train     = 'sprint05-project-xgb-train-data'
key_test      = 'sprint05-project-xgb-test-data'
role          = sagemaker.get_execution_role()
s3_train_data = 's3://{}/{}/train/{}'.format(bucket, layer_dataset, key_train) 
s3_test_data = 's3://{}/{}/test/{}'.format(bucket, layer_dataset, key_test) 
output_lction = 's3://{}/{}/output'.format(bucket, layer_model)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


In [52]:
with open('train_base.csv', 'rb') as f:
    boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(layer_dataset, 'train', key_train)).upload_fileobj(f)

In [53]:
with open('test_base.csv', 'rb') as f:
    boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(layer_dataset, 'test', key_test)).upload_fileobj(f)

# Model Training

In [54]:
from sagemaker import image_uris
container = sagemaker.image_uris.retrieve(framework = 'xgboost', region=boto3.Session().region_name, version='latest')

In [55]:
xgboost = sagemaker.estimator.Estimator(image_uri     = container,
                                       role           = role,
                                       instance_count = 1,
                                       instance_type  = 'ml.m5.2xlarge',
                                       output_path    = output_lction,
                                       sagemaker_session = session,
                                       use_stop_instances = True)

In [56]:
xgboost.set_hyperparameters(objective = 'multi:softmax', num_class = 3, num_round = 100, max_depth=72)

In [57]:
train_input = sagemaker.inputs.TrainingInput(s3_data = s3_train_data, content_type='csv', s3_data_type = 'S3Prefix')
validation_input = sagemaker.inputs.TrainingInput(s3_data = s3_test_data, content_type='csv', s3_data_type = 'S3Prefix')
data_channels = {'train': train_input, 'validation': validation_input}

In [58]:
xgboost.fit(data_channels)

INFO:sagemaker:Creating training-job with name: xgboost-2023-09-13-20-06-38-703


2023-09-13 20:06:38 Starting - Starting the training job...
2023-09-13 20:06:56 Starting - Preparing the instances for training......
2023-09-13 20:07:57 Downloading - Downloading input data...
2023-09-13 20:08:32 Training - Training image download completed. Training in progress...[34mArguments: train[0m
[34m[2023-09-13:20:08:54:INFO] Running standalone xgboost training.[0m
[34m[2023-09-13:20:08:54:INFO] File size need to be processed in the node: 27.68mb. Available memory size in the node: 23627.5mb[0m
[34m[2023-09-13:20:08:54:INFO] Determined delimiter of CSV input is ','[0m
[34m[20:08:54] S3DistributionType set as FullyReplicated[0m
[34m[20:08:54] 25392x31 matrix with 787152 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2023-09-13:20:08:54:INFO] Determined delimiter of CSV input is ','[0m
[34m[20:08:54] S3DistributionType set as FullyReplicated[0m
[34m[20:08:54] 10883x31 matrix with 337373 entries loaded from /opt/ml/inp

# Deploy / Evaluation

In [59]:
xgboost = xgboost.deploy(initial_instance_count = 1, instance_type = 'ml.m4.xlarge')

INFO:sagemaker:Creating model with name: xgboost-2023-09-13-20-11-33-301
INFO:sagemaker:Creating endpoint-config with name xgboost-2023-09-13-20-11-33-301
INFO:sagemaker:Creating endpoint with name xgboost-2023-09-13-20-11-33-301


-------!

In [60]:
from sagemaker.serializers import CSVSerializer
xgboost.serializer = CSVSerializer()

In [62]:
y_pred = np.array(xgboost.predict(x_test).decode('utf-8').split(',')).astype(np.float32)

In [63]:
from sklearn.metrics import classification_report, accuracy_score

In [66]:
print((accuracy_score(y_test, y_pred)))

0.8716346595607829


In [68]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.87      0.88      3299
           1       0.85      0.85      0.85      4326
           2       0.88      0.90      0.89      3258

    accuracy                           0.87     10883
   macro avg       0.87      0.87      0.87     10883
weighted avg       0.87      0.87      0.87     10883

