In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [16]:
data = pd.read_csv("hotel_bookings.csv")
data.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


In [17]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 32 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  object 
 1   is_canceled                     119390 non-null  int64  
 2   lead_time                       119390 non-null  int64  
 3   arrival_date_year               119390 non-null  int64  
 4   arrival_date_month              119390 non-null  object 
 5   arrival_date_week_number        119390 non-null  int64  
 6   arrival_date_day_of_month       119390 non-null  int64  
 7   stays_in_weekend_nights         119390 non-null  int64  
 8   stays_in_week_nights            119390 non-null  int64  
 9   adults                          119390 non-null  int64  
 10  children                        119386 non-null  float64
 11  babies                          119390 non-null  int64  
 12  meal            

In [18]:
# Separate features and label
# features: drop "reservation_status" (same as "is_canceled"), "company" (too many missing values)
features = data.drop(["is_canceled", "reservation_status", "company"], axis=1)
label = data["is_canceled"]

In [19]:
# Split into training set and test set 
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size = 0.2, random_state = 42)

In [35]:
# Get categorical columns and numerical columns
cat_col = X_train.select_dtypes(include = ["object"]).columns
num_col = X_train.select_dtypes(include = ["int64", "float64"]).columns

In [34]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [37]:
# Create ColumnTransformer
cat_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy = 'constant', fill_value = 'missing')),
    ('onehot', OneHotEncoder(handle_unknown = 'ignore'))])

num_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy = 'median')),
    ('scaler', StandardScaler())])

preprocessor = ColumnTransformer(
    transformers = [
        ('cat', cat_transformer, cat_col),
        ('num', num_transformer, num_col)])

In [38]:
X_train_fit = preprocessor.fit_transform(X_train)
X_train_fit.shape

(95512, 1172)

In [48]:
# Create a Multi-Layer Perceptron Classifier object
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(hidden_layer_sizes = (30, 20, 10),max_iter = 200, early_stopping = True, verbose = 10)

In [49]:
# Create a Pipeline to integrate preprocessor and mlp
dnn = Pipeline(steps = [
    ('preprocessor', preprocessor),
    ('classifier', mlp)
])

In [50]:
# Fit the pipeline on the training data
dnn.fit(X_train, y_train)

Iteration 1, loss = 0.38443986
Validation score: 0.862751
Iteration 2, loss = 0.26789038
Validation score: 0.878874
Iteration 3, loss = 0.22245682
Validation score: 0.907245
Iteration 4, loss = 0.17614818
Validation score: 0.922111
Iteration 5, loss = 0.13045371
Validation score: 0.946608
Iteration 6, loss = 0.10148713
Validation score: 0.952680
Iteration 7, loss = 0.08136763
Validation score: 0.961055
Iteration 8, loss = 0.06826212
Validation score: 0.961055
Iteration 9, loss = 0.05455518
Validation score: 0.962521
Iteration 10, loss = 0.04823170
Validation score: 0.965243
Iteration 11, loss = 0.04227386
Validation score: 0.966394
Iteration 12, loss = 0.03645475
Validation score: 0.965662
Iteration 13, loss = 0.03443089
Validation score: 0.969849
Iteration 14, loss = 0.02899943
Validation score: 0.968907
Iteration 15, loss = 0.02809013
Validation score: 0.967023
Iteration 16, loss = 0.02338302
Validation score: 0.966813
Iteration 17, loss = 0.02180715
Validation score: 0.968279
Iterat

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('cat',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value='missing',
                                                                                 missing_values=nan,
                                                                                 strategy='constant',
                                                      

In [51]:
# Make predictions on test data
y_pred = dnn.predict(X_test)

In [52]:
# Calculate prediction accuracy on test data
test_acc = dnn.score(X_test, y_test)
test_acc

0.9722757349861797