---
# 1. Feature Engineering
## 1.1 Import

In [1]:
########################################
#       importing packages
########################################
import pandas            as pd                       # data science essentials
import matplotlib.pyplot as plt                      # data visualization
import seaborn           as sns                      # enhanced data viz
import numpy             as np                       # import np
from sklearn.model_selection import train_test_split # train-test split
from sklearn.tree import DecisionTreeRegressor       # regression trees
from sklearn.ensemble import RandomForestRegressor   # Random Forest models
from sklearn.ensemble import GradientBoostingRegressor # Gradient Boosting Model
from sklearn.linear_model import LogisticRegression  # logistic regression
import statsmodels.formula.api as smf                # logistic regression
from sklearn.metrics import confusion_matrix         # confusion matrix
from sklearn.metrics import roc_auc_score            # auc score
from sklearn.neighbors import KNeighborsClassifier   # KNN for classification
from sklearn.neighbors import KNeighborsRegressor    # KNN for regression
from sklearn.preprocessing import StandardScaler     # standard scaler
from sklearn.tree import DecisionTreeClassifier      # classification trees
from sklearn.tree import plot_tree                   # tree plots
from sklearn.model_selection import RandomizedSearchCV # hyperparameter tuning
from sklearn.ensemble import GradientBoostingClassifier # gbm
from sklearn.ensemble import AdaBoostClassifier      # Ada model
from sklearn.ensemble import RandomForestClassifier  # Randome Forest


## 1.2 General Information
### Train data

In [2]:
# importing the training dataset
path             = "./__datasets/"
training_dataset = "train.csv"


# reading in the .csv file with pandas
booking_train    = pd.read_csv(filepath_or_buffer = path + training_dataset)

# checking basic info about the dataset
booking_train.info(verbose = True)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42100 entries, 0 to 42099
Data columns (total 19 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   id                                    42100 non-null  int64  
 1   no_of_adults                          42100 non-null  int64  
 2   no_of_children                        42100 non-null  int64  
 3   no_of_weekend_nights                  42100 non-null  int64  
 4   no_of_week_nights                     42100 non-null  int64  
 5   type_of_meal_plan                     42100 non-null  int64  
 6   required_car_parking_space            42100 non-null  int64  
 7   room_type_reserved                    42100 non-null  int64  
 8   lead_time                             42100 non-null  int64  
 9   arrival_year                          42100 non-null  int64  
 10  arrival_month                         42100 non-null  int64  
 11  arrival_date   

### Test data

In [3]:
# importing the training dataset
path             = "./__datasets/"
training_dataset = "test.csv"


# reading in the .csv file with pandas
booking_test    = pd.read_csv(filepath_or_buffer = path + training_dataset)

# checking basic info about the dataset
booking_test.info(verbose = True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28068 entries, 0 to 28067
Data columns (total 18 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   id                                    28068 non-null  int64  
 1   no_of_adults                          28068 non-null  int64  
 2   no_of_children                        28068 non-null  int64  
 3   no_of_weekend_nights                  28068 non-null  int64  
 4   no_of_week_nights                     28068 non-null  int64  
 5   type_of_meal_plan                     28068 non-null  int64  
 6   required_car_parking_space            28068 non-null  int64  
 7   room_type_reserved                    28068 non-null  int64  
 8   lead_time                             28068 non-null  int64  
 9   arrival_year                          28068 non-null  int64  
 10  arrival_month                         28068 non-null  int64  
 11  arrival_date   

## 1.3 Combine two datasets

In [4]:
booking_train['set'] = 'Training'
booking_test ['set'] = 'Testing'

# concatenating both datasets together for mv and feature engineering
booking_df = booking_train.append(other = booking_test)

# resetting index to avoid problems later in the code
booking_df.reset_index(drop = False,
                       inplace = True)

  booking_df = booking_train.append(other = booking_test)


In [5]:
booking_df.head(n=100)

Unnamed: 0,index,id,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,...,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status,set
0,0,0,2,0,0,2,1,0,0,9,...,1,14,1,1,11,0,67.50,0,0.0,Training
1,1,1,2,0,1,2,0,0,0,117,...,7,29,0,0,0,0,72.25,0,0.0,Training
2,2,2,2,0,0,1,0,0,0,315,...,12,2,0,0,0,0,52.00,0,0.0,Training
3,3,3,1,0,0,2,1,0,0,32,...,12,1,1,0,0,0,56.00,0,0.0,Training
4,4,4,2,0,1,0,0,0,0,258,...,10,16,0,0,0,0,100.00,0,1.0,Training
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,95,95,2,0,1,2,0,0,0,198,...,5,9,0,0,0,0,90.00,0,1.0,Training
96,96,96,2,0,1,4,0,0,0,138,...,6,29,1,0,0,0,90.95,0,0.0,Training
97,97,97,2,0,1,2,0,0,0,107,...,10,7,1,0,0,0,118.80,1,0.0,Training
98,98,98,1,0,1,3,0,0,0,61,...,10,19,1,0,0,0,80.75,2,0.0,Training


## 1.4 Correction 

In [6]:
# instantiating a correlation matrix
booking_df_corr = booking_train.corr(method = 'pearson').round(decimals = 2)

# transforming correlations to absolute values
booking_df_corr.loc[ : , 'booking_status' ].apply(func = abs).sort_values(ascending = False)

booking_status                          1.00
lead_time                               0.37
no_of_special_requests                  0.22
arrival_year                            0.18
avg_price_per_room                      0.16
repeated_guest                          0.14
market_segment_type                     0.11
required_car_parking_space              0.09
no_of_adults                            0.08
no_of_previous_bookings_not_canceled    0.08
no_of_week_nights                       0.06
type_of_meal_plan                       0.05
no_of_previous_cancellations            0.05
no_of_weekend_nights                    0.04
room_type_reserved                      0.02
arrival_month                           0.01
id                                      0.01
arrival_date                            0.00
no_of_children                          0.00
Name: booking_status, dtype: float64

## 1.5 Cleaning data

In [7]:
# INFOrmation about each variable
booking_df.info(verbose = True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70168 entries, 0 to 70167
Data columns (total 21 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   index                                 70168 non-null  int64  
 1   id                                    70168 non-null  int64  
 2   no_of_adults                          70168 non-null  int64  
 3   no_of_children                        70168 non-null  int64  
 4   no_of_weekend_nights                  70168 non-null  int64  
 5   no_of_week_nights                     70168 non-null  int64  
 6   type_of_meal_plan                     70168 non-null  int64  
 7   required_car_parking_space            70168 non-null  int64  
 8   room_type_reserved                    70168 non-null  int64  
 9   lead_time                             70168 non-null  int64  
 10  arrival_year                          70168 non-null  int64  
 11  arrival_month  

In [8]:
# taking the cooking dataset
# and then
# transforming it into boolean based on if a value is null
# and then
# summing together the results per column
booking_df.isnull().sum(axis = 0)

index                                       0
id                                          0
no_of_adults                                0
no_of_children                              0
no_of_weekend_nights                        0
no_of_week_nights                           0
type_of_meal_plan                           0
required_car_parking_space                  0
room_type_reserved                          0
lead_time                                   0
arrival_year                                0
arrival_month                               0
arrival_date                                0
market_segment_type                         0
repeated_guest                              0
no_of_previous_cancellations                0
no_of_previous_bookings_not_canceled        0
avg_price_per_room                          0
no_of_special_requests                      0
booking_status                          28068
set                                         0
dtype: int64

In [9]:
# instantiating an imputation value
fill = 0

# imputing 'FAMILY_NAME'
booking_df = booking_df.fillna(fill)

In [10]:
# making sure all missing values have been taken care of
booking_df.isnull().any().any()

False

### Shape the dataframe

In [11]:
class BookingDataProcessor:
    def __init__(self, data):
        self.data = data
    
    @staticmethod
    def fe(df):
        # Fix date anomalies (pd.to_datetime throws parsing error for some days, see anomalies section).
        df['year_month'] = pd.to_datetime(df[['arrival_year', 'arrival_month']].astype(str).sum(1), format='%Y%m')
        df.loc[df.arrival_date > df.year_month.dt.days_in_month, 'arrival_date'] = df.year_month.dt.days_in_month
        df.drop(columns='year_month', inplace=True)
        return df

    def process_data(self):
        self.data = self.fe(self.data)
        return self.data

# Create an instance of BookingDataProcessor and process the data
data_processor = BookingDataProcessor(booking_df)
booking_df = data_processor.process_data()

# Print the shape of the processed dataframe
print(f'Shape after Feature Engineering Phase: {booking_df.shape}')


Shape after Feature Engineering Phase: (70168, 21)


## 1.6 One-hot encoding

In [12]:
# printing columns
print(f"""
Type of meal plan
------
{booking_df['type_of_meal_plan'].value_counts()}


Type of market segment
----------
{booking_df['market_segment_type'].value_counts()} 


Type of Room
-------------
{booking_df['room_type_reserved'].value_counts()}

Arrival Year
-------------
{booking_df['arrival_year'].value_counts()}

""")


Type of meal plan
------
0    59184
2     5643
1     5334
3        7
Name: type_of_meal_plan, dtype: int64


Type of market segment
----------
1    43054
0    23896
2     2378
4      716
3      124
Name: market_segment_type, dtype: int64 


Type of Room
-------------
0    49656
1    15371
3     2433
2     1844
4      544
5      308
6       12
Name: room_type_reserved, dtype: int64

Arrival Year
-------------
2018    60096
2017    10072
Name: arrival_year, dtype: int64




### Encoding and drop original columns

In [13]:
# one hot encoding categorical variables
one_hot_type_of_meal_plan        = pd.get_dummies(booking_df['type_of_meal_plan'],prefix = 'type_meal_plan_')
one_hot_market_segment_type      = pd.get_dummies(booking_df['market_segment_type'],prefix = 'type_market_segment_')
one_hot_room_type_reserved       = pd.get_dummies(booking_df['room_type_reserved'],prefix = 'type_room_reserved_')
one_hot_arrival_year             = pd.get_dummies(booking_df['arrival_year'],prefix = 'arrival_year_')

# dropping categorical variables after they've been encoded
#booking_df = booking_df.drop('type_of_meal_plan', axis = 1) 
#booking_df = booking_df.drop('market_segment_type', axis = 1)
#booking_df = booking_df.drop('room_type_reserved', axis = 1)
#booking_df = booking_df.drop('arrival_year', axis = 1)

# joining codings together
booking_df = booking_df.join([one_hot_type_of_meal_plan,one_hot_market_segment_type,one_hot_room_type_reserved,one_hot_arrival_year])


# saving new columns
new_columns= booking_df.columns

In [14]:
# checking results
booking_df.head(n = 5)

Unnamed: 0,index,id,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,...,type_market_segment__4,type_room_reserved__0,type_room_reserved__1,type_room_reserved__2,type_room_reserved__3,type_room_reserved__4,type_room_reserved__5,type_room_reserved__6,arrival_year__2017,arrival_year__2018
0,0,0,2,0,0,2,1,0,0,9,...,0,1,0,0,0,0,0,0,0,1
1,1,1,2,0,1,2,0,0,0,117,...,0,1,0,0,0,0,0,0,0,1
2,2,2,2,0,0,1,0,0,0,315,...,0,1,0,0,0,0,0,0,0,1
3,3,3,1,0,0,2,1,0,0,32,...,0,1,0,0,0,0,0,0,0,1
4,4,4,2,0,1,0,0,0,0,258,...,0,1,0,0,0,0,0,0,0,1


## 1.7 Range
### Range of lead time

In [15]:
# name the range of rating
booking_df['Range_of_LeadTime']      = 0
for index, value in booking_df.iterrows():
    
    # Conditions for ranges
    #range4
    if booking_df.loc[index, 'lead_time'] > 150:
        booking_df.loc[index, 'Range_of_LeadTime'] = 3  
    #range2     
    elif booking_df.loc[index, 'lead_time'] > 50:
        booking_df.loc[index, 'Range_of_LeadTime'] = 2
     #range1     
    elif booking_df.loc[index, 'lead_time'] >= 0:
        booking_df.loc[index, 'Range_of_LeadTime'] = 1
    # make else      
    else:
        booking_df.loc[index, 'Range_of_LeadTime'] = 'Error'
        
        
# checking results
booking_df["Range_of_LeadTime"].value_counts(normalize = True,
                                       sort      = True,
                                       ascending = False).round(decimals = 2)

2    0.42
1    0.31
3    0.27
Name: Range_of_LeadTime, dtype: float64

### Range of arrival month

In [16]:
# name the range of rating
booking_df['Range_of_ArrivalMonth']      = 0
for index, value in booking_df.iterrows():
    
    # Conditions for ranges
    #range4
    if booking_df.loc[index, 'arrival_month'] > 9:
        booking_df.loc[index, 'Range_of_ArrivalMonth'] = 4  
    #range3     
    elif booking_df.loc[index, 'arrival_month'] > 6:
        booking_df.loc[index, 'Range_of_ArrivalMonth'] = 3
     #range2     
    elif booking_df.loc[index, 'arrival_month'] > 3:
        booking_df.loc[index, 'Range_of_ArrivalMonth'] = 2       
     #range1     
    elif booking_df.loc[index, 'arrival_month'] >= 1:
        booking_df.loc[index, 'Range_of_ArrivalMonth'] = 1
    # make else      
    else:
        booking_df.loc[index, 'Range_of_ArrivalMonth'] = 'Error'
        
        
# checking results
booking_df["Range_of_ArrivalMonth"].value_counts(normalize = True,
                                       sort      = True,
                                       ascending = False).round(decimals = 2)

3    0.37
4    0.30
2    0.23
1    0.10
Name: Range_of_ArrivalMonth, dtype: float64

### Range of arrival date

In [17]:
# name the range of rating
booking_df['Range_of_ArrivalDate']      = 0
for index, value in booking_df.iterrows():
    
    # Conditions for ranges
    #range4
    if booking_df.loc[index, 'arrival_date'] > 20:
        booking_df.loc[index, 'Range_of_ArrivalDate'] = 3  
     #range2     
    elif booking_df.loc[index, 'arrival_date'] > 10:
        booking_df.loc[index, 'Range_of_ArrivalDate'] = 2       
     #range1     
    elif booking_df.loc[index, 'arrival_date'] >= 1:
        booking_df.loc[index, 'Range_of_ArrivalDate'] = 1
    # make else      
    else:
        booking_df.loc[index, 'Range_of_ArrivalDate'] = 'Error'
        
        
# checking results
booking_df["Range_of_ArrivalDate"].value_counts(normalize = True,
                                       sort      = True,
                                       ascending = False).round(decimals = 2)

3    0.35
1    0.33
2    0.32
Name: Range_of_ArrivalDate, dtype: float64

### Range of room price

In [18]:
# name the range of rating
booking_df['Range_of_RoomPrice']      = 0
for index, value in booking_df.iterrows():
    
    # Conditions for ranges
    #range4
    if booking_df.loc[index, 'avg_price_per_room'] > 200:
        booking_df.loc[index, 'Range_of_RoomPrice'] = 4 
     #range3     
    elif booking_df.loc[index, 'avg_price_per_room'] > 100:
        booking_df.loc[index, 'Range_of_RoomPrice'] = 3           
     #range2     
    elif booking_df.loc[index, 'avg_price_per_room'] > 1:
        booking_df.loc[index, 'Range_of_RoomPrice'] = 2       
     #range1     
    elif booking_df.loc[index, 'avg_price_per_room'] >= 0:
        booking_df.loc[index, 'Range_of_RoomPrice'] = 1
    # make else      
    else:
        booking_df.loc[index, 'Range_of_RoomPrice'] = 'Error'
        
        
# checking results
booking_df["Range_of_RoomPrice"].value_counts(normalize = True,
                                       sort      = True,
                                       ascending = False).round(decimals = 2)

2    0.50
3    0.47
4    0.02
1    0.02
Name: Range_of_RoomPrice, dtype: float64

---
# 2. Model Preparation

In [19]:
for val in booking_df:
    print(f" {val} + ")

 index + 
 id + 
 no_of_adults + 
 no_of_children + 
 no_of_weekend_nights + 
 no_of_week_nights + 
 type_of_meal_plan + 
 required_car_parking_space + 
 room_type_reserved + 
 lead_time + 
 arrival_year + 
 arrival_month + 
 arrival_date + 
 market_segment_type + 
 repeated_guest + 
 no_of_previous_cancellations + 
 no_of_previous_bookings_not_canceled + 
 avg_price_per_room + 
 no_of_special_requests + 
 booking_status + 
 set + 
 type_meal_plan__0 + 
 type_meal_plan__1 + 
 type_meal_plan__2 + 
 type_meal_plan__3 + 
 type_market_segment__0 + 
 type_market_segment__1 + 
 type_market_segment__2 + 
 type_market_segment__3 + 
 type_market_segment__4 + 
 type_room_reserved__0 + 
 type_room_reserved__1 + 
 type_room_reserved__2 + 
 type_room_reserved__3 + 
 type_room_reserved__4 + 
 type_room_reserved__5 + 
 type_room_reserved__6 + 
 arrival_year__2017 + 
 arrival_year__2018 + 
 Range_of_LeadTime + 
 Range_of_ArrivalMonth + 
 Range_of_ArrivalDate + 
 Range_of_RoomPrice + 


In [20]:
# instantiating a logistic regression model object
logistic_full = smf.logit(formula = """  booking_status ~
                                          id + 
 no_of_adults + 
 no_of_children + 
 no_of_weekend_nights + 


 required_car_parking_space + 

 arrival_year__2018 + 
 lead_time +  
 arrival_month + 

 repeated_guest + 
 no_of_previous_cancellations + 
 avg_price_per_room + 
 no_of_special_requests+

 type_meal_plan__1 + 
 type_meal_plan__2 + 


 type_market_segment__1 + 
 type_market_segment__2 + 
 type_market_segment__3 + 


 type_room_reserved__1 + 
 type_room_reserved__2 + 
 type_room_reserved__3 + 
 type_room_reserved__4 + 
 type_room_reserved__5 + 
 Range_of_LeadTime + 
 Range_of_ArrivalMonth + 
 Range_of_ArrivalDate + 
 Range_of_RoomPrice                                        
                                         
 """,
                                        data    = booking_df)


# fitting the model object
results_full = logistic_full.fit()


# checking the results SUMMARY
results_full.summary2()

Optimization terminated successfully.
         Current function value: 0.353618
         Iterations 10


0,1,2,3
Model:,Logit,Pseudo R-squared:,0.352
Dependent Variable:,booking_status,AIC:,49679.354
Date:,2023-05-25 17:31,BIC:,49926.6374
No. Observations:,70168,Log-Likelihood:,-24813.0
Df Model:,26,LL-Null:,-38276.0
Df Residuals:,70141,LLR p-value:,0.0
Converged:,1.0000,Scale:,1.0
No. Iterations:,10.0000,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,-4.1363,0.1022,-40.4552,0.0000,-4.3367,-3.9359
id,-0.0001,0.0000,-98.2690,0.0000,-0.0001,-0.0001
no_of_adults,-0.0490,0.0255,-1.9215,0.0547,-0.0990,0.0010
no_of_children,0.0319,0.0383,0.8340,0.4043,-0.0431,0.1070
no_of_weekend_nights,0.0307,0.0129,2.3734,0.0176,0.0053,0.0560
required_car_parking_space,-1.5788,0.1116,-14.1493,0.0000,-1.7975,-1.3601
arrival_year__2018,0.2959,0.0445,6.6502,0.0000,0.2087,0.3831
lead_time,0.0031,0.0003,10.4593,0.0000,0.0025,0.0037
arrival_month,-0.1233,0.0143,-8.6388,0.0000,-0.1512,-0.0953


arrival_date	
no_of_previous_bookings_not_canceled	

---
# 4. Model Development

In [21]:
for val in booking_df:
    print(f" '{val}',  ")

 'index',  
 'id',  
 'no_of_adults',  
 'no_of_children',  
 'no_of_weekend_nights',  
 'no_of_week_nights',  
 'type_of_meal_plan',  
 'required_car_parking_space',  
 'room_type_reserved',  
 'lead_time',  
 'arrival_year',  
 'arrival_month',  
 'arrival_date',  
 'market_segment_type',  
 'repeated_guest',  
 'no_of_previous_cancellations',  
 'no_of_previous_bookings_not_canceled',  
 'avg_price_per_room',  
 'no_of_special_requests',  
 'booking_status',  
 'set',  
 'type_meal_plan__0',  
 'type_meal_plan__1',  
 'type_meal_plan__2',  
 'type_meal_plan__3',  
 'type_market_segment__0',  
 'type_market_segment__1',  
 'type_market_segment__2',  
 'type_market_segment__3',  
 'type_market_segment__4',  
 'type_room_reserved__0',  
 'type_room_reserved__1',  
 'type_room_reserved__2',  
 'type_room_reserved__3',  
 'type_room_reserved__4',  
 'type_room_reserved__5',  
 'type_room_reserved__6',  
 'arrival_year__2017',  
 'arrival_year__2018',  
 'Range_of_LeadTime',  
 'Range_of_

In [22]:
# explanatory sets from last session

# creating a dictionary to store candidate models

candidate_dict = {

 # the final x variables set
 'x_var'   : [   'id',  
 'no_of_adults',  
 'no_of_children',  
 'no_of_weekend_nights',  
 'no_of_week_nights',
 'required_car_parking_space',  
 'lead_time',  
 'arrival_year__2017',  
 'arrival_year__2018',             
 'arrival_month',  

 'type_of_meal_plan',  
              
 'repeated_guest',  
 'no_of_previous_cancellations',  
 'no_of_previous_bookings_not_canceled',  
 'avg_price_per_room',  
 'no_of_special_requests', 
              
  'type_meal_plan__1',  
  'type_meal_plan__2',  
  'type_meal_plan__3',  
            
 'type_market_segment__1',  
 'type_market_segment__2',  
 'type_market_segment__3',  
   
 'type_room_reserved__1',  
 'type_room_reserved__2',  
 'type_room_reserved__3',  
 'type_room_reserved__4',  
 'type_room_reserved__5',  
              
 'Range_of_LeadTime',
 'Range_of_ArrivalMonth',  
 'Range_of_ArrivalDate', 
 'Range_of_RoomPrice'],
 

 # significant variables only (set 2)
 'x_var1'    : []
    

    
}

In [23]:
# declaring explanatory variables
booking_df_data   = booking_df.drop(['booking_status'], axis=1)
booking_df_data   =  booking_df.loc[ : , candidate_dict['x_var']]

# setting explanatory variable(s) with most correlated x-variable
x_train = booking_df[candidate_dict['x_var']] [booking_df['set'] == 'Training' ]

# setting response variable

y_train = booking_df[ 'booking_status' ][ booking_df['set']   == 'Training' ]

# developing training and validation sets
x_train_1, x_train_2, y_train_1, y_train_2 = train_test_split(
            x_train,
            y_train.astype(dtype = 'int'),
            random_state = 123,
            test_size    = 0.25,
            stratify     = y_train)


## 1. Decision Tree Model

In [24]:
# INSTANTIATING a classification tree object
tree_model = DecisionTreeClassifier()


# FITTING to the training data
tree_model_fit = tree_model.fit(x_train_1, y_train_1)


# PREDICTING on the response variable
tree_model_train_pred = tree_model_fit.predict(x_train_1)
tree_model_valid_pred = tree_model_fit.predict(x_train_2)


# SCORING the results (accuracy)
tree_model_train_score = tree_model.score(x_train_1, y_train_1).round(4) # training accuracy
tree_model_valid_score = tree_model.score(x_train_2, y_train_2).round(4) # validation accuracy

# SCORING the results (auc)
tree_model_train_auc = roc_auc_score(y_true  = y_train_1,
                                y_score = tree_model_train_pred).round(decimals = 4)

tree_model_valid_auc = roc_auc_score(y_true  = y_train_2,
                                y_score = tree_model_valid_pred).round(decimals = 4)

# displaying results
print('Training Accuracy:  ', tree_model_train_score)
print('Testing Accuracy:', tree_model_valid_score)
print('Training AUC:       ', tree_model_train_auc)
print('Testing AUC:     ', tree_model_valid_auc)

# print the gap
tree_model_gap = abs(tree_model_train_score - tree_model_valid_score).round(4)
print('Gap   :', tree_model_gap)

Training Accuracy:   1.0
Testing Accuracy: 0.7435
Training AUC:        1.0
Testing AUC:      0.7324
Gap   : 0.2565


## 2. Gradient Boosting Classifier Model

In [25]:
#INSTANTIATING Tuned Model
gbc_model = GradientBoostingClassifier()

# FITTING to the training data
gbc_model_fit = gbc_model.fit(x_train_1, y_train_1)


# PREDICTING on the response variable
gbc_model_train_pred = gbc_model_fit.predict(x_train_1)
gbc_model_valid_pred = gbc_model_fit.predict(x_train_2)


# SCORING the results (accuracy)
gbc_model_train_score = gbc_model.score(x_train_1, y_train_1).round(4) # training accuracy
gbc_model_valid_score = gbc_model.score(x_train_2, y_train_2).round(4) # validation accuracy

# SCORING the results (auc)
gbc_model_train_auc = roc_auc_score(y_true  = y_train_1,
                                y_score = gbc_model_train_pred).round(decimals = 4)

gbc_model_valid_auc = roc_auc_score(y_true  = y_train_2,
                                y_score = gbc_model_valid_pred).round(decimals = 4)

# displaying results
print('Training Accuracy:  ', gbc_model_train_score)
print('Validation Accuracy:', gbc_model_valid_score)
print('Training AUC:       ', gbc_model_train_auc)
print('Validation AUC:     ', gbc_model_valid_auc)

# print the gap
gbc_model_gap = abs(gbc_model_train_score - gbc_model_valid_score).round(4)
print('Gap :               ', gbc_model_gap)

Training Accuracy:   0.8139
Validation Accuracy: 0.8104
Training AUC:        0.7969
Validation AUC:      0.7932
Gap :                0.0035


## 3. XGradient Boosting Classifier Model

In [26]:
import xgboost as xgb

# INSTANTIATING Tuned Model
model = xgb.XGBClassifier()

# FITTING to the training data
model_fit = model.fit(x_train_1, y_train_1)

# PREDICTING on the response variable
model_train_pred = model_fit.predict(x_train_1)
model_valid_pred = model_fit.predict(x_train_2)

# SCORING the results (accuracy)
model_train_score = model.score(x_train_1, y_train_1).round(4) # training accuracy
model_valid_score = model.score(x_train_2, y_train_2).round(4) # validation accuracy

# SCORING the results (auc)
model_train_auc = roc_auc_score(y_true=y_train_1, y_score=model_train_pred).round(decimals=4)
model_valid_auc = roc_auc_score(y_true=y_train_2, y_score=model_valid_pred).round(decimals=4)

# displaying results
print('Training Accuracy:  ', model_train_score)
print('Validation Accuracy:', model_valid_score)
print('Training AUC:       ', model_train_auc)
print('Validation AUC:     ', model_valid_auc)

# print the gap
model_gap = abs(model_train_score - model_valid_score).round(4)
print('Gap :               ', model_gap)


Training Accuracy:   0.8685
Validation Accuracy: 0.82
Training AUC:        0.8594
Validation AUC:      0.8074
Gap :                0.0485


## 3.1 XGB -- mutiple times

In [54]:
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

# Define the get_mean_auc function
def get_mean_auc(xgb_oof):
    auc_scores = [roc_auc_score(y_valid, val_preds) for _, val_preds, y_valid in xgb_oof]
    return np.mean(auc_scores)

# Instantiate the XGBoost models list
xgb_models = []

# Instantiate the out-of-fold predictions list
xgb_oof = []

# Instantiate an empty array for test predictions
predictions = np.zeros(len(booking_test))

# Set the number of folds and random seed for reproducibility
FOLDS = 5
SEED = 42

# Perform stratified k-fold cross-validation
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=SEED)

for fold, (train_idx, val_idx) in enumerate(skf.split(x_train_1, y_train_1)):
    if (fold + 1) % 5 == 0 or (fold + 1) == 1:
        print(f'{"#"*24} Training FOLD {fold+1} {"#"*24}')

    X_train, y_train = x_train_1.iloc[train_idx], y_train_1.iloc[train_idx]
    X_valid, y_valid = x_train_1.iloc[val_idx], y_train_1.iloc[val_idx]
    watchlist = [(X_train, y_train), (X_valid, y_valid)]

    # XGBoost model and fit
    model = xgb.XGBClassifier(n_estimators=2000, n_jobs=-1, max_depth=4, eta=0.1, colsample_bytree=0.62)
    model.fit(X_train, y_train, eval_set=watchlist, early_stopping_rounds=300, verbose=0)

    val_preds = model.predict_proba(X_valid)[:, 1]
    val_score = roc_auc_score(y_valid, val_preds)
    best_iter = model.best_iteration

    idx_pred_target = (val_idx, val_preds, y_valid)
    print(f'{" "*20} auc: {val_score:.5f} best iteration: {best_iter}')

    xgb_oof.append(idx_pred_target)
    xgb_models.append(model)

    if val_score > 0.917:
        test_preds = model.predict_proba(booking_test.drop(columns=['no_of_previous_cancellations', 'no_of_previous_bookings_not_canceled']))[:, 1]
        predictions += test_preds

predictions /= len(xgb_models)
mean_val_auc = get_mean_auc(xgb_oof)

print('*'*45)
print(f'Mean AUC: {mean_val_auc:.5f}')


######################## Training FOLD 1 ########################




                     auc: 0.89270 best iteration: 720




                     auc: 0.89489 best iteration: 674




                     auc: 0.90444 best iteration: 712




                     auc: 0.89894 best iteration: 635
######################## Training FOLD 5 ########################




                     auc: 0.89103 best iteration: 552
*********************************************
Mean AUC: 0.89640


# 5. Submit Process 

In [55]:
# setting explanatory variable(s) with most correlated x-variable
x_test = booking_df[candidate_dict['x_var']][booking_df['set'] == 'Testing']

# Create an empty array to store the test predictions
predictions = np.zeros(len(x_test))

# Iterate over the trained XGBoost models
for model in xgb_models:
    test_preds = model.predict_proba(x_test)[:, 1]
    predictions += test_preds

predictions /= len(xgb_models)

# Saving predictions with their respective Ids from the test set
submission = pd.DataFrame(data={
    'id': booking_df['id'][booking_df['set'] == 'Testing'],
    'booking_status': predictions
})

# Save predictions to a CSV file
submission.to_csv('submission.csv', index=False)
