## Playground season 3, episode 07
### Start: February 14, 2023
### End: February 27, 2023

## EDA adoptation
---
Some of the EDA methods and convert took from: https://www.kaggle.com/code/jcaliz/ps-s03e07-a-complete-eda

In [245]:
import pandas as pd
import numpy as np
import os

from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff
import seaborn as sns

import math
import matplotlib
import matplotlib.pyplot as plt

from matplotlib.ticker import MaxNLocator
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV, StratifiedShuffleSplit


In [246]:
if os.name == 'nt':
    data_path = r"D:\Coding_pratice\_Data\kaggle_playground_series\playground-series-s3e7"
else:
    data_path = '/Users/admin/_Work/Data/Practice/playground-series-s3e7'
    
train_csv = os.path.join(data_path, 'train.csv')
test_csv = os.path.join(data_path, 'test.csv')
hotel_csv = os.path.join(data_path, 'hotel_bookings.csv')
train_reservation_csv = os.path.join(data_path, 'Reservation Cancellation Prediction/train__dataset.csv')
test_reservation_csv = os.path.join(data_path, 'Reservation Cancellation Prediction/test___dataset.csv')
hotel_res_csv = os.path.join(data_path, 'Hotel Reservations.csv')

train_df = pd.read_csv(train_csv, index_col=0)
test_df = pd.read_csv(test_csv, index_col=0)
origin_train_df = pd.read_csv(train_reservation_csv)
origin_test_df = pd.read_csv(test_reservation_csv)
hotel_demand_df = pd.read_csv(hotel_csv)
hotel_res_df = pd.read_csv(hotel_res_csv, index_col=0)

"hotel_booking.csv" - Source: https://www.kaggle.com/datasets/jessemostipak/hotel-booking-demand

"Reservation Cancellation Prediction/xxx_dataset.csv" - Source: https://www.kaggle.com/datasets/gauravduttakiit/reservation-cancellation-prediction

"Hotel Reservations.csv" - Source: https://www.kaggle.com/datasets/ahsan81/hotel-reservations-classification-dataset

In [247]:
hotel_res_df.index.name = 'id'

## Feature notes
---
* *id*: unique identifier of each booking

* *no_of_adults*: Number of adults
* *no_of_children*: Number of children
* *no_of_weekend_nights*: Number of weekend nights (Saturday or Sunday) the guest stayed or booked to stay at the hotel
* *no_of_week_nights*: Number of week nights (Monday to Friday) the guest stayed or booked to stay at the hotel
* *type_of_meal_plan*: Type of meal plan booked by the customer
* *required_car_parking_space*: Does the customer require a car parking space? (0 - No, 1 - Yes)
* *room_type_reserved*: Type of room reserved by the customer. The values are ciphered (encoded) by INN Hotels.
* *lead_time*: Number of days between the date of booking and the arrival date
* *arrival_year*: Year of arrival date
* *arrival_month*: Month of arrival date
* *arrival_date*: Date of the month
* *market_segment_type*: Market segment designation
* *repeated_guest*: Is the customer a repeated guest? (0 - No, 1 - Yes)
* *no_of_previous_cancellations*: Number of previous bookings that were canceled by the customer prior to the current booking
* *no_of_previous_bookings_not_canceled*: Number of previous bookings not canceled by the customer prior to the current booking
* *avg_price_per_room*: Average price per day of the reservation; prices of the rooms are dynamic. (in euros)
* *no_of_special_requests*: Total number of special requests made by the customer (e.g. high floor, view from the room, etc)
* *booking_status*: Flag indicating if the booking was canceled or not.

Rename 'hotel_demand_df' columns to match train_df

In [248]:
hotel_demand_df.rename(
    columns= {
    'adults': 'no_of_adults',
    'children': 'no_of_children',
    'stays_in_weekend_nights': 'no_of_weekend_nights',
    'stays_in_week_nights': 'no_of_week_nights',
    'meal': 'type_of_meal_plan',
    'required_car_parking_spaces': 'required_car_parking_space',
    'reserved_room_type': 'room_type_reserved',
    'lead_time': 'lead_time',
    'arrival_date_year': 'arrival_year',
    'arrival_date_month': 'arrival_month',
    'arrival_date_day_of_month': 'arrival_date',
    'market_segment': 'market_segment_type',
    'is_repeated_guest': 'repeated_guest',
    'previous_cancellations': 'no_of_previous_cancellations',
    'previous_bookings_not_canceled': 'no_of_previous_bookings_not_canceled',
    'adr': 'avg_price_per_room',
    'total_of_special_requests': 'no_of_special_requests',
    'is_canceled': 'booking_status'
}, inplace=True)

In [249]:
hotel_demand_df['arrival_month'] = hotel_demand_df['arrival_month'].map({
    'January': 1, 'February': 2, 'March': 3, 'April': 4,
    'May': 5, 'June': 6, 'July': 7, 'August': 8,
    'September': 9, 'October': 10, 'November': 11, 'December': 12
})

hotel_demand_df['type_of_meal_plan'] = hotel_demand_df['type_of_meal_plan'].map({
        'BB': 0, 'HB': 2, 'SC': 1, 'Undefined': 1, 'FB': 3
})

hotel_demand_df['market_segment_type'] = hotel_demand_df['market_segment_type'].map({
        "Online TA": 1, "Offline TA/TO": 0, "Corporate": 2, "Complementary": 4, "Aviation": 3
})

hotel_demand_df['room_type_reserved'] = hotel_demand_df['room_type_reserved'].map({
        'A':  0, 'D':  1, 'E':  3, 'F':  2, 'G':  4, 'B':  5, 'C':  6
})

hotel_demand_df['market_segment_type'].fillna(5, inplace=True)
hotel_demand_df['room_type_reserved'].fillna(7, inplace=True)

hotel_res_df['type_of_meal_plan'] = hotel_res_df['type_of_meal_plan'].map(
    {
        'Meal Plan 1': 0, 'Not Selected': 2, 'Meal Plan 2': 1, 'Meal Plan 3': 3
    }
)
hotel_res_df['room_type_reserved'] = hotel_res_df['room_type_reserved'].map(
    {
        'Room_Type 1': 0, 'Room_Type 4': 1, 'Room_Type 6': 3, 
        'Room_Type 2': 2, 'Room_Type 5': 4, 'Room_Type 7': 5, 'Room_Type 3': 6
    }
)
hotel_res_df['market_segment_type'] = hotel_res_df['market_segment_type'].map(
    {
        "Online": 1, "Offline": 0, "Corporate": 2, "Complementary": 4, "Aviation": 3
    }
)
hotel_res_df['booking_status'] = hotel_res_df['booking_status'].map(
    {
        "Not_Canceled": 0, "Canceled": 1,
    }
)

In [250]:
print("Train shape:         ", train_df.shape)
print("Test shape:          ", test_df.shape)
print("Origin Train shape:  ", origin_train_df.shape)
print("Origin Test shape:   ", origin_test_df.shape)
print("Hotel demand shape:  ", hotel_demand_df.shape)
print("Hotel reserve shape: ", hotel_res_df.shape)

Train shape:          (42100, 18)
Test shape:           (28068, 17)
Origin Train shape:   (18137, 18)
Origin Test shape:    (18138, 17)
Hotel demand shape:   (119390, 32)
Hotel reserve shape:  (36275, 18)


### Distribution

Check numerical + ordinal features using plotly

In [251]:
from functools import reduce
from plotly.colors import n_colors, sample_colorscale


# Merge all dataframe
total_df = pd.concat([
    train_df.assign(set='train'),
    test_df.assign(set='test'),
    origin_train_df.assign(set='origin_train'),
    origin_test_df.assign(set='origin_test'),
    hotel_demand_df.assign(set='demand')
], ignore_index=True)

total_df.reset_index(inplace=True, drop=True)

def add_ordinal(fig, data_frame, feature, position=(1,1), scatter_mode='lines+markers'):
    target = data_frame.groupby(['set', feature], as_index=False)['booking_status'].mean()
    target.sort_values(feature, inplace=True)
    """
    ---
    """
    percentage = data_frame.groupby('set')[feature].value_counts(True)
    percentage = percentage.rename('%').reset_index()
    dataframe = percentage.set.unique()
    data_frames = [percentage[percentage.set==i] for i in dataframe]

    target_frames = [target[target.set == t] for t in target.set.unique()]
    target_frames = [frame for frame in target_frames if not frame.isna().values.any()]
    # Rename '%' as '%_<set_name>'
    [df.rename(columns={'booking_status': 'target_' + df.set.iloc[0]}, inplace=True) for df in target_frames]
    [df.rename(columns={'%': '%_' + df.set.iloc[0], 'set': 'set_ '+ df.set.iloc[0]}, inplace=True) for df in data_frames]
    df_merge = reduce(lambda left, right: pd.merge(
        left, right, on=feature, how='outer', sort = True
        ), data_frames)
    target_merge = reduce(lambda left, right: pd.merge(
        left, right, on=feature, how='outer', sort = True
        ), target_frames)

    # print(target_merge)

    keeping_cols = [feature] + [i for i in df_merge.columns if '%' in i]
    df_merge = df_merge[keeping_cols]
    keeping_target = [i for i in target_merge.columns if 'target' in i]
    target_merge = target_merge[keeping_target]

    df_merge = df_merge.astype({feature: int})
    # colors = n_colors('rgb(255, 200, 200)', 'rgb(200, 0, 0)', df_merge[feature].max() + 1, colortype='rgb')
    scale_values = np.unique(np.array([df_merge[column].values for column in df_merge.columns[1:]]).flatten())
    scale_targets = np.unique(np.array([target_merge[column].values for column in target_merge.columns]).flatten())
    # colors_scale = sample_colorscale('Bluered', scale_values)
    # colors_target = sample_colorscale('Bluered', scale_targets)
    # print(df_merge)
    for i, group in enumerate(df_merge.columns[1:]):
        fig.add_trace(
            go.Bar(
                x = df_merge[feature],
                y = df_merge[group],
                name= group[2:] if '%' in group else group,
                legendgroup=i,
                showlegend = False if (position[0] > 1) or (position[1] > 1) else True,
                # layout_yaxis_range=[min(df_merge[group]) - 1, max(df_merge[group])]
            ), row=position[0], col=position[1]
        )
        
    for i, group in enumerate(target_merge.columns):
        fig.add_trace(
            go.Scatter(
                x = df_merge[feature],
                y = target_merge[group],
                name=group,
                legendgroup=i,
                mode=scatter_mode,
                showlegend = False if (position[0] > 1) or (position[1] > 1) else True,
                # layout_yaxis_range=[min(target_merge[group]) - 1, max(target_merge[group])]
            ), row=position[0], col=position[1]
        )
    """fig.add_trace(
        go.Table(
        header = dict(
            values = [f"<b>{column}</b>" for column in df_merge.columns] + [
                f"<b>{column}</b>" for column in target_merge.columns],
            line_color='white', fill_color='white',
            align='center',font=dict(color='black', size=12)
        ),
        cells = dict(
            values= [np.array(df_merge[feature]).round().astype(str)] + [
                df_merge[column].apply(
                    lambda x: np.format_float_scientific(x, precision=2,exp_digits=1)
                    ).values for column in df_merge.columns[1:]] + [
                        target_merge[column].apply(
                            lambda x: np.round(x, decimals=3)
                            ).values for column in target_merge.columns
                    ],
            line_color=[
                np.array(colors)[df_merge[feature].values]] + [
                    np.array(colors_scale)[
                        [scale_values.tolist().index(value) for value in df_merge[column].values]
                        ] for column in df_merge.columns[1:]] + [
                            np.array(colors_target)[
                                [scale_targets.tolist().index(value) for value in target_merge[column].values]
                                ] for column in target_merge.columns
                        ],
            fill_color=[
                np.array(colors)[df_merge[feature].values]] + [
                    np.array(colors_scale)[
                        [scale_values.tolist().index(value) for value in df_merge[column].values]
                        ] for column in df_merge.columns[1:]] + [
                            np.array(colors_target)[
                                [scale_targets.tolist().index(value) for value in target_merge[column].values]
                                ] for column in target_merge.columns
                        ],
            align='center', font=dict(color='white', size=11)
        )
    ), row=position[0], col=position[1])"""
    
def add_numeric(fig, feature, position=(1,1)):
    """
        Histogram idea steal from here: https://stackoverflow.com/questions/58770063/how-to-make-mixed-statistical-subplots-using-plotly-in-python
    """
    hist_frames = total_df[['set',feature]]
    # for group in hist_frames.set.unique():
    #     print(group)
    #     fig.add_trace(
    #         go.Histogram(
    #             x = hist_frames[hist_frames.set == group][feature],
    #             name = group,
    #             nbinsx=70,
    #             histnorm='probability density',
    #             showlegend = False if (position[0] > 1) or (position[1] > 1) else True,
    #         ), row=position[0], col=position[1]
    #     )
    #     fig.update_traces(overwrite=True, marker={"opacity": 0.7}) 
        # fig.update_yaxes(range = [min() - 1,max ()])
    group_labels = hist_frames.set.unique()
    hist_data = [hist_frames[hist_frames.set == group][feature] for group in group_labels]
    distplfig = ff.create_distplot(hist_data, group_labels, bin_size=4, curve_type='kde', 
                            show_rug=False)
    for k in range(len(distplfig.data)):
        fig.add_trace(distplfig.data[k],
        row=position[0], col=position[1]
    )

    target = total_df.groupby(['set', feature], as_index=False)['booking_status'].mean()
    target.sort_values(feature, inplace=True)
    target_frames = [target[target.set == t] for t in target.set.unique()]
    target_frames = [frame for frame in target_frames if not frame.isna().values.any()]
    # for i, group in enumerate(target_merge.columns):
    #     fig.add_trace(
    #         go.Scatter(
    #             x = df_merge[feature],
    #             y = target_merge[group],
    #             name=group,
    #             legendgroup=i,
    #             mode='markers',
    #             showlegend = False if (position[0] > 1) or (position[1] > 1) else True,
    #             # layout_yaxis_range=[min(target_merge[group]) - 1, max(target_merge[group])]
    #         ), row=position[0], col=position[1]
    #     )

labels = ['train', 'test', 'origin_train', 'origin_test', 'demand']

ordinal_features = [
    'no_of_adults', 'no_of_children', 'no_of_weekend_nights', 'no_of_week_nights',
    'no_of_special_requests', 'no_of_previous_cancellations', 'no_of_previous_bookings_not_canceled'
]

numeric_features = [
    'lead_time', 'avg_price_per_room'
]
columns =  ordinal_features + numeric_features
# columns = ordinal_features
fig = make_subplots(
    rows= len(columns)//2 + 1,
    cols= len(columns) % 2 + 1,
    subplot_titles=[col_name + "_distribution" for col_name in columns],
    # specs=[
    #     [{"type": "table"}] * (len(columns)%2 + 1)
    # ] * (len(columns) // 2 + 1),
    horizontal_spacing=0.1,
    vertical_spacing=0.05
)
for i, column in enumerate(columns):
    if column in ordinal_features:
        add_ordinal(fig, total_df, column, (i//2 + 1, i%2 + 1))
    else:
        add_numeric(fig, column, (i//2 + 1, i%2 + 1))

fig.update_layout(height=1400, width=1000)
fig.show()

In [252]:
# pd.set_option('display.max_rows', None, 'display.max_columns', None)
print(total_df.groupby(['no_of_special_requests', 'no_of_week_nights'])['booking_status'].mean())

no_of_special_requests  no_of_week_nights
0                       0                    0.357382
                        1                    0.413986
                        2                    0.535799
                        3                    0.483224
                        4                    0.498584
                                               ...   
5                       4                    0.000000
                        5                    0.166667
                        6                    0.000000
                        7                    0.000000
                        10                   0.000000
Name: booking_status, Length: 120, dtype: float64


In [253]:
categorical_features = ['market_segment_type', 'repeated_guest', 'required_car_parking_space',
    'room_type_reserved', 'type_of_meal_plan']
columns = categorical_features
# columns = ordinal_features
fig = make_subplots(
    rows= len(columns)//2 + 1,
    cols= len(columns) % 2 + 1,
    subplot_titles=[col_name + "_distribution" for col_name in columns],
    # specs=[
    #     [{"type": "table"}] * (len(columns)%2 + 1)
    # ] * (len(columns) // 2 + 1),
    horizontal_spacing=0.1,
    vertical_spacing=0.05
)
for i, column in enumerate(columns):
    if column in ordinal_features or column in categorical_features:
        add_ordinal(fig, total_df, column, (i//2 + 1, i%2 + 1), 'markers')
    else:
        add_numeric(fig, column, (i//2 + 1, i%2 + 1))

fig.update_layout(height=1400, width=1000)
fig.show()

## Feature analyze
---

**Scattered**:

* no_of_adults
* no_of_weekend_nights
* no_of_week_nights
* no_of_previous_booking_not_cancelled
* avg_price_per_room
* room_type_preserved
* type_of_meal_plan
* lead_time

**Trend/Pattern**
* no_of_special_requests -> value increase - cancellation decrease 
* no_of_previous_cancellation -> cancellation decrease between [1, 11]
* market_segment_type -> value 0, 2, 3 and 4 have lower cancellation
* repeated_guest -> value 1 have lower cancellation
* required_car_parking_space -> value > 0 have lower cancellation

### Process arrive date

In [254]:
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar

def if_near_holiday(date, holidays):
    before_holiday = any(
        ((holidays - date).days <= 7) & 
        ((holidays - date).days > 0)
    )
    return before_holiday

def process_arrival_date(df):
    temp = df.rename(columns={
        'arrival_date':'day',
        'arrival_month': 'month',
        'arrival_year': 'year'
    })
    df['date'] = pd.to_datetime(temp[['year', 'month', 'day']], errors='coerce')
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['day'] = df['date'].dt.day
    df['week'] = df['date'].dt.isocalendar().week.astype(float)
    df['dayofweek'] = df['date'].dt.dayofweek
    # df['quarter'] = df['date'].dt.quarter
    df['quarter'] = df['month'].apply(lambda x: (x-1)//3 + 1)
    df['dayofyear'] = df['date'].dt.dayofyear

    cal = calendar()
    holidays = cal.holidays(start=df['date'].min(), end=df['date'].max())
    df['is_holiday'] = df['date'].isin(holidays)
    df['near_holiday'] = df['date'].apply(lambda x: if_near_holiday(x, holidays))
    # df['near_holiday'] = dt['date']

    return df
total_df = process_arrival_date(total_df)
display(total_df.head())

Unnamed: 0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,...,date,year,month,day,week,dayofweek,quarter,dayofyear,is_holiday,near_holiday
0,2,0.0,0,2,1,0,0.0,9,2018,1,...,2018-01-14,2018.0,1.0,14.0,2.0,6.0,1.0,14.0,False,True
1,2,0.0,1,2,0,0,0.0,117,2018,7,...,2018-07-29,2018.0,7.0,29.0,30.0,6.0,3.0,210.0,False,False
2,2,0.0,0,1,0,0,0.0,315,2018,12,...,2018-12-02,2018.0,12.0,2.0,48.0,6.0,4.0,336.0,False,False
3,1,0.0,0,2,1,0,0.0,32,2018,12,...,2018-12-01,2018.0,12.0,1.0,48.0,5.0,4.0,335.0,False,False
4,2,0.0,1,0,0,0,0.0,258,2018,10,...,2018-10-16,2018.0,10.0,16.0,42.0,1.0,4.0,289.0,False,False


In [255]:
to_plot = total_df.groupby(['date', 'set']).size().rename('booking_count').reset_index()
to_plot_2 = total_df.groupby(['date', 'set'])['booking_status'].mean().reset_index()
fig = make_subplots(
    rows=2,
    cols=1,
    subplot_titles=['Count of Arrival', 'Mean booking cancellation Based on Arrival Date']
)

colors = px.colors.qualitative.G10
for i, group in enumerate(to_plot.set.unique()):
    fig.add_trace(
        go.Scatter(
            x = to_plot[to_plot['set']==group]['date'],
            y = to_plot[to_plot['set']==group]['booking_count'],
            line_color=colors[i],
            name=group,
            legendgroup=group,
            mode='lines',
        ), row=1, col=1
    )

for i, group in enumerate(to_plot_2.set.unique()):
    fig.add_trace(
        go.Scatter(
            x = to_plot_2[to_plot_2['set']==group]['date'],
            y = to_plot_2[to_plot_2['set']==group]['booking_status'],
            line_color=colors[i],
            name=group,
            legendgroup=group,
            mode='lines',
            showlegend=False
        ), row=2, col=1
    )

fig.update_layout(height=700, width=1000)
fig.show()

In [256]:
cal = calendar()
holidays = cal.holidays(start=total_df['date'].min(), end=total_df['date'].max())
display(holidays)

DatetimeIndex(['2015-07-03', '2015-09-07', '2015-10-12', '2015-11-11',
               '2015-11-26', '2015-12-25', '2016-01-01', '2016-01-18',
               '2016-02-15', '2016-05-30', '2016-07-04', '2016-09-05',
               '2016-10-10', '2016-11-11', '2016-11-24', '2016-12-26',
               '2017-01-02', '2017-01-16', '2017-02-20', '2017-05-29',
               '2017-07-04', '2017-09-04', '2017-10-09', '2017-11-10',
               '2017-11-23', '2017-12-25', '2018-01-01', '2018-01-15',
               '2018-02-19', '2018-05-28', '2018-07-04', '2018-09-03',
               '2018-10-08', '2018-11-12', '2018-11-22', '2018-12-25'],
              dtype='datetime64[ns]', freq=None)

In [257]:
wrong_dates = total_df[['arrival_year', 'arrival_month', 'arrival_date', 'set']].loc[total_df.date.isna()]
display(
    wrong_dates.groupby('set').apply(
        lambda df: df[['arrival_year', 'arrival_month', 'arrival_date']] .apply(tuple, axis=1)
    ).unique()
)

array([(2018, 2, 29), (2018, 2, 30), (2018, 2, 31), (2017, 2, 29),
       (2018, 9, 31), (2018, 4, 31), (2017, 9, 31), (2017, 11, 31),
       (2018, 6, 31), (2018, 11, 31)], dtype=object)

In [258]:
features_date  = [ 'year', 'month', 'week', 'day', 'dayofweek', 'quarter', 'dayofyear']
columns = features_date
# columns = ordinal_features
fig = make_subplots(
    rows= len(columns)//2 + 1,
    cols= len(columns) % 2 + 1,
    subplot_titles=[col_name + "_distribution" for col_name in columns],
    # specs=[
    #     [{"type": "table"}] * (len(columns)%2 + 1)
    # ] * (len(columns) // 2 + 1),
    horizontal_spacing=0.1,
    vertical_spacing=0.05
)
for i, column in enumerate(columns):
    add_ordinal(fig, total_df, column, (i//2 + 1, i%2 + 1), 'markers')
    
fig.update_layout(height=1400, width=1000)
fig.show()

In [259]:
total_df.columns

Index(['no_of_adults', 'no_of_children', 'no_of_weekend_nights',
       'no_of_week_nights', 'type_of_meal_plan', 'required_car_parking_space',
       'room_type_reserved', 'lead_time', 'arrival_year', 'arrival_month',
       'arrival_date', 'market_segment_type', 'repeated_guest',
       'no_of_previous_cancellations', 'no_of_previous_bookings_not_canceled',
       'avg_price_per_room', 'no_of_special_requests', 'booking_status', 'set',
       'hotel', 'arrival_date_week_number', 'babies', 'country',
       'distribution_channel', 'assigned_room_type', 'booking_changes',
       'deposit_type', 'agent', 'company', 'days_in_waiting_list',
       'customer_type', 'reservation_status', 'reservation_status_date',
       'date', 'year', 'month', 'day', 'week', 'dayofweek', 'quarter',
       'dayofyear', 'is_holiday', 'near_holiday'],
      dtype='object')

In [260]:
def process_booking_date(df):
    # temp = df.rename(columns={
    #     'arrival_year': 'year',
    #     'arrival_month': 'month',
    #     'arrival_date': 'day'
    # })

    df['booking_date'] = pd.to_datetime(df[['year', 'month', 'day']], errors='coerce')
    df['booking_data'] = df['booking_date'] - pd.Series(
        [pd.Timedelta(i, 'd') for i in df.lead_time],
        index=df.index
    )
    
    df['booking_year'] = df['booking_date'].dt.year
    df['booking_month'] = df['booking_date'].dt.month
    df['booking_week'] = df['booking_date'].dt.isocalendar().week.astype(float)
    df['booking_day'] = df['booking_date'].dt.day
    df['booking_dayofweek'] = df['booking_date'].dt.dayofweek
    # df['booking_quarter'] = df['booking_date'].dt.quarter
    df['booking_quarter'] = df['month'].apply(lambda x: (x-1)//3 + 1)
    df['booking_dayofyear'] = df['booking_date'].dt.dayofyear
    
    # df.drop(columns='booking_date', inplace=True)
    return df

total_df = process_booking_date(total_df)

In [261]:
discard_feature = ['arrival_year', 'arrival_month', 'arrival_date']
features = train_df.columns.tolist() + ['booking_quarter']
features = [e for e in features if e not in discard_feature]
features

['no_of_adults',
 'no_of_children',
 'no_of_weekend_nights',
 'no_of_week_nights',
 'type_of_meal_plan',
 'required_car_parking_space',
 'room_type_reserved',
 'lead_time',
 'market_segment_type',
 'repeated_guest',
 'no_of_previous_cancellations',
 'no_of_previous_bookings_not_canceled',
 'avg_price_per_room',
 'no_of_special_requests',
 'booking_status',
 'booking_quarter']

In [262]:
train = total_df[total_df.set.isin(['train', 'origin_train', 'demand'])][features]
train.dropna(axis=0, inplace=True)
train.reset_index(inplace=True, drop=True)

In [266]:
X_feature = train.columns.difference(['booking_status'])
X = train[X_feature]
X_test = total_df[total_df.set =='test'][X_feature][:len(test_df)]
X_test.fillna(0, inplace=True)
# X_test.dropna(axis=0, inplace=True)
y = train.booking_status

In [267]:
X_test[X_test.isna().any(axis=1)]

Unnamed: 0,avg_price_per_room,booking_quarter,lead_time,market_segment_type,no_of_adults,no_of_children,no_of_previous_bookings_not_canceled,no_of_previous_cancellations,no_of_special_requests,no_of_week_nights,no_of_weekend_nights,repeated_guest,required_car_parking_space,room_type_reserved,type_of_meal_plan


In [122]:
from sklearn.ensemble import RandomForestClassifier

cv = StratifiedShuffleSplit(n_splits=5, test_size = .25, random_state=123)

max_depth = list(range(3,18))
n_estimator = list(range(160, 200, 100))
criterion = ['entropy', 'gini']
param = {
    'n_estimators': n_estimator,
    'max_depth': max_depth,
    'criterion': criterion
}
grid_ = GridSearchCV(
    estimator=RandomForestClassifier(max_features='sqrt', random_state=123),
    param_grid = param,
    cv=cv,
    scoring='roc_auc'
)
grid_.fit(X, y)
display(grid_.best_score_)
display(grid_.best_params_)
display(grid_.best_estimator_)

rforest_grid = grid_.best_estimator_

KeyboardInterrupt: 

In [123]:
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.metrics import roc_auc_score
import itertools

def tabnet_CV_train(params, X_train, y_train, X_val, y_val):
    
    # estimator_score = []
    # for i, (train_index, val_index) in enumerate(cv.get_n_splits(X, y)):
    #     X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    #     X_val, y_val = X.iloc[val_index], y.loc[val_index]

    tabnet = TabNetClassifier()
    params['n_a'] = params['n_d']
    tabnet.set_params(**params, verbose=0)
    tabnet.fit(
        X_train=X_train, y_train=y_train,
        eval_set=[( X_val, y_val)], eval_name=['valid'],
        max_epochs=500, eval_metric=['auc'], patience=20
    )
    return tabnet, params

def holdout_grid_search(model_train, X_train, y_train, X_val, y_val, hyperparam, verbose):
    best_estimator = None
    best_hyperparam = {}
    best_score = 0

    hyper_param_l = list(hyperparam.values())
    combination_l_of_t = list(itertools.product(*hyper_param_l))
    combination_l_of_d = []

    for val_tuple in combination_l_of_t:
        param_d = {}

        for i, k in enumerate(hyperparam):
            param_d[k] = val_tuple[i]
        combination_l_of_d.append(param_d)

    for param_d in combination_l_of_d:
        estimator, param_d = model_train(param_d, X_train.values, y_train.values, X_val.values, y_val.values)

        y_prob = estimator.predict(X_val.values)
        estimator_score = roc_auc_score(y_val, y_prob)
        
        if estimator_score > best_score:
            best_score = estimator_score
            best_estimator = estimator
            best_hyperparam = param_d

    if verbose:
        print("hyperparam:")
        display(hyperparam)
        
        print("hyper_param_l")
        display(hyper_param_l)
        
        print("combination_l_of_t")
        display(combination_l_of_t)
        
        print(f"combination_l_of_d")
        display(combination_l_of_d)
        
    print(f"best_hyperparam")
    display(best_hyperparam)
    print(f"best_score: {best_score:.4f}")

    return best_estimator, best_hyperparam


In [146]:
X_train, X_val, y_train, y_val = train_test_split(X, y , test_size=0.25, random_state=123)

In [147]:

param_grid = dict(n_d = [8, 16],
                  n_a = [8],
                  n_steps = [3, 4, 5],
                  optimizer_params = [dict(lr=0.01), dict(lr=0.02)],
                  gamma = [1, 1.5, 2],
                  lambda_sparse = [1e-2, 1e-3, 1e-4],
                  momentum = [0.3, 0.4, 0.5],
                  n_shared = [2],
                  n_independent = [2],
                  clip_value = [2.],   
)
params = dict(n_d = 8,
            n_a = 8,
            n_steps = 5,
            optimizer_params = dict(lr=0.02),
            gamma = 1,
            lambda_sparse =  1e-3,
            momentum = 0.3,
            n_shared = 2,
            n_independent = 2,
            clip_value = 2.,
)
tabnet, _ = tabnet_CV_train(params, X_train.values, y_train.values, X_val.values, y_val.values)

Device used : cpu

Early stopping occurred at epoch 66 with best_epoch = 46 and best_valid_auc = 0.84409
Best weights from best epoch are automatically used!


In [268]:
y_pred = tabnet.predict(X_test.values)
y_pred

array([0., 0., 0., ..., 0., 0., 1.])

In [272]:
from datetime import date

today = date.today()

submission = pd.read_csv(
    os.path.join(data_path,'sample_submission.csv'))
submission.booking_status = y_pred

submission.to_csv(
    os.path.join(data_path, 'submission_{}.csv'.format(today)), index=False
)