In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import timedelta

pd.set_option('display.max_columns', None)

In [2]:
train_data = pd.read_csv('train_data.csv')
train_data = train_data.infer_objects()

val_data = pd.read_csv('val_data.csv')
val_data = val_data.infer_objects()

pd.set_option('display.max_columns', None)

In [3]:
train_data

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,City Hotel,0,17,2015,October,40,1,0,3,2,0.0,0,BB,PRT,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,9.0,,0,Contract,113.33,0,2,Check-Out,2015-10-04
1,City Hotel,1,99,2016,July,30,21,0,1,2,1.0,0,BB,ESP,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,9.0,,0,Transient,130.50,0,0,Canceled,2016-06-09
2,City Hotel,1,156,2017,May,19,12,1,2,1,0.0,0,BB,USA,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,9.0,,0,Transient,130.80,0,0,Canceled,2016-12-07
3,City Hotel,1,129,2017,June,22,2,1,2,2,0.0,0,BB,PRT,Groups,TA/TO,0,0,0,A,A,0,Non Refund,154.0,,0,Transient,130.00,0,0,Canceled,2017-01-24
4,City Hotel,1,21,2015,October,44,31,2,4,2,0.0,0,BB,PRT,Online TA,TA/TO,0,1,0,A,A,0,No Deposit,9.0,,0,Contract,105.79,0,0,Canceled,2015-10-26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55481,City Hotel,0,215,2017,August,34,25,0,1,2,3.0,0,BB,GBR,Direct,Direct,0,0,0,G,G,0,No Deposit,14.0,,0,Transient,245.00,0,1,Check-Out,2017-08-26
55482,City Hotel,1,229,2017,July,28,12,1,4,2,0.0,0,BB,PRT,Groups,TA/TO,0,0,0,A,A,0,Non Refund,,,0,Transient,110.00,0,0,Canceled,2016-11-25
55483,City Hotel,0,0,2016,January,4,19,0,1,1,0.0,0,BB,PRT,Corporate,Corporate,0,0,0,A,A,0,No Deposit,180.0,,0,Transient,66.00,1,0,Check-Out,2016-01-20
55484,City Hotel,0,53,2015,August,35,24,1,1,2,0.0,0,BB,ESP,Groups,TA/TO,0,0,0,A,A,0,No Deposit,1.0,,0,Transient-Party,62.00,0,0,Check-Out,2015-08-26


In [4]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55486 entries, 0 to 55485
Data columns (total 32 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   hotel                           55486 non-null  object 
 1   is_canceled                     55486 non-null  int64  
 2   lead_time                       55486 non-null  int64  
 3   arrival_date_year               55486 non-null  int64  
 4   arrival_date_month              55486 non-null  object 
 5   arrival_date_week_number        55486 non-null  int64  
 6   arrival_date_day_of_month       55486 non-null  int64  
 7   stays_in_weekend_nights         55486 non-null  int64  
 8   stays_in_week_nights            55486 non-null  int64  
 9   adults                          55486 non-null  int64  
 10  children                        55484 non-null  float64
 11  babies                          55486 non-null  int64  
 12  meal                            

In [5]:
from sklearn.base import BaseEstimator, TransformerMixin
import math

class FeatureExtractor(TransformerMixin):
    def  __init__(self):
        print('Feature Extractor initiated...')

    def fit(self,X,y=None):
        print('Fitting data...')
        return self

    def transform(self,X,y=None):
        print('Extracting data...')
        X_ = X.copy()
        X_.rename(columns={'arrival_date_year':'arrival_year', 'arrival_date_month':'arrival_month', 'arrival_date_day_of_month':'arrival_day_of_month', 'arrival_date_week_number':'arrival_week_number'}, inplace=True)
        X_['reservation_status_date'] = pd.to_datetime(X_['reservation_status_date'], format='%Y-%m-%d')
        X_['arrival_month'] = X_.apply(lambda x: self.__get_month_index(x['arrival_month']), axis=1)
        X_ = self.__null_handler(X_)

        X_['children'] = X_['children'].apply(int)
        X_['agent'] = X_['agent'].apply(int)
        X_['company'] = X_['company'].apply(int)
        
        X_['arrival_date'] = X_.apply(lambda x: self.__get_date(x['arrival_year'], x['arrival_month'], x['arrival_day_of_month']), axis=1)
        X_['arrival_date'] = pd.to_datetime(X_['arrival_date'], format='%Y-%m-%d')
        X_['arrival_day_of_week'] = X_['arrival_date'].dt.day_of_week

        X_['booking_date'] = X_.apply(lambda x: x['arrival_date'] - timedelta(days=x['lead_time']), axis=1)
        X_['booking_year'] = X_['booking_date'].dt.year
        X_['booking_month'] = X_['booking_date'].dt.month
        X_['booking_day_of_month'] = X_['booking_date'].dt.day
        X_['booking_day_of_week'] = X_['booking_date'].dt.day_of_week
        X_['booking_week_number'] = X_['booking_date'].dt.isocalendar().week

        X_['arrival_day_of_week'] = X_['arrival_date'].dt.dayofweek

        X_['is_family'] = X_.apply(lambda x: self.__family(x['children'] + x['babies']), axis=1)

        timeseries_labels = ['arrival_month','arrival_week_number','arrival_day_of_month','arrival_day_of_week',
                'booking_month', 'booking_week_number', 'booking_day_of_month', 'booking_day_of_week']

        for label in timeseries_labels:
            X_[label + "_norm"] = 2 * math.pi * X_[label] / X_[label].max()
            X_["cos_" + label] = np.cos(X_[label + "_norm"])
            X_["sin_" + label] = np.sin(X_[label + "_norm"])

            X_.drop(labels=[label + '_norm', label], axis=1, inplace=True)

        X_.drop(labels=['is_canceled','adr','reservation_status','reservation_status_date','arrival_date','booking_date','hotel'],axis=1,inplace=True)

        # self.__view_data_specs(X_)
        return X_

    def __null_handler(self,X):
        X.agent.fillna(0, inplace=True)
        X.company.fillna(0, inplace=True)
        X.children.fillna(0, inplace=True)
        X.country.fillna('PRT', inplace=True)
        X.loc[X[X['market_segment'] == 'Undefined'].index, 'market_segment'] = 'Online TA'
        X.loc[X[X['distribution_channel'] == 'Undefined'].index, 'distribution_channel'] = 'TA/TO'
        return X

    def __get_month_index(self,month):
        months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
        return months.index(month)+1

    def __get_date(self,year,month,day):
        return str(year) + '-' + str(month) + '-' + str(day)

    def __family(self,value):
        if value > 0:
            return 1
        else:
            return 0

    def __view_data_specs(self,data):
        print(data.columns)
        print(data.shape)
        print(data.info())

# feature_extractor = FeatureExtractor()
# feature_extractor.fit(train_data)
# transformed_data = feature_extractor.transform(train_data)
# transformed_data.info()

In [6]:
from sklearn.pipeline import make_pipeline,Pipeline
from sklearn.compose import make_column_transformer,make_column_selector,ColumnTransformer
from sklearn.preprocessing import StandardScaler,OneHotEncoder,LabelEncoder,OrdinalEncoder
from sklearn.impute import SimpleImputer

columnTransformer = ColumnTransformer([
    ('cat_transformer', OneHotEncoder(), make_column_selector(dtype_include=object)),
    ('num_transformer', StandardScaler(), make_column_selector(dtype_include=int))
], remainder='passthrough')

full_transformer = Pipeline([
    ('feature_extractor', FeatureExtractor()),
    ('col_transformer', columnTransformer),
])

# full_transformer.fit(train_data)
# transformed_data = full_transformer.transform(train_data)

Feature Extractor initiated...


In [7]:
# Creating Cancel Lead Time

def get_cancel_lead_time(data):
    duplicate = data.copy()
    duplicate.rename(columns={'arrival_date_year':'arrival_year', 'arrival_date_month':'arrival_month', 'arrival_date_day_of_month':'arrival_day_of_month', 'arrival_date_week_number':'arrival_week_number'}, inplace=True)
    duplicate['reservation_status_date'] = pd.to_datetime(duplicate['reservation_status_date'], format='%Y-%m-%d')
    duplicate['arrival_month'] = duplicate.apply(lambda x: __get_month_index(x['arrival_month']), axis=1)
    duplicate['arrival_date'] = duplicate.apply(lambda x: __get_date(x['arrival_year'], x['arrival_month'], x['arrival_day_of_month']), axis=1)
    duplicate['arrival_date'] = pd.to_datetime(duplicate['arrival_date'], format='%Y-%m-%d')
    duplicate['cancel_lead_time'] = (duplicate['arrival_date'] - duplicate['reservation_status_date']).dt.days
    duplicate.loc[duplicate['cancel_lead_time'] < 0, 'cancel_lead_time'] = 0
    return duplicate['cancel_lead_time']

def __get_date(year,month,day):
    return str(year) + '-' + str(month) + '-' + str(day)

def __get_month_index(month):
    months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
    return months.index(month)+1

In [8]:
X_train,y_train = train_data,get_cancel_lead_time(train_data)
X_val,y_val = val_data,get_cancel_lead_time(val_data)

In [29]:
from sklearn.metrics import mean_squared_error,mean_absolute_percentage_error,confusion_matrix,accuracy_score

def test_model(model):
    # print(model)
    model.fit(X_train,y_train)
    y_pred = model.predict(X_val)

    print(f'-Train score: {model.score(X_train,y_train)}')
    print(f'-Val score: {model.score(X_val,y_val)}')

    test_data = pd.read_csv('test_data.csv')
    X_test,y_test = test_data,get_cancel_lead_time(test_data)
    print(f'-Test score: {model.score(X_test,y_test)}')

In [10]:
import pickle
import joblib 

def create_pkl(model):
    filename = 'm1_pipeline.pkl'
    pickle_out = open(filename,'wb')
    pickle.dump(model,pickle_out)
    # joblib.dump(model, pickle_out)
    pickle_out.close()
    print(f'Pkl File: {filename}')

In [11]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor,ExtraTreesClassifier,AdaBoostRegressor,GradientBoostingRegressor

model_pipeline = Pipeline([
    ('full_transformer', full_transformer),
    ('forest_reg', RandomForestRegressor(n_estimators=200,n_jobs=10))
])

model_pipeline

In [12]:
test_model(model_pipeline)

Fitting data...
Extracting data...
Extracting data...
Extracting data...
-Train score: 0.9846051415244558
Extracting data...
-Val score: 0.9018183266871983
Extracting data...
-Test score: 0.9077137550784973


In [16]:
y_pred = model_pipeline.predict(X_val)

Extracting data...


In [26]:
val_data['prediction'] = y_pred

def get_pred(x):
    if x > 0:
        return 1
    else :
        return 0

val_data['prediction'] = val_data['prediction'].apply(lambda x: get_pred(x))

263.0
2.055
8.2
26.04017489786885
0.0
0.645
0.05
0.0
39.0
1.0
2.64
0.12
0.04
51.89033333333333
104.0
0.2618232281629565
0.335
21.8275
2.615
100.0
0.005
62.0
0.1
7.62
4.555
39.0
3.525833333333333
0.0
55.535
8.09625
0.0
1.315
5.485
25.73
323.0
33.0
27.312166666666666
6.145
0.03
67.0
0.0
19.52
0.25351253084645853
2.1
52.176
18.64
201.0
11.0225
1.715
10.345
0.0
0.0
217.0
79.0
29.77
0.0
2.19
16.175
1.05
20.36
22.51
4.0
139.0
58.67
0.025
2.975
76.0
1.555
1.09
10.831666666666665
16.29
0.0
169.0
1.965
0.455
0.0
0.0
0.0
4.985
7.585
0.155
0.89
0.0
0.0
2.055
18.815
129.0
0.0
158.78
370.0
0.0
0.0
0.0
186.0
0.0
34.0
0.0
31.94
1.895
0.02
0.0
0.355
8.255
41.375
52.0
0.245
0.025
4.5707457322551654
109.20233333333334
25.562122377622376
23.86
0.345
78.0
0.0
80.0
2.3
37.79181430905695
135.0
0.0
84.355
114.97
0.19
50.885
0.0
0.0
0.1
219.16750000000005
0.0
0.0
0.0
7.0
44.33387418021413
17.383154761904763
0.0
9.975
3.975
0.02479124531288651
0.0
2.065
186.0
8.795
1.27
20.505
227.0
66.0
0.12
0.0
0.36
1.1
5.48

In [27]:
val_data.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,prediction
0,City Hotel,1,339,2015,September,39,21,1,1,2,0.0,0,BB,PRT,Groups,TA/TO,0,1,0,A,A,0,Non Refund,1.0,,0,Contract,62.0,0,0,Canceled,2015-01-01,1
1,City Hotel,0,29,2016,May,20,8,2,1,2,0.0,0,BB,BRA,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,155.0,,0,Transient,145.0,0,1,Check-Out,2016-05-11,1
2,City Hotel,1,276,2017,May,19,12,0,2,2,0.0,0,BB,CHE,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,9.0,,0,Transient,135.9,0,2,Canceled,2017-05-09,1
3,City Hotel,1,88,2015,September,39,26,2,2,2,0.0,0,BB,PRT,Groups,TA/TO,0,0,0,A,A,0,Non Refund,1.0,,0,Transient,170.0,0,0,Canceled,2015-08-21,1
4,City Hotel,0,346,2016,September,38,13,0,2,1,0.0,0,HB,DEU,Offline TA/TO,TA/TO,0,0,0,A,D,1,No Deposit,6.0,,0,Transient-Party,90.0,0,0,Check-Out,2016-09-15,0


In [28]:
confusion_matrix(val_data['is_canceled'],val_data['prediction'])

array([[2673, 6642],
       [  78, 6540]])

In [30]:
accuracy_score(val_data['is_canceled'],val_data['prediction'])

0.5782338542647336