In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import timedelta

In [31]:
months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']

def get_month_index(month):
    return months.index(month)+1

In [32]:
raw_data = pd.read_csv('model1_data.csv', converters={'arrival_date_month': get_month_index},parse_dates=['reservation_status_date'])
raw_data.rename(columns={'arrival_date_year':'arrival_year', 'arrival_date_month':'arrival_month', 
'arrival_date_day_of_month':'arrival_day_of_month', 'arrival_date_week_number':'arrival_week_number'}, inplace=True)
pd.set_option('display.max_columns', None)
city_data = raw_data.copy(deep=True)

In [33]:
raw_data.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_year,arrival_month,arrival_week_number,arrival_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,City Hotel,0,190,2016,5,19,6,2,2,2,0.0,0,BB,DEU,Online TA,TA/TO,0,0,0,B,B,0,No Deposit,8.0,,0,Transient,100.3,0,0,Check-Out,2016-05-10
1,City Hotel,1,343,2015,9,39,25,2,3,2,0.0,0,BB,PRT,Groups,TA/TO,0,1,0,A,A,1,Non Refund,1.0,,0,Transient,170.0,0,0,Canceled,2015-09-09
2,City Hotel,0,262,2016,11,45,4,0,2,3,0.0,0,BB,GBR,Online TA,TA/TO,0,0,0,D,D,0,No Deposit,9.0,,0,Transient,146.7,0,2,Check-Out,2016-11-06
3,City Hotel,1,206,2017,3,13,26,2,1,2,0.0,0,HB,PRT,Groups,TA/TO,0,0,0,A,A,0,No Deposit,296.0,,0,Transient-Party,114.0,0,0,Canceled,2016-12-10
4,City Hotel,1,60,2015,7,29,13,1,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,A,A,0,No Deposit,14.0,,0,Transient,76.5,0,0,No-Show,2015-07-13


In [34]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31732 entries, 0 to 31731
Data columns (total 32 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   hotel                           31732 non-null  object        
 1   is_canceled                     31732 non-null  int64         
 2   lead_time                       31732 non-null  int64         
 3   arrival_year                    31732 non-null  int64         
 4   arrival_month                   31732 non-null  int64         
 5   arrival_week_number             31732 non-null  int64         
 6   arrival_day_of_month            31732 non-null  int64         
 7   stays_in_weekend_nights         31732 non-null  int64         
 8   stays_in_week_nights            31732 non-null  int64         
 9   adults                          31732 non-null  int64         
 10  children                        31729 non-null  float64       
 11  ba

In [35]:
# Null Handling

# children : mode
# country : mode
# agent : category
# company : category

# children_mode = city_data.children.mode()
# city_data['children'].fillna(children_mode, inplace=True)

# country_mode = city_data.country.mode()
# city_data['country'].fillna(country_mode, inplace=True)

city_data.agent.fillna(0, inplace=True)

city_data.company.fillna(0, inplace=True)

city_data.dropna(inplace=True)

In [36]:
def get_date(year,month,day):
    return str(year) + '-' + str(month) + '-' + str(day)

city_data['arrival_date'] = city_data.apply(lambda x: get_date(x['arrival_year'], x['arrival_month'], x['arrival_day_of_month']), axis=1)
city_data['arrival_date'] = pd.to_datetime(city_data['arrival_date'], format='%Y-%m-%d')
city_data['arrival_day_of_week'] = city_data['arrival_date'].dt.day_of_week

city_data['booking_date'] = city_data.apply(lambda x: x['arrival_date'] - timedelta(days=x['lead_time']), axis=1)
city_data['booking_year'] = city_data['booking_date'].dt.year
city_data['booking_month'] = city_data['booking_date'].dt.month
city_data['booking_day_of_month'] = city_data['booking_date'].dt.day
city_data['booking_day_of_week'] = city_data['booking_date'].dt.day_of_week
city_data['booking_week_number'] = city_data['booking_date'].dt.isocalendar().week

# def get_month_day(date):
#     return str(date)[5:]

# city_data['arrival_day_month'] = city_data.apply(lambda x: get_month_day(x['arrival_date']), axis=1)


# city_data['booked'] = 1

city_data['arrival_day_of_week'] = city_data['arrival_date'].dt.dayofweek

# city_data['cancel_lead_time'] = (city_data['arrival_date'] - city_data['reservation_status_date']).dt.days
# city_data.loc[city_data ['cancel_lead_time'] < 0, 'cancel_lead_time'] = 0

# city_data['stay_duration'] = (city_data['reservation_status_date'] - city_data['arrival_date']).dt.days


def family(value):
    if value > 0:
        return 1
    else:
        return 0

city_data['is_family'] = city_data.apply(lambda x: family(x['children'] + x['babies']), axis=1)

In [37]:
city_data.drop(labels=['adr','reservation_status','reservation_status_date','arrival_date','booking_date','hotel'],axis=1,inplace=True)

In [38]:
# city_data = data[data['hotel'] == 'City Hotel'].drop(labels='hotel',axis=1)
# resort_data = data[data['hotel'] == 'Resort Hotel'].drop(labels='hotel',axis=1)

In [39]:
city_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31723 entries, 0 to 31731
Data columns (total 35 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   is_canceled                     31723 non-null  int64  
 1   lead_time                       31723 non-null  int64  
 2   arrival_year                    31723 non-null  int64  
 3   arrival_month                   31723 non-null  int64  
 4   arrival_week_number             31723 non-null  int64  
 5   arrival_day_of_month            31723 non-null  int64  
 6   stays_in_weekend_nights         31723 non-null  int64  
 7   stays_in_week_nights            31723 non-null  int64  
 8   adults                          31723 non-null  int64  
 9   children                        31723 non-null  float64
 10  babies                          31723 non-null  int64  
 11  meal                            31723 non-null  object 
 12  country                         

In [40]:
city_data

Unnamed: 0,is_canceled,lead_time,arrival_year,arrival_month,arrival_week_number,arrival_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,required_car_parking_spaces,total_of_special_requests,arrival_day_of_week,booking_year,booking_month,booking_day_of_month,booking_day_of_week,booking_week_number,is_family
0,0,190,2016,5,19,6,2,2,2,0.0,0,BB,DEU,Online TA,TA/TO,0,0,0,B,B,0,No Deposit,8.0,0.0,0,Transient,0,0,4,2015,10,29,3,44,0
1,1,343,2015,9,39,25,2,3,2,0.0,0,BB,PRT,Groups,TA/TO,0,1,0,A,A,1,Non Refund,1.0,0.0,0,Transient,0,0,4,2014,10,17,4,42,0
2,0,262,2016,11,45,4,0,2,3,0.0,0,BB,GBR,Online TA,TA/TO,0,0,0,D,D,0,No Deposit,9.0,0.0,0,Transient,0,2,4,2016,2,16,1,7,0
3,1,206,2017,3,13,26,2,1,2,0.0,0,HB,PRT,Groups,TA/TO,0,0,0,A,A,0,No Deposit,296.0,0.0,0,Transient-Party,0,0,6,2016,9,1,3,35,0
4,1,60,2015,7,29,13,1,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,A,A,0,No Deposit,14.0,0.0,0,Transient,0,0,0,2015,5,14,3,20,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31727,1,27,2016,12,50,7,0,4,3,0.0,0,BB,ESP,Online TA,TA/TO,0,0,0,D,E,3,No Deposit,9.0,0.0,0,Transient,0,1,2,2016,11,10,3,45,0
31728,0,1,2016,3,13,24,0,1,2,1.0,0,BB,PRT,Online TA,TA/TO,0,0,0,A,D,0,No Deposit,9.0,0.0,0,Transient,0,2,3,2016,3,23,2,12,1
31729,0,23,2016,1,1,2,2,1,1,0.0,0,HB,ITA,Offline TA/TO,TA/TO,0,0,0,A,A,0,No Deposit,26.0,0.0,0,Transient-Party,0,0,5,2015,12,10,3,50,0
31730,0,0,2016,2,7,8,1,0,3,2.0,0,BB,PRT,Direct,Direct,0,0,0,G,G,2,No Deposit,0.0,0.0,0,Transient,0,0,0,2016,2,8,0,6,1


### Preprocessing

In [41]:
cat_columns = ['meal','country','market_segment','distribution_channel','reserved_room_type','assigned_room_type','deposit_type',
                'customer_type']

cat_data = city_data[cat_columns]

In [42]:
num_data = city_data.drop(labels=cat_columns+['is_canceled'],axis=1)
columns = num_data.columns
num_data = num_data.to_numpy()
num_data

array([[190, 2016, 5, ..., 3, 44, 0],
       [343, 2015, 9, ..., 4, 42, 0],
       [262, 2016, 11, ..., 1, 7, 0],
       ...,
       [23, 2016, 1, ..., 3, 50, 0],
       [0, 2016, 2, ..., 0, 6, 1],
       [116, 2017, 3, ..., 0, 48, 0]], dtype=object)

In [43]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder,OrdinalEncoder
from sklearn import tree
from sklearn.metrics import mean_squared_error,mean_absolute_percentage_error,confusion_matrix


In [44]:
encoder = OneHotEncoder(sparse=True).fit(cat_data)
cat_data = encoder.transform(cat_data).toarray()
cat_data.shape

(31723, 179)

In [26]:
encoder.categories_

[array(['BB', 'FB', 'HB', 'SC'], dtype=object),
 array(['AGO', 'ALB', 'AND', 'ARE', 'ARG', 'ARM', 'ASM', 'ATA', 'ATF',
        'AUS', 'AUT', 'AZE', 'BEL', 'BEN', 'BFA', 'BGD', 'BGR', 'BIH',
        'BLR', 'BOL', 'BRA', 'BRB', 'CAF', 'CHE', 'CHL', 'CHN', 'CIV',
        'CMR', 'CN', 'COL', 'COM', 'CPV', 'CRI', 'CYP', 'CZE', 'DEU',
        'DMA', 'DNK', 'DOM', 'DZA', 'ECU', 'EGY', 'ESP', 'EST', 'FIN',
        'FRA', 'FRO', 'GAB', 'GBR', 'GEO', 'GGY', 'GHA', 'GIB', 'GNB',
        'GRC', 'GTM', 'HKG', 'HRV', 'HUN', 'IDN', 'IMN', 'IND', 'IRL',
        'IRN', 'IRQ', 'ISL', 'ISR', 'ITA', 'JEY', 'JOR', 'JPN', 'KAZ',
        'KEN', 'KHM', 'KNA', 'KOR', 'KWT', 'LAO', 'LBN', 'LIE', 'LKA',
        'LTU', 'LUX', 'LVA', 'MAC', 'MAR', 'MCO', 'MDV', 'MEX', 'MKD',
        'MLT', 'MOZ', 'MUS', 'MYS', 'NAM', 'NGA', 'NLD', 'NOR', 'NZL',
        'OMN', 'PAK', 'PAN', 'PER', 'PHL', 'POL', 'PRI', 'PRT', 'PRY',
        'QAT', 'ROU', 'RUS', 'RWA', 'SAU', 'SEN', 'SGP', 'SLV', 'SRB',
        'STP', 'SUR', 'SVK', '

In [27]:
encoded_data = np.concatenate((num_data,cat_data),axis=1)
encoded_data.shape

(31723, 205)

In [28]:
columns = columns.to_list()

for cat in encoder.categories_:
    columns.extend(cat)

len(columns)

205

In [29]:
# corr_data = pd.DataFrame(encoded_data, columns=columns)
# corr_data['is_canceled'] = city_data['is_canceled']

In [48]:
# corr_data.corr()

In [49]:
# corr = corr_data.corr().sort_values()
# corr.dropna(inplace=True)
# corr.drop(labels='is_canceled',inplace=True)
# corr = corr[:15].append(corr[-15:])

# fig,ax = plt.subplots(figsize=(5,10))
# sns.barplot(y=corr.index, x=corr.values)

### Model Building

In [50]:
# X = city_data.drop(labels='is_canceled',axis=1)
# y = city_data['is_canceled']

X = encoded_data
y = city_data['is_canceled']

In [51]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=0)

In [52]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((27757, 209), (27757,), (11896, 209), (11896,))

In [56]:
def test_model(model):
    model.fit(X_train,y_train)
    print(model)
    y_pred = model.predict(X_test)

    print(f'Train score: {model.score(X_train,y_train)}')
    print(f'Test score: {model.score(X_test,y_test)}')
    print(f'Confusion Matrix: {confusion_matrix(y_test,y_pred)}')


In [71]:
import pickle

def create_pkl(model):
    filename = 'm1_' + str(model)[:-2] + '.pkl'
    pickle_out = open(filename,'wb')
    pickle.dump(model,pickle_out)
    pickle_out.close()
    print(f'Pkl File: {filename}')

In [72]:
tree_clf = DecisionTreeClassifier()
test_model(tree_clf)
create_pkl(tree_clf)

DecisionTreeClassifier()
Train score: 0.9951723889469323
Test score: 0.8340618695359785
Confusion Matrix: [[5876 1021]
 [ 953 4046]]
Pkl File: m1_DecisionTreeClassifier.pkl


In [162]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier,BaggingClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV


In [163]:
# svm_clf = Pipeline([
#     ('std_sc',StandardScaler()),
#     ('svc', SVC(probability=True,verbose=True,random_state=0))
# ])

std_sclr = StandardScaler()
X_transform = std_sclr.fit_transform(X_train)

svc = SVC(probability=True,random_state=0)

clf = svc
params = dict(
    C = [0.1,0.3,0.5,0.7,0.9,1],
    kernel = ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed'],
    degree = [2,3,4,5],
)

clf = RandomizedSearchCV(clf, params, verbose=1)
search = clf.fit(X_transform,y_train)
search.best_params_
search.best_score_

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [None]:
svm_clf = Pipeline([
    ('std_sc',StandardScaler()),
    ('svc', SVC(probability=True))
])

In [None]:
test_model(VotingClassifier(
    estimators=[
        ('dc', DecisionTreeClassifier()),
        ('rf',RandomForestClassifier()),
        ('svc',svm_clf)],
    voting='hard'
))

In [None]:
test_model(VotingClassifier(
    estimators=[
        ('dc', DecisionTreeClassifier()),
        ('rf',RandomForestClassifier()),
        ('svc',svm_clf)],
    voting='soft'
))

In [None]:
test_model(BaggingClassifier(
    base_estimator=RandomForestClassifier(),
    bootstrap=False
))

In [None]:
test_model(svm_clf)

In [None]:
test_model(RandomForestClassifier())

In [None]:
test_model(LogisticRegression())

In [None]:
export PATH=/usr/lib/libcudart${PATH:+:${PATH}}
export LD_LIBRARY_PATH=/usr/lib/libcudart:${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}

sudo chmod a+r /usr/lib/libcuda*

In [None]:
import tensorflow as tf

In [None]:
gpus = tf.config.experimental.list_physical_devices('GPU')
print(gpus)

In [None]:
cpus = tf.config.experimental.list_physical_devices('CPU')
print(cpus)

In [None]:
physical_devices = tf.config.experimental.list_physical_devices('GPU')
print(physical_devices)
if physical_devices:
  tf.config.experimental.set_memory_growth(physical_devices[0], True)


In [None]:
tf.test.is_gpu_available()

In [None]:
tf.config.list_physical_devices('GPU')

In [None]:
!nvidia-smi