## Import

In [25]:
import warnings

warnings.filterwarnings('ignore')

In [90]:
import random as rnd
import warnings

import matplotlib
import numpy as np
import pandas as pd
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

## Reading data:

datapath : "../data"

In [107]:
traindf = pd.read_csv('../data/train.csv', index_col="ID")
traindf.sample(2)

Unnamed: 0_level_0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1
54305,Resort Hotel,0,122,2016,August,32,4,3,8,2,0.0,0,HB,ESP,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,240.0,,0,Transient-Party,204.950009,0,0,Check-Out,2016-08-15
28546,City Hotel,1,94,2016,March,10,4,0,2,2,0.0,0,BB,PRT,Offline TA/TO,TA/TO,0,1,0,A,A,0,Non Refund,154.0,,28,Transient,70.346076,0,0,Canceled,2015-12-29


### Drop columns not in test data and Clean missing data
* Drop columns: `is_canceled`, `reservation_status`, `reservation_status_date`, where they aren't in test data

In [108]:
testCol = set(pd.read_csv('../data/test.csv', index_col="ID").columns)
removeCols = [col for col in traindf.columns if col not in testCol]
removeCols.remove("adr")
print(removeCols)

['is_canceled', 'reservation_status', 'reservation_status_date']


In [109]:
traindf = traindf.drop(removeCols, axis=1)
traindf.sample(2)

Unnamed: 0_level_0,hotel,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1
41264,Resort Hotel,217,2016,May,21,17,2,5,2,0.0,0,BB,GBR,Offline TA/TO,TA/TO,0,0,0,E,E,0,No Deposit,40.0,,0,Transient,54.423074,0,0
49808,City Hotel,45,2016,July,28,5,0,1,2,1.0,0,BB,PRT,Online TA,TA/TO,0,0,0,A,E,0,No Deposit,9.0,,0,Transient,135.416362,0,1


### Missing data, processing agent, company, country data
The below code shows column name of missing data

In [110]:
tmp = pd.to_numeric((traindf.isnull().sum() / traindf.shape[0]).map('{:,.2f}'.format))
tmp[tmp > 0]

country    0.01
agent      0.14
company    0.94
dtype: float64

Probrabaly one booking with `company` is more unlikely to chacel, also the one with specific `agent`, so turn them to boolean

For `Country`, take the nlargest(10)

In [111]:
traindf["children"] = traindf["children"].fillna(0)

traindf["company"] = (~traindf["company"].isna()).astype("int64")
traindf["agent"] = (~traindf["agent"].isna()).astype("int64")

TopCountrySet = set(traindf["country"].value_counts()[:10].index)
traindf["country"] = traindf["country"].apply(lambda x:x if x in TopCountrySet else "Other")
traindf = pd.get_dummies(traindf, prefix="", prefix_sep="", columns=["country"])

In [112]:
traindf.sample(5)

Unnamed: 0_level_0,hotel,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,BEL,BRA,DEU,ESP,FRA,GBR,IRL,ITA,NLD,Other,PRT
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1
62993,City Hotel,254,2016,September,39,24,1,1,2,0.0,0,HB,Groups,TA/TO,0,0,0,A,A,0,Non Refund,1,0,0,Transient,79.186749,0,0,0,0,0,0,0,0,0,0,0,0,1
3728,Resort Hotel,68,2015,August,32,8,1,1,3,0.0,0,BB,Online TA,TA/TO,0,0,0,D,D,2,No Deposit,1,0,0,Transient,143.43665,0,3,0,0,0,0,0,0,0,0,0,0,1
23699,City Hotel,4,2016,January,5,25,1,3,1,0.0,0,BB,Offline TA/TO,TA/TO,0,0,0,A,A,0,No Deposit,1,0,0,Transient,64.560495,0,0,0,0,0,1,0,0,0,0,0,0,0
56563,Resort Hotel,188,2016,August,34,18,2,4,2,0.0,0,BB,Offline TA/TO,TA/TO,0,0,0,A,A,0,No Deposit,1,0,0,Transient,69.186753,0,1,0,0,0,0,0,0,0,0,0,0,1
87065,City Hotel,11,2017,March,9,3,0,2,2,0.0,0,SC,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,1,0,0,Transient,85.149107,0,0,1,0,0,0,0,0,0,0,0,0,0


### Feature Transformation

`children`: `children` + `babies`  
`stays_in_nights`: `stays_in_week_nights` + `stays_in_weekend_nights`    
`hotel`: Convert hotel as `Resort Hotel` to 0, `City Hotel` to 1  

Create new feature `is_same_room`: Check if `reserved_room_type` is same as `assigned_room_type`(If so 1, else 0)

drop: `arrival_date_year` -- While training, most of them are 0  
      `arrival_date_week_number` -- Too trivial  
      `arrival_date_day_of_month` -- Too trivial  

One hot encoding: `meal`  
One hot encoding: `reserved_room_type` and `assigned_room_type`  
One hot encoding: `arrival_date_month`  
One hot encoding: `market_segment` `distribution_channel` `deposit_type` `customer_type`

In [113]:
traindf = pd.get_dummies(traindf, prefix="", prefix_sep="", columns=["arrival_date_month"])
traindf["stays_in_nights"] = traindf["stays_in_week_nights"] + traindf["stays_in_weekend_nights"] 
traindf["children"] = traindf["children"] + traindf["babies"]
traindf = traindf.drop(["babies"], axis=1)
traindf["hotel"] = traindf["hotel"].map({"Resort Hotel": 0, "City Hotel": 1})

traindf = traindf.drop(columns=["arrival_date_year", "arrival_date_week_number", "arrival_date_day_of_month"])

traindf["is_same_room"] = (traindf["reserved_room_type"] == traindf["assigned_room_type"]).map({True: 1, False: 0})
traindf = pd.get_dummies(traindf, columns=["reserved_room_type", "assigned_room_type"], prefix=["reserved", "assigned"])

traindf = pd.get_dummies(traindf, columns=["meal", "market_segment", "distribution_channel", "deposit_type", "customer_type"])

In [127]:
traindf.sample(2)

Unnamed: 0_level_0,hotel,lead_time,stays_in_weekend_nights,stays_in_week_nights,adults,children,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,agent,company,days_in_waiting_list,adr,required_car_parking_spaces,total_of_special_requests,BEL,BRA,DEU,ESP,FRA,GBR,IRL,ITA,NLD,Other,PRT,April,August,December,February,January,July,June,March,May,November,October,September,stays_in_nights,is_same_room,reserved_A,reserved_B,reserved_C,reserved_D,reserved_E,reserved_F,reserved_G,reserved_H,reserved_L,reserved_P,assigned_A,assigned_B,assigned_C,assigned_D,assigned_E,assigned_F,assigned_G,assigned_H,assigned_I,assigned_K,assigned_L,assigned_P,meal_BB,meal_FB,meal_HB,meal_SC,meal_Undefined,market_segment_Aviation,market_segment_Complementary,market_segment_Corporate,market_segment_Direct,market_segment_Groups,market_segment_Offline TA/TO,market_segment_Online TA,market_segment_Undefined,distribution_channel_Corporate,distribution_channel_Direct,distribution_channel_GDS,distribution_channel_TA/TO,distribution_channel_Undefined,deposit_type_No Deposit,deposit_type_Non Refund,deposit_type_Refundable,customer_type_Contract,customer_type_Group,customer_type_Transient,customer_type_Transient-Party
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1
78608,0,315,0,2,2,0.0,0,0,0,0,1,0,0,-18.856372,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,2,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,1,0
55502,0,12,0,2,2,0.0,0,0,0,0,1,0,0,205.079976,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,2,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0


Final Adjustment:

In [174]:
traindf = traindf[traindf["adr"] < 3000]

### Train Model and Evaluate best model performance:

Since I only know about regression, I only use SVR as my model, and save the result as 

In [175]:
new_train_x = traindf.drop("adr", axis=1).values
new_train_y = traindf["adr"].values

In [185]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import StandardScaler

In [196]:
X_train, X_test, y_train, y_test = train_test_split(new_train_x, new_train_y, test_size=0.3)
scaler = StandardScaler()
scaler = scaler.fit(X_train)
X_train = scaler.transform(X_train)
regr = Ridge()
regr.fit(X_train, y_train)
X_test = scaler.transform(X_test)
y_pred = regr.predict(X_test)

from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

print(f"MSE is: {mean_squared_error(y_test, y_pred)} \nR2_score is: {r2_score(y_test, y_pred)}")

MSE is: 962.952636055753 
R2_score is: 0.5856340688395343


In [189]:
for col, value in zip(traindf.drop("adr", axis=1).columns, regr.coef_):
    print(f"{col}: {value:.2f}")

hotel: 17.76
lead_time: -17.24
stays_in_weekend_nights: 7.93
stays_in_week_nights: -6.76
adults: 3.48
children: 4.71
is_repeated_guest: -2.36
previous_cancellations: -1.47
previous_bookings_not_canceled: 0.13
booking_changes: 0.50
agent: -1.22
company: 0.46
days_in_waiting_list: 1.06
required_car_parking_spaces: 1.50
total_of_special_requests: 0.79
BEL: -0.06
BRA: 0.41
DEU: -0.38
ESP: 1.03
FRA: 0.26
GBR: -0.96
IRL: -0.52
ITA: 0.08
NLD: -0.14
Other: 0.78
PRT: -0.42
April: -1.60
August: 8.45
December: -3.65
February: -5.92
January: -5.75
July: 5.67
June: 3.26
March: -4.55
May: 1.32
November: -3.81
October: 0.04
September: 5.27
stays_in_nights: -1.96
is_same_room: 2.36
reserved_A: -4.04
reserved_B: -3.00
reserved_C: 0.64
reserved_D: 1.53
reserved_E: 1.59
reserved_F: 2.96
reserved_G: 3.44
reserved_H: 3.75
reserved_L: 0.31
reserved_P: -0.12
assigned_A: -1.92
assigned_B: 0.24
assigned_C: 1.34
assigned_D: -0.03
assigned_E: 1.50
assigned_F: 2.15
assigned_G: 1.93
assigned_H: -0.23
assigned_I: -