## Import

In [1]:
import warnings

warnings.filterwarnings('ignore')

In [2]:
import random as rnd
import warnings

import matplotlib
import numpy as np
import pandas as pd
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

## Reading data:

datapath : "../data"

In [21]:
traindf = pd.read_csv('../data/train.csv', index_col="ID")
traindf.sample(2)

Unnamed: 0_level_0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1
13710,Resort Hotel,0,5,2015,October,41,10,1,1,2,0.0,0,BB,ESP,Online TA,TA/TO,0,0,0,A,G,0,No Deposit,240.0,,0,Transient,65.409366,0,2,Check-Out,2015-10-12
19726,City Hotel,0,135,2015,December,49,5,2,1,2,0.0,0,BB,PRT,Groups,TA/TO,0,0,0,A,D,0,No Deposit,1.0,,0,Transient-Party,74.339792,0,0,Check-Out,2015-12-08


### Drop columns not in test data and Clean missing data
* Drop columns: `is_canceled`, `reservation_status`, `reservation_status_date`, where they aren't in test data

In [22]:
testCol = set(pd.read_csv('../data/test.csv', index_col="ID").columns)
removeCols = [col for col in traindf.columns if col not in testCol]
removeCols.remove("adr")
print(removeCols)

['is_canceled', 'reservation_status', 'reservation_status_date']


In [23]:
traindf = traindf.drop(removeCols, axis=1)
traindf.sample(2)

Unnamed: 0_level_0,hotel,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1
85256,City Hotel,17,2017,February,8,21,0,2,2,0.0,0,SC,CHN,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,9.0,,0,Transient,71.824894,0,0
53195,City Hotel,170,2016,July,31,28,0,3,2,1.0,0,SC,FRA,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,7.0,,0,Transient,36.564623,0,2


### Missing data
The below code shows column name of missing data

In [24]:
tmp = pd.to_numeric((traindf.isnull().sum() / traindf.shape[0]).map('{:,.2f}'.format))
tmp[tmp > 0]

country    0.01
agent      0.14
company    0.94
dtype: float64

* The company feature's 94% is missing. Because of that, this feature will be eliminated. 
* Another missing data has occurred in country and agent features. Since missing data of country is less than 1%, these data will replace with most frequent value. 
* However, the agent missing features are more than the country. For this feature, missing data will be imputed as 0

In [25]:
# In this state, avoid too many feature, I just drop agent and country columns

traindf["children"] = traindf["children"].fillna(0)
traindf["country"] = traindf["country"].fillna(traindf["country"].mode().index[0])
traindf["agent"] = traindf["agent"].fillna("0")
traindf = traindf.drop(["company", "agent", "country"], axis=1)

In [26]:
traindf.sample(5)

Unnamed: 0_level_0,hotel,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
56821,City Hotel,134,2016,August,34,19,2,2,2,1.0,0,BB,Online TA,TA/TO,0,0,0,D,D,8,No Deposit,0,Transient,132.471656,0,2
87575,City Hotel,25,2017,March,10,6,1,3,2,0.0,0,BB,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,0,Transient,87.225606,0,0
48925,City Hotel,17,2016,June,27,29,0,2,3,1.0,0,BB,Direct,Direct,0,0,0,G,G,0,No Deposit,0,Transient,241.150661,0,1
20675,Resort Hotel,1,2015,December,51,18,0,1,2,0.0,0,BB,Offline TA/TO,TA/TO,0,0,0,A,D,0,No Deposit,0,Group,24.853688,0,0
59952,City Hotel,183,2016,September,37,8,0,3,2,0.0,0,BB,Offline TA/TO,TA/TO,0,0,0,A,A,0,No Deposit,0,Transient-Party,89.87191,0,1


### Feature Transformation

`date`: Combine `arrival_date_year`, `arrival_date_month`, `arrival_date_week_number` to date object
<br>One hot encoding `arrival_date_month`

In [27]:
# def create_date(year: pd.Series, month: pd.Series, date: pd.Series) -> pd.Series:
#     return pd.to_datetime(year.astype(str) + month + date.astype(str), format="%Y%B%d")

# traindf.insert(loc=0, column="date", 
#           value=create_date(traindf["arrival_date_year"], traindf["arrival_date_month"], traindf["arrival_date_day_of_month"]))
# traindf = traindf.drop(["arrival_date_year", "arrival_date_month", "arrival_date_day_of_month"], axis=1)

month = pd.get_dummies(traindf.arrival_date_month)
traindf = pd.concat([traindf, month], axis=1, join="inner")
traindf = traindf.drop(["arrival_date_month"], axis=1)

`hotel`: Convert hotel as `Resort Hotel` to 0, `City Hotel` to 1

In [28]:
traindf["hotel"] = traindf["hotel"].map({"Resort Hotel": 0, "City Hotel": 1})

Create new feature `is_same_room`: Check if `reserved_room_type` is same as `assigned_room_type`(If so 1, else 0)
<br>One hot encoding `reserved_room_type` and `assigned_room_type`

In [29]:
traindf["is_same_room"] = (traindf["reserved_room_type"] == traindf["assigned_room_type"]).map({True: 1, False: 0})
reserved = pd.get_dummies(traindf.reserved_room_type, prefix="reserved_")
assigned = pd.get_dummies(traindf.assigned_room_type, prefix="assigned_")
traindf = pd.concat([traindf, reserved, assigned], axis=1, join="inner")
traindf = traindf.drop(["reserved_room_type", "assigned_room_type"], axis=1)

`children`: `children` + `babies`

In [30]:
traindf["children"] = traindf["children"] + traindf["babies"]
traindf = traindf.drop(["babies"], axis=1)

One hot encoding: `meal`

In [31]:
meal = pd.get_dummies(traindf.meal, prefix="meal_")
traindf = pd.concat([traindf, meal], axis=1, join="inner")
traindf = traindf.drop(["meal"], axis=1)

In [32]:
traindf.sample(2)

Unnamed: 0_level_0,hotel,lead_time,arrival_date_year,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,deposit_type,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,April,August,December,February,January,July,June,March,May,November,October,September,is_same_room,reserved__A,reserved__B,reserved__C,reserved__D,reserved__E,reserved__F,reserved__G,reserved__H,reserved__L,reserved__P,assigned__A,assigned__B,assigned__C,assigned__D,assigned__E,assigned__F,assigned__G,assigned__H,assigned__I,assigned__K,assigned__L,assigned__P,meal__BB,meal__FB,meal__HB,meal__SC,meal__Undefined
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1
46422,1,188,2016,25,15,0,2,1,0.0,Offline TA/TO,TA/TO,0,0,0,0,Non Refund,39,Transient,104.307768,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
89205,1,615,2017,11,16,0,1,1,0.0,Groups,TA/TO,0,0,0,0,Non Refund,0,Transient,22.505683,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0


### Train Model and Evaluate best model performance:

Since I only know about regression, I only use SVR as my model, and save the result as 

In [33]:
traindf = pd.get_dummies(traindf)
new_train_x = traindf.drop("adr", axis=1).values
new_train_y = traindf["adr"].values

In [34]:
traindf = pd.get_dummies(traindf, columns=["arrival_date_year"])
traindf = traindf.drop(["arrival_date_week_number", "arrival_date_day_of_month"], axis=1)

In [36]:
traindf.to_csv("PredictAdr.csv")

In [29]:
# Scaler

#from sklearn.preprocessing import MinMaxScaler

#scaler = MinMaxScaler()
#print(scaler.fit(new_train_x))
#print(scaler.transform(new_train_x))
#print(scaler.transform(new_train_y))

In [30]:
X_train, X_test, y_train, y_test = train_test_split(new_train_x, new_train_y, test_size=0.3)
regr = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.2))
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)

In [33]:
# The result here shows the avg E_in

np.linalg.norm((y_pred - y_test)) / len(y_pred)

0.17806214424870953

In [34]:
regr = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.2))
regr.fit(new_train_x, new_train_y)
y_pred = regr.predict(new_train_x)

In [35]:
y_pred_csv = pd.DataFrame(y_pred)

In [42]:
y_pred_csv.to_csv("adr_predict.csv", header=["adr"])

In [44]:
adr = pd.DataFrame(new_train_y)

In [45]:
y_pred_csv

Unnamed: 0,0
0,102.197540
1,60.598775
2,72.030726
3,72.030726
4,72.030726
...,...
91526,96.868011
91527,-2.138195
91528,84.689882
91529,44.351993


In [46]:
adr

Unnamed: 0,0
0,-6.305161
1,75.052227
2,74.546401
3,76.376288
4,49.411647
...,...
91526,79.223571
91527,-6.822102
91528,90.814554
91529,38.135565


In [47]:
diff = y_pred_csv - adr

In [50]:
diff = diff.apply(abs)

In [57]:
indexes = diff[diff[0] > 50].index

In [61]:
perform_bad = traindf.loc[indexes]

In [77]:
perform_bad[perform_bad["adr"] < 10]["adr"]

0        -6.305161
63       -0.729340
64       -2.902196
119     -88.642804
341      -7.897349
           ...    
91177     7.805044
91261   -62.157892
91263   -59.188184
91275   -68.646270
91305   -44.775831
Name: adr, Length: 1246, dtype: float64

In [78]:
traindf[traindf["adr"] < 10]["adr"]

ID
0        -6.305161
63       -0.729340
64       -2.902196
119     -88.642804
148       8.960203
           ...    
91283   -27.102503
91305   -44.775831
91500   -14.504251
91503   -12.919657
91527    -6.822102
Name: adr, Length: 4010, dtype: float64