## Get data

In [1]:
!wget http://students.mimuw.edu.pl/~wd393711/iml/hotel-booking-demand.zip
!unzip hotel-booking-demand.zip

--2020-04-02 17:58:16--  http://students.mimuw.edu.pl/~wd393711/iml/hotel-booking-demand.zip
Resolving students.mimuw.edu.pl (students.mimuw.edu.pl)... 193.0.96.129, 2001:6a0:5001:1::3
Connecting to students.mimuw.edu.pl (students.mimuw.edu.pl)|193.0.96.129|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1308365 (1.2M) [application/zip]
Saving to: ‘hotel-booking-demand.zip.2’


2020-04-02 17:58:18 (1.32 MB/s) - ‘hotel-booking-demand.zip.2’ saved [1308365/1308365]

Archive:  hotel-booking-demand.zip
replace hotel_bookings.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: hotel_bookings.csv      


## Some imports

In [0]:
import pandas as pd
import numpy as np

## Load data

In [0]:
def process_data(data):
    """Turn the raw hotel-bookings frame into an all-numeric feature table.

    Steps performed:
      * replace ``company`` / ``agent`` with boolean presence flags,
      * encode ``arrival_date_month`` as an ordered category (January..December),
      * one-hot encode the status / customer / deposit / segment / channel / meal columns,
      * add two flags comparing reserved vs. assigned room type,
      * drop ``reservation_status_date``, ``country`` and ``hotel``,
      * fill missing ``children`` with 0,
      * replace every categorical column with its integer category code.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw bookings table containing the columns referenced above.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is not modified.
    """
    month_order = ["January", "February", "March", "April", "May", "June",
                   "July", "August", "September", "October", "November", "December"]
    one_hot_columns = ['reservation_status', 'customer_type', 'deposit_type',
                       'market_segment', 'distribution_channel', 'meal']

    # Flag bookings that HAVE a company / agent (value present, not missing).
    # assign() returns a new frame, so the caller's DataFrame is left untouched.
    data = data.assign(
        has_company=data['company'].notna(),
        has_agent=data['agent'].notna(),
    )
    # Drop the raw company/agent values (replaced by the flags above) together
    # with the columns that are not used as features: status date, country, hotel name.
    data = data.drop(columns=['company', 'agent', 'reservation_status_date', 'country', 'hotel'])

    # Encode months as ordered categorical values so their codes follow the calendar
    data['arrival_date_month'] = data['arrival_date_month'].astype('category').cat.set_categories(month_order, ordered=True)

    # One-hot encode the nominal columns
    data = pd.get_dummies(data, columns=one_hot_columns)

    # Encode difference in room type (compared on the raw values, before they
    # are turned into categories below).
    # NOTE(review): "higher" is True when reserved > assigned — confirm that this
    # direction matches the intended meaning of the room type codes.
    data['got_higher_type_room'] = data['reserved_room_type'] > data['assigned_room_type']
    data['got_lower_type_room'] = data['reserved_room_type'] < data['assigned_room_type']
    data['reserved_room_type'] = data['reserved_room_type'].astype('category')
    data['assigned_room_type'] = data['assigned_room_type'].astype('category')

    # If children is NaN treat it as 0
    data['children'] = data['children'].fillna(0)

    # Encode categorical data with ints (values outside the category list get code -1)
    for column in data.select_dtypes(['category']).columns:
        data[column] = data[column].cat.codes
    return data

In [0]:
def split_to_test_and_train(data, p=0.1):
    """Randomly partition the rows of ``data`` into a train part and a test part.

    Each row is independently assigned to the test part with probability ``p``
    (and to the train part otherwise), using NumPy's global random state.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to split row-wise.
    p : float, default 0.1
        Expected fraction of rows that end up in the test part; must be in [0, 1].

    Returns
    -------
    tuple
        ``(train, test)`` — train part first, despite the function's name.

    Raises
    ------
    ValueError
        If ``p`` is outside [0, 1].
    """
    if not 0 <= p <= 1:
        raise ValueError("p must be between 0 and 1, got %r" % (p,))
    # A row goes to the train part when its uniform draw falls below 1 - p.
    train_mask = np.random.rand(len(data)) < 1 - p
    return data[train_mask], data[~train_mask]

def split_to_x_y(data, y_column="is_repeated_guest"):
    """Separate a table into its feature columns and its target column.

    Returns a pair ``(x, y)``: ``x`` holds every column whose name differs from
    ``y_column``, ``y`` is the ``y_column`` column itself.
    """
    is_feature = data.columns != y_column
    features = data.loc[:, is_feature]
    target = data[y_column]
    return features, target

def get_data(data):
    """Split ``data`` into two parts and break each into (features, target).

    The random split happens immediately; the returned generator then yields
    the result of ``split_to_x_y`` for the first part and for the second part,
    in the order ``split_to_test_and_train`` returns them.
    """
    first_part, second_part = split_to_test_and_train(data)
    return (split_to_x_y(part) for part in (first_part, second_part))

In [0]:
# Fix NumPy's global seed so the random train/test split is identical on every
# "Restart & Run All"; without it the reported accuracies change between runs.
np.random.seed(0)
data = process_data(pd.read_csv("hotel_bookings.csv"))
# get_data yields the train part first, then the test part, each as (features, target).
(train_x, train_y), (test_x, test_y) = get_data(data)

In [0]:
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
# Candidate classifiers, keyed by the name used when reporting results.
# The "spw" variants differ only in XGBoost's scale_pos_weight setting.
models = {
    "XGBClassifier": XGBClassifier(),
    "XGBClassifier_spw16": XGBClassifier(scale_pos_weight=16.),
    "XGBClassifier_spw30": XGBClassifier(scale_pos_weight=30.),
    "RandomForestClassifier": RandomForestClassifier(),
}

## Train models

In [0]:
# Fit every candidate model on the same training features and target.
for model_name in models:
    model = models[model_name]
    model.fit(train_x, train_y)

## Evaluate models


In [0]:
# Summary statistics of the target column in the test part, as a baseline
# for reading the accuracy figures reported further down.
test_y.to_frame().describe()

In [0]:
from sklearn.metrics import accuracy_score
# Report, for every model: accuracy on the train part, accuracy on the test
# part, and the mean true label among the misclassified test rows.
for model_name, model in models.items():
    # Predict once per split and reuse the predictions below.
    train_predictions = model.predict(train_x)
    test_predictions = model.predict(test_x)
    # Train accuracy must be measured against the train part, not the test part.
    train_accuracy = accuracy_score(train_y, train_predictions)
    test_accuracy = accuracy_score(test_y, test_predictions)
    # Mean of the true target over test rows the model got wrong
    # (NaN when there are no misclassified rows).
    test_wrong_answer_distribution = test_y[test_predictions != test_y.values].mean()
    print("model:", model_name, "\n",
          "\ttrain acc:", train_accuracy, "\n"
          "\ttest acc:", test_accuracy, "\n"
          "\tmean value of the wrong answer for prediction on the test part:", test_wrong_answer_distribution)