# Data Preprocessing

In [100]:
import pandas as pd
bookings_df = pd.read_csv('processed_booking.csv')

bookings_df.rename(columns={'Unnamed: 0': 'index'}, inplace=True)
bookings_df.drop(columns=['neighborhood', 'bed_type', 'room_type', 'review_title'], inplace=True)


non_numeric_columns = bookings_df.select_dtypes(exclude=['float64', 'int64', 'int']).columns
non_numeric_columns



Index(['name', 'snapshot_date'], dtype='object')

The data is splitted into pages and comes sorted by default. To gather information about how bookings are sorted in a single page, the data's primary key is \<snapshot_date, time_to_travel, length_of_stay\>. The index of the entries within the is given by the column `Unnamed: 0` (wrongfuly but will be fixed in the dataframe it self)

In [101]:
primary_key = ['snapshot_date', 'length_of_stay', 'time_to_travel']
pages = []
for key, group in bookings_df.groupby(primary_key):
    # Drop the original index column and reset the DataFrame index
    group = group.drop(columns=['index']).reset_index(drop=True)
    # Add a new column 'hotel_index' based on the new index
    group['hotel_index'] = group.index
    pages.append((key, group))
    
print(pages[2][0])   # This prints the group key (the page identifier)
pages[2][1]

('2025-02-02', np.int64(1), np.int64(3))


Unnamed: 0,name,original_price,discounted_price,discount,review_score,number_of_reviews,star_rating,is_apartment,kms_from_centre,location_score,...,neighborhood_Murray Hill,neighborhood_New York,neighborhood_NoMad,neighborhood_SoHo,neighborhood_Tribeca,neighborhood_Union Square,neighborhood_Upper East Side,neighborhood_Upper West Side,neighborhood_Washington Heights,hotel_index
0,Iroquois New York Times Square,447,447,0,8.5,2625.0,5.0,0,1.4,9.7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,Riverside Tower Hotel,129,129,0,8.5,2271.0,2.0,0,1.9,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1
2,UNTITLED at 3 Freeman Alley,234,234,0,8.4,565.0,4.0,0,5.2,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
3,Aura Hotel Times Square Newly Renovated,187,166,0,7.6,447.0,4.0,0,1.3,9.3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
4,"Holiday Inn Express Manhattan Midtown West, an...",95,95,0,7.3,1749.0,3.0,0,1.1,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92,The Marlton Hotel,422,380,0,8.4,1172.0,4.0,0,4.1,9.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,92
93,Park Central,232,232,0,7.3,7464.0,4.0,0,1.0,9.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,93
94,Sixty LES,322,322,0,7.7,848.0,4.0,0,5.2,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,94
95,Pod 51,118,118,0,7.9,11275.0,3.0,0,1.7,9.3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,95


In [102]:
# pages that have a TTT that is even will be used for training, and even ones will be used for testing
train_pages = [page for page in pages if page[0][2] % 2 == 0]
test_pages = [page for page in pages if page[0][2] % 2 != 0]

print(len(train_pages), len(test_pages))

225 225


## Generate Hotel Pairs

This function creates all unique hotel pairs using every available combination of indices. It leverages `itertools.combinations` to generate index pairs (i, j) with i < j. Before forming each pair, the order of the indices is randomly shuffled so that the model does not always treat the first hotel as the primary entry. This randomization helps prevent any bias related to parameter order.

In [103]:
import pandas as pd
import itertools
import random

def get_hotel_pairs(page: pd.DataFrame) -> pd.DataFrame:
    pairs_list = []
    n = len(page)
    for i, j in itertools.combinations(range(n), 2):
        if random.choice([True, False]):
            idx_order = [i, j]
        else:
            idx_order = [j, i]
        
        row1 = page.iloc[idx_order[0]]
        row2 = page.iloc[idx_order[1]]
        
        pair_dict = {}
        for col in page.columns:
            pair_dict[f'hotel1_{col}'] = row1[col]
            pair_dict[f'hotel2_{col}'] = row2[col]
        
        pairs_list.append(pair_dict)
    
    # Convert the list of dictionaries to a DataFrame.
    pairs_df = pd.DataFrame(pairs_list)
    return pairs_df

### Split to train and test pages
pages that have a TTT that is even will be used for training, and even ones will be used for testing

In [104]:
get_hotel_pairs(pages[2][1])

Unnamed: 0,hotel1_name,hotel2_name,hotel1_original_price,hotel2_original_price,hotel1_discounted_price,hotel2_discounted_price,hotel1_discount,hotel2_discount,hotel1_review_score,hotel2_review_score,...,hotel1_neighborhood_Union Square,hotel2_neighborhood_Union Square,hotel1_neighborhood_Upper East Side,hotel2_neighborhood_Upper East Side,hotel1_neighborhood_Upper West Side,hotel2_neighborhood_Upper West Side,hotel1_neighborhood_Washington Heights,hotel2_neighborhood_Washington Heights,hotel1_hotel_index,hotel2_hotel_index
0,Riverside Tower Hotel,Iroquois New York Times Square,129,447,129,447,0,0,8.5,8.5,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1,0
1,UNTITLED at 3 Freeman Alley,Iroquois New York Times Square,234,447,234,447,0,0,8.4,8.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,0
2,Aura Hotel Times Square Newly Renovated,Iroquois New York Times Square,187,447,166,447,0,0,7.6,8.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3,0
3,Iroquois New York Times Square,"Holiday Inn Express Manhattan Midtown West, an...",447,95,447,95,0,0,8.5,7.3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,4
4,Iroquois New York Times Square,Royalton New York,447,199,447,199,0,0,8.5,7.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4651,Pod 51,Park Central,118,232,118,232,0,0,7.9,7.3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,95,93
4652,Park Central,Life Hotel New York,232,185,232,185,0,0,7.3,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,93,96
4653,Sixty LES,Pod 51,322,118,322,118,0,0,7.7,7.9,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,94,95
4654,Sixty LES,Life Hotel New York,322,185,322,185,0,0,7.7,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,94,96


In [105]:
# Create the training and testing datasets
# train_pairs = pd.concat([get_hotel_pairs(page) for key, page in train_pages])
# test_pairs = pd.concat([get_hotel_pairs(page) for key, page in test_pages])
# do  the above in parallel
from joblib import Parallel, delayed
train_pairs_list = Parallel(n_jobs=-1)(delayed(get_hotel_pairs)(page) for key, page in train_pages)
train_pairs = pd.concat(train_pairs_list)

In [106]:
#find non numeric columns
non_numeric_columns = train_pairs.select_dtypes(exclude=['float64', 'int64', 'int']).columns
non_numeric_columns

Index(['hotel1_name', 'hotel2_name', 'hotel1_snapshot_date',
       'hotel2_snapshot_date'],
      dtype='object')

In [107]:
# train a descision tree model
from sklearn.tree import DecisionTreeClassifier


X_train = train_pairs.drop(columns=[
    'hotel1_hotel_index',
    'hotel2_hotel_index',
    'hotel1_snapshot_date',
    'hotel2_snapshot_date',
    'hotel1_name',
    'hotel2_name',
    'hotel1_length_of_stay',
    'hotel2_length_of_stay',
    'hotel1_time_to_travel',
    'hotel2_time_to_travel']) 
y_train = (train_pairs['hotel1_hotel_index'] > train_pairs['hotel2_hotel_index']).apply(lambda x: 1 if x else -1)

decisionTreeRegressor = DecisionTreeClassifier()
decisionTreeRegressor.fit(X_train, y_train)



# Data Science Model
The Data Science model we chose is a Decision Tree classifier. This is the best model for the job because it is given all the params from both hotels and it can classify by utilizing them which hotel should appear first on the page. 
The decision tree is trained on a search grid to be able to tune hyper parameters and get the configuration best fit for the job.

In [108]:
# creata a descision tree model with grid search
import os
from sklearn.model_selection import GridSearchCV

param_grid = {
    'max_depth': [5, 10, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [2, 5],
}
num_of_cores = os.cpu_count()

grid_search_regressor = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5, n_jobs=num_of_cores // 4)
grid_search_regressor.fit(X_train, y_train)

grid_search_regressor.best_params_

{'max_depth': None, 'min_samples_leaf': 5, 'min_samples_split': 2}

In [109]:
# shuffle a page to see the score:
# shuffle the rows of the page
def shuffle_page(page):
    shuffled_page = page.copy()
    shuffled_page = shuffled_page.sample(frac=1, random_state=42).reset_index(drop=True)
    
    return shuffled_page

In [110]:
test_pages[0][1]

Unnamed: 0,name,original_price,discounted_price,discount,review_score,number_of_reviews,star_rating,is_apartment,kms_from_centre,location_score,...,neighborhood_Murray Hill,neighborhood_New York,neighborhood_NoMad,neighborhood_SoHo,neighborhood_Tribeca,neighborhood_Union Square,neighborhood_Upper East Side,neighborhood_Upper West Side,neighborhood_Washington Heights,hotel_index
0,Iroquois New York Times Square,292,218,0,8.5,2625.0,5.0,0,1.0,9.7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,Four Points by Sheraton New York Downtown,130,111,0,7.8,1987.0,3.0,0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,"The Mayfair Hotel Times Square, Ascend Hotel C...",141,141,0,8.6,1895.0,3.0,0,1.0,9.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
3,The Washington by LuxUrban,160,160,0,7.7,9442.0,4.0,0,1.0,9.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
4,Hampton Inn by Hilton New York Times Square,152,152,0,7.8,2432.0,3.0,0,1.0,9.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,Fairfield Inn by Marriott New York Manhattan/F...,152,152,0,7.8,3622.0,3.0,0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,94
95,Arthouse Hotel,231,231,0,7.9,2269.0,4.0,0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,95
96,MOXY NYC Times Square,192,192,0,8.1,5954.0,4.0,0,1.0,9.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,96
97,Hilton Garden Inn New York/Tribeca,186,186,0,7.5,661.0,3.0,0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,97


In [111]:
shuffle_page(test_pages[0][1])

Unnamed: 0,name,original_price,discounted_price,discount,review_score,number_of_reviews,star_rating,is_apartment,kms_from_centre,location_score,...,neighborhood_Murray Hill,neighborhood_New York,neighborhood_NoMad,neighborhood_SoHo,neighborhood_Tribeca,neighborhood_Union Square,neighborhood_Upper East Side,neighborhood_Upper West Side,neighborhood_Washington Heights,hotel_index
0,Sixty LES,263,263,0,7.7,848.0,4.0,0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,62
1,Hotel Belleclaire Central Park,238,177,0,8.1,5084.0,4.0,0,1.0,9.3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,40
2,Arthouse Hotel,231,231,0,7.9,2269.0,4.0,0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,95
3,Pod Times Square,129,129,0,8.2,23072.0,3.0,0,1.0,9.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18
4,Hilton Garden Inn New York/Tribeca,186,186,0,7.5,661.0,3.0,0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,97
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,Park Central,186,186,0,7.3,7464.0,4.0,0,1.0,9.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60
95,Hotel AKA NoMad,325,325,0,8.3,663.0,4.0,0,1.0,9.3,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,71
96,citizenM New York Bowery,209,209,0,8.7,3312.0,4.0,0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14
97,"voco Times Square South New York, an IHG Hotel",197,197,0,8.1,3803.0,4.0,0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,92


In [112]:
def drop_relevant_columns(hotel : pd.DataFrame) -> pd.DataFrame:
    
    return hotel.drop(['name', 'hotel_index', 'snapshot_date', 'length_of_stay', 'time_to_travel'], errors='ignore')


In [123]:

def compare_hotels(model, columns):
    def compare(hotel1, hotel2):

        
        hotel1 = drop_relevant_columns(hotel1)
        hotel2 = drop_relevant_columns(hotel2)
        
        # Print the Series after dropping relevant columns
        
        pair_dict = {}
        for col in hotel1.index:  # Access the index of a Series
            pair_dict[f'hotel1_{col}'] = hotel1[col]
            pair_dict[f'hotel2_{col}'] = hotel2[col]
        
        pair_df = pd.DataFrame([pair_dict])[columns]
        
        # Print final columns of pair_df
        return model.predict(pair_df)[0]
    
    return compare


In [126]:
import pandas as pd
import functools

# Suppose hotels_list is defined as:
hotels_list = [hotel for _, hotel in shuffle_page(test_pages[0][1]).iterrows()]
print(compare_hotels(decisionTreeRegressor, X_train.columns.to_list())(hotels_list[1], hotels_list[0]))

# Sort using the inverted comparator
sorted_hotels = sorted(hotels_list, key=functools.cmp_to_key(compare_hotels(decisionTreeRegressor, X_train.columns.to_list())))

# Convert the sorted list of Series back into a DataFrame (if needed)
sorted_page = pd.DataFrame(sorted_hotels).reset_index(drop=True)

# Testing the order: print hotel names if order differs
out_of_order = 0
for new_index, sorted_hotel in enumerate(sorted_hotels):
    if new_index != sorted_hotel['hotel_index']:
        print( abs(new_index - sorted_hotel['hotel_index']))
        out_of_order += 1
    else:
        print(sorted_hotel['name'])
        
print(len (hotels_list), out_of_order)
sorted_page['hotel_index']

-1
Iroquois New York Times Square
2
4
51
20
34
55
86
6
5
5
1
1
4
2
10
The Belvedere Hotel
1
12
32
30
4
16
15
24
24
30
5
37
4
2
3
35
58
36
20
DoubleTree by Hilton New York Downtown
46
5
1
17
48
21
24
31
25
46
13
34
33
4
6
26
16
19
LUMA Hotel - Times Square
13
26
23
38
16
11
33
16
16
12
59
1
24
35
6
9
27
21
25
20
50
21
4
7
17
22
12
42
6
11
59
8
11
23
3
20
7
51
6
20
12
24
40
99 95


0      0
1      3
2      6
3     54
4     24
      ..
94    88
95    75
96    84
97    73
98    58
Name: hotel_index, Length: 99, dtype: int64

In [127]:
# the error is calculated by taking the difference between the index of the hotel and the index of the hotel in the sorted list, squaring it and taking the mean of all the errors
def test_page_error(model, columns, page):
    shuffeled_page = shuffle_page(page)
    hotels_list = [hotel for _, hotel in shuffeled_page.iterrows()]

    # Sort using the inverted comparator
    sorted_hotels = sorted(hotels_list, key=functools.cmp_to_key(compare_hotels(model, columns)))

    # Testing the order: print hotel names if order differs
    mean_squared_error = 0
    errors = []
    for new_index, sorted_hotel in enumerate(sorted_hotels):
        squared_error = (new_index - sorted_hotel['hotel_index']) ** 2
        errors.append(squared_error)
    
    mean_squared_error = sum(errors) / len(errors)
    std_squared_error = (sum([(error - mean_squared_error) ** 2 for error in errors]) / len(errors)) ** 0.5
    
    # find the standard deviation of the squared errors

    return mean_squared_error, std_squared_error

In [128]:
errors  = [test_page_error(grid_search_regressor, X_train.columns.to_list(), page) for _, page in test_pages] 
errors = pd.DataFrame(errors, columns=['mean_squared_error', 'std_squared_error'])
errors.describe()


train_pairs[X_train.columns.to_list()]

Unnamed: 0,hotel1_original_price,hotel2_original_price,hotel1_discounted_price,hotel2_discounted_price,hotel1_discount,hotel2_discount,hotel1_review_score,hotel2_review_score,hotel1_number_of_reviews,hotel2_number_of_reviews,...,hotel1_neighborhood_Tribeca,hotel2_neighborhood_Tribeca,hotel1_neighborhood_Union Square,hotel2_neighborhood_Union Square,hotel1_neighborhood_Upper East Side,hotel2_neighborhood_Upper East Side,hotel1_neighborhood_Upper West Side,hotel2_neighborhood_Upper West Side,hotel1_neighborhood_Washington Heights,hotel2_neighborhood_Washington Heights
0,141,353,141,261,0,0,8.6,8.5,1895.0,2625.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,353,125,261,125,0,0,8.5,7.3,2625.0,2515.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,180,353,154,261,0,0,7.8,8.5,1987.0,2625.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,353,113,261,97,0,0,8.5,6.6,2625.0,15317.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,353,163,261,163,0,0,8.5,8.6,2625.0,6374.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4748,1896,999,1521,999,0,0,8.7,8.3,1601.0,5048.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4749,1694,1896,1694,1521,0,0,8.1,8.7,1586.0,1601.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4750,1047,999,1047,999,0,0,8.9,8.3,1548.0,5048.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4751,1047,1694,1047,1694,0,0,8.9,8.1,1548.0,1586.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Optimizing the decision tree

the mean of the mean squared error sits on 686, which means that on average, the distance of the sorted new index given by the model is 26 from its original position. This means that the model is not efficient and there could be improvements.

the following improvements will be implemented: 

1. The model it self right now returns a binary answer - 1 if the first hotel param comes first and -1 if the opposite. While this simplifies the implementation of the compare function, it does not penalize the model when distant hotel records appear close to each other after sorting.  
**Solution**: use decision tree regressor instead.

2. The model seems to perform better on test pages that have higher TTT.  
**Solution**: shuffle the training data

In [129]:
from sklearn.utils import shuffle
from sklearn.tree import DecisionTreeRegressor


X_train = train_pairs.drop(columns=[
    'hotel1_hotel_index',
    'hotel2_hotel_index',
    'hotel1_snapshot_date',
    'hotel2_snapshot_date',
    'hotel1_name',
    'hotel2_name',
    'hotel1_length_of_stay',
    'hotel2_length_of_stay',
    'hotel1_time_to_travel',
    'hotel2_time_to_travel']) 
y_train = (train_pairs['hotel1_hotel_index'] - train_pairs['hotel2_hotel_index'])

X_train, y_train = shuffle(X_train, y_train, random_state=42)

decision_tree_regressor_param_grid = {
    'max_depth': [5, 10, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [2, 5],
    'criterion': ['squared_error', 'absolute_error']
}

grid_search_regressor = GridSearchCV(DecisionTreeRegressor(criterion=''), decision_tree_regressor_param_grid, cv=5, n_jobs=num_of_cores // 4)
grid_search_regressor.fit(X_train, y_train)

60 fits failed out of a total of 120.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\HamamaHome\anaconda3\envs\DS-101-Final\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\HamamaHome\anaconda3\envs\DS-101-Final\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\HamamaHome\anaconda3\envs\DS-101-Final\Lib\site-packages\sklearn\tree\_classes.py", line 1404, in fit
    super()._fit(
  File "c:\Users\HamamaHome\anaconda3\env

In [130]:
errors  = [test_page_error(grid_search_regressor, X_train.columns, page) for _, page in test_pages] 
errors = pd.DataFrame(errors, columns=['mean_squared_error', 'std_squared_error'])
errors.describe()

Unnamed: 0,mean_squared_error,std_squared_error
count,225.0,225.0
mean,689.072419,1038.346768
std,180.106831,238.855845
min,322.909091,462.928908
25%,548.686869,873.444206
50%,673.565217,1042.004287
75%,806.2,1191.102664
max,1299.115789,1645.608108


## Optimizatation trial 2:
As we can see from the table above, the tree regressors are having only slightly better predictions for sorting, but switching to a regressor does not effectivly boosts the accuracy.
Our suspection, is that the tree is flooded with parameters, from the one hot encoded neighborhoods.
In this step, we will test the effect of removing it from the training data

In [131]:

X_train = train_pairs.drop(columns=[
    'hotel1_hotel_index',
    'hotel2_hotel_index',
    'hotel1_snapshot_date',
    'hotel2_snapshot_date',
    'hotel1_name',
    'hotel2_name',
    'hotel1_length_of_stay',
    'hotel2_length_of_stay',
    'hotel1_time_to_travel',
    'hotel2_time_to_travel']) 

# remove every column that starts with prefix 'neighborhood'
X_train = X_train.loc[:, ~X_train.columns.str.contains(r'neighborhood')]
y_train = (train_pairs['hotel1_hotel_index'] - train_pairs['hotel2_hotel_index'])

X_train, y_train = shuffle(X_train, y_train, random_state=42)

decision_tree_regressor_param_grid = {
    'max_depth': [5, 10, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [2, 5],
    'criterion': ['squared_error', 'absolute_error']
}
grid_search_regressor = GridSearchCV(DecisionTreeRegressor(criterion=''), decision_tree_regressor_param_grid, cv=5, n_jobs=num_of_cores // 4)
grid_search_regressor.fit(X_train, y_train)

60 fits failed out of a total of 120.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\HamamaHome\anaconda3\envs\DS-101-Final\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\HamamaHome\anaconda3\envs\DS-101-Final\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\HamamaHome\anaconda3\envs\DS-101-Final\Lib\site-packages\sklearn\tree\_classes.py", line 1404, in fit
    super()._fit(
  File "c:\Users\HamamaHome\anaconda3\env

In [132]:
errors  = [test_page_error(grid_search_regressor, X_train.columns, page) for _, page in test_pages] 
errors = pd.DataFrame(errors, columns=['mean_squared_error', 'std_squared_error'])
errors.describe()

Unnamed: 0,mean_squared_error,std_squared_error
count,225.0,225.0
mean,688.881086,1022.248635
std,186.232073,230.623565
min,322.102041,477.409291
25%,562.969072,875.076911
50%,660.783505,1021.37061
75%,782.86,1162.654266
max,1695.22449,1963.659924


Removing the neighborhood data did not impact the model's performance. This means that the neighborhood does not affect the sorting of the hotels on Booking.com. While not impacting the model's performance, This change will reduce the size of the training data, and will optimize training and testing time

## Optimization 3: Switching to XGBoost

Why use just one tree when you can use a whole forest?! why use regression models if you dont penalize enough on a not close enough to original sorting? 

Enter XGBoost, gradient boosted forest ensembler.

In [135]:
%pip install xgboost

Collecting xgboost
  Downloading xgboost-3.0.0-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.0-py3-none-win_amd64.whl (150.0 MB)
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
    --------------------------------------- 2.6/150.0 MB 11.6 MB/s eta 0:00:13
   --- ------------------------------------ 14.4/150.0 MB 33.6 MB/s eta 0:00:05
   ------- -------------------------------- 27.8/150.0 MB 44.0 MB/s eta 0:00:03
   ---------- ----------------------------- 41.2/150.0 MB 48.5 MB/s eta 0:00:03
   -------------- ------------------------- 54.3/150.0 MB 52.4 MB/s eta 0:00:02
   ----------------- ---------------------- 67.1/150.0 MB 54.2 MB/s eta 0:00:02
   --------------------- ------------------ 81.0/150.0 MB 55.6 MB/s eta 0:00:02
   ------------------------- -------------- 94.6/150.0 MB 56.5 MB/s eta 0:00:01
   ---------------------------- ---------- 108.5/150.0 MB 57.3 MB/s eta

In [137]:
from xgboost import XGBRegressor
xgb_regressor_param_grid = {
    'max_depth': [5, 10],
    'n_estimators': [100, 200],
    'learning_rate': [0.1, 0.3],
    'min_leaf_samples': [5],
    'objective': ['reg:squarederror']
}

xgboost_grid = grid_search_regressor = GridSearchCV(XGBRegressor(), xgb_regressor_param_grid, cv=5, n_jobs=num_of_cores // 4)
xgboost_grid.fit(X_train, y_train)

Parameters: { "min_leaf_samples" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [138]:
errors  = [test_page_error(xgboost_grid, X_train.columns, page) for _, page in test_pages] 
errors = pd.DataFrame(errors, columns=['mean_squared_error', 'std_squared_error'])
errors.describe()

Unnamed: 0,mean_squared_error,std_squared_error
count,225.0,225.0
mean,383.886883,688.170951
std,149.492192,243.981136
min,110.639175,154.386193
25%,283.919192,503.726745
50%,345.541667,662.568973
75%,447.285714,866.619979
max,979.897959,1295.404903
