In [128]:
import pandas as pd
import time
import datetime
import collections
import numpy as np
import random
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score
from sklearn.preprocessing import LabelEncoder

In [None]:
# Read in Data
df = pd.read_csv("training_set_VU_DM.csv")
df_test = pd.read_csv("test_set_VU_DM.csv")

In [None]:
# Make a descriptive dataframe
df_descr = pd.concat([df.nunique(), df.isna().sum(), df.notna().sum(), df.dtypes, df.max(), df.min()], axis=1)
df_descr.columns = ['Unique values', 'NaN count', 'non NaN count', 'Datatype', 'Max value', 'Min value']
df_descr

## Plot Graphs

In [None]:
descr2 = df_descr[df_descr['Unique values'] <= 10].index.tolist() # (almost) all are categorical variables
descr3 = df_descr[df_descr['Unique values'] > 50].index.tolist() # (almost) all are numerical variables


df[descr2].hist(figsize=(16, 20), bins=10, xlabelsize=8, ylabelsize=8)
descr3.remove('date_time')
descr3.remove('srch_id')

## Plot Boxplots

In [None]:
df_numerical = df[descr3]

red_circle = dict(markerfacecolor = 'red', marker = 'o', markeredgecolor = 'white')

fig, axs = plt.subplots(11, 2, figsize=(15,25))

for i, ax in enumerate(axs.flat):
    ax.boxplot(df_numerical.iloc[:,i].dropna(), flierprops = red_circle, vert=False)
    ax.set_title(df_numerical.columns[i], fontweight = 'bold')
    ax.tick_params(axis = 'y', labelsize = 14)

plt.show()

## Reduce all comp rate columns to 1 column

In [None]:
conditions = [(df['comp1_rate'].eq(-1) & df['comp1_inv'].ne(1)), 
              (df['comp2_rate'].eq(-1) & df['comp2_inv'].ne(1)), 
              (df['comp3_rate'].eq(-1) & df['comp3_inv'].ne(1)),  
              (df['comp4_rate'].eq(-1) & df['comp4_inv'].ne(1)), 
              (df['comp5_rate'].eq(-1) & df['comp5_inv'].ne(1)), 
              (df['comp6_rate'].eq(-1) & df['comp6_inv'].ne(1)), 
              (df['comp7_rate'].eq(-1) & df['comp7_inv'].ne(1)),  
              (df['comp8_rate'].eq(-1) & df['comp8_inv'].ne(1))]

choices = [1,1,1,1,1,1,1,1]
print(conditions)

df['comp_cheaper'] = np.select(conditions, choices, default=0)

# and for test set
conditions2 = [(df_test['comp1_rate'].eq(-1) & df_test['comp1_inv'].ne(1)), 
              (df_test['comp2_rate'].eq(-1) & df_test['comp2_inv'].ne(1)), 
              (df_test['comp3_rate'].eq(-1) & df_test['comp3_inv'].ne(1)),  
              (df_test['comp4_rate'].eq(-1) & df_test['comp4_inv'].ne(1)), 
              (df_test['comp5_rate'].eq(-1) & df_test['comp5_inv'].ne(1)), 
              (df_test['comp6_rate'].eq(-1) & df_test['comp6_inv'].ne(1)), 
              (df_test['comp7_rate'].eq(-1) & df_test['comp7_inv'].ne(1)),  
              (df_test['comp8_rate'].eq(-1) & df_test['comp8_inv'].ne(1))]

choices2 = [1,1,1,1,1,1,1,1]

df_test['comp_cheaper'] = np.select(conditions2, choices2, default=0)
df_test['comp_cheaper'].hist()

## Impute columns

In [None]:
for c in df.columns:
    if c == 'prop_review_score':
        df[c] = df[c].fillna(df[c].mode()[0])
    elif c == 'prop_location_score2':
        df.loc[df.loc[:,c].isnull(),c]=df.loc[:,c].mean()
    elif c == 'orig_destination_distance':
        df.loc[df.loc[:,c].isnull(),c]=df.loc[:,c].mean()
    elif c == 'gross_bookings_usd':
        df.loc[df.loc[:,c].isnull(),c]=df.loc[:,c].median()
    elif c == 'srch_query_affinity_score':
        df.loc[df.loc[:,c].isnull(),c]=df.loc[:,c].median()

for c in df_test.columns:
    if c == 'prop_review_score':
        df_test[c] = df_test[c].fillna(df_test[c].mode()[0])
    elif c == 'prop_location_score2':
        df_test.loc[df_test.loc[:,c].isnull(),c]=df_test.loc[:,c].mean()
    elif c == 'orig_destination_distance':
        df_test.loc[df_test.loc[:,c].isnull(),c]=df_test.loc[:,c].mean()
    elif c == 'gross_bookings_usd':
        df_test.loc[df_test.loc[:,c].isnull(),c]=df_test.loc[:,c].median()
    elif c == 'srch_query_affinity_score':
        df_test.loc[df_test.loc[:,c].isnull(),c]=df_test.loc[:,c].median()
        
        

print("count of NULL values after imputation\n")
print(df['prop_review_score'].isnull().sum())
print(df['prop_location_score2'].isnull().sum())   
print(df['gross_bookings_usd'].isnull().sum())

## Make booleans for starrating and adr_usd (this runtime can still be optimized  - see https://stackoverflow.com/questions/30912403/appending-boolean-column-in-panda-dataframe )

In [None]:
previously_purchased_hotels_train = []
for i in df['visitor_hist_starrating'].values:
    if i >= 0:
        previously_purchased_hotels_train.append(1)
    else:
        previously_purchased_hotels_train.append(0)
        
previously_purchased_hotels_test = []

for i in df_test['visitor_hist_starrating'].values:
    if i >= 0:
        previously_purchased_hotels_test.append(1)
    else:
        previously_purchased_hotels_test.append(0)


previously_purchased_hotels_train2 = []
for i in df['visitor_hist_adr_usd'].values:
    if i >= 0:
        previously_purchased_hotels_train2.append(1)
    else:
        previously_purchased_hotels_train2.append(0)
        
previously_purchased_hotels_test2 = []

for i in df_test['visitor_hist_adr_usd'].values:
    if i >= 0:
        previously_purchased_hotels_test2.append(1)
    else:
        previously_purchased_hotels_test2.append(0)
        
        
df.drop('visitor_hist_starrating', inplace = True, axis =1)
df.drop('visitor_hist_adr_usd', inplace = True, axis = 1)
df_test.drop('visitor_hist_starrating', inplace = True, axis =1)
df_test.drop('visitor_hist_adr_usd', inplace = True, axis = 1)
df.insert(6, 'visitor_hist_bool', previously_purchased_hotels_train)
df_test.insert(6, 'visitor_hist_bool', previously_purchased_hotels_test)
df.insert(6, 'visitor_usd_bool', previously_purchased_hotels_train2)
df_test.insert(6, 'visitor_usd_bool', previously_purchased_hotels_test2)
df.columns


## Adult ratio added

In [None]:
# For train set
adults = df['srch_adults_count'].values
children = df['srch_children_count'].values

adults = list(adults)
children = list(children)

ratio_adults = [i/ (i+j) for i, j in zip(adults, children)]

df.drop('srch_adults_count', inplace=True, axis=1)
df.drop('srch_children_count', inplace=True, axis=1)
df.insert(22, 'adult_ratio', ratio_adults)

# For test set

adults = df_test['srch_adults_count'].values
children = df_test['srch_children_count'].values

adults = list(adults)
children = list(children)

ratio_adults = [i/ (i+j) for i, j in zip(adults, children)]

df_test.drop('srch_adults_count', inplace=True, axis=1)
df_test.drop('srch_children_count', inplace=True, axis=1)
df_test.insert(22, 'adult_ratio', ratio_adults)
df.columns

In [None]:
# Drop the comp rate columns to save storage, as we created new column for this and 
df = df.drop(['comp1_rate', 'comp1_inv',
       'comp1_rate_percent_diff', 'comp2_rate', 'comp2_inv',
       'comp2_rate_percent_diff', 'comp3_rate', 'comp3_inv',
       'comp3_rate_percent_diff', 'comp4_rate', 'comp4_inv',
       'comp4_rate_percent_diff', 'comp5_rate', 'comp5_inv',
       'comp5_rate_percent_diff', 'comp6_rate', 'comp6_inv',
       'comp6_rate_percent_diff', 'comp7_rate', 'comp7_inv',
       'comp7_rate_percent_diff', 'comp8_rate', 'comp8_inv',
       'comp8_rate_percent_diff'], axis=1)

df_test = df_test.drop(['comp1_rate', 'comp1_inv',
       'comp1_rate_percent_diff', 'comp2_rate', 'comp2_inv',
       'comp2_rate_percent_diff', 'comp3_rate', 'comp3_inv',
       'comp3_rate_percent_diff', 'comp4_rate', 'comp4_inv',
       'comp4_rate_percent_diff', 'comp5_rate', 'comp5_inv',
       'comp5_rate_percent_diff', 'comp6_rate', 'comp6_inv',
       'comp6_rate_percent_diff', 'comp7_rate', 'comp7_inv',
       'comp7_rate_percent_diff', 'comp8_rate', 'comp8_inv',
       'comp8_rate_percent_diff'], axis=1)


# To do 

In [None]:
# 'visitor_location_country_id'
median = df['visitor_location_country_id'].median()
df['visitor_location_country_bool'] = np.select([df['visitor_location_country_id'].ge(median)],[1], default=0)
median = df_test['visitor_location_country_id'].median()
df_test['visitor_location_country_bool'] = np.select([df_test['visitor_location_country_id'].ge(median)],[1], default=0)

# 'prop_country_id'
median = df['prop_country_id'].median()
df['prop_country_bool'] = np.select([df['prop_country_id'].ge(median)],[1], default=0)
median = df_test['prop_country_id'].median()
df_test['prop_country_bool'] = np.select([df_test['prop_country_id'].ge(median)],[1], default=0)

# 'orig_destination_distance'
median = df['orig_destination_distance'].median()
df['orig_destination_distance_bool'] = np.select([df['orig_destination_distance'].ge(median)],[1], default=0)
median = df_test['orig_destination_distance'].median()
df_test['orig_destination_distance_bool'] = np.select([df_test['orig_destination_distance'].ge(median)],[1], default=0)

# srch_length_of_stay
median = df['srch_length_of_stay'].median()
df['srch_length_of_stay_bool'] = np.select([df['srch_length_of_stay'].ge(median)],[1], default=0)
median = df_test['srch_length_of_stay'].median()
df_test['srch_length_of_stay_bool'] = np.select([df_test['srch_length_of_stay'].ge(median)],[1], default=0)

# srch_booking_window
median = df['srch_booking_window'].median()
df['srch_booking_window_bool'] = np.select([df['srch_booking_window'].ge(median)],[1], default=0)
median = df_test['srch_booking_window'].median()
df_test['srch_booking_window_bool'] = np.select([df_test['srch_booking_window'].ge(median)],[1], default=0)

# df['prop_country_popular'].hist()
# len(df[df['srch_length_of_stay_bool']==1])

# 'visitor_location_country_id' + 'prop_country_id'  - splitten in boolean: 200 of hoger / lager dan 200
# 'orig_destination_distance' (386 meadiaan) , 'srch_length_of_stay' (2 mediaan), 'srch_booking_window' (mediaan 17) -  opsplitsen in 2 categorieen 
# 'prop_location_score1' , 'prop_review_score', "prop_log_historical_price", 'srch_query_affinity_score',  normalize between 0 and 1 (min/max)
# 'price_usd' / - log normalize and bin  
# 
# check for one hot encoding  - pd.get_dummies 


In [None]:
# drop prop id / srch_destination_id
df = df.drop(['srch_destination_id'], axis=1)
df_test = df_test.drop(['srch_destination_id'], axis=1)

# drop old columns
df = df.drop(['visitor_location_country_id', 'prop_country_id',
       'orig_destination_distance', 'srch_length_of_stay', 'srch_booking_window'], axis=1)

df_test = df_test.drop(['visitor_location_country_id', 'prop_country_id',
       'orig_destination_distance', 'srch_length_of_stay', 'srch_booking_window'], axis=1)

In [None]:
#   normalize between 0 and 1 (min/max) 'prop_location_score1','prop_review_score',"prop_log_historical_price",'srch_query_affinity_score'
def NormalizeData(data):
    return (data - np.min(data)) / (np.max(data) - np.min(data))
df['prop_location_score1'] = NormalizeData(df['prop_location_score1'])
df['prop_review_score'] = NormalizeData(df['prop_review_score'])
df['prop_log_historical_price'] = NormalizeData(df['prop_log_historical_price'])
df['srch_query_affinity_score'] = NormalizeData(df['srch_query_affinity_score'])


df_test['prop_location_score1'] = NormalizeData(df_test['prop_location_score1'])
df_test['prop_review_score'] = NormalizeData(df_test['prop_review_score'])
df_test['prop_log_historical_price'] = NormalizeData(df_test['prop_log_historical_price'])
df_test['srch_query_affinity_score'] = NormalizeData(df_test['srch_query_affinity_score'])

In [None]:
# made price into categorical data
df['price_usd'] = df['price_usd'].transform(lambda x: np.log(x + 1))
df['price_usd'] = NormalizeData(df['price_usd']) # dit is overbodig
quantile02 = df['price_usd'].quantile(0.2)
quantile04 = df['price_usd'].quantile(0.4)
quantile06 = df['price_usd'].quantile(0.6)
quantile08 = df['price_usd'].quantile(0.8)

conditionsprice = [(df['price_usd'].ge(quantile02) & df['price_usd'].lt(quantile04)), 
              (df['price_usd'].ge(quantile04) & df['price_usd'].lt(quantile06)), 
              (df['price_usd'].ge(quantile06) & df['price_usd'].lt(quantile08)),  
              (df['price_usd'].ge(quantile08)) 
              ]

choicesprice = [1,2,3,4,]

df['price_usd'] = np.select(conditionsprice, choicesprice, default=0)
df['price_usd'].hist()


# made price into categorical data
df_test['price_usd'] = df_test['price_usd'].transform(lambda x: np.log(x + 1))
df_test['price_usd'] = NormalizeData(df_test['price_usd']) # dit is overbodig
quantile02 = df_test['price_usd'].quantile(0.2)
quantile04 = df_test['price_usd'].quantile(0.4)
quantile06 = df_test['price_usd'].quantile(0.6)
quantile08 = df_test['price_usd'].quantile(0.8)

conditionsprice = [(df_test['price_usd'].ge(quantile02) & df_test['price_usd'].lt(quantile04)), 
              (df_test['price_usd'].ge(quantile04) & df_test['price_usd'].lt(quantile06)), 
              (df_test['price_usd'].ge(quantile06) & df_test['price_usd'].lt(quantile08)),  
              (df_test['price_usd'].ge(quantile08)) 
              ]

choicesprice = [1,2,3,4,]

df_test['price_usd'] = np.select(conditionsprice, choicesprice, default=0)
df_test['price_usd'].hist()

In [None]:
print(df.columns)

df_descr_two = pd.concat([df.nunique(), df.isna().sum(), df.notna().sum(), df.dtypes, df.max(), df.min()], axis=1)
df_descr_two.columns = ['Unique values', 'NaN count', 'non NaN count', 'Datatype', 'Max value', 'Min value']
df_descr_two

#df['srch_booking_window'].hist()
# df_descr_two
# srch_length_of_stay , srch_booking_window

#  ----------------------------------XGBoost------------------------------------------------------------

In [120]:
### Check X_train / X_val if all new created features are incorporated

In [121]:
# Make training and validation split - based on search id

# we exclude the columns that are not in the test set
# Exclude prop _id because to many categories

df1 = df[df['srch_id']<= 200000]
df2 = df[df['srch_id']> 200000]

X_train = df1.loc[:, df1.columns.isin(['site_id', 'visitor_location_country_bool',
       'prop_country_bool', 'visitor_hist_bool', 'prop_starrating',
       'prop_review_score', 'prop_brand_bool', 'prop_location_score1',
       'prop_location_score2', 'prop_log_historical_price', 'visitor_hist_bool', 
        'visitor_usd_bool', 'price_usd', 'promotion_flag',
       'srch_length_of_stay_bool', 'srch_booking_window_bool', 'srch_room_count',
       'srch_saturday_night_bool', 'srch_query_affinity_score', 'adult_ratio',
       'orig_destination_distance_bool', 'random_bool', 'comp_cheaper'])]
y_train = df1.loc[:, df1.columns.isin(['booking_bool'])]

X_val = df2.loc[:, df2.columns.isin(['site_id', 'visitor_location_country_bool',
       'prop_country_bool', 'visitor_hist_bool', 'prop_starrating',
       'prop_review_score', 'prop_brand_bool', 'prop_location_score1',
       'prop_location_score2', 'prop_log_historical_price', 'visitor_hist_bool', 
        'visitor_usd_bool', 'price_usd', 'promotion_flag',
       'srch_length_of_stay_bool', 'srch_booking_window_bool', 'srch_room_count',
       'srch_saturday_night_bool', 'srch_query_affinity_score', 'adult_ratio',
       'orig_destination_distance_bool', 'random_bool', 'comp_cheaper'])]
y_val = df2.loc[:, df2.columns.isin(['booking_bool'])]

print(X_train.shape, X_val.shape)
# Set some features to type category to help xgboost
X_train[['site_id','visitor_location_country_bool','prop_review_score',
       'prop_country_bool', 'prop_starrating', 'prop_brand_bool', 'promotion_flag',
         'visitor_hist_bool', 'visitor_usd_bool','srch_length_of_stay_bool', 'srch_booking_window_bool', 'srch_room_count', 
         'srch_saturday_night_bool', 'random_bool']].astype("category")
X_val[['site_id','visitor_location_country_bool','prop_review_score',
       'prop_country_bool', 'prop_starrating', 'prop_brand_bool', 'promotion_flag',
         'visitor_hist_bool', 'visitor_usd_bool','srch_length_of_stay_bool', 'srch_booking_window_bool', 'srch_room_count', 
         'srch_saturday_night_bool', 'random_bool']].astype("category")
print('done')

(2978462, 22) (1979885, 22)
done


In [122]:
# Fit and train the model
clf2 = xgb.XGBClassifier(tree_method="approx", enable_categorical=True, use_label_encoder=False)
clf2.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=True,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)

In [123]:
# Score the training data
print(clf2.feature_names_in_)
print(clf2.feature_importances_)
score = clf2.score(X_train, y_train, sample_weight=None)
print('Accuracy Training: ', score)
XGB_prob = clf2.predict_proba(X_train)
print("Booked or not:", clf2.classes_)
print("Probabilities of being booked:", XGB_prob[:,1])
df_train_result = df1.loc[:, df1.columns.isin(['srch_id', 'prop_id', 'booking_bool', 'position'])]
df_train_result['booking_pred_prob'] = XGB_prob[:,1]
df_sorted_train = df_train_result.sort_values(["srch_id", "booking_pred_prob"], ascending=[True, False])
df_sorted_train[['srch_id', 'prop_id', "booking_pred_prob", 'booking_bool', 'position']]
df_sorted_train['position_rank'] = df_sorted_train.groupby('srch_id')['booking_pred_prob'].rank(ascending=False)
df_sorted_train

['site_id' 'visitor_usd_bool' 'visitor_hist_bool' 'prop_starrating'
 'prop_review_score' 'prop_brand_bool' 'prop_location_score1'
 'prop_location_score2' 'prop_log_historical_price' 'price_usd'
 'promotion_flag' 'srch_room_count' 'srch_saturday_night_bool'
 'adult_ratio' 'srch_query_affinity_score' 'random_bool' 'comp_cheaper'
 'visitor_location_country_bool' 'prop_country_bool'
 'orig_destination_distance_bool' 'srch_length_of_stay_bool'
 'srch_booking_window_bool']
[0.00730816 0.00693717 0.01057633 0.02633808 0.01556457 0.01185575
 0.03041524 0.04265395 0.00923215 0.01525085 0.04361676 0.01549068
 0.0056587  0.01465357 0.00670408 0.669785   0.01375756 0.00799165
 0.0132435  0.01099659 0.01061512 0.01135443]
Accuracy Training:  0.9722114970746647
Booked or not: [0 1]
Probabilities of being booked: [0.00818393 0.00527882 0.00330212 ... 0.02544005 0.00526438 0.02885399]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train_result['booking_pred_prob'] = XGB_prob[:,1]


Unnamed: 0,srch_id,prop_id,position,booking_bool,booking_pred_prob,position_rank
18,1,88218,8,0,0.019173,1.0
8,1,53341,3,0,0.018630,2.0
21,1,95307,1,0,0.015923,3.0
16,1,88096,23,0,0.014105,4.0
4,1,29604,4,0,0.014024,5.0
...,...,...,...,...,...,...
2978449,200000,94042,29,0,0.008301,26.0
2978453,200000,99540,3,0,0.007853,27.0
2978452,200000,98034,9,0,0.007295,28.0
2978460,200000,132353,32,0,0.005264,29.0


In [124]:
# Calculate NDCG
grouped = df_sorted_train.groupby('srch_id')

def ndcg(group):
    y_pred = group["position_rank"].values # predicted rank
    y_true = group["position"].values # true rank
    y_pred = [y_pred]
    y_true = [y_true]
    return ndcg_score(y_true, y_pred)

ndcg_list = grouped.apply(ndcg)
print('Average NDCG for all users in training set is: ', ndcg_list.mean())

Average NDCG for all users in training set is:  0.8914695745776796


In [125]:
# Score the validation data
score = clf2.score(X_val, y_val, sample_weight=None)
print('Accuracy Validation: ', score)
XGB_prob = clf2.predict_proba(X_val)
print("Booked or not:", clf2.classes_)
print("Probabilities of being booked:", XGB_prob[:,1])
df_val_result = df2.loc[:, df2.columns.isin(['srch_id', 'prop_id', 'booking_bool', 'position'])]
df_val_result['booking_pred_prob'] = XGB_prob[:,1]
df_sorted_val = df_val_result.sort_values(["srch_id", "booking_pred_prob"], ascending=[True, False])
df_sorted_val[['srch_id', 'prop_id', "booking_pred_prob",'booking_bool', 'position']]
df_sorted_val['position_rank'] = df_sorted_val.groupby('srch_id')['booking_pred_prob'].rank(ascending=False)
df_sorted_val.tail(7)

Accuracy Validation:  0.9720160514373309
Booked or not: [0 1]
Probabilities of being booked: [0.002019   0.01370149 0.00592883 ... 0.02400131 0.03828261 0.0195649 ]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_val_result['booking_pred_prob'] = XGB_prob[:,1]


Unnamed: 0,srch_id,prop_id,position,booking_bool,booking_pred_prob,position_rank
4958322,332784,48216,22,0,0.011217,28.0
4958343,332785,88083,3,0,0.091153,1.0
4958345,332785,128360,1,1,0.038283,2.0
4958342,332785,77700,2,0,0.033422,3.0
4958344,332785,94508,4,0,0.024001,4.0
4958346,332785,134949,6,0,0.019565,5.0
4958341,332785,55110,7,0,0.009393,6.0


In [126]:
# Calculate NDCG
grouped = df_sorted_val.groupby('srch_id')

def ndcg(group):
    y_pred = group["position_rank"].values # predicted rank
    y_true = group["position"].values # true rank
    y_pred = [y_pred]
    y_true = [y_true]
    return ndcg_score(y_true, y_pred, k=5)

ndcg_list = grouped.apply(ndcg)
print('Average NDCG for all users in val set is: ', ndcg_list.mean())

Average NDCG for all users in val set is:  0.7277754446524305


# Run test csv for kaggle submission

In [127]:
## Now for test csv
X_test = df_test.loc[:, df_test.columns.isin(['site_id', 'visitor_location_country_bool',
       'prop_country_bool', 'visitor_hist_bool', 'prop_starrating',
       'prop_review_score', 'prop_brand_bool', 'prop_location_score1',
       'prop_location_score2', 'prop_log_historical_price', 'visitor_hist_bool', 
        'visitor_usd_bool', 'price_usd', 'promotion_flag',
       'srch_length_of_stay_bool', 'srch_booking_window_bool', 'srch_room_count',
       'srch_saturday_night_bool', 'srch_query_affinity_score', 'adult_ratio',
       'orig_destination_distance_bool', 'random_bool', 'comp_cheaper'])]

X_test[['site_id','visitor_location_country_bool','prop_review_score',
       'prop_country_bool', 'prop_starrating', 'prop_brand_bool', 'promotion_flag',
         'visitor_hist_bool', 'visitor_usd_bool','srch_length_of_stay_bool', 'srch_booking_window_bool', 'srch_room_count', 
         'srch_saturday_night_bool', 'random_bool']].astype("category")


XGB_prob_t = clf2.predict_proba(X_test)
#print(NB_prob_t[:,1]) # Probabilities of being booked

df_test_result = df_test[['srch_id', 'prop_id']]
df_test_result['booking_pred_prob'] = XGB_prob_t[:,1]
df_sorted = df_test_result.sort_values(["srch_id", "booking_pred_prob"], ascending=[True, False])
final_frame = df_sorted[['srch_id', 'prop_id']]
#final_frame
# header "SearchId ,PropertyId" 
final_frame.to_csv('xgb_group93_model2.csv', index=False)
final_frame

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test_result['booking_pred_prob'] = XGB_prob_t[:,1]


Unnamed: 0,srch_id,prop_id
23,1,99484
8,1,50162
9,1,54937
25,1,128085
12,1,61934
...,...,...
4959176,332787,22854
4959178,332787,32019
4959182,332787,99509
4959181,332787,94437


In [None]:
# To do : tuning -> https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/

In [None]:
# Random guessing for benchmarking
# Lasso ?
