In [38]:
import pandas as pd
import numpy as np
import random
from catboost import CatBoostRanker, Pool, MetricVisualizer
from copy import deepcopy
# Lift pandas' display truncation so that every column and every row is shown.
# Caution: with these settings, displaying a whole frame prints all of it.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
def get_target(row):
    """
    Binary relevance label for one search-result row.

    Parameters
    ----------
    row : any object exposing a ``booking_bool`` attribute
        (for example one row produced by ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    int
        1 when ``booking_bool`` is greater than zero (the listing was booked),
        otherwise 0. A NaN ``booking_bool`` is not greater than zero and
        therefore also yields 0.

    Notes
    -----
    Clicks are deliberately not part of the label: ``click_bool`` is never
    read, so "clicked but not booked" rows are labelled 0.
    """
    return 1 if row.booking_bool > 0 else 0

def featurize_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of ``df`` with time-based and missing-value features added.

    The input frame is left untouched, so the function can be re-run safely
    on the same raw frame.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a datetime-typed ``date_time`` column.

    Returns
    -------
    pd.DataFrame
        A new frame with these columns appended, in this order:

        * ``weekday``      - day of week, Monday=0
        * ``week_of_year`` - ISO calendar week
        * ``hour``, ``minute``
        * ``time_epoch``   - whole seconds since 1970-01-01 as float, so the
          model can pick up a continuous trend over time
        * ``early_night``  - True when ``hour > 19`` or ``hour < 3``
          (the original author noted no added value from this feature)
        * ``nans_count``   - number of missing cells in the row, counted
          after the columns above have been added
    """
    df = df.copy()
    stamps = df["date_time"]

    df["weekday"] = stamps.dt.weekday
    df["week_of_year"] = stamps.dt.isocalendar().week
    df["hour"] = stamps.dt.hour
    df["minute"] = stamps.dt.minute

    # Subtracting the epoch and floor-dividing by one second is independent of
    # the column's storage resolution (ns/us/ms/s), unlike casting the raw
    # integer representation and dividing by 1e9.
    epoch = pd.Timestamp(0, tz=stamps.dt.tz)
    df["time_epoch"] = ((stamps - epoch) // pd.Timedelta(seconds=1)).astype("float64")

    df["early_night"] = (df["hour"] > 19) | (df["hour"] < 3)

    df["nans_count"] = df.isna().sum(axis=1)

    # No per-property time-series features for now; those would need a check
    # for unary columns and a shift/offset that depends on the forecast horizon.
    return df


In [10]:
# Load the sampled train/test search logs (paths are relative to the notebook).
HotelSearch_train = pd.read_csv("./data/HotelSearch_train_sampled.csv")
HotelSearch_test = pd.read_csv("./data/HotelSearch_test_sampled.csv")

# Parse the timestamp column. `infer_datetime_format` is deprecated since
# pandas 2.0 (format inference is the default there), so it is not passed.
HotelSearch_train["date_time"] = pd.to_datetime(HotelSearch_train["date_time"])
HotelSearch_test["date_time"] = pd.to_datetime(HotelSearch_test["date_time"])

# Derive the binary relevance label (booked / not booked) for every row.
HotelSearch_train["target"] = HotelSearch_train.apply(get_target, axis=1)
HotelSearch_test["target"] = HotelSearch_test.apply(get_target, axis=1)

HotelSearch_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99686 entries, 0 to 99685
Data columns (total 56 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   Unnamed: 0                   99686 non-null  int64         
 1   srch_id                      99686 non-null  int64         
 2   date_time                    99686 non-null  datetime64[ns]
 3   site_id                      99686 non-null  int64         
 4   visitor_location_country_id  99686 non-null  int64         
 5   visitor_hist_starrating      4529 non-null   float64       
 6   visitor_hist_adr_usd         4534 non-null   float64       
 7   prop_country_id              99686 non-null  int64         
 8   prop_id                      99686 non-null  int64         
 9   prop_starrating              99686 non-null  int64         
 10  prop_review_score            99512 non-null  float64       
 11  prop_brand_bool              99686 non-nu

In [11]:
# Columns with a single distinct value in the training data (NaN counts as a
# value) cannot help the model.
drop_unary_cols = [
    col
    for col in HotelSearch_train.columns
    if HotelSearch_train[col].nunique(dropna=False) <= 1
]

# Leaky column and the original target columns; "Unnamed: 0" looks like a
# leftover index column from the CSV export - confirm.
target_cols = ["gross_bookings_usd", "click_bool", "booking_bool", "Unnamed: 0"]

drop_cols = [*drop_unary_cols, *target_cols]

# errors="ignore": a listed column may be absent from one of the frames.
HotelSearch_train = HotelSearch_train.drop(columns=drop_cols, errors="ignore")
HotelSearch_test = HotelSearch_test.drop(columns=drop_cols, errors="ignore")

print(HotelSearch_train.shape)
print(HotelSearch_test.shape)


(99686, 52)
(147022, 52)


In [12]:
# Run the same feature engineering on both splits.
HotelSearch_train, HotelSearch_test = map(
    featurize_df, (HotelSearch_train, HotelSearch_test)
)
HotelSearch_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147022 entries, 0 to 147021
Data columns (total 59 columns):
 #   Column                       Non-Null Count   Dtype         
---  ------                       --------------   -----         
 0   srch_id                      147022 non-null  int64         
 1   date_time                    147022 non-null  datetime64[ns]
 2   site_id                      147022 non-null  int64         
 3   visitor_location_country_id  147022 non-null  int64         
 4   visitor_hist_starrating      7506 non-null    float64       
 5   visitor_hist_adr_usd         7506 non-null    float64       
 6   prop_country_id              147022 non-null  int64         
 7   prop_id                      147022 non-null  int64         
 8   prop_starrating              147022 non-null  int64         
 9   prop_review_score            146813 non-null  float64       
 10  prop_brand_bool              147022 non-null  int64         
 11  prop_location_score1      

In [13]:
# Schema overview of the featurized training frame.
HotelSearch_train.info()

# Per-column means for non-booked (False) vs. booked (True) rows. This is kept
# as the cell's last expression so the notebook actually displays it; as a
# mid-cell statement the result was computed and silently thrown away.
# NOTE(review): the excluded competitor columns are presumably too sparse to
# give meaningful means - confirm.
HotelSearch_train.drop(
    columns=[
        'comp3_rate', 'comp3_inv', 'comp3_rate_percent_diff',
        'comp4_inv',
        'comp5_rate', 'comp5_inv', 'comp5_rate_percent_diff',
        'comp8_rate', 'comp8_inv', 'comp8_rate_percent_diff',
    ]
).groupby(HotelSearch_train["target"] > 0).mean()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99686 entries, 0 to 99685
Data columns (total 59 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   srch_id                      99686 non-null  int64         
 1   date_time                    99686 non-null  datetime64[ns]
 2   site_id                      99686 non-null  int64         
 3   visitor_location_country_id  99686 non-null  int64         
 4   visitor_hist_starrating      4529 non-null   float64       
 5   visitor_hist_adr_usd         4534 non-null   float64       
 6   prop_country_id              99686 non-null  int64         
 7   prop_id                      99686 non-null  int64         
 8   prop_starrating              99686 non-null  int64         
 9   prop_review_score            99512 non-null  float64       
 10  prop_brand_bool              99686 non-null  int64         
 11  prop_location_score1         99686 non-nu

In [14]:
# Split on srch_id so that all rows of one search stay on the same side.
# The cutoff is the 0.94 quantile of the row-level srch_id values, i.e.
# roughly 94% of rows go to training and 6% to evaluation.
cutoff_id = HotelSearch_train["srch_id"].quantile(0.94)
train_mask = HotelSearch_train["srch_id"] < cutoff_id
eval_mask = HotelSearch_train["srch_id"] >= cutoff_id

# The label and the raw timestamp are not model inputs.
non_feature_cols = ["target", "date_time"]
X_train_df = HotelSearch_train.loc[train_mask].drop(columns=non_feature_cols)
X_eval_df = HotelSearch_train.loc[eval_mask].drop(columns=non_feature_cols)
X_test_df = HotelSearch_test.drop(columns=non_feature_cols)

feature_cols = X_train_df.columns.tolist()

y_train_df = HotelSearch_train.loc[train_mask, "target"].astype(float)
y_eval_df = HotelSearch_train.loc[eval_mask, "target"].astype(float)
y_test_df = HotelSearch_test["target"].astype(float)

X_train = X_train_df.values
X_eval = X_eval_df.values
X_test = X_test_df.values

# Group ids for the ranker.
# NOTE(review): assumes rows belonging to one srch_id are contiguous - confirm.
queries_train = X_train_df["srch_id"].values
# Despite its name this holds the srch_id values of the *evaluation* split.
queries_test = X_eval_df["srch_id"].values

# Scale labels by the largest training label. The division builds new arrays
# instead of using in-place `/=` on `.values`: in-place division writes
# through to the backing Series and fails when pandas hands out a read-only
# array (Copy-on-Write). All three label arrays get the same scaling, and a
# training split without any positive label is left unscaled rather than
# divided by zero.
max_relevance = float(np.max(y_train_df.to_numpy()))
label_scale = max_relevance if max_relevance > 0 else 1.0
y_train = y_train_df.to_numpy(dtype=float) / label_scale
y_eval = y_eval_df.to_numpy(dtype=float) / label_scale
y_test = y_test_df.to_numpy(dtype=float) / label_scale

y_train_df.value_counts()


0.0    91066
1.0     2637
Name: target, dtype: int64

In [15]:
# Compare the share of positive labels between the train and eval splits.
print("mean relevancy train",round(y_train.mean(),4))
print("mean relevancy eval",round(y_eval.mean(),4))
print(y_eval_df.value_counts()) # check that both label values (0 and 1) occur in the eval subset

mean relevancy train 0.0281
mean relevancy eval 0.0274
0.0    5819
1.0     164
Name: target, dtype: int64


In [16]:
# Label distribution over the whole training frame (train and eval rows together).
HotelSearch_train['target'].value_counts()

0    96885
1     2801
Name: target, dtype: int64

In [17]:
# Candidate categorical features. Currently unused: the `cat_features`
# arguments of the Pool constructors in this notebook are commented out.
categorical_cols = ['prop_id',"srch_destination_id", "weekday"] # ,"week_of_year"

In [18]:
# Sanity check: (rows, feature columns) of the training split.
X_train_df.shape

(93703, 57)

In [19]:
# CatBoost pools for the ranker; `group_id` ties every row to its search.
# `cat_features` is left unset (categorical_cols is not passed).
train_pool = Pool(data=X_train, label=y_train, group_id=queries_train)

# `queries_test` carries the srch_id values of the evaluation split.
eval_pool = Pool(data=X_eval, label=y_eval, group_id=queries_test)

In [20]:
# Baseline CatBoostRanker settings shared by every experiment.
# Options tried earlier and currently disabled: task_type="GPU", has_time=True,
# and the extra metrics 'AverageGain:top=3', 'QueryRMSE', "YetiLoss" (use with hints).
default_parameters = dict(
    iterations=2000,
    custom_metric=[
        'NDCG',
        "AUC:type=Ranking",
        'PrecisionAt:top=10',
        'RecallAt:top=10',
        'MAP:top=10',
    ],
    verbose=False,
    random_seed=42,
    metric_period=4,
    save_snapshot=False,
    use_best_model=True,  # requires an eval set to be supplied at fit time
)

parameters = {}

In [21]:
def _build_prediction_frame(features, feature_cols, actual, predicted):
    """Return the feature matrix as a DataFrame with actual and predicted relevancy appended."""
    frame = pd.DataFrame(features, columns=feature_cols)
    frame['ActualRelevancy'] = actual
    frame['PredictedRelevancy'] = predicted
    return frame


def fit_model(loss_function, prediction_function, feature_cols, X_train, y_train, X_test, y_test,
              additional_params=None, train_pool=None, test_pool=None):
    """
    Train a CatBoostRanker and return it with train/test prediction tables.

    Parameters
    ----------
    loss_function : str
        CatBoost loss name; also used as the ``train_dir`` of the run.
    prediction_function : callable(model, data)
        Produces the predictions stored in the returned tables.
    feature_cols : list
        Column names for the rows of ``X_train`` / ``X_test``.
    X_train, y_train, X_test, y_test :
        Feature matrices and labels used only to build the returned
        prediction tables; fitting itself uses the pools.
    additional_params : dict, optional
        Overrides applied on top of a copy of ``default_parameters``.
    train_pool, test_pool : optional
        Pools used for fitting and for the eval set. When omitted, the
        notebook-level ``train_pool`` and ``eval_pool`` are looked up at call
        time, so rebuilt pools are picked up without re-running this
        definition.

    Returns
    -------
    (model, training_predictions, test_predictions)
        The fitted ranker and two DataFrames holding the features plus
        ``ActualRelevancy`` and ``PredictedRelevancy`` columns.
    """
    # Resolve the default pools lazily; the parameters shadow the globals of
    # the same name, hence the explicit namespace lookup.
    if train_pool is None:
        train_pool = globals()["train_pool"]
    if test_pool is None:
        test_pool = globals()["eval_pool"]

    parameters = deepcopy(default_parameters)
    parameters['loss_function'] = loss_function
    parameters['train_dir'] = loss_function
    if additional_params is not None:
        parameters.update(additional_params)

    # This dump used to be printed under a "best results" heading by mistake.
    print("training parameters:")
    print(parameters)

    model = CatBoostRanker(**parameters)
    model.fit(train_pool, eval_set=test_pool, plot=True)

    best_scores = model.get_best_score()
    print("best results (train on train):")
    print(best_scores["learn"])
    print("best results (on validation set):")
    print(best_scores["validation"])

    # Training predictions
    y_train_predictions = prediction_function(model, X_train)
    training_predictions = _build_prediction_frame(X_train, feature_cols, y_train, y_train_predictions)

    # Test predictions
    y_test_predictions = prediction_function(model, X_test)
    test_predictions = _build_prediction_frame(X_test, feature_cols, y_test, y_test_predictions)

    return model, training_predictions, test_predictions


In [22]:
def get_predicted_outcome(model, data):
    """
    Score ``data`` with ``model`` and return the raw predictions.

    ``model.predict`` is called exactly once; the result is printed (as
    before) and that same object is returned, instead of running a second,
    redundant prediction pass for the return value.

    Parameters
    ----------
    model : object with a ``predict(data)`` method.
    data : the input accepted by ``model.predict``.

    Returns
    -------
    Whatever ``model.predict(data)`` returns.
    """
    predictions = model.predict(data)
    print(predictions)
    return predictions

In [23]:
# Train with the QueryRMSE loss. The fit uses the function's default pools;
# the matrices passed here only feed the returned prediction tables.
model, training_predictions, test_predictions = fit_model(
    loss_function='QueryRMSE',
    prediction_function=get_predicted_outcome,
    feature_cols=feature_cols,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)


best results (train on train):
{'iterations': 2000, 'custom_metric': ['NDCG', 'AUC:type=Ranking', 'PrecisionAt:top=10', 'RecallAt:top=10', 'MAP:top=10'], 'verbose': False, 'random_seed': 42, 'metric_period': 4, 'save_snapshot': False, 'use_best_model': True, 'loss_function': 'QueryRMSE', 'train_dir': 'QueryRMSE'}


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

best results (train on train):
{'MAP:top=10': 0.38236180257329483, 'RecallAt:top=10': 0.9249800478850758, 'PrecisionAt:top=10': 0.06656680052529777, 'QueryRMSE': 0.14918552996004378}
best results (on validation set):
{'NDCG:type=Base': 0.6800902765555251, 'MAP:top=10': 0.28126780626780645, 'RecallAt:top=10': 0.8589743589743589, 'PrecisionAt:top=10': 0.06004103920770576, 'QueryRMSE': 0.1564466936684731, 'AUC:type=Ranking': 0.7595115244845523}
[ 0.0017476   0.02538033 -0.00209554 ... -0.01348368  0.08537377
 -0.01976081]
[-0.00334009 -0.00754448 -0.00897748 ...  0.11640193 -0.02197127
  0.00058178]


In [53]:
# Dense rank inside each search: the highest predicted relevancy gets rank 1.
training_predictions["Predictedrank"] = (
    training_predictions
    .groupby("srch_id")["PredictedRelevancy"]
    .rank(method="dense", ascending=False)
)

# Order rows by search and by the position column, then renumber the index.
training_predictions_df = (
    training_predictions
    .sort_values(by=['srch_id', 'position'], ascending=True)
    .reset_index(drop=True)
)
training_predictions_df.head()

Unnamed: 0,srch_id,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,prop_log_historical_price,position,price_usd,promotion_flag,srch_destination_id,srch_length_of_stay,srch_booking_window,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,srch_query_affinity_score,orig_destination_distance,random_bool,comp1_rate,comp1_inv,comp1_rate_percent_diff,comp2_rate,comp2_inv,comp2_rate_percent_diff,comp3_rate,comp3_inv,comp3_rate_percent_diff,comp4_rate,comp4_inv,comp4_rate_percent_diff,comp5_rate,comp5_inv,comp5_rate_percent_diff,comp6_rate,comp6_inv,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,weekday,week_of_year,hour,minute,time_epoch,early_night,nans_count,ActualRelevancy,PredictedRelevancy,Predictedrank
0,136,5,219,,,219,35226,3,4.0,0,0.0,0.0455,5.14,1,122.0,0,6767,2,15,2,2,1,1,,163.39,1,,,,,,,,,,,,,,,,,,,,,,,,,3,14,18,43,1365101017.0,False,27,0.0,0.01883,3.0
1,136,5,219,,,219,1182,2,4.0,0,1.39,0.0849,4.62,2,82.0,1,6767,2,15,2,2,1,1,,162.75,1,,,,,,,,,,,,,,,,,,,,,,,,,3,14,18,43,1365101017.0,False,27,0.0,0.02538,2.0
2,136,5,219,,,219,107629,3,4.5,1,0.0,,5.05,3,127.0,0,6767,2,15,2,2,1,1,,162.33,1,,,,,,,,,,,,,,,,,,,,,,,,,3,14,18,43,1365101017.0,False,28,0.0,-0.000375,16.0
3,136,5,219,,,219,80768,3,4.0,1,1.39,0.0879,4.55,4,64.0,1,6767,2,15,2,2,1,1,,162.81,1,,,,,,,,,,,,,,,,,,,,,,,,,3,14,18,43,1365101017.0,False,27,0.0,0.027576,1.0
4,136,5,219,,,219,75245,1,2.5,1,2.71,0.1021,4.07,6,40.0,0,6767,2,15,2,2,1,1,,164.24,1,,,,,,,,,,,,,,,,,,,,,,,,,3,14,18,43,1365101017.0,False,27,0.0,0.003533,13.0


In [54]:
# Dense rank inside each search: the highest predicted relevancy gets rank 1.
test_predictions["Predictedrank"] = (
    test_predictions
    .groupby("srch_id")["PredictedRelevancy"]
    .rank(method="dense", ascending=False)
)

# NOTE(review): only the first 200 sorted rows are kept here, whereas the
# training table is kept in full - confirm the truncation is intended.
test_predictions_df = (
    test_predictions
    .sort_values(by=['srch_id', 'position'], ascending=True)
    .head(200)
    .reset_index(drop=True)
)
test_predictions_df.head()


Unnamed: 0,srch_id,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,prop_log_historical_price,position,price_usd,promotion_flag,srch_destination_id,srch_length_of_stay,srch_booking_window,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,srch_query_affinity_score,orig_destination_distance,random_bool,comp1_rate,comp1_inv,comp1_rate_percent_diff,comp2_rate,comp2_inv,comp2_rate_percent_diff,comp3_rate,comp3_inv,comp3_rate_percent_diff,comp4_rate,comp4_inv,comp4_rate_percent_diff,comp5_rate,comp5_inv,comp5_rate_percent_diff,comp6_rate,comp6_inv,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,weekday,week_of_year,hour,minute,time_epoch,early_night,nans_count,ActualRelevancy,PredictedRelevancy,Predictedrank
0,1,12,187,,,219,95307,4,3.5,1,2.4,0.1149,4.93,1,139.7,0,23246,1,0,4,0,1,1,,,1,,,,,,,0.0,0.0,29.0,,,,,1.0,,,,,,,,-1.0,0.0,27.0,3,14,8,32,1365064335.0,False,21,0.0,0.050316,1.0
1,1,12,187,,,219,74474,3,4.5,1,2.4,0.1255,5.03,2,210.84,0,23246,1,0,4,0,1,1,,,1,,,,0.0,0.0,,0.0,0.0,,,,,0.0,0.0,,,,,,,,0.0,0.0,,3,14,8,32,1365064335.0,False,20,0.0,-0.005692,13.0
2,1,12,187,,,219,53341,4,4.0,1,2.56,0.1238,5.18,3,150.05,0,23246,1,0,4,0,1,1,,,1,,,,0.0,0.0,,0.0,0.0,,,,,,,,,,,,,,0.0,0.0,6.0,3,14,8,32,1365064335.0,False,21,0.0,0.014007,3.0
3,1,12,187,,,219,29604,4,3.5,1,2.64,0.1241,4.93,4,143.58,0,23246,1,0,4,0,1,1,,,1,,,,0.0,0.0,,0.0,0.0,,,,,0.0,0.0,,,,,,,,0.0,0.0,,3,14,8,32,1365064335.0,False,20,0.0,0.010704,4.0
4,1,12,187,,,219,89073,4,4.0,1,2.08,0.015,5.28,6,191.44,0,23246,1,0,4,0,1,1,,,1,,,,,,,0.0,0.0,5.0,,,,0.0,0.0,,,,,,,,0.0,0.0,5.0,3,14,8,32,1365064335.0,False,20,0.0,-0.007345,16.0


In [55]:
# Persist the prediction tables. With to_csv's default index=True the row
# index is written as an extra, unnamed first column. The test table holds
# only the 200 rows kept by `.head(200)` when it was built.
training_predictions_df.to_csv('./data/hotelsearch_training_predictions.csv')
test_predictions_df.to_csv('./data/hotelsearch_test_predictions.csv')