In [410]:
import datetime
from datetime import timedelta

import pandas as pd
import numpy as np
import random
from catboost import CatBoostRanker, Pool, MetricVisualizer, CatBoostClassifier
from copy import deepcopy

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [382]:
def featurize_df(df:pd.DataFrame) ->pd.DataFrame:
    """
    Extract more features
    """
    df["day"] = df["date_time"].dt.day
    df["weekday"] = df["date_time"].dt.weekday
    df["week_of_year"] = df["date_time"].dt.isocalendar().week

    df["hour"] = df["date_time"].dt.hour
    df["minute"] = df["date_time"].dt.minute
    ## total time elapsed - allows model to learn continous trend over time to a degree
    df["time_epoch"] = df["date_time"].astype('int64')//1e9
    ## if we were looking at fraud: df["seconds"] = df.timestamp.dt.second
    df["early_night"] = ((df["hour"]>19) | (df["hour"]<3)) # no added value from feature
    
#    df["nans_count"] = df.isna().sum(axis=1)
    
    ## we won't make any time series features for now
    ## We could add time series features per property/hotel. We'd need to check for unaries, and to add a shift/offset dependant on forecast horizon

    return df


In [383]:
HotelSearch_train=pd.read_csv("./data/HotelSearch_train_sampled.csv", index_col=0)
HotelSearch_test=pd.read_csv("./data/HotelSearch_test_sampled.csv", index_col=0)



#print(HotelSearch_orig['booking_bool'].unique())
HotelSearch_train["date_time"] = pd.to_datetime(HotelSearch_train["date_time"],infer_datetime_format=True)
#HotelSearch_train["target"] = HotelSearch_train.apply(get_target,axis=1)

HotelSearch_test["date_time"] = pd.to_datetime(HotelSearch_test["date_time"],infer_datetime_format=True)
#HotelSearch_test["target"] = HotelSearch_test.apply(get_target,axis=1)



Timestamp('2013-06-30 20:02:52')

In [418]:
HotelSearch_train["date_time"].min()
datetime.datetime.today()

datetime.datetime(2022, 5, 31, 19, 22, 56, 186868)

In [384]:
drop_cols=[]
drop_unary_cols = [c for c in list(HotelSearch_train)
    if HotelSearch_train[c].nunique(dropna=False) <= 1]
#print(drop_unary_cols)
specific_cols = ["gross_bookings_usd","click_bool","booking_bool","Unnamed: 0","random_bool"] # leaky column, and original target columns
drop_cols.extend(drop_unary_cols)
drop_cols.extend(specific_cols) 
print(drop_cols)
HotelSearch_train = HotelSearch_train.drop(columns=drop_cols,errors="ignore")
HotelSearch_test  = HotelSearch_test.drop(columns=drop_cols,errors="ignore")
print(HotelSearch_train.shape)
print(HotelSearch_test.shape)


['gross_bookings_usd', 'click_bool', 'booking_bool', 'Unnamed: 0', 'random_bool']
(98361, 52)
(147730, 52)


In [385]:
HotelSearch_train = featurize_df(HotelSearch_train)
HotelSearch_test = featurize_df(HotelSearch_test)
HotelSearch_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 147730 entries, 0 to 147729
Data columns (total 59 columns):
 #   Column                       Non-Null Count   Dtype         
---  ------                       --------------   -----         
 0   srch_id                      147730 non-null  int64         
 1   date_time                    147730 non-null  datetime64[ns]
 2   site_id                      147730 non-null  int64         
 3   visitor_location_country_id  147730 non-null  int64         
 4   visitor_hist_starrating      7770 non-null    float64       
 5   visitor_hist_adr_usd         7801 non-null    float64       
 6   prop_country_id              147730 non-null  int64         
 7   prop_id                      147730 non-null  int64         
 8   prop_starrating              147730 non-null  int64         
 9   prop_review_score            147499 non-null  float64       
 10  prop_brand_bool              147730 non-null  int64         
 11  prop_location_score1      

In [386]:
HotelSearch_train.drop(['comp3_rate',
       'comp3_inv', 'comp3_rate_percent_diff', 'comp4_inv', 'comp5_rate',
       'comp5_inv', 'comp5_rate_percent_diff', 'comp8_rate', 'comp8_inv',
       'comp8_rate_percent_diff'],axis=1).groupby(HotelSearch_train["target"]>0).mean()
HotelSearch_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 98361 entries, 0 to 98360
Data columns (total 59 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   srch_id                      98361 non-null  int64         
 1   date_time                    98361 non-null  datetime64[ns]
 2   site_id                      98361 non-null  int64         
 3   visitor_location_country_id  98361 non-null  int64         
 4   visitor_hist_starrating      4699 non-null   float64       
 5   visitor_hist_adr_usd         4724 non-null   float64       
 6   prop_country_id              98361 non-null  int64         
 7   prop_id                      98361 non-null  int64         
 8   prop_starrating              98361 non-null  int64         
 9   prop_review_score            98201 non-null  float64       
 10  prop_brand_bool              98361 non-null  int64         
 11  prop_location_score1         98361 non-nu

In [387]:
cutoff_id = HotelSearch_train["srch_id"].quantile(0.94) # 90/10 split

X_train_cutoff=HotelSearch_train.loc[HotelSearch_train.srch_id< cutoff_id]
X_eval_cutoff = HotelSearch_train.loc[HotelSearch_train.srch_id>= cutoff_id]
#X_test_cut =HotelSearch_test.drop(["target"],axis=1)

X_train_df=X_train_cutoff.drop(["date_time","target"],axis=1)
X_eval_df = X_eval_cutoff.drop(["date_time","target"],axis=1)
X_test_df =HotelSearch_test.drop(["date_time","target"],axis=1)


feature_cols=X_train_df.columns.tolist()
# X_train_df=HotelSearch_train.loc[HotelSearch_train.srch_id< cutoff_id]
# X_eval_df = HotelSearch_train.loc[HotelSearch_train.srch_id>= cutoff_id]

y_train_df = HotelSearch_train.loc[HotelSearch_train.srch_id< cutoff_id]["target"].astype(float)
y_eval_df = HotelSearch_train.loc[HotelSearch_train.srch_id>= cutoff_id]["target"].astype(float)
y_test_df = HotelSearch_test["target"].astype(float)

X_train = X_train_df.values
X_eval = X_eval_df.values
X_test= X_test_df.values
queries_train = X_train_df["srch_id"].values

y_train = y_train_df.values
y_eval = y_eval_df.values
y_test = y_test_df.values

queries_test = X_eval_df["srch_id"].values



max_relevance = float(np.max(y_train))
y_train /= max_relevance
y_eval /= max_relevance
y_train_df.value_counts()


0.0    89823
1.0     2628
Name: target, dtype: int64

In [388]:
print("mean relevancy train",round(y_train.mean(),4))
print("mean relevancy eval",round(y_eval.mean(),4))
print(y_eval_df.value_counts()) # check we have all 3 "labels" in subset

mean relevancy train 0.0284
mean relevancy eval 0.0293
0.0    5737
1.0     173
Name: target, dtype: int64


In [389]:
HotelSearch_train['target'].value_counts()

0    95560
1     2801
Name: target, dtype: int64

In [390]:
categorical_cols = ['prop_id',"srch_destination_id", "weekday"] # ,"week_of_year"

In [391]:
X_train_df.shape

(92451, 57)

In [392]:
#set(X_train.columns).symmetric_difference(set(HotelSearch_test.columns))
train_pool = Pool(data=X_train,
                  label = y_train,
#                  cat_features=categorical_cols,
                  group_id=queries_train
                 )

eval_pool = Pool(data=X_eval,
                  label = y_eval,
#                  cat_features=categorical_cols,
                  group_id=queries_test
                 )

In [393]:
default_parameters  = {
    'iterations': 2000,
    'custom_metric': ['NDCG', "AUC:type=Ranking",'PrecisionAt:top=10', 'RecallAt:top=10', 'MAP:top=10'], # , 'AverageGain:top=3'# 'QueryRMSE', "YetiLoss" (use with hints)
    'verbose': False,
    'random_seed': 42,
#     "task_type":"GPU",
#    "has_time":True,
    "metric_period":4,
    "save_snapshot":False,
    "use_best_model":True, # requires eval set to be set
} 

default_clf_parameters  = {
    'iterations':2000,
    'random_seed':42,
    'learning_rate':0.5,
    'custom_loss':['AUC', 'Accuracy']
}    


In [394]:
def fit_model(model,loss_function, prediction_function, feature_cols,X_train,y_train,X_test,y_test,X_eval,y_eval,additional_params=None, train_pool=train_pool, test_pool=eval_pool):

    print("Model Type: Is it CatboostRanker?")
    print(isinstance(model, type(CatBoostRanker())))

    
    if isinstance(model, type(CatBoostRanker())) is True:   
        print("Model Type is CatboostRanker")
        parameters = deepcopy(default_parameters)
        parameters['loss_function'] = loss_function
        parameters['train_dir'] = loss_function
        print(parameters) 
        if additional_params is not None:
            parameters.update(additional_params)
        model = CatBoostRanker(**parameters)
        model.fit(train_pool, eval_set=test_pool, plot=True)
        print("best results (train on train):")
        print(model.get_best_score()["learn"])
        print("best results (on validation set):")
        print(model.get_best_score()["validation"])
    else:
        print("Model Type is CatboostClassifier")
        parameters = deepcopy(default_clf_parameters)
#        parameters['loss_function'] = loss_function
#        parameters['train_dir'] = loss_function
        model = CatBoostClassifier(**parameters)
        print(parameters) 
        model.fit(X_train, y_train,eval_set=(X_eval, y_eval),plot=True)        
        print("best results (train on train):")
        print(model.get_best_score()["learn"])
        print("best results (on validation set):")
        print(model.get_best_score()["validation"])
#Training predictions
    y_train_predictions = prediction_function(model, X_train)
    training_predictions = pd.DataFrame(X_train,columns=feature_cols)
    training_predictions['ActualRelevancyScore'] = y_train
    X_train_df
    training_predictions['ActualRelevancy']=np.where(y_train==1.0,"Relevant","Not Relevant")

    if isinstance(model, type(CatBoostRanker())) is True:
        training_predictions['PredictedRelevancyScore'] = y_train_predictions
    else:
        training_predictions['Not_Relevant_prob'] = y_train_predictions[:,0]
        training_predictions['Yes_Relevant_prob'] = y_train_predictions[:,1]
        training_predictions['RelevanceLabelPrediction'] = np.argmax(y_train_predictions, axis=1).astype(np.float32)
        training_predictions['PredictedRelevancy']=np.where(training_predictions['RelevanceLabelPrediction']==1.0,"Relevant","Not Relevant")
        training_predictions['PredictedRelevancyScore'] = np.where(training_predictions['RelevanceLabelPrediction']==1.0,training_predictions['Yes_Relevant_prob'],training_predictions['Not_Relevant_prob'])
    
#Test predictions
    y_test_predictions = prediction_function(model, X_test)
    test_predictions = pd.DataFrame(X_test,columns=feature_cols)
    test_predictions['ActualRelevancyScore'] = y_test
    test_predictions['ActualRelevancy']=np.where(y_test==1.0,"Relevant","Not Relevant")

    if isinstance(model, type(CatBoostRanker())) is True:
        test_predictions['PredictedRelevancyScore'] = y_test_predictions
    else:
        test_predictions['Not_Relevant_prob'] = y_test_predictions[:,0]
        test_predictions['Yes_Relevant_prob'] = y_test_predictions[:,1]
        test_predictions['RelevanceLabelPrediction'] = np.argmax(y_test_predictions, axis=1).astype(np.float32)
        test_predictions['PredictedRelevancy']=np.where(test_predictions['RelevanceLabelPrediction']==1.0,"Relevant","Not Relevant")
        test_predictions['PredictedRelevancyScore'] = np.where(test_predictions['RelevanceLabelPrediction']==1.0,test_predictions['Yes_Relevant_prob'],test_predictions['Not_Relevant_prob'])
       

#     print("(Default) Feature importance (on train pool)")
#     display(model.get_feature_importance(data=train_pool,prettified=True).head(15))
    
#     try:
#         print("SHAP features importance, on all data:")
#         explainer = shap.TreeExplainer(model)
#         shap_values = explainer.shap_values(pd.concat([X_train,X_eval]),
#                                             y=pd.concat([y_train,y_eval]))

#         # # summarize the effects of all the features
#         shap.summary_plot(shap_values, pd.concat([X_train,X_eval]))
#    finally:
    return model, training_predictions , test_predictions  


In [395]:
def get_predicted_ranking_outcome(model, data):
#    a=model.predict(data)
#    print(a)
#    print(np.argmax(model.predict_proba(data), axis=1).astype(np.float32))
#    return np.argmax(model.predict_proba(data), axis=1).astype(np.float32)
    return model.predict(data)

In [396]:
def get_predicted_clf_outcome(model, data):
#    print(model.predict_proba(data))
#    print(np.argmax(model.predict_proba(data), axis=1).astype(np.float32))
#    return np.argmax(model.predict_proba(data), axis=1).astype(np.float32)
    return model.predict_proba(data)

In [397]:
#model = fit_model('RMSE', {'custom_metric': ['PrecisionAt:top=10', 'RecallAt:top=10', 'MAP:top=10']})
parameters = {}
ranking_model,ranking_training_predictions, ranking_test_predictions = fit_model(CatBoostRanker(),'QueryRMSE',get_predicted_ranking_outcome,feature_cols,X_train,y_train,X_test,y_test,X_eval,y_eval)


Model Type: Is it CatboostRanker?
True
Model Type is CatboostRanker
{'iterations': 2000, 'custom_metric': ['NDCG', 'AUC:type=Ranking', 'PrecisionAt:top=10', 'RecallAt:top=10', 'MAP:top=10'], 'verbose': False, 'random_seed': 42, 'metric_period': 4, 'save_snapshot': False, 'use_best_model': True, 'loss_function': 'QueryRMSE', 'train_dir': 'QueryRMSE'}


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

best results (train on train):
{'MAP:top=10': 0.3839359867456504, 'RecallAt:top=10': 0.9348638547784304, 'PrecisionAt:top=10': 0.06731465944626414, 'QueryRMSE': 0.1512160241340935}
best results (on validation set):
{'NDCG:type=Base': 0.6757035731111425, 'MAP:top=10': 0.2736344065291435, 'RecallAt:top=10': 0.8785425101214575, 'PrecisionAt:top=10': 0.06201240280187635, 'QueryRMSE': 0.16261249103754186, 'AUC:type=Ranking': 0.7810813288853109}


In [398]:
ranking_training_predictions.head()

Unnamed: 0,srch_id,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,prop_log_historical_price,position,price_usd,promotion_flag,srch_destination_id,srch_length_of_stay,srch_booking_window,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,srch_query_affinity_score,orig_destination_distance,comp1_rate,comp1_inv,comp1_rate_percent_diff,comp2_rate,comp2_inv,comp2_rate_percent_diff,comp3_rate,comp3_inv,comp3_rate_percent_diff,comp4_rate,comp4_inv,comp4_rate_percent_diff,comp5_rate,comp5_inv,comp5_rate_percent_diff,comp6_rate,comp6_inv,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,predictionid,day,weekday,week_of_year,hour,minute,time_epoch,early_night,ActualRelevancyScore,ActualRelevancy,PredictedRelevancyScore
0,71,24,216,,,98,76541,3,3.5,0,2.2,0.0482,3.21,1,16.9,0,341,1,18,1,1,1,1,,,,,,,,,,,,0.0,0.0,,0.0,0.0,,,,,,,,,,,0,12,1,11,23,14,1363130090.0,True,0.0,Not Relevant,0.118501
1,71,24,216,,,98,37933,3,4.0,0,2.08,0.0812,3.59,2,18.34,0,341,1,18,1,1,1,1,,,,,,1.0,0.0,46.0,,,,0.0,0.0,,1.0,0.0,2.0,,,,,,,,,,1,12,1,11,23,14,1363130090.0,True,0.0,Not Relevant,0.088448
2,71,24,216,,,98,108588,4,4.0,0,0.0,0.0268,4.08,3,26.78,1,341,1,18,1,1,1,1,,,,,,1.0,0.0,108.0,1.0,0.0,32.0,-1.0,0.0,9.0,0.0,0.0,,,,,,,,,,,2,12,1,11,23,14,1363130090.0,True,0.0,Not Relevant,0.049779
3,71,24,216,,,98,28672,3,4.0,0,2.2,0.0151,3.35,4,21.48,0,341,1,18,1,1,1,1,,,,,,,,,,,,0.0,0.0,,0.0,0.0,,,,,,,,,,,3,12,1,11,23,14,1363130090.0,True,0.0,Not Relevant,0.028751
4,71,24,216,,,98,25741,3,3.5,0,1.61,0.0359,4.02,6,19.47,1,341,1,18,1,1,1,1,,,,,,0.0,0.0,,1.0,0.0,10.0,0.0,0.0,,0.0,0.0,,,,,,,,,,,4,12,1,11,23,14,1363130090.0,True,0.0,Not Relevant,0.023836


In [399]:
parameters = {}
relevancyclf_model,relevancyclf_training_predictions, relevancyclf_test_predictions = fit_model(CatBoostClassifier(),'AUC',get_predicted_clf_outcome,feature_cols,X_train,y_train,X_test,y_test,X_eval,y_eval)


Model Type: Is it CatboostRanker?
False
Model Type is CatboostClassifier
{'iterations': 2000, 'random_seed': 42, 'learning_rate': 0.5, 'custom_loss': ['AUC', 'Accuracy']}


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.1959459	test: 0.1995675	best: 0.1995675 (0)	total: 17.6ms	remaining: 35.2s
1:	learn: 0.1303376	test: 0.1337912	best: 0.1337912 (1)	total: 30.6ms	remaining: 30.5s
2:	learn: 0.1193944	test: 0.1231323	best: 0.1231323 (2)	total: 44.1ms	remaining: 29.3s
3:	learn: 0.1152225	test: 0.1197776	best: 0.1197776 (3)	total: 56.4ms	remaining: 28.1s
4:	learn: 0.1135892	test: 0.1183250	best: 0.1183250 (4)	total: 67ms	remaining: 26.7s
5:	learn: 0.1127012	test: 0.1177384	best: 0.1177384 (5)	total: 78.7ms	remaining: 26.1s
6:	learn: 0.1124153	test: 0.1175618	best: 0.1175618 (6)	total: 90.8ms	remaining: 25.9s
7:	learn: 0.1120051	test: 0.1176902	best: 0.1175618 (6)	total: 103ms	remaining: 25.7s
8:	learn: 0.1115893	test: 0.1172935	best: 0.1172935 (8)	total: 117ms	remaining: 26s
9:	learn: 0.1112445	test: 0.1171165	best: 0.1171165 (9)	total: 130ms	remaining: 26s
10:	learn: 0.1110101	test: 0.1168766	best: 0.1168766 (10)	total: 142ms	remaining: 25.7s
11:	learn: 0.1108361	test: 0.1169725	best: 0.116876

100:	learn: 0.0934936	test: 0.1205376	best: 0.1168766 (10)	total: 1.24s	remaining: 23.3s
101:	learn: 0.0933070	test: 0.1205392	best: 0.1168766 (10)	total: 1.25s	remaining: 23.3s
102:	learn: 0.0931370	test: 0.1205932	best: 0.1168766 (10)	total: 1.26s	remaining: 23.3s
103:	learn: 0.0930692	test: 0.1206064	best: 0.1168766 (10)	total: 1.27s	remaining: 23.2s
104:	learn: 0.0929350	test: 0.1210811	best: 0.1168766 (10)	total: 1.28s	remaining: 23.2s
105:	learn: 0.0926726	test: 0.1210150	best: 0.1168766 (10)	total: 1.3s	remaining: 23.2s
106:	learn: 0.0924611	test: 0.1209816	best: 0.1168766 (10)	total: 1.31s	remaining: 23.1s
107:	learn: 0.0922912	test: 0.1217399	best: 0.1168766 (10)	total: 1.32s	remaining: 23.1s
108:	learn: 0.0921112	test: 0.1215587	best: 0.1168766 (10)	total: 1.33s	remaining: 23.1s
109:	learn: 0.0918410	test: 0.1217687	best: 0.1168766 (10)	total: 1.34s	remaining: 23.1s
110:	learn: 0.0917465	test: 0.1216719	best: 0.1168766 (10)	total: 1.35s	remaining: 23s
111:	learn: 0.0916438	te

199:	learn: 0.0776754	test: 0.1290370	best: 0.1168766 (10)	total: 2.48s	remaining: 22.4s
200:	learn: 0.0774857	test: 0.1292435	best: 0.1168766 (10)	total: 2.5s	remaining: 22.3s
201:	learn: 0.0773812	test: 0.1291790	best: 0.1168766 (10)	total: 2.51s	remaining: 22.3s
202:	learn: 0.0772502	test: 0.1293022	best: 0.1168766 (10)	total: 2.52s	remaining: 22.3s
203:	learn: 0.0772118	test: 0.1293049	best: 0.1168766 (10)	total: 2.53s	remaining: 22.3s
204:	learn: 0.0771758	test: 0.1293631	best: 0.1168766 (10)	total: 2.55s	remaining: 22.3s
205:	learn: 0.0769434	test: 0.1296753	best: 0.1168766 (10)	total: 2.56s	remaining: 22.3s
206:	learn: 0.0767979	test: 0.1295432	best: 0.1168766 (10)	total: 2.57s	remaining: 22.3s
207:	learn: 0.0767622	test: 0.1294700	best: 0.1168766 (10)	total: 2.58s	remaining: 22.3s
208:	learn: 0.0766176	test: 0.1295287	best: 0.1168766 (10)	total: 2.6s	remaining: 22.3s
209:	learn: 0.0765819	test: 0.1295089	best: 0.1168766 (10)	total: 2.61s	remaining: 22.2s
210:	learn: 0.0764104	t

297:	learn: 0.0667986	test: 0.1382261	best: 0.1168766 (10)	total: 3.75s	remaining: 21.4s
298:	learn: 0.0667400	test: 0.1382619	best: 0.1168766 (10)	total: 3.77s	remaining: 21.4s
299:	learn: 0.0665311	test: 0.1383361	best: 0.1168766 (10)	total: 3.77s	remaining: 21.4s
300:	learn: 0.0663337	test: 0.1385573	best: 0.1168766 (10)	total: 3.79s	remaining: 21.4s
301:	learn: 0.0662265	test: 0.1387336	best: 0.1168766 (10)	total: 3.79s	remaining: 21.3s
302:	learn: 0.0660861	test: 0.1384904	best: 0.1168766 (10)	total: 3.81s	remaining: 21.3s
303:	learn: 0.0658697	test: 0.1384641	best: 0.1168766 (10)	total: 3.82s	remaining: 21.3s
304:	learn: 0.0656441	test: 0.1386219	best: 0.1168766 (10)	total: 3.84s	remaining: 21.3s
305:	learn: 0.0655444	test: 0.1387237	best: 0.1168766 (10)	total: 3.85s	remaining: 21.3s
306:	learn: 0.0654521	test: 0.1388014	best: 0.1168766 (10)	total: 3.86s	remaining: 21.3s
307:	learn: 0.0653461	test: 0.1386923	best: 0.1168766 (10)	total: 3.88s	remaining: 21.3s
308:	learn: 0.0652431

396:	learn: 0.0559690	test: 0.1434936	best: 0.1168766 (10)	total: 5.09s	remaining: 20.5s
397:	learn: 0.0558680	test: 0.1431745	best: 0.1168766 (10)	total: 5.1s	remaining: 20.5s
398:	learn: 0.0558080	test: 0.1431721	best: 0.1168766 (10)	total: 5.11s	remaining: 20.5s
399:	learn: 0.0557279	test: 0.1431924	best: 0.1168766 (10)	total: 5.12s	remaining: 20.5s
400:	learn: 0.0555706	test: 0.1437498	best: 0.1168766 (10)	total: 5.13s	remaining: 20.5s
401:	learn: 0.0553781	test: 0.1437326	best: 0.1168766 (10)	total: 5.14s	remaining: 20.4s
402:	learn: 0.0552558	test: 0.1436976	best: 0.1168766 (10)	total: 5.16s	remaining: 20.4s
403:	learn: 0.0551556	test: 0.1438341	best: 0.1168766 (10)	total: 5.17s	remaining: 20.4s
404:	learn: 0.0549946	test: 0.1435634	best: 0.1168766 (10)	total: 5.18s	remaining: 20.4s
405:	learn: 0.0549676	test: 0.1434786	best: 0.1168766 (10)	total: 5.19s	remaining: 20.4s
406:	learn: 0.0549211	test: 0.1434945	best: 0.1168766 (10)	total: 5.21s	remaining: 20.4s
407:	learn: 0.0548737	

497:	learn: 0.0468356	test: 0.1494011	best: 0.1168766 (10)	total: 6.39s	remaining: 19.3s
498:	learn: 0.0467669	test: 0.1494968	best: 0.1168766 (10)	total: 6.4s	remaining: 19.3s
499:	learn: 0.0467121	test: 0.1495369	best: 0.1168766 (10)	total: 6.41s	remaining: 19.2s
500:	learn: 0.0466730	test: 0.1494910	best: 0.1168766 (10)	total: 6.42s	remaining: 19.2s
501:	learn: 0.0466498	test: 0.1495171	best: 0.1168766 (10)	total: 6.43s	remaining: 19.2s
502:	learn: 0.0465443	test: 0.1494558	best: 0.1168766 (10)	total: 6.44s	remaining: 19.2s
503:	learn: 0.0464712	test: 0.1494641	best: 0.1168766 (10)	total: 6.45s	remaining: 19.2s
504:	learn: 0.0463882	test: 0.1493901	best: 0.1168766 (10)	total: 6.47s	remaining: 19.2s
505:	learn: 0.0463381	test: 0.1494554	best: 0.1168766 (10)	total: 6.48s	remaining: 19.1s
506:	learn: 0.0462720	test: 0.1494935	best: 0.1168766 (10)	total: 6.49s	remaining: 19.1s
507:	learn: 0.0462040	test: 0.1494922	best: 0.1168766 (10)	total: 6.51s	remaining: 19.1s
508:	learn: 0.0461336	

594:	learn: 0.0397573	test: 0.1543680	best: 0.1168766 (10)	total: 7.68s	remaining: 18.1s
595:	learn: 0.0397364	test: 0.1543418	best: 0.1168766 (10)	total: 7.69s	remaining: 18.1s
596:	learn: 0.0396532	test: 0.1543038	best: 0.1168766 (10)	total: 7.7s	remaining: 18.1s
597:	learn: 0.0395761	test: 0.1545087	best: 0.1168766 (10)	total: 7.72s	remaining: 18.1s
598:	learn: 0.0395011	test: 0.1544999	best: 0.1168766 (10)	total: 7.74s	remaining: 18.1s
599:	learn: 0.0394564	test: 0.1545064	best: 0.1168766 (10)	total: 7.75s	remaining: 18.1s
600:	learn: 0.0392558	test: 0.1543369	best: 0.1168766 (10)	total: 7.76s	remaining: 18.1s
601:	learn: 0.0392064	test: 0.1543041	best: 0.1168766 (10)	total: 7.78s	remaining: 18.1s
602:	learn: 0.0390673	test: 0.1542761	best: 0.1168766 (10)	total: 7.79s	remaining: 18s
603:	learn: 0.0390530	test: 0.1542714	best: 0.1168766 (10)	total: 7.8s	remaining: 18s
604:	learn: 0.0389539	test: 0.1543551	best: 0.1168766 (10)	total: 7.82s	remaining: 18s
605:	learn: 0.0389466	test: 0

693:	learn: 0.0334822	test: 0.1616086	best: 0.1168766 (10)	total: 8.93s	remaining: 16.8s
694:	learn: 0.0333591	test: 0.1618300	best: 0.1168766 (10)	total: 8.95s	remaining: 16.8s
695:	learn: 0.0332283	test: 0.1620359	best: 0.1168766 (10)	total: 8.96s	remaining: 16.8s
696:	learn: 0.0332241	test: 0.1620273	best: 0.1168766 (10)	total: 8.97s	remaining: 16.8s
697:	learn: 0.0332105	test: 0.1620568	best: 0.1168766 (10)	total: 8.98s	remaining: 16.8s
698:	learn: 0.0331282	test: 0.1620743	best: 0.1168766 (10)	total: 8.99s	remaining: 16.7s
699:	learn: 0.0330499	test: 0.1621132	best: 0.1168766 (10)	total: 9.01s	remaining: 16.7s
700:	learn: 0.0330177	test: 0.1621464	best: 0.1168766 (10)	total: 9.02s	remaining: 16.7s
701:	learn: 0.0330138	test: 0.1621454	best: 0.1168766 (10)	total: 9.03s	remaining: 16.7s
702:	learn: 0.0329056	test: 0.1623323	best: 0.1168766 (10)	total: 9.04s	remaining: 16.7s
703:	learn: 0.0328609	test: 0.1623734	best: 0.1168766 (10)	total: 9.06s	remaining: 16.7s
704:	learn: 0.0327307

791:	learn: 0.0282104	test: 0.1654734	best: 0.1168766 (10)	total: 10.2s	remaining: 15.6s
792:	learn: 0.0281528	test: 0.1655151	best: 0.1168766 (10)	total: 10.2s	remaining: 15.6s
793:	learn: 0.0281436	test: 0.1653328	best: 0.1168766 (10)	total: 10.2s	remaining: 15.5s
794:	learn: 0.0281000	test: 0.1652643	best: 0.1168766 (10)	total: 10.2s	remaining: 15.5s
795:	learn: 0.0280547	test: 0.1653350	best: 0.1168766 (10)	total: 10.3s	remaining: 15.5s
796:	learn: 0.0280066	test: 0.1656864	best: 0.1168766 (10)	total: 10.3s	remaining: 15.5s
797:	learn: 0.0279451	test: 0.1657183	best: 0.1168766 (10)	total: 10.3s	remaining: 15.5s
798:	learn: 0.0278964	test: 0.1658042	best: 0.1168766 (10)	total: 10.3s	remaining: 15.5s
799:	learn: 0.0278614	test: 0.1658278	best: 0.1168766 (10)	total: 10.3s	remaining: 15.5s
800:	learn: 0.0277949	test: 0.1658684	best: 0.1168766 (10)	total: 10.3s	remaining: 15.5s
801:	learn: 0.0277431	test: 0.1660089	best: 0.1168766 (10)	total: 10.4s	remaining: 15.5s
802:	learn: 0.0277005

889:	learn: 0.0239805	test: 0.1715038	best: 0.1168766 (10)	total: 11.5s	remaining: 14.4s
890:	learn: 0.0238855	test: 0.1713893	best: 0.1168766 (10)	total: 11.5s	remaining: 14.3s
891:	learn: 0.0238396	test: 0.1714093	best: 0.1168766 (10)	total: 11.5s	remaining: 14.3s
892:	learn: 0.0238033	test: 0.1715452	best: 0.1168766 (10)	total: 11.6s	remaining: 14.3s
893:	learn: 0.0237473	test: 0.1718131	best: 0.1168766 (10)	total: 11.6s	remaining: 14.3s
894:	learn: 0.0236567	test: 0.1718372	best: 0.1168766 (10)	total: 11.6s	remaining: 14.3s
895:	learn: 0.0235788	test: 0.1719048	best: 0.1168766 (10)	total: 11.6s	remaining: 14.3s
896:	learn: 0.0235067	test: 0.1720576	best: 0.1168766 (10)	total: 11.6s	remaining: 14.3s
897:	learn: 0.0234851	test: 0.1720979	best: 0.1168766 (10)	total: 11.6s	remaining: 14.3s
898:	learn: 0.0234660	test: 0.1721160	best: 0.1168766 (10)	total: 11.7s	remaining: 14.3s
899:	learn: 0.0234411	test: 0.1721895	best: 0.1168766 (10)	total: 11.7s	remaining: 14.3s
900:	learn: 0.0233861

987:	learn: 0.0202675	test: 0.1766652	best: 0.1168766 (10)	total: 12.8s	remaining: 13.1s
988:	learn: 0.0202152	test: 0.1765193	best: 0.1168766 (10)	total: 12.8s	remaining: 13.1s
989:	learn: 0.0201762	test: 0.1765621	best: 0.1168766 (10)	total: 12.9s	remaining: 13.1s
990:	learn: 0.0201420	test: 0.1765639	best: 0.1168766 (10)	total: 12.9s	remaining: 13.1s
991:	learn: 0.0200924	test: 0.1766850	best: 0.1168766 (10)	total: 12.9s	remaining: 13.1s
992:	learn: 0.0200580	test: 0.1766555	best: 0.1168766 (10)	total: 12.9s	remaining: 13.1s
993:	learn: 0.0199733	test: 0.1767114	best: 0.1168766 (10)	total: 12.9s	remaining: 13.1s
994:	learn: 0.0199350	test: 0.1766787	best: 0.1168766 (10)	total: 12.9s	remaining: 13.1s
995:	learn: 0.0198946	test: 0.1769295	best: 0.1168766 (10)	total: 13s	remaining: 13.1s
996:	learn: 0.0198567	test: 0.1770597	best: 0.1168766 (10)	total: 13s	remaining: 13.1s
997:	learn: 0.0198480	test: 0.1769883	best: 0.1168766 (10)	total: 13s	remaining: 13s
998:	learn: 0.0197566	test: 0

1084:	learn: 0.0172677	test: 0.1802935	best: 0.1168766 (10)	total: 14.1s	remaining: 11.9s
1085:	learn: 0.0172100	test: 0.1803409	best: 0.1168766 (10)	total: 14.1s	remaining: 11.9s
1086:	learn: 0.0172079	test: 0.1803527	best: 0.1168766 (10)	total: 14.1s	remaining: 11.9s
1087:	learn: 0.0171510	test: 0.1803819	best: 0.1168766 (10)	total: 14.2s	remaining: 11.9s
1088:	learn: 0.0171080	test: 0.1803067	best: 0.1168766 (10)	total: 14.2s	remaining: 11.9s
1089:	learn: 0.0170877	test: 0.1804302	best: 0.1168766 (10)	total: 14.2s	remaining: 11.8s
1090:	learn: 0.0170534	test: 0.1804264	best: 0.1168766 (10)	total: 14.2s	remaining: 11.8s
1091:	learn: 0.0170173	test: 0.1802386	best: 0.1168766 (10)	total: 14.2s	remaining: 11.8s
1092:	learn: 0.0169824	test: 0.1802768	best: 0.1168766 (10)	total: 14.2s	remaining: 11.8s
1093:	learn: 0.0169582	test: 0.1801391	best: 0.1168766 (10)	total: 14.3s	remaining: 11.8s
1094:	learn: 0.0168878	test: 0.1800435	best: 0.1168766 (10)	total: 14.3s	remaining: 11.8s
1095:	lear

1178:	learn: 0.0148518	test: 0.1830018	best: 0.1168766 (10)	total: 15.4s	remaining: 10.7s
1179:	learn: 0.0148388	test: 0.1830293	best: 0.1168766 (10)	total: 15.4s	remaining: 10.7s
1180:	learn: 0.0147792	test: 0.1832652	best: 0.1168766 (10)	total: 15.4s	remaining: 10.7s
1181:	learn: 0.0147485	test: 0.1831890	best: 0.1168766 (10)	total: 15.4s	remaining: 10.7s
1182:	learn: 0.0147327	test: 0.1830870	best: 0.1168766 (10)	total: 15.4s	remaining: 10.6s
1183:	learn: 0.0147089	test: 0.1830926	best: 0.1168766 (10)	total: 15.4s	remaining: 10.6s
1184:	learn: 0.0147051	test: 0.1831224	best: 0.1168766 (10)	total: 15.4s	remaining: 10.6s
1185:	learn: 0.0146886	test: 0.1831320	best: 0.1168766 (10)	total: 15.4s	remaining: 10.6s
1186:	learn: 0.0146603	test: 0.1833405	best: 0.1168766 (10)	total: 15.5s	remaining: 10.6s
1187:	learn: 0.0146551	test: 0.1833637	best: 0.1168766 (10)	total: 15.5s	remaining: 10.6s
1188:	learn: 0.0146400	test: 0.1835063	best: 0.1168766 (10)	total: 15.5s	remaining: 10.6s
1189:	lear

1276:	learn: 0.0128736	test: 0.1860540	best: 0.1168766 (10)	total: 16.6s	remaining: 9.41s
1277:	learn: 0.0128415	test: 0.1861268	best: 0.1168766 (10)	total: 16.6s	remaining: 9.4s
1278:	learn: 0.0128350	test: 0.1861067	best: 0.1168766 (10)	total: 16.7s	remaining: 9.39s
1279:	learn: 0.0128087	test: 0.1860746	best: 0.1168766 (10)	total: 16.7s	remaining: 9.38s
1280:	learn: 0.0127613	test: 0.1860778	best: 0.1168766 (10)	total: 16.7s	remaining: 9.36s
1281:	learn: 0.0127176	test: 0.1863529	best: 0.1168766 (10)	total: 16.7s	remaining: 9.35s
1282:	learn: 0.0127160	test: 0.1863564	best: 0.1168766 (10)	total: 16.7s	remaining: 9.34s
1283:	learn: 0.0127153	test: 0.1863560	best: 0.1168766 (10)	total: 16.7s	remaining: 9.33s
1284:	learn: 0.0127137	test: 0.1863588	best: 0.1168766 (10)	total: 16.8s	remaining: 9.33s
1285:	learn: 0.0126878	test: 0.1863576	best: 0.1168766 (10)	total: 16.8s	remaining: 9.31s
1286:	learn: 0.0126796	test: 0.1863757	best: 0.1168766 (10)	total: 16.8s	remaining: 9.3s
1287:	learn:

1374:	learn: 0.0113147	test: 0.1886433	best: 0.1168766 (10)	total: 17.9s	remaining: 8.15s
1375:	learn: 0.0113132	test: 0.1886400	best: 0.1168766 (10)	total: 17.9s	remaining: 8.14s
1376:	learn: 0.0112957	test: 0.1886274	best: 0.1168766 (10)	total: 18s	remaining: 8.13s
1377:	learn: 0.0112888	test: 0.1886875	best: 0.1168766 (10)	total: 18s	remaining: 8.11s
1378:	learn: 0.0112798	test: 0.1886872	best: 0.1168766 (10)	total: 18s	remaining: 8.1s
1379:	learn: 0.0112350	test: 0.1889354	best: 0.1168766 (10)	total: 18s	remaining: 8.09s
1380:	learn: 0.0112219	test: 0.1889114	best: 0.1168766 (10)	total: 18s	remaining: 8.07s
1381:	learn: 0.0112187	test: 0.1889172	best: 0.1168766 (10)	total: 18s	remaining: 8.06s
1382:	learn: 0.0111997	test: 0.1889852	best: 0.1168766 (10)	total: 18.1s	remaining: 8.05s
1383:	learn: 0.0111919	test: 0.1890311	best: 0.1168766 (10)	total: 18.1s	remaining: 8.04s
1384:	learn: 0.0111814	test: 0.1890217	best: 0.1168766 (10)	total: 18.1s	remaining: 8.03s
1385:	learn: 0.0111413	

1471:	learn: 0.0098552	test: 0.1904029	best: 0.1168766 (10)	total: 19.3s	remaining: 6.91s
1472:	learn: 0.0098478	test: 0.1904387	best: 0.1168766 (10)	total: 19.3s	remaining: 6.89s
1473:	learn: 0.0098444	test: 0.1903609	best: 0.1168766 (10)	total: 19.3s	remaining: 6.88s
1474:	learn: 0.0098236	test: 0.1901217	best: 0.1168766 (10)	total: 19.3s	remaining: 6.87s
1475:	learn: 0.0098190	test: 0.1901693	best: 0.1168766 (10)	total: 19.3s	remaining: 6.85s
1476:	learn: 0.0098126	test: 0.1900912	best: 0.1168766 (10)	total: 19.3s	remaining: 6.84s
1477:	learn: 0.0098080	test: 0.1902087	best: 0.1168766 (10)	total: 19.3s	remaining: 6.83s
1478:	learn: 0.0098062	test: 0.1901892	best: 0.1168766 (10)	total: 19.4s	remaining: 6.82s
1479:	learn: 0.0097879	test: 0.1900650	best: 0.1168766 (10)	total: 19.4s	remaining: 6.81s
1480:	learn: 0.0097575	test: 0.1903128	best: 0.1168766 (10)	total: 19.4s	remaining: 6.79s
1481:	learn: 0.0097410	test: 0.1902979	best: 0.1168766 (10)	total: 19.4s	remaining: 6.78s
1482:	lear

1566:	learn: 0.0087131	test: 0.1937929	best: 0.1168766 (10)	total: 20.6s	remaining: 5.68s
1567:	learn: 0.0087057	test: 0.1938381	best: 0.1168766 (10)	total: 20.6s	remaining: 5.67s
1568:	learn: 0.0086969	test: 0.1937981	best: 0.1168766 (10)	total: 20.6s	remaining: 5.66s
1569:	learn: 0.0086859	test: 0.1938220	best: 0.1168766 (10)	total: 20.6s	remaining: 5.64s
1570:	learn: 0.0086666	test: 0.1939766	best: 0.1168766 (10)	total: 20.6s	remaining: 5.63s
1571:	learn: 0.0086591	test: 0.1939951	best: 0.1168766 (10)	total: 20.6s	remaining: 5.62s
1572:	learn: 0.0086412	test: 0.1940824	best: 0.1168766 (10)	total: 20.6s	remaining: 5.61s
1573:	learn: 0.0086395	test: 0.1940510	best: 0.1168766 (10)	total: 20.7s	remaining: 5.6s
1574:	learn: 0.0086263	test: 0.1940480	best: 0.1168766 (10)	total: 20.7s	remaining: 5.58s
1575:	learn: 0.0086177	test: 0.1939234	best: 0.1168766 (10)	total: 20.7s	remaining: 5.57s
1576:	learn: 0.0086171	test: 0.1939275	best: 0.1168766 (10)	total: 20.7s	remaining: 5.56s
1577:	learn

1662:	learn: 0.0077849	test: 0.1971616	best: 0.1168766 (10)	total: 21.9s	remaining: 4.43s
1663:	learn: 0.0077756	test: 0.1972042	best: 0.1168766 (10)	total: 21.9s	remaining: 4.42s
1664:	learn: 0.0077733	test: 0.1972228	best: 0.1168766 (10)	total: 21.9s	remaining: 4.41s
1665:	learn: 0.0077722	test: 0.1971813	best: 0.1168766 (10)	total: 21.9s	remaining: 4.39s
1666:	learn: 0.0077721	test: 0.1972940	best: 0.1168766 (10)	total: 21.9s	remaining: 4.38s
1667:	learn: 0.0077596	test: 0.1975536	best: 0.1168766 (10)	total: 21.9s	remaining: 4.37s
1668:	learn: 0.0077554	test: 0.1975646	best: 0.1168766 (10)	total: 22s	remaining: 4.36s
1669:	learn: 0.0077517	test: 0.1976619	best: 0.1168766 (10)	total: 22s	remaining: 4.34s
1670:	learn: 0.0077463	test: 0.1976657	best: 0.1168766 (10)	total: 22s	remaining: 4.33s
1671:	learn: 0.0077411	test: 0.1977074	best: 0.1168766 (10)	total: 22s	remaining: 4.32s
1672:	learn: 0.0077342	test: 0.1977612	best: 0.1168766 (10)	total: 22s	remaining: 4.3s
1673:	learn: 0.007719

1757:	learn: 0.0069999	test: 0.1998160	best: 0.1168766 (10)	total: 23.1s	remaining: 3.18s
1758:	learn: 0.0069876	test: 0.1998856	best: 0.1168766 (10)	total: 23.1s	remaining: 3.17s
1759:	learn: 0.0069868	test: 0.1998958	best: 0.1168766 (10)	total: 23.1s	remaining: 3.16s
1760:	learn: 0.0069724	test: 0.1998146	best: 0.1168766 (10)	total: 23.2s	remaining: 3.14s
1761:	learn: 0.0069616	test: 0.1999221	best: 0.1168766 (10)	total: 23.2s	remaining: 3.13s
1762:	learn: 0.0069593	test: 0.1998725	best: 0.1168766 (10)	total: 23.2s	remaining: 3.12s
1763:	learn: 0.0069524	test: 0.1998050	best: 0.1168766 (10)	total: 23.2s	remaining: 3.1s
1764:	learn: 0.0069457	test: 0.1998529	best: 0.1168766 (10)	total: 23.2s	remaining: 3.09s
1765:	learn: 0.0069443	test: 0.1998888	best: 0.1168766 (10)	total: 23.2s	remaining: 3.08s
1766:	learn: 0.0069391	test: 0.1998732	best: 0.1168766 (10)	total: 23.4s	remaining: 3.08s
1767:	learn: 0.0069334	test: 0.1998554	best: 0.1168766 (10)	total: 23.4s	remaining: 3.07s
1768:	learn

1856:	learn: 0.0062917	test: 0.2022997	best: 0.1168766 (10)	total: 24.6s	remaining: 1.9s
1857:	learn: 0.0062758	test: 0.2023568	best: 0.1168766 (10)	total: 24.6s	remaining: 1.88s
1858:	learn: 0.0062622	test: 0.2024294	best: 0.1168766 (10)	total: 24.6s	remaining: 1.87s
1859:	learn: 0.0062502	test: 0.2024202	best: 0.1168766 (10)	total: 24.7s	remaining: 1.85s
1860:	learn: 0.0062295	test: 0.2026530	best: 0.1168766 (10)	total: 24.7s	remaining: 1.84s
1861:	learn: 0.0062232	test: 0.2026411	best: 0.1168766 (10)	total: 24.7s	remaining: 1.83s
1862:	learn: 0.0062189	test: 0.2026212	best: 0.1168766 (10)	total: 24.7s	remaining: 1.81s
1863:	learn: 0.0062171	test: 0.2025948	best: 0.1168766 (10)	total: 24.7s	remaining: 1.8s
1864:	learn: 0.0062105	test: 0.2026988	best: 0.1168766 (10)	total: 24.7s	remaining: 1.79s
1865:	learn: 0.0062067	test: 0.2027391	best: 0.1168766 (10)	total: 24.7s	remaining: 1.77s
1866:	learn: 0.0062000	test: 0.2028139	best: 0.1168766 (10)	total: 24.7s	remaining: 1.76s
1867:	learn:

1953:	learn: 0.0055497	test: 0.2058202	best: 0.1168766 (10)	total: 25.9s	remaining: 610ms
1954:	learn: 0.0055492	test: 0.2058231	best: 0.1168766 (10)	total: 25.9s	remaining: 597ms
1955:	learn: 0.0055484	test: 0.2058327	best: 0.1168766 (10)	total: 25.9s	remaining: 583ms
1956:	learn: 0.0055477	test: 0.2058302	best: 0.1168766 (10)	total: 25.9s	remaining: 570ms
1957:	learn: 0.0055449	test: 0.2058175	best: 0.1168766 (10)	total: 26s	remaining: 557ms
1958:	learn: 0.0055329	test: 0.2058214	best: 0.1168766 (10)	total: 26s	remaining: 544ms
1959:	learn: 0.0055275	test: 0.2057768	best: 0.1168766 (10)	total: 26s	remaining: 530ms
1960:	learn: 0.0055272	test: 0.2057777	best: 0.1168766 (10)	total: 26s	remaining: 517ms
1961:	learn: 0.0054924	test: 0.2056860	best: 0.1168766 (10)	total: 26s	remaining: 504ms
1962:	learn: 0.0054828	test: 0.2057011	best: 0.1168766 (10)	total: 26s	remaining: 491ms
1963:	learn: 0.0054686	test: 0.2057507	best: 0.1168766 (10)	total: 26s	remaining: 477ms
1964:	learn: 0.0054563	t

In [400]:
ranking_training_predictions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92451 entries, 0 to 92450
Data columns (total 60 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   srch_id                      92451 non-null  object 
 1   site_id                      92451 non-null  object 
 2   visitor_location_country_id  92451 non-null  object 
 3   visitor_hist_starrating      4461 non-null   object 
 4   visitor_hist_adr_usd         4486 non-null   object 
 5   prop_country_id              92451 non-null  object 
 6   prop_id                      92451 non-null  object 
 7   prop_starrating              92451 non-null  object 
 8   prop_review_score            92300 non-null  object 
 9   prop_brand_bool              92451 non-null  object 
 10  prop_location_score1         92451 non-null  object 
 11  prop_location_score2         72046 non-null  object 
 12  prop_log_historical_price    92451 non-null  object 
 13  position        

In [401]:
relevancyclf_training_predictions.head()
ranking_training_predictions["prediction_ts"] = X_train_cutoff["date_time"]

In [402]:
ranking_training_predictions["Predictedrank"] = ranking_training_predictions.groupby("srch_id")["PredictedRelevancyScore"].rank("dense", ascending=False)
ranking_training_predictions_df=ranking_training_predictions.sort_values(by=['srch_id','position'], ascending=True)
ranking_training_predictions_df = ranking_training_predictions_df.reset_index(drop=True)

ranking_training_predictions_df[['Not_Relevant_prob','Yes_Relevant_prob','RelevanceLabelPrediction','PredictedRelevancy','PredictedScore']] = relevancyclf_training_predictions[['Not_Relevant_prob','Yes_Relevant_prob','RelevanceLabelPrediction','PredictedRelevancy','PredictedRelevancyScore']]

ranking_training_predictions_df['PredictedRelevancy'].value_counts()

Not Relevant    92443
Relevant            8
Name: PredictedRelevancy, dtype: int64

In [403]:
ranking_test_predictions["prediction_ts"] = HotelSearch_test["date_time"]

ranking_test_predictions["Predictedrank"] = ranking_test_predictions.groupby("srch_id")["PredictedRelevancyScore"].rank("dense", ascending=False)
ranking_test_predictions_df=ranking_test_predictions.sort_values(by=['srch_id','position'], ascending=True)
ranking_test_predictions_df = ranking_test_predictions_df.reset_index(drop=True)

ranking_test_predictions_df[['Not_Relevant_prob','Yes_Relevant_prob','RelevanceLabelPrediction','PredictedRelevancy','PredictedScore']] = relevancyclf_test_predictions[['Not_Relevant_prob','Yes_Relevant_prob','RelevanceLabelPrediction','PredictedRelevancy','PredictedRelevancyScore']]

ranking_test_predictions_df.head()


Unnamed: 0,srch_id,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,prop_log_historical_price,position,price_usd,promotion_flag,srch_destination_id,srch_length_of_stay,srch_booking_window,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,srch_query_affinity_score,orig_destination_distance,comp1_rate,comp1_inv,comp1_rate_percent_diff,comp2_rate,comp2_inv,comp2_rate_percent_diff,comp3_rate,comp3_inv,comp3_rate_percent_diff,comp4_rate,comp4_inv,comp4_rate_percent_diff,comp5_rate,comp5_inv,comp5_rate_percent_diff,comp6_rate,comp6_inv,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,predictionid,day,weekday,week_of_year,hour,minute,time_epoch,early_night,ActualRelevancyScore,ActualRelevancy,PredictedRelevancyScore,prediction_ts,Predictedrank,Not_Relevant_prob,Yes_Relevant_prob,RelevanceLabelPrediction,PredictedRelevancy,PredictedScore
0,246,14,100,,,219,11396,4,5.0,0,0.69,0.0551,5.36,1,159.64,0,18294,1,6,2,0,1,0,,87.17,,,,,,,,,,,,,,,,,,,,,,,,,0,4,0,10,9,55,1362390943.0,False,1.0,Relevant,0.108178,2013-03-04 09:55:43,2.0,0.830779,0.169221,0.0,Not Relevant,0.830779
1,246,14,100,,,219,103885,2,4.0,1,1.1,0.154,4.75,2,98.32,0,18294,1,6,2,0,1,0,,93.87,,,,,,,,,,,,,0.0,0.0,,,,,,,,,,,1,4,0,10,9,55,1362390943.0,False,0.0,Not Relevant,0.111394,2013-03-04 09:55:43,1.0,0.820869,0.179131,0.0,Not Relevant,0.820869
2,246,14,100,,,219,61167,3,4.5,0,1.1,0.0314,4.87,3,101.24,0,18294,1,6,2,0,1,0,,92.31,,,,,,,,,,,,,0.0,0.0,,,,,,,,,,,2,4,0,10,9,55,1362390943.0,False,0.0,Not Relevant,0.035087,2013-03-04 09:55:43,4.0,0.928381,0.071619,0.0,Not Relevant,0.928381
3,246,14,100,,,219,95490,2,3.5,1,1.1,0.0142,4.88,4,95.4,0,18294,1,6,2,0,1,0,,89.28,,,,,,,,,,,,,,,,,,,,,,,,,3,4,0,10,9,55,1362390943.0,False,0.0,Not Relevant,0.002431,2013-03-04 09:55:43,9.0,0.942449,0.057551,0.0,Not Relevant,0.942449
4,246,14,100,,,219,127213,3,4.0,1,1.1,0.1652,4.95,6,119.73,0,18294,1,6,2,0,1,0,,93.97,,,,,,,,,,,,,0.0,0.0,,,,,,,,,,,4,4,0,10,9,55,1362390943.0,False,0.0,Not Relevant,0.036583,2013-03-04 09:55:43,3.0,0.934113,0.065887,0.0,Not Relevant,0.934113


In [404]:
ranking_training_predictions_df.to_csv('./data/hotelsearch_training_predictions.csv')
ranking_test_predictions_df.to_csv('./data/hotelsearch_test_predictions.csv')

In [278]:
#relevancyclf_training_predictions.to_csv('./data/hotelsearch_clf_training.csv')
#relevancyclf_test_predictions.to_csv('./data/hotelsearch_clf_test.csv')

In [407]:
ranking_training_predictions_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92451 entries, 0 to 92450
Data columns (total 67 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   srch_id                      92451 non-null  object        
 1   site_id                      92451 non-null  object        
 2   visitor_location_country_id  92451 non-null  object        
 3   visitor_hist_starrating      4461 non-null   object        
 4   visitor_hist_adr_usd         4486 non-null   object        
 5   prop_country_id              92451 non-null  object        
 6   prop_id                      92451 non-null  object        
 7   prop_starrating              92451 non-null  object        
 8   prop_review_score            92300 non-null  object        
 9   prop_brand_bool              92451 non-null  object        
 10  prop_location_score1         92451 non-null  object        
 11  prop_location_score2         72046 non-nu