In [256]:
import pandas as pd
import numpy as np
import random
from catboost import CatBoostRanker, Pool, MetricVisualizer, CatBoostClassifier
from copy import deepcopy

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [257]:


def featurize_df(df:pd.DataFrame) ->pd.DataFrame:
    """
    Extract more features
    """
    df["weekday"] = df["date_time"].dt.weekday
    df["week_of_year"] = df["date_time"].dt.isocalendar().week

    df["hour"] = df["date_time"].dt.hour
    df["minute"] = df["date_time"].dt.minute
    ## total time elapsed - allows model to learn continous trend over time to a degree
    df["time_epoch"] = df["date_time"].astype('int64')//1e9
    ## if we were looking at fraud: df["seconds"] = df.timestamp.dt.second
    df["early_night"] = ((df["hour"]>19) | (df["hour"]<3)) # no added value from feature
    
    df["nans_count"] = df.isna().sum(axis=1)
    
    ## we won't make any time series features for now
    ## We could add time series features per property/hotel. We'd need to check for unaries, and to add a shift/offset dependant on forecast horizon

    return df


In [258]:
HotelSearch_train=pd.read_csv("./data/HotelSearch_train_sampled.csv", index_col=0)
HotelSearch_test=pd.read_csv("./data/HotelSearch_test_sampled.csv", index_col=0)



#print(HotelSearch_orig['booking_bool'].unique())
HotelSearch_train["date_time"] = pd.to_datetime(HotelSearch_train["date_time"],infer_datetime_format=True)
#HotelSearch_train["target"] = HotelSearch_train.apply(get_target,axis=1)

HotelSearch_test["date_time"] = pd.to_datetime(HotelSearch_test["date_time"],infer_datetime_format=True)
#HotelSearch_test["target"] = HotelSearch_test.apply(get_target,axis=1)

HotelSearch_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 98361 entries, 0 to 98360
Data columns (total 56 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   srch_id                      98361 non-null  int64         
 1   date_time                    98361 non-null  datetime64[ns]
 2   site_id                      98361 non-null  int64         
 3   visitor_location_country_id  98361 non-null  int64         
 4   visitor_hist_starrating      4699 non-null   float64       
 5   visitor_hist_adr_usd         4724 non-null   float64       
 6   prop_country_id              98361 non-null  int64         
 7   prop_id                      98361 non-null  int64         
 8   prop_starrating              98361 non-null  int64         
 9   prop_review_score            98201 non-null  float64       
 10  prop_brand_bool              98361 non-null  int64         
 11  prop_location_score1         98361 non-nu

In [259]:
drop_cols=[]
drop_unary_cols = [c for c in list(HotelSearch_train)
    if HotelSearch_train[c].nunique(dropna=False) <= 1]
#print(drop_unary_cols)
target_cols = ["gross_bookings_usd","click_bool","booking_bool","Unnamed: 0"] # leaky column, and original target columns
drop_cols.extend(drop_unary_cols)
drop_cols.extend(target_cols) 
print(drop_cols)
HotelSearch_train = HotelSearch_train.drop(columns=drop_cols,errors="ignore")
HotelSearch_test  = HotelSearch_test.drop(columns=drop_cols,errors="ignore")
print(HotelSearch_train.shape)
print(HotelSearch_test.shape)


['gross_bookings_usd', 'click_bool', 'booking_bool', 'Unnamed: 0']
(98361, 53)
(147730, 53)


In [260]:
HotelSearch_train = featurize_df(HotelSearch_train)
HotelSearch_test = featurize_df(HotelSearch_test)
HotelSearch_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 147730 entries, 0 to 147729
Data columns (total 60 columns):
 #   Column                       Non-Null Count   Dtype         
---  ------                       --------------   -----         
 0   srch_id                      147730 non-null  int64         
 1   date_time                    147730 non-null  datetime64[ns]
 2   site_id                      147730 non-null  int64         
 3   visitor_location_country_id  147730 non-null  int64         
 4   visitor_hist_starrating      7770 non-null    float64       
 5   visitor_hist_adr_usd         7801 non-null    float64       
 6   prop_country_id              147730 non-null  int64         
 7   prop_id                      147730 non-null  int64         
 8   prop_starrating              147730 non-null  int64         
 9   prop_review_score            147499 non-null  float64       
 10  prop_brand_bool              147730 non-null  int64         
 11  prop_location_score1      

In [261]:
HotelSearch_train.drop(['comp3_rate',
       'comp3_inv', 'comp3_rate_percent_diff', 'comp4_inv', 'comp5_rate',
       'comp5_inv', 'comp5_rate_percent_diff', 'comp8_rate', 'comp8_inv',
       'comp8_rate_percent_diff'],axis=1).groupby(HotelSearch_train["target"]>0).mean()
HotelSearch_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 98361 entries, 0 to 98360
Data columns (total 60 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   srch_id                      98361 non-null  int64         
 1   date_time                    98361 non-null  datetime64[ns]
 2   site_id                      98361 non-null  int64         
 3   visitor_location_country_id  98361 non-null  int64         
 4   visitor_hist_starrating      4699 non-null   float64       
 5   visitor_hist_adr_usd         4724 non-null   float64       
 6   prop_country_id              98361 non-null  int64         
 7   prop_id                      98361 non-null  int64         
 8   prop_starrating              98361 non-null  int64         
 9   prop_review_score            98201 non-null  float64       
 10  prop_brand_bool              98361 non-null  int64         
 11  prop_location_score1         98361 non-nu

In [262]:
cutoff_id = HotelSearch_train["srch_id"].quantile(0.94) # 90/10 split

X_train_df=HotelSearch_train.loc[HotelSearch_train.srch_id< cutoff_id].drop(["target","date_time"],axis=1)
X_eval_df = HotelSearch_train.loc[HotelSearch_train.srch_id>= cutoff_id].drop(["target","date_time"],axis=1)
X_test_df=HotelSearch_test.drop(["target","date_time"],axis=1)

feature_cols=X_train_df.columns.tolist()
# X_train_df=HotelSearch_train.loc[HotelSearch_train.srch_id< cutoff_id]
# X_eval_df = HotelSearch_train.loc[HotelSearch_train.srch_id>= cutoff_id]

y_train_df = HotelSearch_train.loc[HotelSearch_train.srch_id< cutoff_id]["target"].astype(float)
y_eval_df = HotelSearch_train.loc[HotelSearch_train.srch_id>= cutoff_id]["target"].astype(float)
y_test_df = HotelSearch_test["target"].astype(float)

X_train = X_train_df.values
X_eval = X_eval_df.values
X_test= X_test_df.values
queries_train = X_train_df["srch_id"].values

y_train = y_train_df.values
y_eval = y_eval_df.values
y_test = y_test_df.values

queries_test = X_eval_df["srch_id"].values



max_relevance = float(np.max(y_train))
y_train /= max_relevance
y_eval /= max_relevance
y_train_df.value_counts()


0.0    89823
1.0     2628
Name: target, dtype: int64

In [263]:
print("mean relevancy train",round(y_train.mean(),4))
print("mean relevancy eval",round(y_eval.mean(),4))
print(y_eval_df.value_counts()) # check we have all 3 "labels" in subset

mean relevancy train 0.0284
mean relevancy eval 0.0293
0.0    5737
1.0     173
Name: target, dtype: int64


In [264]:
HotelSearch_train['target'].value_counts()

0    95560
1     2801
Name: target, dtype: int64

In [265]:
categorical_cols = ['prop_id',"srch_destination_id", "weekday"] # ,"week_of_year"

In [266]:
X_train_df.shape

(92451, 58)

In [267]:
#set(X_train.columns).symmetric_difference(set(HotelSearch_test.columns))
train_pool = Pool(data=X_train,
                  label = y_train,
#                  cat_features=categorical_cols,
                  group_id=queries_train
                 )

eval_pool = Pool(data=X_eval,
                  label = y_eval,
#                  cat_features=categorical_cols,
                  group_id=queries_test
                 )

In [268]:
default_parameters  = {
    'iterations': 2000,
    'custom_metric': ['NDCG', "AUC:type=Ranking",'PrecisionAt:top=10', 'RecallAt:top=10', 'MAP:top=10'], # , 'AverageGain:top=3'# 'QueryRMSE', "YetiLoss" (use with hints)
    'verbose': False,
    'random_seed': 42,
#     "task_type":"GPU",
#    "has_time":True,
    "metric_period":4,
    "save_snapshot":False,
    "use_best_model":True, # requires eval set to be set
} 

default_clf_parameters  = {
    'iterations':2000,
    'random_seed':42,
    'learning_rate':0.5,
    'custom_loss':['AUC', 'Accuracy']
}    


In [313]:
def fit_model(model,loss_function, prediction_function, feature_cols,X_train,y_train,X_test,y_test,X_eval,y_eval,additional_params=None, train_pool=train_pool, test_pool=eval_pool):

    print("Model Type: Is it CatboostRanker?")
    print(isinstance(model, type(CatBoostRanker())))

    
    if isinstance(model, type(CatBoostRanker())) is True:   
        print("Model Type is CatboostRanker")
        parameters = deepcopy(default_parameters)
        parameters['loss_function'] = loss_function
        parameters['train_dir'] = loss_function
        print(parameters) 
        if additional_params is not None:
            parameters.update(additional_params)
        model = CatBoostRanker(**parameters)
        model.fit(train_pool, eval_set=test_pool, plot=True)
        print("best results (train on train):")
        print(model.get_best_score()["learn"])
        print("best results (on validation set):")
        print(model.get_best_score()["validation"])
    else:
        print("Model Type is CatboostClassifier")
        parameters = deepcopy(default_clf_parameters)
#        parameters['loss_function'] = loss_function
#        parameters['train_dir'] = loss_function
        model = CatBoostClassifier(**parameters)
        print(parameters) 
        model.fit(X_train, y_train,eval_set=(X_eval, y_eval),plot=True)        
        print("best results (train on train):")
        print(model.get_best_score()["learn"])
        print("best results (on validation set):")
        print(model.get_best_score()["validation"])
#Training predictions
    y_train_predictions = prediction_function(model, X_train)
    training_predictions = pd.DataFrame(X_train,columns=feature_cols)
    training_predictions['ActualRelevancyScore'] = y_train
    training_predictions['ActualRelevancy']=np.where(y_train==1.0,"Relevant","Not Relevant")

    if isinstance(model, type(CatBoostRanker())) is True:
        training_predictions['PredictedRelevancyScore'] = y_train_predictions
    else:
        training_predictions['Not_Relevant_prob'] = y_train_predictions[:,0]
        training_predictions['Yes_Relevant_prob'] = y_train_predictions[:,1]
        training_predictions['RelevanceLabelPrediction'] = np.argmax(y_train_predictions, axis=1).astype(np.float32)
        training_predictions['PredictedRelevancy']=np.where(training_predictions['RelevanceLabelPrediction']==1.0,"Relevant","Not Relevant")
        training_predictions['PredictedRelevancyScore'] = np.where(training_predictions['RelevanceLabelPrediction']==1.0,training_predictions['Yes_Relevant_prob'],training_predictions['Not_Relevant_prob'])
    
#Test predictions
    y_test_predictions = prediction_function(model, X_test)
    test_predictions = pd.DataFrame(X_test,columns=feature_cols)
    test_predictions['ActualRelevancyScore'] = y_test
    test_predictions['ActualRelevancy']=np.where(y_test==1.0,"Relevant","Not Relevant")

    if isinstance(model, type(CatBoostRanker())) is True:
        test_predictions['PredictedRelevancyScore'] = y_test_predictions
    else:
        test_predictions['Not_Relevant_prob'] = y_test_predictions[:,0]
        test_predictions['Yes_Relevant_prob'] = y_test_predictions[:,1]
        test_predictions['RelevanceLabelPrediction'] = np.argmax(y_test_predictions, axis=1).astype(np.float32)
        test_predictions['PredictedRelevancy']=np.where(test_predictions['RelevanceLabelPrediction']==1.0,"Relevant","Not Relevant")
        test_predictions['PredictedRelevancyScore'] = np.where(test_predictions['RelevanceLabelPrediction']==1.0,test_predictions['Yes_Relevant_prob'],test_predictions['Not_Relevant_prob'])
       

#     print("(Default) Feature importance (on train pool)")
#     display(model.get_feature_importance(data=train_pool,prettified=True).head(15))
    
#     try:
#         print("SHAP features importance, on all data:")
#         explainer = shap.TreeExplainer(model)
#         shap_values = explainer.shap_values(pd.concat([X_train,X_eval]),
#                                             y=pd.concat([y_train,y_eval]))

#         # # summarize the effects of all the features
#         shap.summary_plot(shap_values, pd.concat([X_train,X_eval]))
#    finally:
    return model, training_predictions , test_predictions  


In [314]:
def get_predicted_ranking_outcome(model, data):
#    a=model.predict(data)
#    print(a)
#    print(np.argmax(model.predict_proba(data), axis=1).astype(np.float32))
#    return np.argmax(model.predict_proba(data), axis=1).astype(np.float32)
    return model.predict(data)

In [315]:
def get_predicted_clf_outcome(model, data):
#    print(model.predict_proba(data))
#    print(np.argmax(model.predict_proba(data), axis=1).astype(np.float32))
#    return np.argmax(model.predict_proba(data), axis=1).astype(np.float32)
    return model.predict_proba(data)

In [310]:
#model = fit_model('RMSE', {'custom_metric': ['PrecisionAt:top=10', 'RecallAt:top=10', 'MAP:top=10']})
parameters = {}
ranking_model,ranking_training_predictions, ranking_test_predictions = fit_model(CatBoostRanker(),'QueryRMSE',get_predicted_ranking_outcome,feature_cols,X_train,y_train,X_test,y_test,X_eval,y_eval)


Model Type: Is it CatboostRanker?
True
Model Type is CatboostRanker
{'iterations': 2000, 'custom_metric': ['NDCG', 'AUC:type=Ranking', 'PrecisionAt:top=10', 'RecallAt:top=10', 'MAP:top=10'], 'verbose': False, 'random_seed': 42, 'metric_period': 4, 'save_snapshot': False, 'use_best_model': True, 'loss_function': 'QueryRMSE', 'train_dir': 'QueryRMSE'}


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

best results (train on train):
{'MAP:top=10': 0.37225463775116746, 'RecallAt:top=10': 0.9321943406300054, 'PrecisionAt:top=10': 0.06704770803142165, 'QueryRMSE': 0.1507495037944327}
best results (on validation set):
{'NDCG:type=Base': 0.6759827628642938, 'MAP:top=10': 0.27431559668401795, 'RecallAt:top=10': 0.8906882591093117, 'PrecisionAt:top=10': 0.06322697770066177, 'QueryRMSE': 0.16196556683244842, 'AUC:type=Ranking': 0.7718128243699502}


In [311]:
ranking_training_predictions.head()

Unnamed: 0,srch_id,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,prop_log_historical_price,position,price_usd,promotion_flag,srch_destination_id,srch_length_of_stay,srch_booking_window,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,srch_query_affinity_score,orig_destination_distance,random_bool,comp1_rate,comp1_inv,comp1_rate_percent_diff,comp2_rate,comp2_inv,comp2_rate_percent_diff,comp3_rate,comp3_inv,comp3_rate_percent_diff,comp4_rate,comp4_inv,comp4_rate_percent_diff,comp5_rate,comp5_inv,comp5_rate_percent_diff,comp6_rate,comp6_inv,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,predictionid,weekday,week_of_year,hour,minute,time_epoch,early_night,nans_count,ActualRelevancyScore,ActualRelevancy,PredictedRelevancyScore
0,71,24,216,,,98,76541,3,3.5,0,2.2,0.0482,3.21,1,16.9,0,341,1,18,1,1,1,1,,,1,,,,,,,,,,0.0,0.0,,0.0,0.0,,,,,,,,,,,0,1,11,23,14,1363130090.0,True,24,0.0,Not Relevant,0.03371
1,71,24,216,,,98,37933,3,4.0,0,2.08,0.0812,3.59,2,18.34,0,341,1,18,1,1,1,1,,,1,,,,1.0,0.0,46.0,,,,0.0,0.0,,1.0,0.0,2.0,,,,,,,,,,1,1,11,23,14,1363130090.0,True,20,0.0,Not Relevant,0.03636
2,71,24,216,,,98,108588,4,4.0,0,0.0,0.0268,4.08,3,26.78,1,341,1,18,1,1,1,1,,,1,,,,1.0,0.0,108.0,1.0,0.0,32.0,-1.0,0.0,9.0,0.0,0.0,,,,,,,,,,,2,1,11,23,14,1363130090.0,True,17,0.0,Not Relevant,0.017548
3,71,24,216,,,98,28672,3,4.0,0,2.2,0.0151,3.35,4,21.48,0,341,1,18,1,1,1,1,,,1,,,,,,,,,,0.0,0.0,,0.0,0.0,,,,,,,,,,,3,1,11,23,14,1363130090.0,True,24,0.0,Not Relevant,0.010673
4,71,24,216,,,98,25741,3,3.5,0,1.61,0.0359,4.02,6,19.47,1,341,1,18,1,1,1,1,,,1,,,,0.0,0.0,,1.0,0.0,10.0,0.0,0.0,,0.0,0.0,,,,,,,,,,,4,1,11,23,14,1363130090.0,True,19,0.0,Not Relevant,0.015136


In [316]:
parameters = {}
relevancyclf_model,relevancyclf_training_predictions, relevancyclf_test_predictions = fit_model(CatBoostClassifier(),'AUC',get_predicted_clf_outcome,feature_cols,X_train,y_train,X_test,y_test,X_eval,y_eval)


Model Type: Is it CatboostRanker?
False
Model Type is CatboostClassifier
{'iterations': 2000, 'random_seed': 42, 'learning_rate': 0.5, 'custom_loss': ['AUC', 'Accuracy']}


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.2074159	test: 0.2086029	best: 0.2086029 (0)	total: 21.6ms	remaining: 43.2s
1:	learn: 0.1266463	test: 0.1284016	best: 0.1284016 (1)	total: 41.9ms	remaining: 41.9s
2:	learn: 0.1131732	test: 0.1154236	best: 0.1154236 (2)	total: 57.6ms	remaining: 38.3s
3:	learn: 0.1097551	test: 0.1124045	best: 0.1124045 (3)	total: 72.2ms	remaining: 36s
4:	learn: 0.1081585	test: 0.1114645	best: 0.1114645 (4)	total: 85.1ms	remaining: 33.9s
5:	learn: 0.1075461	test: 0.1112727	best: 0.1112727 (5)	total: 94.5ms	remaining: 31.4s
6:	learn: 0.1071975	test: 0.1115514	best: 0.1112727 (5)	total: 105ms	remaining: 30s
7:	learn: 0.1070893	test: 0.1116050	best: 0.1112727 (5)	total: 117ms	remaining: 29.1s
8:	learn: 0.1067913	test: 0.1114940	best: 0.1112727 (5)	total: 129ms	remaining: 28.4s
9:	learn: 0.1066184	test: 0.1115702	best: 0.1112727 (5)	total: 144ms	remaining: 28.6s
10:	learn: 0.1065860	test: 0.1116119	best: 0.1112727 (5)	total: 175ms	remaining: 31.6s
11:	learn: 0.1062766	test: 0.1116563	best: 0.111272

100:	learn: 0.0900950	test: 0.1188535	best: 0.1111548 (29)	total: 1.28s	remaining: 24s
101:	learn: 0.0898852	test: 0.1189811	best: 0.1111548 (29)	total: 1.29s	remaining: 24s
102:	learn: 0.0897923	test: 0.1190640	best: 0.1111548 (29)	total: 1.3s	remaining: 23.9s
103:	learn: 0.0895697	test: 0.1191080	best: 0.1111548 (29)	total: 1.31s	remaining: 24s
104:	learn: 0.0893710	test: 0.1192001	best: 0.1111548 (29)	total: 1.33s	remaining: 24s
105:	learn: 0.0892158	test: 0.1192555	best: 0.1111548 (29)	total: 1.34s	remaining: 24s
106:	learn: 0.0891721	test: 0.1192768	best: 0.1111548 (29)	total: 1.35s	remaining: 24s
107:	learn: 0.0891051	test: 0.1193770	best: 0.1111548 (29)	total: 1.36s	remaining: 23.9s
108:	learn: 0.0890102	test: 0.1193372	best: 0.1111548 (29)	total: 1.38s	remaining: 23.9s
109:	learn: 0.0889358	test: 0.1193845	best: 0.1111548 (29)	total: 1.39s	remaining: 23.9s
110:	learn: 0.0887797	test: 0.1192519	best: 0.1111548 (29)	total: 1.4s	remaining: 23.9s
111:	learn: 0.0885945	test: 0.11926

197:	learn: 0.0763973	test: 0.1266629	best: 0.1111548 (29)	total: 2.59s	remaining: 23.6s
198:	learn: 0.0761924	test: 0.1264766	best: 0.1111548 (29)	total: 2.6s	remaining: 23.6s
199:	learn: 0.0761700	test: 0.1264600	best: 0.1111548 (29)	total: 2.61s	remaining: 23.5s
200:	learn: 0.0760609	test: 0.1261892	best: 0.1111548 (29)	total: 2.63s	remaining: 23.5s
201:	learn: 0.0759394	test: 0.1262015	best: 0.1111548 (29)	total: 2.65s	remaining: 23.5s
202:	learn: 0.0758949	test: 0.1261931	best: 0.1111548 (29)	total: 2.66s	remaining: 23.5s
203:	learn: 0.0757801	test: 0.1262527	best: 0.1111548 (29)	total: 2.67s	remaining: 23.5s
204:	learn: 0.0756094	test: 0.1262648	best: 0.1111548 (29)	total: 2.68s	remaining: 23.5s
205:	learn: 0.0754521	test: 0.1269125	best: 0.1111548 (29)	total: 2.69s	remaining: 23.4s
206:	learn: 0.0753826	test: 0.1268173	best: 0.1111548 (29)	total: 2.7s	remaining: 23.4s
207:	learn: 0.0751850	test: 0.1272420	best: 0.1111548 (29)	total: 2.72s	remaining: 23.4s
208:	learn: 0.0749630	t

295:	learn: 0.0636215	test: 0.1335956	best: 0.1111548 (29)	total: 3.92s	remaining: 22.6s
296:	learn: 0.0635636	test: 0.1338864	best: 0.1111548 (29)	total: 3.93s	remaining: 22.5s
297:	learn: 0.0634455	test: 0.1340568	best: 0.1111548 (29)	total: 3.94s	remaining: 22.5s
298:	learn: 0.0633578	test: 0.1341432	best: 0.1111548 (29)	total: 3.95s	remaining: 22.5s
299:	learn: 0.0632049	test: 0.1339054	best: 0.1111548 (29)	total: 3.96s	remaining: 22.5s
300:	learn: 0.0630716	test: 0.1340940	best: 0.1111548 (29)	total: 3.98s	remaining: 22.5s
301:	learn: 0.0628975	test: 0.1349141	best: 0.1111548 (29)	total: 3.99s	remaining: 22.5s
302:	learn: 0.0627761	test: 0.1350563	best: 0.1111548 (29)	total: 4.01s	remaining: 22.4s
303:	learn: 0.0625813	test: 0.1349438	best: 0.1111548 (29)	total: 4.02s	remaining: 22.4s
304:	learn: 0.0623094	test: 0.1352260	best: 0.1111548 (29)	total: 4.04s	remaining: 22.4s
305:	learn: 0.0621916	test: 0.1353676	best: 0.1111548 (29)	total: 4.05s	remaining: 22.4s
306:	learn: 0.0620033

394:	learn: 0.0525747	test: 0.1404694	best: 0.1111548 (29)	total: 5.24s	remaining: 21.3s
395:	learn: 0.0525237	test: 0.1406328	best: 0.1111548 (29)	total: 5.25s	remaining: 21.3s
396:	learn: 0.0524817	test: 0.1406455	best: 0.1111548 (29)	total: 5.26s	remaining: 21.2s
397:	learn: 0.0523382	test: 0.1406080	best: 0.1111548 (29)	total: 5.27s	remaining: 21.2s
398:	learn: 0.0522762	test: 0.1406466	best: 0.1111548 (29)	total: 5.28s	remaining: 21.2s
399:	learn: 0.0522423	test: 0.1406772	best: 0.1111548 (29)	total: 5.3s	remaining: 21.2s
400:	learn: 0.0521491	test: 0.1408790	best: 0.1111548 (29)	total: 5.31s	remaining: 21.2s
401:	learn: 0.0521270	test: 0.1408405	best: 0.1111548 (29)	total: 5.32s	remaining: 21.2s
402:	learn: 0.0520936	test: 0.1407973	best: 0.1111548 (29)	total: 5.33s	remaining: 21.1s
403:	learn: 0.0520063	test: 0.1408625	best: 0.1111548 (29)	total: 5.35s	remaining: 21.1s
404:	learn: 0.0518812	test: 0.1408562	best: 0.1111548 (29)	total: 5.36s	remaining: 21.1s
405:	learn: 0.0516900	

493:	learn: 0.0451098	test: 0.1447339	best: 0.1111548 (29)	total: 6.47s	remaining: 19.7s
494:	learn: 0.0449581	test: 0.1447604	best: 0.1111548 (29)	total: 6.48s	remaining: 19.7s
495:	learn: 0.0448772	test: 0.1447411	best: 0.1111548 (29)	total: 6.5s	remaining: 19.7s
496:	learn: 0.0448199	test: 0.1448915	best: 0.1111548 (29)	total: 6.52s	remaining: 19.7s
497:	learn: 0.0446733	test: 0.1447631	best: 0.1111548 (29)	total: 6.53s	remaining: 19.7s
498:	learn: 0.0446663	test: 0.1447561	best: 0.1111548 (29)	total: 6.55s	remaining: 19.7s
499:	learn: 0.0446590	test: 0.1447162	best: 0.1111548 (29)	total: 6.56s	remaining: 19.7s
500:	learn: 0.0446053	test: 0.1446981	best: 0.1111548 (29)	total: 6.57s	remaining: 19.7s
501:	learn: 0.0446011	test: 0.1446804	best: 0.1111548 (29)	total: 6.58s	remaining: 19.7s
502:	learn: 0.0445295	test: 0.1445875	best: 0.1111548 (29)	total: 6.6s	remaining: 19.6s
503:	learn: 0.0444468	test: 0.1446946	best: 0.1111548 (29)	total: 6.61s	remaining: 19.6s
504:	learn: 0.0443470	t

587:	learn: 0.0388521	test: 0.1472183	best: 0.1111548 (29)	total: 7.67s	remaining: 18.4s
588:	learn: 0.0387759	test: 0.1472384	best: 0.1111548 (29)	total: 7.69s	remaining: 18.4s
589:	learn: 0.0387532	test: 0.1473610	best: 0.1111548 (29)	total: 7.7s	remaining: 18.4s
590:	learn: 0.0387116	test: 0.1474125	best: 0.1111548 (29)	total: 7.71s	remaining: 18.4s
591:	learn: 0.0386631	test: 0.1474386	best: 0.1111548 (29)	total: 7.72s	remaining: 18.4s
592:	learn: 0.0385616	test: 0.1476208	best: 0.1111548 (29)	total: 7.73s	remaining: 18.4s
593:	learn: 0.0385251	test: 0.1475689	best: 0.1111548 (29)	total: 7.75s	remaining: 18.3s
594:	learn: 0.0384369	test: 0.1475387	best: 0.1111548 (29)	total: 7.76s	remaining: 18.3s
595:	learn: 0.0383209	test: 0.1476763	best: 0.1111548 (29)	total: 7.77s	remaining: 18.3s
596:	learn: 0.0381480	test: 0.1476178	best: 0.1111548 (29)	total: 7.79s	remaining: 18.3s
597:	learn: 0.0381395	test: 0.1476313	best: 0.1111548 (29)	total: 7.8s	remaining: 18.3s
598:	learn: 0.0380998	t

688:	learn: 0.0323731	test: 0.1530173	best: 0.1111548 (29)	total: 8.94s	remaining: 17s
689:	learn: 0.0323550	test: 0.1529698	best: 0.1111548 (29)	total: 8.95s	remaining: 17s
690:	learn: 0.0322708	test: 0.1530399	best: 0.1111548 (29)	total: 8.96s	remaining: 17s
691:	learn: 0.0321952	test: 0.1530470	best: 0.1111548 (29)	total: 8.98s	remaining: 17s
692:	learn: 0.0321744	test: 0.1536938	best: 0.1111548 (29)	total: 9s	remaining: 17s
693:	learn: 0.0321437	test: 0.1537258	best: 0.1111548 (29)	total: 9.01s	remaining: 17s
694:	learn: 0.0321320	test: 0.1537134	best: 0.1111548 (29)	total: 9.02s	remaining: 16.9s
695:	learn: 0.0320972	test: 0.1537639	best: 0.1111548 (29)	total: 9.03s	remaining: 16.9s
696:	learn: 0.0320515	test: 0.1537582	best: 0.1111548 (29)	total: 9.04s	remaining: 16.9s
697:	learn: 0.0319787	test: 0.1541005	best: 0.1111548 (29)	total: 9.06s	remaining: 16.9s
698:	learn: 0.0319522	test: 0.1541507	best: 0.1111548 (29)	total: 9.07s	remaining: 16.9s
699:	learn: 0.0318626	test: 0.154364

787:	learn: 0.0273290	test: 0.1598231	best: 0.1111548 (29)	total: 10.2s	remaining: 15.7s
788:	learn: 0.0272922	test: 0.1598046	best: 0.1111548 (29)	total: 10.2s	remaining: 15.7s
789:	learn: 0.0271895	test: 0.1599278	best: 0.1111548 (29)	total: 10.2s	remaining: 15.7s
790:	learn: 0.0271838	test: 0.1599533	best: 0.1111548 (29)	total: 10.3s	remaining: 15.7s
791:	learn: 0.0270875	test: 0.1598737	best: 0.1111548 (29)	total: 10.3s	remaining: 15.7s
792:	learn: 0.0270839	test: 0.1599127	best: 0.1111548 (29)	total: 10.3s	remaining: 15.6s
793:	learn: 0.0270408	test: 0.1599624	best: 0.1111548 (29)	total: 10.3s	remaining: 15.6s
794:	learn: 0.0269740	test: 0.1600914	best: 0.1111548 (29)	total: 10.3s	remaining: 15.6s
795:	learn: 0.0269704	test: 0.1601054	best: 0.1111548 (29)	total: 10.3s	remaining: 15.6s
796:	learn: 0.0269202	test: 0.1603166	best: 0.1111548 (29)	total: 10.3s	remaining: 15.6s
797:	learn: 0.0268875	test: 0.1603441	best: 0.1111548 (29)	total: 10.3s	remaining: 15.6s
798:	learn: 0.0267633

886:	learn: 0.0232039	test: 0.1638717	best: 0.1111548 (29)	total: 11.5s	remaining: 14.4s
887:	learn: 0.0231766	test: 0.1638569	best: 0.1111548 (29)	total: 11.5s	remaining: 14.4s
888:	learn: 0.0231212	test: 0.1639705	best: 0.1111548 (29)	total: 11.5s	remaining: 14.4s
889:	learn: 0.0230915	test: 0.1638303	best: 0.1111548 (29)	total: 11.6s	remaining: 14.4s
890:	learn: 0.0230623	test: 0.1637871	best: 0.1111548 (29)	total: 11.6s	remaining: 14.4s
891:	learn: 0.0230571	test: 0.1637667	best: 0.1111548 (29)	total: 11.6s	remaining: 14.4s
892:	learn: 0.0230037	test: 0.1636682	best: 0.1111548 (29)	total: 11.6s	remaining: 14.4s
893:	learn: 0.0229876	test: 0.1636926	best: 0.1111548 (29)	total: 11.6s	remaining: 14.4s
894:	learn: 0.0229290	test: 0.1634917	best: 0.1111548 (29)	total: 11.6s	remaining: 14.4s
895:	learn: 0.0229130	test: 0.1635456	best: 0.1111548 (29)	total: 11.6s	remaining: 14.3s
896:	learn: 0.0229057	test: 0.1649101	best: 0.1111548 (29)	total: 11.7s	remaining: 14.3s
897:	learn: 0.0228705

985:	learn: 0.0199581	test: 0.1678982	best: 0.1111548 (29)	total: 12.9s	remaining: 13.2s
986:	learn: 0.0199099	test: 0.1679027	best: 0.1111548 (29)	total: 12.9s	remaining: 13.2s
987:	learn: 0.0198459	test: 0.1681022	best: 0.1111548 (29)	total: 12.9s	remaining: 13.2s
988:	learn: 0.0198083	test: 0.1681482	best: 0.1111548 (29)	total: 12.9s	remaining: 13.2s
989:	learn: 0.0197838	test: 0.1686191	best: 0.1111548 (29)	total: 12.9s	remaining: 13.2s
990:	learn: 0.0197317	test: 0.1684724	best: 0.1111548 (29)	total: 12.9s	remaining: 13.2s
991:	learn: 0.0197142	test: 0.1685164	best: 0.1111548 (29)	total: 13s	remaining: 13.2s
992:	learn: 0.0196974	test: 0.1685298	best: 0.1111548 (29)	total: 13s	remaining: 13.2s
993:	learn: 0.0196384	test: 0.1687952	best: 0.1111548 (29)	total: 13s	remaining: 13.1s
994:	learn: 0.0196345	test: 0.1688818	best: 0.1111548 (29)	total: 13s	remaining: 13.1s
995:	learn: 0.0196020	test: 0.1689899	best: 0.1111548 (29)	total: 13s	remaining: 13.1s
996:	learn: 0.0195876	test: 0.1

1085:	learn: 0.0166701	test: 0.1736938	best: 0.1111548 (29)	total: 14.2s	remaining: 11.9s
1086:	learn: 0.0166562	test: 0.1737841	best: 0.1111548 (29)	total: 14.2s	remaining: 11.9s
1087:	learn: 0.0166388	test: 0.1739722	best: 0.1111548 (29)	total: 14.2s	remaining: 11.9s
1088:	learn: 0.0166344	test: 0.1739459	best: 0.1111548 (29)	total: 14.2s	remaining: 11.9s
1089:	learn: 0.0165930	test: 0.1739573	best: 0.1111548 (29)	total: 14.2s	remaining: 11.9s
1090:	learn: 0.0165165	test: 0.1741811	best: 0.1111548 (29)	total: 14.3s	remaining: 11.9s
1091:	learn: 0.0165148	test: 0.1741736	best: 0.1111548 (29)	total: 14.3s	remaining: 11.9s
1092:	learn: 0.0164934	test: 0.1742678	best: 0.1111548 (29)	total: 14.3s	remaining: 11.9s
1093:	learn: 0.0164604	test: 0.1742910	best: 0.1111548 (29)	total: 14.3s	remaining: 11.9s
1094:	learn: 0.0164495	test: 0.1743894	best: 0.1111548 (29)	total: 14.3s	remaining: 11.8s
1095:	learn: 0.0163696	test: 0.1747130	best: 0.1111548 (29)	total: 14.3s	remaining: 11.8s
1096:	lear

1181:	learn: 0.0141743	test: 0.1763163	best: 0.1111548 (29)	total: 15.5s	remaining: 10.7s
1182:	learn: 0.0141348	test: 0.1763317	best: 0.1111548 (29)	total: 15.5s	remaining: 10.7s
1183:	learn: 0.0141068	test: 0.1762833	best: 0.1111548 (29)	total: 15.5s	remaining: 10.7s
1184:	learn: 0.0141034	test: 0.1762827	best: 0.1111548 (29)	total: 15.5s	remaining: 10.7s
1185:	learn: 0.0140509	test: 0.1766151	best: 0.1111548 (29)	total: 15.6s	remaining: 10.7s
1186:	learn: 0.0140196	test: 0.1766965	best: 0.1111548 (29)	total: 15.6s	remaining: 10.7s
1187:	learn: 0.0139666	test: 0.1765764	best: 0.1111548 (29)	total: 15.6s	remaining: 10.7s
1188:	learn: 0.0139396	test: 0.1765503	best: 0.1111548 (29)	total: 15.6s	remaining: 10.7s
1189:	learn: 0.0138784	test: 0.1765836	best: 0.1111548 (29)	total: 15.6s	remaining: 10.6s
1190:	learn: 0.0138525	test: 0.1767652	best: 0.1111548 (29)	total: 15.7s	remaining: 10.6s
1191:	learn: 0.0138287	test: 0.1767402	best: 0.1111548 (29)	total: 15.7s	remaining: 10.6s
1192:	lear

1276:	learn: 0.0123853	test: 0.1791640	best: 0.1111548 (29)	total: 16.8s	remaining: 9.52s
1277:	learn: 0.0123672	test: 0.1793148	best: 0.1111548 (29)	total: 16.8s	remaining: 9.5s
1278:	learn: 0.0123374	test: 0.1794352	best: 0.1111548 (29)	total: 16.8s	remaining: 9.49s
1279:	learn: 0.0123191	test: 0.1794174	best: 0.1111548 (29)	total: 16.8s	remaining: 9.48s
1280:	learn: 0.0122967	test: 0.1794441	best: 0.1111548 (29)	total: 16.9s	remaining: 9.46s
1281:	learn: 0.0122919	test: 0.1794171	best: 0.1111548 (29)	total: 16.9s	remaining: 9.45s
1282:	learn: 0.0122540	test: 0.1794138	best: 0.1111548 (29)	total: 16.9s	remaining: 9.44s
1283:	learn: 0.0122525	test: 0.1794444	best: 0.1111548 (29)	total: 16.9s	remaining: 9.43s
1284:	learn: 0.0122478	test: 0.1793915	best: 0.1111548 (29)	total: 16.9s	remaining: 9.42s
1285:	learn: 0.0122281	test: 0.1794278	best: 0.1111548 (29)	total: 16.9s	remaining: 9.41s
1286:	learn: 0.0122177	test: 0.1794345	best: 0.1111548 (29)	total: 17s	remaining: 9.4s
1287:	learn: 0

1374:	learn: 0.0108421	test: 0.1818360	best: 0.1111548 (29)	total: 18.1s	remaining: 8.23s
1375:	learn: 0.0107953	test: 0.1819627	best: 0.1111548 (29)	total: 18.1s	remaining: 8.21s
1376:	learn: 0.0107916	test: 0.1819455	best: 0.1111548 (29)	total: 18.1s	remaining: 8.2s
1377:	learn: 0.0107891	test: 0.1819301	best: 0.1111548 (29)	total: 18.1s	remaining: 8.19s
1378:	learn: 0.0107561	test: 0.1820594	best: 0.1111548 (29)	total: 18.2s	remaining: 8.17s
1379:	learn: 0.0107497	test: 0.1820654	best: 0.1111548 (29)	total: 18.2s	remaining: 8.16s
1380:	learn: 0.0107053	test: 0.1821249	best: 0.1111548 (29)	total: 18.2s	remaining: 8.15s
1381:	learn: 0.0106895	test: 0.1821257	best: 0.1111548 (29)	total: 18.2s	remaining: 8.13s
1382:	learn: 0.0106674	test: 0.1821456	best: 0.1111548 (29)	total: 18.2s	remaining: 8.13s
1383:	learn: 0.0106360	test: 0.1819562	best: 0.1111548 (29)	total: 18.2s	remaining: 8.12s
1384:	learn: 0.0106230	test: 0.1818745	best: 0.1111548 (29)	total: 18.3s	remaining: 8.11s
1385:	learn

1469:	learn: 0.0095499	test: 0.1861177	best: 0.1111548 (29)	total: 19.3s	remaining: 6.97s
1470:	learn: 0.0095360	test: 0.1861027	best: 0.1111548 (29)	total: 19.4s	remaining: 6.96s
1471:	learn: 0.0095075	test: 0.1860865	best: 0.1111548 (29)	total: 19.4s	remaining: 6.95s
1472:	learn: 0.0095038	test: 0.1861129	best: 0.1111548 (29)	total: 19.4s	remaining: 6.93s
1473:	learn: 0.0094936	test: 0.1860531	best: 0.1111548 (29)	total: 19.4s	remaining: 6.92s
1474:	learn: 0.0094747	test: 0.1859991	best: 0.1111548 (29)	total: 19.4s	remaining: 6.91s
1475:	learn: 0.0094559	test: 0.1860353	best: 0.1111548 (29)	total: 19.4s	remaining: 6.89s
1476:	learn: 0.0094465	test: 0.1861648	best: 0.1111548 (29)	total: 19.4s	remaining: 6.88s
1477:	learn: 0.0094343	test: 0.1862221	best: 0.1111548 (29)	total: 19.4s	remaining: 6.87s
1478:	learn: 0.0094307	test: 0.1862639	best: 0.1111548 (29)	total: 19.5s	remaining: 6.86s
1479:	learn: 0.0094206	test: 0.1862255	best: 0.1111548 (29)	total: 19.5s	remaining: 6.85s
1480:	lear

1566:	learn: 0.0084673	test: 0.1886632	best: 0.1111548 (29)	total: 20.8s	remaining: 5.75s
1567:	learn: 0.0084650	test: 0.1887122	best: 0.1111548 (29)	total: 20.8s	remaining: 5.74s
1568:	learn: 0.0084650	test: 0.1887108	best: 0.1111548 (29)	total: 20.8s	remaining: 5.72s
1569:	learn: 0.0084546	test: 0.1886170	best: 0.1111548 (29)	total: 20.8s	remaining: 5.71s
1570:	learn: 0.0084472	test: 0.1887562	best: 0.1111548 (29)	total: 20.9s	remaining: 5.7s
1571:	learn: 0.0084229	test: 0.1887223	best: 0.1111548 (29)	total: 20.9s	remaining: 5.68s
1572:	learn: 0.0084080	test: 0.1886912	best: 0.1111548 (29)	total: 20.9s	remaining: 5.67s
1573:	learn: 0.0084002	test: 0.1887118	best: 0.1111548 (29)	total: 20.9s	remaining: 5.66s
1574:	learn: 0.0083887	test: 0.1886398	best: 0.1111548 (29)	total: 20.9s	remaining: 5.65s
1575:	learn: 0.0083783	test: 0.1890862	best: 0.1111548 (29)	total: 21s	remaining: 5.64s
1576:	learn: 0.0083599	test: 0.1893378	best: 0.1111548 (29)	total: 21s	remaining: 5.62s
1577:	learn: 0.

1661:	learn: 0.0075146	test: 0.1913149	best: 0.1111548 (29)	total: 22.2s	remaining: 4.51s
1662:	learn: 0.0075135	test: 0.1913259	best: 0.1111548 (29)	total: 22.2s	remaining: 4.5s
1663:	learn: 0.0075128	test: 0.1913367	best: 0.1111548 (29)	total: 22.2s	remaining: 4.48s
1664:	learn: 0.0075124	test: 0.1913446	best: 0.1111548 (29)	total: 22.2s	remaining: 4.47s
1665:	learn: 0.0075100	test: 0.1913870	best: 0.1111548 (29)	total: 22.2s	remaining: 4.46s
1666:	learn: 0.0075085	test: 0.1914123	best: 0.1111548 (29)	total: 22.2s	remaining: 4.44s
1667:	learn: 0.0074896	test: 0.1914608	best: 0.1111548 (29)	total: 22.3s	remaining: 4.43s
1668:	learn: 0.0074891	test: 0.1914803	best: 0.1111548 (29)	total: 22.3s	remaining: 4.42s
1669:	learn: 0.0074782	test: 0.1915461	best: 0.1111548 (29)	total: 22.3s	remaining: 4.4s
1670:	learn: 0.0074714	test: 0.1915416	best: 0.1111548 (29)	total: 22.3s	remaining: 4.39s
1671:	learn: 0.0074613	test: 0.1915942	best: 0.1111548 (29)	total: 22.3s	remaining: 4.38s
1672:	learn:

1758:	learn: 0.0067133	test: 0.1949322	best: 0.1111548 (29)	total: 23.5s	remaining: 3.21s
1759:	learn: 0.0066971	test: 0.1949691	best: 0.1111548 (29)	total: 23.5s	remaining: 3.2s
1760:	learn: 0.0066971	test: 0.1949699	best: 0.1111548 (29)	total: 23.5s	remaining: 3.19s
1761:	learn: 0.0066859	test: 0.1951971	best: 0.1111548 (29)	total: 23.5s	remaining: 3.17s
1762:	learn: 0.0066859	test: 0.1951962	best: 0.1111548 (29)	total: 23.5s	remaining: 3.16s
1763:	learn: 0.0066816	test: 0.1952402	best: 0.1111548 (29)	total: 23.5s	remaining: 3.15s
1764:	learn: 0.0066800	test: 0.1952144	best: 0.1111548 (29)	total: 23.5s	remaining: 3.13s
1765:	learn: 0.0066653	test: 0.1952916	best: 0.1111548 (29)	total: 23.5s	remaining: 3.12s
1766:	learn: 0.0066475	test: 0.1953851	best: 0.1111548 (29)	total: 23.6s	remaining: 3.11s
1767:	learn: 0.0066405	test: 0.1953842	best: 0.1111548 (29)	total: 23.6s	remaining: 3.1s
1768:	learn: 0.0066315	test: 0.1953421	best: 0.1111548 (29)	total: 23.6s	remaining: 3.08s
1769:	learn:

1855:	learn: 0.0061417	test: 0.1989281	best: 0.1111548 (29)	total: 24.7s	remaining: 1.91s
1856:	learn: 0.0061373	test: 0.1989289	best: 0.1111548 (29)	total: 24.7s	remaining: 1.9s
1857:	learn: 0.0061243	test: 0.1990998	best: 0.1111548 (29)	total: 24.7s	remaining: 1.89s
1858:	learn: 0.0061142	test: 0.1992744	best: 0.1111548 (29)	total: 24.7s	remaining: 1.87s
1859:	learn: 0.0060994	test: 0.1993282	best: 0.1111548 (29)	total: 24.7s	remaining: 1.86s
1860:	learn: 0.0060911	test: 0.1994793	best: 0.1111548 (29)	total: 24.7s	remaining: 1.85s
1861:	learn: 0.0060885	test: 0.1995329	best: 0.1111548 (29)	total: 24.7s	remaining: 1.83s
1862:	learn: 0.0060831	test: 0.1995738	best: 0.1111548 (29)	total: 24.8s	remaining: 1.82s
1863:	learn: 0.0060784	test: 0.1996236	best: 0.1111548 (29)	total: 24.8s	remaining: 1.81s
1864:	learn: 0.0060783	test: 0.1996275	best: 0.1111548 (29)	total: 24.8s	remaining: 1.79s
1865:	learn: 0.0060773	test: 0.1996162	best: 0.1111548 (29)	total: 24.8s	remaining: 1.78s
1866:	learn

1949:	learn: 0.0055738	test: 0.2006378	best: 0.1111548 (29)	total: 25.9s	remaining: 663ms
1950:	learn: 0.0055683	test: 0.2006042	best: 0.1111548 (29)	total: 25.9s	remaining: 650ms
1951:	learn: 0.0055656	test: 0.2005761	best: 0.1111548 (29)	total: 25.9s	remaining: 637ms
1952:	learn: 0.0055607	test: 0.2005894	best: 0.1111548 (29)	total: 25.9s	remaining: 623ms
1953:	learn: 0.0055596	test: 0.2005723	best: 0.1111548 (29)	total: 25.9s	remaining: 610ms
1954:	learn: 0.0055537	test: 0.2006095	best: 0.1111548 (29)	total: 25.9s	remaining: 597ms
1955:	learn: 0.0055470	test: 0.2007615	best: 0.1111548 (29)	total: 25.9s	remaining: 584ms
1956:	learn: 0.0055371	test: 0.2008954	best: 0.1111548 (29)	total: 26s	remaining: 571ms
1957:	learn: 0.0055335	test: 0.2009450	best: 0.1111548 (29)	total: 26s	remaining: 557ms
1958:	learn: 0.0055330	test: 0.2009565	best: 0.1111548 (29)	total: 26s	remaining: 544ms
1959:	learn: 0.0055275	test: 0.2009263	best: 0.1111548 (29)	total: 26s	remaining: 531ms
1960:	learn: 0.005

In [317]:
ranking_training_predictions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92451 entries, 0 to 92450
Data columns (total 61 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   srch_id                      92451 non-null  object 
 1   site_id                      92451 non-null  object 
 2   visitor_location_country_id  92451 non-null  object 
 3   visitor_hist_starrating      4461 non-null   object 
 4   visitor_hist_adr_usd         4486 non-null   object 
 5   prop_country_id              92451 non-null  object 
 6   prop_id                      92451 non-null  object 
 7   prop_starrating              92451 non-null  object 
 8   prop_review_score            92300 non-null  object 
 9   prop_brand_bool              92451 non-null  object 
 10  prop_location_score1         92451 non-null  object 
 11  prop_location_score2         72046 non-null  object 
 12  prop_log_historical_price    92451 non-null  object 
 13  position        

In [318]:
relevancyclf_training_predictions.head()

Unnamed: 0,srch_id,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,prop_log_historical_price,position,price_usd,promotion_flag,srch_destination_id,srch_length_of_stay,srch_booking_window,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,srch_query_affinity_score,orig_destination_distance,random_bool,comp1_rate,comp1_inv,comp1_rate_percent_diff,comp2_rate,comp2_inv,comp2_rate_percent_diff,comp3_rate,comp3_inv,comp3_rate_percent_diff,comp4_rate,comp4_inv,comp4_rate_percent_diff,comp5_rate,comp5_inv,comp5_rate_percent_diff,comp6_rate,comp6_inv,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,predictionid,weekday,week_of_year,hour,minute,time_epoch,early_night,nans_count,ActualRelevancyScore,ActualRelevancy,Not_Relevant_prob,Yes_Relevant_prob,RelevanceLabelPrediction,PredictedRelevancy,PredictedRelevancyScore
0,71,24,216,,,98,76541,3,3.5,0,2.2,0.0482,3.21,1,16.9,0,341,1,18,1,1,1,1,,,1,,,,,,,,,,0.0,0.0,,0.0,0.0,,,,,,,,,,,0,1,11,23,14,1363130090.0,True,24,0.0,Not Relevant,0.978216,0.021784,0.0,Not Relevant,0.978216
1,71,24,216,,,98,37933,3,4.0,0,2.08,0.0812,3.59,2,18.34,0,341,1,18,1,1,1,1,,,1,,,,1.0,0.0,46.0,,,,0.0,0.0,,1.0,0.0,2.0,,,,,,,,,,1,1,11,23,14,1363130090.0,True,20,0.0,Not Relevant,0.973249,0.026751,0.0,Not Relevant,0.973249
2,71,24,216,,,98,108588,4,4.0,0,0.0,0.0268,4.08,3,26.78,1,341,1,18,1,1,1,1,,,1,,,,1.0,0.0,108.0,1.0,0.0,32.0,-1.0,0.0,9.0,0.0,0.0,,,,,,,,,,,2,1,11,23,14,1363130090.0,True,17,0.0,Not Relevant,0.996076,0.003924,0.0,Not Relevant,0.996076
3,71,24,216,,,98,28672,3,4.0,0,2.2,0.0151,3.35,4,21.48,0,341,1,18,1,1,1,1,,,1,,,,,,,,,,0.0,0.0,,0.0,0.0,,,,,,,,,,,3,1,11,23,14,1363130090.0,True,24,0.0,Not Relevant,0.992027,0.007973,0.0,Not Relevant,0.992027
4,71,24,216,,,98,25741,3,3.5,0,1.61,0.0359,4.02,6,19.47,1,341,1,18,1,1,1,1,,,1,,,,0.0,0.0,,1.0,0.0,10.0,0.0,0.0,,0.0,0.0,,,,,,,,,,,4,1,11,23,14,1363130090.0,True,19,0.0,Not Relevant,0.976626,0.023374,0.0,Not Relevant,0.976626


In [319]:
ranking_training_predictions["Predictedrank"] = ranking_training_predictions.groupby("srch_id")["PredictedRelevancyScore"].rank("dense", ascending=False)
ranking_training_predictions_df=ranking_training_predictions.sort_values(by=['srch_id','position'], ascending=True)
ranking_training_predictions_df = ranking_training_predictions_df.reset_index(drop=True)

ranking_training_predictions_df[['Not_Relevant_prob','Yes_Relevant_prob','RelevanceLabelPrediction','PredictedRelevancy','PredictedScore']] = relevancyclf_training_predictions[['Not_Relevant_prob','Yes_Relevant_prob','RelevanceLabelPrediction','PredictedRelevancy','PredictedRelevancyScore']]

ranking_training_predictions_df['PredictedRelevancy'].value_counts()

Not Relevant    92403
Relevant           48
Name: PredictedRelevancy, dtype: int64

In [320]:
ranking_test_predictions["Predictedrank"] = ranking_test_predictions.groupby("srch_id")["PredictedRelevancyScore"].rank("dense", ascending=False)
ranking_test_predictions_df=ranking_test_predictions.sort_values(by=['srch_id','position'], ascending=True)
ranking_test_predictions_df = ranking_test_predictions_df.reset_index(drop=True)

ranking_test_predictions_df[['Not_Relevant_prob','Yes_Relevant_prob','RelevanceLabelPrediction','PredictedRelevancy','PredictedScore']] = relevancyclf_test_predictions[['Not_Relevant_prob','Yes_Relevant_prob','RelevanceLabelPrediction','PredictedRelevancy','PredictedRelevancyScore']]

ranking_test_predictions_df.head()


Unnamed: 0,srch_id,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,prop_log_historical_price,position,price_usd,promotion_flag,srch_destination_id,srch_length_of_stay,srch_booking_window,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,srch_query_affinity_score,orig_destination_distance,random_bool,comp1_rate,comp1_inv,comp1_rate_percent_diff,comp2_rate,comp2_inv,comp2_rate_percent_diff,comp3_rate,comp3_inv,comp3_rate_percent_diff,comp4_rate,comp4_inv,comp4_rate_percent_diff,comp5_rate,comp5_inv,comp5_rate_percent_diff,comp6_rate,comp6_inv,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,predictionid,weekday,week_of_year,hour,minute,time_epoch,early_night,nans_count,ActualRelevancyScore,ActualRelevancy,PredictedRelevancyScore,Predictedrank,Not_Relevant_prob,Yes_Relevant_prob,RelevanceLabelPrediction,PredictedRelevancy,PredictedScore
0,246,14,100,,,219,11396,4,5.0,0,0.69,0.0551,5.36,1,159.64,0,18294,1,6,2,0,1,0,,87.17,0,,,,,,,,,,,,,,,,,,,,,,,,,0,0,10,9,55,1362390943.0,False,27,1.0,Relevant,0.132251,1.0,0.853474,0.146526,0.0,Not Relevant,0.853474
1,246,14,100,,,219,103885,2,4.0,1,1.1,0.154,4.75,2,98.32,0,18294,1,6,2,0,1,0,,93.87,0,,,,,,,,,,,,,0.0,0.0,,,,,,,,,,,1,0,10,9,55,1362390943.0,False,25,0.0,Not Relevant,0.117169,2.0,0.834611,0.165389,0.0,Not Relevant,0.834611
2,246,14,100,,,219,61167,3,4.5,0,1.1,0.0314,4.87,3,101.24,0,18294,1,6,2,0,1,0,,92.31,0,,,,,,,,,,,,,0.0,0.0,,,,,,,,,,,2,0,10,9,55,1362390943.0,False,25,0.0,Not Relevant,0.052576,3.0,0.914844,0.085156,0.0,Not Relevant,0.914844
3,246,14,100,,,219,95490,2,3.5,1,1.1,0.0142,4.88,4,95.4,0,18294,1,6,2,0,1,0,,89.28,0,,,,,,,,,,,,,,,,,,,,,,,,,3,0,10,9,55,1362390943.0,False,27,0.0,Not Relevant,0.010497,6.0,0.940705,0.059295,0.0,Not Relevant,0.940705
4,246,14,100,,,219,127213,3,4.0,1,1.1,0.1652,4.95,6,119.73,0,18294,1,6,2,0,1,0,,93.97,0,,,,,,,,,,,,,0.0,0.0,,,,,,,,,,,4,0,10,9,55,1362390943.0,False,25,0.0,Not Relevant,0.039523,4.0,0.872,0.128,0.0,Not Relevant,0.872


In [321]:
ranking_training_predictions_df.to_csv('./data/hotelsearch_training_predictions.csv')
ranking_test_predictions_df.to_csv('./data/hotelsearch_test_predictions.csv')

In [278]:
#relevancyclf_training_predictions.to_csv('./data/hotelsearch_clf_training.csv')
#relevancyclf_test_predictions.to_csv('./data/hotelsearch_clf_test.csv')

In [279]:
ranking_training_predictions_df.shape

(92451, 64)