In [141]:
import pandas as pd
import numpy as np
import random
from catboost import CatBoostRanker, Pool, MetricVisualizer, CatBoostClassifier
from copy import deepcopy

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [142]:


def featurize_df(df:pd.DataFrame) ->pd.DataFrame:
    """
    Extract more features
    """
    df["weekday"] = df["date_time"].dt.weekday
    df["week_of_year"] = df["date_time"].dt.isocalendar().week

    df["hour"] = df["date_time"].dt.hour
    df["minute"] = df["date_time"].dt.minute
    ## total time elapsed - allows model to learn continous trend over time to a degree
    df["time_epoch"] = df["date_time"].astype('int64')//1e9
    ## if we were looking at fraud: df["seconds"] = df.timestamp.dt.second
    df["early_night"] = ((df["hour"]>19) | (df["hour"]<3)) # no added value from feature
    
    df["nans_count"] = df.isna().sum(axis=1)
    
    ## we won't make any time series features for now
    ## We could add time series features per property/hotel. We'd need to check for unaries, and to add a shift/offset dependant on forecast horizon

    return df


In [143]:
HotelSearch_train=pd.read_csv("./data/HotelSearch_train_sampled.csv", index_col=0)
HotelSearch_test=pd.read_csv("./data/HotelSearch_test_sampled.csv", index_col=0)



#print(HotelSearch_orig['booking_bool'].unique())
HotelSearch_train["date_time"] = pd.to_datetime(HotelSearch_train["date_time"],infer_datetime_format=True)
#HotelSearch_train["target"] = HotelSearch_train.apply(get_target,axis=1)

HotelSearch_test["date_time"] = pd.to_datetime(HotelSearch_test["date_time"],infer_datetime_format=True)
#HotelSearch_test["target"] = HotelSearch_test.apply(get_target,axis=1)

HotelSearch_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 98361 entries, 0 to 98360
Data columns (total 55 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   srch_id                      98361 non-null  int64         
 1   date_time                    98361 non-null  datetime64[ns]
 2   site_id                      98361 non-null  int64         
 3   visitor_location_country_id  98361 non-null  int64         
 4   visitor_hist_starrating      4699 non-null   float64       
 5   visitor_hist_adr_usd         4724 non-null   float64       
 6   prop_country_id              98361 non-null  int64         
 7   prop_id                      98361 non-null  int64         
 8   prop_starrating              98361 non-null  int64         
 9   prop_review_score            98201 non-null  float64       
 10  prop_brand_bool              98361 non-null  int64         
 11  prop_location_score1         98361 non-nu

In [144]:
drop_cols=[]
drop_unary_cols = [c for c in list(HotelSearch_train)
    if HotelSearch_train[c].nunique(dropna=False) <= 1]
drop_unary_cols
target_cols = ["gross_bookings_usd","click_bool","booking_bool","Unnamed: 0"] # leaky column, and original target columns
drop_cols.extend(drop_unary_cols)
drop_cols.extend(target_cols) 

HotelSearch_train = HotelSearch_train.drop(columns=drop_cols,errors="ignore")
HotelSearch_test  = HotelSearch_test.drop(columns=drop_cols,errors="ignore")
print(HotelSearch_train.shape)
print(HotelSearch_test.shape)


(98361, 52)
(147730, 52)


In [67]:
HotelSearch_train = featurize_df(HotelSearch_train)
HotelSearch_test = featurize_df(HotelSearch_test)
HotelSearch_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 147730 entries, 0 to 147729
Data columns (total 59 columns):
 #   Column                       Non-Null Count   Dtype         
---  ------                       --------------   -----         
 0   srch_id                      147730 non-null  int64         
 1   date_time                    147730 non-null  datetime64[ns]
 2   site_id                      147730 non-null  int64         
 3   visitor_location_country_id  147730 non-null  int64         
 4   visitor_hist_starrating      7770 non-null    float64       
 5   visitor_hist_adr_usd         7801 non-null    float64       
 6   prop_country_id              147730 non-null  int64         
 7   prop_id                      147730 non-null  int64         
 8   prop_starrating              147730 non-null  int64         
 9   prop_review_score            147499 non-null  float64       
 10  prop_brand_bool              147730 non-null  int64         
 11  prop_location_score1      

In [145]:
HotelSearch_train.drop(['comp3_rate',
       'comp3_inv', 'comp3_rate_percent_diff', 'comp4_inv', 'comp5_rate',
       'comp5_inv', 'comp5_rate_percent_diff', 'comp8_rate', 'comp8_inv',
       'comp8_rate_percent_diff'],axis=1).groupby(HotelSearch_train["target"]>0).mean()
HotelSearch_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 98361 entries, 0 to 98360
Data columns (total 52 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   srch_id                      98361 non-null  int64         
 1   date_time                    98361 non-null  datetime64[ns]
 2   site_id                      98361 non-null  int64         
 3   visitor_location_country_id  98361 non-null  int64         
 4   visitor_hist_starrating      4699 non-null   float64       
 5   visitor_hist_adr_usd         4724 non-null   float64       
 6   prop_country_id              98361 non-null  int64         
 7   prop_id                      98361 non-null  int64         
 8   prop_starrating              98361 non-null  int64         
 9   prop_review_score            98201 non-null  float64       
 10  prop_brand_bool              98361 non-null  int64         
 11  prop_location_score1         98361 non-nu

In [146]:
cutoff_id = HotelSearch_train["srch_id"].quantile(0.94) # 90/10 split

X_train_df=HotelSearch_train.loc[HotelSearch_train.srch_id< cutoff_id].drop(["target","date_time"],axis=1)
X_eval_df = HotelSearch_train.loc[HotelSearch_train.srch_id>= cutoff_id].drop(["target","date_time"],axis=1)
X_test_df=HotelSearch_test.drop(["target","date_time"],axis=1)

feature_cols=X_train_df.columns.tolist()
# X_train_df=HotelSearch_train.loc[HotelSearch_train.srch_id< cutoff_id]
# X_eval_df = HotelSearch_train.loc[HotelSearch_train.srch_id>= cutoff_id]

y_train_df = HotelSearch_train.loc[HotelSearch_train.srch_id< cutoff_id]["target"].astype(float)
y_eval_df = HotelSearch_train.loc[HotelSearch_train.srch_id>= cutoff_id]["target"].astype(float)
y_test_df = HotelSearch_test["target"].astype(float)

X_train = X_train_df.values
X_eval = X_eval_df.values
X_test= X_test_df.values
queries_train = X_train_df["srch_id"].values

y_train = y_train_df.values
y_eval = y_eval_df.values
y_test = y_test_df.values

queries_test = X_eval_df["srch_id"].values



max_relevance = float(np.max(y_train))
y_train /= max_relevance
y_eval /= max_relevance
y_train_df.value_counts()


0.0    89823
1.0     2628
Name: target, dtype: int64

In [147]:
print("mean relevancy train",round(y_train.mean(),4))
print("mean relevancy eval",round(y_eval.mean(),4))
print(y_eval_df.value_counts()) # check we have all 3 "labels" in subset

mean relevancy train 0.0284
mean relevancy eval 0.0293
0.0    5737
1.0     173
Name: target, dtype: int64


In [148]:
HotelSearch_train['target'].value_counts()

0    95560
1     2801
Name: target, dtype: int64

In [149]:
categorical_cols = ['prop_id',"srch_destination_id", "weekday"] # ,"week_of_year"

In [150]:
X_train_df.shape

(92451, 50)

In [151]:
#set(X_train.columns).symmetric_difference(set(HotelSearch_test.columns))
train_pool = Pool(data=X_train,
                  label = y_train,
#                  cat_features=categorical_cols,
                  group_id=queries_train
                 )

eval_pool = Pool(data=X_eval,
                  label = y_eval,
#                  cat_features=categorical_cols,
                  group_id=queries_test
                 )

In [172]:
default_parameters  = {
    'iterations': 2000,
    'custom_metric': ['NDCG', "AUC:type=Ranking",'PrecisionAt:top=10', 'RecallAt:top=10', 'MAP:top=10'], # , 'AverageGain:top=3'# 'QueryRMSE', "YetiLoss" (use with hints)
    'verbose': False,
    'random_seed': 42,
#     "task_type":"GPU",
#    "has_time":True,
    "metric_period":4,
    "save_snapshot":False,
    "use_best_model":True, # requires eval set to be set
} 

default_clf_parameters  = {
    'iterations':2000,
    'random_seed':42,
    'learning_rate':0.5,
    'custom_loss':['AUC', 'Accuracy']
}    


In [180]:
def fit_model(model,loss_function, prediction_function, feature_cols,X_train,y_train,X_test,y_test,X_eval,y_eval,additional_params=None, train_pool=train_pool, test_pool=eval_pool):

    print("Model Type: Is it CatboostRanker?")
    print(isinstance(model, type(CatBoostRanker())))

    
    if isinstance(model, type(CatBoostRanker())) is True:   
        print("Model Type is CatboostRanker")
        parameters = deepcopy(default_parameters)
        parameters['loss_function'] = loss_function
        parameters['train_dir'] = loss_function
        print(parameters) 
        if additional_params is not None:
            parameters.update(additional_params)
        model = CatBoostRanker(**parameters)
        model.fit(train_pool, eval_set=test_pool, plot=True)
        print("best results (train on train):")
        print(model.get_best_score()["learn"])
        print("best results (on validation set):")
        print(model.get_best_score()["validation"])
    else:
        print("Model Type is CatboostClassifier")
        parameters = deepcopy(default_clf_parameters)
#        parameters['loss_function'] = loss_function
#        parameters['train_dir'] = loss_function
        model = CatBoostClassifier(**parameters)
        print(parameters) 
        model.fit(X_train, y_train,eval_set=(X_eval, y_eval),plot=True)        
        print("best results (train on train):")
        print(model.get_best_score()["learn"])
        print("best results (on validation set):")
        print(model.get_best_score()["validation"])
#Training predictions
    y_train_predictions = prediction_function(model, X_train)
    training_predictions = pd.DataFrame(X_train,columns=feature_cols)
    training_predictions['ActualRelevancy'] = y_train
    if isinstance(model, type(CatBoostRanker())) is True:
        training_predictions['PredictedRelevancy'] = y_train_predictions
    else:
        training_predictions['Not_Relevant_prob'] = y_train_predictions[:,0]
        training_predictions['Yes_Relevant_prob'] = y_train_predictions[:,1]
        training_predictions['RelevancePrediction'] = np.argmax(y_train_predictions, axis=1).astype(np.float32)

    
#Test predictions
    y_test_predictions = prediction_function(model, X_test)
    test_predictions = pd.DataFrame(X_test,columns=feature_cols)
    test_predictions['ActualRelevancy'] = y_test
    if isinstance(model, type(CatBoostRanker())) is True:
        test_predictions['PredictedRelevancy'] = y_test_predictions
    else:
        test_predictions['Not_Relevant_prob'] = y_test_predictions[:,0]
        test_predictions['Yes_Relevant_prob'] = y_test_predictions[:,1]
        test_predictions['RelevancePrediction'] = np.argmax(y_test_predictions, axis=1).astype(np.float32)
        

#     print("(Default) Feature importance (on train pool)")
#     display(model.get_feature_importance(data=train_pool,prettified=True).head(15))
    
#     try:
#         print("SHAP features importance, on all data:")
#         explainer = shap.TreeExplainer(model)
#         shap_values = explainer.shap_values(pd.concat([X_train,X_eval]),
#                                             y=pd.concat([y_train,y_eval]))

#         # # summarize the effects of all the features
#         shap.summary_plot(shap_values, pd.concat([X_train,X_eval]))
#    finally:
    return model, training_predictions , test_predictions  


In [181]:
def get_predicted_ranking_outcome(model, data):
#    a=model.predict(data)
#    print(a)
#    print(np.argmax(model.predict_proba(data), axis=1).astype(np.float32))
#    return np.argmax(model.predict_proba(data), axis=1).astype(np.float32)
    return model.predict(data)

In [182]:
def get_predicted_clf_outcome(model, data):
#    print(model.predict_proba(data))
#    print(np.argmax(model.predict_proba(data), axis=1).astype(np.float32))
#    return np.argmax(model.predict_proba(data), axis=1).astype(np.float32)
    return model.predict_proba(data)

In [183]:
#model = fit_model('RMSE', {'custom_metric': ['PrecisionAt:top=10', 'RecallAt:top=10', 'MAP:top=10']})
parameters = {}
ranking_model,ranking_training_predictions, ranking_test_predictions = fit_model(CatBoostRanker(),'QueryRMSE',get_predicted_ranking_outcome,feature_cols,X_train,y_train,X_test,y_test,X_eval,y_eval)


Model Type: Is it CatboostRanker?
True
Model Type is CatboostRanker
{'iterations': 2000, 'custom_metric': ['NDCG', 'AUC:type=Ranking', 'PrecisionAt:top=10', 'RecallAt:top=10', 'MAP:top=10'], 'verbose': False, 'random_seed': 42, 'metric_period': 4, 'save_snapshot': False, 'use_best_model': True, 'loss_function': 'QueryRMSE', 'train_dir': 'QueryRMSE'}


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

best results (train on train):
{'MAP:top=10': 0.36678541767303113, 'RecallAt:top=10': 0.9297917778964229, 'PrecisionAt:top=10': 0.06680745175806341, 'QueryRMSE': 0.1511645318719358}
best results (on validation set):
{'NDCG:type=Base': 0.6768051001310567, 'MAP:top=10': 0.27550607287449413, 'RecallAt:top=10': 0.8825910931174089, 'PrecisionAt:top=10': 0.06241726110147149, 'QueryRMSE': 0.16184280009577853, 'AUC:type=Ranking': 0.7769342297891891}


In [184]:
parameters = {}
relevancyclf_model,relevancyclf_training_predictions, relevancyclf_test_predictions = fit_model(CatBoostClassifier(),'AUC',get_predicted_clf_outcome,feature_cols,X_train,y_train,X_test,y_test,X_eval,y_eval)


Model Type: Is it CatboostRanker?
False
Model Type is CatboostClassifier
{'iterations': 2000, 'random_seed': 42, 'learning_rate': 0.5, 'custom_loss': ['AUC', 'Accuracy']}


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.2079809	test: 0.2097731	best: 0.2097731 (0)	total: 33.6ms	remaining: 1m 7s
1:	learn: 0.1276657	test: 0.1301752	best: 0.1301752 (1)	total: 52.3ms	remaining: 52.2s
2:	learn: 0.1133685	test: 0.1168208	best: 0.1168208 (2)	total: 77.3ms	remaining: 51.4s
3:	learn: 0.1103973	test: 0.1146575	best: 0.1146575 (3)	total: 92.2ms	remaining: 46s
4:	learn: 0.1086936	test: 0.1136050	best: 0.1136050 (4)	total: 106ms	remaining: 42.3s
5:	learn: 0.1081519	test: 0.1135221	best: 0.1135221 (5)	total: 128ms	remaining: 42.7s
6:	learn: 0.1076629	test: 0.1132895	best: 0.1132895 (6)	total: 146ms	remaining: 41.5s
7:	learn: 0.1073428	test: 0.1131360	best: 0.1131360 (7)	total: 157ms	remaining: 39.2s
8:	learn: 0.1069114	test: 0.1128358	best: 0.1128358 (8)	total: 176ms	remaining: 38.9s
9:	learn: 0.1066545	test: 0.1127601	best: 0.1127601 (9)	total: 185ms	remaining: 36.8s
10:	learn: 0.1062805	test: 0.1129633	best: 0.1127601 (9)	total: 194ms	remaining: 35.1s
11:	learn: 0.1062025	test: 0.1128969	best: 0.112760

98:	learn: 0.0905342	test: 0.1229273	best: 0.1127601 (9)	total: 1.24s	remaining: 23.9s
99:	learn: 0.0903395	test: 0.1230843	best: 0.1127601 (9)	total: 1.25s	remaining: 23.8s
100:	learn: 0.0902084	test: 0.1233357	best: 0.1127601 (9)	total: 1.26s	remaining: 23.8s
101:	learn: 0.0900316	test: 0.1236091	best: 0.1127601 (9)	total: 1.27s	remaining: 23.7s
102:	learn: 0.0899681	test: 0.1235859	best: 0.1127601 (9)	total: 1.28s	remaining: 23.6s
103:	learn: 0.0898799	test: 0.1235840	best: 0.1127601 (9)	total: 1.29s	remaining: 23.6s
104:	learn: 0.0897883	test: 0.1236490	best: 0.1127601 (9)	total: 1.3s	remaining: 23.5s
105:	learn: 0.0897186	test: 0.1235401	best: 0.1127601 (9)	total: 1.31s	remaining: 23.4s
106:	learn: 0.0893627	test: 0.1235336	best: 0.1127601 (9)	total: 1.32s	remaining: 23.4s
107:	learn: 0.0891271	test: 0.1233531	best: 0.1127601 (9)	total: 1.34s	remaining: 23.4s
108:	learn: 0.0888986	test: 0.1232219	best: 0.1127601 (9)	total: 1.35s	remaining: 23.4s
109:	learn: 0.0886541	test: 0.12330

199:	learn: 0.0783341	test: 0.1340255	best: 0.1127601 (9)	total: 2.35s	remaining: 21.1s
200:	learn: 0.0782051	test: 0.1337415	best: 0.1127601 (9)	total: 2.36s	remaining: 21.1s
201:	learn: 0.0780218	test: 0.1337440	best: 0.1127601 (9)	total: 2.38s	remaining: 21.2s
202:	learn: 0.0778070	test: 0.1339572	best: 0.1127601 (9)	total: 2.39s	remaining: 21.1s
203:	learn: 0.0777326	test: 0.1339450	best: 0.1127601 (9)	total: 2.4s	remaining: 21.1s
204:	learn: 0.0776824	test: 0.1340000	best: 0.1127601 (9)	total: 2.4s	remaining: 21.1s
205:	learn: 0.0774588	test: 0.1340514	best: 0.1127601 (9)	total: 2.41s	remaining: 21s
206:	learn: 0.0773545	test: 0.1340038	best: 0.1127601 (9)	total: 2.42s	remaining: 21s
207:	learn: 0.0772436	test: 0.1335471	best: 0.1127601 (9)	total: 2.43s	remaining: 20.9s
208:	learn: 0.0772423	test: 0.1335514	best: 0.1127601 (9)	total: 2.44s	remaining: 20.9s
209:	learn: 0.0771586	test: 0.1336147	best: 0.1127601 (9)	total: 2.45s	remaining: 20.9s
210:	learn: 0.0770835	test: 0.1334358	

299:	learn: 0.0667798	test: 0.1407962	best: 0.1127601 (9)	total: 3.52s	remaining: 19.9s
300:	learn: 0.0667120	test: 0.1406789	best: 0.1127601 (9)	total: 3.53s	remaining: 19.9s
301:	learn: 0.0667053	test: 0.1406749	best: 0.1127601 (9)	total: 3.54s	remaining: 19.9s
302:	learn: 0.0665691	test: 0.1406218	best: 0.1127601 (9)	total: 3.55s	remaining: 19.9s
303:	learn: 0.0663720	test: 0.1408439	best: 0.1127601 (9)	total: 3.56s	remaining: 19.9s
304:	learn: 0.0663424	test: 0.1408908	best: 0.1127601 (9)	total: 3.58s	remaining: 19.9s
305:	learn: 0.0663211	test: 0.1408533	best: 0.1127601 (9)	total: 3.59s	remaining: 19.9s
306:	learn: 0.0662965	test: 0.1408419	best: 0.1127601 (9)	total: 3.6s	remaining: 19.9s
307:	learn: 0.0662048	test: 0.1407904	best: 0.1127601 (9)	total: 3.61s	remaining: 19.8s
308:	learn: 0.0661519	test: 0.1407199	best: 0.1127601 (9)	total: 3.62s	remaining: 19.8s
309:	learn: 0.0661198	test: 0.1407130	best: 0.1127601 (9)	total: 3.64s	remaining: 19.8s
310:	learn: 0.0660212	test: 0.140

397:	learn: 0.0573560	test: 0.1446009	best: 0.1127601 (9)	total: 4.65s	remaining: 18.7s
398:	learn: 0.0572646	test: 0.1446607	best: 0.1127601 (9)	total: 4.66s	remaining: 18.7s
399:	learn: 0.0571430	test: 0.1447577	best: 0.1127601 (9)	total: 4.67s	remaining: 18.7s
400:	learn: 0.0571225	test: 0.1447645	best: 0.1127601 (9)	total: 4.68s	remaining: 18.7s
401:	learn: 0.0569897	test: 0.1447232	best: 0.1127601 (9)	total: 4.7s	remaining: 18.7s
402:	learn: 0.0568593	test: 0.1447133	best: 0.1127601 (9)	total: 4.71s	remaining: 18.7s
403:	learn: 0.0567660	test: 0.1447641	best: 0.1127601 (9)	total: 4.73s	remaining: 18.7s
404:	learn: 0.0566877	test: 0.1452144	best: 0.1127601 (9)	total: 4.74s	remaining: 18.7s
405:	learn: 0.0566428	test: 0.1451339	best: 0.1127601 (9)	total: 4.75s	remaining: 18.7s
406:	learn: 0.0565051	test: 0.1451312	best: 0.1127601 (9)	total: 4.77s	remaining: 18.7s
407:	learn: 0.0563758	test: 0.1453648	best: 0.1127601 (9)	total: 4.78s	remaining: 18.7s
408:	learn: 0.0562600	test: 0.145

492:	learn: 0.0489534	test: 0.1508278	best: 0.1127601 (9)	total: 5.75s	remaining: 17.6s
493:	learn: 0.0488952	test: 0.1507576	best: 0.1127601 (9)	total: 5.76s	remaining: 17.6s
494:	learn: 0.0488114	test: 0.1508742	best: 0.1127601 (9)	total: 5.77s	remaining: 17.5s
495:	learn: 0.0486833	test: 0.1509717	best: 0.1127601 (9)	total: 5.78s	remaining: 17.5s
496:	learn: 0.0486250	test: 0.1510768	best: 0.1127601 (9)	total: 5.79s	remaining: 17.5s
497:	learn: 0.0484594	test: 0.1512444	best: 0.1127601 (9)	total: 5.8s	remaining: 17.5s
498:	learn: 0.0484554	test: 0.1512512	best: 0.1127601 (9)	total: 5.81s	remaining: 17.5s
499:	learn: 0.0483981	test: 0.1513048	best: 0.1127601 (9)	total: 5.82s	remaining: 17.5s
500:	learn: 0.0483867	test: 0.1513046	best: 0.1127601 (9)	total: 5.84s	remaining: 17.5s
501:	learn: 0.0483685	test: 0.1513112	best: 0.1127601 (9)	total: 5.85s	remaining: 17.5s
502:	learn: 0.0482331	test: 0.1514813	best: 0.1127601 (9)	total: 5.86s	remaining: 17.4s
503:	learn: 0.0481478	test: 0.151

589:	learn: 0.0416547	test: 0.1567216	best: 0.1127601 (9)	total: 6.87s	remaining: 16.4s
590:	learn: 0.0415423	test: 0.1565013	best: 0.1127601 (9)	total: 6.88s	remaining: 16.4s
591:	learn: 0.0415384	test: 0.1565106	best: 0.1127601 (9)	total: 6.89s	remaining: 16.4s
592:	learn: 0.0414584	test: 0.1563805	best: 0.1127601 (9)	total: 6.9s	remaining: 16.4s
593:	learn: 0.0413876	test: 0.1563197	best: 0.1127601 (9)	total: 6.91s	remaining: 16.4s
594:	learn: 0.0413603	test: 0.1563620	best: 0.1127601 (9)	total: 6.92s	remaining: 16.3s
595:	learn: 0.0412876	test: 0.1565604	best: 0.1127601 (9)	total: 6.94s	remaining: 16.3s
596:	learn: 0.0412258	test: 0.1567612	best: 0.1127601 (9)	total: 6.95s	remaining: 16.3s
597:	learn: 0.0410906	test: 0.1568984	best: 0.1127601 (9)	total: 6.96s	remaining: 16.3s
598:	learn: 0.0410543	test: 0.1569699	best: 0.1127601 (9)	total: 6.98s	remaining: 16.3s
599:	learn: 0.0410117	test: 0.1569909	best: 0.1127601 (9)	total: 6.99s	remaining: 16.3s
600:	learn: 0.0409909	test: 0.157

687:	learn: 0.0354845	test: 0.1619069	best: 0.1127601 (9)	total: 8.04s	remaining: 15.3s
688:	learn: 0.0353913	test: 0.1617210	best: 0.1127601 (9)	total: 8.06s	remaining: 15.3s
689:	learn: 0.0352652	test: 0.1617072	best: 0.1127601 (9)	total: 8.07s	remaining: 15.3s
690:	learn: 0.0352592	test: 0.1617595	best: 0.1127601 (9)	total: 8.08s	remaining: 15.3s
691:	learn: 0.0351757	test: 0.1621083	best: 0.1127601 (9)	total: 8.09s	remaining: 15.3s
692:	learn: 0.0350866	test: 0.1622502	best: 0.1127601 (9)	total: 8.1s	remaining: 15.3s
693:	learn: 0.0350417	test: 0.1621154	best: 0.1127601 (9)	total: 8.12s	remaining: 15.3s
694:	learn: 0.0350077	test: 0.1621131	best: 0.1127601 (9)	total: 8.13s	remaining: 15.3s
695:	learn: 0.0349292	test: 0.1622021	best: 0.1127601 (9)	total: 8.14s	remaining: 15.3s
696:	learn: 0.0348487	test: 0.1621529	best: 0.1127601 (9)	total: 8.16s	remaining: 15.3s
697:	learn: 0.0347823	test: 0.1620661	best: 0.1127601 (9)	total: 8.17s	remaining: 15.2s
698:	learn: 0.0346974	test: 0.162

784:	learn: 0.0292920	test: 0.1688535	best: 0.1127601 (9)	total: 9.25s	remaining: 14.3s
785:	learn: 0.0292837	test: 0.1687711	best: 0.1127601 (9)	total: 9.26s	remaining: 14.3s
786:	learn: 0.0292490	test: 0.1687020	best: 0.1127601 (9)	total: 9.27s	remaining: 14.3s
787:	learn: 0.0292453	test: 0.1686934	best: 0.1127601 (9)	total: 9.28s	remaining: 14.3s
788:	learn: 0.0291624	test: 0.1690306	best: 0.1127601 (9)	total: 9.3s	remaining: 14.3s
789:	learn: 0.0291415	test: 0.1690162	best: 0.1127601 (9)	total: 9.31s	remaining: 14.3s
790:	learn: 0.0290677	test: 0.1688638	best: 0.1127601 (9)	total: 9.32s	remaining: 14.2s
791:	learn: 0.0290067	test: 0.1689526	best: 0.1127601 (9)	total: 9.34s	remaining: 14.2s
792:	learn: 0.0289800	test: 0.1689056	best: 0.1127601 (9)	total: 9.35s	remaining: 14.2s
793:	learn: 0.0289437	test: 0.1689983	best: 0.1127601 (9)	total: 9.36s	remaining: 14.2s
794:	learn: 0.0288881	test: 0.1690370	best: 0.1127601 (9)	total: 9.37s	remaining: 14.2s
795:	learn: 0.0288419	test: 0.169

881:	learn: 0.0254960	test: 0.1745222	best: 0.1127601 (9)	total: 10.4s	remaining: 13.2s
882:	learn: 0.0254931	test: 0.1745371	best: 0.1127601 (9)	total: 10.4s	remaining: 13.2s
883:	learn: 0.0254413	test: 0.1745496	best: 0.1127601 (9)	total: 10.5s	remaining: 13.2s
884:	learn: 0.0253886	test: 0.1746090	best: 0.1127601 (9)	total: 10.5s	remaining: 13.2s
885:	learn: 0.0253120	test: 0.1747225	best: 0.1127601 (9)	total: 10.5s	remaining: 13.2s
886:	learn: 0.0251852	test: 0.1745478	best: 0.1127601 (9)	total: 10.5s	remaining: 13.2s
887:	learn: 0.0251736	test: 0.1745388	best: 0.1127601 (9)	total: 10.5s	remaining: 13.2s
888:	learn: 0.0251585	test: 0.1745681	best: 0.1127601 (9)	total: 10.5s	remaining: 13.2s
889:	learn: 0.0251354	test: 0.1745769	best: 0.1127601 (9)	total: 10.5s	remaining: 13.2s
890:	learn: 0.0251005	test: 0.1744837	best: 0.1127601 (9)	total: 10.6s	remaining: 13.2s
891:	learn: 0.0250716	test: 0.1744610	best: 0.1127601 (9)	total: 10.6s	remaining: 13.2s
892:	learn: 0.0249896	test: 0.17

977:	learn: 0.0219537	test: 0.1790238	best: 0.1127601 (9)	total: 11.6s	remaining: 12.1s
978:	learn: 0.0219168	test: 0.1791501	best: 0.1127601 (9)	total: 11.6s	remaining: 12.1s
979:	learn: 0.0219047	test: 0.1790495	best: 0.1127601 (9)	total: 11.6s	remaining: 12.1s
980:	learn: 0.0218660	test: 0.1791643	best: 0.1127601 (9)	total: 11.7s	remaining: 12.1s
981:	learn: 0.0218636	test: 0.1791631	best: 0.1127601 (9)	total: 11.7s	remaining: 12.1s
982:	learn: 0.0218177	test: 0.1790940	best: 0.1127601 (9)	total: 11.7s	remaining: 12.1s
983:	learn: 0.0217860	test: 0.1790642	best: 0.1127601 (9)	total: 11.7s	remaining: 12.1s
984:	learn: 0.0217319	test: 0.1790608	best: 0.1127601 (9)	total: 11.7s	remaining: 12.1s
985:	learn: 0.0216795	test: 0.1790773	best: 0.1127601 (9)	total: 11.7s	remaining: 12.1s
986:	learn: 0.0216609	test: 0.1791558	best: 0.1127601 (9)	total: 11.7s	remaining: 12.1s
987:	learn: 0.0216213	test: 0.1792587	best: 0.1127601 (9)	total: 11.8s	remaining: 12s
988:	learn: 0.0215946	test: 0.1794

1075:	learn: 0.0186460	test: 0.1822870	best: 0.1127601 (9)	total: 12.8s	remaining: 11s
1076:	learn: 0.0186136	test: 0.1823128	best: 0.1127601 (9)	total: 12.8s	remaining: 11s
1077:	learn: 0.0185908	test: 0.1822600	best: 0.1127601 (9)	total: 12.8s	remaining: 11s
1078:	learn: 0.0185513	test: 0.1823135	best: 0.1127601 (9)	total: 12.8s	remaining: 10.9s
1079:	learn: 0.0185005	test: 0.1832130	best: 0.1127601 (9)	total: 12.8s	remaining: 10.9s
1080:	learn: 0.0184732	test: 0.1832320	best: 0.1127601 (9)	total: 12.8s	remaining: 10.9s
1081:	learn: 0.0184573	test: 0.1832568	best: 0.1127601 (9)	total: 12.9s	remaining: 10.9s
1082:	learn: 0.0184349	test: 0.1833920	best: 0.1127601 (9)	total: 12.9s	remaining: 10.9s
1083:	learn: 0.0183931	test: 0.1834183	best: 0.1127601 (9)	total: 12.9s	remaining: 10.9s
1084:	learn: 0.0183671	test: 0.1835300	best: 0.1127601 (9)	total: 12.9s	remaining: 10.9s
1085:	learn: 0.0183436	test: 0.1834189	best: 0.1127601 (9)	total: 12.9s	remaining: 10.9s
1086:	learn: 0.0183315	test

1175:	learn: 0.0161647	test: 0.1864893	best: 0.1127601 (9)	total: 14s	remaining: 9.78s
1176:	learn: 0.0161641	test: 0.1864862	best: 0.1127601 (9)	total: 14s	remaining: 9.76s
1177:	learn: 0.0161484	test: 0.1864873	best: 0.1127601 (9)	total: 14s	remaining: 9.75s
1178:	learn: 0.0161344	test: 0.1864833	best: 0.1127601 (9)	total: 14s	remaining: 9.74s
1179:	learn: 0.0161095	test: 0.1865018	best: 0.1127601 (9)	total: 14s	remaining: 9.73s
1180:	learn: 0.0160992	test: 0.1865339	best: 0.1127601 (9)	total: 14s	remaining: 9.72s
1181:	learn: 0.0160387	test: 0.1864821	best: 0.1127601 (9)	total: 14s	remaining: 9.71s
1182:	learn: 0.0160208	test: 0.1864588	best: 0.1127601 (9)	total: 14s	remaining: 9.69s
1183:	learn: 0.0159969	test: 0.1865232	best: 0.1127601 (9)	total: 14s	remaining: 9.68s
1184:	learn: 0.0159792	test: 0.1864178	best: 0.1127601 (9)	total: 14.1s	remaining: 9.67s
1185:	learn: 0.0159590	test: 0.1864989	best: 0.1127601 (9)	total: 14.1s	remaining: 9.66s
1186:	learn: 0.0159214	test: 0.1865119	

1272:	learn: 0.0141701	test: 0.1904066	best: 0.1127601 (9)	total: 15.2s	remaining: 8.67s
1273:	learn: 0.0141532	test: 0.1904123	best: 0.1127601 (9)	total: 15.2s	remaining: 8.65s
1274:	learn: 0.0141108	test: 0.1905563	best: 0.1127601 (9)	total: 15.2s	remaining: 8.64s
1275:	learn: 0.0140812	test: 0.1908074	best: 0.1127601 (9)	total: 15.2s	remaining: 8.63s
1276:	learn: 0.0140719	test: 0.1908616	best: 0.1127601 (9)	total: 15.2s	remaining: 8.62s
1277:	learn: 0.0140561	test: 0.1909212	best: 0.1127601 (9)	total: 15.2s	remaining: 8.61s
1278:	learn: 0.0140273	test: 0.1909264	best: 0.1127601 (9)	total: 15.2s	remaining: 8.6s
1279:	learn: 0.0140173	test: 0.1909610	best: 0.1127601 (9)	total: 15.3s	remaining: 8.58s
1280:	learn: 0.0140161	test: 0.1909463	best: 0.1127601 (9)	total: 15.3s	remaining: 8.57s
1281:	learn: 0.0139989	test: 0.1913262	best: 0.1127601 (9)	total: 15.3s	remaining: 8.56s
1282:	learn: 0.0139989	test: 0.1913240	best: 0.1127601 (9)	total: 15.3s	remaining: 8.54s
1283:	learn: 0.0139572

1369:	learn: 0.0123186	test: 0.1954754	best: 0.1127601 (9)	total: 16.3s	remaining: 7.5s
1370:	learn: 0.0123092	test: 0.1957687	best: 0.1127601 (9)	total: 16.3s	remaining: 7.49s
1371:	learn: 0.0123066	test: 0.1957634	best: 0.1127601 (9)	total: 16.3s	remaining: 7.48s
1372:	learn: 0.0123040	test: 0.1957619	best: 0.1127601 (9)	total: 16.3s	remaining: 7.46s
1373:	learn: 0.0123035	test: 0.1957833	best: 0.1127601 (9)	total: 16.4s	remaining: 7.45s
1374:	learn: 0.0122985	test: 0.1957392	best: 0.1127601 (9)	total: 16.4s	remaining: 7.44s
1375:	learn: 0.0122763	test: 0.1958729	best: 0.1127601 (9)	total: 16.4s	remaining: 7.43s
1376:	learn: 0.0122684	test: 0.1960028	best: 0.1127601 (9)	total: 16.4s	remaining: 7.42s
1377:	learn: 0.0122474	test: 0.1960241	best: 0.1127601 (9)	total: 16.4s	remaining: 7.41s
1378:	learn: 0.0122320	test: 0.1960383	best: 0.1127601 (9)	total: 16.4s	remaining: 7.39s
1379:	learn: 0.0122299	test: 0.1960220	best: 0.1127601 (9)	total: 16.4s	remaining: 7.38s
1380:	learn: 0.0122259

1466:	learn: 0.0107340	test: 0.1996403	best: 0.1127601 (9)	total: 17.4s	remaining: 6.34s
1467:	learn: 0.0107338	test: 0.1996446	best: 0.1127601 (9)	total: 17.4s	remaining: 6.32s
1468:	learn: 0.0107085	test: 0.1997821	best: 0.1127601 (9)	total: 17.5s	remaining: 6.31s
1469:	learn: 0.0106974	test: 0.1997981	best: 0.1127601 (9)	total: 17.5s	remaining: 6.3s
1470:	learn: 0.0106813	test: 0.1997217	best: 0.1127601 (9)	total: 17.5s	remaining: 6.29s
1471:	learn: 0.0106585	test: 0.1998407	best: 0.1127601 (9)	total: 17.5s	remaining: 6.28s
1472:	learn: 0.0106478	test: 0.1998406	best: 0.1127601 (9)	total: 17.5s	remaining: 6.27s
1473:	learn: 0.0106452	test: 0.2001696	best: 0.1127601 (9)	total: 17.5s	remaining: 6.26s
1474:	learn: 0.0106235	test: 0.2004702	best: 0.1127601 (9)	total: 17.6s	remaining: 6.25s
1475:	learn: 0.0105987	test: 0.2005677	best: 0.1127601 (9)	total: 17.6s	remaining: 6.24s
1476:	learn: 0.0105805	test: 0.2006834	best: 0.1127601 (9)	total: 17.6s	remaining: 6.23s
1477:	learn: 0.0105500

1565:	learn: 0.0094793	test: 0.2033499	best: 0.1127601 (9)	total: 18.7s	remaining: 5.18s
1566:	learn: 0.0094787	test: 0.2033440	best: 0.1127601 (9)	total: 18.7s	remaining: 5.17s
1567:	learn: 0.0094724	test: 0.2034729	best: 0.1127601 (9)	total: 18.7s	remaining: 5.15s
1568:	learn: 0.0094546	test: 0.2036029	best: 0.1127601 (9)	total: 18.7s	remaining: 5.14s
1569:	learn: 0.0094356	test: 0.2035852	best: 0.1127601 (9)	total: 18.7s	remaining: 5.13s
1570:	learn: 0.0094295	test: 0.2036053	best: 0.1127601 (9)	total: 18.7s	remaining: 5.12s
1571:	learn: 0.0094117	test: 0.2037770	best: 0.1127601 (9)	total: 18.8s	remaining: 5.11s
1572:	learn: 0.0094084	test: 0.2037439	best: 0.1127601 (9)	total: 18.8s	remaining: 5.09s
1573:	learn: 0.0094078	test: 0.2037301	best: 0.1127601 (9)	total: 18.8s	remaining: 5.08s
1574:	learn: 0.0094051	test: 0.2037063	best: 0.1127601 (9)	total: 18.8s	remaining: 5.07s
1575:	learn: 0.0094025	test: 0.2037491	best: 0.1127601 (9)	total: 18.8s	remaining: 5.06s
1576:	learn: 0.009401

1661:	learn: 0.0084202	test: 0.2055464	best: 0.1127601 (9)	total: 19.9s	remaining: 4.04s
1662:	learn: 0.0084176	test: 0.2055359	best: 0.1127601 (9)	total: 19.9s	remaining: 4.03s
1663:	learn: 0.0084036	test: 0.2056442	best: 0.1127601 (9)	total: 19.9s	remaining: 4.01s
1664:	learn: 0.0083900	test: 0.2056645	best: 0.1127601 (9)	total: 19.9s	remaining: 4s
1665:	learn: 0.0083835	test: 0.2056257	best: 0.1127601 (9)	total: 19.9s	remaining: 3.99s
1666:	learn: 0.0083818	test: 0.2056691	best: 0.1127601 (9)	total: 19.9s	remaining: 3.98s
1667:	learn: 0.0083748	test: 0.2057298	best: 0.1127601 (9)	total: 19.9s	remaining: 3.97s
1668:	learn: 0.0083570	test: 0.2057959	best: 0.1127601 (9)	total: 19.9s	remaining: 3.96s
1669:	learn: 0.0083536	test: 0.2057907	best: 0.1127601 (9)	total: 20s	remaining: 3.94s
1670:	learn: 0.0083467	test: 0.2057960	best: 0.1127601 (9)	total: 20s	remaining: 3.93s
1671:	learn: 0.0083431	test: 0.2057973	best: 0.1127601 (9)	total: 20s	remaining: 3.92s
1672:	learn: 0.0083353	test: 0

1756:	learn: 0.0074956	test: 0.2081951	best: 0.1127601 (9)	total: 21.2s	remaining: 2.92s
1757:	learn: 0.0074904	test: 0.2082117	best: 0.1127601 (9)	total: 21.2s	remaining: 2.91s
1758:	learn: 0.0074791	test: 0.2082449	best: 0.1127601 (9)	total: 21.2s	remaining: 2.9s
1759:	learn: 0.0074765	test: 0.2082952	best: 0.1127601 (9)	total: 21.2s	remaining: 2.89s
1760:	learn: 0.0074764	test: 0.2082951	best: 0.1127601 (9)	total: 21.2s	remaining: 2.88s
1761:	learn: 0.0074458	test: 0.2084301	best: 0.1127601 (9)	total: 21.2s	remaining: 2.87s
1762:	learn: 0.0074411	test: 0.2084001	best: 0.1127601 (9)	total: 21.3s	remaining: 2.86s
1763:	learn: 0.0074170	test: 0.2084784	best: 0.1127601 (9)	total: 21.3s	remaining: 2.85s
1764:	learn: 0.0074148	test: 0.2084418	best: 0.1127601 (9)	total: 21.3s	remaining: 2.83s
1765:	learn: 0.0073982	test: 0.2085584	best: 0.1127601 (9)	total: 21.3s	remaining: 2.82s
1766:	learn: 0.0073971	test: 0.2085524	best: 0.1127601 (9)	total: 21.3s	remaining: 2.81s
1767:	learn: 0.0073820

1858:	learn: 0.0066522	test: 0.2137375	best: 0.1127601 (9)	total: 23s	remaining: 1.75s
1859:	learn: 0.0066456	test: 0.2137370	best: 0.1127601 (9)	total: 23s	remaining: 1.73s
1860:	learn: 0.0066437	test: 0.2137242	best: 0.1127601 (9)	total: 23s	remaining: 1.72s
1861:	learn: 0.0066361	test: 0.2138061	best: 0.1127601 (9)	total: 23.1s	remaining: 1.71s
1862:	learn: 0.0066248	test: 0.2137564	best: 0.1127601 (9)	total: 23.1s	remaining: 1.7s
1863:	learn: 0.0066183	test: 0.2138215	best: 0.1127601 (9)	total: 23.1s	remaining: 1.68s
1864:	learn: 0.0066090	test: 0.2138731	best: 0.1127601 (9)	total: 23.1s	remaining: 1.67s
1865:	learn: 0.0065975	test: 0.2140673	best: 0.1127601 (9)	total: 23.1s	remaining: 1.66s
1866:	learn: 0.0065928	test: 0.2140339	best: 0.1127601 (9)	total: 23.1s	remaining: 1.65s
1867:	learn: 0.0065888	test: 0.2140677	best: 0.1127601 (9)	total: 23.1s	remaining: 1.64s
1868:	learn: 0.0065882	test: 0.2140377	best: 0.1127601 (9)	total: 23.1s	remaining: 1.62s
1869:	learn: 0.0065879	test:

1955:	learn: 0.0060659	test: 0.2151805	best: 0.1127601 (9)	total: 24.2s	remaining: 544ms
1956:	learn: 0.0060528	test: 0.2150181	best: 0.1127601 (9)	total: 24.2s	remaining: 532ms
1957:	learn: 0.0060453	test: 0.2152166	best: 0.1127601 (9)	total: 24.2s	remaining: 520ms
1958:	learn: 0.0060409	test: 0.2153197	best: 0.1127601 (9)	total: 24.2s	remaining: 507ms
1959:	learn: 0.0060407	test: 0.2153363	best: 0.1127601 (9)	total: 24.2s	remaining: 495ms
1960:	learn: 0.0060380	test: 0.2153799	best: 0.1127601 (9)	total: 24.3s	remaining: 482ms
1961:	learn: 0.0060301	test: 0.2154444	best: 0.1127601 (9)	total: 24.3s	remaining: 470ms
1962:	learn: 0.0060192	test: 0.2155113	best: 0.1127601 (9)	total: 24.3s	remaining: 458ms
1963:	learn: 0.0060149	test: 0.2155314	best: 0.1127601 (9)	total: 24.3s	remaining: 445ms
1964:	learn: 0.0060040	test: 0.2155377	best: 0.1127601 (9)	total: 24.3s	remaining: 433ms
1965:	learn: 0.0060019	test: 0.2155250	best: 0.1127601 (9)	total: 24.3s	remaining: 420ms
1966:	learn: 0.005997

In [185]:
relevancyclf_training_predictions.head()

Unnamed: 0,srch_id,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,prop_log_historical_price,position,price_usd,promotion_flag,srch_destination_id,srch_length_of_stay,srch_booking_window,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,srch_query_affinity_score,orig_destination_distance,random_bool,comp1_rate,comp1_inv,comp1_rate_percent_diff,comp2_rate,comp2_inv,comp2_rate_percent_diff,comp3_rate,comp3_inv,comp3_rate_percent_diff,comp4_rate,comp4_inv,comp4_rate_percent_diff,comp5_rate,comp5_inv,comp5_rate_percent_diff,comp6_rate,comp6_inv,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,ActualRelevancy,Not_Relevant_prob,Yes_Relevant_prob,RelevancePrediction
0,71.0,24.0,216.0,,,98.0,76541.0,3.0,3.5,0.0,2.2,0.0482,3.21,1.0,16.9,0.0,341.0,1.0,18.0,1.0,1.0,1.0,1.0,,,1.0,,,,,,,,,,0.0,0.0,,0.0,0.0,,,,,,,,,,,0.0,0.960739,0.039261,0.0
1,71.0,24.0,216.0,,,98.0,37933.0,3.0,4.0,0.0,2.08,0.0812,3.59,2.0,18.34,0.0,341.0,1.0,18.0,1.0,1.0,1.0,1.0,,,1.0,,,,1.0,0.0,46.0,,,,0.0,0.0,,1.0,0.0,2.0,,,,,,,,,,0.0,0.974633,0.025367,0.0
2,71.0,24.0,216.0,,,98.0,108588.0,4.0,4.0,0.0,0.0,0.0268,4.08,3.0,26.78,1.0,341.0,1.0,18.0,1.0,1.0,1.0,1.0,,,1.0,,,,1.0,0.0,108.0,1.0,0.0,32.0,-1.0,0.0,9.0,0.0,0.0,,,,,,,,,,,0.0,0.980494,0.019506,0.0
3,71.0,24.0,216.0,,,98.0,28672.0,3.0,4.0,0.0,2.2,0.0151,3.35,4.0,21.48,0.0,341.0,1.0,18.0,1.0,1.0,1.0,1.0,,,1.0,,,,,,,,,,0.0,0.0,,0.0,0.0,,,,,,,,,,,0.0,0.983915,0.016085,0.0
4,71.0,24.0,216.0,,,98.0,25741.0,3.0,3.5,0.0,1.61,0.0359,4.02,6.0,19.47,1.0,341.0,1.0,18.0,1.0,1.0,1.0,1.0,,,1.0,,,,0.0,0.0,,1.0,0.0,10.0,0.0,0.0,,0.0,0.0,,,,,,,,,,,0.0,0.987769,0.012231,0.0


In [186]:
ranking_training_predictions["Predictedrank"] = ranking_training_predictions.groupby("srch_id")["PredictedRelevancy"].rank("dense", ascending=False)
ranking_training_predictions_df=ranking_training_predictions.sort_values(by=['srch_id','position'], ascending=True)
ranking_training_predictions_df = ranking_training_predictions_df.reset_index(drop=True)
ranking_training_predictions_df.head()

Unnamed: 0,srch_id,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,prop_log_historical_price,position,price_usd,promotion_flag,srch_destination_id,srch_length_of_stay,srch_booking_window,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,srch_query_affinity_score,orig_destination_distance,random_bool,comp1_rate,comp1_inv,comp1_rate_percent_diff,comp2_rate,comp2_inv,comp2_rate_percent_diff,comp3_rate,comp3_inv,comp3_rate_percent_diff,comp4_rate,comp4_inv,comp4_rate_percent_diff,comp5_rate,comp5_inv,comp5_rate_percent_diff,comp6_rate,comp6_inv,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,ActualRelevancy,PredictedRelevancy,Predictedrank
0,71.0,24.0,216.0,,,98.0,76541.0,3.0,3.5,0.0,2.2,0.0482,3.21,1.0,16.9,0.0,341.0,1.0,18.0,1.0,1.0,1.0,1.0,,,1.0,,,,,,,,,,0.0,0.0,,0.0,0.0,,,,,,,,,,,0.0,0.041112,1.0
1,71.0,24.0,216.0,,,98.0,37933.0,3.0,4.0,0.0,2.08,0.0812,3.59,2.0,18.34,0.0,341.0,1.0,18.0,1.0,1.0,1.0,1.0,,,1.0,,,,1.0,0.0,46.0,,,,0.0,0.0,,1.0,0.0,2.0,,,,,,,,,,0.0,0.031434,2.0
2,71.0,24.0,216.0,,,98.0,108588.0,4.0,4.0,0.0,0.0,0.0268,4.08,3.0,26.78,1.0,341.0,1.0,18.0,1.0,1.0,1.0,1.0,,,1.0,,,,1.0,0.0,108.0,1.0,0.0,32.0,-1.0,0.0,9.0,0.0,0.0,,,,,,,,,,,0.0,0.024781,5.0
3,71.0,24.0,216.0,,,98.0,28672.0,3.0,4.0,0.0,2.2,0.0151,3.35,4.0,21.48,0.0,341.0,1.0,18.0,1.0,1.0,1.0,1.0,,,1.0,,,,,,,,,,0.0,0.0,,0.0,0.0,,,,,,,,,,,0.0,0.007737,15.0
4,71.0,24.0,216.0,,,98.0,25741.0,3.0,3.5,0.0,1.61,0.0359,4.02,6.0,19.47,1.0,341.0,1.0,18.0,1.0,1.0,1.0,1.0,,,1.0,,,,0.0,0.0,,1.0,0.0,10.0,0.0,0.0,,0.0,0.0,,,,,,,,,,,0.0,0.010106,11.0


In [187]:
ranking_test_predictions["Predictedrank"] = ranking_test_predictions.groupby("srch_id")["PredictedRelevancy"].rank("dense", ascending=False)
ranking_test_predictions_df=ranking_test_predictions.sort_values(by=['srch_id','position'], ascending=True)
ranking_test_predictions_df = ranking_test_predictions_df.reset_index(drop=True)
ranking_test_predictions_df.head()


Unnamed: 0,srch_id,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,prop_log_historical_price,position,price_usd,promotion_flag,srch_destination_id,srch_length_of_stay,srch_booking_window,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,srch_query_affinity_score,orig_destination_distance,random_bool,comp1_rate,comp1_inv,comp1_rate_percent_diff,comp2_rate,comp2_inv,comp2_rate_percent_diff,comp3_rate,comp3_inv,comp3_rate_percent_diff,comp4_rate,comp4_inv,comp4_rate_percent_diff,comp5_rate,comp5_inv,comp5_rate_percent_diff,comp6_rate,comp6_inv,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,ActualRelevancy,PredictedRelevancy,Predictedrank
0,246.0,14.0,100.0,,,219.0,11396.0,4.0,5.0,0.0,0.69,0.0551,5.36,1.0,159.64,0.0,18294.0,1.0,6.0,2.0,0.0,1.0,0.0,,87.17,0.0,,,,,,,,,,,,,,,,,,,,,,,,,1.0,0.123994,1.0
1,246.0,14.0,100.0,,,219.0,103885.0,2.0,4.0,1.0,1.1,0.154,4.75,2.0,98.32,0.0,18294.0,1.0,6.0,2.0,0.0,1.0,0.0,,93.87,0.0,,,,,,,,,,,,,0.0,0.0,,,,,,,,,,,0.0,0.122948,2.0
2,246.0,14.0,100.0,,,219.0,61167.0,3.0,4.5,0.0,1.1,0.0314,4.87,3.0,101.24,0.0,18294.0,1.0,6.0,2.0,0.0,1.0,0.0,,92.31,0.0,,,,,,,,,,,,,0.0,0.0,,,,,,,,,,,0.0,0.054097,3.0
3,246.0,14.0,100.0,,,219.0,95490.0,2.0,3.5,1.0,1.1,0.0142,4.88,4.0,95.4,0.0,18294.0,1.0,6.0,2.0,0.0,1.0,0.0,,89.28,0.0,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.017256,6.0
4,246.0,14.0,100.0,,,219.0,127213.0,3.0,4.0,1.0,1.1,0.1652,4.95,6.0,119.73,0.0,18294.0,1.0,6.0,2.0,0.0,1.0,0.0,,93.97,0.0,,,,,,,,,,,,,0.0,0.0,,,,,,,,,,,0.0,0.048755,4.0


In [188]:
#relevancyclf_training_predictions["Predictedrank"] = relevancyclf_training_predictions.groupby("srch_id")["PredictedRelevancy"].rank("dense", ascending=False)
# relevancyclf_training_predictions=relevancyclf_training_predictions.sort_values(by=['srch_id','position'], ascending=True)
# relevancyclf_training_predictions = relevancyclf_training_predictions.reset_index(drop=True)
# relevancyclf_training_predictions.head()

In [189]:
# relevancyclf_test_predictions["Predictedrank"] = relevancyclf_test_predictions.groupby("srch_id")["PredictedRelevancy"].rank("dense", ascending=False)
# relevancyclf_test_predictions=relevancyclf_test_predictions.sort_values(by=['srch_id','position'], ascending=True)
# relevancyclf_test_predictions = relevancyclf_test_predictions.reset_index(drop=True)
# relevancyclf_test_predictions.head()

In [190]:
training_predictions_df.to_csv('./data/hotelsearch_training_predictions.csv')
test_predictions_df.to_csv('./data/hotelsearch_test_predictions.csv')

In [191]:
relevancyclf_training_predictions.to_csv('./data/hotelsearch_clf_training.csv')
relevancyclf_test_predictions.to_csv('./data/hotelsearch_clf_test.csv')

In [192]:
relevancyclf_training_predictions.head()

Unnamed: 0,srch_id,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,prop_log_historical_price,position,price_usd,promotion_flag,srch_destination_id,srch_length_of_stay,srch_booking_window,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,srch_query_affinity_score,orig_destination_distance,random_bool,comp1_rate,comp1_inv,comp1_rate_percent_diff,comp2_rate,comp2_inv,comp2_rate_percent_diff,comp3_rate,comp3_inv,comp3_rate_percent_diff,comp4_rate,comp4_inv,comp4_rate_percent_diff,comp5_rate,comp5_inv,comp5_rate_percent_diff,comp6_rate,comp6_inv,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,ActualRelevancy,Not_Relevant_prob,Yes_Relevant_prob,RelevancePrediction
0,71.0,24.0,216.0,,,98.0,76541.0,3.0,3.5,0.0,2.2,0.0482,3.21,1.0,16.9,0.0,341.0,1.0,18.0,1.0,1.0,1.0,1.0,,,1.0,,,,,,,,,,0.0,0.0,,0.0,0.0,,,,,,,,,,,0.0,0.960739,0.039261,0.0
1,71.0,24.0,216.0,,,98.0,37933.0,3.0,4.0,0.0,2.08,0.0812,3.59,2.0,18.34,0.0,341.0,1.0,18.0,1.0,1.0,1.0,1.0,,,1.0,,,,1.0,0.0,46.0,,,,0.0,0.0,,1.0,0.0,2.0,,,,,,,,,,0.0,0.974633,0.025367,0.0
2,71.0,24.0,216.0,,,98.0,108588.0,4.0,4.0,0.0,0.0,0.0268,4.08,3.0,26.78,1.0,341.0,1.0,18.0,1.0,1.0,1.0,1.0,,,1.0,,,,1.0,0.0,108.0,1.0,0.0,32.0,-1.0,0.0,9.0,0.0,0.0,,,,,,,,,,,0.0,0.980494,0.019506,0.0
3,71.0,24.0,216.0,,,98.0,28672.0,3.0,4.0,0.0,2.2,0.0151,3.35,4.0,21.48,0.0,341.0,1.0,18.0,1.0,1.0,1.0,1.0,,,1.0,,,,,,,,,,0.0,0.0,,0.0,0.0,,,,,,,,,,,0.0,0.983915,0.016085,0.0
4,71.0,24.0,216.0,,,98.0,25741.0,3.0,3.5,0.0,1.61,0.0359,4.02,6.0,19.47,1.0,341.0,1.0,18.0,1.0,1.0,1.0,1.0,,,1.0,,,,0.0,0.0,,1.0,0.0,10.0,0.0,0.0,,0.0,0.0,,,,,,,,,,,0.0,0.987769,0.012231,0.0
