In [349]:
import pandas as pd
import numpy as np
import random
from catboost import CatBoostRanker, Pool, MetricVisualizer, CatBoostClassifier
from copy import deepcopy

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [350]:


def featurize_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Derive calendar/time features and a per-row missing-value count.

    The input frame is NOT modified: all features are added to a copy, so the
    function can be called repeatedly on the same raw frame without side
    effects on the caller's data.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing a datetime-typed ``date_time`` column (the ``.dt``
        accessor is used on it).

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` with these extra columns: ``day``, ``weekday``,
        ``week_of_year``, ``hour``, ``minute``, ``time_epoch``,
        ``early_night`` and ``nans_count``.
    """
    # Work on a copy so the caller's frame does not silently gain columns.
    df = df.copy()
    timestamps = df["date_time"]

    df["day"] = timestamps.dt.day
    df["weekday"] = timestamps.dt.weekday
    df["week_of_year"] = timestamps.dt.isocalendar().week

    df["hour"] = timestamps.dt.hour
    df["minute"] = timestamps.dt.minute
    ## total time elapsed - allows the model to learn a continuous trend over time to a degree.
    ## Floor-dividing by the float 1e9 yields float seconds; this assumes the
    ## integer view of the column is in nanoseconds (datetime64[ns]).
    df["time_epoch"] = timestamps.astype('int64') // 1e9
    ## if we were looking at fraud: df["seconds"] = df.timestamp.dt.second
    df["early_night"] = ((df["hour"] > 19) | (df["hour"] < 3))  # no added value from feature

    # Count of missing values per row; computed after the derived columns are
    # added, so they take part in the count as well.
    df["nans_count"] = df.isna().sum(axis=1)

    ## we won't make any time series features for now
    ## We could add time series features per property/hotel. We'd need to check for unaries,
    ## and to add a shift/offset dependent on the forecast horizon

    return df


In [351]:
# Load the pre-sampled train/test search logs; the first CSV column is used as the index.
HotelSearch_train = pd.read_csv("./data/HotelSearch_train_sampled.csv", index_col=0)
HotelSearch_test = pd.read_csv("./data/HotelSearch_test_sampled.csv", index_col=0)

# Parse the timestamp strings. `infer_datetime_format` is deprecated since
# pandas 2.0 (format inference is the default behaviour), so it is not passed.
HotelSearch_train["date_time"] = pd.to_datetime(HotelSearch_train["date_time"])
HotelSearch_test["date_time"] = pd.to_datetime(HotelSearch_test["date_time"])

# NOTE(review): later cells read a "target" column, while the derivation below
# is disabled and `get_target` is not defined in the visible notebook --
# presumably the sampled CSVs already contain "target"; confirm.
#print(HotelSearch_orig['booking_bool'].unique())
#HotelSearch_train["target"] = HotelSearch_train.apply(get_target,axis=1)
#HotelSearch_test["target"] = HotelSearch_test.apply(get_target,axis=1)

# Latest search timestamp in the training sample (shown as the cell output).
HotelSearch_train["date_time"].max()

Timestamp('2013-06-30 20:02:52')

In [352]:
# Columns holding at most one distinct value (NaN counted as a value) in the
# training frame carry no signal.
drop_unary_cols = [
    col for col in HotelSearch_train.columns
    if HotelSearch_train[col].nunique(dropna=False) <= 1
]
# leaky column, and original target columns
target_cols = ["gross_bookings_usd", "click_bool", "booking_bool", "Unnamed: 0"]

# Unary columns first, then the target/leaky ones.
drop_cols = [*drop_unary_cols, *target_cols]
print(drop_cols)

# errors="ignore": a listed column that is absent from a frame is skipped.
HotelSearch_train = HotelSearch_train.drop(columns=drop_cols, errors="ignore")
HotelSearch_test = HotelSearch_test.drop(columns=drop_cols, errors="ignore")
print(HotelSearch_train.shape, HotelSearch_test.shape, sep="\n")


['gross_bookings_usd', 'click_bool', 'booking_bool', 'Unnamed: 0']
(98361, 53)
(147730, 53)


In [353]:
# Add the derived time / missing-value features to both splits, then show the
# resulting schema of the test frame.
# NOTE: both names are rebound to the featurized frames, so re-running this
# cell applies featurize_df to already-featurized data.
HotelSearch_train = featurize_df(HotelSearch_train)
HotelSearch_test = featurize_df(HotelSearch_test)
HotelSearch_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 147730 entries, 0 to 147729
Data columns (total 61 columns):
 #   Column                       Non-Null Count   Dtype         
---  ------                       --------------   -----         
 0   srch_id                      147730 non-null  int64         
 1   date_time                    147730 non-null  datetime64[ns]
 2   site_id                      147730 non-null  int64         
 3   visitor_location_country_id  147730 non-null  int64         
 4   visitor_hist_starrating      7770 non-null    float64       
 5   visitor_hist_adr_usd         7801 non-null    float64       
 6   prop_country_id              147730 non-null  int64         
 7   prop_id                      147730 non-null  int64         
 8   prop_starrating              147730 non-null  int64         
 9   prop_review_score            147499 non-null  float64       
 10  prop_brand_bool              147730 non-null  int64         
 11  prop_location_score1      

In [354]:
# Mean of every remaining column for rows with target > 0 versus the rest.
# The listed competitor columns are left out of the comparison.
excluded_comp_cols = [
    'comp3_rate', 'comp3_inv', 'comp3_rate_percent_diff', 'comp4_inv',
    'comp5_rate', 'comp5_inv', 'comp5_rate_percent_diff', 'comp8_rate',
    'comp8_inv', 'comp8_rate_percent_diff',
]
target_group_means = (
    HotelSearch_train
    .drop(excluded_comp_cols, axis=1)
    .groupby(HotelSearch_train["target"] > 0)
    .mean()
)
# Only the last expression of a cell is rendered automatically, so the summary
# has to be displayed explicitly; otherwise its result is computed and thrown
# away. `display` is provided by the IPython/Jupyter kernel.
display(target_group_means)
HotelSearch_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 98361 entries, 0 to 98360
Data columns (total 61 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   srch_id                      98361 non-null  int64         
 1   date_time                    98361 non-null  datetime64[ns]
 2   site_id                      98361 non-null  int64         
 3   visitor_location_country_id  98361 non-null  int64         
 4   visitor_hist_starrating      4699 non-null   float64       
 5   visitor_hist_adr_usd         4724 non-null   float64       
 6   prop_country_id              98361 non-null  int64         
 7   prop_id                      98361 non-null  int64         
 8   prop_starrating              98361 non-null  int64         
 9   prop_review_score            98201 non-null  float64       
 10  prop_brand_bool              98361 non-null  int64         
 11  prop_location_score1         98361 non-nu

In [361]:
# Hold out the searches with the highest srch_id values for evaluation.
# The cut is the 0.94 quantile of srch_id over rows, i.e. roughly a 94/6
# train/eval split of the rows. Whole searches stay on one side because the
# split is made on the search id itself.
cutoff_id = HotelSearch_train["srch_id"].quantile(0.94)

X_train_cutoff = HotelSearch_train.loc[HotelSearch_train.srch_id < cutoff_id]
X_eval_cutoff = HotelSearch_train.loc[HotelSearch_train.srch_id >= cutoff_id]

# Feature frames: everything except the raw timestamp and the label.
non_feature_cols = ["date_time", "target"]
X_train_df = X_train_cutoff.drop(non_feature_cols, axis=1)
X_eval_df = X_eval_cutoff.drop(non_feature_cols, axis=1)
X_test_df = HotelSearch_test.drop(non_feature_cols, axis=1)

feature_cols = X_train_df.columns.tolist()

# Labels are scaled by the largest relevance seen in the training split.
y_train_raw = X_train_cutoff["target"].astype(float)
max_relevance = float(np.max(y_train_raw.values))
if max_relevance <= 0:
    # Dividing by zero would silently turn every label into NaN/inf.
    raise ValueError(
        "Training split has no positive relevance labels; cannot normalise targets."
    )

# Normalise at the Series level instead of using in-place `/=` on `.values`:
# that relied on the array being a writable view of the Series. The test
# labels are scaled as well so that all three splits share one label scale.
y_train_df = y_train_raw / max_relevance
y_eval_df = X_eval_cutoff["target"].astype(float) / max_relevance
y_test_df = HotelSearch_test["target"].astype(float) / max_relevance

X_train = X_train_df.values
X_eval = X_eval_df.values
X_test = X_test_df.values

y_train = y_train_df.values
y_eval = y_eval_df.values
y_test = y_test_df.values

# Group ids (one group per search) for the ranking pools.
queries_train = X_train_df["srch_id"].values
queries_eval = X_eval_df["srch_id"].values
# Legacy name kept for the cells that build the eval pool: despite the name,
# these are the group ids of the EVAL split, not of the test set.
queries_test = queries_eval

y_train_df.value_counts()


0.0    89823
1.0     2628
Name: target, dtype: int64

In [362]:
# Compare the share of relevant rows in the train and eval splits (mean of the
# normalised labels, rounded to 4 decimals).
print("mean relevancy train",round(y_train.mean(),4))
print("mean relevancy eval",round(y_eval.mean(),4))
print(y_eval_df.value_counts()) # check which label values are present in the eval subset (the original note expected 3 distinct labels)

mean relevancy train 0.0284
mean relevancy eval 0.0293
0.0    5737
1.0     173
Name: target, dtype: int64


In [363]:
# Label distribution over the full training frame (train + eval rows together).
HotelSearch_train['target'].value_counts()

0    95560
1     2801
Name: target, dtype: int64

In [364]:
# Candidate categorical feature names. NOTE: in the visible cells this list is
# not used -- the `cat_features=categorical_cols` arguments of the Pool
# constructors are commented out.
categorical_cols = ['prop_id',"srch_destination_id", "weekday"] # ,"week_of_year"

In [365]:
# Sanity check: (rows, feature columns) of the training feature frame.
X_train_df.shape

(92451, 59)

In [366]:
# CatBoost pools for the ranking model. Rows are grouped by search id via
# `group_id`. Categorical features are intentionally not declared here
# (cat_features=categorical_cols is disabled).
train_pool = Pool(data=X_train, label=y_train, group_id=queries_train)

# Evaluation pool; `queries_test` holds the group ids of the eval split.
eval_pool = Pool(data=X_eval, label=y_eval, group_id=queries_test)

In [367]:
# Base settings for the CatBoostRanker runs.
# Other metric ideas that are not enabled: 'AverageGain:top=3', 'QueryRMSE',
# "YetiLoss" (use with hints). Disabled options: task_type="GPU", has_time=True.
default_parameters = dict(
    iterations=2000,
    custom_metric=[
        'NDCG',
        "AUC:type=Ranking",
        'PrecisionAt:top=10',
        'RecallAt:top=10',
        'MAP:top=10',
    ],
    verbose=False,
    random_seed=42,
    metric_period=4,
    save_snapshot=False,
    use_best_model=True,  # requires eval set to be set
)

# Base settings for the CatBoostClassifier baseline.
default_clf_parameters = dict(
    iterations=2000,
    random_seed=42,
    learning_rate=0.5,
    custom_loss=['AUC', 'Accuracy'],
)


In [368]:
def _build_predictions_frame(features, labels, predictions, feature_cols, is_ranker):
    """Assemble one frame holding the features, actual labels and model predictions."""
    frame = pd.DataFrame(features, columns=feature_cols)
    frame['ActualRelevancyScore'] = labels
    frame['ActualRelevancy'] = np.where(labels == 1.0, "Relevant", "Not Relevant")

    if is_ranker:
        # Ranker output is stored as-is in a single score column.
        frame['PredictedRelevancyScore'] = predictions
    else:
        # Classifier output is indexed as two columns: [:, 0] and [:, 1].
        frame['Not_Relevant_prob'] = predictions[:, 0]
        frame['Yes_Relevant_prob'] = predictions[:, 1]
        frame['RelevanceLabelPrediction'] = np.argmax(predictions, axis=1).astype(np.float32)
        predicted_relevant = frame['RelevanceLabelPrediction'] == 1.0
        frame['PredictedRelevancy'] = np.where(predicted_relevant, "Relevant", "Not Relevant")
        # NOTE(review): this score is the probability of the PREDICTED class, so
        # a confident "Not Relevant" row also gets a high score -- confirm this
        # is intended before sorting/ranking by this column.
        frame['PredictedRelevancyScore'] = np.where(
            predicted_relevant, frame['Yes_Relevant_prob'], frame['Not_Relevant_prob']
        )
    return frame


def fit_model(model,loss_function, prediction_function, feature_cols,X_train,y_train,X_test,y_test,X_eval,y_eval,additional_params=None, train_pool=train_pool, test_pool=eval_pool):
    """
    Train a CatBoost ranker or classifier and collect train/test predictions.

    Parameters
    ----------
    model : CatBoostRanker or other
        Used only to choose the branch: a ``CatBoostRanker`` instance selects
        the ranker path, anything else selects the classifier path. A fresh
        model is always constructed; the passed instance is not fitted.
    loss_function : str
        Ranker branch: used as ``loss_function`` and also as ``train_dir``.
        Classifier branch: not used.
    prediction_function : callable
        Called as ``prediction_function(fitted_model, X)``. For the ranker its
        result is stored as one score column; for the classifier its result is
        indexed as ``[:, 0]`` / ``[:, 1]``.
    feature_cols : list
        Column names for the feature matrices.
    X_train, y_train, X_test, y_test, X_eval, y_eval
        Feature matrices and labels. The ranker branch fits on the pools
        instead of ``X_train``/``X_eval``; the classifier branch fits on
        ``X_train``/``y_train`` with ``(X_eval, y_eval)`` as eval set.
    additional_params : dict, optional
        Extra constructor parameters merged over the defaults (applied in both
        branches).
    train_pool, test_pool
        Pools for the ranker branch. The defaults are bound to the
        notebook-level ``train_pool`` / ``eval_pool`` objects that exist when
        this cell is executed; re-run this cell after rebuilding those pools.

    Returns
    -------
    tuple
        ``(fitted_model, training_predictions, test_predictions)`` where the
        last two are DataFrames of features, actual labels and predictions.
    """
    # Check the class directly rather than instantiating a throwaway ranker.
    is_ranker = isinstance(model, CatBoostRanker)

    print("Model Type: Is it CatboostRanker?")
    print(is_ranker)

    if is_ranker:
        print("Model Type is CatboostRanker")
        parameters = deepcopy(default_parameters)
        parameters['loss_function'] = loss_function
        parameters['train_dir'] = loss_function
        if additional_params is not None:
            parameters.update(additional_params)
        # Printed after merging so the log shows the parameters actually used.
        print(parameters)
        model = CatBoostRanker(**parameters)
        model.fit(train_pool, eval_set=test_pool, plot=True)
    else:
        print("Model Type is CatboostClassifier")
        parameters = deepcopy(default_clf_parameters)
        if additional_params is not None:
            parameters.update(additional_params)
        print(parameters)
        model = CatBoostClassifier(**parameters)
        model.fit(X_train, y_train, eval_set=(X_eval, y_eval), plot=True)

    best_scores = model.get_best_score()
    print("best results (train on train):")
    print(best_scores["learn"])
    print("best results (on validation set):")
    print(best_scores["validation"])

    # Training predictions
    training_predictions = _build_predictions_frame(
        X_train, y_train, prediction_function(model, X_train), feature_cols, is_ranker
    )

    # Test predictions
    test_predictions = _build_predictions_frame(
        X_test, y_test, prediction_function(model, X_test), feature_cols, is_ranker
    )

    return model, training_predictions, test_predictions


In [369]:
def get_predicted_ranking_outcome(model, data):
    """Return whatever ``model.predict`` yields for ``data``, unchanged."""
    ranking_output = model.predict(data)
    return ranking_output

In [370]:
def get_predicted_clf_outcome(model, data):
    """Return whatever ``model.predict_proba`` yields for ``data``, unchanged."""
    class_probabilities = model.predict_proba(data)
    return class_probabilities

In [371]:
# Train the ranking model with the QueryRMSE loss and collect its predictions
# on the train and test feature matrices.
# NOTE: `parameters` is assigned here but is not passed to fit_model in this cell.
#model = fit_model('RMSE', {'custom_metric': ['PrecisionAt:top=10', 'RecallAt:top=10', 'MAP:top=10']})
parameters = {}
ranking_model,ranking_training_predictions, ranking_test_predictions = fit_model(CatBoostRanker(),'QueryRMSE',get_predicted_ranking_outcome,feature_cols,X_train,y_train,X_test,y_test,X_eval,y_eval)


Model Type: Is it CatboostRanker?
True
Model Type is CatboostRanker
{'iterations': 2000, 'custom_metric': ['NDCG', 'AUC:type=Ranking', 'PrecisionAt:top=10', 'RecallAt:top=10', 'MAP:top=10'], 'verbose': False, 'random_seed': 42, 'metric_period': 4, 'save_snapshot': False, 'use_best_model': True, 'loss_function': 'QueryRMSE', 'train_dir': 'QueryRMSE'}


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

best results (train on train):
{'MAP:top=10': 0.3732661929338389, 'RecallAt:top=10': 0.9321943406300054, 'PrecisionAt:top=10': 0.06704770803142165, 'QueryRMSE': 0.15072671497164444}
best results (on validation set):
{'NDCG:type=Base': 0.673909821092739, 'MAP:top=10': 0.2714639162007585, 'RecallAt:top=10': 0.8825910931174089, 'PrecisionAt:top=10': 0.0624172611014715, 'QueryRMSE': 0.1619892904627872, 'AUC:type=Ranking': 0.7731105560598932}


In [372]:
# Peek at the first rows of the ranker's training predictions (features,
# actual relevancy and predicted score side by side).
ranking_training_predictions.head()

Unnamed: 0,srch_id,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,prop_log_historical_price,position,price_usd,promotion_flag,srch_destination_id,srch_length_of_stay,srch_booking_window,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,srch_query_affinity_score,orig_destination_distance,random_bool,comp1_rate,comp1_inv,comp1_rate_percent_diff,comp2_rate,comp2_inv,comp2_rate_percent_diff,comp3_rate,comp3_inv,comp3_rate_percent_diff,comp4_rate,comp4_inv,comp4_rate_percent_diff,comp5_rate,comp5_inv,comp5_rate_percent_diff,comp6_rate,comp6_inv,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,predictionid,day,weekday,week_of_year,hour,minute,time_epoch,early_night,nans_count,ActualRelevancyScore,ActualRelevancy,PredictedRelevancyScore
0,71,24,216,,,98,76541,3,3.5,0,2.2,0.0482,3.21,1,16.9,0,341,1,18,1,1,1,1,,,1,,,,,,,,,,0.0,0.0,,0.0,0.0,,,,,,,,,,,0,12,1,11,23,14,1363130090.0,True,24,0.0,Not Relevant,0.041477
1,71,24,216,,,98,37933,3,4.0,0,2.08,0.0812,3.59,2,18.34,0,341,1,18,1,1,1,1,,,1,,,,1.0,0.0,46.0,,,,0.0,0.0,,1.0,0.0,2.0,,,,,,,,,,1,12,1,11,23,14,1363130090.0,True,20,0.0,Not Relevant,0.033541
2,71,24,216,,,98,108588,4,4.0,0,0.0,0.0268,4.08,3,26.78,1,341,1,18,1,1,1,1,,,1,,,,1.0,0.0,108.0,1.0,0.0,32.0,-1.0,0.0,9.0,0.0,0.0,,,,,,,,,,,2,12,1,11,23,14,1363130090.0,True,17,0.0,Not Relevant,0.01757
3,71,24,216,,,98,28672,3,4.0,0,2.2,0.0151,3.35,4,21.48,0,341,1,18,1,1,1,1,,,1,,,,,,,,,,0.0,0.0,,0.0,0.0,,,,,,,,,,,3,12,1,11,23,14,1363130090.0,True,24,0.0,Not Relevant,0.010443
4,71,24,216,,,98,25741,3,3.5,0,1.61,0.0359,4.02,6,19.47,1,341,1,18,1,1,1,1,,,1,,,,0.0,0.0,,1.0,0.0,10.0,0.0,0.0,,0.0,0.0,,,,,,,,,,,4,12,1,11,23,14,1363130090.0,True,19,0.0,Not Relevant,0.011178


In [373]:
# Train the classifier baseline on the same splits and collect its predictions.
# NOTE: the 'AUC' argument fills fit_model's `loss_function` slot, which the
# classifier branch of fit_model does not use; `parameters` is assigned here
# but is not passed to fit_model in this cell.
parameters = {}
relevancyclf_model,relevancyclf_training_predictions, relevancyclf_test_predictions = fit_model(CatBoostClassifier(),'AUC',get_predicted_clf_outcome,feature_cols,X_train,y_train,X_test,y_test,X_eval,y_eval)


Model Type: Is it CatboostRanker?
False
Model Type is CatboostClassifier
{'iterations': 2000, 'random_seed': 42, 'learning_rate': 0.5, 'custom_loss': ['AUC', 'Accuracy']}


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.1811927	test: 0.1833513	best: 0.1833513 (0)	total: 28.6ms	remaining: 57.2s
1:	learn: 0.1283662	test: 0.1311115	best: 0.1311115 (1)	total: 48.1ms	remaining: 48.1s
2:	learn: 0.1151302	test: 0.1185129	best: 0.1185129 (2)	total: 62.5ms	remaining: 41.6s
3:	learn: 0.1103391	test: 0.1149689	best: 0.1149689 (3)	total: 76.5ms	remaining: 38.2s
4:	learn: 0.1093358	test: 0.1142934	best: 0.1142934 (4)	total: 90.1ms	remaining: 35.9s
5:	learn: 0.1081472	test: 0.1135967	best: 0.1135967 (5)	total: 104ms	remaining: 34.7s
6:	learn: 0.1080192	test: 0.1137392	best: 0.1135967 (5)	total: 115ms	remaining: 32.8s
7:	learn: 0.1075576	test: 0.1134394	best: 0.1134394 (7)	total: 128ms	remaining: 31.8s
8:	learn: 0.1072172	test: 0.1133746	best: 0.1133746 (8)	total: 142ms	remaining: 31.4s
9:	learn: 0.1066683	test: 0.1130443	best: 0.1130443 (9)	total: 155ms	remaining: 30.8s
10:	learn: 0.1066394	test: 0.1130321	best: 0.1130321 (10)	total: 166ms	remaining: 30.1s
11:	learn: 0.1065333	test: 0.1129785	best: 0.11

94:	learn: 0.0899362	test: 0.1200265	best: 0.1129785 (11)	total: 1.27s	remaining: 25.5s
95:	learn: 0.0897029	test: 0.1199966	best: 0.1129785 (11)	total: 1.28s	remaining: 25.5s
96:	learn: 0.0896540	test: 0.1199726	best: 0.1129785 (11)	total: 1.29s	remaining: 25.4s
97:	learn: 0.0894813	test: 0.1199686	best: 0.1129785 (11)	total: 1.32s	remaining: 25.6s
98:	learn: 0.0893228	test: 0.1198868	best: 0.1129785 (11)	total: 1.34s	remaining: 25.7s
99:	learn: 0.0892813	test: 0.1199461	best: 0.1129785 (11)	total: 1.36s	remaining: 25.8s
100:	learn: 0.0891229	test: 0.1199892	best: 0.1129785 (11)	total: 1.37s	remaining: 25.7s
101:	learn: 0.0889790	test: 0.1212054	best: 0.1129785 (11)	total: 1.38s	remaining: 25.6s
102:	learn: 0.0889319	test: 0.1212504	best: 0.1129785 (11)	total: 1.39s	remaining: 25.6s
103:	learn: 0.0888872	test: 0.1212115	best: 0.1129785 (11)	total: 1.4s	remaining: 25.5s
104:	learn: 0.0885968	test: 0.1212855	best: 0.1129785 (11)	total: 1.43s	remaining: 25.8s
105:	learn: 0.0882955	test: 

194:	learn: 0.0755484	test: 0.1283075	best: 0.1129785 (11)	total: 2.53s	remaining: 23.4s
195:	learn: 0.0752158	test: 0.1283783	best: 0.1129785 (11)	total: 2.54s	remaining: 23.4s
196:	learn: 0.0751160	test: 0.1283123	best: 0.1129785 (11)	total: 2.56s	remaining: 23.4s
197:	learn: 0.0748731	test: 0.1282411	best: 0.1129785 (11)	total: 2.56s	remaining: 23.4s
198:	learn: 0.0746800	test: 0.1282274	best: 0.1129785 (11)	total: 2.58s	remaining: 23.3s
199:	learn: 0.0745713	test: 0.1281768	best: 0.1129785 (11)	total: 2.59s	remaining: 23.3s
200:	learn: 0.0745430	test: 0.1281184	best: 0.1129785 (11)	total: 2.6s	remaining: 23.3s
201:	learn: 0.0745341	test: 0.1281109	best: 0.1129785 (11)	total: 2.62s	remaining: 23.3s
202:	learn: 0.0744884	test: 0.1281249	best: 0.1129785 (11)	total: 2.63s	remaining: 23.3s
203:	learn: 0.0744615	test: 0.1281349	best: 0.1129785 (11)	total: 2.65s	remaining: 23.3s
204:	learn: 0.0744429	test: 0.1281354	best: 0.1129785 (11)	total: 2.66s	remaining: 23.3s
205:	learn: 0.0742451	

295:	learn: 0.0635799	test: 0.1324332	best: 0.1129785 (11)	total: 3.8s	remaining: 21.9s
296:	learn: 0.0634270	test: 0.1326501	best: 0.1129785 (11)	total: 3.81s	remaining: 21.9s
297:	learn: 0.0632530	test: 0.1326601	best: 0.1129785 (11)	total: 3.82s	remaining: 21.8s
298:	learn: 0.0632152	test: 0.1325667	best: 0.1129785 (11)	total: 3.83s	remaining: 21.8s
299:	learn: 0.0631472	test: 0.1326147	best: 0.1129785 (11)	total: 3.84s	remaining: 21.8s
300:	learn: 0.0630831	test: 0.1326863	best: 0.1129785 (11)	total: 3.85s	remaining: 21.7s
301:	learn: 0.0630337	test: 0.1326526	best: 0.1129785 (11)	total: 3.86s	remaining: 21.7s
302:	learn: 0.0630037	test: 0.1325430	best: 0.1129785 (11)	total: 3.87s	remaining: 21.7s
303:	learn: 0.0627529	test: 0.1328573	best: 0.1129785 (11)	total: 3.88s	remaining: 21.7s
304:	learn: 0.0625526	test: 0.1328883	best: 0.1129785 (11)	total: 3.89s	remaining: 21.6s
305:	learn: 0.0623532	test: 0.1329246	best: 0.1129785 (11)	total: 3.91s	remaining: 21.6s
306:	learn: 0.0622623	

395:	learn: 0.0530950	test: 0.1418887	best: 0.1129785 (11)	total: 5.05s	remaining: 20.4s
396:	learn: 0.0529752	test: 0.1417814	best: 0.1129785 (11)	total: 5.07s	remaining: 20.5s
397:	learn: 0.0529270	test: 0.1416242	best: 0.1129785 (11)	total: 5.08s	remaining: 20.4s
398:	learn: 0.0529179	test: 0.1427221	best: 0.1129785 (11)	total: 5.09s	remaining: 20.4s
399:	learn: 0.0528153	test: 0.1452549	best: 0.1129785 (11)	total: 5.1s	remaining: 20.4s
400:	learn: 0.0527643	test: 0.1452928	best: 0.1129785 (11)	total: 5.11s	remaining: 20.4s
401:	learn: 0.0527216	test: 0.1452889	best: 0.1129785 (11)	total: 5.12s	remaining: 20.4s
402:	learn: 0.0527120	test: 0.1453818	best: 0.1129785 (11)	total: 5.13s	remaining: 20.3s
403:	learn: 0.0526192	test: 0.1454884	best: 0.1129785 (11)	total: 5.14s	remaining: 20.3s
404:	learn: 0.0526004	test: 0.1455202	best: 0.1129785 (11)	total: 5.15s	remaining: 20.3s
405:	learn: 0.0524825	test: 0.1455364	best: 0.1129785 (11)	total: 5.16s	remaining: 20.3s
406:	learn: 0.0524758	

492:	learn: 0.0449496	test: 0.1506569	best: 0.1129785 (11)	total: 6.27s	remaining: 19.2s
493:	learn: 0.0448648	test: 0.1504063	best: 0.1129785 (11)	total: 6.28s	remaining: 19.2s
494:	learn: 0.0447440	test: 0.1504379	best: 0.1129785 (11)	total: 6.3s	remaining: 19.1s
495:	learn: 0.0447371	test: 0.1503999	best: 0.1129785 (11)	total: 6.31s	remaining: 19.1s
496:	learn: 0.0446274	test: 0.1505404	best: 0.1129785 (11)	total: 6.32s	remaining: 19.1s
497:	learn: 0.0445817	test: 0.1505473	best: 0.1129785 (11)	total: 6.33s	remaining: 19.1s
498:	learn: 0.0445610	test: 0.1505856	best: 0.1129785 (11)	total: 6.34s	remaining: 19.1s
499:	learn: 0.0445314	test: 0.1505768	best: 0.1129785 (11)	total: 6.36s	remaining: 19.1s
500:	learn: 0.0445223	test: 0.1505568	best: 0.1129785 (11)	total: 6.37s	remaining: 19.1s
501:	learn: 0.0444235	test: 0.1506681	best: 0.1129785 (11)	total: 6.38s	remaining: 19s
502:	learn: 0.0443977	test: 0.1506644	best: 0.1129785 (11)	total: 6.39s	remaining: 19s
503:	learn: 0.0443435	test

589:	learn: 0.0385397	test: 0.1552522	best: 0.1129785 (11)	total: 7.54s	remaining: 18s
590:	learn: 0.0384399	test: 0.1553518	best: 0.1129785 (11)	total: 7.55s	remaining: 18s
591:	learn: 0.0384196	test: 0.1553319	best: 0.1129785 (11)	total: 7.57s	remaining: 18s
592:	learn: 0.0383751	test: 0.1553169	best: 0.1129785 (11)	total: 7.58s	remaining: 18s
593:	learn: 0.0383333	test: 0.1553583	best: 0.1129785 (11)	total: 7.59s	remaining: 18s
594:	learn: 0.0382788	test: 0.1553859	best: 0.1129785 (11)	total: 7.6s	remaining: 18s
595:	learn: 0.0381514	test: 0.1555356	best: 0.1129785 (11)	total: 7.62s	remaining: 18s
596:	learn: 0.0380783	test: 0.1558025	best: 0.1129785 (11)	total: 7.63s	remaining: 17.9s
597:	learn: 0.0380497	test: 0.1557523	best: 0.1129785 (11)	total: 7.65s	remaining: 17.9s
598:	learn: 0.0379792	test: 0.1556338	best: 0.1129785 (11)	total: 7.66s	remaining: 17.9s
599:	learn: 0.0378534	test: 0.1557277	best: 0.1129785 (11)	total: 7.67s	remaining: 17.9s
600:	learn: 0.0377813	test: 0.156024

686:	learn: 0.0328690	test: 0.1609598	best: 0.1129785 (11)	total: 8.82s	remaining: 16.9s
687:	learn: 0.0327585	test: 0.1611357	best: 0.1129785 (11)	total: 8.84s	remaining: 16.9s
688:	learn: 0.0327366	test: 0.1612007	best: 0.1129785 (11)	total: 8.85s	remaining: 16.8s
689:	learn: 0.0326952	test: 0.1612036	best: 0.1129785 (11)	total: 8.86s	remaining: 16.8s
690:	learn: 0.0326130	test: 0.1612219	best: 0.1129785 (11)	total: 8.88s	remaining: 16.8s
691:	learn: 0.0325578	test: 0.1612014	best: 0.1129785 (11)	total: 8.89s	remaining: 16.8s
692:	learn: 0.0324596	test: 0.1623760	best: 0.1129785 (11)	total: 8.9s	remaining: 16.8s
693:	learn: 0.0324401	test: 0.1623768	best: 0.1129785 (11)	total: 8.92s	remaining: 16.8s
694:	learn: 0.0324180	test: 0.1622928	best: 0.1129785 (11)	total: 8.93s	remaining: 16.8s
695:	learn: 0.0323871	test: 0.1623392	best: 0.1129785 (11)	total: 8.94s	remaining: 16.8s
696:	learn: 0.0322755	test: 0.1625419	best: 0.1129785 (11)	total: 8.96s	remaining: 16.7s
697:	learn: 0.0322043	

780:	learn: 0.0278090	test: 0.1665907	best: 0.1129785 (11)	total: 10.1s	remaining: 15.8s
781:	learn: 0.0277640	test: 0.1667764	best: 0.1129785 (11)	total: 10.1s	remaining: 15.8s
782:	learn: 0.0277339	test: 0.1668472	best: 0.1129785 (11)	total: 10.1s	remaining: 15.8s
783:	learn: 0.0277151	test: 0.1668641	best: 0.1129785 (11)	total: 10.2s	remaining: 15.8s
784:	learn: 0.0276458	test: 0.1670172	best: 0.1129785 (11)	total: 10.2s	remaining: 15.7s
785:	learn: 0.0276031	test: 0.1670677	best: 0.1129785 (11)	total: 10.2s	remaining: 15.7s
786:	learn: 0.0275636	test: 0.1670986	best: 0.1129785 (11)	total: 10.2s	remaining: 15.7s
787:	learn: 0.0275216	test: 0.1671449	best: 0.1129785 (11)	total: 10.2s	remaining: 15.7s
788:	learn: 0.0274836	test: 0.1670874	best: 0.1129785 (11)	total: 10.2s	remaining: 15.7s
789:	learn: 0.0273756	test: 0.1672153	best: 0.1129785 (11)	total: 10.2s	remaining: 15.7s
790:	learn: 0.0273689	test: 0.1672185	best: 0.1129785 (11)	total: 10.3s	remaining: 15.7s
791:	learn: 0.0272782

879:	learn: 0.0233589	test: 0.1708425	best: 0.1129785 (11)	total: 11.5s	remaining: 14.6s
880:	learn: 0.0233249	test: 0.1710291	best: 0.1129785 (11)	total: 11.5s	remaining: 14.6s
881:	learn: 0.0233118	test: 0.1710971	best: 0.1129785 (11)	total: 11.5s	remaining: 14.6s
882:	learn: 0.0232590	test: 0.1711881	best: 0.1129785 (11)	total: 11.5s	remaining: 14.6s
883:	learn: 0.0231846	test: 0.1716003	best: 0.1129785 (11)	total: 11.5s	remaining: 14.6s
884:	learn: 0.0231497	test: 0.1714821	best: 0.1129785 (11)	total: 11.6s	remaining: 14.6s
885:	learn: 0.0231426	test: 0.1714884	best: 0.1129785 (11)	total: 11.6s	remaining: 14.5s
886:	learn: 0.0230839	test: 0.1715443	best: 0.1129785 (11)	total: 11.6s	remaining: 14.5s
887:	learn: 0.0230364	test: 0.1715651	best: 0.1129785 (11)	total: 11.6s	remaining: 14.5s
888:	learn: 0.0229504	test: 0.1717422	best: 0.1129785 (11)	total: 11.6s	remaining: 14.5s
889:	learn: 0.0229377	test: 0.1716819	best: 0.1129785 (11)	total: 11.6s	remaining: 14.5s
890:	learn: 0.0228869

975:	learn: 0.0199744	test: 0.1763360	best: 0.1129785 (11)	total: 12.8s	remaining: 13.4s
976:	learn: 0.0199069	test: 0.1765697	best: 0.1129785 (11)	total: 12.8s	remaining: 13.4s
977:	learn: 0.0198939	test: 0.1766722	best: 0.1129785 (11)	total: 12.8s	remaining: 13.4s
978:	learn: 0.0198451	test: 0.1767917	best: 0.1129785 (11)	total: 12.8s	remaining: 13.4s
979:	learn: 0.0197933	test: 0.1767584	best: 0.1129785 (11)	total: 12.8s	remaining: 13.3s
980:	learn: 0.0197367	test: 0.1767520	best: 0.1129785 (11)	total: 12.8s	remaining: 13.3s
981:	learn: 0.0197019	test: 0.1767756	best: 0.1129785 (11)	total: 12.8s	remaining: 13.3s
982:	learn: 0.0196602	test: 0.1768139	best: 0.1129785 (11)	total: 12.9s	remaining: 13.3s
983:	learn: 0.0196332	test: 0.1768870	best: 0.1129785 (11)	total: 12.9s	remaining: 13.3s
984:	learn: 0.0196129	test: 0.1769410	best: 0.1129785 (11)	total: 12.9s	remaining: 13.3s
985:	learn: 0.0196106	test: 0.1769591	best: 0.1129785 (11)	total: 12.9s	remaining: 13.3s
986:	learn: 0.0195902

1069:	learn: 0.0170470	test: 0.1823731	best: 0.1129785 (11)	total: 14s	remaining: 12.2s
1070:	learn: 0.0169938	test: 0.1825069	best: 0.1129785 (11)	total: 14.1s	remaining: 12.2s
1071:	learn: 0.0169516	test: 0.1826380	best: 0.1129785 (11)	total: 14.1s	remaining: 12.2s
1072:	learn: 0.0169187	test: 0.1826748	best: 0.1129785 (11)	total: 14.1s	remaining: 12.2s
1073:	learn: 0.0168759	test: 0.1831715	best: 0.1129785 (11)	total: 14.1s	remaining: 12.1s
1074:	learn: 0.0167973	test: 0.1835792	best: 0.1129785 (11)	total: 14.1s	remaining: 12.1s
1075:	learn: 0.0167844	test: 0.1836819	best: 0.1129785 (11)	total: 14.1s	remaining: 12.1s
1076:	learn: 0.0167737	test: 0.1836240	best: 0.1129785 (11)	total: 14.1s	remaining: 12.1s
1077:	learn: 0.0167396	test: 0.1837896	best: 0.1129785 (11)	total: 14.1s	remaining: 12.1s
1078:	learn: 0.0167349	test: 0.1838272	best: 0.1129785 (11)	total: 14.2s	remaining: 12.1s
1079:	learn: 0.0166804	test: 0.1839699	best: 0.1129785 (11)	total: 14.2s	remaining: 12.1s
1080:	learn:

1167:	learn: 0.0143225	test: 0.1874766	best: 0.1129785 (11)	total: 15.7s	remaining: 11.2s
1168:	learn: 0.0142811	test: 0.1879553	best: 0.1129785 (11)	total: 15.7s	remaining: 11.2s
1169:	learn: 0.0142635	test: 0.1879176	best: 0.1129785 (11)	total: 15.7s	remaining: 11.1s
1170:	learn: 0.0141871	test: 0.1878929	best: 0.1129785 (11)	total: 15.7s	remaining: 11.1s
1171:	learn: 0.0141497	test: 0.1878148	best: 0.1129785 (11)	total: 15.7s	remaining: 11.1s
1172:	learn: 0.0141212	test: 0.1877714	best: 0.1129785 (11)	total: 15.8s	remaining: 11.1s
1173:	learn: 0.0141175	test: 0.1878324	best: 0.1129785 (11)	total: 15.8s	remaining: 11.1s
1174:	learn: 0.0141130	test: 0.1878962	best: 0.1129785 (11)	total: 15.8s	remaining: 11.1s
1175:	learn: 0.0140925	test: 0.1878996	best: 0.1129785 (11)	total: 15.8s	remaining: 11.1s
1176:	learn: 0.0140731	test: 0.1878567	best: 0.1129785 (11)	total: 15.8s	remaining: 11.1s
1177:	learn: 0.0140561	test: 0.1879327	best: 0.1129785 (11)	total: 15.9s	remaining: 11.1s
1178:	lear

1263:	learn: 0.0123045	test: 0.1897545	best: 0.1129785 (11)	total: 17s	remaining: 9.92s
1264:	learn: 0.0122920	test: 0.1899002	best: 0.1129785 (11)	total: 17s	remaining: 9.9s
1265:	learn: 0.0122908	test: 0.1898995	best: 0.1129785 (11)	total: 17.1s	remaining: 9.89s
1266:	learn: 0.0122728	test: 0.1899929	best: 0.1129785 (11)	total: 17.1s	remaining: 9.88s
1267:	learn: 0.0122716	test: 0.1887859	best: 0.1129785 (11)	total: 17.1s	remaining: 9.86s
1268:	learn: 0.0122666	test: 0.1887760	best: 0.1129785 (11)	total: 17.1s	remaining: 9.85s
1269:	learn: 0.0122467	test: 0.1888000	best: 0.1129785 (11)	total: 17.1s	remaining: 9.84s
1270:	learn: 0.0122390	test: 0.1887742	best: 0.1129785 (11)	total: 17.1s	remaining: 9.82s
1271:	learn: 0.0122316	test: 0.1887911	best: 0.1129785 (11)	total: 17.2s	remaining: 9.83s
1272:	learn: 0.0121969	test: 0.1889322	best: 0.1129785 (11)	total: 17.2s	remaining: 9.82s
1273:	learn: 0.0121835	test: 0.1889383	best: 0.1129785 (11)	total: 17.2s	remaining: 9.8s
1274:	learn: 0.0

1361:	learn: 0.0107108	test: 0.1907571	best: 0.1129785 (11)	total: 18.3s	remaining: 8.57s
1362:	learn: 0.0107023	test: 0.1907744	best: 0.1129785 (11)	total: 18.3s	remaining: 8.55s
1363:	learn: 0.0106995	test: 0.1907487	best: 0.1129785 (11)	total: 18.3s	remaining: 8.54s
1364:	learn: 0.0106986	test: 0.1907412	best: 0.1129785 (11)	total: 18.3s	remaining: 8.53s
1365:	learn: 0.0106938	test: 0.1907953	best: 0.1129785 (11)	total: 18.3s	remaining: 8.51s
1366:	learn: 0.0106898	test: 0.1904051	best: 0.1129785 (11)	total: 18.4s	remaining: 8.5s
1367:	learn: 0.0106505	test: 0.1906311	best: 0.1129785 (11)	total: 18.4s	remaining: 8.48s
1368:	learn: 0.0106258	test: 0.1906842	best: 0.1129785 (11)	total: 18.4s	remaining: 8.48s
1369:	learn: 0.0106130	test: 0.1906821	best: 0.1129785 (11)	total: 18.4s	remaining: 8.47s
1370:	learn: 0.0106064	test: 0.1906838	best: 0.1129785 (11)	total: 18.4s	remaining: 8.45s
1371:	learn: 0.0105827	test: 0.1906319	best: 0.1129785 (11)	total: 18.4s	remaining: 8.44s
1372:	learn

1456:	learn: 0.0093477	test: 0.1935141	best: 0.1129785 (11)	total: 19.5s	remaining: 7.28s
1457:	learn: 0.0093340	test: 0.1934940	best: 0.1129785 (11)	total: 19.6s	remaining: 7.27s
1458:	learn: 0.0093308	test: 0.1934799	best: 0.1129785 (11)	total: 19.6s	remaining: 7.25s
1459:	learn: 0.0093177	test: 0.1933940	best: 0.1129785 (11)	total: 19.6s	remaining: 7.24s
1460:	learn: 0.0093145	test: 0.1933843	best: 0.1129785 (11)	total: 19.6s	remaining: 7.23s
1461:	learn: 0.0093096	test: 0.1933609	best: 0.1129785 (11)	total: 19.6s	remaining: 7.21s
1462:	learn: 0.0092637	test: 0.1936841	best: 0.1129785 (11)	total: 19.6s	remaining: 7.2s
1463:	learn: 0.0092302	test: 0.1935873	best: 0.1129785 (11)	total: 19.6s	remaining: 7.19s
1464:	learn: 0.0092262	test: 0.1936594	best: 0.1129785 (11)	total: 19.6s	remaining: 7.17s
1465:	learn: 0.0092072	test: 0.1937370	best: 0.1129785 (11)	total: 19.7s	remaining: 7.16s
1466:	learn: 0.0091799	test: 0.1939735	best: 0.1129785 (11)	total: 19.7s	remaining: 7.15s
1467:	learn

1552:	learn: 0.0081247	test: 0.1971481	best: 0.1129785 (11)	total: 20.8s	remaining: 5.98s
1553:	learn: 0.0081090	test: 0.1971393	best: 0.1129785 (11)	total: 20.8s	remaining: 5.97s
1554:	learn: 0.0081034	test: 0.1971406	best: 0.1129785 (11)	total: 20.8s	remaining: 5.95s
1555:	learn: 0.0080733	test: 0.1973143	best: 0.1129785 (11)	total: 20.8s	remaining: 5.94s
1556:	learn: 0.0080692	test: 0.1973478	best: 0.1129785 (11)	total: 20.8s	remaining: 5.93s
1557:	learn: 0.0080518	test: 0.1972498	best: 0.1129785 (11)	total: 20.8s	remaining: 5.91s
1558:	learn: 0.0080436	test: 0.1972250	best: 0.1129785 (11)	total: 20.9s	remaining: 5.9s
1559:	learn: 0.0080387	test: 0.1972464	best: 0.1129785 (11)	total: 20.9s	remaining: 5.9s
1560:	learn: 0.0080294	test: 0.1972166	best: 0.1129785 (11)	total: 20.9s	remaining: 5.89s
1561:	learn: 0.0080258	test: 0.1971991	best: 0.1129785 (11)	total: 21s	remaining: 5.88s
1562:	learn: 0.0079993	test: 0.1973992	best: 0.1129785 (11)	total: 21s	remaining: 5.86s
1563:	learn: 0.0

1646:	learn: 0.0071271	test: 0.1993429	best: 0.1129785 (11)	total: 22s	remaining: 4.72s
1647:	learn: 0.0071271	test: 0.1993444	best: 0.1129785 (11)	total: 22.1s	remaining: 4.71s
1648:	learn: 0.0071255	test: 0.1993438	best: 0.1129785 (11)	total: 22.1s	remaining: 4.7s
1649:	learn: 0.0071214	test: 0.1993833	best: 0.1129785 (11)	total: 22.1s	remaining: 4.68s
1650:	learn: 0.0071208	test: 0.1993522	best: 0.1129785 (11)	total: 22.1s	remaining: 4.67s
1651:	learn: 0.0071150	test: 0.1994294	best: 0.1129785 (11)	total: 22.1s	remaining: 4.66s
1652:	learn: 0.0071076	test: 0.1995118	best: 0.1129785 (11)	total: 22.1s	remaining: 4.64s
1653:	learn: 0.0071049	test: 0.1995435	best: 0.1129785 (11)	total: 22.1s	remaining: 4.63s
1654:	learn: 0.0070888	test: 0.1996232	best: 0.1129785 (11)	total: 22.1s	remaining: 4.62s
1655:	learn: 0.0070592	test: 0.1998612	best: 0.1129785 (11)	total: 22.2s	remaining: 4.61s
1656:	learn: 0.0070327	test: 0.2000460	best: 0.1129785 (11)	total: 22.2s	remaining: 4.6s
1657:	learn: 0

1744:	learn: 0.0064210	test: 0.2024718	best: 0.1129785 (11)	total: 23.3s	remaining: 3.41s
1745:	learn: 0.0064036	test: 0.2026316	best: 0.1129785 (11)	total: 23.3s	remaining: 3.4s
1746:	learn: 0.0063902	test: 0.2026736	best: 0.1129785 (11)	total: 23.4s	remaining: 3.38s
1747:	learn: 0.0063898	test: 0.2026715	best: 0.1129785 (11)	total: 23.4s	remaining: 3.37s
1748:	learn: 0.0063895	test: 0.2027318	best: 0.1129785 (11)	total: 23.4s	remaining: 3.35s
1749:	learn: 0.0063842	test: 0.2027193	best: 0.1129785 (11)	total: 23.4s	remaining: 3.34s
1750:	learn: 0.0063822	test: 0.2027000	best: 0.1129785 (11)	total: 23.4s	remaining: 3.33s
1751:	learn: 0.0063784	test: 0.2027052	best: 0.1129785 (11)	total: 23.4s	remaining: 3.31s
1752:	learn: 0.0063778	test: 0.2025827	best: 0.1129785 (11)	total: 23.4s	remaining: 3.3s
1753:	learn: 0.0063765	test: 0.2026058	best: 0.1129785 (11)	total: 23.4s	remaining: 3.29s
1754:	learn: 0.0063550	test: 0.2027808	best: 0.1129785 (11)	total: 23.5s	remaining: 3.27s
1755:	learn:

1841:	learn: 0.0058522	test: 0.2044170	best: 0.1129785 (11)	total: 24.5s	remaining: 2.1s
1842:	learn: 0.0058517	test: 0.2044059	best: 0.1129785 (11)	total: 24.6s	remaining: 2.09s
1843:	learn: 0.0058502	test: 0.2043633	best: 0.1129785 (11)	total: 24.6s	remaining: 2.08s
1844:	learn: 0.0058498	test: 0.2043714	best: 0.1129785 (11)	total: 24.6s	remaining: 2.06s
1845:	learn: 0.0058472	test: 0.2043794	best: 0.1129785 (11)	total: 24.6s	remaining: 2.05s
1846:	learn: 0.0058360	test: 0.2046025	best: 0.1129785 (11)	total: 24.6s	remaining: 2.04s
1847:	learn: 0.0058266	test: 0.2046051	best: 0.1129785 (11)	total: 24.6s	remaining: 2.02s
1848:	learn: 0.0058213	test: 0.2046502	best: 0.1129785 (11)	total: 24.6s	remaining: 2.01s
1849:	learn: 0.0058133	test: 0.2046501	best: 0.1129785 (11)	total: 24.6s	remaining: 2s
1850:	learn: 0.0058122	test: 0.2046595	best: 0.1129785 (11)	total: 24.7s	remaining: 1.99s
1851:	learn: 0.0058122	test: 0.2046576	best: 0.1129785 (11)	total: 24.7s	remaining: 1.97s
1852:	learn: 0

1939:	learn: 0.0052651	test: 0.2061335	best: 0.1129785 (11)	total: 25.8s	remaining: 798ms
1940:	learn: 0.0052532	test: 0.2060214	best: 0.1129785 (11)	total: 25.8s	remaining: 785ms
1941:	learn: 0.0052443	test: 0.2061795	best: 0.1129785 (11)	total: 25.8s	remaining: 772ms
1942:	learn: 0.0052338	test: 0.2061320	best: 0.1129785 (11)	total: 25.8s	remaining: 758ms
1943:	learn: 0.0052332	test: 0.2061446	best: 0.1129785 (11)	total: 25.9s	remaining: 745ms
1944:	learn: 0.0052320	test: 0.2061796	best: 0.1129785 (11)	total: 25.9s	remaining: 732ms
1945:	learn: 0.0052196	test: 0.2062582	best: 0.1129785 (11)	total: 25.9s	remaining: 718ms
1946:	learn: 0.0052102	test: 0.2063920	best: 0.1129785 (11)	total: 25.9s	remaining: 705ms
1947:	learn: 0.0052061	test: 0.2064677	best: 0.1129785 (11)	total: 25.9s	remaining: 692ms
1948:	learn: 0.0052056	test: 0.2064674	best: 0.1129785 (11)	total: 26s	remaining: 680ms
1949:	learn: 0.0052039	test: 0.2064708	best: 0.1129785 (11)	total: 26s	remaining: 667ms
1950:	learn: 0

In [375]:
# Summarise the columns, non-null counts and dtypes of `ranking_training_predictions`.
# NOTE(review): the output below reports `object` dtype for the columns shown, including
# numeric-looking ones such as prop_starrating — worth confirming how the frame was assembled.
ranking_training_predictions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92451 entries, 0 to 92450
Data columns (total 62 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   srch_id                      92451 non-null  object 
 1   site_id                      92451 non-null  object 
 2   visitor_location_country_id  92451 non-null  object 
 3   visitor_hist_starrating      4461 non-null   object 
 4   visitor_hist_adr_usd         4486 non-null   object 
 5   prop_country_id              92451 non-null  object 
 6   prop_id                      92451 non-null  object 
 7   prop_starrating              92451 non-null  object 
 8   prop_review_score            92300 non-null  object 
 9   prop_brand_bool              92451 non-null  object 
 10  prop_location_score1         92451 non-null  object 
 11  prop_location_score2         72046 non-null  object 
 12  prop_log_historical_price    92451 non-null  object 
 13  position        

In [376]:
# Copy the `date_time` column of X_train_cutoff into the training predictions as `prediction_ts`.
# (A leading `relevancyclf_training_predictions.head()` was dropped: it was not the last
# expression of the cell, so its result was computed and silently discarded.)
# pandas aligns this assignment on row labels, not on position.
# NOTE(review): assumes X_train_cutoff shares ranking_training_predictions' row index — confirm.
ranking_training_predictions["prediction_ts"] = X_train_cutoff["date_time"]

In [377]:
# Rank every row inside its search (`srch_id`) by the ranker's score: rank 1 is the highest
# score, and "dense" ranking gives tied scores the same rank without leaving gaps.
ranking_training_predictions["Predictedrank"] = (
    ranking_training_predictions
    .groupby("srch_id")["PredictedRelevancyScore"]
    .rank(method="dense", ascending=False)
)

# Classifier output column -> name it takes in the merged frame.
clf_training_columns = {
    'Not_Relevant_prob': 'Not_Relevant_prob',
    'Yes_Relevant_prob': 'Yes_Relevant_prob',
    'RelevanceLabelPrediction': 'RelevanceLabelPrediction',
    'PredictedRelevancy': 'PredictedRelevancy',
    # Stored under a different name: the ranking frame already has its own PredictedRelevancyScore.
    'PredictedRelevancyScore': 'PredictedScore',
}

# Attach the classifier columns BEFORE sorting and resetting the index. pandas aligns the
# assigned Series on row labels; once reset_index has renumbered the sorted rows, label i no
# longer identifies the same row as in relevancyclf_training_predictions unless the input
# happened to be sorted by (srch_id, position) already.
# NOTE(review): assumes both prediction frames share the same row index — confirm upstream.
ranking_training_predictions_df = (
    ranking_training_predictions
    .assign(**{
        target: relevancyclf_training_predictions[source]
        for source, target in clf_training_columns.items()
    })
    .sort_values(by=['srch_id', 'position'], ascending=True)
    .reset_index(drop=True)
)

# Distribution of the classifier's predicted labels on the training rows.
ranking_training_predictions_df['PredictedRelevancy'].value_counts()

Not Relevant    92445
Relevant            6
Name: PredictedRelevancy, dtype: int64

In [378]:
# Copy the `date_time` column of HotelSearch_test into the test predictions as `prediction_ts`.
# pandas aligns this assignment on row labels, not on position.
# NOTE(review): assumes HotelSearch_test shares ranking_test_predictions' row index — confirm.
ranking_test_predictions["prediction_ts"] = HotelSearch_test["date_time"]

# Rank every row inside its search (`srch_id`) by the ranker's score: rank 1 is the highest
# score, and "dense" ranking gives tied scores the same rank without leaving gaps.
ranking_test_predictions["Predictedrank"] = (
    ranking_test_predictions
    .groupby("srch_id")["PredictedRelevancyScore"]
    .rank(method="dense", ascending=False)
)

# Classifier output column -> name it takes in the merged frame.
clf_test_columns = {
    'Not_Relevant_prob': 'Not_Relevant_prob',
    'Yes_Relevant_prob': 'Yes_Relevant_prob',
    'RelevanceLabelPrediction': 'RelevanceLabelPrediction',
    'PredictedRelevancy': 'PredictedRelevancy',
    # Stored under a different name: the ranking frame already has its own PredictedRelevancyScore.
    'PredictedRelevancyScore': 'PredictedScore',
}

# Attach the classifier columns BEFORE sorting and resetting the index. pandas aligns the
# assigned Series on row labels; once reset_index has renumbered the sorted rows, label i no
# longer identifies the same row as in relevancyclf_test_predictions unless the input
# happened to be sorted by (srch_id, position) already.
# NOTE(review): assumes both prediction frames share the same row index — confirm upstream.
ranking_test_predictions_df = (
    ranking_test_predictions
    .assign(**{
        target: relevancyclf_test_predictions[source]
        for source, target in clf_test_columns.items()
    })
    .sort_values(by=['srch_id', 'position'], ascending=True)
    .reset_index(drop=True)
)

ranking_test_predictions_df.head()


Unnamed: 0,srch_id,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,prop_log_historical_price,position,price_usd,promotion_flag,srch_destination_id,srch_length_of_stay,srch_booking_window,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,srch_query_affinity_score,orig_destination_distance,random_bool,comp1_rate,comp1_inv,comp1_rate_percent_diff,comp2_rate,comp2_inv,comp2_rate_percent_diff,comp3_rate,comp3_inv,comp3_rate_percent_diff,comp4_rate,comp4_inv,comp4_rate_percent_diff,comp5_rate,comp5_inv,comp5_rate_percent_diff,comp6_rate,comp6_inv,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,predictionid,day,weekday,week_of_year,hour,minute,time_epoch,early_night,nans_count,ActualRelevancyScore,ActualRelevancy,PredictedRelevancyScore,prediction_ts,Predictedrank,Not_Relevant_prob,Yes_Relevant_prob,RelevanceLabelPrediction,PredictedRelevancy,PredictedScore
0,246,14,100,,,219,11396,4,5.0,0,0.69,0.0551,5.36,1,159.64,0,18294,1,6,2,0,1,0,,87.17,0,,,,,,,,,,,,,,,,,,,,,,,,,0,4,0,10,9,55,1362390943.0,False,27,1.0,Relevant,0.129661,2013-03-04 09:55:43,1.0,0.86451,0.13549,0.0,Not Relevant,0.86451
1,246,14,100,,,219,103885,2,4.0,1,1.1,0.154,4.75,2,98.32,0,18294,1,6,2,0,1,0,,93.87,0,,,,,,,,,,,,,0.0,0.0,,,,,,,,,,,1,4,0,10,9,55,1362390943.0,False,25,0.0,Not Relevant,0.119552,2013-03-04 09:55:43,2.0,0.847195,0.152805,0.0,Not Relevant,0.847195
2,246,14,100,,,219,61167,3,4.5,0,1.1,0.0314,4.87,3,101.24,0,18294,1,6,2,0,1,0,,92.31,0,,,,,,,,,,,,,0.0,0.0,,,,,,,,,,,2,4,0,10,9,55,1362390943.0,False,25,0.0,Not Relevant,0.049152,2013-03-04 09:55:43,3.0,0.920953,0.079047,0.0,Not Relevant,0.920953
3,246,14,100,,,219,95490,2,3.5,1,1.1,0.0142,4.88,4,95.4,0,18294,1,6,2,0,1,0,,89.28,0,,,,,,,,,,,,,,,,,,,,,,,,,3,4,0,10,9,55,1362390943.0,False,27,0.0,Not Relevant,0.010796,2013-03-04 09:55:43,6.0,0.928265,0.071735,0.0,Not Relevant,0.928265
4,246,14,100,,,219,127213,3,4.0,1,1.1,0.1652,4.95,6,119.73,0,18294,1,6,2,0,1,0,,93.97,0,,,,,,,,,,,,,0.0,0.0,,,,,,,,,,,4,4,0,10,9,55,1362390943.0,False,25,0.0,Not Relevant,0.040975,2013-03-04 09:55:43,4.0,0.927597,0.072403,0.0,Not Relevant,0.927597


In [379]:
# Write both merged prediction frames to CSV, training first and then test.
# index=True is the pandas default, spelled out here: the row index is written as an
# unnamed first column of each file.
for _predictions_frame, _csv_path in (
    (ranking_training_predictions_df, './data/hotelsearch_training_predictions.csv'),
    (ranking_test_predictions_df, './data/hotelsearch_test_predictions.csv'),
):
    _predictions_frame.to_csv(_csv_path, index=True)

In [278]:
#relevancyclf_training_predictions.to_csv('./data/hotelsearch_clf_training.csv')
#relevancyclf_test_predictions.to_csv('./data/hotelsearch_clf_test.csv')

In [380]:
# Preview the first rows of `ranking_training_predictions_df`.
ranking_training_predictions_df.head()

Unnamed: 0,srch_id,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,prop_log_historical_price,position,price_usd,promotion_flag,srch_destination_id,srch_length_of_stay,srch_booking_window,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,srch_query_affinity_score,orig_destination_distance,random_bool,comp1_rate,comp1_inv,comp1_rate_percent_diff,comp2_rate,comp2_inv,comp2_rate_percent_diff,comp3_rate,comp3_inv,comp3_rate_percent_diff,comp4_rate,comp4_inv,comp4_rate_percent_diff,comp5_rate,comp5_inv,comp5_rate_percent_diff,comp6_rate,comp6_inv,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,predictionid,day,weekday,week_of_year,hour,minute,time_epoch,early_night,nans_count,ActualRelevancyScore,ActualRelevancy,PredictedRelevancyScore,prediction_ts,Predictedrank,Not_Relevant_prob,Yes_Relevant_prob,RelevanceLabelPrediction,PredictedRelevancy,PredictedScore
0,71,24,216,,,98,76541,3,3.5,0,2.2,0.0482,3.21,1,16.9,0,341,1,18,1,1,1,1,,,1,,,,,,,,,,0.0,0.0,,0.0,0.0,,,,,,,,,,,0,12,1,11,23,14,1363130090.0,True,24,0.0,Not Relevant,0.041477,2013-03-12 23:14:50,1.0,0.957026,0.042974,0.0,Not Relevant,0.957026
1,71,24,216,,,98,37933,3,4.0,0,2.08,0.0812,3.59,2,18.34,0,341,1,18,1,1,1,1,,,1,,,,1.0,0.0,46.0,,,,0.0,0.0,,1.0,0.0,2.0,,,,,,,,,,1,12,1,11,23,14,1363130090.0,True,20,0.0,Not Relevant,0.033541,2013-03-12 23:14:50,2.0,0.955699,0.044301,0.0,Not Relevant,0.955699
2,71,24,216,,,98,108588,4,4.0,0,0.0,0.0268,4.08,3,26.78,1,341,1,18,1,1,1,1,,,1,,,,1.0,0.0,108.0,1.0,0.0,32.0,-1.0,0.0,9.0,0.0,0.0,,,,,,,,,,,2,12,1,11,23,14,1363130090.0,True,17,0.0,Not Relevant,0.01757,2013-03-12 23:14:50,5.0,0.986572,0.013428,0.0,Not Relevant,0.986572
3,71,24,216,,,98,28672,3,4.0,0,2.2,0.0151,3.35,4,21.48,0,341,1,18,1,1,1,1,,,1,,,,,,,,,,0.0,0.0,,0.0,0.0,,,,,,,,,,,3,12,1,11,23,14,1363130090.0,True,24,0.0,Not Relevant,0.010443,2013-03-12 23:14:50,8.0,0.986382,0.013618,0.0,Not Relevant,0.986382
4,71,24,216,,,98,25741,3,3.5,0,1.61,0.0359,4.02,6,19.47,1,341,1,18,1,1,1,1,,,1,,,,0.0,0.0,,1.0,0.0,10.0,0.0,0.0,,0.0,0.0,,,,,,,,,,,4,12,1,11,23,14,1363130090.0,True,19,0.0,Not Relevant,0.011178,2013-03-12 23:14:50,6.0,0.986321,0.013679,0.0,Not Relevant,0.986321
