In [1]:
%pip install -q sklearn matplotlib pandas dask[complete] joblib

Note: you may need to restart the kernel to use updated packages.


Importing the Training File

In [2]:
import pandas as pd
import os
import gc


# Resolve the training CSV relative to the directory the kernel was started in.
file_path = os.path.join(os.getcwd(), 'data/train_data_rf.csv')
train_data = pd.read_csv(file_path)

# Preview the first rows as the cell output.
train_data.head()

Unnamed: 0,prop_starrating,prop_brand_bool,prop_location_score1,prop_log_historical_price,price_usd,promotion_flag,srch_length_of_stay,srch_booking_window,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,random_bool,relevancy
0,2,1,2.2,4.44,100.89,0,1,0,4,0,1,1,1,5
1,3,1,2.3,5.03,138.4,0,1,0,4,0,1,1,1,0
2,3,1,2.4,5.03,210.84,0,1,0,4,0,1,1,1,0
3,4,1,2.83,5.15,280.69,0,1,0,4,0,1,1,1,0
4,3,1,2.2,4.81,129.35,0,1,0,4,0,1,1,1,0


In [3]:
# Number of training rows (first entry of the frame's shape).
train_data.shape[0]

499227

In [4]:
# Separate the target column from the features.
# Note: this cell rebinds `train_data`, so running it a second time raises
# KeyError because 'relevancy' has already been removed.
train_label = train_data['relevancy']
train_data = train_data.drop(columns=['relevancy'])

Training

In [5]:
from dask.distributed import Client
import joblib

from sklearn.ensemble import RandomForestClassifier

# Start a local dask client; processes=False keeps the workers inside this
# process instead of spawning subprocesses. The client backs every
# joblib.parallel_backend('dask') block in this notebook.
client = Client(processes=False)

# class_weight='balanced' reweights classes inversely to their frequency.
# random_state pins the forest's bootstrap/feature sampling so that the fitted
# model, the local NDCG score and the final submission are reproducible.
clf = RandomForestClassifier(class_weight='balanced', random_state=20)

# Fit the trees through the dask joblib backend.
with joblib.parallel_backend('dask'):
    clf.fit(train_data, train_label)

# Rebind the names to empty frames so the training data can be garbage
# collected; the fitted model is all that is needed from here on.
train_data = pd.DataFrame()
train_label = pd.DataFrame()
gc.collect()

3496

Testing Locally

In [6]:
# Local evaluation set: same layout as the training file, plus the srch_id and
# prop_id identifier columns (see the preview below).
test_data_local_path = os.path.join(os.getcwd(), 'data/test_data_rf_local.csv')
test_data_local = pd.read_csv(test_data_local_path)

test_data_local.head()

Unnamed: 0,prop_starrating,prop_brand_bool,prop_location_score1,prop_log_historical_price,price_usd,promotion_flag,srch_length_of_stay,srch_booking_window,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,random_bool,relevancy,srch_id,prop_id
0,5,0,4.73,5.98,845.0,1,3,45,3,0,1,1,0,0,299424,99130
1,2,0,4.99,5.04,179.0,0,3,45,3,0,1,1,0,0,299424,104871
2,4,0,5.23,5.54,372.0,1,3,45,3,0,1,1,0,0,299424,107732
3,2,0,5.23,5.12,146.0,0,3,45,3,0,1,1,0,0,299424,109781
4,3,0,5.11,5.25,170.0,0,3,45,3,0,1,1,0,0,299424,111300


In [7]:
# Distinct search ids present in the local test set, in order of first appearance.
srch_id_column = test_data_local['srch_id']
srch_id_list = srch_id_column.unique()
print(srch_id_list)

[299424 299429 299430 ... 332782 332784 332785]


In [8]:
import numpy as np

# Fixed seed so the same searches are sampled on every run.
np.random.seed(20)

# Evaluate on at most 300 distinct searches. The size is capped at the number
# of available ids because np.random.choice raises ValueError when asked for
# more items than exist with replace=False.
sample_size = min(300, len(srch_id_list))
srch_id_list_sample = np.random.choice(srch_id_list, size=sample_size, replace=False)
print(srch_id_list_sample)

[329412 313058 317657 315016 311080 310836 317927 305446 313767 305710
 322345 327284 318111 303571 324319 330185 328553 317477 307570 300589
 308602 320670 311682 317074 308304 302825 301607 310064 312467 328790
 318595 301553 309559 317033 310608 314113 329357 311280 329782 323233
 326000 310556 305552 315362 309592 307090 306104 301880 332340 325631
 316836 328106 312286 327731 299857 307281 302150 320590 308167 301892
 319851 332167 326549 301644 300013 301558 316293 329366 328751 320594
 332719 325508 302609 326055 302574 312166 305077 325347 327663 328977
 331897 300264 310029 310945 301213 304914 305641 321295 310987 314205
 299982 320728 320008 305462 319468 323031 303960 312481 314899 311390
 315124 299574 318530 315857 331397 329997 311452 323767 311417 323508
 307147 306934 325372 304896 310314 329719 306799 315208 321119 317696
 318050 325125 309107 317610 300128 302110 300558 323537 318713 315742
 309973 314667 307042 307057 310089 301697 325735 300811 329132 315301
 30600

In [9]:
# Group the local test rows by search so each sampled search can be pulled out
# as its own frame.
srch_grps = test_data_local.groupby('srch_id')

# One frame per sampled search id, each re-indexed from 0.
srch_grps_list = [
    srch_grps.get_group(sampled_id).reset_index(drop=True)
    for sampled_id in srch_id_list_sample
]

print(len(srch_grps_list))

300


In [10]:
# Per-search ground-truth relevancy arrays, in the same order as srch_grps_list.
true_labels = [np.array(group['relevancy']) for group in srch_grps_list]

# Per-search feature frames: the label and the two identifier columns are
# removed so only the model's input columns remain.
true_data = [
    group.drop(columns=['relevancy', 'srch_id', 'prop_id'])
    for group in srch_grps_list
]

In [11]:
from sklearn.metrics import ndcg_score

# Score each sampled search separately, then average the per-search NDCG.
scores, preds = [], []

with joblib.parallel_backend('dask'):
    for group_features, group_labels in zip(true_data, true_labels):
        group_pred = clf.predict(group_features)
        preds.append(group_pred)
        # ndcg_score expects 2-D input, so wrap this one search as a single row.
        scores.append(ndcg_score([group_labels], [group_pred]))

mean_of_scores = np.mean(scores)
print("NDCG Score : ",mean_of_scores)

NDCG Score :  0.6489000194085071


Note: this score would be misleadingly high ("fake") if the model had been trained on the entire training set, including these local test rows, without a train/test split :)

In [12]:
# Drop the references to the local-evaluation objects by rebinding their names
# to empty placeholders, then ask the garbage collector to reclaim the memory.
test_data_local = pd.DataFrame()
srch_grps = pd.Grouper()
srch_grps_list, true_data, true_labels, preds = [], [], [], []

gc.collect()

806

Doing Predictions on Final Test Dataset

In [13]:
# Final (unlabelled) test set for which the submission is produced.
test_file_path = os.path.join(os.getcwd(), 'data/test_data_rf.csv')
test_df = pd.read_csv(test_file_path)

# Report the row count, then preview the first rows.
print(len(test_df))
test_df.head()


4959183


Unnamed: 0,srch_id,prop_id,prop_starrating,prop_brand_bool,prop_location_score1,prop_log_historical_price,price_usd,promotion_flag,srch_length_of_stay,srch_booking_window,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,random_bool
0,1,3180,3,1,2.94,5.03,119.0,0,1,10,2,0,1,0,0
1,1,5543,3,1,2.64,4.93,118.0,0,1,10,2,0,1,0,0
2,1,14142,2,1,2.71,4.16,49.0,0,1,10,2,0,1,0,0
3,1,22393,3,1,2.4,5.03,143.0,0,1,10,2,0,1,0,0
4,1,24194,3,1,2.94,4.72,79.0,0,1,10,2,0,1,0,0


In [14]:
# Feature matrix for prediction: everything except the two identifier columns,
# which are kept in test_df for building the submission later.
test_df_feat = test_df.drop(columns=['srch_id', 'prop_id'])
test_df_feat.head()

Unnamed: 0,prop_starrating,prop_brand_bool,prop_location_score1,prop_log_historical_price,price_usd,promotion_flag,srch_length_of_stay,srch_booking_window,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,random_bool
0,3,1,2.94,5.03,119.0,0,1,10,2,0,1,0,0
1,3,1,2.64,4.93,118.0,0,1,10,2,0,1,0,0
2,2,1,2.71,4.16,49.0,0,1,10,2,0,1,0,0
3,3,1,2.4,5.03,143.0,0,1,10,2,0,1,0,0
4,3,1,2.94,4.72,79.0,0,1,10,2,0,1,0,0


In [15]:
# Predict a relevancy class for every row of the final test set through the
# dask joblib backend.
with joblib.parallel_backend('dask'):
    final_preds = clf.predict(test_df_feat)
print(final_preds)

# The feature frame is no longer needed; rebind it so its memory can be reclaimed.
test_df_feat = pd.DataFrame()
gc.collect()


[0 0 0 ... 0 0 5]


184

In [16]:
# Attach the predicted relevancy to each row of test_df. final_preds was
# computed from test_df's own feature columns, so the array lines up with the
# frame row-for-row.
test_df['pred'] = final_preds
test_df.head()

Unnamed: 0,srch_id,prop_id,prop_starrating,prop_brand_bool,prop_location_score1,prop_log_historical_price,price_usd,promotion_flag,srch_length_of_stay,srch_booking_window,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,random_bool,pred
0,1,3180,3,1,2.94,5.03,119.0,0,1,10,2,0,1,0,0,0
1,1,5543,3,1,2.64,4.93,118.0,0,1,10,2,0,1,0,0,0
2,1,14142,2,1,2.71,4.16,49.0,0,1,10,2,0,1,0,0,0
3,1,22393,3,1,2.4,5.03,143.0,0,1,10,2,0,1,0,0,0
4,1,24194,3,1,2.94,4.72,79.0,0,1,10,2,0,1,0,0,5


In [17]:
# Keep only the identifiers and the prediction; the feature columns are not
# needed to build the submission.
final_output_df = test_df.loc[:, ['srch_id', 'prop_id', 'pred']]
print(final_output_df.head())

# Release the full test frame by rebinding its name, then collect garbage.
test_df = pd.DataFrame()
gc.collect()

   srch_id  prop_id  pred
0        1     3180     0
1        1     5543     0
2        1    14142     0
3        1    22393     0
4        1    24194     5


3

In [18]:
# Rank properties within each search: searches in ascending id order, and
# inside a search the highest predicted relevancy first. ignore_index=True
# renumbers the rows 0..n-1 after sorting.
final_output_df = final_output_df.sort_values(
    by=['srch_id', 'pred'],
    ascending=[True, False],
    ignore_index=True,
)
final_output_df.head()


Unnamed: 0,srch_id,prop_id,pred
0,1,24194,5
1,1,28181,5
2,1,54937,5
3,1,95031,5
4,1,99484,5


In [19]:
# Write the ranked (srch_id, prop_id) pairs as the submission file.
# The columns are selected explicitly instead of dropping 'pred' in place, so
# the cell can be re-run without a KeyError and final_output_df keeps its
# predictions for inspection.
out_path = os.path.join(os.getcwd(),'data/final_submission.csv')
final_output_df[['srch_id', 'prop_id']].to_csv(path_or_buf=out_path, index=False)