In [1]:
%pip install -q sklearn matplotlib pandas dask[complete] joblib

Note: you may need to restart the kernel to use updated packages.


Importing the Training File

In [2]:
import pandas as pd
import os
import gc


file_path = os.path.join(os.getcwd(),'data/train_data.csv') 

train_data = pd.read_csv(file_path,index_col=0)
train_data.head()

Unnamed: 0,site_id,visitor_location_country_id,prop_country_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,prop_log_historical_price,promotion_flag,...,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,random_bool,comp_rate,comp_inv,comp_rate_percent_diff,same_country,relevancy
0,12,187,219,2,3.0,1,2.2,0.0206,4.44,0,...,4,0,1,1,1,0.0,0.0,11.0,0,5
1,12,187,219,4,4.0,1,2.83,0.1028,5.15,0,...,4,0,1,1,1,0.0,0.0,0.0,0,0
2,12,187,219,3,4.0,1,2.3,0.0155,5.03,0,...,4,0,1,1,1,0.0,0.0,0.0,0,0
3,12,187,219,3,3.0,1,1.39,0.0038,4.8,0,...,4,0,1,1,1,0.0,0.0,0.0,0,0
4,12,187,219,3,3.5,1,2.2,0.0356,4.81,0,...,4,0,1,1,1,0.0,0.0,0.0,0,0


In [3]:
len(train_data)

665637

Transforming Train Data

from sklearn.feature_extraction import FeatureHasher

h = FeatureHasher(n_features=5,input_type='string')
site_id_hashed = h.fit_transform(train_data['site_id'].astype(str))
site_id_hashed = site_id_hashed.toarray()

visitor_location_country_id_hashed = h.fit_transform(train_data['visitor_location_country_id'].astype(str))
visitor_location_country_id_hashed = visitor_location_country_id_hashed.toarray()

prop_country_id_hashed = h.fit_transform(train_data['prop_country_id'].astype(str))
prop_country_id_hashed = prop_country_id_hashed.toarray()

srch_destination_id_hashed = h.fit_transform(train_data['srch_destination_id'].astype(str))
srch_destination_id_hashed = srch_destination_id_hashed.toarray()


train_data = pd.concat([train_data,pd.DataFrame(site_id_hashed,columns=[f'site_id{i}' for i in range(5)])],axis=1)
train_data = pd.concat([train_data,pd.DataFrame(visitor_location_country_id_hashed,columns=[f'visitor_location_country_id{i}' for i in range(5)])],axis=1)
train_data = pd.concat([train_data,pd.DataFrame(prop_country_id_hashed,columns=[f'prop_country_id{i}' for i in range(5)])],axis=1)
train_data = pd.concat([train_data,pd.DataFrame(srch_destination_id_hashed,columns=[f'srch_destination_id{i}' for i in range(5)])],axis=1)

train_data.drop(labels=['site_id','visitor_location_country_id','prop_country_id','srch_destination_id'],axis=1,inplace=True)

train_data.head()

Splitting into test and train

In [4]:
feature_list = ['prop_starrating', 'prop_brand_bool', 'prop_location_score1',
       'prop_log_historical_price', 'promotion_flag',
       'srch_length_of_stay', 'srch_booking_window', 'srch_adults_count',
       'srch_children_count', 'srch_room_count', 'srch_saturday_night_bool',
       'random_bool']

#, 'price_usd'

In [5]:
from typing import Tuple

def split_test_train(data_df:pd.DataFrame, label_df:pd.DataFrame, train_percent:int) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    tot_len = len(label_df)
    train_len = int(train_percent/100*tot_len)

    train_data = data_df[:train_len]
    train_label = label_df[:train_len]
    test_data = data_df[train_len:]
    test_label = label_df[train_len:]

    train_data 

    return (train_data,train_label,test_data,test_label)

train_data_x = train_data.drop('relevancy',axis=1)

train_data_x = train_data[feature_list]
train_data_y = train_data['relevancy']

train_data,train_label,test_data,test_label = split_test_train(train_data_x,train_data_y,100)

del[[train_data_x,train_data_y]]
gc.collect()
train_data_x = pd.DataFrame()
train_data_y = pd.DataFrame()

In [6]:
print(test_label.value_counts())

Series([], Name: relevancy, dtype: int64)


In [7]:
#weight_0 = 1-train_label.value_counts()[0]/len(train_label)
#weight_1 = 1-train_label.value_counts()[1]/len(train_label)
#weight_5 = 1-train_label.value_counts()[5]/len(train_label)

#print(weight_0)
#print(weight_1)
#print(weight_5)

In [8]:
print(len(train_data))
print(len(test_data))

665637
0


Training

In [9]:
"""import functions.preprocessing as prep
import numpy as np

train_file_og_path = os.path.join(os.getcwd(),'data/training_set_VU_DM.csv')
test_data = pd.read_csv(train_file_og_path)

samp_size = 100000

test_data = test_data.sample(samp_size,random_state=10)

conditions = [
    (test_data['booking_bool'] == 1),
    (test_data['click_bool'] == 1 ) & (test_data['booking_bool'] == 0),
    (test_data['click_bool'] == 0),
    ]
values = ['5', '1', '0']
test_data['relevancy'] = np.select(conditions, values)
test_data['relevancy'] = test_data['relevancy'].astype(int)

print(test_data['relevancy'].value_counts())

test_data = prep.mergeCompsAll(test_data)
test_data = prep.fill_comp_rate(test_data)
test_data = prep.fill_comp_inv(test_data)
test_data = prep.fill_rate_diff(test_data)

print(test_data.head())

conditions = [
    (test_data['visitor_location_country_id'] == test_data['prop_country_id']),
    (test_data['visitor_location_country_id'] != test_data['prop_country_id'])
]

values = ['1','0']
test_data['same_country'] = np.select(conditions,values)

print(test_data['same_country'].value_counts())


test_label = test_data['relevancy']
test_data = test_data.drop('relevancy',axis=1)


common_columns_train_test = ['site_id','visitor_location_country_id','prop_country_id','prop_starrating','prop_review_score','prop_brand_bool','prop_location_score1','prop_location_score2','prop_log_historical_price','promotion_flag','srch_destination_id','srch_length_of_stay','srch_booking_window','srch_adults_count','srch_children_count','srch_room_count','srch_saturday_night_bool','random_bool','comp_rate','comp_inv','comp_rate_percent_diff','same_country']

test_data = test_data[common_columns_train_test]

test_data.fillna({'prop_review_score':test_data['prop_review_score'].mean()},inplace=True)
test_data.fillna({'prop_location_score2':test_data['prop_location_score2'].mean()},inplace=True)"""



"import functions.preprocessing as prep\nimport numpy as np\n\ntrain_file_og_path = os.path.join(os.getcwd(),'data/training_set_VU_DM.csv')\ntest_data = pd.read_csv(train_file_og_path)\n\nsamp_size = 100000\n\ntest_data = test_data.sample(samp_size,random_state=10)\n\nconditions = [\n    (test_data['booking_bool'] == 1),\n    (test_data['click_bool'] == 1 ) & (test_data['booking_bool'] == 0),\n    (test_data['click_bool'] == 0),\n    ]\nvalues = ['5', '1', '0']\ntest_data['relevancy'] = np.select(conditions, values)\ntest_data['relevancy'] = test_data['relevancy'].astype(int)\n\nprint(test_data['relevancy'].value_counts())\n\ntest_data = prep.mergeCompsAll(test_data)\ntest_data = prep.fill_comp_rate(test_data)\ntest_data = prep.fill_comp_inv(test_data)\ntest_data = prep.fill_rate_diff(test_data)\n\nprint(test_data.head())\n\nconditions = [\n    (test_data['visitor_location_country_id'] == test_data['prop_country_id']),\n    (test_data['visitor_location_country_id'] != test_data['prop_c

In [10]:
from dask.distributed import Client
import joblib

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

client = Client(processes=False) 

clf = RandomForestClassifier(class_weight='balanced')

with joblib.parallel_backend('dask'):
    clf.fit(train_data,train_label)
    #preds = knn_classifier.predict(test_data)
    #f1_sc = f1_score(preds,test_label,average='macro')

#client.restart()
train_data_cols = train_data.columns

#clf.fit(train_data,train_label)

del[[train_data,train_label]]
gc.collect()
train_data = pd.DataFrame()
train_label = pd.DataFrame()

In [11]:
#with joblib.parallel_backend('dask'):
#    accuracy = knn_classifier.score(test_data,test_label)

#print(accuracy)
#print(f1_sc)


In [12]:
#print(preds)

del[[test_data,test_label]]
gc.collect()
test_data = pd.DataFrame()
test_label = pd.DataFrame()

In [13]:
#import numpy as np

#unique, counts = np.unique(preds, return_counts=True)
#dict(zip(unique,counts))

Testing Locally

In [14]:
import pandas as pd
import os


train_data_og_path = os.path.join(os.getcwd(),'data/training_set_VU_DM.csv')
test_data_local = pd.read_csv(train_data_og_path,chunksize=3000000)
test_data_local = test_data_local.get_chunk(3000000)
test_data_local.head()

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,...,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,click_bool,gross_bookings_usd,booking_bool
0,1,2013-04-04 08:32:15,12,187,,,219,893,3,3.5,...,,,,,0.0,0.0,,0,,0
1,1,2013-04-04 08:32:15,12,187,,,219,10404,4,4.0,...,,,,,0.0,0.0,,0,,0
2,1,2013-04-04 08:32:15,12,187,,,219,21315,3,4.5,...,,,,,0.0,0.0,,0,,0
3,1,2013-04-04 08:32:15,12,187,,,219,27348,2,4.0,...,,,,,-1.0,0.0,5.0,0,,0
4,1,2013-04-04 08:32:15,12,187,,,219,29604,4,3.5,...,,,,,0.0,0.0,,0,,0


In [15]:
test_data_local.columns.values

array(['srch_id', 'date_time', 'site_id', 'visitor_location_country_id',
       'visitor_hist_starrating', 'visitor_hist_adr_usd',
       'prop_country_id', 'prop_id', 'prop_starrating',
       'prop_review_score', 'prop_brand_bool', 'prop_location_score1',
       'prop_location_score2', 'prop_log_historical_price', 'position',
       'price_usd', 'promotion_flag', 'srch_destination_id',
       'srch_length_of_stay', 'srch_booking_window', 'srch_adults_count',
       'srch_children_count', 'srch_room_count',
       'srch_saturday_night_bool', 'srch_query_affinity_score',
       'orig_destination_distance', 'random_bool', 'comp1_rate',
       'comp1_inv', 'comp1_rate_percent_diff', 'comp2_rate', 'comp2_inv',
       'comp2_rate_percent_diff', 'comp3_rate', 'comp3_inv',
       'comp3_rate_percent_diff', 'comp4_rate', 'comp4_inv',
       'comp4_rate_percent_diff', 'comp5_rate', 'comp5_inv',
       'comp5_rate_percent_diff', 'comp6_rate', 'comp6_inv',
       'comp6_rate_percent_diff', 'comp

In [16]:
import numpy as np

conditions = [
    (test_data_local['booking_bool'] == 1),
    (test_data_local['click_bool'] == 1 ) & (test_data_local['booking_bool'] == 0),
    (test_data_local['click_bool'] == 0),
    ]
values = ['5', '1', '0']
test_data_local['relevancy'] = np.select(conditions, values)
test_data_local['relevancy'] = test_data_local['relevancy'].astype(int)

test_data_local.head()

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,...,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,click_bool,gross_bookings_usd,booking_bool,relevancy
0,1,2013-04-04 08:32:15,12,187,,,219,893,3,3.5,...,,,,0.0,0.0,,0,,0,0
1,1,2013-04-04 08:32:15,12,187,,,219,10404,4,4.0,...,,,,0.0,0.0,,0,,0,0
2,1,2013-04-04 08:32:15,12,187,,,219,21315,3,4.5,...,,,,0.0,0.0,,0,,0,0
3,1,2013-04-04 08:32:15,12,187,,,219,27348,2,4.0,...,,,,-1.0,0.0,5.0,0,,0,0
4,1,2013-04-04 08:32:15,12,187,,,219,29604,4,3.5,...,,,,0.0,0.0,,0,,0,0


In [17]:
srch_id_list = test_data_local['srch_id'].unique()
print(srch_id_list)

[     1      4      6 ... 201418 201419 201420]


In [18]:
np.random.seed(20)

srch_id_list_sample = np.random.choice(srch_id_list,100,False)
print(srch_id_list_sample)

[195924 190438 103332  49040 121622 175951   3978 191587  37905 156832
 154391 163159 146701  58310  94461 134294  65286 109689 143852  52381
  81971 175786  76106 133845  24999  40160 164195 122752  46343  87583
 143357 145427 175249 107240 174073 142850  22662  36176  37453 180527
 119360 117484 149034  42799  16019  94319 183230  81691 148865 186881
  76824 186421  87191  66015 191250  12727  89151  11151 112895  83329
  84516  51777  58040  86293  46477 104385  53333 181597  38188  20387
  33644 178725 144509 201130 154545  61940 195413 166302  43899 140391
 150292  56225  51578 148222 109418  79666 162253 104194 176179  56799
  71679  50142 126682  69857 172973  86497 157215 189803 121482 200859]


In [19]:
srch_grps = test_data_local.groupby('srch_id')

srch_grps_list = []

for srch_id in srch_id_list_sample:
    srch_grps_list.append(srch_grps.get_group(srch_id).reset_index(drop=True))

print(len(srch_grps_list))

100


In [20]:
true_data = []
true_labels = []
for srch_grp in srch_grps_list:
    true_labels.append(np.array(srch_grp['relevancy']))
    true_data.append(srch_grp[feature_list])

#print(true_data[0])
#print(true_labels[0])

In [21]:
from sklearn.metrics import ndcg_score

scores = []
preds = []

with joblib.parallel_backend('dask'):
    for i in range(len(true_data)):
        preds.append(clf.predict(true_data[i]))
        scores.append(ndcg_score([true_labels[i]],[preds[i]]))
    #scores.append(ndcg_score([true_labels[i]],[true_labels[i]]))

mean_of_scores = np.mean(scores)
print("NDCG Score : ",mean_of_scores)

NDCG Score :  0.723473197286933


Doing Predictions on Final Test Dataset

In [None]:
test_file_path = os.path.join(os.getcwd(),'data/test_data.csv')
test_df = pd.read_csv(test_file_path,index_col=0)

print(len(test_df))
test_df.head()




4959183


Unnamed: 0,site_id,visitor_location_country_id,prop_country_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,prop_log_historical_price,promotion_flag,...,srch_children_count,srch_room_count,srch_saturday_night_bool,random_bool,comp_rate,comp_inv,comp_rate_percent_diff,same_country,srch_id,prop_id
0,24,216,219,3,4.5,1,2.94,0.0691,5.03,0,...,0,1,0,0,0.0,0.0,0.0,0,1,3180
1,24,216,219,3,4.5,1,2.64,0.0843,4.93,0,...,0,1,0,0,0.0,0.0,0.0,0,1,5543
2,24,216,219,2,3.5,1,2.71,0.0556,4.16,0,...,0,1,0,0,1.0,0.0,10.0,0,1,14142
3,24,216,219,3,4.5,1,2.4,0.0561,5.03,0,...,0,1,0,0,0.0,0.0,0.0,0,1,22393
4,24,216,219,3,4.5,1,2.94,0.209,4.72,0,...,0,1,0,0,0.0,0.0,0.0,0,1,24194


In [None]:
test_df_feat = test_df.drop(['srch_id','prop_id'],axis=1)
test_df_feat.head()

Unnamed: 0,site_id,visitor_location_country_id,prop_country_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,prop_log_historical_price,promotion_flag,...,srch_booking_window,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,random_bool,comp_rate,comp_inv,comp_rate_percent_diff,same_country
0,24,216,219,3,4.5,1,2.94,0.0691,5.03,0,...,10,2,0,1,0,0,0.0,0.0,0.0,0
1,24,216,219,3,4.5,1,2.64,0.0843,4.93,0,...,10,2,0,1,0,0,0.0,0.0,0.0,0
2,24,216,219,2,3.5,1,2.71,0.0556,4.16,0,...,10,2,0,1,0,0,1.0,0.0,10.0,0
3,24,216,219,3,4.5,1,2.4,0.0561,5.03,0,...,10,2,0,1,0,0,0.0,0.0,0.0,0
4,24,216,219,3,4.5,1,2.94,0.209,4.72,0,...,10,2,0,1,0,0,0.0,0.0,0.0,0


In [None]:
print(test_df_feat.columns)
print(train_data_cols)

Index(['site_id', 'visitor_location_country_id', 'prop_country_id',
       'prop_starrating', 'prop_review_score', 'prop_brand_bool',
       'prop_location_score1', 'prop_location_score2',
       'prop_log_historical_price', 'promotion_flag', 'srch_destination_id',
       'srch_length_of_stay', 'srch_booking_window', 'srch_adults_count',
       'srch_children_count', 'srch_room_count', 'srch_saturday_night_bool',
       'random_bool', 'comp_rate', 'comp_inv', 'comp_rate_percent_diff',
       'same_country'],
      dtype='object')
Index(['site_id', 'visitor_location_country_id', 'prop_country_id',
       'prop_starrating', 'prop_review_score', 'prop_brand_bool',
       'prop_location_score1', 'prop_location_score2',
       'prop_log_historical_price', 'promotion_flag', 'srch_destination_id',
       'srch_length_of_stay', 'srch_booking_window', 'srch_adults_count',
       'srch_children_count', 'srch_room_count', 'srch_saturday_night_bool',
       'random_bool', 'comp_rate', 'comp_inv', 

In [None]:
with joblib.parallel_backend('dask'):
    final_preds = knn_classifier.predict(test_df_feat)

print(final_preds)

del[[test_df_feat]]
gc.collect()
test_df_feat = pd.DataFrame()

[0 0 0 ... 0 0 0]


In [None]:
test_df['pred'] = final_preds
test_df.head()

Unnamed: 0,site_id,visitor_location_country_id,prop_country_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,prop_log_historical_price,promotion_flag,...,srch_room_count,srch_saturday_night_bool,random_bool,comp_rate,comp_inv,comp_rate_percent_diff,same_country,srch_id,prop_id,pred
0,24,216,219,3,4.5,1,2.94,0.0691,5.03,0,...,1,0,0,0.0,0.0,0.0,0,1,3180,0
1,24,216,219,3,4.5,1,2.64,0.0843,4.93,0,...,1,0,0,0.0,0.0,0.0,0,1,5543,0
2,24,216,219,2,3.5,1,2.71,0.0556,4.16,0,...,1,0,0,1.0,0.0,10.0,0,1,14142,0
3,24,216,219,3,4.5,1,2.4,0.0561,5.03,0,...,1,0,0,0.0,0.0,0.0,0,1,22393,0
4,24,216,219,3,4.5,1,2.94,0.209,4.72,0,...,1,0,0,0.0,0.0,0.0,0,1,24194,0


In [None]:
final_output_df = test_df[['srch_id','prop_id','pred']]
final_output_df.head()

del[[test_df]]
gc.collect()
test_df = pd.DataFrame()

In [None]:
final_output_df = final_output_df.sort_values(by=['srch_id','pred'],ascending=[True,False],ignore_index=True)
final_output_df.head()


Unnamed: 0,srch_id,prop_id,pred
0,1,54937,5
1,1,99484,5
2,1,3180,0
3,1,5543,0
4,1,14142,0


In [None]:
final_output_df.drop('pred',axis=1,inplace=True)
out_path = os.path.join(os.getcwd(),'data/final_submission.csv')
final_output_df.to_csv(path_or_buf=out_path,index=False)