In [1]:
%pip install -q sklearn matplotlib pandas dask[complete] joblib

Note: you may need to restart the kernel to use updated packages.


Importing the Training File

In [2]:
import pandas as pd
import os
import gc


# Resolve the training CSV relative to the kernel's current working directory.
working_dir = os.getcwd()
file_path = os.path.join(working_dir, 'data/train_data.csv')

# The first CSV column is used as the row index rather than as a feature.
train_data = pd.read_csv(filepath_or_buffer=file_path, index_col=0)
train_data.head()

Unnamed: 0,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score2,prop_log_historical_price,promotion_flag,random_bool,comp_rate,comp_inv,comp_rate_percent_diff,same_country,relevancy
0,4,4.0,0,0.2544,5.98,0,1,0.0,0.0,48.56563,1,0
1,4,4.0,0,0.131351,5.64,0,1,0.0,0.0,48.56563,1,0
2,5,4.5,1,0.1924,6.1,1,1,0.0,0.0,48.56563,1,0
3,3,4.0,0,0.3729,5.74,0,1,0.0,0.0,48.56563,1,0
4,5,4.5,0,0.2508,6.21,0,1,0.0,0.0,48.56563,1,0


In [3]:
# Number of rows in the loaded training frame.
train_data.shape[0]

100000

Splitting into test and train

In [4]:
from typing import Tuple

def split_test_train(data_df:pd.DataFrame, label_df:pd.DataFrame, train_percent:int) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Split features and labels sequentially into a leading train part and a trailing test part.

    No shuffling is performed: the first ``train_percent`` percent of rows
    (rounded down) become the training split and the remaining rows the test
    split, so the original row order is preserved in both parts.

    Args:
        data_df: Feature rows.
        label_df: Labels aligned row-for-row with ``data_df`` (any pandas
            object supporting ``len`` and ``.iloc``).
        train_percent: Share of rows assigned to training, between 0 and 100.

    Returns:
        ``(train_data, train_label, test_data, test_label)``.

    Raises:
        ValueError: If ``train_percent`` is outside ``[0, 100]`` or the two
            inputs differ in length.
    """
    if not 0 <= train_percent <= 100:
        raise ValueError(f"train_percent must be between 0 and 100, got {train_percent}")

    tot_len = len(label_df)
    if len(data_df) != tot_len:
        raise ValueError(
            f"data_df and label_df must have the same length, got {len(data_df)} and {tot_len}"
        )

    # Multiply before dividing: `int(train_percent/100*tot_len)` can lose a row to
    # float rounding (e.g. 29/100*100 == 28.999999999999996 -> 28).
    train_len = int(tot_len * train_percent // 100)

    # .iloc makes the slicing explicitly positional, independent of the index dtype.
    train_data = data_df.iloc[:train_len]
    train_label = label_df.iloc[:train_len]
    test_data = data_df.iloc[train_len:]
    test_label = label_df.iloc[train_len:]

    return (train_data,train_label,test_data,test_label)

# Separate the target column from the feature columns.
train_data_y = train_data['relevancy']
train_data_x = train_data.drop(columns='relevancy')

# 80/20 split. Note that `train_data` is rebound here: from this point on it
# holds only the training features, not the full frame that was loaded.
(train_data, train_label, test_data, test_label) = split_test_train(train_data_x, train_data_y, 80)

# Drop the intermediate references, then rebind the names to empty frames.
del train_data_x, train_data_y
gc.collect()
train_data_x, train_data_y = pd.DataFrame(), pd.DataFrame()

In [5]:
# Class distribution of the held-out labels.
test_class_counts = test_label.value_counts()
print(test_class_counts)

0    19129
5      801
1       70
Name: relevancy, dtype: int64


In [6]:
# Weight of each class = 1 - (share of that class in the training labels),
# so the rarer a class is, the larger its weight.
label_counts = train_label.value_counts()  # indexed by class label
n_train = len(train_label)

weight_0 = 1 - label_counts[0]/n_train
weight_1 = 1 - label_counts[1]/n_train
weight_5 = 1 - label_counts[5]/n_train

for w in (weight_0, weight_1, weight_5):
    print(w)

0.04520000000000002
0.979675
0.975125


In [7]:
# Row counts of the training and held-out splits, in that order.
for split in (train_data, test_data):
    print(len(split))

80000
20000


Training

In [8]:
from dask.distributed import Client
import joblib

from sklearn.ensemble import RandomForestClassifier

# Dask client used by the joblib 'dask' backend in this and the following cells.
client = Client(processes=False)

# NOTE: despite its name, `knn_classifier` is a random forest; the name is kept
# because the evaluation and prediction cells reference it.
# `random_state` pins the forest's randomness so that Restart & Run All
# reproduces the same model (and therefore the same scores and predictions).
knn_classifier = RandomForestClassifier(
    class_weight={0:weight_0,1:weight_1,5:weight_5},
    random_state=42,
)

with joblib.parallel_backend('dask'):
    knn_classifier.fit(train_data,train_label)

# Training inputs are no longer needed: drop the references and rebind the
# names to empty frames.
del train_data, train_label
gc.collect()
train_data = pd.DataFrame()
train_label = pd.DataFrame()

In [9]:
# Score the fitted model on the held-out split via the Dask joblib backend.
with joblib.parallel_backend('dask'):
    accuracy = knn_classifier.score(X=test_data, y=test_label)

print(accuracy)


0.9478


In [10]:
# Predict class labels for the held-out features.
with joblib.parallel_backend('dask'):
    preds = knn_classifier.predict(X=test_data)

print(preds)

# The held-out split is not used after this point: drop the references and
# rebind the names to empty frames.
del test_data, test_label
gc.collect()
test_data, test_label = pd.DataFrame(), pd.DataFrame()

[0 0 0 ... 0 0 0]


In [11]:
import numpy as np

# Tally how many held-out rows were assigned to each predicted class.
unique, counts = np.unique(preds, return_counts=True)
{label: count for label, count in zip(unique, counts)}

{0: 19812, 1: 39, 5: 149}

Doing Predictions on Final Test Dataset

In [12]:
# Resolve the final test CSV relative to the kernel's current working directory.
working_dir = os.getcwd()
test_file_path = os.path.join(working_dir, 'data/test_data.csv')

# As with the training file, the first CSV column becomes the row index.
test_df = pd.read_csv(filepath_or_buffer=test_file_path, index_col=0)

test_df.head()

Unnamed: 0,srch_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score2,prop_log_historical_price,promotion_flag,random_bool,comp_rate,comp_inv,comp_rate_percent_diff,same_country
0,1,3180,3,4.5,1,0.0691,5.03,0,0,0.0,0.0,18.48029,0
1,1,5543,3,4.5,1,0.0843,4.93,0,0,0.0,0.0,18.48029,0
2,1,14142,2,3.5,1,0.0556,4.16,0,0,0.0,0.0,10.0,0
3,1,22393,3,4.5,1,0.0561,5.03,0,0,0.0,0.0,18.48029,0
4,1,24194,3,4.5,1,0.209,4.72,0,0,0.0,0.0,18.48029,0


In [13]:
# Remove the identifier columns so that only the model's feature columns remain.
id_columns = ['srch_id', 'prop_id']
test_df_feat = test_df.drop(columns=id_columns)
test_df_feat.head()

Unnamed: 0,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score2,prop_log_historical_price,promotion_flag,random_bool,comp_rate,comp_inv,comp_rate_percent_diff,same_country
0,3,4.5,1,0.0691,5.03,0,0,0.0,0.0,18.48029,0
1,3,4.5,1,0.0843,4.93,0,0,0.0,0.0,18.48029,0
2,2,3.5,1,0.0556,4.16,0,0,0.0,0.0,10.0,0
3,3,4.5,1,0.0561,5.03,0,0,0.0,0.0,18.48029,0
4,3,4.5,1,0.209,4.72,0,0,0.0,0.0,18.48029,0


In [14]:
# Predict class labels for the final test features.
with joblib.parallel_backend('dask'):
    final_preds = knn_classifier.predict(X=test_df_feat)

print(final_preds)

# The feature-only frame is not needed once predictions exist: drop the
# reference and rebind the name to an empty frame.
del test_df_feat
gc.collect()
test_df_feat = pd.DataFrame()

[0 0 0 ... 0 0 0]


In [15]:
# Attach the predicted class to each test row as a new `pred` column
# (assigned positionally, so `final_preds` must be in the same row order as `test_df`).
test_df['pred'] = final_preds
test_df.head()

Unnamed: 0,srch_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score2,prop_log_historical_price,promotion_flag,random_bool,comp_rate,comp_inv,comp_rate_percent_diff,same_country,pred
0,1,3180,3,4.5,1,0.0691,5.03,0,0,0.0,0.0,18.48029,0,0
1,1,5543,3,4.5,1,0.0843,4.93,0,0,0.0,0.0,18.48029,0,0
2,1,14142,2,3.5,1,0.0556,4.16,0,0,0.0,0.0,10.0,0,0
3,1,22393,3,4.5,1,0.0561,5.03,0,0,0.0,0.0,18.48029,0,0
4,1,24194,3,4.5,1,0.209,4.72,0,0,0.0,0.0,18.48029,0,0


In [16]:
# Keep only the identifiers and the prediction for the submission.
final_output_df = test_df[['srch_id','prop_id','pred']]

# The full test frame is no longer needed: drop the reference and rebind the
# name to an empty frame.
del test_df
gc.collect()
test_df = pd.DataFrame()

# Preview goes last: only a cell's final expression is rendered, so a
# `.head()` in the middle of the cell would be silently discarded.
final_output_df.head()

In [17]:
# Within each search (ascending srch_id), order properties from the highest
# predicted class to the lowest, and renumber the rows from 0.
sort_keys = ['srch_id', 'pred']
sort_ascending = [True, False]
final_output_df = final_output_df.sort_values(
    by=sort_keys,
    ascending=sort_ascending,
    ignore_index=True,
)
final_output_df.head()


Unnamed: 0,srch_id,prop_id,pred
0,1,3180,0
1,1,5543,0
2,1,14142,0
3,1,22393,0
4,1,24194,0


In [18]:
# Rebind instead of dropping in place, and tolerate an already-missing column,
# so that re-running this cell does not raise KeyError on `pred`.
final_output_df = final_output_df.drop(columns='pred', errors='ignore')

# Write the remaining columns in their current row order, without the index.
out_path = os.path.join(os.getcwd(),'data/final_submission.csv')
final_output_df.to_csv(path_or_buf=out_path,index=False)