In [1]:
%pip install -q sklearn matplotlib pandas dask[complete] joblib

Note: you may need to restart the kernel to use updated packages.


Importing the Training File

In [2]:
import pandas as pd
import os

file_path = os.path.join(os.getcwd(),'data/train_data.csv') 

train_data = pd.read_csv(file_path,index_col=0)
train_data.head()

Unnamed: 0,site_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score2,promotion_flag,random_bool,comp_rate,comp_inv,comp_rate_percent_diff,same_country,relevancy
0,5,4,4.0,0,0.2544,0,1,0.0,0.0,76.002012,1,0
1,5,4,4.0,0,0.133785,0,1,0.0,0.0,76.002012,1,0
2,5,5,4.5,1,0.1924,1,1,0.0,0.0,76.002012,1,0
3,5,3,4.0,0,0.3729,0,1,0.0,0.0,76.002012,1,0
4,5,5,4.5,0,0.2508,0,1,0.0,0.0,76.002012,1,0


In [3]:
len(train_data)

50000

Splitting into test and train

In [4]:
from typing import Tuple

def split_test_train(data_df:pd.DataFrame, label_df:pd.DataFrame, train_percent:int) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    tot_len = len(label_df)
    train_len = int(train_percent/100*tot_len)

    train_data = data_df[:train_len]
    train_label = label_df[:train_len]
    test_data = data_df[train_len:]
    test_label = label_df[train_len:]

    return (train_data,train_label,test_data,test_label)

train_data_x = train_data.drop('relevancy',axis=1)
train_data_y = train_data['relevancy']

train_data,train_label,test_data,test_label = split_test_train(train_data_x,train_data_y,80)

In [5]:
print(test_label.value_counts())

0    9575
5     390
1      35
Name: relevancy, dtype: int64


In [6]:
weight_0 = 1-test_label.value_counts()[0]/len(test_label)
weight_1 = 1-test_label.value_counts()[1]/len(test_label)
weight_5 = 1-test_label.value_counts()[5]/len(test_label)

print(weight_0)
print(weight_1)
print(weight_5)

0.04249999999999998
0.9965
0.961


In [7]:
print(len(train_data))
print(len(test_data))

40000
10000


Training

In [8]:
from dask.distributed import Client
import joblib

from sklearn.ensemble import RandomForestClassifier

client = Client(processes=False) 

knn_classifier = RandomForestClassifier(class_weight={0:weight_0,1:weight_1,5:weight_5})


with joblib.parallel_backend('dask'):
    knn_classifier.fit(train_data,train_label)

In [9]:
with joblib.parallel_backend('dask'):
    accuracy = knn_classifier.score(test_data,test_label)

print(accuracy)


0.9399


In [10]:
with joblib.parallel_backend('dask'):
    preds = knn_classifier.predict(test_data)

print(preds)

[0 0 0 ... 0 0 0]


In [11]:
import numpy as np

unique, counts = np.unique(preds, return_counts=True)
dict(zip(unique,counts))

{0: 9805, 1: 34, 5: 161}