In [1]:
%pip install -q sklearn matplotlib pandas dask[complete] joblib

Note: you may need to restart the kernel to use updated packages.


Importing the Training File

In [2]:
import pandas as pd
import os

# Load only the first 50,000 rows of the training CSV (relative to the
# current working directory). `nrows` reads the same leading rows as pulling
# one 50,000-row chunk from a chunked reader, but does not leave an open,
# never-closed file reader behind.
file_path = os.path.join(os.getcwd(),'data/training_set_VU_DM.csv')
train_data = pd.read_csv(file_path, nrows=50000)

In [3]:
# Number of rows loaded (sanity check against the requested sample size).
train_data.shape[0]

50000

In [4]:
import functions.preprocessing as prep

# Run the project helper `mergeCompsAll` over the raw sample and keep its result.
# NOTE(review): presumably this collapses the per-competitor columns into the
# `comp_rate` / `comp_inv` / `comp_rate_percent_diff` columns selected further
# down -- confirm against functions/preprocessing.py.
# The name `train_data` is rebound here, so re-running this cell feeds the
# already-merged frame back into the helper.
train_data = prep.mergeCompsAll(train_data)

In [5]:
import numpy as np

# Build a graded relevance label per row:
#   '5' -> the property was booked
#   '1' -> the property was clicked but not booked
#   '0' -> the property was not clicked
# np.select evaluates the conditions in order and takes the first match.
conditions = [
    (train_data['booking_bool'] == 1),
    (train_data['click_bool'] == 1 ) & (train_data['booking_bool'] == 0),
    (train_data['click_bool'] == 0),
    ]
values = ['5', '1', '0']
# Pass the fallback explicitly as a string. np.select's implicit default is the
# integer 0, which has to be promoted together with the string choices; newer
# NumPy releases refuse that str/int promotion and raise a TypeError. '0' is
# the value rows matching no condition ended up with before, so labels are
# unchanged.
train_data['relevancy'] = np.select(conditions, values, default='0')
# Store the label as a categorical column.
train_data['relevancy'] = train_data['relevancy'].astype('category')

In [6]:
# Feature columns handed to the model.
columns = [
    'site_id',
    'visitor_location_country_id',
    'prop_country_id',
    'prop_starrating',
    'prop_brand_bool',
    'prop_location_score1',
    'prop_log_historical_price',
    'promotion_flag',
    'srch_destination_id',
    'srch_length_of_stay',
    'srch_booking_window',
    'srch_saturday_night_bool',
    'random_bool',
    'comp_rate',
    'comp_inv',
    'comp_rate_percent_diff',
]
# Same features plus the label, so features and label stay row-aligned while
# rows are dropped and filled below.
columns_with_relevance = columns + ['relevancy']

train_data = train_data[columns_with_relevance]
print('Full length : ',len(train_data))

# Keep rows that have at least two non-null values, then pass the frame
# through the project's fill helpers for the three competitor columns.
train_data = (
    train_data
    .dropna(thresh=2)
    .pipe(prep.fill_comp_rate)
    .pipe(prep.fill_comp_inv)
    .pipe(prep.fill_rate_diff)
)

# Separate the feature matrix from the label column.
train_data_x = train_data[columns]
train_data_y = train_data['relevancy']

print('Length after dropping rows : ',len(train_data_x))
print('Length after dropping rows : ',len(train_data_y))

Full length :  50000
Length after dropping rows :  50000
Length after dropping rows :  50000


<h3> Splitting into test and train </h3>

In [8]:
from typing import Tuple

def split_test_train(data_df:pd.DataFrame, label_df:pd.Series, train_percent:int) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]:
    """Split features and labels sequentially into a train part and a test part.

    The first ``train_percent`` percent of the rows (by position, no
    shuffling) become the training partition; the remaining rows become the
    test partition.

    Args:
        data_df: Feature rows.
        label_df: Labels, positionally aligned with ``data_df``. Its length
            determines the split point.
        train_percent: Share of rows for training, in percent (0-100).

    Returns:
        ``(train_data, train_label, test_data, test_label)``.

    Raises:
        ValueError: If ``train_percent`` is outside the range 0-100.
    """
    if not 0 <= train_percent <= 100:
        raise ValueError(f"train_percent must be between 0 and 100, got {train_percent!r}")

    tot_len = len(label_df)
    # Multiply before dividing and use floor division: `percent/100*n` can land
    # just below the exact result in floating point (29/100*100 ->
    # 28.999999999999996), and int() would then silently drop one row.
    train_len = int(train_percent * tot_len // 100)

    # .iloc makes the positional intent explicit regardless of the index type.
    train_data = data_df.iloc[:train_len]
    train_label = label_df.iloc[:train_len]
    test_data = data_df.iloc[train_len:]
    test_label = label_df.iloc[train_len:]

    return (train_data,train_label,test_data,test_label)

# Split the features/labels with 80 as the training percentage.
# NOTE: this rebinds `train_data`. Before this line it named the cleaned frame
# that still contains the 'relevancy' column; after it, it names only the
# training slice of `train_data_x`. Cells above are therefore not safe to
# re-run after this one without restarting from the top.
train_data,train_label,test_data,test_label = split_test_train(train_data_x,train_data_y,80)

In [9]:
# Row counts of the training and test partitions, one per line.
print(len(train_data), len(test_data), sep='\n')

40000
10000


<h3> Training </h3>

In [10]:
from dask.distributed import Client
import joblib

from sklearn.neighbors import KNeighborsClassifier

# Start a dask distributed client; it must exist before the 'dask' joblib
# backend is used below.
# NOTE(review): per the dask docs, processes=False should keep the workers as
# threads inside this kernel process -- confirm that is the intended setup.
# The client is not closed anywhere in this notebook.
client = Client(processes=False) 

# k-nearest-neighbours classifier with library default hyper-parameters.
knn_classifier = KNeighborsClassifier()


# Fit on the training partition with joblib routed through the dask backend.
# NOTE(review): whether fit() actually dispatches any joblib work for this
# estimator is not visible here -- verify the backend has an effect.
with joblib.parallel_backend('dask'):
    knn_classifier.fit(train_data,train_label)

In [13]:
# Score the fitted classifier on the held-out partition, again with joblib
# routed through the dask backend.
with joblib.parallel_backend('dask'):
    accuracy = knn_classifier.score(test_data,test_label)

# NOTE(review): the labels are derived from click/booking flags; if most rows
# are label '0', plain accuracy can look high while clicked/booked rows are
# rarely predicted -- check the class balance before relying on this number.
print(accuracy)


0.9548
