In [None]:
import gc
import os

import numpy as np
import pandas as pd

Splitting training data for local training/validation

In [None]:
def split_train(train_file_full:pd.DataFrame,size_test:int=10):
    """Split a frame row-wise into train/test parts and write both to CSV.

    The first ``100 - size_test`` percent of the rows (in their existing
    order, no shuffling) are written to ``data/train_data_split.csv`` and the
    remaining rows to ``data/test_data_split.csv``. Both paths are resolved
    against the current working directory and written without the index.

    Parameters
    ----------
    train_file_full : pd.DataFrame
        Frame to split.
    size_test : int, default 10
        Percentage of rows (0-100) that goes into the test part.

    Raises
    ------
    ValueError
        If ``size_test`` lies outside the range 0-100.
    """
    if not 0 <= size_test <= 100:
        raise ValueError(f"size_test must be between 0 and 100, got {size_test}")

    data_dir = os.path.join(os.getcwd(),'data')
    # Make sure the output folder exists so to_csv cannot fail on a missing directory.
    os.makedirs(data_dir,exist_ok=True)
    train_split_path = os.path.join(data_dir,'train_data_split.csv')
    test_split_path = os.path.join(data_dir,'test_data_split.csv')

    # Multiply before dividing: the float form int((100-71)/100*100) evaluates
    # 0.29*100 == 28.999999999999996 and truncates to 28 instead of 29.
    len_train = int((100-size_test)*len(train_file_full)//100)
    df_train = train_file_full.iloc[:len_train]
    df_test = train_file_full.iloc[len_train:]

    df_train.to_csv(path_or_buf=train_split_path,index=False)
    df_test.to_csv(path_or_buf=test_split_path,index=False)


# Switch: set to True to (re)generate the local train/test split files.
do_split = False

if do_split:
    # Read the full training set, write the two split CSVs, then drop the
    # reference to the big frame and force a garbage-collection pass.
    train_file_path = os.path.join(os.getcwd(),'data/training_set_VU_DM.csv')
    train_file_full = pd.read_csv(train_file_path)
    train_file_full.head()
    split_train(train_file_full)
    train_file_full = pd.DataFrame()
    gc.collect()

Preparing training data for algorithms

In [None]:
# Switch: True reads the locally split training file, False the full training set.
use_split = False

train_file_name = 'data/train_data_split.csv' if use_split else 'data/training_set_VU_DM.csv'
train_file_path = os.path.join(os.getcwd(),train_file_name)

train_df = pd.read_csv(train_file_path)
train_df.head()

In [None]:
# Alias (not a copy) of the loaded frame; train_data is rebound to the
# balanced sample further down in the "Balancing classes" cells.
train_data = train_df

Balancing classes

In [None]:
# Partition the rows by interaction: booked and clicked, clicked without
# booking, and neither clicked nor booked.
book_and_clicked_grp = train_df.query('booking_bool == 1 and click_bool == 1')
clicked_grp = train_df.query('booking_bool == 0 and click_bool == 1')
nothing_grp = train_df.query('booking_bool == 0 and click_bool == 0')

# Row counts of the three groups, reused when choosing the sample sizes.
size_most_rel, size_rel, size_least_rel = (
    len(book_and_clicked_grp),
    len(clicked_grp),
    len(nothing_grp),
)

print(size_most_rel, size_rel, size_least_rel)

In [None]:
# Every booked row and every clicked row is kept.
sample_size_mr = size_most_rel
sample_size_r = size_rel
# Aim for 1.25x as many no-interaction rows as interacted rows, but never ask
# for more rows than nothing_grp holds: sample() without replacement raises
# ValueError when the requested size exceeds the population.
sample_size_lr = min(int(1.25*(sample_size_mr+sample_size_r)), len(nothing_grp))

# A fixed random_state keeps the draw reproducible between runs.
book_and_clicked_grp = book_and_clicked_grp.sample(sample_size_mr,random_state=7).reset_index(drop=True)
clicked_grp = clicked_grp.sample(sample_size_r,random_state=7).reset_index(drop=True)
nothing_grp = nothing_grp.sample(sample_size_lr,random_state=7).reset_index(drop=True)

In [None]:
# Stack the three sampled groups and order the rows by search id; the index
# is renumbered from 0 after the sort.
train_data = pd.concat(
    [book_and_clicked_grp,clicked_grp,nothing_grp],
    ignore_index=True,
).sort_values(by='srch_id',ascending=True,ignore_index=True)

# Rebind the group names to empty frames so the sampled copies can be freed.
book_and_clicked_grp, clicked_grp, nothing_grp = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

gc.collect()


print(len(train_data))
train_data.head()

Adding Relevancy Column

In [None]:
# Relevancy label per row, first matching condition wins:
#   '5' = booked, '1' = clicked without booking, '0' = not clicked.
conditions = [
    (train_data['booking_bool'] == 1),
    (train_data['click_bool'] == 1 ) & (train_data['booking_bool'] == 0),
    (train_data['click_bool'] == 0),
    ]
values = ['5', '1', '0']
# Pass a string default explicitly: np.select's implicit default is the
# integer 0, which recent NumPy releases refuse to combine with string
# choices (TypeError: no common dtype). Rows matching no condition get '0'.
train_data['relevancy'] = np.select(conditions, values, default='0')
train_data['relevancy'] = train_data['relevancy'].astype('category')

train_data['relevancy'].value_counts()

Selecting Required Features

In [None]:
# Columns used as model inputs, grouped by what they describe.
feature_list = [
    # property attributes
    'prop_starrating',
    'prop_brand_bool',
    'prop_location_score1',
    'prop_log_historical_price',
    # price and promotion
    'price_usd',
    'promotion_flag',
    # search parameters
    'srch_length_of_stay',
    'srch_booking_window',
    'srch_adults_count',
    'srch_children_count',
    'srch_room_count',
    'srch_saturday_night_bool',
    # result ordering flag
    'random_bool',
]

In [None]:
# Keep only the model inputs plus the relevancy label, in that column order.
selected_columns = [*feature_list,'relevancy']
train_data = train_data[selected_columns]
train_data.head()

Outputting to disk

In [None]:
# Write the prepared training frame to data/train_data_rf.csv under the
# current working directory, without the index column.
train_data_path = os.path.join(os.getcwd(),'data/train_data_rf.csv')
train_data.to_csv(train_data_path,index=False)

In [None]:
# Swap both large frames for empty placeholders, then force a collection pass.
train_data, train_df = pd.DataFrame(), pd.DataFrame()
gc.collect()

Preparing Local Test Data

In [None]:
# Switch: the local-test cells below only run their bodies when this is True.
prep_local_test = False

In [None]:
if prep_local_test:
    # Load the held-out split from data/test_data_split.csv under the
    # current working directory.
    local_test_path = os.path.join(os.getcwd(),'data/test_data_split.csv')
    test_data_local = pd.read_csv(local_test_path)

    # print() is needed here: an expression inside an if-block is not echoed.
    print(test_data_local.head())

Adding Relevancy Column to Local Test Data

In [None]:
if(prep_local_test):

    # Relevancy label per row, first matching condition wins:
    #   '5' = booked, '1' = clicked without booking, '0' = not clicked.
    conditions = [
        (test_data_local['booking_bool'] == 1),
        (test_data_local['click_bool'] == 1 ) & (test_data_local['booking_bool'] == 0),
        (test_data_local['click_bool'] == 0),
        ]
    values = ['5', '1', '0']
    # Pass a string default explicitly: np.select's implicit default is the
    # integer 0, which recent NumPy releases refuse to combine with string
    # choices (TypeError: no common dtype). Rows matching no condition get '0'.
    test_data_local['relevancy'] = np.select(conditions, values, default='0')
    test_data_local['relevancy'] = test_data_local['relevancy'].astype('category')

    print(test_data_local['relevancy'].value_counts())

Selecting Features and Outputting to Disk

In [None]:
if prep_local_test:
    # Model inputs first, then the label and the two id columns.
    local_test_columns = [*feature_list,'relevancy','srch_id','prop_id']
    test_data_local = test_data_local[local_test_columns]
    print(test_data_local.head())

In [None]:
if prep_local_test:
    # Write the prepared local test frame without the index column.
    test_data_local_path = os.path.join(os.getcwd(),'data/test_data_rf_local.csv')
    test_data_local.to_csv(test_data_local_path,index=False)

    # Replace the frame with an empty one and force a collection pass.
    test_data_local = pd.DataFrame()
    gc.collect()

Processing Kaggle Test file for prediction

In [None]:
# Switch: the Kaggle test-file cells below only run their bodies when this is True.
process_kaggle_file = False

In [None]:
if process_kaggle_file:
    # Load data/test_set_VU_DM.csv from under the current working directory.
    kaggle_test_path = os.path.join(os.getcwd(),'data/test_set_VU_DM.csv')
    kaggle_test_df = pd.read_csv(kaggle_test_path)
    # print() is needed here: an expression inside an if-block is not echoed.
    print(kaggle_test_df.head())

In [None]:
if process_kaggle_file:
    # The two id columns first, then the model inputs.
    kaggle_columns = ['srch_id','prop_id',*feature_list]
    kaggle_test_df = kaggle_test_df[kaggle_columns]
    print(kaggle_test_df.head())

In [None]:
if process_kaggle_file:
    # Write the reduced test frame without the index column.
    out_path = os.path.join(os.getcwd(),'data/test_data_rf.csv')
    kaggle_test_df.to_csv(out_path,index=False)

    # Replace the frame with an empty one and force a collection pass.
    kaggle_test_df = pd.DataFrame()
    gc.collect()