In [1]:
%pip install -q pandas numpy

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import os
import functions.preprocessing as prep
import gc

Load the original dataset

In [3]:
dataset_path = os.path.join(os.getcwd(),'data/training_set_VU_DM.csv')

# Upper bound on the number of rows loaded; raise it to use more training data.
no_of_rows_to_get = 10000000

# `nrows` reads at most that many rows and closes the file when done.
# The earlier chunksize/get_chunk approach created a TextFileReader that was
# never closed, keeping the file handle open for the rest of the session.
train_df_og = pd.read_csv(dataset_path, nrows=no_of_rows_to_get)

train_df_og.head()

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,...,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,click_bool,gross_bookings_usd,booking_bool
0,1,2013-04-04 08:32:15,12,187,,,219,893,3,3.5,...,,,,,0.0,0.0,,0,,0
1,1,2013-04-04 08:32:15,12,187,,,219,10404,4,4.0,...,,,,,0.0,0.0,,0,,0
2,1,2013-04-04 08:32:15,12,187,,,219,21315,3,4.5,...,,,,,0.0,0.0,,0,,0
3,1,2013-04-04 08:32:15,12,187,,,219,27348,2,4.0,...,,,,,-1.0,0.0,5.0,0,,0
4,1,2013-04-04 08:32:15,12,187,,,219,29604,4,3.5,...,,,,,0.0,0.0,,0,,0


Balancing the Dataset

In [4]:
# df1: one row per srch_id with `booked` = number of its rows where booking_bool == 1.
df1 = (
    train_df_og
    .groupby('srch_id')["booking_bool"]
    .apply(lambda flags: flags.eq(1).sum())
    .reset_index(name='booked')
)
# df2: how many searches share each `booked` count.
df2 = df1.groupby('booked').count()
print("Number of Searches (Booked vs Non-Booked) : ")
df2

Number of Searches (Booked vs Non-Booked) : 


Unnamed: 0_level_0,srch_id
booked,Unnamed: 1_level_1
0,12435
1,27738


In [5]:
# Split the per-search table by its `booked` count and keep only the srch_id column.
# Only the groups with exactly 1 and exactly 0 bookings are picked up here.
all_srch_df = df1.groupby('booked')
df_booked = all_srch_df.get_group(1).drop(columns="booked")
df_not_booked = all_srch_df.get_group(0).drop(columns="booked")

# Sizes in the order [not booked, booked]; the printed position shows which one is smaller.
len_df = [len(df_not_booked), len(df_booked)]
lower_number = min(len_df)
print(np.argmin(len_df))
print(lower_number)

0
12435


In [6]:
# Non-booked sample size: the smaller of the two group sizes, capped at 20000.
no_of_non_booked = np.min([lower_number, 20000])
# Booked sample size: five times the non-booked size, limited to the booked searches available.
no_of_booked = np.min([int(5 * no_of_non_booked), len(df_booked)])
print(no_of_non_booked)
print(no_of_booked)

12435
27738


In [7]:
# Draw the chosen number of search ids from each group; the fixed seeds make the draw repeatable.
# Both names are overwritten, so re-running this cell samples from the already-sampled frames.
df_booked = (
    df_booked
    .sample(n=no_of_booked, random_state=7)
    .reset_index(drop=True)
)
df_not_booked = (
    df_not_booked
    .sample(n=no_of_non_booked, random_state=10)
    .reset_index(drop=True)
)

In [8]:
# Inner-join the full rows onto each sampled id table, so only rows whose
# srch_id was sampled remain.
train_data_booked = train_df_og.merge(df_booked, how='inner', on='srch_id')
train_data_not_booked = train_df_og.merge(df_not_booked, how='inner', on='srch_id')

In [9]:
# Stack the non-booked rows on top of the booked rows under a fresh 0..n-1 index.
train_data = pd.concat(
    [train_data_not_booked, train_data_booked],
    ignore_index=True,
)

In [10]:
# Preview the first rows of the combined training frame.
train_data.head()

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,...,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,click_bool,gross_bookings_usd,booking_bool
0,4,2012-12-31 08:59:22,5,219,,,219,3625,4,4.0,...,,,,,,,,0,,0
1,4,2012-12-31 08:59:22,5,219,,,219,11622,4,4.0,...,,,,,,,,0,,0
2,4,2012-12-31 08:59:22,5,219,,,219,11826,5,4.5,...,,,,,,,,0,,0
3,4,2012-12-31 08:59:22,5,219,,,219,22824,3,4.0,...,,,,,,,,0,,0
4,4,2012-12-31 08:59:22,5,219,,,219,37581,5,4.5,...,,,,,,,,0,,0


In [11]:
# Number of rows in the combined training frame.
len(train_data)

1000000

In [12]:
# Unbind the intermediate frames and run a garbage-collection pass.
del train_df_og, df_booked, df_not_booked, train_data_booked, train_data_not_booked
gc.collect()
# Rebind each name to its own empty frame so the names still exist afterwards.
train_df_og, df_booked, df_not_booked, train_data_booked, train_data_not_booked = (
    pd.DataFrame() for _ in range(5)
)

Adding New Features

Merging Comp Stats across all rows

In [13]:
# Apply the four helpers from `prep` in order, each receiving the frame
# returned by the previous one.
train_data = (
    train_data
    .pipe(prep.mergeCompsAll)
    .pipe(prep.fill_comp_rate)
    .pipe(prep.fill_comp_inv)
    .pipe(prep.fill_rate_diff)
)
train_data.head()

KeyboardInterrupt: 

Adding Relevancy Column

In [None]:
# Relevance label per row; the first matching rule wins:
#   booked -> '5', clicked but not booked -> '1', not clicked -> '0'.
conditions = [
    (train_data['booking_bool'] == 1),
    (train_data['click_bool'] == 1) & (train_data['booking_bool'] == 0),
    (train_data['click_bool'] == 0),
]
values = ['5', '1', '0']
# The fallback is given as a string: np.select's implicit default is the
# integer 0, and recent NumPy releases raise TypeError when asked to combine
# it with string choices.
train_data['relevancy'] = np.select(conditions, values, default='0')
train_data['relevancy'] = train_data['relevancy'].astype('category')

train_data['relevancy'].value_counts()

Adding Visitor+Prop same country Column

In [None]:
# '1' when the visitor's country id equals the property's country id, else '0'.
conditions = [
    (train_data['visitor_location_country_id'] == train_data['prop_country_id']),
    (train_data['visitor_location_country_id'] != train_data['prop_country_id'])
]

values = ['1','0']
# Explicit string fallback: np.select's implicit default is the integer 0,
# which recent NumPy releases refuse to combine with string choices (TypeError).
train_data['same_country'] = np.select(conditions, values, default='0')

train_data['same_country'].value_counts()

Selecting Features required for training in output csv

In [None]:
columns_to_output = ['prop_starrating','prop_review_score', 'prop_brand_bool','prop_location_score2','prop_log_historical_price','promotion_flag','random_bool', 'comp_rate', 'comp_inv', 'comp_rate_percent_diff','same_country','relevancy']

# Keep only the listed columns.
train_data = train_data[columns_to_output]

# Fill missing review scores and location scores with their own column mean.
train_data = train_data.fillna({
    'prop_review_score': train_data['prop_review_score'].mean(),
    'prop_location_score2': train_data['prop_location_score2'].mean(),
})


train_data.head()

Outputting to disk

In [None]:
# Write the selected training features to data/train_data.csv under the working directory.
# to_csv runs with its default index=True, so the row index becomes the first, unnamed column.
out_path = os.path.join(os.getcwd(),'data/train_data.csv')
train_data.to_csv(out_path)

In [None]:
# Unbind the training frame, run a garbage-collection pass, then rebind the
# name to an empty frame so it still exists.
del train_data
gc.collect()
train_data = pd.DataFrame()

Preprocessing Test Data Set

In [None]:
test_set_path = os.path.join(os.getcwd(),'data/test_set_VU_DM.csv')

# Upper bound on the number of test rows loaded.
no_of_test_rows = 10000000

# `nrows` reads at most that many rows and closes the file when done.
# The earlier chunksize/get_chunk approach created a TextFileReader that was
# never closed, keeping the file handle open for the rest of the session.
test_df_og = pd.read_csv(test_set_path, nrows=no_of_test_rows)

test_df_og.head()

Adding New Features (same as training set)

In [None]:
# Apply the same four `prep` helpers used for the training frame, in the same
# order, each receiving the frame returned by the previous one.
test_data = (
    test_df_og
    .pipe(prep.mergeCompsAll)
    .pipe(prep.fill_comp_rate)
    .pipe(prep.fill_comp_inv)
    .pipe(prep.fill_rate_diff)
)
test_data.head()

In [None]:
# Unbind the raw test frame, run a garbage-collection pass, then rebind the
# name to an empty frame so it still exists.
del test_df_og
gc.collect()
test_df_og = pd.DataFrame()

In [None]:
# '1' when the visitor's country id equals the property's country id, else '0'.
conditions = [
    (test_data['visitor_location_country_id'] == test_data['prop_country_id']),
    (test_data['visitor_location_country_id'] != test_data['prop_country_id'])
]

values = ['1','0']
# Explicit string fallback: np.select's implicit default is the integer 0,
# which recent NumPy releases refuse to combine with string choices (TypeError).
test_data['same_country'] = np.select(conditions, values, default='0')

test_data['same_country'].value_counts()

Selecting Features for final test set

In [None]:
columns_to_output = ['srch_id','prop_id','prop_starrating','prop_review_score', 'prop_brand_bool','prop_location_score2','prop_log_historical_price','promotion_flag','random_bool', 'comp_rate', 'comp_inv', 'comp_rate_percent_diff','same_country']

# Keep only the listed columns.
test_data = test_data[columns_to_output]

# Fill missing review scores and location scores with their own column mean.
test_data = test_data.fillna({
    'prop_review_score': test_data['prop_review_score'].mean(),
    'prop_location_score2': test_data['prop_location_score2'].mean(),
})


test_data.head()

Outputting to disk

In [None]:
# Write the selected test features to data/test_data.csv under the working directory.
# to_csv runs with its default index=True, so the row index becomes the first, unnamed column.
out_path = os.path.join(os.getcwd(),'data/test_data.csv')
test_data.to_csv(out_path)