In [1]:
import pandas as pd
import numpy as np
import time
from datetime import date

import matplotlib.pyplot as plt
import seaborn as sns

In [56]:
train = pd.read_csv('train_filled.csv')

In [None]:
train.head()

In [21]:
train.shape

(4958347, 54)

In [57]:
dropped_set = train[train['booking_bool'] == 0].index

In [58]:
train.drop(dropped_set, inplace=True)

In [None]:
# missing value percents

In [59]:
table = []
for i in train.columns:
    total_null = train[i].isnull().sum()
    table.append([i, total_null, 100*total_null/len(train[i])])

In [60]:
missing_values_overview = pd.DataFrame(table, columns = ["Feature Name", "Total Null", "% Null"])
missing_values_overview.sort_values("% Null",ascending = False) # arrange features based on % Null 

Unnamed: 0,Feature Name,Total Null,% Null
30,comp1_rate_percent_diff,135883,98.188453
45,comp6_rate_percent_diff,135352,97.804755
28,comp1_rate,135222,97.710817
29,comp1_inv,135039,97.578582
39,comp4_rate_percent_diff,134606,97.265698
48,comp7_rate_percent_diff,133843,96.714358
43,comp6_rate,130693,94.438182
44,comp6_inv,130070,93.988005
37,comp4_rate,129605,93.651998
5,visitor_hist_starrating,129302,93.433052


In [None]:
# drop null values

In [61]:
train.drop(['comp1_rate_percent_diff', 'comp6_rate_percent_diff', 'comp1_rate', 'comp1_inv', 'comp4_rate_percent_diff'], axis = 1, inplace=True)

In [62]:
train.drop(['comp7_rate_percent_diff', 'comp6_rate', 'comp6_inv', 'comp4_rate', 'visitor_hist_starrating', 'visitor_hist_adr_usd'], axis = 1, inplace=True)

In [63]:
train.drop(['comp4_inv', 'comp7_rate', 'comp7_inv', 'comp3_rate_percent_diff', 'comp2_rate_percent_diff', 'comp8_rate_percent_diff', 'comp5_rate_percent_diff'], axis = 1, inplace=True)

In [64]:
train.drop(['comp3_rate', 'comp3_inv', 'comp8_rate', 'comp8_inv', 'comp2_rate', 'comp2_inv'], axis = 1, inplace=True)

In [65]:
train.shape

(138390, 31)

In [66]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 138390 entries, 12 to 4958345
Data columns (total 31 columns):
Unnamed: 0                     138390 non-null int64
srch_id                        138390 non-null int64
date_time                      138390 non-null object
site_id                        138390 non-null int64
visitor_location_country_id    138390 non-null int64
prop_country_id                138390 non-null int64
prop_id                        138390 non-null int64
prop_starrating                138390 non-null int64
prop_review_score              138390 non-null float64
prop_brand_bool                138390 non-null int64
prop_location_score1           138390 non-null float64
prop_location_score2           138390 non-null float64
prop_log_historical_price      138390 non-null float64
position                       138390 non-null int64
price_usd                      138390 non-null float64
promotion_flag                 138390 non-null int64
srch_destination_id         

In [67]:
train.head()

Unnamed: 0.1,Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,...,srch_room_count,srch_saturday_night_bool,srch_query_affinity_score,orig_destination_distance,random_bool,comp5_rate,comp5_inv,click_bool,gross_bookings_usd,booking_bool
12,12,1,2013-04-04 08:32:15,12,187,219,68914,2,3.0,1,...,1,1,-18.2597,264.06,1,0.0,-1.0,1,114.29,1
63,63,6,2013-06-05 12:27:51,14,100,100,104251,3,4.0,1,...,1,0,-53.9831,264.06,0,1.0,1.0,1,162.38,1
68,68,8,2013-03-20 17:50:44,5,219,219,27669,3,3.5,1,...,1,0,-21.8237,264.06,0,0.0,1.0,1,96.41,1
194,194,21,2013-02-18 17:16:25,15,55,216,61662,3,3.5,0,...,1,0,-20.2124,264.06,0,1.0,1.0,1,222.58,1
211,211,25,2012-12-11 13:03:42,5,219,219,23228,4,4.0,0,...,1,0,-16.043,264.06,0,0.0,0.0,1,47.1,1


In [68]:
train.drop(['Unnamed: 0'], axis = 1, inplace=True)

In [69]:
train.shape

(138390, 30)

In [70]:
def month(date):
    return int(date[5:7])
def year(date):
    return int(date[0:4])

In [71]:
train['year'] = train['date_time'].apply(year)
train['month'] = train['date_time'].apply(month)

In [72]:
train.drop(['date_time'], axis = 1, inplace=True)

In [73]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 138390 entries, 12 to 4958345
Data columns (total 31 columns):
srch_id                        138390 non-null int64
site_id                        138390 non-null int64
visitor_location_country_id    138390 non-null int64
prop_country_id                138390 non-null int64
prop_id                        138390 non-null int64
prop_starrating                138390 non-null int64
prop_review_score              138390 non-null float64
prop_brand_bool                138390 non-null int64
prop_location_score1           138390 non-null float64
prop_location_score2           138390 non-null float64
prop_log_historical_price      138390 non-null float64
position                       138390 non-null int64
price_usd                      138390 non-null float64
promotion_flag                 138390 non-null int64
srch_destination_id            138390 non-null int64
srch_length_of_stay            138390 non-null int64
srch_booking_window          

In [74]:
train.to_csv('train_cleaned.csv')

In [None]:
# building model

In [2]:
train = pd.read_csv('train_cleaned.csv')

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 138390 entries, 0 to 138389
Data columns (total 32 columns):
Unnamed: 0                     138390 non-null int64
srch_id                        138390 non-null int64
site_id                        138390 non-null int64
visitor_location_country_id    138390 non-null int64
prop_country_id                138390 non-null int64
prop_id                        138390 non-null int64
prop_starrating                138390 non-null int64
prop_review_score              138390 non-null float64
prop_brand_bool                138390 non-null int64
prop_location_score1           138390 non-null float64
prop_location_score2           138390 non-null float64
prop_log_historical_price      138390 non-null float64
position                       138390 non-null int64
price_usd                      138390 non-null float64
promotion_flag                 138390 non-null int64
srch_destination_id            138390 non-null int64
srch_length_of_stay            

In [4]:
X = train.drop(['prop_id'], axis=1)
y = train['prop_id']

In [5]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 138390 entries, 0 to 138389
Data columns (total 31 columns):
Unnamed: 0                     138390 non-null int64
srch_id                        138390 non-null int64
site_id                        138390 non-null int64
visitor_location_country_id    138390 non-null int64
prop_country_id                138390 non-null int64
prop_starrating                138390 non-null int64
prop_review_score              138390 non-null float64
prop_brand_bool                138390 non-null int64
prop_location_score1           138390 non-null float64
prop_location_score2           138390 non-null float64
prop_log_historical_price      138390 non-null float64
position                       138390 non-null int64
price_usd                      138390 non-null float64
promotion_flag                 138390 non-null int64
srch_destination_id            138390 non-null int64
srch_length_of_stay            138390 non-null int64
srch_booking_window            

In [6]:
from sklearn.preprocessing import LabelEncoder
import keras
from keras.utils import np_utils

from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder

Using TensorFlow backend.


In [7]:
encoder = LabelEncoder()
binary_encoded_y = pd.Series(encoder.fit_transform(y))

In [8]:
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=1)

In [9]:
train_X, test_X, train_y, test_y = train_test_split(X, binary_encoded_y, test_size=0.2, random_state=1)

In [18]:
classifier = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1), algorithm="SAMME",
                               n_estimators=200, learning_rate=0.8)

In [20]:
classifier.n_jobs = 3

In [None]:
classifier.fit(train_X, train_y)

In [None]:
predictions = classifier.predict(test_X)
confusion_matrix(test_y, predictions)