In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from datetime import datetime

In [2]:
# Column -> dtype mapping handed to `pd.read_csv` for train.csv.
# Dtypes are spelled as the builtin `object` / dtype-name strings instead of
# going through the `pd.np` alias, which no longer exists in pandas 2.0
# (and `np.object` itself is gone from NumPy 1.24 onward).
# The three date-like columns are read as raw strings.
train_dtypes = {
    'date_time': object,
    'site_name': 'int64',
    'posa_continent': 'int64',
    'user_location_country': 'int64',
    'user_location_region': 'int64',
    'user_location_city': 'int64',
    'orig_destination_distance': 'float64',
    'user_id': 'int64',
    'is_mobile': 'int64',
    'is_package': 'int64',
    'channel': 'int64',
    'srch_ci': object,
    'srch_co': object,
    'srch_adults_cnt': 'int64',
    'srch_children_cnt': 'int64',
    'srch_rm_cnt': 'int64',
    'srch_destination_id': 'int64',
    'srch_destination_type_id': 'int64',
    'is_booking': 'int64',
    'cnt': 'int64',
    'hotel_continent': 'int64',
    'hotel_country': 'int64',
    'hotel_market': 'int64',
    'hotel_cluster': 'int64',
}

In [3]:
%%time
# Load the complete training set in a single pass, using the explicit dtype map.
# (A chunked read via `iterator=True, chunksize=1000` followed by
# `pd.concat(..., ignore_index=True)` was tried as an alternative and left disabled.)
all_train = pd.read_csv(filepath_or_buffer='train.csv', dtype=train_dtypes)

CPU times: user 1min 15s, sys: 15.6 s, total: 1min 31s
Wall time: 3min 40s


In [4]:
%%time
# Missing distances are filled with 0 and the column is truncated to whole
# numbers, so every remaining column is integer-typed.
all_train['orig_destination_distance'] = (
    all_train['orig_destination_distance'].fillna(0.0).astype(int)
)
# Drop the raw date strings (they are never parsed in this notebook) together
# with `is_booking` and `cnt`, which are not among the columns declared for
# test.csv.  `columns=` is used because pandas 2.0 no longer accepts the axis
# as a second positional argument to `drop`.
all_train = all_train.drop(
    columns=['date_time', 'srch_ci', 'srch_co', 'is_booking', 'cnt']
)

CPU times: user 9.47 s, sys: 13 s, total: 22.5 s
Wall time: 2min 34s


In [5]:
%%time
#Split into training and test data
split = int(0.75*len(all_train))
train = all_train[0:split]
test  = all_train[split:]

CPU times: user 2.07 ms, sys: 0 ns, total: 2.07 ms
Wall time: 8.44 ms


In [6]:
%%time
# Columns up to and including `hotel_market` are the features; the columns
# from `hotel_cluster` onward form the target.
# `.loc` replaces the `.ix` indexer (removed in pandas 1.0); label slices with
# `.loc` include the end label, matching what `.ix` did for these string labels.
features_train = train.loc[:, :'hotel_market']
labels_train = train.loc[:, 'hotel_cluster':]
features_test = test.loc[:, :'hotel_market']
labels_test = test.loc[:, 'hotel_cluster':]

CPU times: user 0 ns, sys: 3.97 ms, total: 3.97 ms
Wall time: 9.87 ms


In [7]:
# Swap each DataFrame for its underlying NumPy array before handing the data
# to scikit-learn; the four names are rebound in one step.
features_train, labels_train, features_test, labels_test = (
    features_train.values,
    labels_train.values,
    features_test.values,
    labels_test.values,
)

In [8]:
%%time
# Fit a Gaussian naive Bayes classifier on the training split.
# The label array is flattened to 1-D before being passed as the target.
train_targets = labels_train.ravel()
clf = GaussianNB()
clf.fit(features_train, train_targets)

CPU times: user 26.5 s, sys: 975 ms, total: 27.5 s
Wall time: 54 s


In [9]:
%%time
# Predict a hotel cluster for every row of the held-out split.
pred = clf.predict(X=features_test)

CPU times: user 2min 9s, sys: 1min 34s, total: 3min 44s
Wall time: 6min 48s


In [11]:
# Hold-out accuracy.  `print(...)` with a single argument works on both
# Python 2 and 3, and the arguments follow the documented
# `accuracy_score(y_true, y_pred)` order (the score itself is unchanged).
print(accuracy_score(labels_test.ravel(), pred))

0.0577647704175


In [12]:
# Column -> dtype mapping handed to `pd.read_csv` for test.csv.
# Compared with the training mapping it has an extra `id` column and no
# `is_booking`, `cnt` or `hotel_cluster`.
# Dtypes are spelled as the builtin `object` / dtype-name strings instead of
# going through the `pd.np` alias, which no longer exists in pandas 2.0
# (and `np.object` itself is gone from NumPy 1.24 onward).
test_dtypes = {
    'id': 'int64',
    'date_time': object,
    'site_name': 'int64',
    'posa_continent': 'int64',
    'user_location_country': 'int64',
    'user_location_region': 'int64',
    'user_location_city': 'int64',
    'orig_destination_distance': 'float64',
    'user_id': 'int64',
    'is_mobile': 'int64',
    'is_package': 'int64',
    'channel': 'int64',
    'srch_ci': object,
    'srch_co': object,
    'srch_adults_cnt': 'int64',
    'srch_children_cnt': 'int64',
    'srch_rm_cnt': 'int64',
    'srch_destination_id': 'int64',
    'srch_destination_type_id': 'int64',
    'hotel_continent': 'int64',
    'hotel_country': 'int64',
    'hotel_market': 'int64',
}

In [13]:
%%time
# Load the submission-time test set with its own explicit dtype map.
all_test = pd.read_csv(filepath_or_buffer='test.csv', dtype=test_dtypes)

CPU times: user 5.6 s, sys: 490 ms, total: 6.09 s
Wall time: 20.2 s


In [14]:
# Apply the same distance clean-up used for the training data, but to the
# test frame's OWN column.  The previous version read the column from
# `all_train`, which overwrote the test distances with index-aligned values
# from unrelated training rows.
all_test['orig_destination_distance'] = (
    all_test['orig_destination_distance'].fillna(0.0).astype(int)
)
# Drop `id` (kept on `all_test` for the submission file) and the unparsed date
# strings so the remaining columns match the features the model was fitted on.
# `columns=` is used because pandas 2.0 no longer accepts the axis as a second
# positional argument to `drop`.
testing_file = all_test.drop(columns=['id', 'date_time', 'srch_ci', 'srch_co'])

In [15]:
# Rebind `testing_file` from the DataFrame to its underlying NumPy array, the
# form later passed to `clf.predict`.
# NOTE(review): because the name is rebound in place, running this cell a
# second time would call `.values` on the array rather than the DataFrame.
testing_file = testing_file.values

In [16]:
%%time
# Predict a hotel cluster for every row of the submission test set.
# This rebinds `pred`, replacing the hold-out predictions computed earlier.
pred = clf.predict(X=testing_file)

CPU times: user 36 s, sys: 22.7 s, total: 58.6 s
Wall time: 1min 14s


In [17]:
# Sanity check: display how many predictions were produced.
# NOTE(review): presumably this should equal the number of rows in test.csv — confirm.
len(pred)

2528243

In [18]:
# Start the submission table as an empty DataFrame.
submission = pd.DataFrame()

In [19]:
# Attach the row identifiers first (this also gives the empty frame its index),
# then the predicted clusters, which line up with the ids row for row.
submission['id'] = all_test['id']
submission['hotel_cluster'] = pred

In [20]:
# Write the two-column submission file; the DataFrame index is not written.
submission.to_csv(path_or_buf='submission.csv', index=False)