In [1]:
# %%time
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from sklearn import cross_validation, ensemble, tree, preprocessing, neighbors, naive_bayes
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support, accuracy_score
le = preprocessing.LabelEncoder()
import xgboost as xgb
from datetime import datetime, date
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
import matplotlib.pylab as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline

CPU times: user 779 ms, sys: 153 ms, total: 932 ms
Wall time: 985 ms


In [2]:
# %%time
import numpy as np

def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.
    This function computes the average prescision at k between two lists of
    items.
    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists
    """
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.
    This function computes the mean average prescision at k between two lists
    of lists of items.
    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The mean average precision at k over the input lists
    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

CPU times: user 10 µs, sys: 3 µs, total: 13 µs
Wall time: 15 µs


In [3]:
# %%time
ids = []
predictions = []

CPU times: user 23 µs, sys: 34 µs, total: 57 µs
Wall time: 98.9 µs


In [4]:
# %%time
train_dtypes = {'date_time': pd.np.object,
'site_name': pd.np.int64,
'posa_continent': pd.np.int64,
'user_location_country': pd.np.int64,
'user_location_region': pd.np.int64,
'user_location_city': pd.np.int64,
'orig_destination_distance': pd.np.float64,
'user_id': pd.np.int64,
'is_mobile': pd.np.int64,
'is_package': pd.np.int64,
'channel': pd.np.int64,
'srch_ci': pd.np.object,
'srch_co': pd.np.object,
'srch_adults_cnt': pd.np.int64,
'srch_children_cnt': pd.np.int64,
'srch_rm_cnt': pd.np.int64,
'srch_destination_id': pd.np.int64,
'srch_destination_type_id': pd.np.int64,
'is_booking': pd.np.int64,
'cnt': pd.np.int64,
'hotel_continent': pd.np.int64,
'hotel_country': pd.np.int64,
'hotel_market': pd.np.int64,
'hotel_cluster': pd.np.int64}

test_dtypes = {'id': pd.np.int64,
'date_time': pd.np.object,
'site_name': pd.np.int64,
'posa_continent': pd.np.int64,
'user_location_country': pd.np.int64,
'user_location_region': pd.np.int64,
'user_location_city': pd.np.int64,
'orig_destination_distance': pd.np.float64,
'user_id': pd.np.int64,
'is_mobile': pd.np.int64,
'is_package': pd.np.int64,
'channel': pd.np.int64,
'srch_ci': pd.np.object,
'srch_co': pd.np.object,
'srch_adults_cnt': pd.np.int64,
'srch_children_cnt': pd.np.int64,
'srch_rm_cnt': pd.np.int64,
'srch_destination_id': pd.np.int64,
'srch_destination_type_id': pd.np.int64,
'hotel_continent': pd.np.int64,
'hotel_country': pd.np.int64,
'hotel_market': pd.np.int64}

CPU times: user 21 µs, sys: 9 µs, total: 30 µs
Wall time: 26 µs


In [5]:
# %%time
test = pd.read_csv('test.csv', dtype=test_dtypes)
train = pd.read_csv('train.csv', dtype=train_dtypes, iterator=True, chunksize=100000)
# train = pd.concat([chunk[chunk['user_location_city'].isin(np.unique(test.user_location_city))] for chunk in train], ignore_index=True)
# train = pd.concat([chunk[chunk['is_booking'] == 1] for chunk in train], ignore_index=True)
# train = pd.concat([chunk for chunk in train], ignore_index=True)
train = pd.concat([chunk[(chunk['user_id'].isin(np.unique(test.user_id))) & (chunk['is_booking'] == 1)] for chunk in train], ignore_index=True)
# train = train[train['user_id'].isin(np.unique(test.user_id))]
destinations = pd.read_csv('destinations.csv')

CPU times: user 3min 8s, sys: 45.3 s, total: 3min 53s
Wall time: 3min 55s


In [6]:
print len(train), len(test)

2968785 2528243


In [7]:
# %%time
destination_ids = destinations['srch_destination_id']
destinations = destinations.drop(['srch_destination_id'], 1)
pca = PCA(n_components=11, whiten=True)
destinations = pca.fit_transform(destinations)
destinations = pd.DataFrame(destinations)
destinations['srch_destination_id'] = destination_ids

CPU times: user 1.22 s, sys: 157 ms, total: 1.38 s
Wall time: 957 ms


In [8]:
# %%time
# train['id'] = [i for i in range(0, len(train))]
train['orig_destination_distance'] = train['orig_destination_distance'].fillna(-1)
train['date_time'] = pd.to_datetime(train['date_time'], errors='coerce')
train['srch_ci'] = pd.to_datetime(train['srch_ci'], errors='coerce')
train['srch_co'] = pd.to_datetime(train['srch_co'], errors='coerce')
train['activity_month'] = train['date_time'].fillna(-1).dt.month.astype(int)
train['activity_year'] = train['date_time'].fillna(-1).dt.year.astype(int)
train['activity_dow'] = train['date_time'].fillna(-1).dt.dayofweek.astype(int)
train['activity_day'] = train['date_time'].fillna(-1).dt.day.astype(int)
train['activity_quarter'] = train['date_time'].fillna(-1).dt.quarter.astype(int)
train['checkin_month'] = train['srch_ci'].fillna(-1).dt.month.astype(int)
train['checkin_year'] = train['srch_ci'].fillna(-1).dt.year.astype(int)
train['checkin_dow'] = train['srch_ci'].fillna(-1).dt.dayofweek.astype(int)
train['checkin_day'] = train['srch_ci'].fillna(-1).dt.day.astype(int)
train['checkin_quarter'] = train['srch_ci'].fillna(-1).dt.quarter.astype(int)
train['checkout_month'] = train['srch_co'].fillna(-1).dt.month.astype(int)
train['checkout_year'] = train['srch_co'].fillna(-1).dt.year.astype(int)
train['checkout_dow'] = train['srch_co'].fillna(-1).dt.dayofweek.astype(int)
train['checkout_day'] = train['srch_co'].fillna(-1).dt.day.astype(int)
train['checkout_quarter'] = train['srch_co'].fillna(-1).dt.quarter.astype(int)
train['stay_length'] = (train['srch_co'] - train['srch_ci']).astype(int)

CPU times: user 5.95 s, sys: 530 ms, total: 6.48 s
Wall time: 6.47 s


In [9]:
# %%time
test['orig_destination_distance'] = test['orig_destination_distance'].fillna(-1)
test['date_time'] = pd.to_datetime(test['date_time'], errors='coerce')
test['srch_ci'] = pd.to_datetime(test['srch_ci'], errors='coerce')
test['srch_co'] = pd.to_datetime(test['srch_co'], errors='coerce')
test['activity_month'] = test['date_time'].fillna(-1).dt.month.astype(int)
test['activity_year'] = test['date_time'].fillna(-1).dt.year.astype(int)
test['activity_dow'] = test['date_time'].fillna(-1).dt.dayofweek.astype(int)
test['activity_day'] = test['date_time'].fillna(-1).dt.day.astype(int)
test['activity_quarter'] = test['date_time'].fillna(-1).dt.quarter.astype(int)
test['checkin_month'] = test['srch_ci'].fillna(-1).dt.month.astype(int)
test['checkin_year'] = test['srch_ci'].fillna(-1).dt.year.astype(int)
test['checkin_dow'] = test['srch_ci'].fillna(-1).dt.dayofweek.astype(int)
test['checkin_day'] = test['srch_ci'].fillna(-1).dt.day.astype(int)
test['checkin_quarter'] = test['srch_ci'].fillna(-1).dt.quarter.astype(int)
test['checkout_month'] = test['srch_co'].fillna(-1).dt.month.astype(int)
test['checkout_year'] = test['srch_co'].fillna(-1).dt.year.astype(int)
test['checkout_dow'] = test['srch_co'].fillna(-1).dt.dayofweek.astype(int)
test['checkout_day'] = test['srch_co'].fillna(-1).dt.day.astype(int)
test['checkout_quarter'] = test['srch_co'].fillna(-1).dt.quarter.astype(int)
test['stay_length'] = (test['srch_co'] - test['srch_ci']).astype(int)

CPU times: user 4.75 s, sys: 511 ms, total: 5.26 s
Wall time: 5.26 s


In [10]:
# %%time
train = pd.merge(train, destinations, how='left')
train.fillna(-1, inplace=True)

CPU times: user 2.02 s, sys: 3.84 s, total: 5.86 s
Wall time: 6.77 s


In [11]:
# %%time
test = pd.merge(test, destinations, how='left')
test.fillna(-1, inplace=True)

CPU times: user 1.71 s, sys: 2.21 s, total: 3.92 s
Wall time: 4.81 s


In [20]:
# %%time
features = [c for c in train.columns if c not in ['id', 'is_booking', 'cnt', 'hotel_cluster', 'date_time', 'srch_ci', 'srch_co']]

CPU times: user 79 µs, sys: 72 µs, total: 151 µs
Wall time: 149 µs


In [None]:
print len(np.unique(train.user_id)), len(np.unique(test.user_id))

In [None]:
# %%time
features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(train[features], train['hotel_cluster'], test_size=0.50)
features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(features_train, labels_train, test_size=0.60)

In [22]:
# %%time
# neigh = neighbors.KNeighborsClassifier(weights='distance', n_jobs=-1).fit(train[features], train['hotel_cluster'])
forest = ensemble.RandomForestClassifier(n_estimators=10, n_jobs=-1).fit(train[features], train['hotel_cluster'])
# bayes = naive_bayes.GaussianNB().fit(train[features], train['hotel_cluster'])

CPU times: user 8min 53s, sys: 51.9 s, total: 9min 45s
Wall time: 3min 30s


In [23]:
# %%time
probs = pd.DataFrame(forest.predict_proba(test[features]))
probs.columns = np.unique(train['hotel_cluster'].sort_values().values)
# probs.columns = np.unique(labels_train.sort_values().values)
# probs.columns = np.unique(labels_train.values)
preds = pd.DataFrame([list([r.sort_values(ascending=False)[:5].index.values]) for i,r in probs.iterrows()])
# print mapk([[l] for l in labels_test], preds[0], 5)

CPU times: user 11min 13s, sys: 3min 31s, total: 14min 44s
Wall time: 13min 27s


In [24]:
# %%time
submission = pd.DataFrame()
submission['id'] = test['id']
submission['hotel_cluster'] = [' '.join(str(x) for x in y) for y in preds.values]
submission.sort_values(by='id', inplace=True)

CPU times: user 7min 24s, sys: 8.29 s, total: 7min 33s
Wall time: 7min 38s


In [27]:
# %%time
submission.head()

CPU times: user 223 µs, sys: 0 ns, total: 223 µs
Wall time: 228 µs


Unnamed: 0,id,hotel_cluster
0,0,[37 55 11 35 5]
1,1,[22 28 30 78 91]
2,2,[ 0 42 48 23 31]
3,3,[79 45 24 19 99]
4,4,[77 99 83 25 28]


In [26]:
# %%time
submission.to_csv('submission.csv', index=False)

CPU times: user 2.69 s, sys: 109 ms, total: 2.8 s
Wall time: 2.9 s
