In [None]:
import numpy as np

def apk(actual, predicted, k=5):
    """
    Computes the average precision at k.
    This function computes the average prescision at k between two lists of
    items.
    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists
    """
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=5):
    """
    Computes the mean average precision at k.
    This function computes the mean average prescision at k between two lists
    of lists of items.
    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The mean average precision at k over the input lists
    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

In [1]:
%%time
import pandas as pd
#from sklearn import cross_validation
#from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.linear_model import LogisticRegression
#from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest
#from sklearn.feature_selection import chi2
#from sklearn.naive_bayes import GaussianNB
from sklearn import tree
#from sklearn.svm import SVC
from sklearn import ensemble
#from sklearn.naive_bayes import BernoulliNB
#from datetime import datetime
#from blaze import Data
#import dask.dataframe as dd
#import dask.array as da

CPU times: user 1.12 s, sys: 536 ms, total: 1.66 s
Wall time: 4.25 s


In [2]:
%%time
train_dtypes = {'date_time': pd.np.object,
'site_name': pd.np.int64,
'posa_continent': pd.np.int64,
'user_location_country': pd.np.int64,
'user_location_region': pd.np.int64,
'user_location_city': pd.np.int64,
'orig_destination_distance': pd.np.float64,
'user_id': pd.np.int64,
'is_mobile': pd.np.int64,
'is_package': pd.np.int64,
'channel': pd.np.int64,
'srch_ci': pd.np.object,
'srch_co': pd.np.object,
'srch_adults_cnt': pd.np.int64,
'srch_children_cnt': pd.np.int64,
'srch_rm_cnt': pd.np.int64,
'srch_destination_id': pd.np.int64,
'srch_destination_type_id': pd.np.int64,
'is_booking': pd.np.int64,
'cnt': pd.np.int64,
'hotel_continent': pd.np.int64,
'hotel_country': pd.np.int64,
'hotel_market': pd.np.int64,
'hotel_cluster': pd.np.int64}

CPU times: user 12 µs, sys: 4 µs, total: 16 µs
Wall time: 16.9 µs


In [3]:
%%time
all_train = pd.read_csv('train.csv', dtype=train_dtypes, iterator=True, chunksize=1000)
#iter_csv = pandas.read_csv('file.csv', iterator=True, chunksize=1000)
all_train = pd.concat([chunk[chunk['is_booking'] == 1] for chunk in all_train])
#temp_train = pd.read_csv('train.csv', dtype=train_dtypes, iterator=True, chunksize=1000)
#all_train = pd.concat(temp_train, ignore_index=True)

CPU times: user 3min 25s, sys: 36 s, total: 4min 1s
Wall time: 4min 2s


In [4]:
%%time
all_train['orig_destination_distance'] = all_train['orig_destination_distance'].fillna(-1)
all_train['date_time'] = pd.to_datetime(all_train['date_time'], errors='coerce')
all_train['srch_ci'] = pd.to_datetime(all_train['srch_ci'], errors='coerce')
all_train['srch_co'] = pd.to_datetime(all_train['srch_co'], errors='coerce')
all_train['activity_month'] = all_train['date_time'].fillna(-1).dt.month.astype(int)
all_train['checkin_month'] = all_train['srch_ci'].fillna(-1).dt.month.astype(int)
all_train['checkout_month'] = all_train['srch_co'].fillna(-1).dt.month.astype(int)
#Potential features - What percentage of each user?
#Split groups into two different classifiers for destinations vs. no destinations

CPU times: user 3.14 s, sys: 281 ms, total: 3.42 s
Wall time: 3.47 s


In [5]:
%%time
features = all_train[['activity_month', 'user_location_country', 'user_location_region', 'user_location_city', 'hotel_market', 'orig_destination_distance', 'checkin_month', 'checkout_month']]
labels = all_train['hotel_cluster']
features = features.reindex_axis(sorted(features.columns), axis=1)
#features = features.values

CPU times: user 596 ms, sys: 1.31 s, total: 1.91 s
Wall time: 2.18 s


In [6]:
%%time
#features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(features, labels, test_size=0.25, random_state=42)
selector = SelectKBest(k=8)
selector.fit(features, labels.values.ravel())
feature_list = selector.transform(features)

CPU times: user 1.91 s, sys: 524 ms, total: 2.43 s
Wall time: 2.48 s


In [7]:
%%time
#clf = LogisticRegression(tol=0.1)
clf = tree.DecisionTreeClassifier(min_samples_split=500)
#clf = GaussianNB()
#clf = ensemble.AdaBoostClassifier()
#clf = ensemble.AdaBoostClassifier(SVC(probability=True, kernel='linear'),n_estimators=10)
#clf = ensemble.GradientBoostingClassifier(SVC(probability=True, kernel='linear'),n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
#clf = ensemble.GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0, verbose=3)
#clf = ensemble.RandomForestClassifier()
#clf - BernoulliNB()
clf = clf.fit(feature_list, labels.values.ravel())

CPU times: user 39.1 s, sys: 221 ms, total: 39.3 s
Wall time: 39.4 s


In [None]:
#print accuracy_score(pred, labels_test.values.ravel())

In [8]:
%%time
test_dtypes = {'id': pd.np.int64,
'date_time': pd.np.object,
'site_name': pd.np.int64,
'posa_continent': pd.np.int64,
'user_location_country': pd.np.int64,
'user_location_region': pd.np.int64,
'user_location_city': pd.np.int64,
'orig_destination_distance': pd.np.float64,
'user_id': pd.np.int64,
'is_mobile': pd.np.int64,
'is_package': pd.np.int64,
'channel': pd.np.int64,
'srch_ci': pd.np.object,
'srch_co': pd.np.object,
'srch_adults_cnt': pd.np.int64,
'srch_children_cnt': pd.np.int64,
'srch_rm_cnt': pd.np.int64,
'srch_destination_id': pd.np.int64,
'srch_destination_type_id': pd.np.int64,
'hotel_continent': pd.np.int64,
'hotel_country': pd.np.int64,
'hotel_market': pd.np.int64}

CPU times: user 10 µs, sys: 2 µs, total: 12 µs
Wall time: 14.1 µs


In [9]:
%%time
all_test = pd.read_csv('test.csv', dtype=test_dtypes)

CPU times: user 7.31 s, sys: 1.44 s, total: 8.75 s
Wall time: 9 s


In [10]:
%%time
all_test['orig_destination_distance'] = all_test['orig_destination_distance'].fillna(-1)
all_test['date_time'] = pd.to_datetime(all_test['date_time'], errors='coerce')
all_test['srch_ci'] = pd.to_datetime(all_test['srch_ci'], errors='coerce')
all_test['srch_co'] = pd.to_datetime(all_test['srch_co'], errors='coerce')
all_test['activity_month'] = all_test['date_time'].fillna(-1).dt.month.astype(int)
all_test['checkin_month'] = all_test['srch_ci'].fillna(-1).dt.month.astype(int)
all_test['checkout_month'] = all_test['srch_co'].fillna(-1).dt.month.astype(int)
#all_test['is_booking'] = 1

CPU times: user 2.3 s, sys: 101 ms, total: 2.4 s
Wall time: 2.41 s


In [11]:
%%time
testing_features = all_test[['activity_month', 'user_location_country', 'user_location_region', 'user_location_city', 'hotel_market', 'orig_destination_distance', 'checkin_month', 'checkout_month']]
testing_features = testing_features.reindex_axis(sorted(testing_features.columns), axis=1)
#testing_features = testing_features.values

In [12]:
%%time
testing_list = selector.transform(testing_features)

CPU times: user 106 ms, sys: 110 ms, total: 216 ms
Wall time: 215 ms


In [13]:
%%time
#feature_test_file = selector.transform(all_test)
#pred = clf.predict(feature_test_list)
probs = pd.DataFrame(clf.predict_proba(testing_list))

CPU times: user 2.64 s, sys: 3.7 s, total: 6.33 s
Wall time: 7.24 s


In [14]:
%%time
probs_series = pd.Series([(i, r.sort_values(ascending=False)[:5].index.values) for i,r in probs.iterrows()])
probs_series = probs_series.values

CPU times: user 10min 14s, sys: 6.78 s, total: 10min 20s
Wall time: 10min 23s


In [15]:
%%time
indices = [a for a,b in probs_series]
values = [b for a,b in probs_series]

CPU times: user 924 ms, sys: 203 ms, total: 1.13 s
Wall time: 1.16 s


In [16]:
%%time
submission = pd.DataFrame()
submission['id'] = indices
submission['hotel_cluster'] = [' '.join(str(x) for x in y) for y in values]

CPU times: user 25.9 s, sys: 2.52 s, total: 28.4 s
Wall time: 28.7 s


In [17]:
%%time
submission.to_csv('submission.csv', index=False)

CPU times: user 2.77 s, sys: 118 ms, total: 2.89 s
Wall time: 2.92 s
