In [1]:
%%time
import numpy as np

def apk(actual, predicted, k=5):
    """
    Source: https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/average_precision.py
    Computes the average precision at k.
    This function computes the average precision at k between two lists of
    items.
    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists
    """
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if score <= 0:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=5):
    """
    Computes the mean average precision at k.
    This function computes the mean average prescision at k between two lists
    of lists of items.
    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The mean average precision at k over the input lists
    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

CPU times: user 59.4 ms, sys: 59.6 ms, total: 119 ms
Wall time: 401 ms


In [2]:
%%time
import pandas as pd
from sklearn import cross_validation
#from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.linear_model import LogisticRegression
#from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest
#from sklearn.feature_selection import chi2
#from sklearn.naive_bayes import GaussianNB
from sklearn import tree
#from sklearn.svm import SVC
from sklearn import ensemble
#from sklearn.naive_bayes import BernoulliNB
#from datetime import datetime
#from blaze import Data
#import dask.dataframe as dd
#import dask.array as da

CPU times: user 947 ms, sys: 375 ms, total: 1.32 s
Wall time: 3.19 s


In [3]:
%%time
train_dtypes = {'date_time': pd.np.object,
'site_name': pd.np.int64,
'posa_continent': pd.np.int64,
'user_location_country': pd.np.int64,
'user_location_region': pd.np.int64,
'user_location_city': pd.np.int64,
'orig_destination_distance': pd.np.float64,
'user_id': pd.np.int64,
'is_mobile': pd.np.int64,
'is_package': pd.np.int64,
'channel': pd.np.int64,
'srch_ci': pd.np.object,
'srch_co': pd.np.object,
'srch_adults_cnt': pd.np.int64,
'srch_children_cnt': pd.np.int64,
'srch_rm_cnt': pd.np.int64,
'srch_destination_id': pd.np.int64,
'srch_destination_type_id': pd.np.int64,
'is_booking': pd.np.int64,
'cnt': pd.np.int64,
'hotel_continent': pd.np.int64,
'hotel_country': pd.np.int64,
'hotel_market': pd.np.int64,
'hotel_cluster': pd.np.int64}

CPU times: user 12 µs, sys: 4 µs, total: 16 µs
Wall time: 18.8 µs


In [4]:
%%time
all_train = pd.read_csv('train.csv', dtype=train_dtypes, iterator=True, chunksize=1000)
#iter_csv = pandas.read_csv('file.csv', iterator=True, chunksize=1000)
all_train = pd.concat([chunk[chunk['is_booking'] == 1] for chunk in all_train])
#temp_train = pd.read_csv('train.csv', dtype=train_dtypes, iterator=True, chunksize=1000)
#all_train = pd.concat(temp_train, ignore_index=True)

CPU times: user 3min 20s, sys: 32.5 s, total: 3min 53s
Wall time: 3min 54s


In [5]:
%%time
all_train['orig_destination_distance'] = all_train['orig_destination_distance'].fillna(-1)
all_train['date_time'] = pd.to_datetime(all_train['date_time'], errors='coerce')
all_train['srch_ci'] = pd.to_datetime(all_train['srch_ci'], errors='coerce')
all_train['srch_co'] = pd.to_datetime(all_train['srch_co'], errors='coerce')
all_train['activity_month'] = all_train['date_time'].fillna(-1).dt.month.astype(int)
all_train['checkin_month'] = all_train['srch_ci'].fillna(-1).dt.month.astype(int)
all_train['checkout_month'] = all_train['srch_co'].fillna(-1).dt.month.astype(int)
#Split groups into two different classifiers for destinations vs. no destinations

CPU times: user 3.01 s, sys: 252 ms, total: 3.26 s
Wall time: 3.27 s


In [6]:
%%time
#features = all_train[['activity_month', 'user_location_country', 'user_location_region', 'user_location_city', 'hotel_market', 'orig_destination_distance', 'checkin_month', 'checkout_month']]
features = all_train.drop(['is_booking', 'cnt', 'user_id', 'hotel_cluster', 'date_time', 'srch_ci', 'srch_co'],1)
labels = all_train['hotel_cluster']
features = features.reindex_axis(sorted(features.columns), axis=1)
#features = features.values

CPU times: user 735 ms, sys: 1.4 s, total: 2.14 s
Wall time: 2.26 s


In [7]:
all_scores = []

In [8]:
%%time
for i in range(0, 20):
    features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(features, labels, test_size=0.5)
    clf = tree.DecisionTreeClassifier(min_samples_split=500)
    clf = clf.fit(features_train, labels_train.values.ravel())
    pred_probs = pd.DataFrame(clf.predict_proba(features_test))
    pred_probs = pd.DataFrame([list([r.sort_values(ascending=False)[:3].index.values]) for i,r in pred_probs.iterrows()])
    labels_test_df = pd.DataFrame(labels_test.values, index=range(0, len(labels_test)))
    test_score = mapk(labels_test_df.values, pred_probs.values)
    all_scores.append(test_score)
all_scores = np.asarray(all_scores)
all_scores.mean()



CPU times: user 2h 15min 40s, sys: 1min 22s, total: 2h 17min 3s
Wall time: 2h 17min 5s


In [None]:
# %%time
features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(features, labels, test_size=0.5)
# selector = SelectKBest(k=10)
# selector.fit(features_train, labels_train.values.ravel())
# features_train = selector.transform(features_train)

In [None]:
%%time
#clf = LogisticRegression(tol=0.1)
clf = tree.DecisionTreeClassifier(min_samples_split=500)
#clf = GaussianNB()
#clf = ensemble.AdaBoostClassifier()
#clf = ensemble.AdaBoostClassifier(SVC(probability=True, kernel='linear'),n_estimators=10)
#clf = ensemble.GradientBoostingClassifier(SVC(probability=True, kernel='linear'),n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
#clf = ensemble.GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0, verbose=3)
#clf = ensemble.RandomForestClassifier()
#clf - BernoulliNB()
clf = clf.fit(features_train, labels_train.values.ravel())

In [None]:
# %%time
# features_test = selector.transform(features_test)

In [None]:
%%time
pred_probs = pd.DataFrame(clf.predict_proba(features_test))

In [None]:
%%time
pred_probs = pd.DataFrame([list([r.sort_values(ascending=False)[:5].index.values]) for i,r in pred_probs.iterrows()])

In [None]:
%%time
labels_test_df = pd.DataFrame(labels_test.values, index=range(0, len(labels_test)))

In [None]:
%%time
test_score = mapk(labels_test_df.values, pred_probs.values)
print test_score

In [None]:
#print accuracy_score(pred, labels_test.values.ravel())

In [13]:
%%time
test_dtypes = {'id': pd.np.int64,
'date_time': pd.np.object,
'site_name': pd.np.int64,
'posa_continent': pd.np.int64,
'user_location_country': pd.np.int64,
'user_location_region': pd.np.int64,
'user_location_city': pd.np.int64,
'orig_destination_distance': pd.np.float64,
'user_id': pd.np.int64,
'is_mobile': pd.np.int64,
'is_package': pd.np.int64,
'channel': pd.np.int64,
'srch_ci': pd.np.object,
'srch_co': pd.np.object,
'srch_adults_cnt': pd.np.int64,
'srch_children_cnt': pd.np.int64,
'srch_rm_cnt': pd.np.int64,
'srch_destination_id': pd.np.int64,
'srch_destination_type_id': pd.np.int64,
'hotel_continent': pd.np.int64,
'hotel_country': pd.np.int64,
'hotel_market': pd.np.int64}

CPU times: user 13 µs, sys: 5 µs, total: 18 µs
Wall time: 16.9 µs


In [14]:
%%time
all_test = pd.read_csv('test.csv', dtype=test_dtypes)

CPU times: user 7.25 s, sys: 1.62 s, total: 8.88 s
Wall time: 9.1 s


In [15]:
%%time
all_test['orig_destination_distance'] = all_test['orig_destination_distance'].fillna(-1)
all_test['date_time'] = pd.to_datetime(all_test['date_time'], errors='coerce')
all_test['srch_ci'] = pd.to_datetime(all_test['srch_ci'], errors='coerce')
all_test['srch_co'] = pd.to_datetime(all_test['srch_co'], errors='coerce')
all_test['activity_month'] = all_test['date_time'].fillna(-1).dt.month.astype(int)
all_test['checkin_month'] = all_test['srch_ci'].fillna(-1).dt.month.astype(int)
all_test['checkout_month'] = all_test['srch_co'].fillna(-1).dt.month.astype(int)

CPU times: user 2.24 s, sys: 160 ms, total: 2.41 s
Wall time: 2.41 s


In [16]:
%%time
#testing_features = all_test[['activity_month', 'user_location_country', 'user_location_region', 'user_location_city', 'hotel_market', 'orig_destination_distance', 'checkin_month', 'checkout_month']]
testing_features = all_test.drop(['user_id', 'id', 'date_time', 'srch_ci', 'srch_co'],1)
testing_features = testing_features.reindex_axis(sorted(testing_features.columns), axis=1)
#testing_features = testing_features.values

CPU times: user 590 ms, sys: 762 ms, total: 1.35 s
Wall time: 1.62 s


In [None]:
# %%time
# testing_list = selector.transform(testing_features)

In [17]:
%%time
#feature_test_file = selector.transform(all_test)
#pred = clf.predict(feature_test_list)
test_probs = pd.DataFrame(clf.predict_proba(testing_features))

CPU times: user 2.62 s, sys: 2.79 s, total: 5.41 s
Wall time: 6.31 s


In [18]:
%%time
test_probs = pd.Series([(i, r.sort_values(ascending=False)[:5].index.values) for i,r in test_probs.iterrows()])
test_probs = test_probs.values

CPU times: user 10min 25s, sys: 6.96 s, total: 10min 32s
Wall time: 10min 36s


In [19]:
%%time
indices = [a for a,b in test_probs]
values = [b for a,b in test_probs]

CPU times: user 860 ms, sys: 359 ms, total: 1.22 s
Wall time: 1.3 s


In [20]:
%%time
submission = pd.DataFrame()
submission['id'] = indices
submission['hotel_cluster'] = [' '.join(str(x) for x in y) for y in values]

CPU times: user 25.9 s, sys: 3 s, total: 28.9 s
Wall time: 29.3 s


In [21]:
%%time
submission.to_csv('submission.csv', index=False)

CPU times: user 2.76 s, sys: 129 ms, total: 2.89 s
Wall time: 2.99 s
