In [1]:
%%time
import pandas as pd
#from sklearn import cross_validation
from sklearn import ensemble
from sklearn.decomposition import RandomizedPCA
#from sklearn.linear_model import LogisticRegression
#from sklearn.feature_selection import SelectKBest
#from sklearn.naive_bayes import GaussianNB
from sklearn import tree
#from sklearn.svm import SVC
#from sklearn.naive_bayes import BernoulliNB
#from datetime import datetime

CPU times: user 1.03 s, sys: 619 ms, total: 1.65 s
Wall time: 4.45 s


In [2]:
%%time
import numpy as np

def apk(actual, predicted, k=5):
    """
    Average precision at k for one pair of lists.

    Adapted from:
    https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/average_precision.py

    Parameters
    ----------
    actual : list
        The relevant items (their order is irrelevant).
    predicted : list
        Ranked predictions, best first. Only the first ``k`` are scored.
    k : int, optional
        Cut-off rank.

    Returns
    -------
    score : double
        Average precision at ``k``; 0.0 when none of the scored
        predictions is relevant.
    """
    top_k = predicted[:k] if len(predicted) > k else predicted

    hits = 0.0
    precision_sum = 0.0
    for rank, candidate in enumerate(top_k, start=1):
        # A prediction repeated further down the ranking is only credited once.
        if candidate in actual and candidate not in top_k[:rank - 1]:
            hits += 1.0
            precision_sum += hits / rank

    if precision_sum > 0:
        return precision_sum / min(len(actual), k)
    return 0.0

def mapk(actual, predicted, k=5):
    """
    Mean average precision at k over many queries.

    Each entry of ``actual`` is paired with the entry of ``predicted`` at
    the same position, the pair is scored with ``apk`` and the scores are
    averaged with ``np.mean``.

    Parameters
    ----------
    actual : list
        A list of lists of relevant items (order inside each list is
        irrelevant).
    predicted : list
        A list of lists of ranked predictions (order inside each list
        matters).
    k : int, optional
        Cut-off rank handed to ``apk``.

    Returns
    -------
    score : double
        The mean of the per-pair average precisions at ``k``.
    """
    per_query_scores = []
    for relevant, ranked in zip(actual, predicted):
        per_query_scores.append(apk(relevant, ranked, k))
    return np.mean(per_query_scores)

CPU times: user 13 µs, sys: 7 µs, total: 20 µs
Wall time: 21.9 µs


In [3]:
%%time
# Explicit column dtypes for train.csv, handed to pd.read_csv(dtype=...).
# The three date columns are read as plain objects and parsed with
# pd.to_datetime in a later cell.
# Types are spelled via the notebook's own `np` import and the builtin
# `object`: the `pd.np` alias and `np.object` no longer exist in current
# pandas / NumPy releases, while these spellings work on old and new versions.
train_dtypes = {
    'date_time': object,
    'site_name': np.int64,
    'posa_continent': np.int64,
    'user_location_country': np.int64,
    'user_location_region': np.int64,
    'user_location_city': np.int64,
    'orig_destination_distance': np.float64,
    'user_id': np.int64,
    'is_mobile': np.int64,
    'is_package': np.int64,
    'channel': np.int64,
    'srch_ci': object,
    'srch_co': object,
    'srch_adults_cnt': np.int64,
    'srch_children_cnt': np.int64,
    'srch_rm_cnt': np.int64,
    'srch_destination_id': np.int64,
    'srch_destination_type_id': np.int64,
    'is_booking': np.int64,
    'cnt': np.int64,
    'hotel_continent': np.int64,
    'hotel_country': np.int64,
    'hotel_market': np.int64,
    'hotel_cluster': np.int64,
}

CPU times: user 14 µs, sys: 4 µs, total: 18 µs
Wall time: 17.9 µs


In [4]:
%%time
# Load the whole training set in a single call, the same way test.csv is
# loaded further down. Reading it in 1000-row chunks and then concatenating
# every chunk yields the same frame (dtypes are fixed by train_dtypes and the
# index is 0..n-1 either way) but builds one small DataFrame per 1000 rows
# plus a full-size concat on top of them.
all_train = pd.read_csv('train.csv', dtype=train_dtypes)

CPU times: user 3min 45s, sys: 7min 43s, total: 11min 29s
Wall time: 12min 24s


In [5]:
%%time
# Sequential row id; used later to tell apart the rows that did / did not
# survive the merge with the destination features.
all_train['id'] = np.arange(len(all_train))

# Missing distances get an explicit -1 sentinel.
all_train['orig_destination_distance'] = all_train['orig_destination_distance'].fillna(-1)

# Parse the date columns; values that cannot be parsed become NaT.
all_train['date_time'] = pd.to_datetime(all_train['date_time'], errors='coerce')
all_train['srch_ci'] = pd.to_datetime(all_train['srch_ci'], errors='coerce')
all_train['srch_co'] = pd.to_datetime(all_train['srch_co'], errors='coerce')

# Month features: take the month first, THEN fill the gaps with -1.
# Filling the datetime column itself with -1 never produces a -1 month --
# depending on the pandas version the -1 is coerced into a timestamp or the
# column stops being datetime-typed and `.dt` fails.
all_train['activity_month'] = all_train['date_time'].dt.month.fillna(-1).astype(int)
all_train['checkin_month'] = all_train['srch_ci'].dt.month.fillna(-1).astype(int)
all_train['checkout_month'] = all_train['srch_co'].dt.month.fillna(-1).astype(int)
#Split groups into two different classifiers for destinations vs. no destinations

CPU times: user 45.3 s, sys: 10.7 s, total: 56 s
Wall time: 1min 2s


In [15]:
%%time
# Load the destinations table. Later cells use its srch_destination_id
# column as a join key and feed the remaining columns to PCA.
destinations = pd.read_csv('destinations.csv')

CPU times: user 2.13 s, sys: 157 ms, total: 2.29 s
Wall time: 2.31 s


In [16]:
%%time
# Split the destinations table into its key column and everything else.
destination_ids = destinations['srch_destination_id']
# `axis` is passed by keyword: the positional form is rejected by current
# pandas, the keyword form works on old and new versions alike.
destination_ds = destinations.drop(['srch_destination_id'], axis=1)

CPU times: user 18.4 ms, sys: 13.1 ms, total: 31.5 ms
Wall time: 31.4 ms


In [20]:
%%time
# Fit a one-component, whitened randomized PCA on the destination columns
# (everything in the destinations table except the srch_destination_id key).
# The solver is stochastic, so random_state is pinned to make the resulting
# latent feature reproducible across kernel restarts.
pca = RandomizedPCA(n_components=1, whiten=True, random_state=0).fit(destination_ds)

CPU times: user 646 ms, sys: 120 ms, total: 766 ms
Wall time: 363 ms


In [21]:
%%time
# Project every destination row with the PCA fitted in the previous cell
# (n_components=1 there, so one value per destination).
destinations_pca = pca.transform(destination_ds)

CPU times: user 41.4 ms, sys: 15.6 ms, total: 57 ms
Wall time: 50.7 ms


In [22]:
%%time
# Lookup table: srch_destination_id -> latent PCA feature.
# Start from the id column as a one-column frame, then attach the projection.
destinations_df = destination_ids.to_frame(name='srch_destination_id')
destinations_df['latent_destinations'] = destinations_pca

CPU times: user 7.95 ms, sys: 1.09 ms, total: 9.04 ms
Wall time: 8.05 ms


In [23]:
%%time
# Training rows whose destination has a latent feature: inner join on the
# only column the two frames share.
with_dest_match = all_train.merge(destinations_df, how='inner', on='srch_destination_id')

CPU times: user 17.4 s, sys: 22.2 s, total: 39.5 s
Wall time: 1min 1s


In [24]:
%%time
# Training rows that did NOT survive the destination merge, identified by
# their row id being absent from the merged frame.
wo_dest_match = all_train.loc[~all_train['id'].isin(with_dest_match['id'])]

CPU times: user 23.8 s, sys: 40.5 s, total: 1min 4s
Wall time: 1min 41s


In [25]:
%%time
# Columns removed before fitting: row/user identifiers, the target itself,
# booking bookkeeping columns and the raw date columns.
non_feature_cols = ['id', 'is_booking', 'cnt', 'user_id', 'hotel_cluster',
                    'date_time', 'srch_ci', 'srch_co']

# `axis=1` by keyword: the positional form is rejected by current pandas.
with_features = with_dest_match.drop(non_feature_cols, axis=1)
with_labels = with_dest_match['hotel_cluster']
wo_features = wo_dest_match.drop(non_feature_cols, axis=1)
wo_labels = wo_dest_match['hotel_cluster']

# Put the feature columns in alphabetical order so the column layout is
# deterministic. reindex(columns=...) replaces reindex_axis, which has been
# removed from pandas; the result is the same.
with_features = with_features.reindex(columns=sorted(with_features.columns))
wo_features = wo_features.reindex(columns=sorted(wo_features.columns))

CPU times: user 13.8 s, sys: 51.3 s, total: 1min 5s
Wall time: 1min 54s


In [None]:
# %%time
# pca_wo = RandomizedPCA(n_components=20, whiten=True).fit(wo_features)

In [None]:
# %%time
# wo_train_features = pca_wo.transform(wo_features)

In [26]:
%%time
# One random forest per subset of the training data: rows that matched a
# destination (and therefore carry the extra latent_destinations feature)
# and rows that did not.
# random_state pins the forests' internal sampling so that re-running the
# notebook reproduces the same models and the same submission.
# Previously tried in this cell: LogisticRegression, GaussianNB, AdaBoost,
# GradientBoosting, single decision trees, BernoulliNB.
clf_with = ensemble.RandomForestClassifier(
    n_estimators=20, min_samples_split=500, n_jobs=2, random_state=0
).fit(with_features, with_labels.values.ravel())
clf_wo = ensemble.RandomForestClassifier(
    n_estimators=20, min_samples_split=500, n_jobs=2, random_state=0
).fit(wo_features, wo_labels.values.ravel())

CPU times: user 51min 7s, sys: 2min 56s, total: 54min 3s
Wall time: 29min 29s


In [None]:
# all_scores = []
# %%time
# for i in range(0, 20):
#     features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(features, labels, test_size=0.5)
#     clf = tree.DecisionTreeClassifier(min_samples_split=500)
#     clf = clf.fit(features_train, labels_train.values.ravel())
#     pred_probs = pd.DataFrame(clf.predict_proba(features_test))
#     pred_probs = pd.DataFrame([list([r.sort_values(ascending=False)[:3].index.values]) for i,r in pred_probs.iterrows()])
#     labels_test_df = pd.DataFrame(labels_test.values, index=range(0, len(labels_test)))
#     test_score = mapk(labels_test_df.values, pred_probs.values)
#     all_scores.append(test_score)
# print np.mean(all_scores)

In [None]:
# %%time
#features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(features, labels, test_size=0.5)

In [None]:
%%time
# NOTE(review): neither `clf` nor `features_test` is assigned by any active
# cell visible in this notebook (their only assignments are commented out),
# so this cell raises NameError on Restart & Run All -- confirm whether the
# hold-out evaluation cells are still wanted.
# Wrap the classifier's predict_proba output for the hold-out features in a DataFrame.
pred_probs = pd.DataFrame(clf.predict_proba(features_test))

In [None]:
%%time
# Replace each row of probabilities by the column labels of its five largest
# values, best first, giving an (n_rows, 5) frame.
# The result must have one prediction per column: wrapping each top-5 array
# in an extra list stored the whole array in a single cell, so mapk saw one
# "prediction" per row and scored a hit for a label anywhere in the top 5
# instead of computing MAP@5.
# One vectorised argsort also replaces a per-row sort inside iterrows; the
# stable sort breaks ties in favour of the lower column position.
top5_positions = np.argsort(-pred_probs.values, axis=1, kind='mergesort')[:, :5]
pred_probs = pd.DataFrame(pred_probs.columns.values[top5_positions])

In [None]:
%%time
# Hold-out labels as a one-column frame with a fresh 0..n-1 index.
labels_test_df = pd.DataFrame(data=labels_test.values, index=range(len(labels_test)))

In [None]:
%%time
# Score the hold-out predictions with mapk at its default cut-off (k=5).
test_score = mapk(labels_test_df.values, pred_probs.values)
# Parenthesised form: prints the same on Python 2 and is valid on Python 3,
# where the bare print statement is a SyntaxError.
print(test_score)

In [27]:
%%time
# Explicit column dtypes for test.csv, handed to pd.read_csv(dtype=...).
# The three date columns are read as plain objects and parsed with
# pd.to_datetime in a later cell.
# Types are spelled via the notebook's own `np` import and the builtin
# `object`: the `pd.np` alias and `np.object` no longer exist in current
# pandas / NumPy releases, while these spellings work on old and new versions.
test_dtypes = {
    'id': np.int64,
    'date_time': object,
    'site_name': np.int64,
    'posa_continent': np.int64,
    'user_location_country': np.int64,
    'user_location_region': np.int64,
    'user_location_city': np.int64,
    'orig_destination_distance': np.float64,
    'user_id': np.int64,
    'is_mobile': np.int64,
    'is_package': np.int64,
    'channel': np.int64,
    'srch_ci': object,
    'srch_co': object,
    'srch_adults_cnt': np.int64,
    'srch_children_cnt': np.int64,
    'srch_rm_cnt': np.int64,
    'srch_destination_id': np.int64,
    'srch_destination_type_id': np.int64,
    'hotel_continent': np.int64,
    'hotel_country': np.int64,
    'hotel_market': np.int64,
}

CPU times: user 62 µs, sys: 454 µs, total: 516 µs
Wall time: 2.49 ms


In [28]:
%%time
# Load the test set in one call, with the column dtypes fixed by test_dtypes.
all_test = pd.read_csv('test.csv', dtype=test_dtypes)

CPU times: user 9.55 s, sys: 4.03 s, total: 13.6 s
Wall time: 25.8 s


In [29]:
%%time
# Missing distances get an explicit -1 sentinel.
all_test['orig_destination_distance'] = all_test['orig_destination_distance'].fillna(-1)

# Parse the date columns; values that cannot be parsed become NaT.
all_test['date_time'] = pd.to_datetime(all_test['date_time'], errors='coerce')
all_test['srch_ci'] = pd.to_datetime(all_test['srch_ci'], errors='coerce')
all_test['srch_co'] = pd.to_datetime(all_test['srch_co'], errors='coerce')

# Month features: take the month first, THEN fill the gaps with -1.
# Filling the datetime column itself with -1 never produces a -1 month --
# depending on the pandas version the -1 is coerced into a timestamp or the
# column stops being datetime-typed and `.dt` fails.
all_test['activity_month'] = all_test['date_time'].dt.month.fillna(-1).astype(int)
all_test['checkin_month'] = all_test['srch_ci'].dt.month.fillna(-1).astype(int)
all_test['checkout_month'] = all_test['srch_co'].dt.month.fillna(-1).astype(int)

CPU times: user 2.3 s, sys: 149 ms, total: 2.45 s
Wall time: 2.64 s


In [30]:
%%time
# Test rows whose destination has a latent feature (inner join on the one
# column the two frames share, now named explicitly).
with_dest_test = pd.merge(all_test, destinations_df, how='inner', on='srch_destination_id')
# Keep the ids aside so predictions can be matched back to rows.
with_testing_ids = with_dest_test['id']
# Drop identifiers and raw dates. `axis=1` is passed by keyword because the
# positional form is rejected by current pandas.
with_testing_features = with_dest_test.drop(
    ['user_id', 'id', 'date_time', 'srch_ci', 'srch_co'], axis=1)

CPU times: user 1.6 s, sys: 836 ms, total: 2.43 s
Wall time: 2.53 s


In [31]:
%%time
# Test rows that did NOT survive the destination merge, identified by their
# id being absent from the merged frame.
wo_dest_test = all_test[~(all_test.id.isin(with_dest_test.id))]
# Keep the ids aside so predictions can be matched back to rows.
wo_testing_ids = wo_dest_test['id']
# Drop identifiers and raw dates. `axis=1` is passed by keyword because the
# positional form is rejected by current pandas.
wo_testing_features = wo_dest_test.drop(
    ['user_id', 'id', 'date_time', 'srch_ci', 'srch_co'], axis=1)

CPU times: user 1.22 s, sys: 763 ms, total: 1.99 s
Wall time: 2.34 s


In [32]:
%%time
# Put the test feature columns in alphabetical order, i.e. the same ordering
# rule applied to the training features before fitting.
# reindex(columns=...) replaces reindex_axis, which has been removed from
# pandas; the result is the same.
with_testing_features = with_testing_features.reindex(columns=sorted(with_testing_features.columns))
wo_testing_features = wo_testing_features.reindex(columns=sorted(wo_testing_features.columns))

CPU times: user 136 ms, sys: 278 ms, total: 414 ms
Wall time: 416 ms


In [None]:
# %%time
# with_test = pca_with.transform(with_testing_features)

In [None]:
# %%time
# wo_test = pca_wo.transform(wo_testing_features)

In [None]:
%%time
# Class probabilities for both test subsets.
# predict_proba orders its columns like the fitted classifier's `classes_`,
# so the columns are labelled with `classes_`. Anything that later reads the
# column labels of the best-scoring columns then gets real hotel_cluster
# values even if a cluster is absent from one training subset; when the
# labels are exactly 0..n-1 nothing changes.
with_test_probs = pd.DataFrame(clf_with.predict_proba(with_testing_features),
                               columns=clf_with.classes_)
wo_test_probs = pd.DataFrame(clf_wo.predict_proba(wo_testing_features),
                             columns=clf_wo.classes_)

In [None]:
%%time
# For every row keep the column labels of its five highest probabilities,
# best first. One vectorised argsort replaces a Python-level sort per row
# inside iterrows; the stable sort breaks ties in favour of the lower column
# position.
with_top5_positions = np.argsort(-with_test_probs.values, axis=1, kind='mergesort')[:, :5]
with_top5_labels = with_test_probs.columns.values[with_top5_positions]
# Same structure as before: an object array of (row label, top-5 array) pairs.
with_test_probs = pd.Series(list(zip(with_test_probs.index, with_top5_labels)))
with_test_probs = with_test_probs.values

In [None]:
%%time
# Test ids, in the same row order as the predictions.
indices_1 = with_testing_ids.values
# Keep only the top-5 array from each (row label, top-5) pair.
values_1 = [top5 for _, top5 in with_test_probs]

In [None]:
%%time
# Submission rows for the destination-matched subset: the id plus the
# predicted clusters joined into one space-separated string.
submission_1 = pd.DataFrame(
    {'id': indices_1,
     'hotel_cluster': [' '.join(map(str, top5)) for top5 in values_1]},
    columns=['id', 'hotel_cluster'])

In [None]:
%%time
# For every row keep the column labels of its five highest probabilities,
# best first. One vectorised argsort replaces a Python-level sort per row
# inside iterrows; the stable sort breaks ties in favour of the lower column
# position.
wo_top5_positions = np.argsort(-wo_test_probs.values, axis=1, kind='mergesort')[:, :5]
wo_top5_labels = wo_test_probs.columns.values[wo_top5_positions]
# Same structure as before: an object array of (row label, top-5 array) pairs.
wo_test_probs = pd.Series(list(zip(wo_test_probs.index, wo_top5_labels)))
wo_test_probs = wo_test_probs.values

In [None]:
%%time
# Test ids, in the same row order as the predictions.
indices_2 = wo_testing_ids.values
# Keep only the top-5 array from each (row label, top-5) pair.
values_2 = [top5 for _, top5 in wo_test_probs]

In [None]:
%%time
# Submission rows for the subset without a destination match: the id plus
# the predicted clusters joined into one space-separated string.
submission_2 = pd.DataFrame(
    {'id': indices_2,
     'hotel_cluster': [' '.join(map(str, top5)) for top5 in values_2]},
    columns=['id', 'hotel_cluster'])

In [None]:
%%time
# Stack the two partial submissions (rows with and without a destination
# match) into a single frame; each part keeps its own row index.
submission = pd.concat([submission_1, submission_2])

In [None]:
# Order the rows by id. Reassigning instead of sorting in place keeps the
# cell free of hidden mutation of a frame built by an earlier cell.
submission = submission.sort_values(by='id')

In [None]:
%%time
# Write the submission file; index=False keeps the frame's row index out
# of the CSV so only the id and hotel_cluster columns are written.
submission.to_csv('submission.csv', index=False)