In [None]:
%%time
import pandas as pd
import ml_metrics as metrics
from sklearn import cross_validation
from sklearn import ensemble
from sklearn.decomposition import RandomizedPCA
from sklearn.decomposition import PCA
from sklearn.grid_search import GridSearchCV
from sklearn.feature_selection import SelectKBest, RFECV, SelectFromModel
from sklearn import tree
#from datetime import datetime

In [None]:
import numpy as np

def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.
    This function computes the average precision at k between two lists of
    items.
    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists. 0.0 is returned
            when `actual` is None or empty, or when `k` is not positive.
    """
    # Test emptiness with len() instead of truthiness: numpy arrays and pandas
    # objects raise on `not actual`, and checking up front skips the loop.
    if actual is None or len(actual) == 0:
        return 0.0

    # A non-positive cutoff would otherwise divide by zero (k == 0) or slice
    # `predicted` from the wrong end (k < 0).
    if k <= 0:
        return 0.0

    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        # Only the first occurrence of a predicted item can score, so repeated
        # predictions cannot inflate the result.
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Mean average precision at k.

    Pairs each entry of `actual` with the entry of `predicted` at the same
    position, scores the pair with `apk`, and averages the scores.

    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted
             (order doesn't matter inside the inner lists)
    predicted : list
                A list of lists of predicted elements
                (order matters inside the inner lists)
    k : int, optional
        The maximum number of predicted elements considered per pair

    Returns
    -------
    score : double
            The mean of the per-pair average precision at k
    """
    per_pair_scores = []
    for truth, ranking in zip(actual, predicted):
        per_pair_scores.append(apk(truth, ranking, k))
    return np.mean(per_pair_scores)

In [None]:
%%time
# Explicit column dtypes for train.csv, passed to pd.read_csv in the next cell.
# The three date-like columns are read as plain objects and parsed later with
# pd.to_datetime. numpy is referenced directly (imported as `np` in an earlier
# cell) because the `pd.np` alias and `np.object` no longer exist in current
# pandas / numpy releases; builtin `object` is the same dtype.
train_dtypes = {'date_time': object,
'site_name': np.int64,
'posa_continent': np.int64,
'user_location_country': np.int64,
'user_location_region': np.int64,
'user_location_city': np.int64,
'orig_destination_distance': np.float64,
'user_id': np.int64,
'is_mobile': np.int64,
'is_package': np.int64,
'channel': np.int64,
'srch_ci': object,
'srch_co': object,
'srch_adults_cnt': np.int64,
'srch_children_cnt': np.int64,
'srch_rm_cnt': np.int64,
'srch_destination_id': np.int64,
'srch_destination_type_id': np.int64,
'is_booking': np.int64,
'cnt': np.int64,
'hotel_continent': np.int64,
'hotel_country': np.int64,
'hotel_market': np.int64,
'hotel_cluster': np.int64}

In [None]:
%%time
# Read train.csv as a stream of 1000-row chunks and keep only the rows flagged
# as bookings (is_booking == 1), so the filtering happens chunk by chunk before
# everything is concatenated into a single frame with a fresh 0..n-1 index.
train_reader = pd.read_csv('train.csv', dtype=train_dtypes, iterator=True, chunksize=1000)
booking_chunks = [chunk[chunk['is_booking'] == 1] for chunk in train_reader]
all_train = pd.concat(booking_chunks, ignore_index=True)
# all_train = pd.concat(all_train, ignore_index=True)

In [None]:
%%time
# Surrogate row identifier: 0 .. len(all_train) - 1.
all_train['id'] = list(range(len(all_train)))
# Missing distances are replaced by the sentinel -1.
all_train['orig_destination_distance'] = all_train['orig_destination_distance'].fillna(-1)

# Parse the three date-like string columns; errors='coerce' turns values that
# cannot be parsed into NaT instead of raising.
date_columns = ['date_time', 'srch_ci', 'srch_co']
for date_column in date_columns:
    all_train[date_column] = pd.to_datetime(all_train[date_column], errors='coerce')

# (feature-name suffix, pandas .dt accessor attribute), in the column order
# the downstream feature list relies on.
calendar_parts = [('month', 'month'), ('year', 'year'), ('dow', 'dayofweek'),
                  ('day', 'day'), ('quarter', 'quarter')]
for prefix, date_column in zip(['activity', 'checkin', 'checkout'], date_columns):
    # NOTE(review): fillna(-1) is applied to a datetime column before the .dt
    # accessor is used; how -1 is interpreted as a date depends on the pandas
    # version -- confirm this is the intended missing-value handling.
    filled_dates = all_train[date_column].fillna(-1)
    for suffix, dt_attribute in calendar_parts:
        all_train[prefix + '_' + suffix] = getattr(filled_dates.dt, dt_attribute).astype(int)

# Check-out minus check-in, cast to int.
# NOTE(review): casting a timedelta to int presumably yields nanoseconds rather
# than days -- confirm the unit is what the model should see.
all_train['stay_length'] = (all_train['srch_co'] - all_train['srch_ci']).astype(int)
#Split groups into two different classifiers for destinations vs. no destinations

In [None]:
%%time
# Load the destination table. It has a 'srch_destination_id' column plus other
# columns that are reduced with PCA in the following cells.
destinations = pd.read_csv('destinations.csv')

In [None]:
%%time
# Keep the destination ids aside and leave only the remaining columns as PCA
# input. `axis` is passed by keyword because newer pandas releases no longer
# accept it positionally in DataFrame.drop.
destination_ids = destinations['srch_destination_id']
destinations_reduced = destinations.drop(['srch_destination_id'], axis=1)

In [None]:
%%time
# PCA that keeps 11 components and whitens the projected output. The summed
# explained variance ratio of this choice is printed two cells below.
pca = PCA(n_components=11, whiten=True)
#pca = PCA()

In [None]:
%%time
# Fit the PCA on the destination columns (the id column was dropped above).
# NOTE(review): `destinations_reduced` is later rebound to the projected frame,
# so re-running this cell after that point would fit on different data.
pca.fit(destinations_reduced)

In [None]:
# Total share of variance retained by the fitted components. Written as a
# call so the cell runs unchanged on both Python 2 and Python 3.
print(sum(pca.explained_variance_ratio_))

In [None]:
%%time
# Project the destination columns onto the PCA components and re-attach the
# destination id so the result can be merged into the train / test frames.
# The PCA input is derived from `destinations` rather than from
# `destinations_reduced`, so re-running this cell does not project its own
# previous output (which would also include the id column).
destination_components = pca.fit_transform(destinations.drop(['srch_destination_id'], axis=1))
destinations_reduced = pd.DataFrame(destination_components)
destinations_reduced['srch_destination_id'] = destinations['srch_destination_id']

In [None]:
%%time
# Left-join the PCA-reduced destination columns onto the training rows. No join
# key is given, so pandas joins on the column names the two frames share; rows
# without a match keep NaN in the added columns.
all_train = all_train.merge(destinations_reduced, how='left')

In [None]:
%%time
# Replace every remaining missing value (for example destination components of
# rows that had no match in the merge) with the sentinel -1.
all_train = all_train.fillna(-1)

In [None]:
%%time
# Columns that must not be fed to the model: the surrogate id, the target
# ('hotel_cluster'), 'is_booking' / 'cnt', and the raw datetime columns.
# Everything else becomes a feature, in the frame's column order.
non_feature_columns = ['id', 'is_booking', 'cnt', 'hotel_cluster', 'date_time', 'srch_ci', 'srch_co']
features = [column for column in all_train.columns if column not in non_feature_columns]

In [None]:
%%time
# Fixed seed so the hold-out split (and therefore the reported score) can be
# reproduced on a fresh kernel.
SPLIT_SEED = 0

# First split: keep one half of the booking rows and discard the other half.
# NOTE(review): the discarded half is never used anywhere -- presumably a
# deliberate down-sampling to keep training time manageable; confirm.
features_half, features_unused, labels_half, labels_unused = cross_validation.train_test_split(
    all_train[features], all_train['hotel_cluster'], test_size=0.50, random_state=SPLIT_SEED)
del features_unused, labels_unused

# Second split: divide the retained half evenly into train and hold-out sets.
features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(
    features_half, labels_half, test_size=0.50, random_state=SPLIT_SEED)

In [None]:
%%time
# Extra-trees model used only to rank features for the selection step in the
# next cell. random_state is fixed so the importances (and hence the selected
# feature set) are reproducible between runs.
clf = ensemble.ExtraTreesClassifier(min_samples_split=500, random_state=0)
clf = clf.fit(features_train, labels_train)

In [None]:
%%time
# Wrap the already-fitted extra-trees model (prefit=True, so it is not
# refitted) as a feature selector; no threshold is passed, so the library
# default applies. The training features are then reduced to the selected ones.
# NOTE(review): `features_train` is overwritten with the reduced array, so this
# cell is not safe to run twice in a row.
model = SelectFromModel(clf, prefit=True)
features_train = model.transform(features_train)

In [None]:
# %%time
# forest = ensemble.RandomForestClassifier(min_samples_split=500)
# selector = RFECV(forest, step=1, cv=5)
# selector = selector.fit(features_train, labels_train)
# selector.support_ 
# selector.ranking_

In [None]:
%%time
# Final classifier, trained on the selected training features. random_state
# is fixed so repeated runs give the same forest and the same predictions.
forest = ensemble.RandomForestClassifier(min_samples_split=500, random_state=0)
forest.fit(features_train, labels_train)
# parameters = {'n_estimators':[10, 20], 'max_depth':[5, 10], 'min_samples_split':[250, 500]}
# search = GridSearchCV(forest, parameters, n_jobs=1)
# search.fit(features_train, labels_train)
# clf.fit(all_train.drop(['id', 'is_booking', 'cnt', 'user_id', 'hotel_cluster', 'date_time', 'srch_ci', 'srch_co'], 1), all_train['hotel_cluster'])

In [None]:
%%time
# Apply the same feature selection to the hold-out rows, then predict class
# probabilities. The columns are labelled with forest.classes_ so a column name
# is the actual hotel_cluster value rather than its position in predict_proba's
# output (the two only coincide when the labels are exactly 0..n-1).
features_test = model.transform(features_test)
pred_probs = pd.DataFrame(forest.predict_proba(features_test), columns=forest.classes_)

In [None]:
%%time
# Top-5 classes per hold-out row, most probable first. One vectorised argsort
# over the probability matrix replaces the per-row iterrows()/sort_values()
# loop. The result has five columns (one per rank), which is the shape mapk
# expects -- the earlier `list([...])` wrapping nested the five labels inside a
# single cell.
top5_positions = np.argsort(-pred_probs.values, axis=1)[:, :5]
pred = pd.DataFrame(pred_probs.columns.values[top5_positions])

In [None]:
# MAP@5 on the hold-out set: each row has one true cluster, wrapped in a list
# because mapk expects a list of relevant items per row. Written as a call so
# the cell runs unchanged on both Python 2 and Python 3.
print(mapk([[l] for l in labels_test], pred.values, k=5))

In [None]:
# %%time
#clf = LogisticRegression(tol=0.1)
#clf = GaussianNB()
#clf_with = ensemble.AdaBoostClassifier().fit(with_features, with_labels.values.ravel())
#clf_wo = ensemble.AdaBoostClassifier().fit(wo_features, wo_labels.values.ravel())
#clf = ensemble.AdaBoostClassifier(SVC(probability=True, kernel='linear'),n_estimators=10)
#clf = ensemble.GradientBoostingClassifier(SVC(probability=True, kernel='linear'),n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
#clf = ensemble.GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0, verbose=3)
# clf_with = tree.DecisionTreeClassifier(min_samples_split=100).fit(with_features, with_labels.values.ravel())
# clf_wo = tree.DecisionTreeClassifier(min_samples_split=50).fit(wo_features, wo_labels.values.ravel())
# clf_with = ensemble.RandomForestClassifier(n_estimators=100, max_depth=5).fit(with_features, with_labels.values.ravel())
# clf_wo = ensemble.RandomForestClassifier(n_estimators=20, min_samples_split=500, n_jobs=2).fit(wo_features, wo_labels.values.ravel())
#clf - BernoulliNB()

In [None]:
# %%time
# with_dest_match = pd.merge(all_train, destinations_df)

In [None]:
# %%time
# wo_dest_match = all_train[~(all_train.id.isin(with_dest_match.id))]

In [None]:
# %%time
# with_features = with_dest_match.drop(['id', 'is_booking', 'cnt', 'user_id', 'hotel_cluster', 'date_time', 'srch_ci', 'srch_co'],1)
# with_labels = with_dest_match['hotel_cluster']
# wo_features = wo_dest_match.drop(['id', 'is_booking', 'cnt', 'user_id', 'hotel_cluster', 'date_time', 'srch_ci', 'srch_co'],1)
# wo_labels = wo_dest_match['hotel_cluster']
# with_features = with_features.reindex_axis(sorted(with_features.columns), axis=1)
# wo_features = wo_features.reindex_axis(sorted(wo_features.columns), axis=1)

In [None]:
# all_scores = []
# %%time
# for i in range(0, 20):
#     features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(features, labels, test_size=0.5)
#     clf = tree.DecisionTreeClassifier(min_samples_split=500)
#     clf = clf.fit(features_train, labels_train.values.ravel())
#     pred_probs = pd.DataFrame(clf.predict_proba(features_test))
#     pred_probs = pd.DataFrame([list([r.sort_values(ascending=False)[:3].index.values]) for i,r in pred_probs.iterrows()])
#     labels_test_df = pd.DataFrame(labels_test.values, index=range(0, len(labels_test)))
#     test_score = mapk(labels_test_df.values, pred_probs.values)
#     all_scores.append(test_score)
# print np.mean(all_scores)

In [None]:
%%time
# Hold-out MAP@5, kept in `test_score`. The previous version referenced
# `labels_test_df`, which is only defined in commented-out code (NameError on a
# fresh kernel), and passed the raw probability matrix where ranked class
# labels are required. The ranking is rebuilt here from `pred_probs` so the
# cell depends only on the probabilities and the hold-out labels; k=5 matches
# the top-5 lists built elsewhere in this notebook.
holdout_top5_positions = np.argsort(-pred_probs.values, axis=1)[:, :5]
holdout_top5 = pred_probs.columns.values[holdout_top5_positions]
test_score = mapk([[label] for label in labels_test], holdout_top5, k=5)
print(test_score)

In [None]:
%%time
# Explicit column dtypes for test.csv, passed to pd.read_csv in the next cell.
# The three date-like columns are read as plain objects and parsed later with
# pd.to_datetime. numpy is referenced directly (imported as `np` in an earlier
# cell) because the `pd.np` alias and `np.object` no longer exist in current
# pandas / numpy releases; builtin `object` is the same dtype.
test_dtypes = {'id': np.int64,
'date_time': object,
'site_name': np.int64,
'posa_continent': np.int64,
'user_location_country': np.int64,
'user_location_region': np.int64,
'user_location_city': np.int64,
'orig_destination_distance': np.float64,
'user_id': np.int64,
'is_mobile': np.int64,
'is_package': np.int64,
'channel': np.int64,
'srch_ci': object,
'srch_co': object,
'srch_adults_cnt': np.int64,
'srch_children_cnt': np.int64,
'srch_rm_cnt': np.int64,
'srch_destination_id': np.int64,
'srch_destination_type_id': np.int64,
'hotel_continent': np.int64,
'hotel_country': np.int64,
'hotel_market': np.int64}

In [None]:
%%time
# Load the whole test set in one go, using the explicit dtypes defined above.
all_test = pd.read_csv('test.csv', dtype=test_dtypes)

In [None]:
%%time
# Missing distances are replaced by the sentinel -1.
all_test['orig_destination_distance'] = all_test['orig_destination_distance'].fillna(-1)

# Parse the three date-like string columns; errors='coerce' turns values that
# cannot be parsed into NaT instead of raising.
date_columns = ['date_time', 'srch_ci', 'srch_co']
for date_column in date_columns:
    all_test[date_column] = pd.to_datetime(all_test[date_column], errors='coerce')

# (feature-name suffix, pandas .dt accessor attribute); produces the same
# feature names as the training-set cell.
calendar_parts = [('month', 'month'), ('year', 'year'), ('dow', 'dayofweek'),
                  ('day', 'day'), ('quarter', 'quarter')]
for prefix, date_column in zip(['activity', 'checkin', 'checkout'], date_columns):
    # NOTE(review): fillna(-1) is applied to a datetime column before the .dt
    # accessor is used; how -1 is interpreted as a date depends on the pandas
    # version -- confirm this is the intended missing-value handling.
    filled_dates = all_test[date_column].fillna(-1)
    for suffix, dt_attribute in calendar_parts:
        all_test[prefix + '_' + suffix] = getattr(filled_dates.dt, dt_attribute).astype(int)

# Check-out minus check-in, cast to int.
# NOTE(review): casting a timedelta to int presumably yields nanoseconds rather
# than days -- confirm the unit is what the model should see.
all_test['stay_length'] = (all_test['srch_co'] - all_test['srch_ci']).astype(int)
#Split groups into two different classifiers for destinations vs. no destinations

In [None]:
%%time
# Left-join the PCA-reduced destination columns onto the test rows. No join key
# is given, so pandas joins on the column names the two frames share; rows
# without a match keep NaN in the added columns.
all_test = all_test.merge(destinations_reduced, how='left')

In [None]:
%%time
# Replace every remaining missing value (for example destination components of
# rows that had no match in the merge) with the sentinel -1.
all_test = all_test.fillna(-1)

In [None]:
%%time
# Select the training feature columns and apply the same SelectFromModel
# reduction that was applied to the training data. The forest was fitted on the
# reduced feature set, so passing the unreduced columns to predict_proba would
# fail with a feature-count mismatch.
test_features = model.transform(all_test[features])

In [None]:
%%time
# Class probabilities for every test row. The columns are labelled with
# forest.classes_ so a column name is the actual hotel_cluster value rather
# than its position in predict_proba's output.
test_probs = pd.DataFrame(forest.predict_proba(test_features), columns=forest.classes_)

In [None]:
%%time
# Top-5 classes per test row, most probable first. One vectorised argsort over
# the probability matrix replaces the per-row iterrows()/sort_values() loop.
# The result has five columns (one per rank), so each row of `.values` is the
# five cluster labels that the submission cell joins with spaces -- the earlier
# `list([...])` wrapping nested them inside a single cell.
test_top5_positions = np.argsort(-test_probs.values, axis=1)[:, :5]
test_pred = pd.DataFrame(test_probs.columns.values[test_top5_positions])

In [None]:
%%time
# One space-separated string of predicted clusters per test row.
cluster_strings = []
for top_clusters in test_pred.values:
    cluster_strings.append(' '.join(str(cluster) for cluster in top_clusters))

# Two-column submission frame; `columns` pins the order to id, hotel_cluster.
submission = pd.DataFrame({'id': all_test['id'], 'hotel_cluster': cluster_strings},
                          columns=['id', 'hotel_cluster'])

In [None]:
# Display the first rows of the submission as a quick visual check.
submission.head()

In [None]:
# Order the submission rows by ascending id before writing them out.
submission = submission.sort_values(by='id')

In [None]:
%%time
# Write the submission file; index=False keeps the frame index out of the CSV
# so only the 'id' and 'hotel_cluster' columns are written.
submission.to_csv('submission.csv', index=False)