In [9]:
%%time
# Silence every warning before the heavy imports run, so any warnings they
# emit at import time are hidden as well.
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
# NOTE(review): `sklearn.cross_validation` and `sklearn.grid_search` are legacy
# module paths; newer scikit-learn releases provide these tools under
# `sklearn.model_selection` - confirm against the installed version.
# NOTE(review): `SelectFromModel` is used further down in this notebook but is
# not imported in this cell.
from sklearn import cross_validation, ensemble, tree, metrics, preprocessing
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support, accuracy_score
# NOTE(review): `le` is not referenced by any other cell visible in this notebook.
le = preprocessing.LabelEncoder()
import xgboost as xgb
from datetime import datetime, date
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pylab as plt
from sklearn.decomposition import PCA
%matplotlib inline
from matplotlib.pylab import rcParams
# Default figure size (width, height) for matplotlib figures.
rcParams['figure.figsize'] = 12, 4

CPU times: user 10.7 ms, sys: 8.86 ms, total: 19.6 ms
Wall time: 66.5 ms


In [2]:
%%time
import numpy as np

def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.
    This function computes the average precision at k between two lists of
    items.
    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter).
             Any sized container is accepted, including a numpy array.
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists.
            0.0 when `actual` is None or empty.
    """
    # Use an explicit length check: `not actual` raises ValueError for a
    # multi-element numpy array, and rows of `.values` are numpy arrays.
    if actual is None or len(actual) == 0:
        return 0.0

    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        # A repeated prediction only counts the first time it appears.
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.
    Pairs each entry of `actual` with the entry of `predicted` at the same
    position, scores the pair with `apk`, and averages the scores.
    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The mean average precision at k over the input lists
    """
    per_query_scores = []
    for truth, ranking in zip(actual, predicted):
        per_query_scores.append(apk(truth, ranking, k))
    return np.mean(per_query_scores)

In [3]:
%%time
# Column -> dtype map passed to `pd.read_csv` for train.csv.
# The types are referenced through `np` / the builtin `object` rather than the
# `pd.np` alias, which is deprecated and absent from newer pandas releases;
# they are the very same type objects.
# The three date-like columns are read as plain objects and parsed later.
train_dtypes = {
    'date_time': object,
    'site_name': np.int64,
    'posa_continent': np.int64,
    'user_location_country': np.int64,
    'user_location_region': np.int64,
    'user_location_city': np.int64,
    'orig_destination_distance': np.float64,
    'user_id': np.int64,
    'is_mobile': np.int64,
    'is_package': np.int64,
    'channel': np.int64,
    'srch_ci': object,
    'srch_co': object,
    'srch_adults_cnt': np.int64,
    'srch_children_cnt': np.int64,
    'srch_rm_cnt': np.int64,
    'srch_destination_id': np.int64,
    'srch_destination_type_id': np.int64,
    'is_booking': np.int64,
    'cnt': np.int64,
    'hotel_continent': np.int64,
    'hotel_country': np.int64,
    'hotel_market': np.int64,
    'hotel_cluster': np.int64,
}

CPU times: user 11 µs, sys: 5 µs, total: 16 µs
Wall time: 16 µs


In [4]:
%%time
# Stream train.csv chunk by chunk and keep only the rows with is_booking == 1,
# so the unfiltered file never has to be held in memory at once.
# 100000-row chunks select exactly the same rows as 1000-row chunks but create
# far fewer chunk frames to parse and concatenate. `chunksize` alone already
# makes `read_csv` return a chunk reader, so `iterator=True` is not needed.
train_chunks = pd.read_csv('train.csv', dtype=train_dtypes, chunksize=100000)
all_train = pd.concat([chunk[chunk['is_booking'] == 1] for chunk in train_chunks],
                      ignore_index=True)

CPU times: user 3min 27s, sys: 31.7 s, total: 3min 59s
Wall time: 4min


In [5]:
%%time
# Sequential row id (0..n-1) for the filtered training frame.
all_train['id'] = np.arange(len(all_train))
all_train['orig_destination_distance'] = all_train['orig_destination_distance'].fillna(-1)

# Sentinel for missing / unparseable dates. `fillna(-1)` on a datetime column
# depends on pandas silently turning the integer into a timestamp, which newer
# releases no longer do (the column becomes object and `.dt` fails).
# `pd.Timestamp(-1)` spells out the value that coercion is presumed to have
# produced, so the derived features stay the same as before.
MISSING_DATE = pd.Timestamp(-1)
# (feature prefix, source column)
DATE_COLUMNS = [('activity', 'date_time'), ('checkin', 'srch_ci'), ('checkout', 'srch_co')]
# (feature suffix, `.dt` accessor attribute)
DATE_PARTS = [('month', 'month'), ('year', 'year'), ('dow', 'dayofweek'),
              ('day', 'day'), ('quarter', 'quarter')]

# Parse all three date columns first; unparseable values become NaT.
for _prefix, column in DATE_COLUMNS:
    all_train[column] = pd.to_datetime(all_train[column], errors='coerce')

# Derive <prefix>_<part> integer columns, in the same order as before:
# activity_*, then checkin_*, then checkout_*.
for prefix, column in DATE_COLUMNS:
    filled = all_train[column].fillna(MISSING_DATE)
    for suffix, accessor in DATE_PARTS:
        all_train[prefix + '_' + suffix] = getattr(filled.dt, accessor).astype(int)

# Check-out minus check-in, cast from timedelta to a raw integer.
# NOTE(review): the cast presumably yields nanoseconds and a large negative
# value for missing dates - confirm this is the intended encoding.
all_train['stay_length'] = (all_train['srch_co'] - all_train['srch_ci']).astype(int)
# TODO: split groups into two different classifiers for destinations vs. no destinations

CPU times: user 6.93 s, sys: 641 ms, total: 7.57 s
Wall time: 7.6 s


In [6]:
%%time
# Load destinations.csv; it carries a `srch_destination_id` key column plus
# the columns that are reduced with PCA further down.
destinations = pd.read_csv(filepath_or_buffer='destinations.csv')

CPU times: user 2.19 s, sys: 175 ms, total: 2.37 s
Wall time: 2.4 s


In [7]:
%%time
# Keep the key column aside and drop it from the matrix handed to PCA.
destination_ids = destinations['srch_destination_id']
# Pass `axis` by keyword: the positional form of `drop`'s axis argument is not
# accepted by newer pandas releases.
destinations_reduced = destinations.drop(['srch_destination_id'], axis=1)

CPU times: user 25.7 ms, sys: 19.1 ms, total: 44.8 ms
Wall time: 49.7 ms


In [10]:
%%time
# Whitened PCA keeping 11 components (alternative tried: PCA() with defaults).
pca = PCA(whiten=True, n_components=11)

CPU times: user 11 µs, sys: 3 µs, total: 14 µs
Wall time: 15 µs


In [11]:
%%time
# Fit the PCA on the destination columns (key column already removed); the
# fitted estimator is the cell's displayed value.
pca.fit(X=destinations_reduced)

CPU times: user 1.19 s, sys: 165 ms, total: 1.35 s
Wall time: 943 ms


PCA(copy=True, n_components=11, whiten=True)

In [12]:
# Total share of variance retained by the kept components.
# Call form of print: same output on Python 2, and valid on Python 3.
print(sum(pca.explained_variance_ratio_))

0.755874193278


In [13]:
%%time
# Project the destination columns onto the principal components, then put the
# key column back so the result can be joined to the search rows.
destination_components = pca.fit_transform(destinations_reduced)
destinations_reduced = pd.DataFrame(destination_components).assign(
    srch_destination_id=destination_ids)

CPU times: user 1.03 s, sys: 38.8 ms, total: 1.06 s
Wall time: 589 ms


In [14]:
%%time
# Left-join the PCA components onto the bookings; `srch_destination_id` is the
# only column the two frames share, so it is the join key.
all_train = all_train.merge(destinations_reduced, how='left', on='srch_destination_id')

CPU times: user 1.01 s, sys: 1.19 s, total: 2.2 s
Wall time: 2.44 s


In [15]:
%%time
# Replace every remaining missing value (e.g. rows with no matching
# destination in the left join) with the -1 sentinel.
all_train = all_train.fillna(-1)

CPU times: user 1.15 s, sys: 3.42 s, total: 4.57 s
Wall time: 5.64 s


In [18]:
%%time
# Display summary statistics of the training frame as a sanity check
# after the merge and the -1 fill.
all_train.describe()

Unnamed: 0,site_name,posa_continent,user_location_country,user_location_region,user_location_city,orig_destination_distance,user_id,is_mobile,is_package,channel,...,1,2,3,4,5,6,7,8,9,10
count,3000693.0,3000693.0,3000693.0,3000693.0,3000693.0,3000693.0,3000693.0,3000693.0,3000693.0,3000693.0,...,3000693.0,3000693.0,3000693.0,3000693.0,3000693.0,3000693.0,3000693.0,3000693.0,3000693.0,3000693.0
mean,9.37306,2.705199,87.62437,312.9063,27880.12,1117.037,606620.9,0.09921275,0.1366978,6.179528,...,0.2118839,0.2970489,-0.01612115,-0.414447,-0.3104634,0.08079592,0.2932281,0.1759076,0.3067173,-0.08919478
std,11.91513,0.7296438,59.0182,204.498,16731.0,1928.388,349317.4,0.2989475,0.343528,3.610034,...,1.447072,1.68089,1.290193,1.352833,1.973668,2.170433,1.210089,1.185041,1.207508,1.307481
min,2.0,0.0,0.0,0.0,0.0,-1.0,5.0,0.0,0.0,0.0,...,-9.196391,-6.334904,-3.970428,-8.481397,-9.921926,-4.479331,-6.480873,-7.611597,-12.04188,-16.18853
25%,2.0,3.0,66.0,174.0,13400.0,-1.0,304726.0,0.0,0.0,2.0,...,-0.01826679,-0.7547446,-0.9218152,-1.145623,-0.8607346,-1.044911,-0.5209294,-0.4878143,-0.3607401,-0.793481
50%,2.0,3.0,66.0,314.0,27655.0,213.2473,606895.0,0.0,0.0,9.0,...,0.51864,0.3465774,-0.08788875,-0.1278972,0.05659693,-0.2826261,0.1410039,0.04617944,0.352945,-0.05230367
75%,11.0,3.0,69.0,395.0,42500.0,1312.669,910102.0,0.0,0.0,9.0,...,1.038655,1.516731,0.8149316,0.4823635,0.7913757,0.4130731,0.9632594,0.7810385,1.011367,0.6685248
max,53.0,4.0,239.0,1027.0,56507.0,12199.17,1198784.0,1.0,1.0,10.0,...,3.270848,4.852579,6.828371,4.463351,4.485526,10.24373,17.09379,8.582214,5.298626,5.106518


In [19]:
%%time
# Display the dtype of every column of the training frame.
all_train.dtypes

CPU times: user 327 µs, sys: 39 µs, total: 366 µs
Wall time: 341 µs


date_time                    datetime64[ns]
site_name                             int64
posa_continent                        int64
user_location_country                 int64
user_location_region                  int64
user_location_city                    int64
orig_destination_distance           float64
user_id                               int64
is_mobile                             int64
is_package                            int64
channel                               int64
srch_ci                      datetime64[ns]
srch_co                      datetime64[ns]
srch_adults_cnt                       int64
srch_children_cnt                     int64
srch_rm_cnt                           int64
srch_destination_id                   int64
srch_destination_type_id              int64
is_booking                            int64
cnt                                   int64
hotel_continent                       int64
hotel_country                         int64
hotel_market                    

In [16]:
%%time
# Model inputs: every column except the row id, the target, booking
# bookkeeping columns and the raw date columns.
excluded_columns = {'id', 'is_booking', 'cnt', 'hotel_cluster', 'date_time', 'srch_ci', 'srch_co'}
features = [column for column in all_train.columns if column not in excluded_columns]

CPU times: user 91 µs, sys: 142 µs, total: 233 µs
Wall time: 216 µs


In [None]:
%%time
# Fixed seed so the split, and therefore the validation score, is reproducible.
SPLIT_SEED = 0

# Stage 1: keep a random half of the bookings. The other half is not used by
# this notebook, so it gets throwaway names instead of the live test names.
features_train, _unused_features, labels_train, _unused_labels = cross_validation.train_test_split(
    all_train[features], all_train['hotel_cluster'],
    test_size=0.50, random_state=SPLIT_SEED)

# Stage 2: split the kept half 30% train / 70% validation.
features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(
    features_train, labels_train,
    test_size=0.70, random_state=SPLIT_SEED)

In [None]:
%%time
# Extra-trees model fitted on the training split; it is used by a later cell
# as the estimator behind feature selection.
clf = ensemble.ExtraTreesClassifier(min_samples_split=500).fit(features_train, labels_train)

In [None]:
%%time
# `SelectFromModel` is not imported anywhere else in the notebook, so this
# cell raised NameError; import it here.
from sklearn.feature_selection import SelectFromModel

# Wrap the already-fitted extra-trees model (prefit=True) as a feature
# selector and reduce the training matrix to the selected columns.
# NOTE: this overwrites `features_train`, so the cell must not be run twice
# without re-running the split first.
model = SelectFromModel(clf, prefit=True)
features_train = model.transform(features_train)

In [None]:
# %%time
# forest = ensemble.RandomForestClassifier(min_samples_split=500)
# selector = RFECV(forest, step=1, cv=5)
# selector = selector.fit(features_train, labels_train)
# selector.support_ 
# selector.ranking_

In [None]:
%%time
# Random forest trained on the selector-reduced training features.
forest = ensemble.RandomForestClassifier(min_samples_split=500)
forest.fit(X=features_train, y=labels_train)
# Earlier experiments, kept for reference:
# parameters = {'n_estimators':[10, 20], 'max_depth':[5, 10], 'min_samples_split':[250, 500]}
# search = GridSearchCV(forest, parameters, n_jobs=1)
# search.fit(features_train, labels_train)
# clf.fit(all_train.drop(['id', 'is_booking', 'cnt', 'user_id', 'hotel_cluster', 'date_time', 'srch_ci', 'srch_co'], 1), all_train['hotel_cluster'])

In [None]:
%%time
# Apply the same feature selection to the validation split, then predict
# per-class probabilities.
# NOTE: this overwrites `features_test`, so the cell must not be run twice
# without re-running the split first.
features_test = model.transform(features_test)
# Label the probability columns with the forest's class labels. Without this
# the columns are just positions 0..n-1, which equal the cluster ids only if
# every cluster happens to be present in the training split.
pred_probs = pd.DataFrame(forest.predict_proba(features_test), columns=forest.classes_)

In [None]:
%%time
# Top-5 classes per row, most probable first, one class per column.
# The previous row-wise version wrapped each top-5 array in an extra list
# (`list([...])`), giving a single column holding whole arrays instead of five
# columns of class ids.
# Sorting the negated probabilities with a stable sort gives descending order
# with ties broken by the lower column position.
top5_positions = np.argsort(-pred_probs.values, axis=1, kind='mergesort')[:, :5]
# Translate column positions into the class labels the forest was trained on.
pred = pd.DataFrame(forest.classes_[top5_positions])

In [None]:
# MAP@5 on the validation split; each row has exactly one true cluster.
# Call form of print: same output on Python 2, and valid on Python 3.
print(mapk([[label] for label in labels_test], pred.values, k=5))

In [None]:
# %%time
#clf = LogisticRegression(tol=0.1)
#clf = GaussianNB()
#clf_with = ensemble.AdaBoostClassifier().fit(with_features, with_labels.values.ravel())
#clf_wo = ensemble.AdaBoostClassifier().fit(wo_features, wo_labels.values.ravel())
#clf = ensemble.AdaBoostClassifier(SVC(probability=True, kernel='linear'),n_estimators=10)
#clf = ensemble.GradientBoostingClassifier(SVC(probability=True, kernel='linear'),n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
#clf = ensemble.GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0, verbose=3)
# clf_with = tree.DecisionTreeClassifier(min_samples_split=100).fit(with_features, with_labels.values.ravel())
# clf_wo = tree.DecisionTreeClassifier(min_samples_split=50).fit(wo_features, wo_labels.values.ravel())
# clf_with = ensemble.RandomForestClassifier(n_estimators=100, max_depth=5).fit(with_features, with_labels.values.ravel())
# clf_wo = ensemble.RandomForestClassifier(n_estimators=20, min_samples_split=500, n_jobs=2).fit(wo_features, wo_labels.values.ravel())
#clf = BernoulliNB()

In [None]:
# %%time
# with_dest_match = pd.merge(all_train, destinations_df)

In [None]:
# %%time
# wo_dest_match = all_train[~(all_train.id.isin(with_dest_match.id))]

In [None]:
# %%time
# with_features = with_dest_match.drop(['id', 'is_booking', 'cnt', 'user_id', 'hotel_cluster', 'date_time', 'srch_ci', 'srch_co'],1)
# with_labels = with_dest_match['hotel_cluster']
# wo_features = wo_dest_match.drop(['id', 'is_booking', 'cnt', 'user_id', 'hotel_cluster', 'date_time', 'srch_ci', 'srch_co'],1)
# wo_labels = wo_dest_match['hotel_cluster']
# with_features = with_features.reindex_axis(sorted(with_features.columns), axis=1)
# wo_features = wo_features.reindex_axis(sorted(wo_features.columns), axis=1)

In [None]:
# all_scores = []
# %%time
# for i in range(0, 20):
#     features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(features, labels, test_size=0.5)
#     clf = tree.DecisionTreeClassifier(min_samples_split=500)
#     clf = clf.fit(features_train, labels_train.values.ravel())
#     pred_probs = pd.DataFrame(clf.predict_proba(features_test))
#     pred_probs = pd.DataFrame([list([r.sort_values(ascending=False)[:3].index.values]) for i,r in pred_probs.iterrows()])
#     labels_test_df = pd.DataFrame(labels_test.values, index=range(0, len(labels_test)))
#     test_score = mapk(labels_test_df.values, pred_probs.values)
#     all_scores.append(test_score)
# print np.mean(all_scores)

In [None]:
%%time
# MAP@5 on the validation split.
# The previous call used `labels_test_df`, which is only defined inside a
# commented-out cell, and passed the probability matrix where ranked class
# predictions are expected. Rank the classes here from `pred_probs` instead.
top5_positions = np.argsort(-pred_probs.values, axis=1, kind='mergesort')[:, :5]
test_score = mapk([[label] for label in labels_test], forest.classes_[top5_positions], k=5)
print(test_score)

In [None]:
%%time
# Column -> dtype map passed to `pd.read_csv` for test.csv.
# The types are referenced through `np` / the builtin `object` rather than the
# `pd.np` alias, which is deprecated and absent from newer pandas releases;
# they are the very same type objects.
# The three date-like columns are read as plain objects and parsed later.
test_dtypes = {
    'id': np.int64,
    'date_time': object,
    'site_name': np.int64,
    'posa_continent': np.int64,
    'user_location_country': np.int64,
    'user_location_region': np.int64,
    'user_location_city': np.int64,
    'orig_destination_distance': np.float64,
    'user_id': np.int64,
    'is_mobile': np.int64,
    'is_package': np.int64,
    'channel': np.int64,
    'srch_ci': object,
    'srch_co': object,
    'srch_adults_cnt': np.int64,
    'srch_children_cnt': np.int64,
    'srch_rm_cnt': np.int64,
    'srch_destination_id': np.int64,
    'srch_destination_type_id': np.int64,
    'hotel_continent': np.int64,
    'hotel_country': np.int64,
    'hotel_market': np.int64,
}

In [None]:
%%time
# Read the whole of test.csv with the explicit column dtypes.
all_test = pd.read_csv(filepath_or_buffer='test.csv', dtype=test_dtypes)

In [None]:
%%time
all_test['orig_destination_distance'] = all_test['orig_destination_distance'].fillna(-1)

# Sentinel for missing / unparseable dates. `fillna(-1)` on a datetime column
# depends on pandas silently turning the integer into a timestamp, which newer
# releases no longer do (the column becomes object and `.dt` fails).
# `pd.Timestamp(-1)` spells out the value that coercion is presumed to have
# produced, so the derived features stay the same as before.
MISSING_DATE = pd.Timestamp(-1)
# (feature prefix, source column)
DATE_COLUMNS = [('activity', 'date_time'), ('checkin', 'srch_ci'), ('checkout', 'srch_co')]
# (feature suffix, `.dt` accessor attribute)
DATE_PARTS = [('month', 'month'), ('year', 'year'), ('dow', 'dayofweek'),
              ('day', 'day'), ('quarter', 'quarter')]

# Parse all three date columns first; unparseable values become NaT.
for _prefix, column in DATE_COLUMNS:
    all_test[column] = pd.to_datetime(all_test[column], errors='coerce')

# Derive <prefix>_<part> integer columns, in the same order as before:
# activity_*, then checkin_*, then checkout_*.
for prefix, column in DATE_COLUMNS:
    filled = all_test[column].fillna(MISSING_DATE)
    for suffix, accessor in DATE_PARTS:
        all_test[prefix + '_' + suffix] = getattr(filled.dt, accessor).astype(int)

# Check-out minus check-in, cast from timedelta to a raw integer.
# NOTE(review): the cast presumably yields nanoseconds and a large negative
# value for missing dates - confirm this is the intended encoding.
all_test['stay_length'] = (all_test['srch_co'] - all_test['srch_ci']).astype(int)
# TODO: split groups into two different classifiers for destinations vs. no destinations

In [None]:
%%time
# Left-join the PCA components onto the test rows; `srch_destination_id` is
# the only column the two frames share, so it is the join key.
all_test = all_test.merge(destinations_reduced, how='left', on='srch_destination_id')

In [None]:
%%time
# Replace every remaining missing value (e.g. rows with no matching
# destination in the left join) with the -1 sentinel.
all_test = all_test.fillna(-1)

In [None]:
%%time
# Select the model input columns, by name, in the order used for training.
test_features = all_test.loc[:, features]

In [None]:
%%time
# The forest was fitted on the columns kept by the `SelectFromModel` selector
# (`model`), so the test matrix must pass through the same `model.transform`
# first; predicting on the raw feature columns gives a feature-count mismatch.
# Columns are labeled with the forest's class labels so positions are never
# mistaken for cluster ids.
test_probs = pd.DataFrame(forest.predict_proba(model.transform(test_features)),
                          columns=forest.classes_)

In [None]:
%%time
# Top-5 classes per test row, most probable first, one class per column.
# The previous row-wise version wrapped each top-5 array in an extra list
# (`list([...])`), giving a single column holding whole arrays instead of five
# columns of class ids.
# Sorting the negated probabilities with a stable sort gives descending order
# with ties broken by the lower column position.
top5_positions = np.argsort(-test_probs.values, axis=1, kind='mergesort')[:, :5]
# Translate column positions into the class labels the forest was trained on.
test_pred = pd.DataFrame(forest.classes_[top5_positions])

In [None]:
%%time
# One submission row per test id: the predicted clusters of each row of
# `test_pred`, joined with single spaces.
cluster_strings = [' '.join(map(str, row)) for row in test_pred.values]
submission = pd.DataFrame(
    {'id': all_test['id'], 'hotel_cluster': cluster_strings},
    columns=['id', 'hotel_cluster'])

In [None]:
# Display the first rows of the submission frame as a format check.
submission.head()

In [None]:
# Order the submission rows by id before writing them out.
submission = submission.sort_values(by='id')

In [None]:
%%time
# Write the submission to submission.csv without the index column.
submission.to_csv(path_or_buf='submission.csv', index=False)