In [1]:
%%time
import warnings
warnings.filterwarnings('ignore')
import ml_metrics as metrics
import pandas as pd
import numpy as np
from sklearn import cross_validation, ensemble, tree, preprocessing
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support, accuracy_score
le = preprocessing.LabelEncoder()
import xgboost as xgb
from datetime import datetime, date
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pylab as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline

CPU times: user 1.27 s, sys: 454 ms, total: 1.72 s
Wall time: 3.76 s


In [2]:
def apk(actual, predicted, k=10):
    """
    Average precision at k for a single query.

    Only the first ``k`` entries of ``predicted`` are scored. Each entry that
    appears in ``actual`` and has not already occurred earlier in the ranking
    counts as a hit and contributes ``hits_so_far / rank`` to the total.

    Parameters
    ----------
    actual : list
             The relevant elements (order is irrelevant)
    predicted : list
                The ranked predictions (order is significant)
    k : int, optional
        Cut-off: maximum number of predictions taken into account

    Returns
    -------
    score : double
            The accumulated precision divided by ``min(len(actual), k)``,
            or 0.0 when nothing was hit
    """
    ranked = predicted[:k] if len(predicted) > k else predicted

    hits = 0.0
    total = 0.0

    for rank, item in enumerate(ranked, 1):
        # A repeated prediction is only credited the first time it appears.
        if item in actual and item not in ranked[:rank - 1]:
            hits += 1.0
            total += hits / float(rank)

    # Early exit also protects the division below when `actual` is empty.
    if total == 0.0:
        return 0.0

    return total / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Mean average precision at k over a collection of queries.

    ``actual`` and ``predicted`` are walked in parallel; every pair is scored
    with ``apk`` and the scores are averaged with ``np.mean``.

    Parameters
    ----------
    actual : list
             One list of relevant elements per query
             (order inside each list is irrelevant)
    predicted : list
                One ranked list of predictions per query
                (order inside each list is significant)
    k : int, optional
        Cut-off: maximum number of predictions taken into account per query

    Returns
    -------
    score : double
            The mean of the per-query average precision at k
    """
    per_query_scores = []
    for relevant, ranked in zip(actual, predicted):
        per_query_scores.append(apk(relevant, ranked, k))
    return np.mean(per_query_scores)

In [3]:
# Accumulators filled by the grid loop: one entry per modelled cell, holding
# that cell's test row ids and the matching predictions respectively.
row_ids, preds = [], []

In [4]:
%%time
# Load both competition files from the working directory (train first, then test).
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

CPU times: user 27.4 s, sys: 3.03 s, total: 30.5 s
Wall time: 30.9 s


In [None]:
%%time
for i in np.arange(0,10.25,0.25):
    for r in np.arange(0,10.25,0.25):
        train_reduced = train[(train.x >= i) & (train.x < (i + .25)) & (train.y >= r) & (train.y < (r + .25))]
        test_reduced = test[(train.x >= i) & (test.x < (i + .25)) & (test.y >= r) & (test.y < (r + .25))]
        if (len(train_reduced) > 0) and (len(test_reduced) > 0):
            train_reduced['day_number'] = ((train_reduced['time']/60)//24).astype(int)
            train_reduced['seconds'] = (train_reduced['time'] * 60)
            train_reduced['date_time'] = pd.to_datetime(train_reduced['seconds'],unit='s')
            train_reduced['hour'] = train_reduced['date_time'].dt.hour
            train_reduced['day'] = train_reduced['date_time'].dt.day
            train_reduced['dow'] = train_reduced['date_time'].dt.dayofweek
            test_reduced['day_number'] = ((test_reduced['time']/60)//24).astype(int)
            test_reduced['seconds'] = (test_reduced['time'] * 60)
            test_reduced['date_time'] = pd.to_datetime(test_reduced['seconds'],unit='s')
            test_reduced['hour'] = test_reduced['date_time'].dt.hour
            test_reduced['day'] = test_reduced['date_time'].dt.day
            test_reduced['dow'] = test_reduced['date_time'].dt.dayofweek
            features = [c for c in train_reduced.columns if c in ['x', 'y', 'accuracy', 'hour', 'day', 'dow']]
            clf_rf = ensemble.RandomForestClassifier(n_estimators=100, n_jobs=-1).fit(train_reduced[features], train_reduced['place_id'])
            pred = clf_rf.predict(test_reduced[features])
            print i, (i + .25), r, (r + .25), len(test_reduced['row_id'].values), len(pred)
            row_ids.append(list(test_reduced['row_id'].values))
            preds.append(pred)
        else:
            print 'Problem: ',i, (i + .25), r, (r + .25), len(train_reduced), len(test_reduced)


0.0 0.25 0.0 0.25 6041 6041
0.0 0.25 0.25 0.5 4388 4388
0.0 0.25 0.5 0.75 4738 4738
0.0 0.25 0.75 1.0 5309 5309
0.0 0.25 1.0 1.25 4888 4888
0.0 0.25 1.25 1.5 4670 4670
0.0 0.25 1.5 1.75 5152 5152
0.0 0.25 1.75 2.0 5187 5187
0.0 0.25 2.0 2.25 5979 5979
0.0 0.25 2.25 2.5 4857 4857
0.0 0.25 2.5 2.75 5848 5848
0.0 0.25 2.75 3.0 4093 4093
0.0 0.25 3.0 3.25 5599 5599
0.0 0.25 3.25 3.5 5220 5220
0.0 0.25 3.5 3.75 5133 5133
0.0 0.25 3.75 4.0 5126 5126
0.0 0.25 4.0 4.25 4917 4917
0.0 0.25 4.25 4.5 4633 4633
0.0 0.25 4.5 4.75 4530 4530
0.0 0.25 4.75 5.0 4253 4253
0.0 0.25 5.0 5.25 5900 5900
0.0 0.25 5.25 5.5 5537 5537
0.0 0.25 5.5 5.75 4939 4939


In [None]:
# Number of entries in row_ids (the grid loop appends one list per modelled cell).
len(row_ids)

In [None]:
# Number of entries in preds; should match len(row_ids) since the loop appends to both together.
len(preds)

In [None]:
# Flatten the per-cell collections into two flat lists.
# NOTE: rebinding the same names means this cell is meant to run exactly once
# after the grid loop.
flat_row_ids = []
for cell_ids in row_ids:
    flat_row_ids.extend(cell_ids)
row_ids = flat_row_ids

flat_preds = []
for cell_preds in preds:
    flat_preds.extend(cell_preds)
preds = flat_preds

# Drop the temporary aliases; the data stays reachable through row_ids/preds.
del flat_row_ids, flat_preds

In [None]:
%%time
# Assemble the submission: one row per test row_id with its predicted place_id,
# ordered by row_id.
submission = pd.DataFrame()
submission['row_id'] = row_ids
submission['place_id'] = preds
submission = submission.sort_values('row_id')
# Alternative kept for reference: space-joined multi-prediction format.
#submission['place_id'] = [' '.join(str(x) for x in y) for y in test_pred.values]

In [None]:
%%time
# Preview the first rows of the submission frame.
submission.head()

In [None]:
%%time
# Write the submission to submission.csv in the working directory, without the index column.
submission.to_csv('submission.csv', index=False)

In [None]:
%%time
#train_reduced = train[~train['place_id'].isin(counts[counts < 800].index)]
# Exploration subset: training rows in the 0.10 x 0.10 corner at the origin.
# .copy() makes it an independent frame so the derived columns added in the
# following cell are not assigned onto a slice of `train`.
train_reduced = train[(train.x >= 0) & (train.x < .10) & (train.y >= 0) & (train.y < .10)].copy()

In [None]:
%%time
# Derive calendar features from `time`. The arithmetic treats `time` as
# minutes (time * 60 is parsed as seconds since the epoch) -- TODO confirm unit.
# Column insertion order matters: `features` is later built in column order.
# Whole days elapsed since time 0.
train_reduced['day_number'] = ((train_reduced['time']/60)//24).astype(int)
train_reduced['seconds'] = (train_reduced['time'] * 60)
# Timestamp relative to the Unix epoch, used only to extract calendar parts.
train_reduced['date_time'] = pd.to_datetime(train_reduced['seconds'],unit='s')
train_reduced['hour'] = train_reduced['date_time'].dt.hour
# Day of month of the derived timestamp.
train_reduced['day'] = train_reduced['date_time'].dt.day
# Day of week of the derived timestamp (pandas: Monday=0 .. Sunday=6).
train_reduced['dow'] = train_reduced['date_time'].dt.dayofweek

In [None]:
# Summary statistics for the columns of the exploration subset.
train_reduced.describe()

In [None]:
# Earliest and latest derived timestamp in the subset.
print train_reduced['date_time'].min(), train_reduced['date_time'].max()

In [None]:
# Distribution of check-ins over day_number, drawn as a 100-bin step outline.
plt.rcParams["figure.figsize"] = [12,9]
plt.hist(train_reduced['day_number'], histtype = 'step', bins=100)
plt.autoscale(axis='both', enable=True, tight=True)

In [None]:
# Check-ins per day of week, with one bin centred on each integer value.
plt.rcParams["figure.figsize"] = [12,9]
# FIX: np.arange(7)-0.5 gives edges -0.5..5.5, i.e. only six bins, so rows with
# dow == 6 were left out. Eight edges (-0.5..6.5) give one bin per value 0..6.
plt.hist(train_reduced.dow, bins=np.arange(8)-0.5)
plt.autoscale(enable=True, axis='both', tight=True)

In [None]:
# Check-ins per derived hour, split into 24 equal-width bins.
plt.rcParams["figure.figsize"] = [12,9]
plt.hist(train_reduced['hour'], bins=24)
plt.autoscale(axis='both', enable=True, tight=True)

In [None]:
# Model inputs, listed in the order they appear in train_reduced's columns.
features = [col for col in train_reduced.columns if col in ('x', 'y', 'accuracy', 'hour', 'day', 'dow')]

In [None]:
# Spatial layout of the subset: x against y, coloured by place_id.
plt.rcParams["figure.figsize"] = [12,9]
plt.scatter(train_reduced['x'], train_reduced['y'], c=train_reduced['place_id'])
plt.autoscale(axis='both', enable=True, tight=True)

In [None]:
%%time
# 3D view of the subset: x and y on the floor, derived hour on the z axis,
# coloured by place_id.
small_counts = train_reduced['place_id'].value_counts()
# Threshold of 0 keeps every place that occurs; raise it to drop rare places.
small_trainz = train_reduced[
    train_reduced['place_id'].isin(small_counts[small_counts > 0].index)
]
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(
    small_trainz['x'],
    small_trainz['y'],
    zs=small_trainz['hour'],
    zdir='z',
    s=20,
    c=small_trainz['place_id'],
    depthshade=True,
)
plt.autoscale(axis='both', enable=True, tight=True)

In [None]:
# %%time
# Hold-out split of the exploration subset: 60% of the rows go to the test
# side, the remaining 40% are used for fitting.
# NOTE(review): no random_state is passed, so the split differs between runs.
features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(train_reduced[features], train_reduced['place_id'], test_size=0.60)
# features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(features_train, labels_train, test_size=0.60)

In [None]:
# Inspect the container type returned for the training labels.
type(labels_train)

In [None]:
%%time
# d_tree = tree.DecisionTreeClassifier(min_samples_split=1000).fit(features_train, labels_train)
# Fit a 100-tree random forest on the hold-out training side, using all cores (n_jobs=-1).
# NOTE(review): no random_state is set, so the fitted model differs between runs.
clf_rf = ensemble.RandomForestClassifier(n_estimators=100, n_jobs=-1).fit(features_train, labels_train)

In [None]:
%%time
# Single best place_id per held-out row.
pred = clf_rf.predict(features_test)

In [None]:
# Fraction of held-out rows whose predicted place_id equals the true one.
print accuracy_score(labels_test, pred)

In [None]:
# Preview the subset including the derived time columns.
train_reduced.head()

In [None]:
# Scratch: appends the subset's row_id array as a single element of row_ids.
# NOTE(review): these are training row ids, and row_ids may already have been
# flattened for the submission by an earlier cell -- confirm this is intended.
row_ids.append(train_reduced['row_id'].values)

In [None]:
# Peek at the first five entries of row_ids.
row_ids[:5]

In [None]:
# Inspect the container type returned for the held-out labels.
type(labels_test)

In [None]:
# Scratch: appends the whole hold-out prediction array as a single element of preds.
# NOTE(review): preds may already be the flattened submission list -- confirm this is intended.
preds.append(pred)

In [None]:
# Peek at the first five entries of preds.
preds[:5]

In [None]:
# Inspect the container type returned by clf_rf.predict.
type(pred)

In [None]:
%%time
# Per-class probabilities for the held-out rows, one column per place_id.
# Columns are labelled with the sorted unique training labels.
probs = pd.DataFrame(
    clf_rf.predict_proba(features_test),
    columns=np.unique(labels_train.values),
)

In [None]:
%%time
# Top-3 place_ids per held-out row, most probable first: one row per sample,
# one column per rank.
# FIX: the previous form wrapped each row's labels in an extra list(), nesting
# the three labels inside a one-element list instead of spreading them over
# three columns. The per-row sort via iterrows is replaced by one argsort.
# NOTE(review): this rebinds `preds`, which earlier cells use for the submission.
top3_positions = np.argsort(-probs.values, axis=1)[:, :3]
preds = pd.DataFrame(probs.columns.values[top3_positions])

In [None]:
%%time
print mapk([[l] for l in labels_test], [pred], 3)

In [None]:
%%time
# Top-3 place_ids per test row, most probable first.
# NOTE(review): in the visible cells only per-cell slices receive the derived
# hour/day/dow columns, so `test[features]` presumably fails unless `test` is
# given those columns elsewhere -- confirm before running.
# FIX: label the probability columns with the model's class ids; with the
# default 0..n-1 columns the "top 3" were column positions, not place_ids.
test_probs = pd.DataFrame(clf_rf.predict_proba(test[features]), columns=clf_rf.classes_)
# FIX: the previous form wrapped each row's labels in an extra list(), nesting
# them inside a one-element list instead of three columns.
top3_test_positions = np.argsort(-test_probs.values, axis=1)[:, :3]
test_pred = pd.DataFrame(test_probs.columns.values[top3_test_positions])

In [None]:
%%time
# Single best place_id per test row; rebinds test_pred, replacing the value assigned in the previous cell.
# NOTE(review): relies on `test` containing every column named in `features` -- confirm.
test_pred = clf_rf.predict(test[features])