In [None]:
%%time
import warnings
warnings.filterwarnings('ignore')
import ml_metrics as metrics
import pandas as pd
import numpy as np
from sklearn import cross_validation, ensemble, tree, preprocessing
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support, accuracy_score
le = preprocessing.LabelEncoder()
import xgboost as xgb
from datetime import datetime, date
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pylab as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline

In [None]:
import numpy as np

def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.
    This function computes the average precision at k between two lists of
    items.
    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists; 0.0 when
            `actual` is empty/None or when `k` is not positive
    """
    # Degenerate inputs score zero. len() is used instead of truthiness so
    # that numpy arrays (whose truth value is ambiguous) are accepted, and the
    # k <= 0 guard avoids dividing by min(len(actual), k) == 0 below.
    if k <= 0 or actual is None or len(actual) == 0:
        return 0.0

    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        # A repeated prediction only counts the first time it appears.
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Mean average precision at k.

    Scores every (actual, predicted) pair with `apk` and averages the
    per-pair scores with `np.mean`.

    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The mean average precision at k over the input lists
    """
    per_query_scores = []
    # zip() pairs the two sequences positionally and stops at the shorter one.
    for truth, ranked in zip(actual, predicted):
        per_query_scores.append(apk(truth, ranked, k))
    return np.mean(per_query_scores)

In [None]:
# Accumulators for the grid loop: each grid cell appends one batch of row ids
# and one batch of top-3 predictions.
ids, predictions = [], []

In [None]:
%%time
# Load the raw train and test tables from the working directory.
train, test = [pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')]

In [None]:
%%time
def _add_time_features(frame):
    """Return a copy of `frame` with calendar columns derived from `time`.

    `time` is divided by 60 to get hours and multiplied by 60 to get seconds,
    i.e. it is handled as minutes (assumption carried by the arithmetic --
    confirm against the data description). Working on a copy means the new
    columns are never written into a filtered slice of the parent frame.
    """
    frame = frame.copy()
    frame['day_number'] = ((frame['time']/60)//24).astype(int)
    frame['seconds'] = (frame['time'] * 60)
    frame['date_time'] = pd.to_datetime(frame['seconds'], unit='s')
    frame['hour'] = frame['date_time'].dt.hour
    frame['day'] = frame['date_time'].dt.day
    frame['dow'] = frame['date_time'].dt.dayofweek
    return frame


# One random forest per 0.5 x 0.5 grid cell. This run covers x in [7.5, 10]
# and y in [0, 10]; each model is trained on a window extending 0.1 beyond the
# cell on every side and predicts only the test rows inside the cell.
for i in np.arange(7.5, 10, .5):
    for r in np.arange(0, 10, .5):
        train_reduced = train[(train.x >= (i - .1)) & (train.x <= (i + .6)) &
                              (train.y >= (r - .1)) & (train.y <= (r + .6))]

        # Cells are half-open so no test row is predicted twice; only the last
        # column/row of the grid includes its upper edge.
        x_upper = (test.x <= (i + .5)) if i == 9.5 else (test.x < (i + .5))
        y_upper = (test.y <= (r + .5)) if r == 9.5 else (test.y < (r + .5))
        test_reduced = test[(test.x >= i) & x_upper & (test.y >= r) & y_upper]

        # Only places seen more than 100 times in the window are kept as classes.
        small_counts = train_reduced['place_id'].value_counts()
        frequent_places = small_counts[small_counts > 100].index
        is_frequent = train_reduced['place_id'].isin(frequent_places)
        # Single-argument print(...) emits the same text on Python 2 and 3.
        print('Test:  %s %s %s %s %s %s' % (i, (i + .5), r, (r + .5), is_frequent.sum(), len(test_reduced)))

        if len(test_reduced) == 0:
            # Nothing to predict in this cell; predict_proba would fail on an empty frame.
            continue

        train_reduced = _add_time_features(train_reduced)
        test_reduced = _add_time_features(test_reduced)
        features = [c for c in train_reduced.columns if c in ['x', 'y', 'accuracy', 'hour', 'day', 'dow']]

        # The copy made by the helper keeps the original index, so the mask still aligns.
        train_frequent = train_reduced[is_frequent]
        clf_rf = ensemble.RandomForestClassifier(n_estimators=20, n_jobs=-1).fit(train_frequent[features], train_frequent['place_id'])

        # predict_proba columns follow clf_rf.classes_, so label them from the fitted model.
        proba = clf_rf.predict_proba(test_reduced[features])
        probs = pd.DataFrame(proba, columns=clf_rf.classes_)

        # Top-3 classes per row in one vectorised pass; mergesort is stable, so
        # ties are broken by class order.
        top3 = clf_rf.classes_[np.argsort(-proba, axis=1, kind='mergesort')[:, :3]]
        preds = pd.DataFrame([[row] for row in top3])

        print('All Good:  %s %s %s %s %s %s' % (i, (i + .5), r, (r + .5), len(test_reduced['row_id']), len(preds)))
        ids.append(list(test_reduced['row_id'].values))
        predictions.append(preds[0])

In [None]:
# Number of row-id batches collected so far (the grid loop appends one list per cell).
len(ids)

In [None]:
# Number of prediction batches collected so far; expected to equal len(ids).
len(predictions)

In [None]:
%%time
# Flatten the per-cell batches into one flat list of row ids and one flat
# list of per-row predictions.
# NOTE(review): this rebinds `ids` and `predictions`, so running the cell a
# second time would act on the already-flattened lists.
flat_ids = []
for id_batch in ids:
    flat_ids.extend(id_batch)
ids = flat_ids

flat_predictions = []
for prediction_batch in predictions:
    flat_predictions.extend(prediction_batch)
predictions = flat_predictions

In [None]:
# After flattening: total number of row ids gathered across all grid cells.
len(ids)

In [None]:
# After flattening: total number of per-row predictions; expected to equal len(ids).
len(predictions)

In [None]:
%%time
# Build the submission table: one row per row_id, with that row's predicted
# place ids joined by single spaces. Sorting by row_id is left to the merged
# frame further down the notebook.
submission = pd.DataFrame(
    {
        'row_id': ids,
        'place_id': [' '.join(map(str, top_places)) for top_places in predictions],
    },
    columns=['row_id', 'place_id'],
)

In [None]:
%%time
# Preview the first rows of this band's submission table.
submission.head()

In [None]:
%%time
# Persist this band's predictions; the file name matches the x range (7.5 to 10)
# iterated by the grid loop. index=False keeps the pandas index out of the file.
# NOTE(review): assumes the submissions/ directory already exists -- confirm.
submission.to_csv('submissions/submission-7.5-10.csv', index=False)

In [None]:
%%time
# Reload the four per-band submission files (x bands 0-2.5, 2.5-5, 5-7.5, 7.5-10).
band_files = (
    'submissions/submission-0-2.5.csv',
    'submissions/submission-2.5-5.csv',
    'submissions/submission-5-7.5.csv',
    'submissions/submission-7.5-10.csv',
)
submission_1, submission_2, submission_3, submission_4 = [
    pd.read_csv(band_file) for band_file in band_files
]

In [None]:
%%time
# Stack the four band frames into one table with a fresh 0..n-1 index.
band_frames = [submission_1, submission_2, submission_3, submission_4]
submission_concat = pd.concat(band_frames, ignore_index=True)

In [None]:
%%time
# Order the merged submission by row_id (rebinding instead of sorting in place).
submission_concat = submission_concat.sort_values('row_id')

In [None]:
%%time
# Sanity check: the merged submission should have one row per test row.
# Single-argument print(...) emits the same text on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print('%d %d' % (len(test), len(submission_concat)))

In [None]:
# Preview the first rows of the merged submission.
submission_concat.head()

In [None]:
%%time
# Write the merged submission to submission.csv; index=False keeps the pandas index out of the file.
submission_concat.to_csv('submission.csv', index=False)

In [None]:
%%time
# Exploration window: training rows with x and y in [4.40, 5.10).
# .copy() gives an independent frame, so the time-feature columns assigned to
# train_reduced later in the notebook are not written onto a slice of `train`.
train_reduced = train[(train.x >= 4.40) & (train.x < 5.10) & (train.y >= 4.40) & (train.y < 5.10)].copy()

In [None]:
%%time
# Derive calendar features from `time`. The arithmetic treats `time` as
# minutes (/60 -> hours, *60 -> seconds) -- assumption, confirm against the data.
raw_time = train_reduced['time']
train_reduced['day_number'] = ((raw_time/60)//24).astype(int)
train_reduced['seconds'] = raw_time * 60
stamps = pd.to_datetime(train_reduced['seconds'], unit='s')
train_reduced['date_time'] = stamps
# Column name -> datetime accessor attribute, in the original column order.
for column, accessor in (('hour', 'hour'), ('day', 'day'), ('dow', 'dayofweek')):
    train_reduced[column] = getattr(stamps.dt, accessor)

In [None]:
# Show the span of the derived timestamps. Single-argument print(...) emits
# the same text on Python 2 and Python 3 (the bare statement fails on 3).
print('%s %s' % (train_reduced['date_time'].min(), train_reduced['date_time'].max()))

In [None]:
# Check-ins per day number, drawn as a 100-bin step histogram.
plt.rcParams.update({"figure.figsize": [12, 9]})
plt.hist(x=train_reduced.day_number, histtype='step', bins=100)
plt.autoscale(tight=True, axis='both', enable=True)

In [None]:
# Check-ins per day of week. `dow` takes the values 0..6, so eight bin edges
# (-0.5, 0.5, ..., 6.5) are needed for seven unit-wide bins centred on the
# integers; np.arange(7) - 0.5 stopped at 5.5 and silently dropped dow == 6.
plt.rcParams["figure.figsize"] = [12,9]
plt.hist(train_reduced.dow, bins=np.arange(8)-0.5)
plt.autoscale(enable=True, axis='both', tight=True)

In [None]:
# Check-ins per hour of day, 24 equal-width bins.
plt.rcParams.update({"figure.figsize": [12, 9]})
plt.hist(x=train_reduced.hour, bins=24)
plt.autoscale(tight=True, axis='both', enable=True)

In [None]:
# Model inputs: the wanted columns that are actually present, in frame order.
wanted_columns = ('x', 'y', 'accuracy', 'hour', 'day', 'dow')
features = [name for name in train_reduced.columns if name in wanted_columns]

In [None]:
# Check-in locations in the exploration window, coloured by place_id.
plt.rcParams.update({"figure.figsize": [12, 9]})
plt.scatter(x=train_reduced.x, y=train_reduced.y, c=train_reduced.place_id)
plt.autoscale(tight=True, axis='both', enable=True)

In [None]:
# Number of training rows inside the exploration window.
len(train_reduced)

In [None]:
%%time
# Keep only rows whose place occurs more than 100 times in the window and
# report how many rows survive the filter.
small_counts = train_reduced['place_id'].value_counts()
frequent_places = small_counts[small_counts > 100].index
small_trainz = train_reduced[train_reduced['place_id'].isin(frequent_places)]
# print(...) with one argument behaves the same on Python 2 and Python 3.
print(len(small_trainz))

In [None]:
# Summary statistics of the per-place row counts in the window.
small_counts.describe()

In [None]:
%%time
# 3-D view (x, y, hour) of the busiest places only: those with more than 600
# rows in the window. Note that this rebinds small_trainz with the stricter
# threshold.
small_counts = train_reduced['place_id'].value_counts()
busy_places = small_counts[small_counts > 600].index
busy_mask = train_reduced['place_id'].isin(busy_places)
small_trainz = train_reduced[busy_mask]

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(
    small_trainz.x,
    small_trainz.y,
    zs=small_trainz.hour,
    zdir='z',
    c=small_trainz.place_id,
    s=20,
    depthshade=True,
)
plt.autoscale(tight=True, axis='both', enable=True)

In [None]:
%%time
# Hold out 60% of the window for evaluation and train on the remaining 40%.
# A fixed random_state makes the split (and every score computed from it)
# reproducible across kernel restarts.
features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(
    train_reduced[features], train_reduced['place_id'], test_size=0.60, random_state=42)

In [None]:
%%time
# Fit a 10-tree random forest on the training split only, using all cores.
# (An XGBClassifier with objective 'multi:softprob' was tried here as well and
# is left disabled.)
clf_rf = ensemble.RandomForestClassifier(n_jobs=-1, n_estimators=10)
clf_rf = clf_rf.fit(features_train, labels_train)

In [None]:
%%time
# Refit clf_rf (10 trees) on every row of the window, replacing the model
# that was fitted on the training split.
# NOTE(review): these rows include the features_test hold-out, so any score
# computed on features_test with this model is not a held-out estimate --
# confirm this refit is intended before the evaluation cells.
window_inputs = train_reduced[features]
window_labels = train_reduced['place_id']
clf_rf = ensemble.RandomForestClassifier(n_jobs=-1, n_estimators=10)
clf_rf = clf_rf.fit(window_inputs, window_labels)

In [None]:
%%time
# Inner cell x, y in [4.50, 5): a stand-in "test" set drawn from `train`
# (so labels are available). .copy() gives an independent frame, so the
# time-feature columns assigned to test_reduced later in the notebook are not
# written onto a slice of `train`.
test_reduced = train[(train.x >= 4.50) & (train.x < 5) & (train.y >= 4.50) & (train.y < 5)].copy()

In [None]:
%%time
# Same calendar features as for the training window. The arithmetic treats
# `time` as minutes (/60 -> hours, *60 -> seconds) -- assumption, confirm.
raw_time = test_reduced['time']
test_reduced['day_number'] = ((raw_time/60)//24).astype(int)
test_reduced['seconds'] = raw_time * 60
stamps = pd.to_datetime(test_reduced['seconds'], unit='s')
test_reduced['date_time'] = stamps
# Column name -> datetime accessor attribute, in the original column order.
for column, accessor in (('hour', 'hour'), ('day', 'day'), ('dow', 'dayofweek')):
    test_reduced[column] = getattr(stamps.dt, accessor)

In [None]:
%%time
# Predict a single place_id for each row held out by train_test_split.
# NOTE(review): in notebook order clf_rf was last refit on all of
# train_reduced, which contains these rows, so the accuracy computed from
# `pred` is not a held-out estimate -- confirm intent.
pred = clf_rf.predict(features_test)

In [None]:
%%time
# Top-1 accuracy of `pred` against the hold-out labels. Single-argument
# print(...) emits the same text on Python 2 and Python 3 (the bare
# statement is a SyntaxError on Python 3).
print(accuracy_score(labels_test, pred))

In [None]:
%%time
# Class probabilities for the inner cell. predict_proba returns one column per
# entry of clf_rf.classes_, in that order, so the columns are labelled from
# the fitted model itself. Labelling them from a separately filtered training
# set fails with a length mismatch whenever the model was fitted on a
# different set of places.
proba = clf_rf.predict_proba(test_reduced[features])
probs = pd.DataFrame(proba, columns=clf_rf.classes_)

# Top-3 place ids per row in one vectorised pass; mergesort is stable, so ties
# are broken by class order. `preds` keeps its shape: one column (0) holding
# an array of place ids per row.
top3 = clf_rf.classes_[np.argsort(-proba, axis=1, kind='mergesort')[:, :3]]
preds = pd.DataFrame([[row] for row in top3])

In [None]:
# Model inputs: the wanted columns that are actually present, in frame order.
wanted_columns = ('x', 'y', 'accuracy', 'hour', 'day', 'dow')
features = [name for name in train_reduced.columns if name in wanted_columns]

In [None]:
%%time
# Refit clf_rf with 50 trees, restricted to places that occur more than 150
# times in the window.
frequent_mask = train_reduced['place_id'].isin(small_counts[small_counts > 150].index)
frequent_rows = train_reduced[frequent_mask]
clf_rf = ensemble.RandomForestClassifier(n_jobs=-1, n_estimators=50)
clf_rf = clf_rf.fit(frequent_rows[features], frequent_rows['place_id'])

In [None]:
%%time
# For each row of `probs`, keep the labels of its three largest probabilities;
# the result has a single column (0) holding one array of labels per row.
# NOTE(review): `probs` is reused as-is here; it is not recomputed from the
# classifier that was refit just before this cell -- confirm that is intended.
top3_per_row = []
for _, row_probs in probs.iterrows():
    ranked = row_probs.sort_values(ascending=False)
    top3_per_row.append([ranked[:3].index.values])
preds = pd.DataFrame(top3_per_row)

In [None]:
%%time
# MAP@3 of the top-3 predictions against the hold-out labels. Single-argument
# print(...) emits the same text on Python 2 and Python 3 (the bare statement
# is a SyntaxError on Python 3).
# NOTE(review): labels_test comes from train_test_split on train_reduced while
# preds is derived from the test_reduced rows; mapk pairs them positionally
# via zip(), so confirm the two sequences really refer to the same rows.
print(mapk([[l] for l in labels_test], preds[0], 3))

In [None]:
# Size of the hold-out split (test_size=0.60 of the window).
len(features_test)

In [None]:
# Size of the training split (the remaining share of the window).
len(features_train)

In [None]:
%%time
# 50-tree forest fitted on every row of the window, with no place-count filter.
forest = ensemble.RandomForestClassifier(n_jobs=-1, n_estimators=50)
forest = forest.fit(train_reduced[features], train_reduced['place_id'])

In [None]:
%%time
# `test` as loaded has no hour/day/dow columns (they were only ever added to
# test_reduced frames), so test[features] raised KeyError. Derive them here on
# a copy, with the same arithmetic used for the training window.
test_stamps = pd.to_datetime(test['time'] * 60, unit='s')
test_with_time = test.copy()
test_with_time['hour'] = test_stamps.dt.hour
test_with_time['day'] = test_stamps.dt.day
test_with_time['dow'] = test_stamps.dt.dayofweek

# NOTE(review): this scores with clf_rf, not the `forest` model fitted in the
# preceding cell -- confirm which model is intended.
test_proba = clf_rf.predict_proba(test_with_time[features])
# Label the columns with the model's classes so the top-3 are place ids
# rather than positional column numbers.
test_probs = pd.DataFrame(test_proba, columns=clf_rf.classes_)

# Top-3 place ids per row in one vectorised pass; mergesort is stable, so ties
# are broken by class order. One column (0) holding an array per row.
top3 = clf_rf.classes_[np.argsort(-test_proba, axis=1, kind='mergesort')[:, :3]]
test_pred = pd.DataFrame([[row] for row in top3])

In [None]:
%%time
# `test` as loaded has no hour/day/dow columns (they were only ever added to
# test_reduced frames), so test[features] raised KeyError. Derive them here on
# a copy, with the same arithmetic used for the training window.
test_stamps = pd.to_datetime(test['time'] * 60, unit='s')
test_with_time = test.copy()
test_with_time['hour'] = test_stamps.dt.hour
test_with_time['day'] = test_stamps.dt.day
test_with_time['dow'] = test_stamps.dt.dayofweek

# Single most likely place per test row. This rebinds test_pred.
test_pred = clf_rf.predict(test_with_time[features])