In [104]:
import pandas as pd
import numpy as np
from time import time
import math
from sklearn import metrics, preprocessing, linear_model
from sklearn.metrics import log_loss, make_scorer
from sklearn.model_selection import GroupKFold
from sklearn.grid_search import RandomizedSearchCV, GridSearchCV

import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost.sklearn import XGBClassifier
# from sklearn.gaussian_process import GaussianProcessClassifier
# from sklearn.gaussian_process.kernels import RBF
# from sklearn.ensemble import AdaBoostClassifier
# from sklearn.naive_bayes import GaussianNB
# from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [21]:
competition = '53'
# Set seed for reproducibility
np.random.seed(0)

# Load the data from the CSV files
training_data = pd.read_csv('numerai_datasets' + competition + '/numerai_training_data.csv', header=0)
prediction_data = pd.read_csv('numerai_datasets' + competition + '/numerai_tournament_data.csv', header=0)


# Transform the loaded CSV data into numpy arrays
features = [f for f in list(training_data) if "feature" in f]
X = training_data[features]
Y = training_data["target"]
x_prediction = prediction_data[features]
val_data = prediction_data[prediction_data['data_type'] == 'validation']
x_validation = val_data[features]
y_validation = val_data["target"]
ids = prediction_data["id"]

# Prepare CV, functions

In [22]:
group_kfold = GroupKFold(n_splits=5)
group = training_data.era.apply(lambda x: int(x[3:]))
cv_split = list(group_kfold.split(X, Y, group))
for a, b in cv_split:
    print 'train', a, b
#     print 'test', b, X[b], y[b]

train [     0      1      2 ..., 108402 108403 108404] [  365   366   367 ..., 94972 94973 94974]
train [     0      1      2 ..., 108402 108403 108404] [   124    125    126 ..., 100379 100380 100381]
train [     0      1      2 ..., 105707 105708 105709] [   988    989    990 ..., 108402 108403 108404]
train [   124    125    126 ..., 108402 108403 108404] [     0      1      2 ..., 104386 104387 104388]
train [     0      1      2 ..., 108402 108403 108404] [   768    769    770 ..., 105707 105708 105709]


In [85]:
def consistency(model, XGB=False):
    print 'Threshold loss', -math.log(0.5)
    count = 0
    good = 0
    for era in val_data.era.unique():
        x_val = val_data[val_data.era == era][features]
        y_val = val_data[val_data.era == era]['target']
        if XGB:
            val = xgb.DMatrix(x_val.values, label=y_val.values)
            val_loss = log_loss(y_val, pd.DataFrame(model.predict(val)).as_matrix())
        else:
            val_loss = log_loss(y_val, pd.DataFrame(model.predict_proba(x_val)).as_matrix())
        print 'Validation Loss:', val_loss
        count += 1
        if val_loss < -math.log(0.5):
            good += 1

    print 'Count =', count
    print 'Good =', good 
    print 'Percent of Good =', good*100.0/count

In [80]:
# def loss(model):
#     train_loss = log_loss(Y, pd.DataFrame(model.predict_proba(X)).as_matrix())
#     print 'Train Loss:', train_loss
#     val_loss = log_loss(y_validation, pd.DataFrame(model.predict_proba(x_validation)).as_matrix())
#     print 'Validation Loss:', val_loss
def loss(model, x_tr, y_tr, x_val, y_val, proba=True):
    if proba:
        train_loss = log_loss(y_tr, pd.DataFrame(model.predict_proba(x_tr)).as_matrix())
        val_loss = log_loss(y_val, pd.DataFrame(model.predict_proba(x_val)).as_matrix())
    else:
        train_loss = log_loss(y_tr, pd.DataFrame(model.predict(x_tr)).as_matrix())
        val_loss = log_loss(y_val, pd.DataFrame(model.predict(x_val)).as_matrix())
        
    print 'Train Loss:', train_loss
    print 'Validation Loss:', val_loss

In [98]:
def write(model, name, XGB=False):
    global competition
    print("Predicting...")
    # Your trained model is now used to make predictions on the numerai_tournament_data
    # The model returns two columns: [probability of 0, probability of 1]
    # We are just interested in the probability that the target is 1.
    if XGB:
        x_pred = xgb.DMatrix(x_prediction.values)
        y_prediction = model.predict(x_pred)
    else:
        y_prediction = model.predict_proba(x_prediction)
    results = y_prediction[:, 1]
    results_df = pd.DataFrame(data={'probability':results})
    joined = pd.DataFrame(ids).join(results_df)

    print("Writing predictions to predictions.csv")
    # Save the predictions out to a CSV file
    joined.to_csv('numerai_datasets' + competition + '/predictions_' + name + '.csv', index=False)
    # Now you can upload these predictions on numer.ai
    print("Done")

# Logistic Regression

In [64]:
model = linear_model.LogisticRegression(penalty='l1', n_jobs=-1, C=0.1)
print("Training...")
model.fit(X, Y)

Training...


LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [65]:
loss(model, x_tr=X, y_tr=Y, x_val=x_validation, y_val=y_validation, proba=True)

Train Loss: 0.692487817875
Validation Loss: 0.692596713598


In [66]:
consistency(model)

Validation Loss: 0.691643299389
Validation Loss: 0.692559359783
Validation Loss: 0.692949953109
Validation Loss: 0.692003647106
Validation Loss: 0.692688963808
Validation Loss: 0.69293531109
Validation Loss: 0.692992090129
Validation Loss: 0.692294311575
Validation Loss: 0.69125532033
Validation Loss: 0.693065943863
Validation Loss: 0.693425825007
Validation Loss: 0.693287082366
Count = 12
Good = 10
Percent of Good = 83.3333333333


In [None]:
write(model, 'LogisticRegression')

# Random Forest

In [35]:
log_scoring=make_scorer(log_loss, greater_is_better=False, needs_proba=True)
clf = RandomForestClassifier()
# specify parameters and distributions to sample from
param_dist = {"max_depth": range(5,15),
              "max_features": ['sqrt',0.2, 0.3, 0.4, 0.5],
              "n_estimators": [300, 400, 500, 600, 700, 800],
             "min_samples_split": [100, 300, 400, 500, 600, 800, 1000],
             "min_samples_leaf": [20, 40, 50, 70 ,100]}

# run randomized search
n_iter_search = 100
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search, cv=cv_split, n_jobs=-1, scoring=log_scoring, verbose=1)

In [36]:
start = time()
search = random_search.fit(X, Y)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
# report(random_search.cv_results_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Done 104 tasks      | elapsed: 12.9min
[Parallel(n_jobs=-1)]: Done 354 tasks      | elapsed: 43.3min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 62.8min finished


RandomizedSearchCV took 3849.46 seconds for 100 candidates parameter settings.


In [37]:
search.grid_scores_

[mean: -0.69275, std: 0.00057, params: {'n_estimators': 500, 'min_samples_split': 100, 'max_depth': 7, 'max_features': 0.4, 'min_samples_leaf': 20},
 mean: -0.69295, std: 0.00080, params: {'n_estimators': 800, 'min_samples_split': 400, 'max_depth': 14, 'max_features': 0.2, 'min_samples_leaf': 100},
 mean: -0.69273, std: 0.00058, params: {'n_estimators': 300, 'min_samples_split': 800, 'max_depth': 8, 'max_features': 'sqrt', 'min_samples_leaf': 50},
 mean: -0.69274, std: 0.00068, params: {'n_estimators': 800, 'min_samples_split': 600, 'max_depth': 9, 'max_features': 0.4, 'min_samples_leaf': 20},
 mean: -0.69287, std: 0.00081, params: {'n_estimators': 600, 'min_samples_split': 600, 'max_depth': 13, 'max_features': 0.4, 'min_samples_leaf': 100},
 mean: -0.69279, std: 0.00070, params: {'n_estimators': 400, 'min_samples_split': 1000, 'max_depth': 12, 'max_features': 0.3, 'min_samples_leaf': 50},
 mean: -0.69276, std: 0.00067, params: {'n_estimators': 800, 'min_samples_split': 100, 'max_depth

In [38]:
search.best_score_, search.best_estimator_, search.best_params_

# (-0.6928779206980246,
#  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
#              max_depth=5, max_features=0.4, max_leaf_nodes=None,
#              min_impurity_split=1e-07, min_samples_leaf=100,
#              min_samples_split=300, min_weight_fraction_leaf=0.0,
#              n_estimators=600, n_jobs=1, oob_score=False, random_state=None,
#              verbose=0, warm_start=False),
#  {'max_depth': 5,
#   'max_features': 0.4,
#   'min_samples_leaf': 100,
#   'min_samples_split': 300,
#   'n_estimators': 600})

(-0.6927037617895702,
 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
             max_depth=7, max_features='sqrt', max_leaf_nodes=None,
             min_impurity_split=1e-07, min_samples_leaf=70,
             min_samples_split=600, min_weight_fraction_leaf=0.0,
             n_estimators=400, n_jobs=1, oob_score=False, random_state=None,
             verbose=0, warm_start=False),
 {'max_depth': 7,
  'max_features': 'sqrt',
  'min_samples_leaf': 70,
  'min_samples_split': 600,
  'n_estimators': 400})

In [39]:
model = search.best_estimator_.fit(X, Y)

In [41]:
loss(model, x_tr=X, y_tr=Y, x_val=x_validation, y_val=y_validation, proba=True)

Train Loss: 0.689733050243
Validation Loss: 0.692599043325


In [None]:
consistency(model)

In [42]:
write(model, 'RandomForest')

Predicting...
Writing predictions to predictions.csv
Done


# Gradient Boosting

In [43]:
log_scoring=make_scorer(log_loss, greater_is_better=False, needs_proba=True)
clf = GradientBoostingClassifier()
# specify parameters and distributions to sample from
param_dist = {"max_depth": [1, 2, 3, 4, 5, 6, 7, 8],
              "max_features": ['sqrt',0.2, 0.3, 0.4, 0.5],
              "n_estimators": [300, 400, 500, 600, 700, 800],
              "learning_rate": [round(i,2) for i in np.arange(0.05,0.2,0.01)],
             "min_samples_split": [100, 300, 400, 500, 600, 800, 1000],
             "min_samples_leaf": [20, 40, 50, 70 ,100]}

# run randomized search
n_iter_search = 100
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search, cv=cv_split, n_jobs=-1, scoring=log_scoring, verbose=1)

In [44]:
start = time()
search = random_search.fit(X, Y)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
# report(random_search.cv_results_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Done 104 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done 354 tasks      | elapsed: 19.0min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 27.2min finished


RandomizedSearchCV took 1640.80 seconds for 100 candidates parameter settings.


In [45]:
search.grid_scores_

[mean: -0.69289, std: 0.00069, params: {'learning_rate': 0.09, 'min_samples_leaf': 100, 'n_estimators': 500, 'max_features': 0.3, 'min_samples_split': 400, 'max_depth': 1},
 mean: -0.69390, std: 0.00109, params: {'learning_rate': 0.09, 'min_samples_leaf': 50, 'n_estimators': 500, 'max_features': 0.2, 'min_samples_split': 600, 'max_depth': 3},
 mean: -0.69752, std: 0.00149, params: {'learning_rate': 0.1, 'min_samples_leaf': 20, 'n_estimators': 500, 'max_features': 0.5, 'min_samples_split': 100, 'max_depth': 5},
 mean: -0.69286, std: 0.00056, params: {'learning_rate': 0.08, 'min_samples_leaf': 100, 'n_estimators': 300, 'max_features': 0.5, 'min_samples_split': 300, 'max_depth': 1},
 mean: -0.69290, std: 0.00070, params: {'learning_rate': 0.08, 'min_samples_leaf': 70, 'n_estimators': 600, 'max_features': 'sqrt', 'min_samples_split': 600, 'max_depth': 1},
 mean: -0.69412, std: 0.00094, params: {'learning_rate': 0.06, 'min_samples_leaf': 50, 'n_estimators': 500, 'max_features': 'sqrt', 'min

In [46]:
model = search.best_estimator_.fit(X, Y)

In [47]:
loss(model, x_tr=X, y_tr=Y, x_val=x_validation, y_val=y_validation, proba=True)

Train Loss: 0.692074778203
Validation Loss: 0.692531153867


In [48]:
consistency(model)

Predicting...
Writing predictions to predictions.csv
Done


In [None]:
write(model, 'GradientBoosting')

# K-Nearest Neighbor

In [11]:
log_scoring=make_scorer(log_loss, greater_is_better=False, needs_proba=True)
clf = KNeighborsClassifier()
# specify parameters and distributions to sample from
param_grid = {"n_neighbors": range(1000, 11001, 5000),
#               "algorithm": ['auto', 'ball_tree', 'kd_tree']#, 'brute'],
#               "weights": ['uniform', 'distance']
             }

# run randomized search
# n_iter_search = 100
grid_search = GridSearchCV(clf, param_grid=param_grid,
                                   cv=cv_split, n_jobs=-1, scoring=log_scoring, verbose=1)

In [12]:
start = time()
search = grid_search.fit(X, Y)
print("GridSearchCV took %.2f seconds for %d candidates"
      " parameter settings." ((time() - start)))
# report(random_search.cv_results_)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  3.0min finished


TypeError: not enough arguments for format string

In [13]:
search.grid_scores_

[mean: -0.69305, std: 0.00078, params: {'n_neighbors': 1000},
 mean: -0.69294, std: 0.00041, params: {'n_neighbors': 6000},
 mean: -0.69296, std: 0.00035, params: {'n_neighbors': 11000}]

In [14]:
# model = KNeighborsClassifier(n_neighbors=100, n_jobs=-1)
# print("Training...")
# model.fit(X, Y)
model = search.best_estimator_.fit(X, Y)

In [17]:
loss(model, x_tr=X, y_tr=Y, x_val=x_validation, y_val=y_validation, proba=True)

Train Loss: 0.692660571913
Validation Loss: 0.692826321222


In [18]:
consistency(model)

Threshold loss 0.69314718056
Validation Loss: 0.692541106707
Validation Loss: 0.693149712931
Validation Loss: 0.693239586066
Validation Loss: 0.692593246476
Validation Loss: 0.693354007385
Validation Loss: 0.692821115751
Validation Loss: 0.693372195663
Validation Loss: 0.692415094578
Validation Loss: 0.691887720527
Validation Loss: 0.693382208368
Validation Loss: 0.692817370522
Validation Loss: 0.692380788331
Count = 12
Good = 7
Percent of Good = 58.3333333333


# SVM

In [None]:
model = SVC(kernel='rbf', C=0.1, probability=True)
print("Training...")
model.fit(X, Y)

In [None]:
loss(model, x_tr=X, y_tr=Y, x_val=x_validation, y_val=y_validation, proba=True)

In [None]:
consistency(model)

# XGB

In [26]:
xgtrain = xgb.DMatrix(X.values, label=Y.values)
xgval = xgb.DMatrix(x_validation.values, label=y_validation.values)

In [72]:
param = {'gamma':4, 'max_depth':2, 'eta':0.3, 'silent':0, 'objective':'multi:softprob', 'num_class':2}
param['nthread'] = 20
plst = param.items()
plst += [('eval_metric', 'mlogloss')] # Multiple evals can be handled in this way

evallist  = [(xgval,'eval'), (xgtrain,'train')]

In [73]:
num_round = 500
model = xgb.train( plst, xgtrain, num_round, evallist, early_stopping_rounds=20 )
# [38]	eval-mlogloss:0.692388	train-mlogloss:0.690688 
# 'gamma':2, 'max_depth':2, 'eta':0.5, 'silent':0, 'objective':'multi:softmax', 'num_class':2

Will train until train error hasn't decreased in 20 rounds.
[0]	eval-mlogloss:0.692972	train-mlogloss:0.692934
[1]	eval-mlogloss:0.692828	train-mlogloss:0.692729
[2]	eval-mlogloss:0.692708	train-mlogloss:0.692604
[3]	eval-mlogloss:0.692670	train-mlogloss:0.692535
[4]	eval-mlogloss:0.692670	train-mlogloss:0.692436
[5]	eval-mlogloss:0.692616	train-mlogloss:0.692402
[6]	eval-mlogloss:0.692577	train-mlogloss:0.692329
[7]	eval-mlogloss:0.692552	train-mlogloss:0.692225
[8]	eval-mlogloss:0.692554	train-mlogloss:0.692191
[9]	eval-mlogloss:0.692592	train-mlogloss:0.692140
[10]	eval-mlogloss:0.692592	train-mlogloss:0.692107
[11]	eval-mlogloss:0.692580	train-mlogloss:0.692032
[12]	eval-mlogloss:0.692561	train-mlogloss:0.691950
[13]	eval-mlogloss:0.692560	train-mlogloss:0.691907
[14]	eval-mlogloss:0.692559	train-mlogloss:0.691911
[15]	eval-mlogloss:0.692560	train-mlogloss:0.691908
[16]	eval-mlogloss:0.692560	train-mlogloss:0.691909
[17]	eval-mlogloss:0.692560	train-mlogloss:0.691910
[18]	eval-mlog

In [81]:
loss(model, x_tr=xgtrain, y_tr=Y, x_val=xgval, y_val=y_validation, proba=False)

Train Loss: 0.691907738853
Validation Loss: 0.692560348714


In [86]:
consistency(model, XGB=True)

Threshold loss 0.69314718056
Validation Loss: 0.692050552099
Validation Loss: 0.692611149711
Validation Loss: 0.692196117517
Validation Loss: 0.692303270102
Validation Loss: 0.693129951877
Validation Loss: 0.69255846545
Validation Loss: 0.69336131037
Validation Loss: 0.692607297937
Validation Loss: 0.691487173887
Validation Loss: 0.694004457318
Validation Loss: 0.693004723277
Validation Loss: 0.691401050398
Count = 12
Good = 10
Percent of Good = 83.3333333333


In [100]:
write(model, 'XGB1', XGB=True)

Predicting...
Writing predictions to predictions.csv
Done


# XGB Sklearn

In [114]:
log_scoring=make_scorer(log_loss, greater_is_better=False, needs_proba=True)
clf = XGBClassifier()
# specify parameters and distributions to sample from
param_dist = {
    "n_estimators": range(50, 400, 50), #[300, 400, 500, 600, 700, 800]
    "max_depth": range(3, 40), #[1, 2, 3, 4, 5, 6, 7, 8]
    "learning_rate": [round(i, 2) for i in np.arange(0.05,0.2,0.01)],
    "colsample_bytree": np.arange(0.3, 0.7, 0.1),
    "subsample": np.arange(0.3, 0.7, 0.1),
    "gamma": range(1, 6)}
#     'reg_alpha': from_zero_positive,
#     "min_child_weight": from_zero_positive,
    
    
#     "max_depth": [1, 2, 3, 4, 5, 6, 7, 8],
#               "max_features": ['sqrt',0.2, 0.3, 0.4, 0.5],
            
#               "learning_rate": [round(i,2) for i in np.arange(0.05,0.2,0.01)],
#              "min_samples_split": [100, 300, 400, 500, 600, 800, 1000],
#              "min_samples_leaf": [20, 40, 50, 70 ,100]}

# run randomized search
n_iter_search = 10
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search, cv=cv_split, n_jobs=5, scoring=log_scoring, verbose=1)

In [115]:
start = time()
search = random_search.fit(X, Y)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:  3.6min


KeyboardInterrupt: 

In [None]:
search.grid_scores_

In [None]:
model = search.best_estimator_.fit(X, Y)

In [None]:
loss(model, x_tr=X, y_tr=Y, x_val=x_validation, y_val=y_validation, proba=True)

In [None]:
consistency(model)