In [102]:
import pandas as pd
import numpy as np
from sklearn import metrics, preprocessing, linear_model
from sklearn.metrics import log_loss, make_scorer
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GroupKFold
from sklearn.grid_search import RandomizedSearchCV
from time import time


In [103]:
competition = '53'
# Set seed for reproducibility
np.random.seed(0)

# Load the data from the CSV files
training_data = pd.read_csv('numerai_datasets' + competition + '/numerai_training_data.csv', header=0)
prediction_data = pd.read_csv('numerai_datasets' + competition + '/numerai_tournament_data.csv', header=0)


# Transform the loaded CSV data into numpy arrays
features = [f for f in list(training_data) if "feature" in f]
X = training_data[features]
Y = training_data["target"]
x_prediction = prediction_data[features]
x_validation = prediction_data[prediction_data['data_type'] == 'validation'][features]
y_validation = prediction_data["target"][prediction_data['data_type'] == 'validation']
ids = prediction_data["id"]

# Logistic Regression

In [3]:
# Logistic Regression
model = linear_model.LogisticRegression(n_jobs=-1)
# model = RandomForestClassifier(n_estimators=1000, max_features=0.5, max_depth=20, min_samples_split=20, random_state=0, n_jobs=-1)
print("Training...")
# Your model is trained on the training_data
model.fit(X, Y)

Training...


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [None]:
train_loss = log_loss(Y, pd.DataFrame(model.predict_proba(X)).as_matrix())
print 'Train Loss:', train_loss
val_loss = log_loss(y_validation, pd.DataFrame(model.predict_proba(x_validation)).as_matrix())
print 'Validation Loss:', val_loss

In [None]:
print("Predicting...")
# Your trained model is now used to make predictions on the numerai_tournament_data
# The model returns two columns: [probability of 0, probability of 1]
# We are just interested in the probability that the target is 1.
y_prediction = model.predict_proba(x_prediction)
results = y_prediction[:, 1]
results_df = pd.DataFrame(data={'probability':results})
joined = pd.DataFrame(ids).join(results_df)

print("Writing predictions to predictions.csv")
# Save the predictions out to a CSV file
joined.to_csv('numerai_datasets' + competition + '/predictions_LogReg.csv", index=False)
# Now you can upload these predictions on numer.ai
print("Done")

In [104]:
group_kfold = GroupKFold(n_splits=5)
group = training_data.era.apply(lambda x: int(x[3:]))
cv_split = group_kfold.split(X, Y, groups)
# for a, b in group_kfold.split(X, y, groups):
#     print 'train', a, X[a], y[a]
#     print 'test', b, X[b], y[b]

In [105]:
cv_split

<generator object split at 0x7ffab0d3a870>

In [106]:
for a, b in cv_split:
    print 'train', a, b

ValueError: Found input variables with inconsistent numbers of samples: [108405, 108405, 4]

In [86]:
from sklearn.model_selection import PredefinedSplit
# >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
# >>> y = np.array([0, 0, 1, 1])
# >>> test_fold = [0, 1, -1, 1]
ps = PredefinedSplit(group)
# print ps.get_n_splits()
pss = list(ps.split())
# print(ps)       
# PredefinedSplit(test_fold=array([ 0,  1, -1,  1]))
# for train_index, test_index in ps.split():
#     print("TRAIN:", train_index, "TEST:", test_index)
#     X_train, X_test = X[train_index], X[test_index]
#     y_train, y_test = y[train_index], y[test_index]

TypeError: __init__() got an unexpected keyword argument 'n_splits'

# Random Forest

In [83]:
log_scoring=make_scorer(log_loss, greater_is_better=False, needs_proba=True)
clf = RandomForestClassifier()
# specify parameters and distributions to sample from
param_dist = {"max_depth": range(5,15),
              "max_features": ['sqrt',0.2, 0.3, 0.4, 0.5],
              "n_estimators": [300, 400, 500, 600, 700, 800],
             "min_samples_split": [100, 300, 400, 500, 600, 800, 1000],
             "min_samples_leaf": [20, 40, 50, 70 ,100]}

# run randomized search
n_iter_search = 100
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search, cv=pss, n_jobs=-1, scoring=log_scoring, verbose=1)

In [84]:
start = time()
search = random_search.fit(X, Y)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
# report(random_search.cv_results_)

Fitting 96 folds for each of 100 candidates, totalling 9600 fits


KeyboardInterrupt: 

In [11]:
search.grid_scores_

[mean: -0.69292, std: 0.00047, params: {'n_estimators': 500, 'min_samples_split': 800, 'max_depth': 8, 'max_features': 'sqrt', 'min_samples_leaf': 40},
 mean: -0.69297, std: 0.00053, params: {'n_estimators': 800, 'min_samples_split': 500, 'max_depth': 9, 'max_features': 0.4, 'min_samples_leaf': 100},
 mean: -0.69335, std: 0.00071, params: {'n_estimators': 600, 'min_samples_split': 400, 'max_depth': 14, 'max_features': 0.5, 'min_samples_leaf': 50},
 mean: -0.69291, std: 0.00042, params: {'n_estimators': 300, 'min_samples_split': 100, 'max_depth': 6, 'max_features': 'sqrt', 'min_samples_leaf': 50},
 mean: -0.69292, std: 0.00037, params: {'n_estimators': 400, 'min_samples_split': 600, 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 20},
 mean: -0.69311, std: 0.00062, params: {'n_estimators': 500, 'min_samples_split': 500, 'max_depth': 14, 'max_features': 0.2, 'min_samples_leaf': 20},
 mean: -0.69291, std: 0.00043, params: {'n_estimators': 500, 'min_samples_split': 800, 'max_de

In [12]:
search.best_score_, search.best_estimator_, search.best_params_

# (-0.6928779206980246,
#  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
#              max_depth=5, max_features=0.4, max_leaf_nodes=None,
#              min_impurity_split=1e-07, min_samples_leaf=100,
#              min_samples_split=300, min_weight_fraction_leaf=0.0,
#              n_estimators=600, n_jobs=1, oob_score=False, random_state=None,
#              verbose=0, warm_start=False),
#  {'max_depth': 5,
#   'max_features': 0.4,
#   'min_samples_leaf': 100,
#   'min_samples_split': 300,
#   'n_estimators': 600})

(-0.6928779206980246,
 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
             max_depth=5, max_features=0.4, max_leaf_nodes=None,
             min_impurity_split=1e-07, min_samples_leaf=100,
             min_samples_split=300, min_weight_fraction_leaf=0.0,
             n_estimators=600, n_jobs=1, oob_score=False, random_state=None,
             verbose=0, warm_start=False),
 {'max_depth': 5,
  'max_features': 0.4,
  'min_samples_leaf': 100,
  'min_samples_split': 300,
  'n_estimators': 600})

In [13]:
model = search.best_estimator_.fit(X, Y)

In [14]:
# Random Forest
# model = RandomForestClassifier(n_estimators=10, max_features='sqrt', max_depth=10, min_samples_split=20, random_state=0, n_jobs=-1)
# print("Training...")
# # Your model is trained on the training_data
# model.fit(X, Y)
# Train Loss: 0.67897489267
# Validation Loss: 0.694670352826

In [15]:
train_loss = log_loss(Y, pd.DataFrame(model.predict_proba(X)).as_matrix())
print 'Train Loss:', train_loss
val_loss = log_loss(y_validation, pd.DataFrame(model.predict_proba(x_validation)).as_matrix())
print 'Validation Loss:', val_loss

Train Loss: 0.690883866474
Validation Loss: 0.692585525566


In [18]:
print("Predicting...")
# Your trained model is now used to make predictions on the numerai_tournament_data
# The model returns two columns: [probability of 0, probability of 1]
# We are just interested in the probability that the target is 1.
y_prediction = model.predict_proba(x_prediction)
results = y_prediction[:, 1]
results_df = pd.DataFrame(data={'probability':results})
joined = pd.DataFrame(ids).join(results_df)

print("Writing predictions to predictions.csv")
# Save the predictions out to a CSV file
joined.to_csv('numerai_datasets' + competition + '/predictions_RandomForest_100search.csv', index=False)
# Now you can upload these predictions on numer.ai
print("Done")

Predicting...
Writing predictions to predictions.csv
Done


# Gradient Boosting

In [77]:
log_scoring=make_scorer(log_loss, greater_is_better=False, needs_proba=True)
clf = GradientBoostingClassifier()
# specify parameters and distributions to sample from
param_dist = {"max_depth": [1, 2, 3, 4, 5, 6, 7, 8],
              "max_features": ['sqrt',0.2, 0.3, 0.4, 0.5],
              "n_estimators": [300, 400, 500, 600, 700, 800],
              "learning_rate": [round(i,2) for i in np.arange(0.05,0.2,0.01)],
             "min_samples_split": [100, 300, 400, 500, 600, 800, 1000],
             "min_samples_leaf": [20, 40, 50, 70 ,100]}

# run randomized search
n_iter_search = 10
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search, cv=cv, n_jobs=-1, scoring=log_scoring, verbose=1)


In [78]:
start = time()
search = random_search.fit(X, Y)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
# report(random_search.cv_results_)

TypeError: object of type 'ShuffleSplit' has no len()

In [22]:
search.grid_scores_

[mean: -0.69681, std: 0.00157, params: {'learning_rate': 0.09, 'min_samples_leaf': 50, 'n_estimators': 600, 'max_features': 0.4, 'min_samples_split': 1000, 'max_depth': 6},
 mean: -0.69971, std: 0.00170, params: {'learning_rate': 0.12, 'min_samples_leaf': 40, 'n_estimators': 700, 'max_features': 0.3, 'min_samples_split': 300, 'max_depth': 5},
 mean: -0.69482, std: 0.00106, params: {'learning_rate': 0.16, 'min_samples_leaf': 50, 'n_estimators': 400, 'max_features': 'sqrt', 'min_samples_split': 600, 'max_depth': 3},
 mean: -0.69761, std: 0.00127, params: {'learning_rate': 0.18, 'min_samples_leaf': 100, 'n_estimators': 500, 'max_features': 'sqrt', 'min_samples_split': 500, 'max_depth': 4},
 mean: -0.69373, std: 0.00089, params: {'learning_rate': 0.09, 'min_samples_leaf': 40, 'n_estimators': 400, 'max_features': 0.2, 'min_samples_split': 800, 'max_depth': 3},
 mean: -0.69363, std: 0.00085, params: {'learning_rate': 0.07, 'min_samples_leaf': 20, 'n_estimators': 800, 'max_features': 0.2, 'mi

In [87]:
from sklearn.model_selection import GroupKFold
x = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
y = np.array([1, 2, 3, 4])
groups = np.array([0, 0, 2, 2])
group_kfold = GroupKFold(n_splits=2)
group_kfold.get_n_splits(x, y, groups)

2

In [88]:
for train_index, test_index in group_kfold.split(x, y, groups):
    print("TRAIN:", train_index, "TEST:", test_index)

('TRAIN:', array([0, 1]), 'TEST:', array([2, 3]))
('TRAIN:', array([2, 3]), 'TEST:', array([0, 1]))
