In [20]:
import pandas as pd
import numpy as np
from sklearn import metrics, preprocessing, linear_model
from sklearn.metrics import log_loss, make_scorer
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import RandomizedSearchCV
from time import time


In [2]:
# Suffix appended to 'numerai_datasets' to form the dataset folder name
competition = '53'
# Seed NumPy's global random number generator so repeated runs are reproducible
np.random.seed(0)

# Read the training and tournament CSV files from the dataset folder
data_dir = 'numerai_datasets' + competition
training_data = pd.read_csv(data_dir + '/numerai_training_data.csv', header=0)
prediction_data = pd.read_csv(data_dir + '/numerai_tournament_data.csv', header=0)


# Every column whose name contains "feature" is used as a model input
features = [column for column in training_data.columns if "feature" in column]
X = training_data[features]
Y = training_data["target"]

# The full tournament set is what gets predicted and submitted ...
x_prediction = prediction_data[features]
ids = prediction_data["id"]

# ... while its rows flagged data_type == 'validation' are used to measure
# out-of-sample loss against their "target" column.
is_validation = prediction_data['data_type'] == 'validation'
x_validation = prediction_data.loc[is_validation, features]
y_validation = prediction_data.loc[is_validation, "target"]

# Logistic Regression

In [3]:
# Logistic Regression baseline, fitted on the full training set
print("Training...")
model = linear_model.LogisticRegression(n_jobs=-1)
# fit() is the cell's last expression, so the fitted estimator's repr is displayed
model.fit(X, Y)

Training...


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [None]:
# Log loss of the current `model` on the training rows and on the validation rows.
# predict_proba already returns a NumPy array, so it is passed to log_loss directly
# (no DataFrame round-trip through the deprecated DataFrame.as_matrix()).
train_loss = log_loss(Y, model.predict_proba(X))
# %-formatting inside print(...) gives the same text under Python 2 and Python 3,
# unlike the Python 2-only print statement.
print('Train Loss: %s' % train_loss)
val_loss = log_loss(y_validation, model.predict_proba(x_validation))
print('Validation Loss: %s' % val_loss)

In [None]:
print("Predicting...")
# Your trained model is now used to make predictions on the numerai_tournament_data
# The model returns two columns: [probability of 0, probability of 1]
# We are just interested in the probability that the target is 1.
y_prediction = model.predict_proba(x_prediction)
results = y_prediction[:, 1]
results_df = pd.DataFrame(data={'probability': results})
# Pair every tournament id with its predicted probability (joined on the row index)
joined = pd.DataFrame(ids).join(results_df)

# Build the output path once so the progress message and the file written always agree.
# (The original literal mixed ' and " quotes, which is a syntax error.)
predictions_path = 'numerai_datasets' + competition + '/predictions_LogReg.csv'
print("Writing predictions to " + predictions_path)
# Save the predictions out to a CSV file
joined.to_csv(predictions_path, index=False)
# Now you can upload these predictions on numer.ai
print("Done")

# Random Forest

In [23]:
# Scorer used by the search: log loss computed on predicted probabilities.
# greater_is_better=False makes the scorer negate the loss, so the search's
# "higher score is better" rule still selects the lowest log loss.
log_scoring = make_scorer(log_loss, greater_is_better=False, needs_proba=True)
# Fixed seed so the forests built during the search are reproducible
clf = RandomForestClassifier(random_state=0)

# Candidate values the randomized search samples from.
# min_samples_split / min_samples_leaf must be numeric: None is not an accepted
# value and makes the fit fail, so the estimator defaults (2 and 1) stand in for
# "no extra constraint".
param_dist = {
    "max_depth": list(range(5, 15)),
    "max_features": ['sqrt', 0.2, 0.3, 0.4, 0.5],
    "n_estimators": [300, 400, 500, 600, 700, 800],
    "min_samples_split": [2, 100, 300, 400, 500, 600, 800, 1000],
    "min_samples_leaf": [1, 20, 40, 50, 70, 100],
}

# run randomized search
n_iter_search = 10
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search, cv=5, n_jobs=3,
                                   scoring=log_scoring, verbose=1,
                                   random_state=0)  # reproducible candidate sampling

In [None]:
# Run the randomized search on the training data and report the wall-clock time
start = time()
search = random_search.fit(X, Y)
elapsed = time() - start
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % (elapsed, n_iter_search))
# report(random_search.cv_results_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [11]:
# Random Forest, fitted on the full training set
print("Training...")
model = RandomForestClassifier(
    n_estimators=10,
    max_depth=10,
    max_features='sqrt',
    min_samples_split=20,
    n_jobs=-1,
    random_state=0,
)
# fit() is the cell's last expression, so the fitted estimator's repr is displayed
model.fit(X, Y)

Training...


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=20, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=-1, oob_score=False, random_state=0,
            verbose=0, warm_start=False)

In [12]:
# Log loss of the current `model` on the training rows and on the validation rows.
# predict_proba already returns a NumPy array, so it is passed to log_loss directly
# (no DataFrame round-trip through the deprecated DataFrame.as_matrix()).
train_loss = log_loss(Y, model.predict_proba(X))
# %-formatting inside print(...) gives the same text under Python 2 and Python 3,
# unlike the Python 2-only print statement.
print('Train Loss: %s' % train_loss)
val_loss = log_loss(y_validation, model.predict_proba(x_validation))
print('Validation Loss: %s' % val_loss)

Train Loss: 0.67897489267
Validation Loss: 0.694670352826


In [7]:
print("Predicting...")
# Your trained model is now used to make predictions on the numerai_tournament_data
# The model returns two columns: [probability of 0, probability of 1]
# We are just interested in the probability that the target is 1.
y_prediction = model.predict_proba(x_prediction)
results = y_prediction[:, 1]
results_df = pd.DataFrame(data={'probability': results})
# Pair every tournament id with its predicted probability (joined on the row index)
joined = pd.DataFrame(ids).join(results_df)

# Build the output path once so the progress message and the file written always agree.
# (The original literal mixed ' and " quotes, which is a syntax error.)
predictions_path = 'numerai_datasets' + competition + '/predictions_RandomForest.csv'
print("Writing predictions to " + predictions_path)
# Save the predictions out to a CSV file
joined.to_csv(predictions_path, index=False)
# Now you can upload these predictions on numer.ai
print("Done")

Predicting...
Writing predictions to predictions.csv
