In [1]:
"""
Example classifier on Numerai data using a logistic regression classifier.
To get started, install the required packages: pip install pandas numpy scikit-learn tensorflow
"""

import pandas as pd
import numpy as np
from sklearn import metrics, preprocessing, linear_model
import tensorflow as tf

In [2]:
# Pin NumPy's global random state so repeated runs draw the same numbers
np.random.seed(0)


def _read_numerai_csv(csv_path):
    """Read one Numerai CSV file, taking the column names from its first row."""
    return pd.read_csv(csv_path, header=0)


print("Loading data...")
# Training rows carry the `target` label; the tournament file holds the rows
# the model is later asked to predict
training_data = _read_numerai_csv('numerai_training_data.csv')
prediction_data = _read_numerai_csv('numerai_tournament_data.csv')

Loading data...


In [3]:
# Preview the first five rows to sanity-check how the CSV was parsed
training_data.head(n=5)

Unnamed: 0,id,era,data_type,feature1,feature2,feature3,feature4,feature5,feature6,feature7,...,feature13,feature14,feature15,feature16,feature17,feature18,feature19,feature20,feature21,target
0,141343,era1,train,0.39259,0.61835,0.63515,0.54343,0.42712,0.60906,0.47875,...,0.37281,0.65519,0.65057,0.31835,0.62492,0.6054,0.55826,0.42972,0.28459,0
1,118965,era1,train,0.51999,0.62194,0.5529,0.5379,0.45459,0.48424,0.55901,...,0.45956,0.64501,0.63455,0.41849,0.53119,0.54691,0.58776,0.39615,0.32122,1
2,104930,era1,train,0.51097,0.39703,0.63725,0.40934,0.38163,0.28381,0.55535,...,0.39448,0.63178,0.52078,0.37193,0.60606,0.63645,0.57264,0.53535,0.51676,0
3,107619,era1,train,0.54786,0.62813,0.51853,0.5451,0.48513,0.56366,0.52711,...,0.49475,0.59248,0.55167,0.47589,0.49704,0.52032,0.51307,0.41957,0.35533,0
4,107532,era1,train,0.63631,0.69773,0.41166,0.64127,0.56288,0.53927,0.49923,...,0.59686,0.52205,0.5161,0.58242,0.39449,0.417,0.49341,0.3418,0.29281,1


In [4]:
# Print the column names, non-null counts and dtypes of the training frame
training_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108405 entries, 0 to 108404
Data columns (total 25 columns):
id           108405 non-null int64
era          108405 non-null object
data_type    108405 non-null object
feature1     108405 non-null float64
feature2     108405 non-null float64
feature3     108405 non-null float64
feature4     108405 non-null float64
feature5     108405 non-null float64
feature6     108405 non-null float64
feature7     108405 non-null float64
feature8     108405 non-null float64
feature9     108405 non-null float64
feature10    108405 non-null float64
feature11    108405 non-null float64
feature12    108405 non-null float64
feature13    108405 non-null float64
feature14    108405 non-null float64
feature15    108405 non-null float64
feature16    108405 non-null float64
feature17    108405 non-null float64
feature18    108405 non-null float64
feature19    108405 non-null float64
feature20    108405 non-null float64
feature21    108405 non-null float64
targe

In [5]:
# (number of rows, number of columns) of the training frame
training_data.shape

(108405, 25)

In [7]:
# Distinct era labels present in the training set
training_data["era"].unique()

array(['era1', 'era2', 'era3', 'era4', 'era5', 'era6', 'era7', 'era8',
       'era9', 'era10', 'era11', 'era12', 'era13', 'era14', 'era15',
       'era16', 'era17', 'era18', 'era19', 'era20', 'era21', 'era22',
       'era23', 'era24', 'era25', 'era26', 'era27', 'era28', 'era29',
       'era30', 'era31', 'era32', 'era33', 'era34', 'era35', 'era36',
       'era37', 'era38', 'era39', 'era40', 'era41', 'era42', 'era43',
       'era44', 'era45', 'era46', 'era47', 'era48', 'era49', 'era50',
       'era51', 'era52', 'era53', 'era54', 'era55', 'era56', 'era57',
       'era58', 'era59', 'era60', 'era61', 'era62', 'era63', 'era64',
       'era65', 'era66', 'era67', 'era68', 'era69', 'era70', 'era71',
       'era72', 'era73', 'era74', 'era75', 'era76', 'era77', 'era78',
       'era79', 'era80', 'era81', 'era82', 'era83', 'era84', 'era85',
       'era86', 'era87', 'era88', 'era89', 'era90', 'era91', 'era92',
       'era93', 'era94', 'era95', 'era96'], dtype=object)

In [9]:
# Number of training rows in each era, most populated era first
training_data["era"].value_counts()

era69    1422
era68    1420
era71    1412
era70    1406
era67    1392
era72    1379
era88    1373
era80    1373
era64    1365
era74    1364
era79    1362
era85    1359
era84    1358
era96    1353
era86    1352
era83    1346
era78    1346
era87    1345
era89    1345
era90    1344
era92    1344
era95    1342
era91    1335
era63    1333
era66    1332
era75    1330
era93    1328
era62    1323
era94    1321
era82    1320
         ... 
era23    1092
era40    1072
era44    1072
era22    1071
era38    1068
era21    1048
era39    1035
era20    1030
era41    1024
era43    1018
era19    1006
era42     989
era18     973
era16     966
era17     954
era15     935
era14     912
era13     870
era11     865
era12     860
era10     831
era9      809
era8      797
era3      403
era6      272
era7      264
era5      256
era2      241
era4      220
era1      124
Name: era, dtype: int64

In [8]:
# Distinct values of the data_type column in the training file
training_data["data_type"].unique()

array(['train'], dtype=object)

In [6]:
# Column labels of the training frame as a plain Python list
training_data.columns.tolist()

['id',
 'era',
 'data_type',
 'feature1',
 'feature2',
 'feature3',
 'feature4',
 'feature5',
 'feature6',
 'feature7',
 'feature8',
 'feature9',
 'feature10',
 'feature11',
 'feature12',
 'feature13',
 'feature14',
 'feature15',
 'feature16',
 'feature17',
 'feature18',
 'feature19',
 'feature20',
 'feature21',
 'target']

# Example logistic regression model (linear baseline)

In [4]:
# NOTE: every statement in this cell is commented out, so running it does nothing.
# Lines starting with a single "# " are disabled code; lines starting with "# # "
# are the explanatory comments that belong to that code.

# # Select the feature columns, the target and the ids from the loaded DataFrames
# # (these selections are pandas objects, not numpy arrays)
# features = [f for f in list(training_data) if "feature" in f]
# X = training_data[features]
# Y = training_data["target"]
# x_prediction = prediction_data[features]
# ids = prediction_data["id"]


# # This is your model that will learn to predict
# model = linear_model.LogisticRegression(n_jobs=-1)

# print("Training...")
# # Your model is trained on the training_data
# model.fit(X, Y)

# print("Predicting...")
# # Your trained model is now used to make predictions on the numerai_tournament_data
# # The model returns two columns: [probability of 0, probability of 1]
# # We are just interested in the probability that the target is 1.
# y_prediction = model.predict_proba(x_prediction)
# results = y_prediction[:, 1]
# results_df = pd.DataFrame(data={'probability':results})
# joined = pd.DataFrame(ids).join(results_df)

# print("Writing predictions to predictions.csv")
# # Save the predictions out to a CSV file
# joined.to_csv("predictions.csv", index=False)
# # Now you can upload these predictions to numer.ai

# 1: A deep feedforward net using TensorFlow

In [None]:
inputs = tf.pl