In [5]:
# Load needed modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [6]:
# Function that returns the Cross-Validation log_loss score
def get_log_loss(grid_search_obj):
    print("Best log_loss score: {:.5f}".format(-1 * grid_search_obj.best_score_))

In [7]:
# Function that uses RandomForest to return most important features
def rf_feature_selection(df, X, y):
    # Create a random forest
    from sklearn.ensemble import RandomForestClassifier

    # Get the columns
    cols = df.columns[3:-1]

    # Make the forest
    forest = RandomForestClassifier(n_estimators=100, n_jobs=2)

    # Fit
    forest.fit(X,y)

    # Get the importances
    importances = forest.feature_importances_

    # Get the indices
    indices = np.argsort(importances)[::-1]

    # Show the importance of each feature
    for f in range(X.shape[1]):
        print("{} - {}".format(cols[f], importances[indices[f]]))

In [8]:
# Function that takes the probabilities and ids and saves them to a CSV file
def save_predictions(probabilities, tourney_df, tournament_nbr=60):
    import pandas as pd
    
    # Get the probability of 1
    prob_of_one = probabilities[:, 1]
    
    # Get the IDs
    ids = tourney_df['id']
    
    # Join the data into one DataFrame
    results = pd.DataFrame(data={'probability': prob_of_one})
    results = pd.DataFrame(ids).join(results)
    
    # Save to CSV
    results.to_csv("probabilities_{}.csv".format(tournament_nbr), index=False)

In [9]:
# Load the data
# File path
train_path = '/Users/taylordye/Desktop/numerai_datasets(62)/numerai_training_data.csv'
train_df = pd.read_csv(train_path)

train_df.head()

Unnamed: 0,id,era,data_type,feature1,feature2,feature3,feature4,feature5,feature6,feature7,...,feature13,feature14,feature15,feature16,feature17,feature18,feature19,feature20,feature21,target
0,30869,era1,train,0.80581,0.59023,0.33846,0.66893,0.60931,0.44031,0.54041,...,0.71156,0.44718,0.49803,0.65204,0.2991,0.33321,0.62169,0.30362,0.40045,1
1,153709,era1,train,0.54412,0.49768,0.63767,0.45867,0.34125,0.5188,0.66379,...,0.41867,0.69744,0.60291,0.42221,0.57828,0.63202,0.59929,0.48143,0.50796,1
2,39100,era1,train,0.5202,0.38561,0.73734,0.28803,0.23947,0.28526,0.8193,...,0.32548,0.85534,0.74794,0.29552,0.66491,0.74581,0.78279,0.52793,0.58432,1
3,146072,era1,train,0.25519,0.40359,0.83019,0.31328,0.26406,0.56522,0.50346,...,0.18752,0.73311,0.54077,0.19624,0.80862,0.81713,0.45656,0.66848,0.51978,1
4,140880,era1,train,0.60276,0.43162,0.55852,0.41156,0.42314,0.28048,0.59067,...,0.47205,0.60394,0.47279,0.47331,0.5221,0.58061,0.54359,0.52048,0.53967,1


In [10]:
# Load the tournement data
test_path = '/Users/taylordye/Desktop/numerai_datasets(62)/numerai_tournament_data.csv'
tournament_df = pd.read_csv(test_path)
tournament_df.head()

Unnamed: 0,id,era,data_type,feature1,feature2,feature3,feature4,feature5,feature6,feature7,...,feature13,feature14,feature15,feature16,feature17,feature18,feature19,feature20,feature21,target
0,96144,era97,validation,0.56851,0.31365,0.66518,0.35125,0.32892,0.36261,0.63402,...,0.40269,0.64491,0.47552,0.40982,0.60408,0.66982,0.57779,0.59696,0.67462,1.0
1,17982,era97,validation,0.50546,0.43588,0.69312,0.34049,0.30613,0.52393,0.69548,...,0.36185,0.74445,0.60389,0.36263,0.63274,0.70123,0.61099,0.563,0.58569,0.0
2,96161,era97,validation,0.42258,0.63478,0.60804,0.52483,0.43256,0.61563,0.49732,...,0.39536,0.64803,0.59994,0.37412,0.59534,0.59674,0.50139,0.44816,0.31181,1.0
3,53895,era97,validation,0.52451,0.63911,0.5101,0.56558,0.50251,0.55216,0.47071,...,0.49197,0.56144,0.50666,0.48477,0.49932,0.51205,0.45684,0.42574,0.32765,0.0
4,7267,era97,validation,0.51572,0.43398,0.55069,0.49226,0.50208,0.41294,0.37023,...,0.46277,0.46567,0.35225,0.44589,0.54606,0.54634,0.41125,0.54601,0.48034,0.0


### Feature Preprocessing

In [11]:
from sklearn.preprocessing import StandardScaler
sclr = StandardScaler()

In [16]:
# Get the training and testing data
X_train = sclr.fit_transform(train_df.loc[:, "feature1":"feature21"].values)
y_train = train_df.loc[:, "target"].values
X_test = sclr.transform(tournament_df.loc[:, 'feature1':'feature21'].values)

### Feature Selection

In [47]:
rf_feature_selection(df=train_df, X=X_train, y=y_train)

feature1 - 0.05438115437603896
feature2 - 0.051411022886057124
feature3 - 0.050125769299357664
feature4 - 0.0496391380150376
feature5 - 0.0493227219323956
feature6 - 0.0489100335799153
feature7 - 0.04890447961721969
feature8 - 0.04817645751731975
feature9 - 0.048045092612640124
feature10 - 0.04779461600107142
feature11 - 0.04778641270743054
feature12 - 0.047508794129879434
feature13 - 0.047503010697371416
feature14 - 0.04633422203657533
feature15 - 0.04626662464206865
feature16 - 0.04542307272319142
feature17 - 0.045407163879566434
feature18 - 0.0445593340653168
feature19 - 0.04447544760244952
feature20 - 0.0441197179935388
feature21 - 0.043905713685558484


In [48]:
# Pick the top 4 features for our data
X_train = train_df.loc[:, 'feature1':'feature4'].values
X_test = tournament_df.loc[:, 'feature1':'feature4'].values

### Modelling

Logistic Regression and AdaBoost

In [59]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression

# Instantiate
ada_logit = AdaBoostClassifier(base_estimator=LogisticRegression(n_jobs=2, C=0.001), n_estimators=500)

# Fit
ada_logit.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=2,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
          learning_rate=1.0, n_estimators=500, random_state=None)

In [60]:
# Predict and save
save_predictions((ada_logit.predict_proba(X_test)), tournament_df, tournament_nbr=62)