In [13]:
import pandas as pd
import numpy as np
import seaborn as sns
sns.set()
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA,KernelPCA
from sklearn.metrics import log_loss
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold

### Practise data set
We will be using the numerai data set for example

In [2]:
print("# Loading data...")
# The training data is used to train your model how to predict the targets.
train = pd.read_csv('numerai_datasets/numerai_training_data.csv', header=0)
# The tournament data is the data that Numerai uses to evaluate your model.
tournament = pd.read_csv('numerai_datasets/numerai_tournament_data.csv', header=0)

# Loading data...


In [3]:
# The tournament data contains validation data, test data and live data.
# Validation is used to test your model locally so we separate that.
validation = tournament[tournament['data_type']=='validation']
test = tournament[tournament['data_type']=='test']
live = tournament[tournament['data_type']=='live']

In [4]:
print(train.shape)
print(tournament.shape)
print(live.shape)
print(test.shape)

(393613, 58)
(243037, 58)
(4070, 58)
(192605, 58)


In [5]:
train_bernie = train.drop([
    'id', 'era', 'data_type',
    'target_charles', 'target_elizabeth',
    'target_jordan', 'target_ken'], axis=1)

### Scikit Pipeline method

In [6]:
# Transform the loaded CSV data into numpy arrays
features = [f for f in list(train_bernie) if "feature" in f]
X_train = train_bernie[features]
y_train = train_bernie['target_bernie']
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
pca = PCA()
X_train_pca = pca.fit_transform(X_train_std)
pca_n_dim=pca.explained_variance_ratio_.cumsum()
n_dim=np.argwhere(pca_n_dim>0.9)[0]
X_train = train_bernie[features]
y_train = train_bernie['target_bernie']
pipe_lr = make_pipeline(StandardScaler(),PCA(n_components=n_dim[0]),LogisticRegression(random_state=1,penalty='l1',C=0.01))
#pipe_lr = make_pipeline(StandardScaler(),PCA(n_components=5),SVC(kernel='rbf', random_state=1, gamma=0.10, C=10.0))
pipe_lr.fit(X_train, y_train)
validation_bernie = validation.drop([
    'id', 'era', 'data_type',
    'target_charles', 'target_elizabeth',
    'target_jordan', 'target_ken'], axis=1)
# Transform the loaded CSV data into numpy arrays
features = [f for f in list(validation_bernie) if "feature" in f]
X_test = validation_bernie[features]
y_test = validation_bernie['target_bernie']

y_pred = pipe_lr.predict(X_test)
print('Test Accuracy: %.3f' % pipe_lr.score(X_test, y_test))
print('Log loss : %.3f' % log_loss(y_test,pipe_lr.predict_proba(X_test)))

Test Accuracy: 0.516
Log loss : 0.692


### K-Fold cross validation

In [15]:
X_train = train_bernie[features].values
y_train = train_bernie['target_bernie'].values
kfold = StratifiedKFold(n_splits=10,random_state=1).split(X_train,y_train)
pipe_lr = make_pipeline(StandardScaler(),PCA(n_components=n_dim[0]),LogisticRegression(random_state=1))
scores = []
for k, (train, test) in enumerate(kfold):
    pipe_lr.fit(X_train[train], y_train[train])
    score = pipe_lr.score(X_train[test], y_train[test])
    scores.append(score)
    print('Fold: %2d, Class dist.: %s, Acc: %.3f' % (k+1,
          np.bincount(y_train[train]), score))
    
print('\nCV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

Fold:  1, Class dist.: [177153 177098], Acc: 0.517
Fold:  2, Class dist.: [177153 177098], Acc: 0.516
Fold:  3, Class dist.: [177153 177098], Acc: 0.513
Fold:  4, Class dist.: [177153 177098], Acc: 0.515
Fold:  5, Class dist.: [177153 177098], Acc: 0.498
Fold:  6, Class dist.: [177153 177098], Acc: 0.504
Fold:  7, Class dist.: [177153 177099], Acc: 0.517
Fold:  8, Class dist.: [177154 177099], Acc: 0.516
Fold:  9, Class dist.: [177154 177099], Acc: 0.507
Fold: 10, Class dist.: [177154 177099], Acc: 0.512

CV accuracy: 0.511 +/- 0.006


### Testing no of jobs

In [46]:
%%timeit -n 1 -r 1
from sklearn.model_selection import cross_val_score

scores = cross_val_score(estimator=pipe_lr,
                         X=X_train,
                         y=y_train,
                         cv=10,
                         n_jobs=1)
print('CV accuracy scores: %s' % scores)
print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

CV accuracy scores: [0.51730095 0.51595447 0.51277882 0.51524313 0.49794218 0.50353132
 0.51650111 0.51575203 0.50708841 0.51163618]
CV accuracy: 0.511 +/- 0.006
35.7 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [47]:
%%timeit -n 1 -r 1
from sklearn.model_selection import cross_val_score

scores = cross_val_score(estimator=pipe_lr,
                         X=X_train,
                         y=y_train,
                         cv=10,
                         n_jobs=-1)
print('CV accuracy scores: %s' % scores)
print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

CV accuracy scores: [0.51730095 0.51595447 0.51280423 0.51531934 0.49791677 0.50348051
 0.51650111 0.51585366 0.50706301 0.51163618]
CV accuracy: 0.511 +/- 0.006
20.6 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
