In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
sns.set()
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA,KernelPCA
from sklearn.metrics import log_loss
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve
from sklearn.model_selection import validation_curve
from sklearn.model_selection import GridSearchCV


### Practise data set
We will be using the numerai data set for example

In [None]:
# Locations of the two Numerai CSV exports, relative to the notebook.
# The training file is used to fit models; the tournament file is the data
# Numerai uses to evaluate them.
train_csv = 'numerai_datasets/numerai_training_data.csv'
tournament_csv = 'numerai_datasets/numerai_tournament_data.csv'

print("# Loading data...")
# header=0: the first row of each file supplies the column names.
train = pd.read_csv(train_csv, header=0)
tournament = pd.read_csv(tournament_csv, header=0)

In [None]:
# The tournament frame mixes validation, test and live rows, tagged by its
# 'data_type' column. Validation rows are separated so models can be checked
# locally; test and live rows each get their own frame as well.
row_kind = tournament['data_type']
validation = tournament[row_kind == 'validation']
test = tournament[row_kind == 'test']
live = tournament[row_kind == 'live']

In [None]:
# Print the (rows, columns) shape of each frame, one per line, in the order
# train, tournament, live, test.
for frame in (train, tournament, live, test):
    print(frame.shape)

In [None]:
# Columns removed before modelling: three bookkeeping columns and the four
# targets other than 'target_bernie'. Later cells read the "feature" columns
# and 'target_bernie' from the resulting frame.
dropped_columns = [
    'id', 'era', 'data_type',
    'target_charles', 'target_elizabeth',
    'target_jordan', 'target_ken',
]
train_bernie = train.drop(dropped_columns, axis=1)

### Scikit Pipeline method

In [None]:
# --- Training inputs ---------------------------------------------------------
# Feature columns are the ones whose name contains "feature"; X_train / y_train
# are a pandas DataFrame / Series (not numpy arrays).
features = [f for f in list(train_bernie) if "feature" in f]
X_train = train_bernie[features]
y_train = train_bernie['target_bernie']

# --- Choose the number of principal components -------------------------------
# Fit an unrestricted PCA on the standardized training features just to read
# off the cumulative explained-variance curve.
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
pca = PCA()
X_train_pca = pca.fit_transform(X_train_std)
pca_n_dim = pca.explained_variance_ratio_.cumsum()
# np.argwhere(...)[0] is the 0-based position of the first component at which
# the cumulative ratio exceeds 0.9, so the component *count* is position + 1.
# n_dim is kept as a length-1 array because later cells read n_dim[0].
n_dim = np.argwhere(pca_n_dim > 0.9)[0] + 1

# --- Fit the pipeline ---------------------------------------------------------
# penalty='l1' needs a solver that supports it; 'liblinear' does, whereas the
# default solver of current scikit-learn releases ('lbfgs') rejects L1.
pipe_lr = make_pipeline(
    StandardScaler(),
    PCA(n_components=int(n_dim[0])),
    LogisticRegression(random_state=1, penalty='l1', C=0.01, solver='liblinear'),
)
# Alternative model kept for reference:
#pipe_lr = make_pipeline(StandardScaler(),PCA(n_components=5),SVC(kernel='rbf', random_state=1, gamma=0.10, C=10.0))
pipe_lr.fit(X_train, y_train)

# --- Evaluate on the validation rows -------------------------------------------
validation_bernie = validation.drop([
    'id', 'era', 'data_type',
    'target_charles', 'target_elizabeth',
    'target_jordan', 'target_ken'], axis=1)
# Reuse the training feature list so the validation matrix has exactly the
# columns (and column order) the pipeline was fitted on, and so `features`
# keeps describing train_bernie for the cells that follow.
X_test = validation_bernie[features]
y_test = validation_bernie['target_bernie']

y_pred = pipe_lr.predict(X_test)
print('Test Accuracy: %.3f' % pipe_lr.score(X_test, y_test))
print('Log loss : %.3f' % log_loss(y_test,pipe_lr.predict_proba(X_test)))

### K-Fold cross validation

In [None]:
# Manual stratified 10-fold cross-validation of the scaler + PCA + logistic
# regression pipeline, printing the class counts and accuracy of every fold.
X_train = train_bernie[features].values
y_train = train_bernie['target_bernie'].values
# Folds are not shuffled, so they are deterministic without a seed. Passing
# random_state while shuffle is left at its default (False) has no effect on
# the folds and is rejected with a ValueError by recent scikit-learn releases.
kfold = StratifiedKFold(n_splits=10).split(X_train, y_train)
pipe_lr = make_pipeline(StandardScaler(),PCA(n_components=n_dim[0]),LogisticRegression(random_state=1,solver='lbfgs'))
scores = []
# The index arrays are deliberately NOT called `train` / `test`: those names
# hold the DataFrames loaded and split in earlier cells, and rebinding them
# here would break any re-run of those cells' dependants.
for k, (train_idx, test_idx) in enumerate(kfold):
    pipe_lr.fit(X_train[train_idx], y_train[train_idx])
    score = pipe_lr.score(X_train[test_idx], y_train[test_idx])
    scores.append(score)
    print('Fold: %2d, Class dist.: %s, Acc: %.3f' % (k+1,
          np.bincount(y_train[train_idx]), score))

print('\nCV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

### Testing no of jobs

In [None]:
%%timeit -n 1 -r 1
from sklearn.model_selection import cross_val_score

scores = cross_val_score(estimator=pipe_lr,
                         X=X_train,
                         y=y_train,
                         cv=10,
                         n_jobs=1)
print('CV accuracy scores: %s' % scores)
print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

In [None]:
%%timeit -n 1 -r 1
from sklearn.model_selection import cross_val_score

scores = cross_val_score(estimator=pipe_lr,
                         X=X_train,
                         y=y_train,
                         cv=10,
                         n_jobs=-1)
print('CV accuracy scores: %s' % scores)
print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

In [None]:
# Recursive feature elimination with 2-fold stratified cross-validation,
# dropping one feature per step and scoring each subset by accuracy.
sc = StandardScaler()
# Standardize once; the original cell repeated this identical fit_transform.
X_train_std = sc.fit_transform(X_train)
# penalty='l1' needs a solver that supports it; 'liblinear' does, whereas the
# default solver of current scikit-learn releases ('lbfgs') rejects L1.
log_reg = LogisticRegression(random_state=1, C=0.01, penalty='l1',
                             solver='liblinear')
rfecv = RFECV(estimator=log_reg, step=1, cv=StratifiedKFold(2),
              scoring='accuracy', n_jobs=-1)
rfecv.fit(X_train_std, y_train)

In [None]:
print("Optimal number of features : %d" % rfecv.n_features_)

# Mean cross-validation score for each candidate number of features.
# Newer scikit-learn releases expose it as cv_results_['mean_test_score'];
# the grid_scores_ attribute only exists on older releases, so fall back to it
# when cv_results_ is absent.
if hasattr(rfecv, "cv_results_"):
    cv_scores = rfecv.cv_results_["mean_test_score"]
else:
    cv_scores = rfecv.grid_scores_

# Plot number of features VS. cross-validation scores.
# The x axis starts at 1 because rfecv was built with step=1 and the default
# minimum of one selected feature.
plt.figure()
plt.xlabel("Number of features selected")
# rfecv was created with scoring='accuracy', so the score is a fraction, not a
# count of correct classifications.
plt.ylabel("Cross validation score (accuracy)")
plt.plot(range(1, len(cv_scores) + 1), cv_scores)
plt.show()

In [None]:
# Learning curve: training vs. validation accuracy of a scaler + L2-penalized
# logistic regression as the amount of training data grows.
X_train = train_bernie[features].values
y_train = train_bernie['target_bernie'].values
pipe_lr = make_pipeline(StandardScaler(),
                        LogisticRegression(penalty='l2', random_state=1))

# Ten training-set fractions from 0.1 to 1.0, each evaluated with 10-fold CV.
train_sizes, train_scores, test_scores = learning_curve(
    estimator=pipe_lr,
    X=X_train,
    y=y_train,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=10,
    n_jobs=-1)

# Reduce along axis 1 to get one mean and one standard deviation per size.
train_mean, train_std = train_scores.mean(axis=1), train_scores.std(axis=1)
test_mean, test_std = test_scores.mean(axis=1), test_scores.std(axis=1)

# Each entry: (mean curve, its spread, keyword arguments for the line).
curves = [
    (train_mean, train_std,
     dict(color='blue', marker='o',
          markersize=5, label='training accuracy')),
    (test_mean, test_std,
     dict(color='green', linestyle='--',
          marker='s', markersize=5,
          label='validation accuracy')),
]
for curve_mean, curve_std, line_kwargs in curves:
    plt.plot(train_sizes, curve_mean, **line_kwargs)
    # Shaded band of +/- one standard deviation in the line's colour.
    plt.fill_between(train_sizes,
                     curve_mean + curve_std,
                     curve_mean - curve_std,
                     alpha=0.15, color=line_kwargs['color'])

plt.grid()
plt.xlabel('Number of training samples')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')

plt.tight_layout()
#plt.savefig('images/06_05.png', dpi=300)
plt.show()


In [None]:

# Validation curve: training vs. validation accuracy of pipe_lr while the
# `C` parameter of its logistic-regression step sweeps over param_range.
X_train = train_bernie[features].values
y_train = train_bernie['target_bernie'].values
param_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
train_scores, test_scores = validation_curve(
    estimator=pipe_lr,
    X=X_train,
    y=y_train,
    param_name='logisticregression__C',
    param_range=param_range,
    cv=10,
    n_jobs=-1)

# Reduce along axis 1 to get one mean and one standard deviation per C value.
train_mean, train_std = train_scores.mean(axis=1), train_scores.std(axis=1)
test_mean, test_std = test_scores.mean(axis=1), test_scores.std(axis=1)

# Each entry: (mean curve, its spread, keyword arguments for the line).
curves = [
    (train_mean, train_std,
     dict(color='blue', marker='o',
          markersize=5, label='training accuracy')),
    (test_mean, test_std,
     dict(color='green', linestyle='--',
          marker='s', markersize=5,
          label='validation accuracy')),
]
for curve_mean, curve_std, line_kwargs in curves:
    plt.plot(param_range, curve_mean, **line_kwargs)
    # Shaded band of +/- one standard deviation in the line's colour.
    plt.fill_between(param_range,
                     curve_mean + curve_std,
                     curve_mean - curve_std,
                     alpha=0.15, color=line_kwargs['color'])

plt.grid()
# The C values are spaced by powers of ten, hence the logarithmic x axis.
plt.xscale('log')
plt.legend(loc='lower right')
plt.xlabel('Parameter C')
plt.ylabel('Accuracy')

plt.tight_layout()
# plt.savefig('images/06_06.png', dpi=300)
plt.show()

In [None]:
#pipe_lr.get_params()

In [None]:
# Grid search over SVC hyper-parameters, scored by accuracy with 10-fold CV.
# Only the first 1000 training rows are used for the search.
X_train = train_bernie[features].values[:1000]
y_train = train_bernie['target_bernie'].values[:1000]

pipe_svc = make_pipeline(StandardScaler(), SVC(random_state=1))

# Candidate values shared by C and gamma: powers of ten from 1e-4 to 1e3.
param_range = [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3]

# Linear kernel: tune C only. RBF kernel: tune C and gamma together.
linear_grid = {'svc__C': param_range,
               'svc__kernel': ['linear']}
rbf_grid = {'svc__C': param_range,
            'svc__gamma': param_range,
            'svc__kernel': ['rbf']}
param_grid = [linear_grid, rbf_grid]

gs = GridSearchCV(estimator=pipe_svc,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=10,
                  n_jobs=-1)
# fit() returns the search object itself, so `gs` keeps referring to it.
gs.fit(X_train, y_train)
print(gs.best_score_)
print(gs.best_params_)

In [None]:
# Grid search over SVC hyper-parameters, this time selecting by log loss with
# 10-fold CV. Only the first 1000 training rows are used for the search.
X_train = train_bernie[features].values[:1000]
y_train = train_bernie['target_bernie'].values[:1000]

# probability=True makes SVC provide predict_proba, which a log-loss scorer
# needs.
pipe_svc = make_pipeline(StandardScaler(),
                         SVC(random_state=1,probability =True))

param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]

# Linear kernel: tune C only. RBF kernel: tune C and gamma together.
param_grid = [{'svc__C': param_range,
               'svc__kernel': ['linear']},
              {'svc__C': param_range,
               'svc__gamma': param_range,
               'svc__kernel': ['rbf']}]

# The scorer is named 'neg_log_loss' ('log_loss' is not a valid scoring
# string). Scores are the negated log loss so that larger is better, which
# means the best_score_ printed below is <= 0 and closer to 0 is better.
gs = GridSearchCV(estimator=pipe_svc,
                  param_grid=param_grid,
                  scoring='neg_log_loss',
                  cv=10,
                  n_jobs=-1)
gs = gs.fit(X_train, y_train)
print(gs.best_score_)
print(gs.best_params_)

In [None]:
# Refit the best pipeline found by the grid search on the full training frame
# (the search itself only saw the first 1000 rows).
X_train = train_bernie[features]
y_train = train_bernie['target_bernie']

clf = gs.best_estimator_
clf.fit(X_train, y_train)
# Report accuracy on the held-out validation rows (X_test / y_test, built in
# the pipeline cell). Scoring on X_train, as the cell did before, yields the
# training accuracy and does not match the 'Test accuracy' label.
print('Test accuracy: %.3f' % clf.score(X_test, y_test))

In [6]:
import pandas as pd

# header=None: no row is used as a header, so pandas labels the columns with
# integers starting at 0.
higgs_csv = 'HIGGS/HIGGS.csv'
df = pd.read_csv(higgs_csv, header=None)

In [8]:
# Preview the first five rows of the HIGGS frame; as the cell's last
# expression the result is rendered as the cell output.
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,28
0,1.0,0.869293,-0.635082,0.22569,0.32747,-0.689993,0.754202,-0.248573,-1.092064,0.0,...,-0.010455,-0.045767,3.101961,1.35376,0.979563,0.978076,0.920005,0.721657,0.988751,0.876678
1,1.0,0.907542,0.329147,0.359412,1.49797,-0.31301,1.095531,-0.557525,-1.58823,2.173076,...,-1.13893,-0.000819,0.0,0.30222,0.833048,0.9857,0.978098,0.779732,0.992356,0.798343
2,1.0,0.798835,1.470639,-1.635975,0.453773,0.425629,1.104875,1.282322,1.381664,0.0,...,1.128848,0.900461,0.0,0.909753,1.10833,0.985692,0.951331,0.803252,0.865924,0.780118
3,0.0,1.344385,-0.876626,0.935913,1.99205,0.882454,1.786066,-1.646778,-0.942383,0.0,...,-0.678379,-1.360356,0.0,0.946652,1.028704,0.998656,0.728281,0.8692,1.026736,0.957904
4,1.0,1.105009,0.321356,1.522401,0.882808,-1.205349,0.681466,-1.070464,-0.921871,0.0,...,-0.373566,0.113041,0.0,0.755856,1.361057,0.98661,0.838085,1.133295,0.872245,0.808487
