In [28]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import MinMaxScaler,StandardScaler
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import sklearn
from sklearn.svm import SVC
from vecstack import stacking
from xgboost import XGBClassifier
from sklearn.model_selection import cross_validate

In [127]:
new_store = pd.read_hdf("singlepi_e100GeV_pu200Nov7.h5")
prev_store = pd.read_hdf("singlepi_e100GeV_pu200_oct27.h5")

In [13]:
# load the old data of october 27 to use it as test data. 
prev_store['purity']=prev_store['purity'].apply(lambda x: 0 if x <=1 else 1 )
XOct27Test = prev_store.drop(['purity','event','trackster','trckType'],1,inplace=False)
YOct27Test = prev_store[['purity']].iloc[:,0]

In [14]:
df = new_store.drop(['trckType'],1,inplace=False)
df['purity']=df['purity'].apply(lambda x: 0 if x <=1 else 1 )


In [121]:
X = df.drop(['purity','event','trackster'],1,inplace=False)
y = df[['purity']].iloc[:,0]

sc = StandardScaler()
SC_X = sc.fit_transform(X)
trainDF=df.sample(frac=0.9,random_state=200) #random state is a seed value
testDF=df.drop(trainDF.index)

xTrain = trainDF.drop(['purity','event','trackster'],1,inplace=False)
xTest = testDF.drop(['purity','event','trackster'],1,inplace=False)

SC_xTrain = sc.fit_transform(xTrain)
SC_xTest = sc.transform(xTest)

yTrain = trainDF[['purity']].iloc[:,0]
yTest =  testDF[['purity']].iloc[:,0]

In [25]:
clf0 = XGBClassifier(max_depth=50,random_state=1234).fit(xTrain, yTrain)
yTestPred = clf0.predict(xTest) 

print("Testing confusion_matrix")
print(confusion_matrix(yTest, yTestPred))

print("Test accuracy")
print(sklearn.metrics.accuracy_score(yTest, yTestPred))
print("Test Percision")
print(sklearn.metrics.precision_score(yTest, yTestPred))
print("Test recall")
print(sklearn.metrics.recall_score(yTest, yTestPred))
print("Test F1 score")
print(sklearn.metrics.f1_score(yTest, yTestPred))

Testing confusion_matrix
[[1958   72]
 [  87  471]]
Test accuracy
0.9385625965996909
Test Percision
0.8674033149171271
Test recall
0.8440860215053764
Test F1 score
0.8555858310626704


In [None]:
clf_MLP = MLPClassifier(alpha=0.05, hidden_layer_sizes=(50, 50, 50), random_state=1234)
clf_RF = RandomForestClassifier(max_depth=40,n_estimators=100,random_state=1234)
clf_SVM = SVC(C=10,kernel='rbf', gamma=0.5, random_state=1234)
clf_XGB= XGBClassifier(max_depth=50,random_state=1234)

models = [clf_MLP, clf_RF, clf_SVM, clf_XGB]
names = ["MLP", "RF", "SVM", "XGB"]

In [60]:
scoring = {'accuracy': 'accuracy','f1': 'f1'}

for model, name in zip(models, names):
    print (name)
    cv_results=cross_validate(model, X, y, scoring=scoring, cv=7, return_train_score=True)
    print("test_accuracy", cv_results['test_accuracy'])
    print("test_f1", cv_results['test_f1'])
    print("train_accuracy", cv_results['train_accuracy'])


MLP




test_accuracy [0.88723634 0.9210384  0.90643591 0.89291509 0.8934271  0.89880952
 0.91044372]
test_f1 [0.69315673 0.82742317 0.76137931 0.7083947  0.74808184 0.77496992
 0.7654146 ]
train_accuracy [0.92476221 0.91299644 0.91380787 0.91732408 0.91534439 0.91449177
 0.92215461]
RF
test_accuracy [0.88858843 0.92347215 0.90183883 0.75581395 0.74249391 0.91991342
 0.91964286]
test_f1 [0.70359712 0.82733374 0.76864245 0.52796654 0.57613535 0.82038835
 0.79754601]
train_accuracy [1. 1. 1. 1. 1. 1. 1.]
SVM
test_accuracy [0.78366685 0.78150352 0.78312601 0.7828556  0.78360833 0.78327922
 0.7827381 ]
test_f1 [0.00497512 0.00492611 0.00742574 0.         0.00990099 0.00249066
 0.        ]
train_accuracy [1.         1.         0.99995492 0.99995492 0.99995492 0.99995492
 0.99995492]
XGB
test_accuracy [0.88236885 0.9161709  0.90156842 0.83180097 0.76088721 0.92207792
 0.91558442]
test_f1 [0.68637347 0.81097561 0.7699115  0.6097867  0.59634703 0.82566586
 0.78688525]
train_accuracy [0.9997746  0.9998

In [62]:
print('coress validation with feature scaling CV: 7')
for model, name in zip(models, names):
    print (name)
    cv_results=cross_validate(model, SC_X, y, scoring=scoring, cv=7, return_train_score=True)
    print("test_accuracy", cv_results['test_accuracy'])
    print("test_f1", cv_results['test_f1'])
    print("train_accuracy", cv_results['train_accuracy'])

coress validation with feature scaling
MLP
test_accuracy [0.89426717 0.90183883 0.89318551 0.85667929 0.86962402 0.85497835
 0.89204545]
test_f1 [0.72978576 0.79005205 0.7545059  0.67243511 0.72551253 0.71428571
 0.73698088]
train_accuracy [0.95451472 0.95041248 0.95050264 0.95126899 0.95384061 0.9481181
 0.95199459]
RF
test_accuracy [0.88750676 0.92293131 0.90156842 0.75689562 0.74087098 0.92072511
 0.91801948]
test_f1 [0.70200573 0.82632541 0.76726343 0.5305483  0.57232143 0.82210079
 0.79345603]
train_accuracy [1. 1. 1. 1. 1. 1. 1.]
SVM
test_accuracy [0.87750135 0.88290968 0.8815576  0.79096809 0.77901001 0.75784632
 0.88284632]
test_f1 [0.67480258 0.7411835  0.71335079 0.55954416 0.59373446 0.59225513
 0.71791531]
train_accuracy [0.95785061 0.95618266 0.95663346 0.95776045 0.95879913 0.95402299
 0.95348208]
XGB
test_accuracy [0.88345051 0.91725257 0.90237966 0.83396431 0.75872329 0.9237013
 0.91558442]
test_f1 [0.69148175 0.81341463 0.7716635  0.61625    0.59491371 0.83012048
 0.78

In [145]:
print('coress validation with feature scaling wiht PCA 6 - CV 10')

pca = PCA(n_components= 4)
SC_X_pca = pca.fit_transform(SC_X)

for model, name in zip(models, names):
    print (name)
    cv_results=cross_validate(model, SC_X_pca, y, scoring=scoring, cv=7, return_train_score=True)
    print("test_accuracy", cv_results['test_accuracy'])
    print("test_f1", cv_results['test_f1'])
    print("train_accuracy", cv_results['train_accuracy'])

coress validation with feature scaling wiht PCA 6 - CV 10
MLP
test_accuracy [0.88209843 0.90481341 0.87290427 0.88804759 0.87909115 0.90746753
 0.88988095]
test_f1 [0.70855615 0.78871549 0.68708389 0.68965517 0.72010019 0.78409091
 0.70226774]
train_accuracy [0.91204977 0.91033674 0.9152955  0.91272596 0.91430761 0.91129141
 0.90872211]
RF
test_accuracy [0.87209302 0.88994051 0.86749594 0.87479719 0.86205031 0.90232684
 0.88771645]
test_f1 [0.65848375 0.75288403 0.68264249 0.6572909  0.68902439 0.76604018
 0.7112039 ]
train_accuracy [1.         1.         1.         1.         0.99995492 1.
 1.        ]
SVM
test_accuracy [0.87290427 0.90075717 0.87804218 0.88750676 0.87882067 0.90313853
 0.88176407]
test_f1 [0.65081724 0.77133956 0.7027027  0.68908819 0.71134021 0.76291391
 0.70292318]
train_accuracy [0.91425867 0.91074246 0.91389803 0.91376279 0.91277497 0.90944332
 0.9100293 ]
XGB
test_accuracy [0.87479719 0.88534343 0.85992428 0.87155219 0.85312415 0.89799784
 0.875     ]
test_f1 [0

In [64]:
S_train, S_test = stacking(models,                     # list of models
                           SC_xTrain, yTrain, SC_xTest,   # data
                           regression=False,           # classification task (if you need 
                                                       #     regression - set to True)
                           mode='oof_pred_bag',        # mode: oof for train set, predict test 
                                                       #     set in each fold and vote
                           needs_proba=False,          # predict class labels (if you need 
                                                       #     probabilities - set to True) 
                           metric=accuracy_score,      # metric: callable
                           n_folds=7,                  # number of folds
                           random_state=1234,             # ensure reproducibility
                           verbose=2)                  # print all info

task:         [classification]
n_classes:    [2]
metric:       [accuracy_score]
mode:         [oof_pred_bag]
n_models:     [4]

model  0:     [MLPClassifier]
    fold  0:  [0.93900240]
    fold  1:  [0.93629808]




    fold  2:  [0.92938702]
    fold  3:  [0.93569712]
    fold  4:  [0.93567779]
    fold  5:  [0.92726180]
    fold  6:  [0.92816351]
    ----
    MEAN:     [0.93306967] + [0.00432087]
    FULL:     [0.93307002]

model  1:     [RandomForestClassifier]
    fold  0:  [0.93810096]
    fold  1:  [0.94411058]
    fold  2:  [0.93479567]
    fold  3:  [0.94381010]
    fold  4:  [0.93988578]
    fold  5:  [0.93748121]
    fold  6:  [0.93748121]
    ----
    MEAN:     [0.93938079] + [0.00320987]
    FULL:     [0.93938093]

model  2:     [SVC]
    fold  0:  [0.93840144]
    fold  1:  [0.93629808]
    fold  2:  [0.92728365]
    fold  3:  [0.93840144]
    fold  4:  [0.93116922]
    fold  5:  [0.92876465]
    fold  6:  [0.93026751]
    ----
    MEAN:     [0.93294086] + [0.00432000]
    FULL:     [0.93294123]

model  3:     [XGBClassifier]
    fold  0:  [0.94260817]
    fold  1:  [0.94230769]
    fold  2:  [0.94050481]
    fold  3:  [0.94801683]
    fold  4:  [0.94409378]
    fold  5:  [0.94168921]

In [120]:
a=np.sum(S_train, axis=1)
print("all 4 clfs match", np.count_nonzero( np.logical_or(a==0,a==4)))
print("no match in the 4 clfs ", np.count_nonzero( np.logical_and(a!=0,a!=4)))
print("percentage ",np.count_nonzero( np.logical_and(a!=0,a!=4))/np.count_nonzero( np.logical_or(a==0,a==4)))


all 4 clfs match 21649
no match in the 4 clfs  1644
percentage  0.0759388424407594


In [82]:
S_train[1:25]

array([[0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [1, 1, 1, 1],
       [1, 1, 1, 1],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [1, 1, 0, 1],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [1, 1, 1, 1],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [1, 1, 1, 1]])

In [126]:
from sklearn.linear_model import LogisticRegression
# Initialize 2nd level model
metaModel_XGB = XGBClassifier(max_depth=10,random_state=1234, n_estimators=100)

# Fit 2nd level model
clf_meta_xgb = metaModel_XGB.fit(S_train, yTrain)

# Predict
y_pred = clf_meta_xgb.predict(S_test)

# Final prediction score
print('Final prediction score: [%.8f]' % accuracy_score(yTest, y_pred))

Final prediction score: [0.93701700]


In [146]:
pca = PCA(n_components= 4)
SC_xTrain_pca = pca.fit_transform(SC_xTrain)
SC_xTest_pca = pca.fit_transform(SC_xTest)

S_train_pca, S_test_pca = stacking(models,                     # list of models
                           SC_xTrain_pca, yTrain, SC_xTest_pca,   # data
                           regression=False,           # classification task (if you need 
                                                       #     regression - set to True)
                           mode='oof_pred_bag',        # mode: oof for train set, predict test 
                                                       #     set in each fold and vote
                           needs_proba=False,          # predict class labels (if you need 
                                                       #     probabilities - set to True) 
                           metric=accuracy_score,      # metric: callable
                           n_folds=7,                  # number of folds
                           random_state=1234,             # ensure reproducibility
                           verbose=2)          

task:         [classification]
n_classes:    [2]
metric:       [accuracy_score]
mode:         [oof_pred_bag]
n_models:     [4]

model  0:     [MLPClassifier]
    fold  0:  [0.90985577]
    fold  1:  [0.90895433]
    fold  2:  [0.90685096]
    fold  3:  [0.91195913]
    fold  4:  [0.90411782]
    fold  5:  [0.90652239]
    fold  6:  [0.90141268]
    ----
    MEAN:     [0.90709616] + [0.00330105]
    FULL:     [0.90709655]

model  1:     [RandomForestClassifier]
    fold  0:  [0.90625000]
    fold  1:  [0.91256010]
    fold  2:  [0.90685096]
    fold  3:  [0.90985577]
    fold  4:  [0.89930869]
    fold  5:  [0.90501954]
    fold  6:  [0.90081154]
    ----
    MEAN:     [0.90580808] + [0.00432742]
    FULL:     [0.90580861]

model  2:     [SVC]
    fold  0:  [0.90655048]
    fold  1:  [0.91346154]
    fold  2:  [0.90294471]
    fold  3:  [0.91015625]
    fold  4:  [0.90021040]
    fold  5:  [0.90261497]
    fold  6:  [0.90351668]
    ----
    MEAN:     [0.90563643] + [0.00435572]
    FUL

In [147]:
a=np.sum(S_train_pca, axis=1)
print("all 4 clfs match", np.count_nonzero( np.logical_or(a==0,a==4)))
print("no match in the 4 clfs ", np.count_nonzero( np.logical_and(a!=0,a!=4)))
print("percentage ",np.count_nonzero( np.logical_and(a!=0,a!=4))/np.count_nonzero( np.logical_or(a==0,a==4)))


all 4 clfs match 21841
no match in the 4 clfs  1452
percentage  0.06648047250583765


In [150]:
# Initialize 2nd level model
metaModel_XGB = XGBClassifier(max_depth=25,random_state=1234, n_estimators=100)

# Fit 2nd level model
clf_meta_xgb = metaModel_XGB.fit(S_train_pca, yTrain)

# Predict
y_pred = clf_meta_xgb.predict(S_test_pca)

# Final prediction score
print('Final prediction accuracy score: [%.8f]' % sklearn.metrics.accuracy_score(yTest, y_pred))
print('Final prediction F1 score: [%.8f]' % sklearn.metrics.f1_score(yTest, y_pred))

Final prediction accuracy score: [0.89451314]
Final prediction F1 score: [0.75249320]


## Stacking with extended Featuers

In [132]:
updated_df_groupby_avg=df.groupby(['event','trackster','layer']).mean().reset_index()
updated_df_groupby_max=df.groupby(['event','trackster','layer']).max().reset_index()
updated_df_groupby_min=df.groupby(['event','trackster','layer']).min().reset_index()
updated_df_groupby_sum=df.groupby(['event','trackster','layer']).sum().reset_index()

In [133]:
def getValueEventLayerTrackster(df, col, event, trackster,layer, defaultV = 0):
    s = df.loc[(df['event'] == event) & (df['layer'] ==layer) & (df['trackster'] ==trackster) ,col ]
    return defaultV if s.size == 0 else s.values[0]

In [134]:
updated_df = df.copy()
#updated_df['prevAvgEta'] = updated_df.apply(lambda row: getValueEventLayerTrackster(updated_df_groupby_avg, 'eta', row['event'], row['trackster'], row['layer'] - 1, row['trckEta']), axis=1)
#updated_df['prevAvgPhi'] = updated_df.apply(lambda row: getValueEventLayerTrackster(updated_df_groupby_avg, 'phi', row['event'], row['trackster'], row['layer'] - 1,row['trckPhi']), axis=1)
#updated_df['prevAvgE'] = updated_df.apply(lambda row: getValueEventLayerTrackster(updated_df_groupby_avg, 'E', row['event'], row['trackster'], row['layer'] - 1), axis=1)
#updated_df['PrevSumE'] = updated_df.apply(lambda row: getValueEventLayerTrackster(updated_df_groupby_sum, 'E', row['event'], row['trackster'], row['layer'] - 1), axis=1)

#updated_df['NextAvgEta'] = updated_df.apply(lambda row: getValueEventLayerTrackster(updated_df_groupby_avg, 'eta', row['event'], row['trackster'], row['layer'] + 1,row['trckEta']), axis=1)
#updated_df['NextAvgPhi'] = updated_df.apply(lambda row: getValueEventLayerTrackster(updated_df_groupby_avg, 'phi', row['event'], row['trackster'], row['layer'] + 1,row['trckPhi']), axis=1)
#updated_df['NextAvgE'] = updated_df.apply(lambda row: getValueEventLayerTrackster(updated_df_groupby_avg, 'E', row['event'], row['trackster'], row['layer'] + 1), axis=1)
#updated_df['NextSumE'] = updated_df.apply(lambda row: getValueEventLayerTrackster(updated_df_groupby_sum, 'E', row['event'], row['trackster'], row['layer'] + 1), axis=1)

#updated_df['SiblingAvgEta'] = updated_df.apply(lambda row: getValueEventLayerTrackster(updated_df_groupby_avg, 'eta', row['event'], row['trackster'], row['layer'],row['trckEta'] ), axis=1)
#updated_df['SiblingAvgPhi'] = updated_df.apply(lambda row: getValueEventLayerTrackster(updated_df_groupby_avg, 'phi', row['event'], row['trackster'], row['layer'],row['trckPhi'] ), axis=1)
#updated_df['SiblingAvgE'] = updated_df.apply(lambda row: getValueEventLayerTrackster(updated_df_groupby_avg, 'E', row['event'], row['trackster'], row['layer'] ), axis=1)
#updated_df['SiblingSumE'] = updated_df.apply(lambda row: getValueEventLayerTrackster(updated_df_groupby_sum, 'E', row['event'], row['trackster'], row['layer'] ), axis=1)

updated_df['RatioSiblingNHits'] = updated_df.apply(lambda row: row['nHits'] / getValueEventLayerTrackster(updated_df_groupby_sum, 'nHits', row['event'], row['trackster'], row['layer'] ), axis=1)
updated_df['RatioNextNHits'] = updated_df.apply(lambda row: row['nHits'] / getValueEventLayerTrackster(updated_df_groupby_sum, 'nHits', row['event'], row['trackster'], row['layer'] + 1, 1 ), axis=1)
updated_df['RatioPrevNHits'] = updated_df.apply(lambda row: row['nHits'] / getValueEventLayerTrackster(updated_df_groupby_sum, 'nHits', row['event'], row['trackster'], row['layer'] - 1, 1 ), axis=1)

updated_df['RatioE'] = updated_df.apply(lambda row: row['E'] / row['trckEn'], axis=1)
updated_df['RatioSiblingE'] = updated_df.apply(lambda row: row['E'] / getValueEventLayerTrackster(updated_df_groupby_sum, 'E', row['event'], row['trackster'], row['layer']), axis=1)


updated_df['RatioNextE'] = updated_df.apply(lambda row: row['E'] / getValueEventLayerTrackster(updated_df_groupby_sum, 'E', row['event'], row['trackster'], row['layer'] + 1,row['trckEn']), axis=1)

updated_df['RatioPrevE'] =  updated_df.apply(lambda row: row['E'] / getValueEventLayerTrackster(updated_df_groupby_sum, 'E', row['event'], row['trackster'], row['layer'] - 1, row['trckEn']), axis=1)

updated_df.head()

Unnamed: 0,event,trackster,purity,layer,E,eta,phi,x,y,z,...,trckEn,trckEta,trckPhi,RatioSiblingNHits,RatioNextNHits,RatioPrevNHits,RatioE,RatioSiblingE,RatioNextE,RatioPrevE
0,1.0,0.0,0,1.0,0.077115,1.963233,0.158004,91.118462,14.518062,322.102753,...,57.759506,1.897144,0.196742,0.178571,0.128205,5.0,0.001335,0.133108,0.04977,0.001335
1,1.0,0.0,0,1.0,0.134952,1.93162,0.233432,92.770134,22.057596,322.102753,...,57.759506,1.897144,0.196742,0.25,0.179487,7.0,0.002336,0.232939,0.087097,0.002336
2,1.0,0.0,0,1.0,0.081363,1.93214,0.148012,94.262695,14.054753,322.102753,...,57.759506,1.897144,0.196742,0.071429,0.051282,2.0,0.001409,0.14044,0.052511,0.001409
3,1.0,0.0,0,1.0,0.0232,1.950308,0.361422,87.475647,33.068218,322.102753,...,57.759506,1.897144,0.196742,0.071429,0.051282,2.0,0.000402,0.040045,0.014973,0.000402
4,1.0,0.0,0,1.0,0.088878,1.91165,0.356526,91.242096,33.982418,322.102753,...,57.759506,1.897144,0.196742,0.071429,0.051282,2.0,0.001539,0.153412,0.057362,0.001539


In [135]:
X_extended = updated_df.drop(['purity','event','trackster'],1,inplace=False)
y_extended = updated_df[['purity']].iloc[:,0]

SC_X_extended = sc.fit_transform(X_extended)

trainDF_extended=df.sample(frac=0.9,random_state=200) #random state is a seed value
testDF_extended=df.drop(trainDF.index)

xTrain_extended = trainDF_extended.drop(['purity','event','trackster'],1,inplace=False)
xTest_extended = testDF_extended.drop(['purity','event','trackster'],1,inplace=False)

SC_xTrain_extended = sc.fit_transform(xTrain_extended)
SC_xTest_extended = sc.transform(xTest_extended)

yTrain_extended = trainDF[['purity']].iloc[:,0]
yTest_extended =  testDF[['purity']].iloc[:,0]

In [136]:
print('coress validation with extended features and scaling')
for model, name in zip(models, names):
    print (name)
    cv_results=cross_validate(model, SC_X_extended, y_extended, scoring=scoring, cv=7, return_train_score=True)
    print("test_accuracy", cv_results['test_accuracy'])
    print("test_f1", cv_results['test_f1'])
    print("train_accuracy", cv_results['train_accuracy'])

coress validation with extended features and scaling
MLP




test_accuracy [0.88777718 0.90697674 0.88182802 0.86235803 0.82769813 0.85633117
 0.88582251]
test_f1 [0.72534745 0.78712871 0.7270456  0.65491525 0.65548945 0.71919619
 0.71016484]
train_accuracy [0.96181761 0.95992427 0.96154713 0.96533381 0.95843851 0.96164075
 0.95920667]
RF
test_accuracy [0.89102217 0.92915089 0.90886966 0.76203353 0.7454693  0.92640693
 0.91747835]
test_f1 [0.7115247  0.8398533  0.78383579 0.54545455 0.57090743 0.83168317
 0.79293958]
train_accuracy [1.         1.         1.         0.99995492 1.         1.
 1.        ]
SVM
test_accuracy [0.8783126  0.88372093 0.87182261 0.74067063 0.77819854 0.76569264
 0.85281385]
test_f1 [0.68265162 0.75258918 0.70485679 0.52312282 0.6        0.60600546
 0.67268351]
train_accuracy [0.96975161 0.96695668 0.96776811 0.97011225 0.97015867 0.96664413
 0.96673428]
XGB
test_accuracy [0.89102217 0.92184965 0.90589508 0.84910763 0.77116581 0.92694805
 0.91910173]
test_f1 [0.70733479 0.82495457 0.77918782 0.64861461 0.60577819 0.833743

In [139]:
S_train_extended, S_test_extended = stacking(models,                     # list of models
                           SC_xTrain_extended, yTrain_extended, SC_xTest_extended,   # data
                           regression=False,           # classification task (if you need 
                                                       #     regression - set to True)
                           mode='oof_pred_bag',        # mode: oof for train set, predict test 
                                                       #     set in each fold and vote
                           needs_proba=False,          # predict class labels (if you need 
                                                       #     probabilities - set to True) 
                           metric=accuracy_score,      # metric: callable
                           n_folds=7,                  # number of folds
                           random_state=1234,             # ensure reproducibility
                           verbose=2)                  # print all info

task:         [classification]
n_classes:    [2]
metric:       [accuracy_score]
mode:         [oof_pred_bag]
n_models:     [4]

model  0:     [MLPClassifier]
    fold  0:  [0.93900240]
    fold  1:  [0.93629808]




    fold  2:  [0.92938702]
    fold  3:  [0.93569712]
    fold  4:  [0.93567779]
    fold  5:  [0.92726180]
    fold  6:  [0.92816351]
    ----
    MEAN:     [0.93306967] + [0.00432087]
    FULL:     [0.93307002]

model  1:     [RandomForestClassifier]
    fold  0:  [0.93810096]
    fold  1:  [0.94411058]
    fold  2:  [0.93479567]
    fold  3:  [0.94381010]
    fold  4:  [0.93988578]
    fold  5:  [0.93748121]
    fold  6:  [0.93748121]
    ----
    MEAN:     [0.93938079] + [0.00320987]
    FULL:     [0.93938093]

model  2:     [SVC]
    fold  0:  [0.93840144]
    fold  1:  [0.93629808]
    fold  2:  [0.92728365]
    fold  3:  [0.93840144]
    fold  4:  [0.93116922]
    fold  5:  [0.92876465]
    fold  6:  [0.93026751]
    ----
    MEAN:     [0.93294086] + [0.00432000]
    FULL:     [0.93294123]

model  3:     [XGBClassifier]
    fold  0:  [0.94260817]
    fold  1:  [0.94230769]
    fold  2:  [0.94050481]
    fold  3:  [0.94801683]
    fold  4:  [0.94409378]
    fold  5:  [0.94168921]

In [140]:
a=np.sum(S_train_extended, axis=1)
print("all 4 clfs match", np.count_nonzero( np.logical_or(a==0,a==4)))
print("no match in the 4 clfs ", np.count_nonzero( np.logical_and(a!=0,a!=4)))
print("percentage ",np.count_nonzero( np.logical_and(a!=0,a!=4))/np.count_nonzero( np.logical_or(a==0,a==4)))


all 4 clfs match 21649
no match in the 4 clfs  1644
percentage  0.0759388424407594


In [142]:
from sklearn.linear_model import LogisticRegression
# Initialize 2nd level model
metaModel_XGB = XGBClassifier(max_depth=25,random_state=1234, n_estimators=100)

# Fit 2nd level model
clf_meta_xgb = metaModel_XGB.fit(S_train_extended, yTrain_extended)

# Predict
y_pred = clf_meta_xgb.predict(S_test_extended)

# Final prediction score
print('Final prediction score: [%.8f]' % accuracy_score(yTest_extended, y_pred))

Final prediction score: [0.93701700]
