In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import MinMaxScaler,StandardScaler
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import sklearn
from sklearn.svm import SVC
from vecstack import stacking
from xgboost import XGBClassifier
from sklearn.model_selection import cross_validate

In [2]:
new_store = pd.read_hdf("singlepi_e100GeV_pu200Nov7.h5")
prev_store = pd.read_hdf("singlepi_e100GeV_pu200_oct27.h5")

In [3]:
# load the old data of october 27 to use it as test data. 
#prev_store['purity']=prev_store['purity'].apply(lambda x: 0 if x <=1 else 1 )
XOct27Test = prev_store.drop(['purity','event','trackster','trckType'],1,inplace=False)
YOct27Test = prev_store[['purity']].iloc[:,0]

In [4]:
df = new_store.drop(['trckType'],1,inplace=False)
#df['purity']=df['purity'].apply(lambda x: 0 if x <=1 else 1 )


In [5]:
X = df.drop(['purity','event','trackster'],1,inplace=False)
y = df[['purity']].iloc[:,0]

sc = StandardScaler()
SC_X = sc.fit_transform(X)
trainDF=df.sample(frac=0.9,random_state=200) #random state is a seed value
testDF=df.drop(trainDF.index)

xTrain = trainDF.drop(['purity','event','trackster'],1,inplace=False)
xTest = testDF.drop(['purity','event','trackster'],1,inplace=False)

SC_xTrain = sc.fit_transform(xTrain)
SC_xTest = sc.transform(xTest)

yTrain = trainDF[['purity']].iloc[:,0]
yTest =  testDF[['purity']].iloc[:,0]

In [16]:
clf0 = SVC(C=10,kernel='rbf', gamma=0.5, random_state=1234).fit(xTrain, yTrain)
yTestPred = clf0.predict(xTest) 

print("Testing confusion_matrix")
print(confusion_matrix(yTest, yTestPred))

print("Test accuracy")
print(sklearn.metrics.accuracy_score(yTest, yTestPred))
print("Test F1 score")
print(sklearn.metrics.f1_score(yTest, yTestPred,average='micro'))

Testing confusion_matrix
[[1854    3    1]
 [ 172    0    0]
 [ 549    0    9]]
Test accuracy
0.7198608964451314
Test F1 score
0.7198608964451313


In [6]:
clf_MLP = MLPClassifier(alpha=0.05, hidden_layer_sizes=(50, 50, 50), random_state=1234)
clf_RF = RandomForestClassifier(max_depth=40,n_estimators=100,random_state=1234)
clf_SVM = SVC(C=10,kernel='rbf', gamma=0.5, random_state=1234)
clf_XGB= XGBClassifier(max_depth=50,random_state=1234)

models = [clf_MLP, clf_RF, clf_SVM, clf_XGB]
names = ["MLP", "RF", "SVM", "XGB"]

In [21]:
scoring = {'accuracy': 'accuracy','f1_micro': 'f1_micro'}

for model, name in zip(models, names):
    print (name)
    cv_results=cross_validate(model, SC_X, y, scoring=scoring, cv=7, return_train_score=True)
    print("test_accuracy", cv_results['test_accuracy'])
    print("test_f1_micro", cv_results['test_f1_micro'])
    print("train_accuracy", cv_results['train_accuracy'])


MLP




test_accuracy [0.8436993  0.85208221 0.84072472 0.79475392 0.79713281 0.83522727
 0.83089827]
test_f1_micro [0.8436993  0.85208221 0.84072472 0.79475392 0.79713281 0.83522727
 0.83089827]
train_accuracy [0.92205743 0.92084028 0.91930758 0.9192625  0.92467544 0.921794
 0.92116295]
RF
test_accuracy [0.83829097 0.87777177 0.86154678 0.72120065 0.71057614 0.83874459
 0.87094156]
test_f1_micro [0.83829097 0.87777177 0.86154678 0.72120065 0.71057614 0.83874459
 0.87094156]
train_accuracy [1.         1.         1.         1.         0.99995492 0.99995492
 1.        ]
SVM
test_accuracy [0.83423472 0.84342888 0.83693889 0.74256355 0.69596971 0.72619048
 0.82007576]
test_f1_micro [0.83423472 0.84342888 0.83693889 0.74256355 0.69596971 0.72619048
 0.82007576]
train_accuracy [0.92386061 0.92106568 0.92115584 0.92381553 0.92548684 0.92093757
 0.92053189]
XGB
test_accuracy [0.83531639 0.86587345 0.86127637 0.79502434 0.70976467 0.8357684
 0.86255411]
test_f1_micro [0.83531639 0.86587345 0.86127637 0

In [22]:
scoring = {'accuracy': 'accuracy','f1_micro': 'f1_micro'}

for model, name in zip(models, names):
    print (name)
    cv_results=cross_validate(model,X, y, scoring=scoring, cv=7, return_train_score=True)
    print("test_accuracy", cv_results['test_accuracy'])
    print("test_f1_micro", cv_results['test_f1_micro'])
    print("train_accuracy", cv_results['train_accuracy'])

MLP




test_accuracy [0.83396431 0.87074094 0.8553272  0.86289886 0.84473898 0.81926407
 0.86985931]
test_f1_micro [0.83396431 0.87074094 0.8553272  0.86289886 0.84473898 0.81926407
 0.86985931]
train_accuracy [0.88058423 0.87873597 0.88811252 0.87855565 0.87968806 0.88284877
 0.87500563]
RF
test_accuracy [0.83747972 0.87696052 0.85938345 0.72228231 0.71030565 0.83847403
 0.87040043]
test_f1_micro [0.83747972 0.87696052 0.85938345 0.72228231 0.71030565 0.83847403
 0.87040043]
train_accuracy [1.         1.         1.         1.         0.99995492 0.99995492
 1.        ]
SVM
test_accuracy [0.7203894  0.71768524 0.71957815 0.72011898 0.71977279 0.72050866
 0.71996753]
test_f1_micro [0.7203894  0.71768524 0.71957815 0.72011898 0.71977279 0.72050866
 0.71996753]
train_accuracy [1.         1.         0.99995492 0.99995492 0.99995492 0.99995492
 0.99995492]
XGB
test_accuracy [0.83396431 0.86587345 0.85722012 0.79691725 0.71517447 0.83658009
 0.8612013 ]
test_f1_micro [0.83396431 0.86587345 0.8572201

In [27]:
print('coress validation with feature scaling wiht PCA 5 - CV 10')

pca = PCA(n_components= 5)
SC_X_pca = pca.fit_transform(SC_X)

for model, name in zip(models, names):
    print (name)
    cv_results=cross_validate(model, SC_X_pca, y, scoring=scoring, cv=10, return_train_score=True)
    print("test_accuracy", cv_results['test_accuracy'])
    print("test_f1_micro", cv_results['test_f1_micro'])
    print("train_accuracy", cv_results['train_accuracy'])

coress validation with feature scaling wiht PCA 5 - CV 10
MLP




test_accuracy [0.83281853 0.8370027  0.88523957 0.82727975 0.82264297 0.82612056
 0.79984544 0.87944359 0.82643989 0.8763046 ]
test_f1_micro [0.83281853 0.8370027  0.88523957 0.82727975 0.82264297 0.82612056
 0.79984544 0.87944359 0.82643989 0.8763046 ]
train_accuracy [0.89442274 0.89382621 0.88606019 0.89331559 0.89202765 0.88846435
 0.89151247 0.88618898 0.89374946 0.89134541]
RF
test_accuracy [0.82509653 0.84125145 0.88214838 0.84003091 0.86398764 0.80757342
 0.80293663 0.86785162 0.82064167 0.8658678 ]
test_f1_micro [0.82509653 0.84125145 0.88214838 0.84003091 0.86398764 0.80757342
 0.80293663 0.86785162 0.82064167 0.8658678 ]
train_accuracy [0.99995706 1.         1.         1.         1.         1.
 0.99991414 1.         1.         0.99995707]
SVM
test_accuracy [0.82779923 0.84781769 0.86978362 0.83230294 0.79211747 0.82264297
 0.79598145 0.85896445 0.78430615 0.87011983]
test_f1_micro [0.82779923 0.84781769 0.86978362 0.83230294 0.79211747 0.82264297
 0.79598145 0.85896445 0.7843

In [7]:
S_train, S_test = stacking(models,                     # list of models
                           SC_xTrain, yTrain, SC_xTest,   # data
                           regression=False,           # classification task (if you need 
                                                       #     regression - set to True)
                           mode='oof_pred_bag',        # mode: oof for train set, predict test 
                                                       #     set in each fold and vote
                           needs_proba=False,          # predict class labels (if you need 
                                                       #     probabilities - set to True) 
                           metric=accuracy_score,      # metric: callable
                           n_folds=7,                  # number of folds
                           random_state=1234,             # ensure reproducibility
                           verbose=2)                  # print all info

task:         [classification]
n_classes:    [3]
metric:       [accuracy_score]
mode:         [oof_pred_bag]
n_models:     [4]

model  0:     [MLPClassifier]




    fold  0:  [0.89573317]




    fold  1:  [0.89573317]




    fold  2:  [0.89603365]




    fold  3:  [0.90504808]




    fold  4:  [0.89089270]




    fold  5:  [0.88337842]
    fold  6:  [0.89510069]
    ----
    MEAN:     [0.89455998] + [0.00602012]
    FULL:     [0.89456060]

model  1:     [RandomForestClassifier]
    fold  0:  [0.90685096]
    fold  1:  [0.89723558]
    fold  2:  [0.90294471]
    fold  3:  [0.91256010]
    fold  4:  [0.90231440]
    fold  5:  [0.89540126]
    fold  6:  [0.90051097]
    ----
    MEAN:     [0.90254543] + [0.00537742]
    FULL:     [0.90254583]

model  2:     [SVC]
    fold  0:  [0.89453125]
    fold  1:  [0.89152644]
    fold  2:  [0.89002404]
    fold  3:  [0.89903846]
    fold  4:  [0.89179441]
    fold  5:  [0.88397956]
    fold  6:  [0.89179441]
    ----
    MEAN:     [0.89181265] + [0.00421294]
    FULL:     [0.89181299]

model  3:     [XGBClassifier]
    fold  0:  [0.91165865]
    fold  1:  [0.90444712]
    fold  2:  [0.91286058]
    fold  3:  [0.92157452]
    fold  4:  [0.90952810]
    fold  5:  [0.90321611]
    fold  6:  [0.91042982]
    ----
    MEAN:     [0.91053070] + [0.00560860]
  

In [8]:
S_train[1:35]

array([[-1, -1,  2, -1],
       [-1, -1, -1, -1],
       [-1, -1, -1, -1],
       [ 2,  2,  2,  2],
       [ 2,  2,  2,  2],
       [-1, -1, -1, -1],
       [-1, -1, -1, -1],
       [ 2,  2,  2,  2],
       [-1, -1, -1, -1],
       [-1, -1, -1, -1],
       [-1, -1, -1, -1],
       [-1, -1, -1, -1],
       [-1, -1, -1, -1],
       [ 2,  2,  2,  2],
       [-1, -1, -1, -1],
       [-1, -1, -1, -1],
       [-1, -1, -1, -1],
       [-1, -1, -1, -1],
       [-1, -1, -1, -1],
       [-1, -1, -1, -1],
       [-1, -1, -1, -1],
       [-1, -1, -1, -1],
       [-1, -1, -1, -1],
       [ 2,  2,  2,  2],
       [-1, -1, -1, -1],
       [-1, -1, -1, -1],
       [ 2,  2,  2,  2],
       [-1, -1, -1, -1],
       [-1, -1, -1, -1],
       [-1,  1, -1,  1],
       [-1, -1, -1, -1],
       [-1, -1, -1, -1],
       [-1, -1, -1, -1],
       [-1, -1, -1, -1]])

In [9]:
yTest[1:10]

5     -1.0
17    -1.0
18    -1.0
37    -1.0
40    -1.0
96    -1.0
129    2.0
132    1.0
137    2.0
Name: purity, dtype: float64

In [26]:
 ##df['purity']=df['purity'].apply(lambda x: 0 if x <=1 else 1 )
#arr[arr > 255] = x

yTest_2_classes = np.copy(yTest)
yTest_2_classes[yTest_2_classes <= 1] = 0
yTest_2_classes[yTest_2_classes > 1] = 1
yTest_2_classes[1:10]

array([0., 0., 0., 0., 0., 0., 1., 0., 1.])

In [19]:
#vote max.
sum_S_test= np.sum(S_test, axis=1)
sum_S_test[1:20]

array([-4, -4, -4, -4, -4, -4,  2, -4,  2,  2, -4, -4, -4, -4, -4,  8, -4,
       -1, -4])

In [22]:
S_test_Vote_2_classes = np.copy(sum_S_test)
S_test_Vote_2_classes[S_test_Vote_2_classes <= 4] = 0
S_test_Vote_2_classes[S_test_Vote_2_classes > 4] = 1

print('Final prediction score: [%.8f]' % accuracy_score(yTest_2_classes, S_test_Vote_2_classes))

print("Testing confusion_matrix")
print(confusion_matrix(yTest_2_classes, S_test_Vote_2_classes))
print("Test accuracy")
print(sklearn.metrics.accuracy_score(yTest_2_classes, S_test_Vote_2_classes))
print("Test Percision")
print(sklearn.metrics.precision_score(yTest_2_classes, S_test_Vote_2_classes))
print("Test recall")
print(sklearn.metrics.recall_score(yTest_2_classes, S_test_Vote_2_classes))
print("Test F1 score")
print(sklearn.metrics.f1_score(yTest_2_classes, S_test_Vote_2_classes))


Final prediction score: [0.93353941]
Testing confusion_matrix
[[1933   97]
 [  75  483]]
Test accuracy
0.9335394126738794
Test Percision
0.8327586206896552
Test recall
0.8655913978494624
Test F1 score
0.8488576449912127


In [23]:
from sklearn.linear_model import LogisticRegression
# Initialize 2nd level model
metaModel_XGB = XGBClassifier(max_depth=10,random_state=1234, n_estimators=100)

# Fit 2nd level model
clf_meta_xgb = metaModel_XGB.fit(S_train, yTrain)

# Predict
y_pred = clf_meta_xgb.predict(S_test)

# Final prediction score
print('Final prediction score: [%.8f]' % accuracy_score(yTest, y_pred))

Final prediction score: [0.90996909]


In [27]:
y_pred_2_classes = np.copy(y_pred)

y_pred_2_classes[y_pred_2_classes <= 1] = 0
y_pred_2_classes[y_pred_2_classes > 1] = 1

print('Final prediction score: [%.8f]' % accuracy_score(yTest_2_classes, y_pred_2_classes))

print("Testing confusion_matrix")
print(confusion_matrix(yTest_2_classes, y_pred_2_classes))
print("Test accuracy")
print(sklearn.metrics.accuracy_score(yTest_2_classes, y_pred_2_classes))
print("Test Percision")
print(sklearn.metrics.precision_score(yTest_2_classes, y_pred_2_classes))
print("Test recall")
print(sklearn.metrics.recall_score(yTest_2_classes, y_pred_2_classes))
print("Test F1 score")
print(sklearn.metrics.f1_score(yTest_2_classes, y_pred_2_classes))

Final prediction score: [0.94010819]
Testing confusion_matrix
[[1932   98]
 [  57  501]]
Test accuracy
0.9401081916537867
Test Percision
0.8363939899833055
Test recall
0.8978494623655914
Test F1 score
0.8660328435609335
