In [90]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC

In [86]:
# Read both HDF5 samples: the Nov 7 file is the working data set, the Oct 27
# file is kept separately (it is used as extra test data further down).
new_store = pd.read_hdf(path_or_buf="singlepi_e100GeV_pu200Nov7.h5")
prev_store = pd.read_hdf(path_or_buf="singlepi_e100GeV_pu200_oct27.h5")

In [87]:
# Use the older (October 27) sample as an additional test set.
# Purity is binarised the same way as for the working data: <= 1 -> 0, otherwise 1.
# The relabelled frame gets its own name instead of overwriting prev_store, so
# re-running this cell cannot re-threshold already binarised labels (which
# would turn every label into 0).
oct27_labelled = prev_store.assign(
    purity=prev_store['purity'].apply(lambda x: 0 if x <= 1 else 1)
)
# `columns=` replaces the positional axis argument, which pandas >= 2.0 rejects.
XOct27Test = oct27_labelled.drop(columns=['purity', 'event', 'trackster', 'trckType'])
YOct27Test = oct27_labelled['purity']

In [88]:
# Working data set: drop the trckType column and binarise purity
# (<= 1 -> 0, otherwise 1).
# `columns=` replaces the positional axis argument, which pandas >= 2.0 rejects.
df = new_store.drop(columns=['trckType'])
df['purity'] = df['purity'].apply(lambda x: 0 if x <= 1 else 1)



In [89]:
# 90/10 train/test split; the fixed random_state keeps the split reproducible.
trainDF = df.sample(frac=0.9, random_state=200)
testDF = df.drop(trainDF.index)

# Features are every column except the label and the event/trackster columns.
# `columns=` replaces the positional axis argument, which pandas >= 2.0 rejects.
x0Train = trainDF.drop(columns=['purity', 'event', 'trackster'])
x0Test = testDF.drop(columns=['purity', 'event', 'trackster'])

# Standardise with statistics fitted on the training rows only.
sc = StandardScaler()
x0Train = sc.fit_transform(x0Train)
x0Test = sc.transform(x0Test)

y0Train = trainDF['purity']
y0Test = testDF['purity']

# Full PCA, fitted only to inspect the variance explained per component; the
# projections themselves are not used here, so nothing is transformed.
pca = PCA(n_components=None)
pca.fit(x0Train)
pca.explained_variance_ratio_

array([0.29185005, 0.24363646, 0.17289322, 0.13004571, 0.07764744,
       0.03569065, 0.02695563, 0.00822576, 0.0073457 , 0.00472424,
       0.00098514])

In [91]:
# 90/10 train/test split; the fixed random_state keeps the split reproducible.
trainDF = df.sample(frac=0.9, random_state=200)
testDF = df.drop(trainDF.index)

# Features are every column except the label and the event/trackster columns.
# `columns=` replaces the positional axis argument, which pandas >= 2.0 rejects.
x0Train = trainDF.drop(columns=['purity', 'event', 'trackster'])
x0Test = testDF.drop(columns=['purity', 'event', 'trackster'])

# Min-max scaling, fitted on the training rows only and reused for the test rows.
sc = MinMaxScaler()
x0Train = sc.fit_transform(x0Train)
x0Test = sc.transform(x0Test)

y0Train = trainDF['purity']
y0Test = testDF['purity']

# Full PCA, fitted only to inspect the variance explained per component; the
# projections themselves are not used here, so nothing is transformed.
pca = PCA(n_components=None)
pca.fit(x0Train)
pca.explained_variance_ratio_

array([0.47804988, 0.20712209, 0.18171881, 0.06416708, 0.02239336,
       0.01944824, 0.01485702, 0.0069028 , 0.00262831, 0.00146596,
       0.00124645])

In [14]:
# Logistic regression fitted on the scaled features (no PCA projection).
clf0 = LogisticRegression(random_state=1234).fit(x0Train, y0Train)
y0TestPred = clf0.predict(x0Test)

print("Testing confusion_matrix")
print(confusion_matrix(y0Test, y0TestPred))

# The metric functions are imported by name in the import cell. The bare
# `sklearn` name is never bound there, so `sklearn.metrics.*` fails with
# NameError on a fresh kernel.
print("Test accuracy")
print(accuracy_score(y0Test, y0TestPred))
print("Test Percision")
print(precision_score(y0Test, y0TestPred))
print("Test recall")
print(recall_score(y0Test, y0TestPred))
print("Test F1 score")
print(f1_score(y0Test, y0TestPred))

Testing confusion_matrix
[[1934   96]
 [ 177  381]]
Test accuracy
0.8945131375579598
Test Percision
0.7987421383647799
Test recall
0.6827956989247311
Test F1 score
0.7362318840579711




In [None]:
def svc_param_selection(X, y, nfolds):
    """Grid-search ``C`` and ``gamma`` for an RBF-kernel SVC.

    Parameters
    ----------
    X : array-like
        Training features, passed to ``GridSearchCV.fit``.
    y : array-like
        Training labels, passed to ``GridSearchCV.fit``.
    nfolds
        Forwarded to ``GridSearchCV`` as its ``cv`` argument.

    Returns
    -------
    dict
        ``best_params_`` of the fitted search (the selected ``C`` and ``gamma``).
    """
    Cs = [0.001, 0.01, 0.1, 1, 5, 10, 100]
    gammas = [0.001, 0.01, 0.1, 0.5, 1]
    param_grid = {'C': Cs, 'gamma': gammas}
    # SVC is imported by name (`from sklearn.svm import SVC`); the `svm` module
    # name itself is not in scope, so `svm.SVC` raised NameError.
    grid_search = GridSearchCV(SVC(kernel='rbf'), param_grid, cv=nfolds)
    grid_search.fit(X, y)
    return grid_search.best_params_

In [54]:
# Build the 7-component projection here so that `pcaTrain` exists when the
# notebook is run top to bottom; it is otherwise first assigned in a later cell.
pca = PCA(n_components=7)
pcaTrain = pca.fit_transform(x0Train)

# 5-fold grid search over C and gamma on the projected training data.
svc_param_selection(pcaTrain, y0Train, 5)

{'C': 5, 'gamma': 1}

In [92]:
pca = PCA(n_components=7)  # here you can change this number to play around
pcaTrain = pca.fit_transform(x0Train)
pcaTest = pca.transform(x0Test)

# RBF SVC with the C / gamma values reported by the grid search.
clf0 = SVC(C=5, kernel='rbf', gamma=1).fit(pcaTrain, y0Train)
y0TestPred = clf0.predict(pcaTest)

print("Testing confusion_matrix")
print(confusion_matrix(y0Test, y0TestPred))

# The metric functions are imported by name in the import cell. The bare
# `sklearn` name is never bound there, so `sklearn.metrics.*` fails with
# NameError on a fresh kernel.
print("Test accuracy")
print(accuracy_score(y0Test, y0TestPred))
print("Test Percision")
print(precision_score(y0Test, y0TestPred))
print("Test recall")
print(recall_score(y0Test, y0TestPred))
print("Test F1 score")
print(f1_score(y0Test, y0TestPred))

Testing confusion_matrix
[[1943   87]
 [ 150  408]]
Test accuracy
0.9084234930448223
Test Percision
0.8242424242424242
Test recall
0.7311827956989247
Test F1 score
0.774928774928775


In [56]:
# Same PCA(7) + RBF SVC configuration as the preceding evaluation cell.
# NOTE(review): the recorded outputs of the two cells differ, so they were
# presumably run against differently scaled x0Train/x0Test - confirm which
# scaler each result belongs to and drop the redundant cell.
pca = PCA(n_components=7)  # here you can change this number to play around
pcaTrain = pca.fit_transform(x0Train)
pcaTest = pca.transform(x0Test)

clf0 = SVC(C=5, kernel='rbf', gamma=1).fit(pcaTrain, y0Train)
y0TestPred = clf0.predict(pcaTest)

print("Testing confusion_matrix")
print(confusion_matrix(y0Test, y0TestPred))

# The metric functions are imported by name in the import cell. The bare
# `sklearn` name is never bound there, so `sklearn.metrics.*` fails with
# NameError on a fresh kernel.
print("Test accuracy")
print(accuracy_score(y0Test, y0TestPred))
print("Test Percision")
print(precision_score(y0Test, y0TestPred))
print("Test recall")
print(recall_score(y0Test, y0TestPred))
print("Test F1 score")
print(f1_score(y0Test, y0TestPred))

Testing confusion_matrix
[[1942   88]
 [ 108  450]]
Test accuracy
0.9242658423493045
Test Percision
0.8364312267657993
Test recall
0.8064516129032258
Test F1 score
0.8211678832116787


In [57]:
# Aggregate every remaining column per (event, trackster, layer) group.
# The grouping is built once and reused; reset_index() turns the group keys
# back into ordinary columns.
_layer_groups = df.groupby(['event', 'trackster', 'layer'])
updated_df_groupby_avg = _layer_groups.mean().reset_index()
updated_df_groupby_max = _layer_groups.max().reset_index()
updated_df_groupby_min = _layer_groups.min().reset_index()
updated_df_groupby_sum = _layer_groups.sum().reset_index()

In [58]:
def getValueEventLayerTrackster(df, col, event, trackster, layer, defaultV=0):
    """Return ``col`` from the first row of ``df`` matching event, layer and trackster.

    Rows are selected where the ``event``, ``layer`` and ``trackster`` columns
    equal the given values. If no row matches, ``defaultV`` is returned instead.
    """
    same_event = df['event'] == event
    same_layer = df['layer'] == layer
    same_trackster = df['trackster'] == trackster
    matches = df.loc[same_event & same_layer & same_trackster, col]
    if matches.size == 0:
        return defaultV
    return matches.values[0]

In [59]:
# Spot check: summed E for event 1, trackster 0, layer 8
# (8 would be returned if no such group existed).
getValueEventLayerTrackster(
    updated_df_groupby_sum, 'E', event=1, trackster=0, layer=8, defaultV=8
)

3.834830043837428

In [60]:
updated_df = df.copy()

# Each ratio divides a row's value by the summed value of a whole
# (event, trackster, layer) group. The group totals are fetched with one left
# merge per feature instead of a row-wise apply that rescans the aggregate
# frame for every single row.
_GROUP_KEYS = ['event', 'trackster', 'layer']


def _layer_total(col, layer_offset=0):
    """Summed ``col`` of the group at (event, trackster, layer + layer_offset), per row of updated_df.

    Rows whose shifted layer has no group in updated_df_groupby_sum get NaN, so
    the caller picks the fallback value with fillna().
    """
    wanted = updated_df[_GROUP_KEYS].copy()
    wanted['layer'] = wanted['layer'] + layer_offset
    matched = wanted.merge(
        updated_df_groupby_sum[_GROUP_KEYS + [col]],
        on=_GROUP_KEYS,
        how='left',
        validate='many_to_one',
    )
    # A left merge keeps the left row order, so the values can be relabelled
    # with updated_df's own index for aligned division.
    return pd.Series(matched[col].to_numpy(), index=updated_df.index)


# Hit-count ratios: own layer, next layer, previous layer.
# A missing neighbouring layer falls back to a divisor of 1.
updated_df['RatioSiblingNHits'] = updated_df['nHits'] / _layer_total('nHits').fillna(0)
updated_df['RatioNextNHits'] = updated_df['nHits'] / _layer_total('nHits', +1).fillna(1)
updated_df['RatioPrevNHits'] = updated_df['nHits'] / _layer_total('nHits', -1).fillna(1)

# Energy ratios. A missing neighbouring layer falls back to the row's trckEn.
updated_df['RatioE'] = updated_df['E'] / updated_df['trckEn']
updated_df['RatioSiblingE'] = updated_df['E'] / _layer_total('E').fillna(0)
updated_df['RatioNextE'] = updated_df['E'] / _layer_total('E', +1).fillna(updated_df['trckEn'])
updated_df['RatioPrevE'] = updated_df['E'] / _layer_total('E', -1).fillna(updated_df['trckEn'])

updated_df.head()

Unnamed: 0,event,trackster,purity,layer,E,eta,phi,x,y,z,...,trckEn,trckEta,trckPhi,RatioSiblingNHits,RatioNextNHits,RatioPrevNHits,RatioE,RatioSiblingE,RatioNextE,RatioPrevE
0,1.0,0.0,0,1.0,0.077115,1.963233,0.158004,91.118462,14.518062,322.102753,...,57.759506,1.897144,0.196742,0.178571,0.128205,5.0,0.001335,0.133108,0.04977,0.001335
1,1.0,0.0,0,1.0,0.134952,1.93162,0.233432,92.770134,22.057596,322.102753,...,57.759506,1.897144,0.196742,0.25,0.179487,7.0,0.002336,0.232939,0.087097,0.002336
2,1.0,0.0,0,1.0,0.081363,1.93214,0.148012,94.262695,14.054753,322.102753,...,57.759506,1.897144,0.196742,0.071429,0.051282,2.0,0.001409,0.14044,0.052511,0.001409
3,1.0,0.0,0,1.0,0.0232,1.950308,0.361422,87.475647,33.068218,322.102753,...,57.759506,1.897144,0.196742,0.071429,0.051282,2.0,0.000402,0.040045,0.014973,0.000402
4,1.0,0.0,0,1.0,0.088878,1.91165,0.356526,91.242096,33.982418,322.102753,...,57.759506,1.897144,0.196742,0.071429,0.051282,2.0,0.001539,0.153412,0.057362,0.001539


In [63]:
# 90/10 split of the frame that carries the engineered ratio features.
trainExtendedDF = updated_df.sample(frac=0.9, random_state=200)
# Test rows and labels are derived from the extended frames themselves. Using
# trainDF/testDF here only worked because they happened to share the same seed
# and index, and would silently misalign if either split changed.
testExtendedDF = updated_df.drop(trainExtendedDF.index)

# `columns=` replaces the positional axis argument, which pandas >= 2.0 rejects.
xExtendedTrain = trainExtendedDF.drop(columns=['purity', 'event', 'trackster'])
xExtendedTest = testExtendedDF.drop(columns=['purity', 'event', 'trackster'])

# Standardise with statistics fitted on the training rows only.
sc = StandardScaler()
xExtendedTrain = sc.fit_transform(xExtendedTrain)
xExtendedTest = sc.transform(xExtendedTest)

yTrain = trainExtendedDF['purity']
yTest = testExtendedDF['purity']

# Full PCA, fitted only to inspect the variance explained per component; the
# projections themselves are not used here, so nothing is transformed.
pca = PCA(n_components=None)
pca.fit(xExtendedTrain)
pca.explained_variance_ratio_

array([0.26021411, 0.17140689, 0.11282183, 0.10460166, 0.06130509,
       0.05619657, 0.05155926, 0.04293485, 0.03941528, 0.02769174,
       0.02224504, 0.01805746, 0.01568584, 0.00490463, 0.004483  ,
       0.0030133 , 0.00286292, 0.00060053])

In [73]:
# Keep the 7 leading principal components of the scaled extended features.
# The projection is fitted on the training rows and reused for the test rows.
# Change n_components to experiment with a different dimensionality.
pca = PCA(n_components=7)
pcaExtendedTrain = pca.fit_transform(X=xExtendedTrain)
pcaExtendedTest = pca.transform(X=xExtendedTest)

In [74]:
# 5-fold grid search over C and gamma on the projected extended features.
svc_param_selection(X=pcaExtendedTrain, y=yTrain, nfolds=5)

{'C': 10, 'gamma': 0.5}

In [75]:
# RBF SVC on the PCA-projected extended features, using the C / gamma values
# reported by the grid search.
clf0 = SVC(C=10, kernel='rbf', gamma=0.5).fit(pcaExtendedTrain, yTrain)
y0TestPred = clf0.predict(pcaExtendedTest)

print("Testing confusion_matrix")
print(confusion_matrix(yTest, y0TestPred))

# The metric functions are imported by name in the import cell. The bare
# `sklearn` name is never bound there, so `sklearn.metrics.*` fails with
# NameError on a fresh kernel.
print("Test accuracy")
print(accuracy_score(yTest, y0TestPred))
print("Test Percision")
print(precision_score(yTest, y0TestPred))
print("Test recall")
print(recall_score(yTest, y0TestPred))
print("Test F1 score")
print(f1_score(yTest, y0TestPred))

Testing confusion_matrix
[[1924  106]
 [ 118  440]]
Test accuracy
0.9134466769706336
Test Percision
0.8058608058608059
Test recall
0.7885304659498208
Test F1 score
0.7971014492753623


In [83]:
# No Scaling - No Pca
# 90/10 split with the same seed as the scaled experiments; the raw feature
# values are used as they are.
trainDF = df.sample(frac=0.9, random_state=200)
testDF = df.drop(trainDF.index)

# `columns=` replaces the positional axis argument, which pandas >= 2.0 rejects.
xNoScalingTrain = trainDF.drop(columns=['purity', 'event', 'trackster'])
yNoScalingTrain = trainDF['purity']

xNoScalingTest = testDF.drop(columns=['purity', 'event', 'trackster'])
yNoScalingTest = testDF['purity']


In [84]:
# RBF SVC trained directly on the unscaled, unprojected features.
clf0 = SVC(C=1, kernel='rbf', gamma=0.01).fit(xNoScalingTrain, yNoScalingTrain)
y0TestPred = clf0.predict(xNoScalingTest)
yTest = yNoScalingTest

print("Testing confusion_matrix")
print(confusion_matrix(yTest, y0TestPred))

# The metric functions are imported by name in the import cell. The bare
# `sklearn` name is never bound there, so `sklearn.metrics.*` fails with
# NameError on a fresh kernel.
print("Test accuracy")
print(accuracy_score(yTest, y0TestPred))
print("Test Percision")
print(precision_score(yTest, y0TestPred))
print("Test recall")
print(recall_score(yTest, y0TestPred))
print("Test F1 score")
print(f1_score(yTest, y0TestPred))

Testing confusion_matrix
[[1947   83]
 [ 111  447]]
Test accuracy
0.9250386398763524
Test Percision
0.8433962264150944
Test recall
0.8010752688172043
Test F1 score
0.8216911764705883
