In [1]:
%matplotlib notebook

import numpy as np
import scipy as scp
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [2]:
variables = ['nTrackJetsOR', 'MV1cB1_cont', 'MV1cB2_cont', 'mBB', 'dRBB', 'pTB1',
             'pTB2', 'MET', 'dPhiVBB','dPhiLBmin', 'Mtop', 'dYWH', 'mTW', 'pTV']

dfBDT = pd.read_csv("Datasets/Plotcsv2.csv")
dfNN = pd.read_csv("Datasets/NNPlot2jet.csv")

# not defaulted by Event Number by default
dfBDT = dfBDT.sort_values('EventNumber')
dfNN = dfNN.sort_values('EventNumber')

# Resets index of sorted dataframe so sorted dataframe index begins at 0
dfBDT.reset_index(drop=True, inplace=True)
dfNN.reset_index(drop=True, inplace=True)

# Neural network ouptut is in range [0,1] and BDT [-1,1] so maps [0,1] -> [-1,1]
dfNN['Neural Net'] = dfNN['decision_value'].apply(lambda x: (x-0.5)*2)
dfBDT['BDT'] = dfBDT['decision_value']

In [3]:
#
dfcorr = pd.concat([dfBDT['BDT'],dfNN['Neural Net']], axis = 1)
dfnum = pd.concat([dfBDT['EventNumber'],dfNN['EventNumber']], axis = 1)

dfTotal = pd.concat([dfnum,dfcorr], axis = 1)
dfTotal.columns = ['EventNumberBDT', 'EventNumberNN', 'BDT', 'NN']

# Adds column that has value 1 if the event numbers of the BDT and NN match and 0 if they don't
dfTotal['Check'] = np.where(dfTotal['EventNumberBDT'] == dfTotal['EventNumberNN'], 1, 0)

display(dfTotal)

# If not all event numbers match, it will output where they differ
if len(dfTotal[dfTotal['Check']==0]) == 0:
    print("Everything Matches")
    
else:
    print("Problem")
    dfTotal[dfTotal['Check']==0]
    
display(dfTotal.corr())

# Looks for any NaN values
dfTotal.isnull().values.any()

Unnamed: 0,EventNumberBDT,EventNumberNN,BDT,NN,Check
0,4,4,-0.241809,-0.574150,1
1,49,49,0.946879,0.959886,1
2,50,50,0.944509,0.953559,1
3,78,78,0.872938,0.937318,1
4,137,137,0.950240,0.950255,1
...,...,...,...,...,...
126001,59946434,59946434,-0.996982,-0.996012,1
126002,59954656,59954656,-0.480028,-0.474449,1
126003,59975756,59975756,0.136205,0.081626,1
126004,59981951,59981951,-0.930572,-0.966734,1


Everything Matches


Unnamed: 0,EventNumberBDT,EventNumberNN,BDT,NN,Check
EventNumberBDT,1.0,1.0,-0.417859,-0.419306,
EventNumberNN,1.0,1.0,-0.417859,-0.419306,
BDT,-0.417859,-0.417859,1.0,0.972343,
NN,-0.419306,-0.419306,0.972343,1.0,
Check,,,,,


False

In [4]:
BDT_back = (dfBDT.loc[dfBDT['Class'] == 0])['BDT']
BDT_back.reset_index(drop=True, inplace=True)


NN_back = (dfNN.loc[dfNN['Class'] == 0])['Neural Net']
NN_back.reset_index(drop=True, inplace=True)


BDT_back_num = (dfBDT.loc[dfBDT['Class'] == 0])['EventNumber']
BDT_back_num.reset_index(drop=True, inplace=True)


NN_back_num = (dfBDT.loc[dfBDT['Class'] == 0])['EventNumber']
NN_back_num.reset_index(drop=True, inplace=True)



dfBack = pd.concat([BDT_back_num,NN_back_num,BDT_back,NN_back],
                   axis = 1)
dfBack.reset_index(drop=True, inplace=True)

dfBack.columns = ['EventNumberBDT','EventNumberNN','BDT','NN']

dfBack['Check'] = np.where(dfBack['EventNumberBDT'] == dfBack['EventNumberNN'],
                           1, 0)

pd.set_option('display.max_rows', 100)
display(dfBack)

if len(dfBack[dfBack['Check']==0]) == 0:
    print("Everything Matches")
    
else:
    print("Problem")
    dfBack[dfBack['Check'] == 0]    


dfBack.corr()


Unnamed: 0,EventNumberBDT,EventNumberNN,BDT,NN,Check
0,4,4,-0.241809,-0.574150,1
1,368,368,0.084625,0.122225,1
2,523,523,-0.903535,-0.894241,1
3,668,668,-0.815901,-0.218050,1
4,700,700,0.972098,0.908712,1
...,...,...,...,...,...
47480,59946434,59946434,-0.996982,-0.996012,1
47481,59954656,59954656,-0.480028,-0.474449,1
47482,59975756,59975756,0.136205,0.081626,1
47483,59981951,59981951,-0.930572,-0.966734,1


Everything Matches


Unnamed: 0,EventNumberBDT,EventNumberNN,BDT,NN,Check
EventNumberBDT,1.0,1.0,0.02669,0.021611,
EventNumberNN,1.0,1.0,0.02669,0.021611,
BDT,0.02669,0.02669,1.0,0.959519,
NN,0.021611,0.021611,0.959519,1.0,
Check,,,,,


In [5]:
BDT_sig = (dfBDT.loc[dfBDT['Class'] == 1])['BDT']
BDT_sig.reset_index(drop=True, inplace=True)

BDT_sig_num = (dfBDT.loc[dfBDT['Class'] == 1])['EventNumber']
BDT_sig_num.reset_index(drop=True, inplace=True)


NN_sig = (dfNN.loc[dfNN['Class'] == 1])['Neural Net']
NN_sig.reset_index(drop=True, inplace=True)

NN_sig_num = (dfNN.loc[dfNN['Class'] == 1])['EventNumber']
NN_sig_num.reset_index(drop=True, inplace=True)


dfSig = pd.concat([BDT_sig_num,NN_sig_num,BDT_sig,NN_sig],
                   axis = 1)
dfSig.reset_index(drop=True, inplace=True)

dfSig.columns = ['EventNumberBDT','EventNumberNN','BDT','NN']

dfSig['Check'] = np.where(dfSig['EventNumberBDT'] == dfSig['EventNumberNN'],
                           1, 0)

pd.set_option('display.max_rows', 100)
display(dfSig)

if len(dfSig[dfSig['Check']==0]) == 0:
    print("Everything Matches")
    
else:
    print("Problem")
    dfSig[dfSig['Check'] == 0]    


dfSig.corr()

Unnamed: 0,EventNumberBDT,EventNumberNN,BDT,NN,Check
0,49,49,0.946879,0.959886,1
1,50,50,0.944509,0.953559,1
2,78,78,0.872938,0.937318,1
3,137,137,0.950240,0.950255,1
4,216,216,0.781196,0.564914,1
...,...,...,...,...,...
78516,2997850,2997850,0.947567,0.879751,1
78517,2998332,2998332,0.936991,0.799842,1
78518,2999356,2999356,0.937480,0.941542,1
78519,2999418,2999418,0.829242,0.757954,1


Everything Matches


Unnamed: 0,EventNumberBDT,EventNumberNN,BDT,NN,Check
EventNumberBDT,1.0,1.0,0.000547,-0.002125,
EventNumberNN,1.0,1.0,0.000547,-0.002125,
BDT,0.000547,0.000547,1.0,0.910221,
NN,-0.002125,-0.002125,0.910221,1.0,
Check,,,,,


In [6]:
ab = pd.concat([dfBack['BDT'],dfSig['BDT']])
ba = pd.concat([dfBack['NN'],dfSig['NN']])

abba =pd.concat([ab,ba], axis = 1)
abba.corr()

Unnamed: 0,BDT,NN
BDT,1.0,0.980693
NN,0.980693,1.0


In [5]:
dfBDT = pd.read_csv("Datasets/Plotcsv2.csv")
dfNN = pd.read_csv("Datasets/NNPlot2jet.csv")

dfBDT = dfBDT.sort_values('EventNumber').reset_index(drop=True)    
dfNN = dfNN.sort_values('EventNumber').reset_index(drop=True)  

dfBDT = dfBDT.sort_values(['Class', 'EventNumber']).reset_index(drop=True)    
dfNN = dfNN.sort_values(['Class', 'EventNumber']).reset_index(drop=True)  

# ALL ROWS (NO FILTER)
dfTotal = (dfBDT.reindex(['EventNumber', 'decision_value'], axis='columns')
                .join(dfNN.reindex(['EventNumber', 'decision_value'], axis='columns'),
                      rsuffix = '_')
                .set_axis(['EventNumberBDT', 'BDT', 'EventNumberNN', 'NN'], 
                          axis='columns', inplace = False)
                .reindex(['EventNumberBDT','EventNumberNN','BDT','NN'], 
                         axis='columns'))    
display(dfTotal.corr())

# TWO FILTERED DATA FRAMES CLASS (0 FOR BACKGROUND, 1 FOR SIGNAL)
df_list = [(dfBDT.query('Class == {}'.format(i))
                 .reindex(['EventNumber', 'decision_value'], axis='columns')
                 .join(dfNN.query('Class == {}'.format(i))
                           .reindex(['EventNumber', 'decision_value'], axis='columns'),
                       rsuffix = '_')
                 .set_axis(['EventNumberBDT', 'BDT', 'EventNumberNN', 'NN'],
                           axis='columns', inplace = False)

                 .reindex(['EventNumberBDT','EventNumberNN','BDT','NN'],
                          axis='columns')
           ) for i in range(0,2)]

dfSub = pd.concat(df_list)

display(dfSub.corr())

Unnamed: 0,EventNumberBDT,EventNumberNN,BDT,NN
EventNumberBDT,1.0,1.0,-0.417859,-0.419306
EventNumberNN,1.0,1.0,-0.417859,-0.419306
BDT,-0.417859,-0.417859,1.0,0.980693
NN,-0.419306,-0.419306,0.980693,1.0


Unnamed: 0,EventNumberBDT,EventNumberNN,BDT,NN
EventNumberBDT,1.0,1.0,-0.417859,-0.419306
EventNumberNN,1.0,1.0,-0.417859,-0.419306
BDT,-0.417859,-0.417859,1.0,0.980693
NN,-0.419306,-0.419306,0.980693,1.0


In [9]:
dfBDT = pd.read_csv("Datasets/Plotcsv2.csv")
dfNN = pd.read_csv("Datasets/NNPlot2jet.csv")

dfBDT = dfBDT.sort_values(['Class', 'EventNumber']).reset_index(drop=True)    
dfNN = dfNN.sort_values(['Class', 'EventNumber']).reset_index(drop=True)  


def op_approach_total():
    dfscore = pd.concat([dfBDT['decision_value'],dfNN['decision_value']], axis = 1)
    dfnum = pd.concat([dfBDT['EventNumber'],dfNN['EventNumber']], axis = 1)

    dfTotal = pd.concat([dfnum,dfscore], axis = 1)
    dfTotal.columns = ['EventNumberBDT', 'EventNumberNN', 'BDT', 'NN']

    return dfTotal.corr()


def op_approach_split():
    # not defaulted by Event Number by default
    BDT_back = (dfBDT.loc[dfBDT['Class'] == 0])['decision_value']
    BDT_back.reset_index(drop=True, inplace=True)

    BDT_back_num = (dfBDT.loc[dfBDT['Class'] == 0])['EventNumber']
    BDT_back_num.reset_index(drop=True, inplace=True)


    NN_back = (dfNN.loc[dfNN['Class'] == 0])['decision_value']
    NN_back.reset_index(drop=True, inplace=True)

    NN_back_num = (dfNN.loc[dfNN['Class'] == 0])['EventNumber'] 
    NN_back_num.reset_index(drop=True, inplace=True)


    dfBack = pd.concat([BDT_back_num,NN_back_num,BDT_back,NN_back],
                       axis = 1)
    dfBack.reset_index(drop=True, inplace=True)
    dfBack.columns = ['EventNumberBDT','EventNumberNN','BDT','NN']


    # not defaulted by Event Number by default
    BDT_sig = (dfBDT.loc[dfBDT['Class'] == 1])['decision_value']
    BDT_sig.reset_index(drop=True, inplace=True)

    BDT_sig_num = (dfBDT.loc[dfBDT['Class'] == 1])['EventNumber']
    BDT_sig_num.reset_index(drop=True, inplace=True)

    NN_sig = (dfNN.loc[dfNN['Class'] == 1])['decision_value']
    NN_sig.reset_index(drop=True, inplace=True)

    NN_sig_num = (dfNN.loc[dfNN['Class'] == 1])['EventNumber']
    NN_sig_num.reset_index(drop=True, inplace=True)


    dfSig = pd.concat([BDT_sig_num, NN_sig_num, BDT_sig, NN_sig],
                       axis = 1)
    dfSig.reset_index(drop=True, inplace=True)
    dfSig.columns = ['EventNumberBDT','EventNumberNN','BDT','NN']

    # ADDING EventNumber COLUMNS
    ev_back = pd.concat([dfBack['EventNumberBDT'], dfSig['EventNumberBDT']])
    ev_sig = pd.concat([dfBack['EventNumberNN'], dfSig['EventNumberNN']])


    ab = pd.concat([dfBack['BDT'], dfSig['BDT']])

    ba = pd.concat([dfBack['NN'], dfSig['NN']])

    # HORIZONTAL MERGE
    abba = pd.concat([ev_back, ev_sig, ab, ba], axis = 1)

    return [abba.corr(),dfSig.corr(), dfBack.corr()]

opTotal = op_approach_total()
opSub = op_approach_split()

display(opTotal)
display(opSub[0])
display(opSub[1])
display(opSub[2])

Unnamed: 0,EventNumberBDT,EventNumberNN,BDT,NN
EventNumberBDT,1.0,1.0,-0.417859,-0.419306
EventNumberNN,1.0,1.0,-0.417859,-0.419306
BDT,-0.417859,-0.417859,1.0,0.982029
NN,-0.419306,-0.419306,0.982029,1.0


Unnamed: 0,EventNumberBDT,EventNumberNN,BDT,NN
EventNumberBDT,1.0,1.0,-0.417859,-0.419306
EventNumberNN,1.0,1.0,-0.417859,-0.419306
BDT,-0.417859,-0.417859,1.0,0.982029
NN,-0.419306,-0.419306,0.982029,1.0


Unnamed: 0,EventNumberBDT,EventNumberNN,BDT,NN
EventNumberBDT,1.0,1.0,0.000547,-0.002125
EventNumberNN,1.0,1.0,0.000547,-0.002125
BDT,0.000547,0.000547,1.0,0.919404
NN,-0.002125,-0.002125,0.919404,1.0


Unnamed: 0,EventNumberBDT,EventNumberNN,BDT,NN
EventNumberBDT,1.0,1.0,0.02669,0.021611
EventNumberNN,1.0,1.0,0.02669,0.021611
BDT,0.02669,0.02669,1.0,0.960429
NN,0.021611,0.021611,0.960429,1.0
