Exploratory Data Analysis

In [48]:
import numpy as np
import matplotlib.pyplot as plt
# Project-local helper module; it provides load_csv_data, used on the next line.
from scripts import proj1_helpers as hp
# Read the training CSV with sub_sample=False. The call yields three values,
# bound here to y_train, x_train and ids_train (x_train is indexed as a 2-D
# array by the cells below).
y_train, x_train, ids_train = hp.load_csv_data('./data/train.csv', sub_sample=False)

1) Five number Summary of training values for each feature
    - the sample minimum
    - the lower quartile
    - the median
    - the upper quartile or third quartile
    - the sample maximum 

In [49]:
def fiveNumberSummary(x_t):
    """Print the five-number summary of every column of ``x_t``.

    For each of the percentiles 0, 25, 50, 75 and 100 a header line is
    printed, followed by the values ``np.percentile`` computes along axis 0.

    Note: NumPy's global print options are switched to ``suppress=True``
    and are not restored afterwards.

    Parameters
    ----------
    x_t : array_like
        Data handed to ``np.percentile(..., axis=0)``.

    Returns
    -------
    None
        The summary is only printed.
    """
    # Avoid scientific notation in the printed arrays (global NumPy setting).
    np.set_printoptions(suppress=True)

    # (header line, percentile) pairs, in the order they are reported.
    summary_rows = (
        ('---the sample minimum for each feature---', 0),
        ('---the lower quartile for each feature---', 25),
        ('---the median for each feature---', 50),
        ('---the upper quartile for each feature---', 75),
        ('---maximum for each feature---', 100),
    )
    for header, percent in summary_rows:
        print(header)
        print(np.percentile(x_t, percent, axis=0))
    


In [50]:
# Print the five-number summary for every column of the training matrix.
fiveNumberSummary(x_train)

---the sample minimum for each feature---
[-999.       0.       6.329    0.    -999.    -999.    -999.       0.208
    0.      46.104    0.047   -1.414 -999.      20.      -2.499   -3.142
   26.      -2.505   -3.142    0.109   -3.142   13.678    0.    -999.
 -999.    -999.    -999.    -999.    -999.       0.   ]
---the lower quartile for each feature---
[  78.10075   19.241     59.38875   14.06875 -999.      -999.
 -999.         1.81       2.841     77.55       0.883     -1.371
 -999.        24.59175   -0.925     -1.575     32.375     -1.014
   -1.522     21.398     -1.575    123.0175     0.      -999.
 -999.      -999.      -999.      -999.      -999.         0.     ]
---the median for each feature---
[ 105.012    46.524    73.752    38.4675 -999.     -999.     -999.
    2.4915   12.3155  120.6645    1.28     -0.356  -999.       31.804
   -0.023    -0.033    40.516    -0.045     0.086    34.802    -0.024
  179.739     1.       38.96     -1.872    -2.093  -999.     -999.
 -999.       4

Conclusion: We can observe that, for some features, more than half of the values are equal to -999, which probably distorts the results.

By looking at the data, we can see that all features are continuous except PRI_jet_num, which is categorical and represents the number of jets. The -999 values seem to be linked to the number of jets, which takes its values in the set {0, 1, 2, 3}. Let's check which features are -999 depending on the number of jets:

In [85]:
# Feature names, indexed by column position of x_train: the cells below use
# features[i] to label column i (index 22 is 'PRI_jet_num', the column the
# jet-number filter reads).
features = ['DER_mass_MMC','DER_mass_transverse_met_lep','DER_mass_vis','DER_pt_h','DER_deltaeta_jet_jet','DER_mass_jet_jet','DER_prodeta_jet_jet','DER_deltar_tau_lep','DER_pt_tot','DER_sum_pt','DER_pt_ratio_lep_tau','DER_met_phi_centrality','DER_lep_eta_centrality','PRI_tau_pt','PRI_tau_eta','PRI_tau_phi','PRI_lep_pt','PRI_lep_eta','PRI_lep_phi','PRI_met','PRI_met_phi','PRI_met_sumet','PRI_jet_num','PRI_jet_leading_pt','PRI_jet_leading_eta','PRI_jet_leading_phi','PRI_jet_subleading_pt','PRI_jet_subleading_eta','PRI_jet_subleading_phi','PRI_jet_all_pt']


def nullFeatureDependingOnJetNmbr(nmr_jet, x=None, jet_col=22, missing_value=-999):
    """Count, per feature, the placeholder entries among rows with a given jet number.

    Only rows whose ``jet_col`` column equals ``nmr_jet`` are considered; for
    each column the number of entries equal to ``missing_value`` is returned.

    Parameters
    ----------
    nmr_jet : int or float
        Value that column ``jet_col`` must equal for a row to be counted.
    x : array_like of shape (n_samples, n_features), optional
        Data matrix to scan. When omitted, the notebook-level ``x_train``
        is used, so existing one-argument calls keep working.
    jet_col : int, optional
        Index of the column holding the jet number (default 22).
    missing_value : scalar, optional
        Placeholder value to count (default -999).

    Returns
    -------
    numpy.ndarray
        Float array with one entry per column of the data matrix; entry ``i``
        is the number of selected rows whose column ``i`` equals
        ``missing_value``. All zeros when no row matches ``nmr_jet``.
    """
    data = np.asarray(x_train if x is None else x)

    # Keep only the rows with the requested jet number, then count the
    # placeholder hits column by column in a single vectorised pass.
    selected_rows = data[data[:, jet_col] == nmr_jet]

    # Cast to float so the result has the same dtype as the np.zeros-based
    # counter this function previously returned.
    return (selected_rows == missing_value).sum(axis=0).astype(float)

In [101]:
# Per-feature -999 counts for each jet number 0.0, 1.0, 2.0 and 3.0.
nullFeaturesnm0, nullFeaturesnm1, nullFeaturesnm2, nullFeaturesnm3 = [
    nullFeatureDependingOnJetNmbr(float(jet_number)) for jet_number in range(4)
]
# One count vector per output line, in jet-number order.
print(nullFeaturesnm0, nullFeaturesnm1, nullFeaturesnm2, nullFeaturesnm3, sep='\n')

[26123.     0.     0.     0. 99913. 99913. 99913.     0.     0.     0.
     0.     0. 99913.     0.     0.     0.     0.     0.     0.     0.
     0.     0.     0. 99913. 99913. 99913. 99913. 99913. 99913.     0.]
[ 7562.     0.     0.     0. 77544. 77544. 77544.     0.     0.     0.
     0.     0. 77544.     0.     0.     0.     0.     0.     0.     0.
     0.     0.     0.     0.     0.     0. 77544. 77544. 77544.     0.]
[2952.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.]
[1477.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.]


In [100]:
# For every jet number, keep the names of the features whose -999 count
# (computed in the previous cell) is non-zero.
nF0 = [features[i] for i in range(30) if nullFeaturesnm0[i] != 0]
nF1 = [features[i] for i in range(30) if nullFeaturesnm1[i] != 0]
nF2 = [features[i] for i in range(30) if nullFeaturesnm2[i] != 0]
nF3 = [features[i] for i in range(30) if nullFeaturesnm3[i] != 0]

# Report, per jet number: a header, how many features were kept, and their names.
print("Features -999 when the number of jets is 0: ")
print('number of features:', len(nF0))
print(nF0)

print("Features -999 when the number of jets is 1: ")
print('number of features:', len(nF1))
print(nF1)

print("Features -999 when the number of jets is 2: ")
print('number of features:', len(nF2))
print(nF2)

print("Features -999 when the number of jets is 3: ")
print('number of features:', len(nF3))
print(nF3)

Features -999 when the number of jets is 0: 
number of features: 11
['DER_mass_MMC', 'DER_deltaeta_jet_jet', 'DER_mass_jet_jet', 'DER_prodeta_jet_jet', 'DER_lep_eta_centrality', 'PRI_jet_leading_pt', 'PRI_jet_leading_eta', 'PRI_jet_leading_phi', 'PRI_jet_subleading_pt', 'PRI_jet_subleading_eta', 'PRI_jet_subleading_phi']
Features -999 when the number of jets is 1: 
number of features: 8
['DER_mass_MMC', 'DER_deltaeta_jet_jet', 'DER_mass_jet_jet', 'DER_prodeta_jet_jet', 'DER_lep_eta_centrality', 'PRI_jet_subleading_pt', 'PRI_jet_subleading_eta', 'PRI_jet_subleading_phi']
Features -999 when the number of jets is 2: 
number of features: 1
['DER_mass_MMC']
Features -999 when the number of jets is 3: 
number of features: 1
['DER_mass_MMC']


We can observe that 11 out of 30 features contain -999 values, 10 of which are linked to jets. DER_mass_MMC is -999 more often when the number of jets is lower, but it is not always -999.

There are 10 features that are always -999 when the number of jets is 0:

['DER_deltaeta_jet_jet', 'DER_mass_jet_jet', 'DER_prodeta_jet_jet', 'DER_lep_eta_centrality', 'PRI_jet_leading_pt', 'PRI_jet_leading_eta', 'PRI_jet_leading_phi', 'PRI_jet_subleading_pt', 'PRI_jet_subleading_eta', 'PRI_jet_subleading_phi']

There are 7 features that are always -999 when the number of jets is 1:

['DER_deltaeta_jet_jet', 'DER_mass_jet_jet', 'DER_prodeta_jet_jet', 'DER_lep_eta_centrality', 'PRI_jet_subleading_pt', 'PRI_jet_subleading_eta', 'PRI_jet_subleading_phi']

There are no features that are always -999 when the number of jets is 2 or 3.

Pearson product-moment correlation coefficients to check the correlation between the variables and y

In [108]:
# TODO / FIXME (carried over from the original cell): the data probably
# needs to be normalized before this step.
# Print, for each of the 30 feature columns, its Pearson correlation with y_train.
for i in range(30):
    label_vs_feature = np.corrcoef(y_train, x_train[:, i])
    # Off-diagonal entry of the 2x2 matrix = correlation between the two inputs.
    print(label_vs_feature[0, 1])

0.2391490578915459
-0.35142795586167713
-0.014055273784852627
0.19252632856874902
0.14164599256566465
0.19176608807477116
0.14055440046509635
0.012245481285482975
-0.015287426687781514
0.1532359324758141
-0.19539789618287715
0.2717518770516496
0.14134598859646377
0.23523797587836742
-0.0009432510582117524
-0.0044025386863884184
-0.03194758680534837
0.0015162353770597236
0.004125447411524855
0.022465751510785933
0.0074753421885902635
0.13552026152268518
0.13354912308169195
0.15760414567634956
0.15046926004977912
0.1504680377929272
0.14071436695043768
0.14126491377601597
0.14125568650533826
0.13429572666925302


Pearson product-moment correlation coefficients to check the correlation between the variables

In [119]:
# Pearson correlation matrix of the first 30 columns of x_train; each column
# is passed to np.corrcoef as one variable (one row of the input list).
corrBetweenVariables = np.corrcoef([x_train[:, col] for col in range(30)])

In [139]:
# TODO / FIXME (carried over from the original cell): the data probably needs
# to be normalized first, and the jet values may need to be removed.
# Walk the upper triangle of the correlation matrix (each pair visited once)
# and report the pairs whose absolute correlation is above 0.90.
for i in range(30):
    for j in range(i + 1, 30):
        is_strongly_correlated = abs(corrBetweenVariables[i, j]) > 0.90
        if is_strongly_correlated:
            print('Correlation between ',features[i],' and ',features[j])
        

Correlation between  DER_deltaeta_jet_jet  and  DER_mass_jet_jet
Correlation between  DER_deltaeta_jet_jet  and  DER_prodeta_jet_jet
Correlation between  DER_deltaeta_jet_jet  and  DER_lep_eta_centrality
Correlation between  DER_deltaeta_jet_jet  and  PRI_jet_subleading_pt
Correlation between  DER_deltaeta_jet_jet  and  PRI_jet_subleading_eta
Correlation between  DER_deltaeta_jet_jet  and  PRI_jet_subleading_phi
Correlation between  DER_mass_jet_jet  and  DER_prodeta_jet_jet
Correlation between  DER_mass_jet_jet  and  DER_lep_eta_centrality
Correlation between  DER_mass_jet_jet  and  PRI_jet_subleading_pt
Correlation between  DER_mass_jet_jet  and  PRI_jet_subleading_eta
Correlation between  DER_mass_jet_jet  and  PRI_jet_subleading_phi
Correlation between  DER_prodeta_jet_jet  and  DER_lep_eta_centrality
Correlation between  DER_prodeta_jet_jet  and  PRI_jet_subleading_pt
Correlation between  DER_prodeta_jet_jet  and  PRI_jet_subleading_eta
Correlation between  DER_prodeta_jet_jet  an