In [12]:
import os
import numpy as np
import pandas as pd 
from time import time
import visualizations as vz
%matplotlib inline
import matplotlib.pyplot as plt
from IPython.display import display
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [13]:
# Load the training and test sets from CSV files in the working directory.
train = pd.read_csv('higgsTrain.csv')
test = pd.read_csv('higgsTest.csv')

# Show each training column name on its own line, then the number of columns.
column_names = train.columns.tolist()
for col in column_names:
    print(col)
print(len(column_names))

EventId
DER_mass_MMC
DER_mass_transverse_met_lep
DER_mass_vis
DER_pt_h
DER_deltaeta_jet_jet
DER_mass_jet_jet
DER_prodeta_jet_jet
DER_deltar_tau_lep
DER_pt_tot
DER_sum_pt
DER_pt_ratio_lep_tau
DER_met_phi_centrality
DER_lep_eta_centrality
PRI_tau_pt
PRI_tau_eta
PRI_tau_phi
PRI_lep_pt
PRI_lep_eta
PRI_lep_phi
PRI_met
PRI_met_phi
PRI_met_sumet
PRI_jet_num
PRI_jet_leading_pt
PRI_jet_leading_eta
PRI_jet_leading_phi
PRI_jet_subleading_pt
PRI_jet_subleading_eta
PRI_jet_subleading_phi
PRI_jet_all_pt
Weight
Label
33


In [14]:
# Count how many training events fall into each PRI_jet_num value.
train.loc[:, 'PRI_jet_num'].value_counts()

0    99913
1    77544
2    50379
3    22164
Name: PRI_jet_num, dtype: int64

In [15]:
# Partition the training events by jet multiplicity: dfJetK receives exactly
# the rows whose PRI_jet_num equals K, for K = 0..3.
dfJet0, dfJet1, dfJet2, dfJet3 = (
    train[train['PRI_jet_num'] == n_jets] for n_jets in range(4)
)
# Drop the reference to the unsplit frame now that the four subsets exist.
del train

def clean_jet_sets(jetFrame, missing_value=-999.0):
    """Remove the columns of ``jetFrame`` that hold nothing but the missing-value sentinel.

    Parameters
    ----------
    jetFrame : pandas.DataFrame
        Frame to clean. It is never modified in place.
    missing_value : scalar, default -999.0
        Value that marks an entry as missing. A column is removed only when
        every one of its entries equals this value.

    Returns
    -------
    pandas.DataFrame
        ``jetFrame`` itself when no column has to be removed, otherwise a new
        frame with the all-missing columns left out (row index and the order
        of the remaining columns are unchanged).
    """
    # With zero rows, ``.all()`` is vacuously True for every column, which
    # would throw away the whole schema; leave such a frame untouched.
    if len(jetFrame) == 0:
        return jetFrame

    # One vectorised pass over the frame instead of one comparison (and one
    # full-frame copy via ``drop``) per column.
    all_missing = (jetFrame == missing_value).all(axis=0).to_numpy()
    if not all_missing.any():
        return jetFrame

    # Select by position so that frames with repeated column labels only lose
    # the columns that really are all-missing.
    return jetFrame.loc[:, ~all_missing]

# Strip the columns that are entirely undefined for each jet multiplicity.
dfJet0 = clean_jet_sets(dfJet0)
dfJet1 = clean_jet_sets(dfJet1)
dfJet2 = clean_jet_sets(dfJet2)
dfJet3 = clean_jet_sets(dfJet3)

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly: wrapping it in print() emits a stray "None" line after
# every summary.
for jet_count, jet_frame in enumerate((dfJet0, dfJet1, dfJet2, dfJet3)):
    print("Jet {} has the following columns:".format(jet_count))
    jet_frame.info()
    print("\n")

Jet 0 has the following columns:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 99913 entries, 3 to 249999
Data columns (total 23 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   EventId                      99913 non-null  int64  
 1   DER_mass_MMC                 99913 non-null  float64
 2   DER_mass_transverse_met_lep  99913 non-null  float64
 3   DER_mass_vis                 99913 non-null  float64
 4   DER_pt_h                     99913 non-null  float64
 5   DER_deltar_tau_lep           99913 non-null  float64
 6   DER_pt_tot                   99913 non-null  float64
 7   DER_sum_pt                   99913 non-null  float64
 8   DER_pt_ratio_lep_tau         99913 non-null  float64
 9   DER_met_phi_centrality       99913 non-null  float64
 10  PRI_tau_pt                   99913 non-null  float64
 11  PRI_tau_eta                  99913 non-null  float64
 12  PRI_tau_phi                  99913 non-n

Each jet-multiplicity subset (Jet 0 to Jet 3) is now treated on its own, and the analysis is performed separately for each subset.

In [16]:
# Check behind the first drop (kept for reference, not executed):
#   dfJet0['PRI_jet_all_pt'].value_counts()
dfJet0 = dfJet0.drop(columns=['PRI_jet_num', 'PRI_jet_all_pt'])
# Per the original note, 'DER_pt_h' always equals 'DER_pt_tot' here,
# so only 'DER_pt_tot' is kept.
dfJet0 = dfJet0.drop(columns=['DER_pt_h'])

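Because every event in the Jet 1 subset contains exactly one jet, the transverse momentum of the leading jet (`PRI_jet_leading_pt`) equals the total jet transverse momentum (`PRI_jet_all_pt`), so `PRI_jet_all_pt` adds no information and is dropped.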

In [17]:
# Check behind this drop (kept for reference, not executed):
#   (dfJet1['PRI_jet_all_pt'] - dfJet1['PRI_jet_leading_pt']).value_counts()
dfJet1 = dfJet1.drop(columns=['PRI_jet_num', 'PRI_jet_all_pt'])

In [18]:
# Two jets: per the original note, the total transverse momentum is just the
# sum of the two jets' momenta and adds no new information, so PRI_jet_all_pt
# is dropped together with PRI_jet_num.
dfJet2 = dfJet2.drop(columns=['PRI_jet_num', 'PRI_jet_all_pt'])
# Three jets: PRI_jet_all_pt is kept, since only here does it carry extra information.
dfJet3 = dfJet3.drop(columns=['PRI_jet_num'])

# Only the sign of DER_deltaeta_jet_jet is of interest, so the column is dropped for now.
dfJet2 = dfJet2.drop(columns='DER_deltaeta_jet_jet')
dfJet3 = dfJet3.drop(columns='DER_deltaeta_jet_jet')

In [22]:
# Report which columns still contain the missing-value sentinel.
def getMissing(DF):
    """Print the name of every column of ``DF`` that has at least one entry equal to -999.0.

    Names are printed one per line in column order; nothing is returned.
    """
    for name in DF.columns.tolist():
        if (DF[name] == -999.0).any():
            print(name)
# Run the check on each jet subset in turn (Jet 0 first, Jet 3 last).
for jet_frame in (dfJet0, dfJet1, dfJet2, dfJet3):
    getMissing(jet_frame)

DER_mass_MMC
DER_mass_MMC
DER_mass_MMC
DER_mass_MMC


In [28]:
# Label distribution among the events whose DER_mass_MMC is missing (-999.0).
# Finding from the run below: most of these events are background ('b') and
# only a few are signal ('s').
def labelDistribution(DF):
    """Print the ``Label`` value counts of the rows of ``DF`` whose ``DER_mass_MMC`` equals -999.0."""
    undefined_mass = DF['DER_mass_MMC'] == -999.0
    print(DF.loc[undefined_mass, 'Label'].value_counts())
# Print the distribution for each jet subset in turn (Jet 0 first, Jet 3 last).
for jet_frame in (dfJet0, dfJet1, dfJet2, dfJet3):
    labelDistribution(jet_frame)

b    24564
s     1559
Name: Label, dtype: int64
b    6857
s     705
Name: Label, dtype: int64
b    2485
s     467
Name: Label, dtype: int64
b    1373
s     104
Name: Label, dtype: int64
