# Exploratory Data Analysis for datasets
--- 

Aim 
* Understand structure of datasets, variables and where things are.

There are 11 files supplied in total, which are in the directory `./data/`.

Only the `.pkl` files have been downloaded, which are still approximately 1.7 GB in size. 

GitHub blocks files larger than 100 MB, so the data files have not been committed to the repository.

In [2]:
import pandas as pd
import os

In [8]:
# Gather the name of every pickle file found in the data directory.
files = [name for name in os.listdir('data/') if name.endswith('.pkl')]

# Show the collected names together with how many there are.
print(files, len(files))

['total_dataset.pkl', 'jpsi_mu_k_swap.pkl', 'psi2S.pkl', 'jpsi_mu_pi_swap.pkl', 'phimumu.pkl', 'pKmumu_piTop.pkl', 'signal.pkl', 'pKmumu_piTok_kTop.pkl', 'k_pi_swap.pkl', 'jpsi.pkl', 'acceptance_mc.pkl'] 11


The number of entries varies between datasets; one dataset contains only 773 events. Will such a small sample be a problem when training an ML algorithm?

In [None]:
# Build a per-file summary of row counts.
#
# The records are collected first and the frame is constructed once:
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# re-copies it on every iteration.
# NOTE: pd.read_pickle can execute arbitrary code while unpickling, so only
# load files from a trusted source.
records = [
    {'filename': file, 'size': len(pd.read_pickle('data/' + file))}
    for file in files
]
# Passing `columns` keeps both columns present even when `files` is empty.
summary = pd.DataFrame(records, columns=['filename', 'size'])
summary

# Total Dataset
* The `total_dataset.pkl` is the LHCb data to analyse.


In [21]:
# Load the LHCb data to analyse and display it as the cell output.
total = pd.read_pickle('data/total_dataset.pkl')
total

Unnamed: 0,mu_plus_MC15TuneV1_ProbNNk,mu_plus_MC15TuneV1_ProbNNpi,mu_plus_MC15TuneV1_ProbNNmu,mu_plus_MC15TuneV1_ProbNNe,mu_plus_MC15TuneV1_ProbNNp,mu_plus_P,mu_plus_PT,mu_plus_ETA,mu_plus_PHI,mu_plus_PE,...,B0_OWNPV_Y,B0_OWNPV_Z,B0_FD_OWNPV,B0_ID,q2,phi,costhetal,costhetak,polarity,year
0,0.000303,0.282979,0.966269,2.708744e-06,2.358479e-05,22529.217656,3371.873364,2.586844,-2.463601,22529.465415,...,-0.2076,103.8536,2.377813,-511,3.627847,0.687398,-0.467658,0.992306,1,2016
1,0.020258,0.003956,0.998035,8.310519e-06,2.909099e-03,161024.220000,8534.230892,3.629914,-1.540762,161024.254665,...,-0.1812,-32.1607,6.161559,-511,13.718153,1.989341,0.814684,0.110453,-1,2016
2,0.088214,0.007898,0.998085,1.014832e-05,3.530469e-03,109393.330000,5710.313234,3.645141,0.265732,109393.381025,...,-0.1998,40.6020,47.151129,-511,9.567142,-1.738231,0.242143,-0.598537,1,2016
3,0.000404,0.016088,0.990623,3.606953e-06,1.098000e-05,13815.260342,1236.546436,3.104590,1.340393,13815.664371,...,-0.1859,-31.5099,4.679321,-511,18.784472,-0.041052,-0.832555,-0.497081,1,2016
4,0.000061,0.059106,0.997773,1.061150e-06,2.045247e-06,40205.947339,2979.715433,3.293958,-0.438803,40206.086170,...,-0.2535,-44.0725,4.054877,511,9.541409,-0.084586,-0.006600,0.294391,1,2016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
498240,0.040453,0.001366,0.997394,1.393092e-03,1.167161e-02,211552.172468,7447.801664,4.039390,1.877470,211552.198853,...,-0.1246,39.0764,10.975222,511,7.739509,-1.323554,0.598088,-0.040562,1,2016
498241,0.000047,0.020918,0.998482,1.134924e-05,6.563242e-07,40338.590000,2968.512043,3.301038,-2.905452,40338.728375,...,-0.1814,-76.8413,1.959396,511,9.534103,2.481835,0.410797,-0.204429,1,2016
498242,0.011117,0.139928,0.991624,8.119469e-07,2.001283e-03,59256.560000,2274.457017,3.952914,1.632646,59256.654198,...,-0.1658,-46.5753,19.827259,-511,13.524815,-2.516108,-0.527713,0.994013,-1,2016
498243,0.014127,0.357343,0.758277,5.019459e-05,4.491047e-02,22014.330000,1115.553871,3.674847,-0.069939,22014.583554,...,-0.2319,-63.5059,5.819832,511,13.086508,0.965240,0.033597,-0.533498,-1,2016


In [29]:
# Show the column labels of the total dataset.
total.columns

Index(['mu_plus_MC15TuneV1_ProbNNk', 'mu_plus_MC15TuneV1_ProbNNpi',
       'mu_plus_MC15TuneV1_ProbNNmu', 'mu_plus_MC15TuneV1_ProbNNe',
       'mu_plus_MC15TuneV1_ProbNNp', 'mu_plus_P', 'mu_plus_PT', 'mu_plus_ETA',
       'mu_plus_PHI', 'mu_plus_PE', 'mu_plus_PX', 'mu_plus_PY', 'mu_plus_PZ',
       'mu_plus_IPCHI2_OWNPV', 'mu_minus_MC15TuneV1_ProbNNk',
       'mu_minus_MC15TuneV1_ProbNNpi', 'mu_minus_MC15TuneV1_ProbNNmu',
       'mu_minus_MC15TuneV1_ProbNNe', 'mu_minus_MC15TuneV1_ProbNNp',
       'mu_minus_P', 'mu_minus_PT', 'mu_minus_ETA', 'mu_minus_PHI',
       'mu_minus_PE', 'mu_minus_PX', 'mu_minus_PY', 'mu_minus_PZ',
       'mu_minus_IPCHI2_OWNPV', 'K_MC15TuneV1_ProbNNk',
       'K_MC15TuneV1_ProbNNpi', 'K_MC15TuneV1_ProbNNmu',
       'K_MC15TuneV1_ProbNNe', 'K_MC15TuneV1_ProbNNp', 'K_P', 'K_PT', 'K_ETA',
       'K_PHI', 'K_PE', 'K_PX', 'K_PY', 'K_PZ', 'K_IPCHI2_OWNPV',
       'Pi_MC15TuneV1_ProbNNk', 'Pi_MC15TuneV1_ProbNNpi',
       'Pi_MC15TuneV1_ProbNNmu', 'Pi_MC15TuneV1_ProbNN

# Standard Model Signal

* `signal.pkl` - The signal decay, simulated as per the Standard Model

In [41]:
# Load the simulated Standard Model signal sample and display it.
sig = pd.read_pickle('data/signal.pkl')
sig

Unnamed: 0,mu_plus_MC15TuneV1_ProbNNk,mu_plus_MC15TuneV1_ProbNNpi,mu_plus_MC15TuneV1_ProbNNmu,mu_plus_MC15TuneV1_ProbNNe,mu_plus_MC15TuneV1_ProbNNp,mu_plus_P,mu_plus_PT,mu_plus_ETA,mu_plus_PHI,mu_plus_PE,...,costhetak,B0_IPCHI2_OWNPV,B0_DIRA_OWNPV,B0_OWNPV_X,B0_OWNPV_Y,B0_OWNPV_Z,B0_FD_OWNPV,B0_ID,polarity,year
0,0.001626,0.368621,0.965144,2.496697e-06,6.789551e-05,28796.58,2531.302013,3.122733,-2.023842,28796.773836,...,-0.885188,4.304049,0.999999,0.7708,-0.2054,40.5605,27.879712,511,1,2016
1,0.000075,0.008453,0.998412,5.320784e-06,1.026381e-06,24673.47,2274.749879,3.074874,-1.014814,24673.696228,...,-0.133953,1.159875,0.999999,0.7926,-0.1152,-1.7938,12.678658,-511,1,2016
2,0.000051,0.007129,0.998914,6.864172e-06,1.078601e-06,28464.70,3379.198551,2.820631,-2.119273,28464.896096,...,0.741662,4.786264,0.999997,0.8396,-0.2445,53.8937,12.155244,511,1,2016
3,0.006548,0.088188,0.949859,7.272984e-06,5.321462e-03,11540.26,1048.238326,3.089809,-0.829094,11540.743674,...,0.150085,2.156253,1.000000,0.8143,-0.1723,32.0894,21.083577,-511,1,2016
4,0.000130,0.041185,0.992115,3.291775e-06,4.089373e-06,8605.87,1542.434031,2.404101,-2.030454,8606.518585,...,-0.998332,1.590714,0.999938,0.8220,-0.1640,-9.6481,1.710604,511,1,2016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
255736,0.000333,0.108626,0.987444,1.502582e-06,1.870762e-05,23313.42,2234.277666,3.035954,-0.162299,23313.659425,...,-0.611537,2.645413,1.000000,0.8647,-0.2082,27.0830,28.458499,-511,1,2016
255737,0.000133,0.251615,0.984600,3.506863e-07,2.739955e-06,24766.97,2118.363086,3.150180,2.675359,24767.195374,...,-0.164196,0.238440,1.000000,0.8580,-0.1066,-26.6170,6.922365,-511,1,2016
255738,0.142958,0.057447,0.992398,2.708243e-06,8.296998e-04,73885.16,10205.654608,2.667913,-3.125113,73885.235548,...,-0.517692,4.281706,1.000000,0.8241,-0.2009,-18.6741,28.255551,-511,1,2016
255739,0.000063,0.009217,0.996697,4.489019e-06,6.358561e-07,30614.89,2465.913356,3.210446,1.116355,30615.072324,...,0.375601,5.515809,0.999995,0.8456,-0.1410,-18.2321,7.737330,511,1,2016


# Background simulation samples

* `jpsi.pkl` - <img src="https://latex.codecogs.com/gif.latex?B^{0}\rightarrow{}J/\psi{}K^{\ast{}0} " /> with <img src="https://latex.codecogs.com/gif.latex?J/\psi\rightarrow\mu\mu " />
* `psi2S.pkl` - <img src="https://latex.codecogs.com/gif.latex?B^{0}\rightarrow{}\psi{}(2S)K^{\ast{}0} " /> with <img src="https://latex.codecogs.com/gif.latex?\psi{}(2S)\rightarrow\mu\mu " />
* `jpsi_mu_k_swap.pkl` - <img src="https://latex.codecogs.com/gif.latex?B^{0}\rightarrow{}J/\psi{}K^{\ast{}0} " /> with the muon reconstructed as kaon and the kaon reconstructed as a muon
* `jpsi_mu_pi_swap.pkl` - <img src="https://latex.codecogs.com/gif.latex?B^{0}\rightarrow{}J/\psi{}K^{\ast{}0} " /> with the muon reconstructed as pion and the pion reconstructed as a muon
* `k_pi_swap.pkl` - signal decay but with the kaon reconstructed as a pion and the pion reconstructed as a kaon
* `phimumu.pkl` - <img src="https://latex.codecogs.com/gif.latex?B_{s}^{0}\rightarrow{}\phi\mu\mu " /> with <img src="https://latex.codecogs.com/gif.latex?\phi{}\rightarrow{}KK " /> and one of the kaons reconstructed as a pion
* `pKmumu_piTok_kTop.pkl` - <img src="https://latex.codecogs.com/gif.latex?\Lambda_{b}^{0}\rightarrow{}pK\mu\mu " /> with the proton reconstructed as a kaon and the kaon reconstructed as a pion
* `pKmumu_piTop.pkl`  - <img src="https://latex.codecogs.com/gif.latex?\Lambda_{b}^{0}\rightarrow{}pK\mu\mu " /> with the proton reconstructed as a pion

In [31]:
# Load the acceptance_mc sample and display it. Its column index is kept in
# `cols` so it can be compared against `total.columns` further down.
acc = pd.read_pickle('data/acceptance_mc.pkl')
cols = acc.columns
acc

Unnamed: 0,mu_plus_MC15TuneV1_ProbNNk,mu_plus_MC15TuneV1_ProbNNpi,mu_plus_MC15TuneV1_ProbNNmu,mu_plus_MC15TuneV1_ProbNNe,mu_plus_MC15TuneV1_ProbNNp,mu_plus_P,mu_plus_PT,mu_plus_ETA,mu_plus_PHI,mu_plus_PE,...,costhetak,B0_IPCHI2_OWNPV,B0_DIRA_OWNPV,B0_OWNPV_X,B0_OWNPV_Y,B0_OWNPV_Z,B0_FD_OWNPV,B0_ID,polarity,year
0,0.000045,0.039243,0.999345,8.361805e-07,0.000001,47515.45,3035.832415,3.442694,-2.097649,47515.567474,...,-0.325254,0.706831,1.000000,0.8757,-0.1507,-67.1291,39.395123,511,1,2016
1,0.001295,0.118154,0.992041,2.135627e-07,0.000328,7465.87,1400.820724,2.357511,-0.076334,7466.617611,...,-0.929107,0.766567,0.999973,0.8690,-0.1879,-16.4842,3.848004,-511,1,2016
2,0.018971,0.052269,0.993025,1.504784e-05,0.029875,139095.95,4196.451282,4.193844,-0.511106,139095.990129,...,-0.539335,1.170415,1.000000,0.8587,-0.2014,15.6514,31.008944,511,1,2016
3,0.000231,0.259635,0.971924,6.076813e-05,0.000147,16229.29,2946.570852,2.390978,1.386500,16229.633933,...,0.379058,6.179803,0.999977,0.7812,-0.1659,16.0543,6.435185,-511,1,2016
4,0.005569,0.315182,0.973565,1.367964e-06,0.003432,24957.42,1914.807750,3.259226,-0.731608,24957.643654,...,0.105453,0.620364,1.000000,0.8997,-0.1221,-50.4380,11.638834,511,1,2016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
716854,0.000062,0.003175,0.999322,2.309370e-06,0.000001,21752.35,2737.286994,2.761919,-0.916564,21752.606607,...,-0.679964,0.145360,1.000000,0.8721,-0.1452,-29.0464,5.605457,511,1,2016
716855,0.000737,0.031372,0.995584,6.899959e-06,0.000049,34724.83,2550.932812,3.302791,1.815204,34724.990745,...,0.104648,0.832383,1.000000,0.8011,-0.2018,-10.7847,13.046422,511,1,2016
716856,0.729794,0.035291,0.725008,1.966208e-05,0.361820,4687.27,1431.843156,1.854845,0.050745,4688.460701,...,0.315782,0.507845,1.000000,0.8349,-0.1570,-0.9651,20.143577,-511,1,2016
716857,0.000955,0.196032,0.940839,8.133744e-06,0.000119,48537.30,2644.574006,3.602227,-2.935226,48537.415001,...,-0.682951,0.631402,0.999998,0.8648,-0.1829,52.1866,7.341194,511,1,2016


In [9]:
# Read signal.pkl again (the same file that is loaded into `sig` above) and
# show its column labels.
df = pd.read_pickle('data/signal.pkl')
df.columns

Index(['mu_plus_MC15TuneV1_ProbNNk', 'mu_plus_MC15TuneV1_ProbNNpi',
       'mu_plus_MC15TuneV1_ProbNNmu', 'mu_plus_MC15TuneV1_ProbNNe',
       'mu_plus_MC15TuneV1_ProbNNp', 'mu_plus_P', 'mu_plus_PT', 'mu_plus_ETA',
       'mu_plus_PHI', 'mu_plus_PE', 'mu_plus_PX', 'mu_plus_PY', 'mu_plus_PZ',
       'mu_plus_IPCHI2_OWNPV', 'mu_minus_MC15TuneV1_ProbNNk',
       'mu_minus_MC15TuneV1_ProbNNpi', 'mu_minus_MC15TuneV1_ProbNNmu',
       'mu_minus_MC15TuneV1_ProbNNe', 'mu_minus_MC15TuneV1_ProbNNp',
       'mu_minus_P', 'mu_minus_PT', 'mu_minus_ETA', 'mu_minus_PHI',
       'mu_minus_PE', 'mu_minus_PX', 'mu_minus_PY', 'mu_minus_PZ',
       'mu_minus_IPCHI2_OWNPV', 'K_MC15TuneV1_ProbNNk',
       'K_MC15TuneV1_ProbNNpi', 'K_MC15TuneV1_ProbNNmu',
       'K_MC15TuneV1_ProbNNe', 'K_MC15TuneV1_ProbNNp', 'K_P', 'K_PT', 'K_ETA',
       'K_PHI', 'K_PE', 'K_PX', 'K_PY', 'K_PZ', 'K_IPCHI2_OWNPV',
       'Pi_MC15TuneV1_ProbNNk', 'Pi_MC15TuneV1_ProbNNpi',
       'Pi_MC15TuneV1_ProbNNmu', 'Pi_MC15TuneV1_ProbNN

---
# Investigating the variables

Refer to the documentation for the variable meanings.

The dataframe columns appear in a different order between files, but all of them are present.

In [37]:
# Walk the acceptance-MC column index position by position and print each
# pair of labels where it disagrees with the total dataset's column index.
total_cols = total.columns
for position, acc_name in enumerate(cols):
    total_name = total_cols[position]
    if acc_name != total_name:
        print(acc_name, total_name)

q2 B0_IPCHI2_OWNPV
phi B0_DIRA_OWNPV
costhetal B0_OWNPV_X
costhetak B0_OWNPV_Y
B0_IPCHI2_OWNPV B0_OWNPV_Z
B0_DIRA_OWNPV B0_FD_OWNPV
B0_OWNPV_X B0_ID
B0_OWNPV_Y q2
B0_OWNPV_Z phi
B0_FD_OWNPV costhetal
B0_ID costhetak


In [38]:
# Number of columns in the acceptance-MC index and in the total dataset.
len(cols), len(total.columns)

(81, 81)