# Energy Regression
This is an introduction, inspecting the dataset.
The dataset is a sample of single electrons, without pileup. The electrons have eta between 0.05 and 0.65 and an uncalibrated transverse energy less than 10 GeV.

In [None]:
# We will use RDataFrame to read the data in ROOT format, but we will use Pandas to manipulate and visualize it.

try:
    from ROOT import RDataFrame
except ImportError:  # this is for google colab where it is difficult to install ROOT
    RDataFrame = None
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [None]:
def read_data(fn, treename):
    """Load one tree of a ROOT file into a pandas DataFrame.

    The tree `treename` of the file `fn` is opened with RDataFrame, a few
    derived columns are defined on top of the stored branches, and the
    selected columns are converted to pandas. A column `el_erawOverEtrue`
    (`el_rawcl_E / el_truth_E`) is appended before returning.
    """
    # derived columns as (name, expression); the order matters because
    # el_f0 is computed from el_rawcl_E, which is itself derived
    derived_columns = (
        ('el_truth_pT', 'el_truth_E / cosh(el_truth_eta)'),
        ('el_cl_aeta', 'abs(el_cl_eta)'),
        ('el_rawcl_E', 'el_rawcl_Es0 + el_rawcl_Es1 + el_rawcl_Es2 + el_rawcl_Es3'),
        ('el_f0', 'el_rawcl_Es0 / el_rawcl_E'),
    )

    # the columns exported to pandas
    columns = [
        'el_rawcl_Es0', 'el_rawcl_Es1', 'el_rawcl_Es2', 'el_rawcl_Es3',  # energy in the calorimeter layers
        'el_rawcl_E',  # the sum
        'el_cl_aeta',  # |eta|
        'el_f0',  # PS fraction
        'el_truth_E', 'el_truth_pT',  # truth info
    ]

    rdf = RDataFrame(treename, fn)
    for name, expression in derived_columns:
        rdf = rdf.Define(name, expression)

    arrays = rdf.AsNumpy(columns)  # dictionary: column name -> numpy array

    frame = pd.DataFrame(arrays)
    # response of the uncalibrated energy with respect to the truth
    frame['el_erawOverEtrue'] = frame['el_rawcl_E'] / frame['el_truth_E']
    return frame

if RDataFrame is not None:  # if we have ROOT installed
    fn = "http://rgw.fisica.unimi.it/TutorialML-AtlasItalia2022/MVACalib_electron_Et0-10_eta1.0-1.2_Eaccordion.root?AWSAccessKeyId=M06HBTUGIKXVXYH1RES6&Signature=Z6bHqOv%2FLgnNTDOfAai5%2F11x50Y%3D&Expires=1828737990"
    df_train = read_data(fn, 'TrainTree')
    df_test = read_data(fn, 'TestTree')

    # keep a CSV copy of both samples, usable where ROOT is not available
    df_train.to_csv('train_electron_Et0-10_eta1.0-1.2_Eaccordion.csv')
    df_test.to_csv('test_electron_Et0-10_eta1.0-1.2_Eaccordion.csv')
else:
    # Without ROOT, read the samples from CSV files instead.
    # to_csv (as called above) also writes the row index as an unnamed first
    # column, which read_csv loads back as a data column called 'Unnamed: 0'.
    # Drop it so that both branches give dataframes with the same columns;
    # errors='ignore' makes this a no-op if the file has no such column.
    # NOTE(review): presumably the hosted CSV files were produced by the
    # to_csv calls above -- confirm.
    df_train = pd.read_csv('http://rgw.fisica.unimi.it/TutorialML-AtlasItalia2022/train_electron_Et0-10_eta1.0-1.2_Eaccordion.csv?AWSAccessKeyId=M06HBTUGIKXVXYH1RES6&Signature=U%2BMJxVi5El1wxtCz%2B45VqLmUuok%3D&Expires=1828739034')
    df_train = df_train.drop(columns='Unnamed: 0', errors='ignore')
    df_test = pd.read_csv('http://rgw.fisica.unimi.it/TutorialML-AtlasItalia2022/test_electron_Et0-10_eta1.0-1.2_Eaccordion.csv?AWSAccessKeyId=M06HBTUGIKXVXYH1RES6&Signature=YKG4lzc%2FI0%2BcJZRnQG350DnVVK4%3D&Expires=1828739085')
    df_test = df_test.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# number of rows (one per electron) in the training dataframe
print(len(df_train))

In [None]:
# show the first rows of the training dataframe
df_train.head()

In [None]:
# summary statistics of every column of the training dataframe
df_train.describe()

The energy is quite small: this is the regime where the fraction of energy lost is usually relatively large.

In [None]:
# distribution of the truth transverse momentum in the training sample
df_train.hist(column='el_truth_pT', bins=100, grid=False)
plt.show()

Looking at the ratio between the uncalibrated energy (raw energy) and the truth energy, we see a quite broad distribution, peaked around 0.9. The distribution becomes broader at low energy, as expected, but it also gets shifted. The calibration will try to learn this kind of dependency as a function of the considered variables.

In [None]:
fig, ax = plt.subplots(figsize=(10, 6))

# find the quantiles that divide the sample in three bins in el_rawcl_E with the same population
el_rawcl_E_edges = df_train['el_rawcl_E'].quantile(np.linspace(0, 1, 3 + 1))
# round the edges to the nearest 1000 to get readable labels in the legend
# (after rounding the bins are only approximately equally populated)
el_rawcl_E_edges = np.round(el_rawcl_E_edges, -3)

# same binning for all the histograms, so that they can be compared directly
ratio_bins = np.linspace(0.4, 1.2, 100)

df_train.hist('el_erawOverEtrue', bins=ratio_bins, grid=False, ax=ax, density=True, alpha=0.5, label='inclusive')
# observed=True: group only on the energy intervals that actually contain
# events; passing it explicitly also avoids the pandas warning about the
# changing default of `observed` for categorical groupers
for k, v in df_train.groupby(pd.cut(df_train['el_rawcl_E'], el_rawcl_E_edges), observed=True):
    # k is the energy interval of the group, v the events falling into it
    label = '{:6.0f}-{:6.0f}'.format(k.left, k.right)
    v.hist('el_erawOverEtrue', bins=ratio_bins, grid=False, ax=ax, density=True, histtype='step', lw=2, label=label)
ax.set_xlabel(r'$E_{raw}/E_{true}$', fontsize=20)
ax.axvline(1, ls='--', color='0.3')  # reference line at perfect response
ax.legend(loc=0, title='el_rawcl_E range')

Have a look at how the variables are correlated, in particular with the truth energy.

In [None]:
from corner import corner

# contour levels 1 - exp(-k^2 / 2) for k = 0.5, 1.0, ..., 4.0
n_sigma = np.arange(0.5, 4.1, 0.5)
contour_levels = 1.0 - np.exp(-0.5 * n_sigma ** 2)

# pairwise distributions of all the columns of the training dataframe
corner(df_train, labels=df_train.columns, levels=contour_levels)
plt.show()

A closer look at the correlation. The left plot shows $P[\text{response}, X]$, the right plot $P[\text{response}|X]$.

In [None]:
xvars = ['el_f0', 'el_cl_aeta', 'el_rawcl_E']

for xvar in xvars:
    # x binning from the quantiles of the variable; uniform binning for the response
    xedges = df_train[xvar].quantile(np.linspace(0, 1, 40))
    yedges = np.linspace(0.6, 1., 40)
    # `density=True` replaces the `normed=True` keyword, which has been removed
    # from np.histogram2d in recent NumPy versions (passing it raises a TypeError)
    h, xedges, yedges = np.histogram2d(df_train[xvar], df_train['el_erawOverEtrue'], bins=(xedges, yedges), density=True)
    fig, axs = plt.subplots(1, 2, figsize=(15, 5))
    # left: joint distribution (h is indexed as [x bin, y bin], pcolormesh wants [y, x])
    axs[0].pcolormesh(xedges, yedges, h.T)

    # right: divide every x bin (row of h) by its sum over the y bins, so that
    # each x bin shows the distribution of the response in that bin
    hcond = (h.T / h.sum(axis=1)).T
    axs[1].pcolormesh(xedges, yedges, hcond.T, vmax=0.06)

    axs[0].set_title('P[%s, el_erawOverEtrue]' % xvar, fontsize=15)
    axs[1].set_title('P[el_erawOverEtrue | %s]' % xvar, fontsize=15)
    for ax in axs:
        ax.set_xlabel(xvar)
        ax.set_ylabel('el_erawOverEtrue')

Plot the mean on the y-axis

In [None]:
xvars = ['el_f0', 'el_cl_aeta', 'el_rawcl_E']

# the same quantile grid is used to bin every variable
quantile_grid = np.linspace(0, 1, 100)

for xvar in xvars:
    # bin edges at the quantiles of the variable, and the centre of each bin
    xedges = df_train[xvar].quantile(quantile_grid).to_numpy()
    xbins_midpoints = (xedges[:-1] + xedges[1:]) / 2

    # mean of the response and its standard error in each bin of xvar
    bin_index = np.digitize(df_train[xvar], xedges)
    df_agg = df_train['el_erawOverEtrue'].groupby(bin_index).agg(['mean', 'sem'])
    # keep only the bins between the first and the last edge (indices 1 ... number of bins)
    df_agg = df_agg.reindex(np.arange(1, xedges.size))
    df_agg.index = xbins_midpoints

    fig, ax = plt.subplots()
    ax.errorbar(df_agg.index, df_agg['mean'], yerr=df_agg['sem'], fmt='.')
    ax.set_xlabel(xvar, fontsize=15)
    ax.set_ylabel('Mean[el_erawOverEtrue]')
    ax.set_ylim(0.65, 0.85)

What we want to do is to train an algorithm to fit the energy correction

$$k = (E_{raw}/E_{true})$$

once we have it the estimated energy will be

$$ E_{raw} / k$$

We don't try to fit $E_{true}$ directly, so that the algorithm already knows that there is a strong (and quite linear) dependency between the raw energy and the truth energy.