# Notebook for LST1 training and storage of Random Forests 


## Some imports

In [None]:
import numpy as np                                                                                            
import pandas as pd                                                                                                                                                                                      
import matplotlib.pyplot as plt                                                                               
import joblib  

from lstchain.reco.dl1_to_dl2 import train_energy, train_disp_vector, train_sep, apply_models  
from lstchain.visualization import plot_dl2     
from lstchain.reco import utils, disp

from sklearn.model_selection import train_test_split
from lstchain.tests.test_lstchain import dl1_params_lstcam_key
from lstchain.io.config import get_standard_config

%matplotlib inline
# Notebook-wide matplotlib defaults: large figures and a readable font size.
plt.rcParams['figure.figsize'] = (20, 15)
plt.rcParams['font.size'] = 14

## Get event DL1 files for training

    
We need two files, one for **gammas** and one for **protons**. The gamma file used here is a diffuse-gamma sample.

In [None]:
# Paths to the DL1 Monte Carlo input files (relative paths, so they depend on
# the directory the notebook is run from).
gammafile = "../data/mc/dl1/gamma-diffuse/dl1_gamma_20deg_180deg_run99___cta-prod5-lapalma_4LSTs_MAGIC_desert-2158m_mono_cone6.h5"
protonfile = "../data/mc/dl1/proton/dl1_proton_20deg_180deg_run99___cta-prod5-lapalma_4LSTs_MAGIC_desert-2158m_mono.h5"

<font size="4">
We read the files as pandas dataframes:

In [None]:
# Load the DL1 parameter table of each sample into a pandas DataFrame.
df_gammas, df_proton = (
    pd.read_hdf(path, key=dl1_params_lstcam_key)
    for path in (gammafile, protonfile)
)

# Event selection taken from the standard lstchain configuration.
config = get_standard_config()
events_filters = config["events_filters"]


def _apply_standard_filters(events):
    """Filter `events` with the configured cuts, requiring every training feature to be finite."""
    return utils.filter_events(
        events,
        filters=events_filters,
        finite_params=config['regression_features'] + config['classification_features'],
    )


df_gammas = _apply_standard_filters(df_gammas)
df_proton = _apply_standard_filters(df_proton)

# Display the columns available in the filtered gamma table.
df_gammas.keys()

<font size="4">
From all the previous information, we choose certain features to train the Random Forests. We choose the standard ones.

In [None]:
# Features used by the regressors, as listed in the standard configuration.
features = config['regression_features']
features  # displayed as the cell output

<font size="4">
Now we must split the data into train and test sets. 
Gamma events will train energy and direction reconstruction, and gamma/hadron separation, but protons are only used for separation.


## Train the Reconstruction


We train two Random Forest Regressors, from scikit-learn, to reconstruct "energy" and "disp" of the **test** set.

In [None]:
# Train the regressors for energy and disp-vector reconstruction on the full
# gamma sample (ONLY gammas). These are the models saved and applied later on.

RFreg_Energy = train_energy(df_gammas)

RFreg_Disp = train_disp_vector(df_gammas)


# Split the gammas and train temporary regressors on the training part only.
# They are used below to add reconstructed energy/disp to the events on which
# the gamma/hadron classifier is trained.
traing, testg = train_test_split(df_gammas, test_size=0.2)

temp_reg_energy = train_energy(traing)
temp_reg_disp_vector = train_disp_vector(traing)


We can now predict the **energy** and **disp** of the test events, and from **disp**, calculate the reconstructed direction.

In [None]:

# Protons are only used for testing when training the energy/direction reco,
# so all of them are joined to the held-out gammas.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and also yields a fresh 0..n-1 index with ignore_index=True.
test = pd.concat([testg, df_proton], ignore_index=True)


# Apply the temporary regressors to the test set

test['log_reco_energy'] = temp_reg_energy.predict(test[features])
test['reco_energy'] = np.power(10, test['log_reco_energy'])

disp_vector = temp_reg_disp_vector.predict(test[features])
# Assign the two predicted components positionally as plain arrays; wrapping
# them in a DataFrame would make the assignment depend on index alignment.
test['reco_disp_dx'] = disp_vector[:, 0]
test['reco_disp_dy'] = disp_vector[:, 1]


In [None]:
# Turn the reconstructed disp vector and the image position (x, y) into a
# reconstructed source position, stored as two new columns of `test`.
reco_src_x, reco_src_y = disp.disp_to_pos(
    test['reco_disp_dx'],
    test['reco_disp_dy'],
    test['x'],
    test['y'],
)
test['reco_src_x'] = reco_src_x
test['reco_src_y'] = reco_src_y

We use these test events with reconstructed energy and direction to 
    **train the gamma/hadron separation.**
<br>
We add these two features to the list of features for training:

In [None]:
# Separation features: a copy of the regression features extended with the
# reconstructed quantities computed above.
features_sep = [
    *features,
    'log_reco_energy',
    'reco_disp_dx',
    'reco_disp_dy',
]
features_sep

In [None]:
# Check which columns the test table contains after adding the reconstructed quantities.
test.keys()


**Train the gamma/hadron classifier:**
<br>
Now we train a scikit-learn **RandomForestClassifier** which will separate events in two classes: 0 for **gammas** and 1 for **protons**. We call this parameter **hadroness**.


In [None]:
# Train the gamma/hadron classifier on the `test` table built above
# (held-out gammas + all protons, with reconstructed energy and disp added).
RFcls_GH = train_sep(test)


<font size="4">
Predict the hadroness of the test events:

In [None]:
# Predicted class label for every event in `test`.
# NOTE(review): these are the same events the classifier was trained on, so
# this is not an independent performance estimate.
# NOTE(review): assumes `features_sep` matches the feature list train_sep
# used internally — confirm against config['classification_features'].
test['reco_type'] = RFcls_GH.predict(test[features_sep])

In [None]:
# Gammaness as the complement of the predicted label.
# Assumes reco_type is 0 for gammas and 1 for protons, as described in the
# text above — TODO confirm against the labels train_sep actually uses.
test['gammaness'] = 1 - test['reco_type'] 

## Save the Random Forests:
<br>
We can save these trained RF into files to apply them later on any set of data:

In [None]:
# Output file names (written to the current working directory).
fileE = "RFreg_Energy.sav"                                                           
fileD = "RFreg_Disp.sav"                                                             
fileH = "RFcls_GH.sav"                                                               
# Persist the regressors trained on the full gamma sample and the classifier.
joblib.dump(RFreg_Energy, fileE)                                                                      
joblib.dump(RFreg_Disp, fileD)                                                                        
joblib.dump(RFcls_GH, fileH)


# Now we can plot some results
<br>
As input we take the corresponding testing sample from the same RF folder. 
Applying the trained RFs to it gives the DL2 test table, `test_dl2`.

In [None]:

# Separate gamma testing sample, read and filtered with the same event
# filters and finite-value requirements as the training tables.
gammafile_test = "../data/mc/dl1/gamma/dl1_gamma_south_pointing_20201017_v0.6.3_prod5_LST1_local_off0.0deg_DL1_testing.h5"
finite_columns = config['regression_features'] + config['classification_features']
df_gammas_test = utils.filter_events(
    pd.read_hdf(gammafile_test, key=dl1_params_lstcam_key),
    filters=events_filters,
    finite_params=finite_columns,
)

# Run the trained classifier and regressors on the testing sample.
test_dl2 = apply_models(df_gammas_test, RFcls_GH, RFreg_Energy, RFreg_Disp)


### Distribution of features

In [None]:
# Feature distributions of the DL2 testing sample.
plot_dl2.plot_features(test_dl2)

## Energy reconstruction

In [None]:
# Energy-reconstruction plots for the DL2 testing sample.
plot_dl2.plot_energy_resolution(test_dl2)

## Disp reconstruction

In [None]:
# Disp-reconstruction plots for the DL2 testing sample.
plt.rcParams['agg.path.chunksize'] = 10000   # sometimes complains and it needs this
plot_dl2.plot_disp(test_dl2)

## Source position in camera coordinates

In [None]:
# Source-position plots for the DL2 testing sample.
plot_dl2.plot_pos(test_dl2)

## Importance of features for Gamma/Hadron separation

In [None]:
# Feature importances of the gamma/hadron classifier, labelled with `features_sep`.
# NOTE(review): assumes `features_sep` lists the classifier's training features in order — confirm.
plot_dl2.plot_importances(RFcls_GH,features_sep)

## ROC curve

In [None]:
# ROC curve computed on the `test` table (gammas + protons).
# Note that this table was also used to train the classifier.
plot_dl2.plot_roc_gamma(test)


# Note on mono analysis

Mono analysis is very limited, especially at low energies, where it is much more difficult to separate gammas from hadrons without stereo information. If we discard low-energy events, we can see that the performance improves.
<br>
<br>
For example, we can cut at **500 GeV:**

In [None]:
cut = 0.5  # TeV

# The threshold is converted to log10 so it can be compared directly with
# the 'log_reco_energy' column.
e_cut = np.log10(cut)
above_cut = test_dl2['log_reco_energy'] > e_cut
test_dl2_cut = test_dl2[above_cut]

In [None]:
# Energy-reconstruction plots after the reconstructed-energy cut.
plot_dl2.plot_energy_resolution(test_dl2_cut)

In [None]:
# Disp-reconstruction plots after the reconstructed-energy cut.
plot_dl2.plot_disp(test_dl2_cut)

In [None]:
# Source-position plots after the reconstructed-energy cut.
plot_dl2.plot_pos(test_dl2_cut)