In [None]:
from sklearn.model_selection import train_test_split
import xgboost as xgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import uproot
import pickle
import nue_booster
import importlib
importlib.reload(nue_booster)

import awkward

In [None]:
# Settings module that provides the path prefixes used in this notebook
# (ntuple_path, pickle_path and plots_path).
import localSettings as ls
# Show which ntuple location is configured before any file is opened.
print(ls.ntuple_path)

In [None]:
# train both with RUN3 and RUN1?
# When True, the Run 1 files are opened, split and concatenated with the
# Run 3 samples; when False only the Run 3 samples are used.
TRAINBOTH = True

In [None]:

# Branches read from every input ntuple, grouped by purpose.
VARLOAD = [
    # track-fitted shower
    "shr_tkfit_npointsvalid", "shr_tkfit_npoints",
    # sub-clusters for shower
    "shrsubclusters0", "shrsubclusters1", "shrsubclusters2",
    # PID for proton candidate
    "trk_llr_pid_score_v",
    # moliere metrics for shower
    "shrmoliereavg", "shrmoliererms",
    # 2D distance shower hits to track hits
    "trkshrhitdist2",
    # candidate track index and energy sums
    "trk_id", "trk_energy_tot", "shr_energy_tot_cali",
    # track/shower topology and shower dE/dx on the three planes
    "tksh_distance",
    "shr_tkfit_dedx_Y", "shr_tkfit_dedx_U", "shr_tkfit_dedx_V",
    "tksh_angle", "n_showers_contained", "shr_score",
    # truth variables
    "nu_e", "slpdg",
    "category", "selected",
    # CRT quantities (disabled)
    # "crtveto", "crthitpe", "_closestNuCosmicDist",
    "hits_ratio", "slclustfrac", "shr_energy", "shr_theta",
    "CosmicIP", "n_tracks_contained",
    # second-shower tagging variables
    "secondshower_Y_nhit", "secondshower_Y_vtxdist",
    "secondshower_Y_dot", "secondshower_Y_dir", "shrclusdir2",
    "secondshower_V_nhit", "secondshower_V_vtxdist",
    "secondshower_V_dot", "secondshower_V_dir", "shrclusdir1",
    "secondshower_U_nhit", "secondshower_U_vtxdist",
    "secondshower_U_dot", "secondshower_U_dir", "shrclusdir0",
    # pi0 tagging variables
    "pi0_radlen1", "pi0_radlen2", "pi0_energy2_Y",
    "pi0_dedx2_fit_Y", "pi0_mass_Y", "pi0_gammadot",
]

In [None]:
# Directory and tree name of the selection ntuple inside each ROOT file.
fold = "nuselection"
tree = "NeutrinoSelectionFilter"

# Run 3 input files
EXT3 = 'data_extbnb_mcc9.1_v08_00_00_25_reco2_G1_all_reco2.root'
NU3  = 'prodgenie_bnb_nu_uboone_overlay_mcc9.1_v08_00_00_26_filter_run3_reco2_G_reco2.root'
NUE3 = 'prodgenie_bnb_intrinsice_nue_uboone_overlay_mcc9.1_v08_00_00_26_run3_reco2_reco2.root'
NCPI03 = 'prodgenie_nc_pi0_uboone_overlay_mcc9.1_v08_00_00_26_run3_G_reco2.root'
CCPI03 = 'prodgenie_cc_pi0_uboone_overlay_v08_00_00_26_run3_G_reco2.root'

# Run 1 input files
EXT1 = 'data_extbnb_mcc9.1_v08_00_00_25_reco2_C1_all_reco2.root'
NU1  = 'prodgenie_bnb_nu_uboone_overlay_mcc9.1_v08_00_00_26_filter_run1_reco2_reco2.root'
NUE1 = 'prodgenie_bnb_intrinsice_nue_uboone_overlay_mcc9.1_v08_00_00_26_run1_reco2_reco2.root'
NCPI01 = 'prodgenie_nc_pi0_uboone_overlay-v08_00_00_26_run1_reco2_reco2.root'
CCPI01 = 'prodgenie_cc_pi0_uboone_overlay_v08_00_00_26_run1_reco2.root'

# The four overlay samples additionally read "weightSpline"; the EXT
# samples do not read it and get a constant weightSpline of 1 below.
_MC_BRANCHES = VARLOAD + ["weightSpline"]


def _open_tree(filename):
    """Open ``filename`` under ``ls.ntuple_path`` and return its selection tree."""
    return uproot.open(ls.ntuple_path+filename)[fold][tree]


def _tree_to_frame(up_tree, branches):
    """Read ``branches`` from an uproot tree into a DataFrame, without flattening."""
    return up_tree.pandas.df(branches, flatten=False)


# uproot trees, kept alongside the DataFrames so that per-event vector
# branches can still be read later. Order: mc, ncpi0, ccpi0, nue, ext
# (Run 3 first, then Run 1 when enabled).
uproot_v = [_open_tree(name) for name in (NU3, NCPI03, CCPI03, NUE3, EXT3)]
if TRAINBOTH:
    uproot_v += [_open_tree(name) for name in (NU1, NCPI01, CCPI01, NUE1, EXT1)]

# Run 3 DataFrames
run3_trees = uproot_v[:5]
mc3, ncpi03, ccpi03, nue3 = [_tree_to_frame(t, _MC_BRANCHES) for t in run3_trees[:4]]
ext3 = _tree_to_frame(run3_trees[4], VARLOAD)
ext3["weightSpline"] = 1

# DataFrames in the same order as uproot_v.
df_v = [mc3, ncpi03, ccpi03, nue3, ext3]

# Run 1 DataFrames
if TRAINBOTH:
    run1_trees = uproot_v[5:10]
    mc1, ncpi01, ccpi01, nue1 = [_tree_to_frame(t, _MC_BRANCHES) for t in run1_trees[:4]]
    ext1 = _tree_to_frame(run1_trees[4], VARLOAD)
    ext1["weightSpline"] = 1
    df_v += [mc1, ncpi01, ccpi01, nue1, ext1]



In [None]:
# Derived energy variables and 2D angle differences, added as new columns
# to every sample frame.
for df in df_v:
    # reco_e: shower energy with an offset/scale correction, plus track energy
    shower_e_tot = (df["shr_energy_tot_cali"] + 0.03) / 0.79
    df["reco_e"] = shower_e_tot + df["trk_energy_tot"]

    # reco_e_qe: energy computed from the corrected shower energy and the
    # shower angle only (0.938 is presumably the nucleon mass in GeV -- TODO confirm)
    shower_e = (df["shr_energy"]+0.030)/0.79
    one_minus_cos = 1-np.cos(df["shr_theta"])
    df["reco_e_qe"] = 0.938*shower_e/(0.938 - shower_e*one_minus_cos)
    df["reco_e_rqe"] = df["reco_e_qe"]/df["reco_e"]

    # absolute difference between the second-shower direction and the
    # shower-cluster direction, one column per plane
    df['anglediff_Y'] = (df['secondshower_Y_dir']-df['shrclusdir2']).abs()
    df['anglediff_V'] = (df['secondshower_V_dir']-df['shrclusdir1']).abs()
    df['anglediff_U'] = (df['secondshower_U_dir']-df['shrclusdir0']).abs()

In [None]:
# how to get the LLR-PID value for the "track candidate" (proton for nue selection, muon for numu)
# can be done for any variable
# code from Giuseppe!

# uproot_v and df_v are built in the same order, so they can be walked together.
for up, df in zip(uproot_v, df_v):
    pid_scores_per_event = up.array('trk_llr_pid_score_v')
    candidate_index = up.array('trk_id')-1 # I think we need this -1 to get the right result

    # Pick the candidate's score per event; 9999. marks an index past the
    # end of the event's score vector.
    # NOTE(review): only the upper bound is checked. If trk_id is a signed
    # branch, an index of -1 would select the last entry -- confirm dtype.
    selected_scores = []
    for pidv, tid in zip(pid_scores_per_event, candidate_index):
        if tid<len(pidv):
            selected_scores.append(pidv[tid])
        else:
            selected_scores.append(9999.)
    df['trkpid'] = awkward.fromiter(selected_scores)

    # total sub-cluster count summed over the three shrsubclusters branches
    df['subcluster'] = df['shrsubclusters0'] + df['shrsubclusters1'] + df['shrsubclusters2']
    # fraction of track-fit points that are valid
    df['trkfit'] = df['shr_tkfit_npointsvalid'] / df['shr_tkfit_npoints']

In [None]:
# Bin edges in nu_e and one scale factor per bin.
lee_bins = [0, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.6, 0.8]
#lee_scaling = [
#    6.3744101, 6.3744101, 5.6455402, 3.7305500, 1.5091400, 1.0742800, 0.7540929,
#    0.4763070, 0.1523270
#]
lee_scaling = [1]*9

# Every non-nue sample trains with unit weight; nue samples start at 0 and
# are filled bin by bin below.
for frame in (ncpi03, ccpi03, mc3, ext3):
    frame["train_weight"] = 1
nue3["train_weight"] = 0
nue_frames = [nue3]
if TRAINBOTH:
    for frame in (ncpi01, ccpi01, mc1, ext1):
        frame["train_weight"] = 1
    nue1["train_weight"] = 0
    nue_frames.append(nue1)

# Walk consecutive (low, high) bin edges together with the bin's scale.
# Both edges are exclusive, so rows with nu_e exactly on an edge, or
# outside [0, 0.8], keep train_weight 0.
for low_edge, high_edge, scale in zip(lee_bins[:-1], lee_bins[1:], lee_scaling):
    for nue_frame in nue_frames:
        in_bin = (nue_frame['nu_e'] > low_edge) & (nue_frame['nu_e'] < high_edge)
        nue_frame.loc[in_bin, 'train_weight'] = scale * nue_frame['weightSpline']



In [None]:
# Flag signal rows: is_signal is True exactly where category equals 11.
SIGNAL_CATEGORY = 11

for frame in (ncpi03, ccpi03, nue3, mc3, ext3):
    frame["is_signal"] = frame["category"].eq(SIGNAL_CATEGORY)
if TRAINBOTH:
    for frame in (ncpi01, ccpi01, nue1, mc1, ext1):
        frame["is_signal"] = frame["category"].eq(SIGNAL_CATEGORY)

In [None]:
# variables to be trained on
# The last three entries (is_signal, train_weight, nu_e) are carried along
# with the features and removed again before predicting.
TRAINVAR = [
    "tksh_angle",
    "shr_tkfit_dedx_Y", "shr_tkfit_dedx_U", "shr_tkfit_dedx_V",
    "n_showers_contained", "shr_score", "tksh_distance",
    "trkfit", "trkpid", "subcluster",
    "shrmoliereavg", "shrmoliererms", "trkshrhitdist2", "hits_ratio",
    "is_signal", "train_weight", "nu_e",
    # disabled: second-shower tagging variables
    # "secondshower_Y_nhit", "secondshower_Y_vtxdist", "secondshower_Y_dot", "anglediff_Y",
    # "secondshower_V_nhit", "secondshower_V_vtxdist", "secondshower_V_dot", "anglediff_V",
    # "secondshower_U_nhit", "secondshower_U_vtxdist", "secondshower_U_dot", "anglediff_U",
    # disabled: pi0 tagging variables
    # "pi0_radlen1", "pi0_radlen2", "pi0_energy2_Y", "pi0_dedx2_fit_Y", "pi0_mass_Y", "pi0_gammadot",
]


In [None]:
# Show the path prefix under which the trained boosters are pickled.
print (ls.pickle_path)

In [None]:
def _split_half(frame):
    """Split ``frame`` 50/50 into (train, test) with the fixed seed 1990."""
    return train_test_split(frame, test_size=0.5, random_state=1990)


def _stack(run3_frame, run1_frame):
    """Concatenate a Run 3 frame with its Run 1 counterpart, Run 3 first."""
    return pd.concat([run3_frame, run1_frame])


# Per-run train/test halves
train_mc3, test_mc3 = _split_half(mc3)
train_ext3, test_ext3 = _split_half(ext3)
train_nue3, test_nue3 = _split_half(nue3)
train_ncpi03, test_ncpi03 = _split_half(ncpi03)
train_ccpi03, test_ccpi03 = _split_half(ccpi03)
if TRAINBOTH:
    train_mc1, test_mc1 = _split_half(mc1)
    train_ext1, test_ext1 = _split_half(ext1)
    train_nue1, test_nue1 = _split_half(nue1)
    train_ncpi01, test_ncpi01 = _split_half(ncpi01)
    train_ccpi01, test_ccpi01 = _split_half(ccpi01)

# merge run1 and run3 samples (or alias the Run 3 ones when Run 1 is off)
if TRAINBOTH:
    # training halves
    train_mc = _stack(train_mc3, train_mc1)
    train_ext = _stack(train_ext3, train_ext1)
    train_nue = _stack(train_nue3, train_nue1)
    train_ncpi0 = _stack(train_ncpi03, train_ncpi01)
    train_ccpi0 = _stack(train_ccpi03, train_ccpi01)
    # testing halves
    test_mc = _stack(test_mc3, test_mc1)
    test_ext = _stack(test_ext3, test_ext1)
    test_nue = _stack(test_nue3, test_nue1)
    test_ncpi0 = _stack(test_ncpi03, test_ncpi01)
    test_ccpi0 = _stack(test_ccpi03, test_ccpi01)
    # full (unsplit) samples
    mc = _stack(mc3, mc1)
    ext = _stack(ext3, ext1)
    nue = _stack(nue3, nue1)
    ncpi0 = _stack(ncpi03, ncpi01)
    ccpi0 = _stack(ccpi03, ccpi01)
else:
    train_mc, test_mc, mc = train_mc3, test_mc3, mc3
    train_ext, test_ext, ext = train_ext3, test_ext3, ext3
    train_nue, test_nue, nue = train_nue3, test_nue3, nue3
    train_ncpi0, test_ncpi0, ncpi0 = train_ncpi03, test_ncpi03, ncpi03
    train_ccpi0, test_ccpi0, ccpi0 = train_ccpi03, test_ccpi03, ccpi03

# (train, test) pair per sample label, as handed to NueBooster.
samples = dict([
    ("mc", (train_mc, test_mc)),
    ("nue", (train_nue, test_nue)),
    ("ext", (train_ext, test_ext)),
    ("nc", (train_ncpi0, test_ncpi0)),
    ("cc", (train_ccpi0, test_ccpi0)),
])

# Single set of axes that is handed to every train_booster call.
fig, ax = plt.subplots()

my_booster = nue_booster.NueBooster(samples, TRAINVAR, random_state=1990)

print (my_booster.variables)

# Preselection applied by the booster, written as a list of cuts that are
# AND-ed together into one query string.
# Disabled CRT cuts: (crtveto == 0) and (_closestNuCosmicDist > 20)
_presel_cuts = [
    "reco_e < 1.0",
    "selected == 1",
    "n_tracks_contained > 0",
    "shr_energy_tot_cali > 0.07",
    "n_showers_contained == 1",
    "hits_ratio>0.5",
    "tksh_distance < 6.0",
    "shr_tkfit_dedx_Y < 4.0",
    "tksh_angle > -0.9",
    "trkpid < 0.1",
    "shr_score < 0.30",
    "CosmicIP > 20.",
]
PRESEL = " and ".join(_presel_cuts)

my_booster.set_preselection(PRESEL)

def _predict_scores(booster, frame, columns):
    """Return ``booster`` predictions for ``frame[columns]`` using the best trees.

    ``ntree_limit`` is a number of trees, whereas ``best_iteration`` is a
    0-based index. Passing ``best_iteration`` directly drops the best tree,
    and a value of 0 is interpreted as "use every tree". The tree count is
    therefore taken from ``best_ntree_limit`` when the booster has it, and
    from ``best_iteration + 1`` otherwise.
    """
    tree_limit = getattr(booster, "best_ntree_limit", booster.best_iteration + 1)
    return booster.predict(xgb.DMatrix(frame[columns]), ntree_limit=tree_limit)


# Train one booster per (label, background query) pair, pickle it, and add
# its score as a "<label>_score" column to each full sample.
for label, bkg_query in zip(nue_booster.labels, nue_booster.bkg_queries):

    preds = my_booster.train_booster(ax, bkg_query)

    with open(ls.pickle_path+'booster_%s.pickle' % label, 'wb') as booster_file:
        pickle.dump(preds, booster_file)

    # Feature columns only: drop the label, the truth energy and the weight.
    variables = my_booster.variables.copy()
    print ('variables are : ',variables)
    for non_feature in ("is_signal", "nu_e", "train_weight"):
        variables.remove(non_feature)

    # All predictions are computed before any score column is written.
    mc_prediction = _predict_scores(preds, mc, variables)
    nue_prediction = _predict_scores(preds, nue, variables)
    ext_prediction = _predict_scores(preds, ext, variables)
    ncpi0_prediction = _predict_scores(preds, ncpi0, variables)
    ccpi0_prediction = _predict_scores(preds, ccpi0, variables)

    score_column = "%s_score" % label
    ncpi0[score_column] = ncpi0_prediction
    ccpi0[score_column] = ccpi0_prediction
    mc[score_column] = mc_prediction
    nue[score_column] = nue_prediction
    ext[score_column] = ext_prediction


# Finalise the axes that were passed to train_booster (presumably holding
# the ROC curves) and write the figure under ls.plots_path.
ax.set(
    ylim=[0, 1.05],
    xlim=[0, 1.0],
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='ROC',
)
ax.legend()
fig.tight_layout()
roc_output = ls.plots_path+"roc_single.pdf"
fig.savefig(roc_output)