In [None]:
from sklearn.model_selection import train_test_split
import xgboost as xgb
import matplotlib.pyplot as plt
import numpy as np

import uproot
import pickle
import nue_booster
import importlib
importlib.reload(nue_booster)

import awkward

In [None]:
# Site-specific settings module. This notebook reads ntuple_path, pickle_path
# and plots_path from it and uses them as string prefixes for file names.
import localSettings as ls
# Show the configured ntuple prefix before any file is opened.
print(ls.ntuple_path)

In [None]:
# Branches read from every ntuple, grouped by purpose. The order of the
# resulting list is the order in which the branches are requested.
VARLOAD = (
    # track-fitted shower
    ["shr_tkfit_npointsvalid", "shr_tkfit_npoints"]
    # sub-clusters for shower
    + ["shrsubclusters0", "shrsubclusters1", "shrsubclusters2"]
    # PID for proton candidate
    + ["trk_llr_pid_score_v"]
    # moliere metrics for shower
    + ["shrmoliereavg", "shrmoliererms"]
    # 2D distance shower hits to track hits
    + ["trkshrhitdist2"]
    + ["trk_id", "trk_energy_tot", "shr_energy_tot_cali"]
    + [
        "tksh_distance",
        "shr_tkfit_dedx_Y",
        "tksh_angle",
        "n_showers_contained",
        "shr_score",
    ]
    # truth variables
    + ["nu_e", "slpdg"]
    + ["category", "selected"]
    # CRT quantities
    + ["crtveto", "crthitpe", "_closestNuCosmicDist"]
    + ["hits_ratio", "slclustfrac", "shr_energy", "shr_theta", "CosmicIP"]
)

In [None]:
# Directory and tree name of the analysis tree inside every ntuple file.
fold = "nuselection"
tree = "NeutrinoSelectionFilter"

# Ntuple file names, appended to ls.ntuple_path when opened.
# BNB and DRT are declared here but are not opened in this cell.
BNB = 'data_bnb_mcc9.1_v08_00_00_25_reco2_G1_beam_good_reco2_1e19.root'
EXT = 'data_extbnb_mcc9.1_v08_00_00_25_reco2_G1_all_reco2.root'
NU  = 'prodgenie_bnb_nu_uboone_overlay_mcc9.1_v08_00_00_26_filter_run3_reco2_G_reco2.root'
NUE = 'prodgenie_bnb_intrinsice_nue_uboone_overlay_mcc9.1_v08_00_00_26_run3_reco2_reco2.root'
DRT = 'prodgenie_bnb_dirt_overlay_mcc9.1_v08_00_00_26_run3_reco2_reco2.root'
PI0 = 'prodgenie_nc_pi0_uboone_overlay_mcc9.1_v08_00_00_26_run3_G_reco2.root'

# Open the four trees. The handles are stored in uproot_v *before* the same
# names are rebound to DataFrames, because a later cell reads branches
# directly from the trees.
mc, nc, nue, ext = (
    uproot.open(ls.ntuple_path + file_name)[fold][tree]
    for file_name in (NU, PI0, NUE, EXT)
)
uproot_v = [mc, nc, nue, ext]

# The three prodgenie samples additionally load the weightSpline branch.
branches_with_weight = VARLOAD + ["weightSpline"]
mc = mc.pandas.df(branches_with_weight, flatten=False)
nc = nc.pandas.df(branches_with_weight, flatten=False)
nue = nue.pandas.df(branches_with_weight, flatten=False)

# The EXT frame is read without weightSpline and is given a constant 1 instead.
ext = ext.pandas.df(VARLOAD, flatten=False)
ext["weightSpline"] = 1

# Same sample order as uproot_v, so the two lists can be walked in parallel.
df_v = [mc, nc, nue, ext]

# Derived energy columns, added in place to every sample frame in df_v.
for frame in df_v:
    # Both shower energies go through the same (E + 0.03) / 0.79 correction.
    shr_tot_corrected = (frame["shr_energy_tot_cali"] + 0.03) / 0.79
    shr_corrected = (frame["shr_energy"] + 0.030) / 0.79
    one_minus_cos_theta = 1 - np.cos(frame["shr_theta"])

    # Corrected total shower energy plus total track energy.
    frame["reco_e"] = shr_tot_corrected + frame["trk_energy_tot"]
    # Energy estimate from the corrected shower energy and shower angle only.
    frame["reco_e_qe"] = 0.938 * shr_corrected / (0.938 - shr_corrected * one_minus_cos_theta)
    # Ratio of the two estimates.
    frame["reco_e_rqe"] = frame["reco_e_qe"] / frame["reco_e"]

In [None]:
# how to get the LLR-PID value for the "track candidate" (proton for nue selection, muon for numu)
# can be done for any variable
# code from Giuseppe!
#
# For each event the per-track score vector is indexed with (trk_id - 1).
# Events whose index does not point inside the vector get the sentinel 9999.,
# which the later 'trkpid < 0.0' preselection rejects.

for up, df in zip(uproot_v, df_v):
    trk_llr_pid_v = up.array('trk_llr_pid_score_v')
    trk_id = up.array('trk_id')-1 # I think we need this -1 to get the right result
    # The lower bound matters: without it an index of -1 (trk_id == 0) would
    # silently pick the *last* track's score, or raise IndexError on an empty
    # vector, instead of falling back to the sentinel.
    trk_llr_pid_v_sel = awkward.fromiter([
        scores[index] if 0 <= index < len(scores) else 9999.
        for scores, index in zip(trk_llr_pid_v, trk_id)
    ])
    df['trkpid'] = trk_llr_pid_v_sel
    # Total number of shower sub-clusters, summed over the three shrsubclusters branches.
    df['subcluster'] = df['shrsubclusters0'] + df['shrsubclusters1'] + df['shrsubclusters2']
    # Fraction of track-fit points that are valid.
    df['trkfit'] = df['shr_tkfit_npointsvalid'] / df['shr_tkfit_npoints']

In [None]:
# Training weights.
# lee_bins holds the edges of the true-neutrino-energy (nu_e) bins and
# lee_scaling the scale factor applied inside each bin.
lee_bins = [0, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.6, 0.8]
lee_scaling = [
    6.3744101, 6.3744101, 5.6455402, 3.7305500, 1.5091400, 1.0742800, 0.7540929,
    0.4763070, 0.1523270
]
#lee_scaling = [1,1,1,1,1,1,1,1,1]

# One scale factor per bin, i.e. one fewer than the number of edges.
assert len(lee_scaling) == len(lee_bins) - 1, "lee_scaling must have one entry per bin"

# Initialise as floats: the loop below writes float weights into the nue
# column, and writing floats into an integer column relies on implicit
# upcasting, which recent pandas versions deprecate.
nc["train_weight"] = 1.0
nue["train_weight"] = 0.0
mc["train_weight"] = 1.0
ext["train_weight"] = 1.0

# Only the nue sample is reweighted, binned in nu_e. The original notebook
# also carried disabled variants that binned in reco_e and/or applied the
# same scaling to mc, ext and nc.
# NOTE(review): both comparisons are strict, so nue events exactly on a bin
# edge, or outside (0, 0.8), keep train_weight == 0 -- confirm this is intended.
for low_edge, high_edge, scale in zip(lee_bins[:-1], lee_bins[1:], lee_scaling):
    in_bin = (nue['nu_e'] > low_edge) & (nue['nu_e'] < high_edge)
    nue.loc[in_bin, 'train_weight'] = scale * nue.loc[in_bin, 'weightSpline']

In [None]:
# Boolean signal flag for every sample: events with category code 11 are signal.
for sample in (nc, nue, mc, ext):
    sample["is_signal"] = sample["category"] == 11

In [None]:
# Columns handed to the booster: the input features first, then the three
# bookkeeping columns that a later cell strips again before predicting.
TRAINVAR = [
    "tksh_angle",
    "shr_tkfit_dedx_Y",
    "n_showers_contained",
    "shr_score",
    "tksh_distance",
    "trkfit",
    "trkpid",
    "subcluster",
    "shrmoliereavg",
    "shrmoliererms",
    "trkshrhitdist2",
    "hits_ratio",
    # "reco_e_rqe",  (currently disabled)
    "is_signal",
    "train_weight",
    "nu_e",
]


In [None]:
# Show the configured prefix under which the trained boosters are pickled.
print (ls.pickle_path)

In [None]:
# Split every sample 50/50 into a training and a testing half with a fixed
# seed, keyed by sample name as expected by NueBooster.
samples = {
    name: tuple(train_test_split(frame, test_size=0.5, random_state=1990))
    for name, frame in (("mc", mc), ("nue", nue), ("ext", ext), ("nc", nc))
}

# Individual halves, kept under their own names as well.
train_mc, test_mc = samples["mc"]
train_ext, test_ext = samples["ext"]
train_nue, test_nue = samples["nue"]
train_nc, test_nc = samples["nc"]

# Axes handed to train_booster in the training loop below.
fig, ax = plt.subplots(nrows=1, ncols=1)

my_booster = nue_booster.NueBooster(samples, TRAINVAR, random_state=1990)

print(my_booster.variables)

# Preselection query string: the individual cuts below are AND-ed together.
PRESEL = " and ".join([
    "reco_e < 1.0",
    "selected == 1",
    "(crtveto == 0)",
    "(_closestNuCosmicDist > 20)",
    "shr_energy_tot_cali > 0.07",
    "tksh_distance < 5.0",
    "shr_tkfit_dedx_Y < 4.0",
    "shr_tkfit_dedx_Y > 0",
    "tksh_angle > -0.9",
    "tksh_angle < 0.9",
    "trkpid < 0.0",
    "shr_score < 0.30",
    "hits_ratio>0.6",
    "n_showers_contained < 2",
    "CosmicIP > 20.",
])

my_booster.set_preselection(PRESEL)

# Train one booster per (label, background query) pair, pickle it, and store
# its prediction for every event of every sample in a "<label>_score" column.
for label, bkg_query in zip(nue_booster.labels, nue_booster.bkg_queries):

    preds = my_booster.train_booster(ax, bkg_query)

    with open(ls.pickle_path+'booster_%s.pickle' % label, 'wb') as booster_file:
        pickle.dump(preds, booster_file)

    # Work on a copy so the booster's own variable list is left untouched,
    # then drop the bookkeeping columns that are not model inputs.
    variables = my_booster.variables.copy()
    print ('variables are : ',variables)
    for bookkeeping_column in ("is_signal", "nu_e", "train_weight"):
        variables.remove(bookkeeping_column)

    # ntree_limit is a *number of trees*, whereas best_iteration is a 0-based
    # round index. Passing best_iteration drops the best round's tree, and when
    # best_iteration == 0 it silently means "use all trees".
    # best_ntree_limit is the matching tree count; 0 (all trees) is the
    # fallback when the booster carries no early-stopping information.
    best_ntree_limit = getattr(preds, "best_ntree_limit", 0)

    mc_prediction, nue_prediction, ext_prediction, nc_prediction = (
        preds.predict(xgb.DMatrix(frame[variables]), ntree_limit=best_ntree_limit)
        for frame in (mc, nue, ext, nc)
    )

    nc["%s_score" % label] = nc_prediction
    mc["%s_score" % label] = mc_prediction
    nue["%s_score" % label] = nue_prediction
    ext["%s_score" % label] = ext_prediction


# Axis ranges, labels and title for the figure that was passed to
# train_booster, then write it out under ls.plots_path.
ax.set(
    ylim=[0, 1.05],
    xlim=[0, 1.0],
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='ROC',
)
ax.legend()
fig.tight_layout()
fig.savefig(ls.plots_path+"roc_single.pdf")

In [None]:
# Redo the 50/50 split with the same seed and rebuild the samples dictionary
# from the frames as they are when this cell runs.
samples = {
    name: tuple(train_test_split(frame, test_size=0.5, random_state=1990))
    for name, frame in (("mc", mc), ("nue", nue), ("ext", ext), ("nc", nc))
}

# Individual halves, kept under their own names as well.
train_mc, test_mc = samples["mc"]
train_ext, test_ext = samples["ext"]
train_nue, test_nue = samples["nue"]
train_nc, test_nc = samples["nc"]

# Columns for the second-stage booster: the three bookkeeping columns
# followed by five "<label>_score" columns.
variables_bdts = [
    "is_signal",
    "nu_e",
    "train_weight",
    "ext_score",
    "ncpi0_score",
    "cc_score",
    "ccpi0_score",
    "cosmic_score",
]

In [None]:
# Second-stage booster built on the variables_bdts columns.
fig_global, ax_global = plt.subplots(nrows=1, ncols=1)
# NOTE(review): 1990 is passed positionally here, while the earlier NueBooster
# call passes random_state=1990 by keyword -- presumably the same parameter; confirm.
my_global_booster = nue_booster.NueBooster(samples, variables_bdts, 1990)

# No background query is given for this booster.
preds = my_global_booster.train_booster(ax_global)

ax_global.set(
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='ROC',
)
ax_global.legend()
fig_global.tight_layout()
fig_global.savefig(ls.plots_path+"roc_global.pdf")

In [None]:
# Pickle the object currently bound to `preds` (the return value of the most
# recently executed train_booster call) under ls.pickle_path.
booster_path = ls.pickle_path + 'booster.pickle'
with open(booster_path, 'wb') as booster_file:
    pickle.dump(preds, booster_file)