In [None]:
import sys
# localSettings supplies main_path as well as the input-file paths, pickle
# path and POT values that the cells below read through `ls`.
import localSettings as ls
print(ls.main_path)

In [None]:
# Make the modules under main_path importable. The membership check keeps the
# cell idempotent: re-running it no longer piles duplicate entries onto sys.path.
main_path = ls.main_path
if main_path not in sys.path:
    sys.path.append(main_path)

In [None]:
import timedependence
import importlib
importlib.reload(timedependence)
import uproot
import matplotlib.pylab as pylab
import numpy as np
import pandas as pd
import math
from sklearn.model_selection import train_test_split
import pickle
import xgboost as xgb
import nue_booster 
importlib.reload(nue_booster)
import awkward

# Every listed text element gets the same 'x-large' size, so build the
# rcParams override from the list of keys instead of spelling each pair out.
params = dict.fromkeys(
    ('axes.labelsize', 'axes.titlesize', 'xtick.labelsize', 'ytick.labelsize'),
    'x-large',
)
pylab.rcParams.update(params)

In [None]:
# NUEVARS: when True, the cells below additionally compute the nue-selection
# quantities (track-candidate PID / calorimetric energy and the derived
# dE/dx, sub-cluster and reconstructed-energy columns).
NUEVARS = False
# USEBDT: when True, the pickled boosters are loaded and a "<label>_score"
# column is added to both DataFrames.
USEBDT = False

In [None]:
# Directory and tree name inside the ROOT files.
fold = "nuselection"
tree = "NeutrinoSelectionFilter"

# Open the two input trees. `fold` is used for the directory lookup (instead
# of repeating the literal) so that changing it above actually takes effect.
urON = uproot.open(ls.ONBEAM_NUMU_PATH)[fold][tree]
urOF = uproot.open(ls.OFFBEAM_PATH)[fold][tree]

# Branch names for the nue-selection study. The literal contains repeated
# entries (e.g. "shr_tkfit_dedx_U", "shr_tkfit_dedx_V", "nproton", "run");
# they are collapsed by the set() de-duplication further down in this cell.
variables = [
    "shr_dedx_Y", "shr_bkt_pdg", "p", "pt", "selected", "nu_pdg", "shr_theta",
    "slpdg", "trk_score_v", "backtracked_pdg", # modified from shr_score_v
    "shr_pfp_id_v", "category",
    "shr_tkfit_dedx_U","shr_tkfit_dedx_V","shr_tkfit_dedx_Y",
    "shr_tkfit_gap10_dedx_U","shr_tkfit_gap10_dedx_V","shr_tkfit_gap10_dedx_Y",
    "shr_tkfit_2cm_dedx_U","shr_tkfit_2cm_dedx_V","shr_tkfit_2cm_dedx_Y",
    #"shr_energy_tot", 
    "trk_energy_tot", "shr_hits_tot", "ccnc", "trk_chipr",
    "trk_bkt_pdg", "hits_ratio", "n_tracks_contained", 
    "crtveto","crthitpe","_closestNuCosmicDist",
    "NeutrinoEnergy2",
    #"run","sub","evt",
    "CosmicIP","CosmicDirAll3D","CosmicIPAll3D",
    "nu_flashmatch_score","best_cosmic_flashmatch_score","best_obviouscosmic_flashmatch_score",
    #"trk_pfp_id",
    "shrmoliereavg","shrmoliererms",
    "shr_tkfit_npointsvalid","shr_tkfit_npoints", # fitted vs. all hits for shower
    "shrclusfrac0","shrclusfrac1","shrclusfrac2", # track-fitted hits / all hits
    "trkshrhitdist2", # "trkshrhitdist0","trkshrhitdist1", distance between track and shower in 2D
    "shrsubclusters0","shrsubclusters1","shrsubclusters2", # number of sub-clusters in shower
    "trk_llr_pid_score_v", # trk-PID score
    #"pi0_energy2_Y", # pi0 tagger variables
    "_opfilter_pe_beam", "_opfilter_pe_veto", # did the event pass the common optical filter (for MC only)
    "reco_nu_vtx_sce_x","reco_nu_vtx_sce_y","reco_nu_vtx_sce_z",
    "nproton", "nu_e", "n_showers_contained", "shr_distance", "trk_distance",
    "hits_y", "shr_pz", "shr_energy", "shr_dedx_U", "shr_dedx_V", "shr_phi", "trk_phi", "trk_theta",
    "shr_tkfit_dedx_U", "shr_tkfit_dedx_V", "run", "sub", "evt", "nproton", "trk_pid_chipr_v",
    "trk_len", "mc_pdg", "slnunhits", "slnhits", "shr_score", "trk_score", "trk_hits_tot",
    "true_e_visible", "matched_E", "shr_bkt_E", "trk_bkt_E", "trk_energy", "tksh_distance", "tksh_angle",
    "npi0","npion","pion_e","muon_e","pi0truth_elec_etot",
    "pi0_e", "shr_energy_tot_cali", "shr_dedx_Y_cali", "evnunhits", "nslice", "interaction",
    "slclustfrac", "reco_nu_vtx_x", "reco_nu_vtx_y", "reco_nu_vtx_z","contained_fraction",
    "secondshower_Y_nhit","secondshower_Y_vtxdist","secondshower_Y_dot","secondshower_Y_dir","shrclusdir2",
    "shr_tkfit_nhits_Y","shr_tkfit_nhits_U","shr_tkfit_nhits_V",
    "shr_tkfit_2cm_nhits_Y","shr_tkfit_2cm_nhits_U","shr_tkfit_2cm_nhits_V",
    "shr_tkfit_gap10_nhits_Y","shr_tkfit_gap10_nhits_U","shr_tkfit_gap10_nhits_V"
]

# Branch names passed to pandas.df() when building dfON / dfOF: the event
# identifiers (run, sub, evt), nslice, and the pi0_* quantities.
variables_pi0 = [
    "run",'sub','evt','nslice',
    # pi0 variables
    "pi0_radlen1","pi0_radlen2","pi0_dot1","pi0_dot2","pi0_energy1_Y","pi0_energy2_Y",
    "pi0_dedx1_fit_Y","pi0_dedx2_fit_Y","pi0_shrscore1","pi0_shrscore2","pi0_gammadot",
    "pi0_dedx1_fit_V","pi0_dedx2_fit_V","pi0_dedx1_fit_U","pi0_dedx2_fit_U",
    "pi0_mass_Y","pi0_mass_V","pi0_mass_U",
    "pi0_dir2_x","pi0_dir2_y","pi0_dir2_z","pi0_dir1_x","pi0_dir1_y","pi0_dir1_z", 
]

# Make the list unique (set() does not preserve order; order is irrelevant
# for choosing which branches to read).
variables = list(set(variables))

# Exclude these three branches from the list. list.remove raises ValueError
# if a name is missing, which guards against typos in the list above.
for excluded_branch in ("_closestNuCosmicDist", "crtveto", "crthitpe"):
    variables.remove(excluded_branch)

# Branches to read. By default only the pi0 set is loaded. When NUEVARS is
# enabled the derived-variable cell below indexes columns such as
# 'shrsubclusters0' or 'shr_tkfit_nhits_Y', which only exist in `variables`;
# previously they were never loaded, so that path failed with a KeyError.
load_vars = list(variables_pi0)
if NUEVARS:
    load_vars += [name for name in variables if name not in variables_pi0]

dfON = urON.pandas.df(load_vars, flatten=False)
dfOF = urOF.pandas.df(load_vars, flatten=False)

def _pick_track_candidate(values_v, index_v, fill=9999.):
    """Pick one element per event from a jagged array.

    For each event, returns ``values_v[event][index_v[event]]`` when the index
    is a valid non-negative position in that event's vector, and ``fill``
    otherwise.

    The explicit lower bound matters: a negative index (e.g. ``trk_id - 1``
    for an event whose trk_id is 0) would pass a plain ``idx < len(values)``
    check and then silently select from the end of the vector, or raise
    IndexError on an empty vector.
    """
    return awkward.fromiter([
        values[idx] if 0 <= idx < len(values) else fill
        for values, idx in zip(values_v, index_v)
    ])


if (NUEVARS):

    uproot_v = [urON,urOF]
    df_v = [dfON,dfOF]
    for up, df in zip(uproot_v, df_v):
        # Original author's note: "I think we need this -1 to get the right
        # result" — presumably trk_id is 1-based; TODO confirm.
        trk_id = up.array('trk_id')-1
        # LLR-PID score and Y-plane calorimetric energy of the track candidate.
        df['trkpid'] = _pick_track_candidate(up.array('trk_llr_pid_score_v'), trk_id)
        df['trackcaloenergy'] = _pick_track_candidate(up.array('trk_calo_energy_y_v'), trk_id)

In [None]:
# Derived columns for the nue selection; only computed when NUEVARS is True.
# Original author's note: how to get the LLR-PID value for the "track
# candidate" (proton for nue selection, muon for numu); can be done for any
# variable. Code from Giuseppe!

# Linear shower-energy correction, applied below as (E + INTERCEPT) / SLOPE.
INTERCEPT = 0.0
SLOPE = 0.83

df_v = [dfON,dfOF]

if (NUEVARS):

    for i,df in enumerate(df_v):
        # sum of the three shrsubclusters{0,1,2} columns
        df['subcluster'] = df['shrsubclusters0'] + df['shrsubclusters1'] + df['shrsubclusters2']
        # ratio of valid to total track-fit points for the shower
        df['trkfit'] = df['shr_tkfit_npointsvalid'] / df['shr_tkfit_npoints']
        # and the 2d angle difference
        df['anglediff_Y'] = np.abs(df['secondshower_Y_dir']-df['shrclusdir2'])
        # Hit-weighted dE/dx average over the Y/U/V columns, for the default,
        # "2cm" and "gap10" variants. There is no guard for a zero hit total,
        # so those rows come out as NaN/inf.
        df['shr_tkfit_nhits_tot'] = (df['shr_tkfit_nhits_Y']+df['shr_tkfit_nhits_U']+df['shr_tkfit_nhits_V'])
        df['shr_tkfit_dedx_avg'] = (df['shr_tkfit_nhits_Y']*df['shr_tkfit_dedx_Y'] + df['shr_tkfit_nhits_U']*df['shr_tkfit_dedx_U'] + df['shr_tkfit_nhits_V']*df['shr_tkfit_dedx_V'])/df['shr_tkfit_nhits_tot']
        df['shr_tkfit_2cm_nhits_tot'] = (df['shr_tkfit_2cm_nhits_Y']+df['shr_tkfit_2cm_nhits_U']+df['shr_tkfit_2cm_nhits_V'])
        df['shr_tkfit_2cm_dedx_avg'] = (df['shr_tkfit_2cm_nhits_Y']*df['shr_tkfit_2cm_dedx_Y'] + df['shr_tkfit_2cm_nhits_U']*df['shr_tkfit_2cm_dedx_U'] + df['shr_tkfit_2cm_nhits_V']*df['shr_tkfit_2cm_dedx_V'])/df['shr_tkfit_2cm_nhits_tot']
        df['shr_tkfit_gap10_nhits_tot'] = (df['shr_tkfit_gap10_nhits_Y']+df['shr_tkfit_gap10_nhits_U']+df['shr_tkfit_gap10_nhits_V'])
        df['shr_tkfit_gap10_dedx_avg'] = (df['shr_tkfit_gap10_nhits_Y']*df['shr_tkfit_gap10_dedx_Y'] + df['shr_tkfit_gap10_nhits_U']*df['shr_tkfit_gap10_dedx_U'] + df['shr_tkfit_gap10_nhits_V']*df['shr_tkfit_gap10_dedx_V'])/df['shr_tkfit_gap10_nhits_tot']
        # dE/dx taken from the column with the most hits: start from Y, switch
        # to U where U has more hits than Y, then to V where V has more hits
        # than both Y and U. The order of these three statements matters.
        df.loc[:,'shr_tkfit_dedx_max'] = df['shr_tkfit_dedx_Y']
        df.loc[(df['shr_tkfit_nhits_U']>df['shr_tkfit_nhits_Y']),'shr_tkfit_dedx_max'] = df['shr_tkfit_dedx_U']
        df.loc[(df['shr_tkfit_nhits_V']>df['shr_tkfit_nhits_Y']) & (df['shr_tkfit_nhits_V']>df['shr_tkfit_nhits_U']),'shr_tkfit_dedx_max'] = df['shr_tkfit_dedx_V']
        # replace missing secondshower_Y_dot values with 0
        df.loc[df['secondshower_Y_dot'].isna(),'secondshower_Y_dot'] = 0.0
        # corrected shower energy plus total track energy
        df["reco_e"] = (df["shr_energy_tot_cali"] + INTERCEPT) / SLOPE + df["trk_energy_tot"]
        # energy estimate from the corrected shower energy and shr_theta;
        # NOTE(review): 0.938 is presumably a nucleon mass in GeV — confirm.
        df["reco_e_qe"] = 0.938*((df["shr_energy"]+INTERCEPT)/SLOPE)/(0.938 - ((df["shr_energy"]+INTERCEPT)/SLOPE)*(1-np.cos(df["shr_theta"])))
        df["reco_e_rqe"] = df["reco_e_qe"]/df["reco_e"]
        # sample flags initialised to zero here; the next cell sets them again
        df["bnbdata"] = np.zeros_like(df["shr_energy"])
        df["extdata"] = np.zeros_like(df["shr_energy"])
        # Rows outside categories 1, 10, 11 and 111 whose slnunhits/slnhits
        # ratio is below 0.2 are relabelled as category 4.
        df.loc[(df['category']!=1)&(df['category']!=10)&(df['category']!=11)&(df['category']!=111)&(df['slnunhits']/df['slnhits']<0.2), 'category'] = 4

In [None]:
# Tag each frame with its sample: both flag columns start at zero, then
# dfON gets bnbdata = 1 and dfOF gets extdata = 1.
df_v = [dfON, dfOF]
for i, df in enumerate(df_v):
    for flag_column in ("bnbdata", "extdata"):
        df[flag_column] = np.zeros_like(df["run"])
dfON["bnbdata"] = np.ones_like(dfON["run"])
dfOF["extdata"] = np.ones_like(dfOF["run"])

In [None]:
# variables to be trained on
TRAINVAR = ["shr_score","tksh_distance","tksh_angle",
            "shr_tkfit_dedx_max",
            "trkfit","trkpid",
            "subcluster","shrmoliereavg",
            "trkshrhitdist2","hits_ratio",
            "secondshower_Y_nhit","secondshower_Y_vtxdist","secondshower_Y_dot","anglediff_Y",
            "CosmicIPAll3D","CosmicDirAll3D"]

In [None]:
# One booster per label; each adds a "<label>_score" column to both frames.
LABELS =  ['pi0','nonpi0']
#LABELS =  ["bkg"]

if (USEBDT == True):
    # bkg_query is not used in the loop body, but zipping against
    # nue_booster.bkg_queries bounds the number of labels processed.
    for label, bkg_query in zip(LABELS, nue_booster.bkg_queries):
        booster_path = ls.pickle_path + 'booster_%s_0304_extnumi.pickle' % label
        with open(booster_path, 'rb') as booster_file:
            booster = pickle.load(booster_file)

            # off-beam first, then on-beam
            for frame in (dfOF, dfON):
                features = xgb.DMatrix(frame[TRAINVAR])
                frame[label+"_score"] = booster.predict(
                    features,
                    ntree_limit=booster.best_iteration)

In [None]:
import timedependence
import importlib
importlib.reload(timedependence)

In [None]:
# nue files
timeplotter = timedependence.TimePlotter(ls.ONBEAM_FAR_POT,ls.OFFBEAM_POT,dfON,dfOF)
# pi0 / numu files
timeplotter = timedependence.TimePlotter(ls.ONBEAM_NUMU_POT,ls.OFFBEAM_POT,dfON,dfOF)

In [None]:
import unblinding_far_sideband
import importlib
importlib.reload(unblinding_far_sideband)

In [None]:
import matplotlib.pyplot as plt
# Set the default font size for figures drawn after this cell.
plt.rcParams.update({'font.size': 14})

In [None]:
from scipy.optimize import curve_fit

In [None]:
def line(x,s,b):
    """Straight-line model used as the fit function for curve_fit.

    x : scalar or array of abscissa values
    s : slope
    b : intercept

    Returns x*s + b, evaluated element-wise when x is an array.
    """
    slope_term = x*s
    return slope_term + b

In [None]:
# Menu of selection queries. Each assignment overwrites the previous one, so
# only the last ('nslice == 1') is in effect; reorder or comment out lines to
# switch selection. Every attribute named here must exist in
# unblinding_far_sideband even though its value is discarded.
#print (PRESQ_twoplus_showers)
QUERY = unblinding_far_sideband.NPPRESQ_twoplus_showers
QUERY =  unblinding_far_sideband.NPPRESEQ_one_shower
QUERY = unblinding_far_sideband.NPVLCUTQ
QUERY = unblinding_far_sideband.PI0SEL
QUERY = 'nslice == 1'

# Passed to QueryByRun as ONBEAMONLY and used to choose the y-axis label of
# the plot further down.
ONBEAM = True

In [None]:
# Evaluate QUERY versus run number. The three results are used below as the
# x values, y values and y errors of the plot and fit.
# NOTE(review): the first argument (50) is presumably a run-binning setting;
# confirm against TimePlotter.QueryByRun.
run_v, pass_v, pass_e = timeplotter.QueryByRun(50,QUERY,ONBEAMONLY=ONBEAM)#,RMIN=13700,RMAX=18000)

In [None]:
# Plot the per-run values with their errors, fit a straight line to them and
# quote the fitted change across the run range in the legend.
fig = plt.figure(figsize=(12,4))
plt.errorbar(run_v, pass_v, yerr=pass_e, fmt='o', color='r')
print (pass_e)

# Straight-line fit weighted by the absolute errors, seeded with a flat line
# at the mean value.
popt, popv = curve_fit(line, run_v, pass_v, sigma=pass_e, absolute_sigma=True, p0=[0.,np.mean(pass_v)])
pope = np.sqrt(np.diag(popv))  # 1-sigma parameter uncertainties
fit_slope, fit_offset = popt[0], popt[1]
slope_unc = pope[0]
slopeval = (fit_slope/fit_offset)*1e3 # fractional change / 1k runs
slopeerr = (slope_unc/fit_offset)*1e3
print ('fits : ',popt)
print ('errors : ',pope)

run_first, run_last = np.min(run_v), np.max(run_v)

def _percent_change(slope, offset):
    """Percent change of the fitted line between the first and last run."""
    at_first = line(run_first, slope, offset)
    at_last = line(run_last, slope, offset)
    return 100. * (at_last - at_first) / at_first

# Central value, and the same quantity with the slope shifted by +/- 1 sigma
# while the intercept is held at its best-fit value.
percentdiff  = _percent_change(fit_slope, fit_offset)
percenterrUP = _percent_change(fit_slope + slope_unc, fit_offset)
percenterrDN = _percent_change(fit_slope - slope_unc, fit_offset)

legend_text = 'change: %.0f [%.0f, %.0f] %%'%(percentdiff,percenterrDN,percenterrUP)
plt.plot(run_v, line(run_v,*popt), 'b--', lw=2, label=legend_text)
#plt.ylim([-0.5,0.5])
# NOTE(review): the title is hard-coded while QUERY is currently
# 'nslice == 1' — confirm the label matches the selection being plotted.
plt.title('Pi0Sel')
plt.legend(loc='best',fontsize=16)
plt.xlabel('run number')
#plt.ylim([popt[1]*0.5,popt[1]*1.5])
y_label = 'on-beam / 1e18 POT' if (ONBEAM == True) else '(on-off) beam / 1e18 POT'
plt.ylabel(y_label,fontsize=16)
plt.show()