In [1]:
import os, sys
sys.path.append("../../common/")
from python_tools import *

python tools loaded.


In [2]:
import numba.extending

@numba.extending.overload(np.clip)
def np_clip(a, a_min, a_max, out=None):
    def np_clip_impl(a, a_min, a_max, out=None):
        if out is None:
            out = np.empty_like(a)
        for i in range(len(a)):
            if a[i] < a_min:
                out[i] = a_min
            elif a[i] > a_max:
                out[i] = a_max
            else:
                out[i] = a[i]
        return out
    return np_clip_impl

In [3]:
# some useful functions
# 
# you may need to comment out the 'numba' bits if your system can't install numba (like the gpvms...)
#

@numba.jit(nopython=True)
def costheta_numba(p1x,p1y,p1z,p1mag,
                   p2x,p2y,p2z,p2mag):
    return np.clip(np.where((p1mag>0.0)&(p2mag>0.0),
                            (p1x*p2x+p1y*p2y+p1z*p2z)/p1mag/p2mag,
                            np.nan),
                   -1.0,1.0)

def eval_costheta(df,suffix1="",suffix2=""):
    return costheta_numba(df.loc[:,"px"+suffix1].values,df.loc[:,"py"+suffix1].values,df.loc[:,"pz"+suffix1].values,df.loc[:,"p"+suffix1].values,
                          df.loc[:,"px"+suffix2].values,df.loc[:,"py"+suffix2].values,df.loc[:,"pz"+suffix2].values,df.loc[:,"p"+suffix2].values)

    
@numba.jit(nopython=True)
def q3_numba(p1x,p1y,p1z,p2x,p2y,p2z):
    return np.sqrt((p1x-p2x)**2+(p1y-p2y)**2+(p1z-p2z)**2)

def eval_q3(df,suffix1="",suffix2="_mu"):
    return q3_numba(df.loc[:,"px"+suffix1].values,df.loc[:,"py"+suffix1].values,df.loc[:,"pz"+suffix1].values,
                    df.loc[:,"px"+suffix2].values,df.loc[:,"py"+suffix2].values,df.loc[:,"pz"+suffix2].values)

In [4]:
def remove_duplicate_files(sig_files,bkg_files,bkg_path,aa_run,aa_label,ac_run,ac_label):
    for f in sig_files:
        r = int(f.split("/")[-1].split("_")[-2])
        sr = f.split("/")[-1].split("_")[-1].split(".")[0]
        run_set=""
        if(r==aa_run):
            run_set=aa_label
        if(r==ac_run):
            run_set=ac_label
    
        fname = "%s_%s/sampler_hist_%s.root"%(bkg_path,run_set,sr)
        try:
            bkg_files.remove(fname)
        except:
            print("%s not found. (%s)"%(fname,f))
            sig_files.remove(f)
    all_files = sig_files + bkg_files
    return all_files,sig_files

In [5]:
def convert_root(root_filenames,id_tree=True):
    t_df = []
    p_df = []
    pot_df = []
    id_df = []

    file_count = 0
    event_count = 0
    print("Processing %d files" % len(root_filenames))

    for root_filename in root_filenames:
    
        try:
            p_df.append(uproot.open(root_filename)['mcana/particle_tree'].pandas.df())
            t_df.append(uproot.open(root_filename)['mcana/mctruth_tree'].pandas.df())
            pot_df.append(uproot.open(root_filename)['potana/pot_tree'].pandas.df())
        except:
            print("File %s, trees not found."%root_filename)
        
        if id_tree:
            try:
                id_df.append(uproot.open(root_filename)['generator/id_tree'].pandas.df())
            except:
                print("\tFile %s, No ID Tree. Skipping...."%root_filename)
            
        event_count += len(t_df[-1])
        file_count += 1
        if file_count%500==0:
            print("\tProcessed %d files. %d events processed." % (file_count,event_count))

    p_df = pd.concat(p_df)
    t_df = pd.concat(t_df)
    pot_df = pd.concat(pot_df)
    if id_tree:
        id_df = pd.concat(id_df)

    p_df.set_index(["run","subrun","event","truth_index","p_index"],inplace=True)
    t_df.set_index(["run","subrun","event","truth_index"],inplace=True)
    pot_df.set_index(["run","subrun"],inplace=True)
        
    print("Have dataframe objects. Total events is %d." % len(t_df))
    
    #make a ke column
    p_df["ke"] = p_df["e"]-p_df["mass"]
    
    return t_df,p_df,pot_df,id_df

In [6]:
def create_finalstate_df(p_df):
    df_n = pd.DataFrame()
    
    df_n["n_mu"] = ((p_df["status"]==1)&(p_df["pdgcode"]==13)).replace(False,np.nan)
    df_n["n_e"] = ((p_df["status"]==1)&(p_df["pdgcode"]==11)).replace(False,np.nan)
    df_n["n_p_40MeV"] = ((p_df["status"]==1)&(p_df["pdgcode"]==2212)&((p_df["e"]-p_df["mass"])>0.04)).replace(False,np.nan)
    df_n["n_pi0"] = ((p_df["status"]==1)&((p_df["pdgcode"]==111))).replace(False,np.nan)
    df_n["n_chpi"] = ((p_df["status"]==1)&((p_df["pdgcode"]==211)^(p_df["pdgcode"]==-211))).replace(False,np.nan)
    df_n["n_gamma"] = ((p_df["status"]==1)&((p_df["pdgcode"]==22))&(p_df["e"]>0.02)).replace(False,np.nan)
    df_n = df_n.groupby(["run","subrun","event","truth_index"]).agg("sum")
    
    return df_n

In [7]:
#groupings for final state protons (up to 4), pi0, gammas, and leptons. And initial neutrino.
def group_particle_df(p_df):
    p_df_p_grouped = p_df.query("status==1 and pdgcode==2212").sort_values(by=["e"],ascending=False).groupby(["run","subrun","event","truth_index"])
    p_df_p1 = p_df_p_grouped.nth(0)
    p_df_p2 = p_df_p_grouped.nth(1)
    p_df_p3 = p_df_p_grouped.nth(2)
    p_df_p4 = p_df_p_grouped.nth(3)

    p_df_pi0_grouped = p_df.query("status==1 and pdgcode==111").sort_values(by=["e"],ascending=False).groupby(["run","subrun","event","truth_index"])
    p_df_pi0 = p_df_pi0_grouped.nth(0)

    p_df_gamma_grouped = p_df.query("status==1 and pdgcode==22").sort_values(by=["e"],ascending=False).groupby(["run","subrun","event","truth_index"])
    p_df_gamma = p_df_gamma_grouped.nth(0)

    p_df_lep = p_df.query("status==1 and (pdgcode==13 or pdgcode==-13 or pdgcode==11 or pdgcode==-11 or pdgcode==12 or pdgcode==-12 or pdgcode==14 or pdgcode==-14)").groupby(["run","subrun","event","truth_index"]).first()
    p_df_nu = p_df.query("status==0 and (pdgcode==12 or pdgcode==-12 or pdgcode==14 or pdgcode==-14)").groupby(["run","subrun","event","truth_index"]).first()
    
    return p_df_nu,p_df_lep,p_df_p1,p_df_p2,p_df_p3,p_df_p4,p_df_pi0,p_df_gamma

In [8]:
def event_df_calcs(df_ev_t):
    df_ev_t["costheta_lep"] = eval_costheta(df=df_ev_t,suffix1="",suffix2="_lep")
    df_ev_t["costheta_p1"] = eval_costheta(df=df_ev_t,suffix1="",suffix2="_p1")
    df_ev_t["costheta_pi0"] = eval_costheta(df=df_ev_t,suffix1="",suffix2="_pi0")
    
    return df_ev_t

In [9]:
def create_event_df(t_df,p_df):
    df_n = create_finalstate_df(p_df)
    p_df_nu,p_df_lep,p_df_p1,p_df_p2,p_df_p3,p_df_p4,p_df_pi0,p_df_gamma = group_particle_df(p_df)
    
    df_ev_t = t_df.copy()
    df_ev_t = df_ev_t.merge(p_df_nu,how="left",on=["run","subrun","event","truth_index"],suffixes=["","_nu"])
    df_ev_t = df_ev_t.merge(p_df_lep,how="left",on=["run","subrun","event","truth_index"],suffixes=["","_lep"])
    df_ev_t = df_ev_t.merge(p_df_p1,how="left",on=["run","subrun","event","truth_index"],suffixes=["","_p1"])
    df_ev_t = df_ev_t.merge(p_df_p2,how="left",on=["run","subrun","event","truth_index"],suffixes=["","_p2"])
    df_ev_t = df_ev_t.merge(p_df_p3,how="left",on=["run","subrun","event","truth_index"],suffixes=["","_p3"])
    df_ev_t = df_ev_t.merge(p_df_p3,how="left",on=["run","subrun","event","truth_index"],suffixes=["","_p4"])
    df_ev_t = df_ev_t.merge(p_df_pi0,how="left",on=["run","subrun","event","truth_index"],suffixes=["","_pi0"])
    df_ev_t = df_ev_t.merge(p_df_gamma,how="left",on=["run","subrun","event","truth_index"],suffixes=["","_gamma"])
    df_ev_t = df_ev_t.merge(df_n,how="left",on=["run","subrun","event","truth_index"])
    
    df_ev_t = event_df_calcs(df_ev_t)
    
    return df_ev_t

In [10]:
def merge_on_run_subrun(df,rs_df):
    index_df = pd.MultiIndex.from_arrays([df.index.get_level_values('run').array,
                                          df.index.get_level_values('subrun').array])
    index_rs = pd.MultiIndex.from_arrays([rs_df[col] for col in ['run', 'subrun']])
    return df.loc[index_df.isin(index_rs)]

In [11]:
def write_hdf(name,t_df=None,p_df=None,pot_df=None,ev_df=None,id_df=None):
    if t_df is not None: t_df.to_hdf(name,"t_df")
    if p_df is not None: p_df.to_hdf(name,"p_df")
    if pot_df is not None: pot_df.to_hdf(name,"pot_df")
    if ev_df is not None: ev_df.to_hdf(name,"ev_df")
    if id_df is not None: id_df.to_hdf(name,"id_df")


In [12]:
root_filenames_Set1Run1_Sigs = glob.glob("/Users/wketchum/Data/MicroBooNE/FakeData2020/Set1/Run1_Sampler_Hists_Set1/Run1_SignalFiles/*.root")
root_filenames_Set1Run1_Bkgs = glob.glob("/Users/wketchum/Data/MicroBooNE/FakeData2020/Set1/Run1_Sampler_Hists_Set1/Run1_A[A,C]/*.root")
root_filenames_Set1Run3_Sigs = glob.glob("/Users/wketchum/Data/MicroBooNE/FakeData2020/Set1/Run3_Sampler_Hists_Set1/Run3_SignalFiles/*.root")
root_filenames_Set1Run3_Bkgs = glob.glob("/Users/wketchum/Data/MicroBooNE/FakeData2020/Set1/Run3_Sampler_Hists_Set1/Run3b_A[B,C]/*.root")

In [13]:
root_filenames_Set1Run1,root_filenames_Set1Run1_Sigs = remove_duplicate_files(root_filenames_Set1Run1_Sigs,
                                                                              root_filenames_Set1Run1_Bkgs,
                                                                              "/Users/wketchum/Data/MicroBooNE/FakeData2020/Set1/Run1_Sampler_Hists_Set1/Run1",
                                                                              6693842,"AA",6693859,"AC")
root_filenames_Set1Run3,root_filenames_Set1Run3_Sigs = remove_duplicate_files(root_filenames_Set1Run3_Sigs,
                                                                              root_filenames_Set1Run3_Bkgs,
                                                                              "/Users/wketchum/Data/MicroBooNE/FakeData2020/Set1/Run3_Sampler_Hists_Set1/Run3b",
                                                                              7165574,"AB",7165592,"AC")

/Users/wketchum/Data/MicroBooNE/FakeData2020/Set1/Run1_Sampler_Hists_Set1/Run1_AC/sampler_hist_-1.root not found. (/Users/wketchum/Data/MicroBooNE/FakeData2020/Set1/Run1_Sampler_Hists_Set1/Run1_SignalFiles/sampler_hist_6693859_-1.root)


In [14]:
root_filenames_CV   = glob.glob("/Users/wketchum/Data/MicroBooNE/FakeData2020/CVSet/*/*/sampler*.root")
root_filenames_Set1 = root_filenames_Set1Run1+root_filenames_Set1Run3
root_filenames_Set2 = glob.glob("/Users/wketchum/Data/MicroBooNE/FakeData2020/Set2/*/*/sampler*.root")
root_filenames_Set3 = glob.glob("/Users/wketchum/Data/MicroBooNE/FakeData2020/Set3/*/*/sampler*.root")
root_filenames_Set4 = glob.glob("/Users/wketchum/Data/MicroBooNE/FakeData2020/Set4/*/*/sampler*.root")
root_filenames_Set5 = glob.glob("/Users/wketchum/Data/MicroBooNE/FakeData2020/Set5/*/truth_Output.root")
root_filenames_Set7 = glob.glob("/Users/wketchum/Data/MicroBooNE/FakeData2020/Set7/*/*/sampler*.root")
root_filenames_Set8 = glob.glob("/Users/wketchum/Data/MicroBooNE/FakeData2020/Set8/*/*/sampler*.root")


In [None]:
rs_reco2_set1_files = ["/Users/wketchum/Data/MicroBooNE/FakeData2020/Set1/Set1_Run1_RSub.txt",
                 "/Users/wketchum/Data/MicroBooNE/FakeData2020/Set1/Set1_Run3_RSub.txt"]
rs_df_reco2 = pd.concat([pd.read_csv(f,names=["run","subrun"],header=None,sep=".") for f in rs_reco2_set1_files])

In [None]:
t_df,p_df,pot_df,id_df = convert_root(root_filenames_CV)
ev_df = create_event_df(t_df,p_df)
write_hdf("/Users/wketchum/Data/MicroBooNE/FakeData2020/CVSet/mcana_dfs_25Feb21.h5",
          t_df,p_df,pot_df,ev_df)

In [None]:
t_df,p_df,pot_df,id_df = convert_root(root_filenames_Set1)
ev_df = create_event_df(t_df,p_df)
ev_df = merge_on_run_subrun(ev_df,rs_df_reco2)
t_df = merge_on_run_subrun(t_df,rs_df_reco2)
pot_df = merge_on_run_subrun(pot_df,rs_df_reco2)
write_hdf("/Users/wketchum/Data/MicroBooNE/FakeData2020/Set1/mcana_dfs_25Feb21.h5",
          t_df,p_df,pot_df,ev_df,id_df)

In [None]:
t_df,p_df,pot_df,id_df = convert_root(root_filenames_Set2)
ev_df = create_event_df(t_df,p_df)
write_hdf("/Users/wketchum/Data/MicroBooNE/FakeData2020/Set2/mcana_dfs_25Feb21.h5",
          t_df,p_df,pot_df,ev_df,id_df)

In [None]:
t_df,p_df,pot_df,id_df = convert_root(root_filenames_Set3)
ev_df = create_event_df(t_df,p_df)
write_hdf("/Users/wketchum/Data/MicroBooNE/FakeData2020/Set3/mcana_dfs_25Feb21.h5",
          t_df,p_df,pot_df,ev_df,id_df)

In [None]:
t_df,p_df,pot_df,id_df = convert_root(root_filenames_Set4)
ev_df = create_event_df(t_df,p_df)
write_hdf("/Users/wketchum/Data/MicroBooNE/FakeData2020/Set4/mcana_dfs_25Feb21.h5",
          t_df,p_df,pot_df,ev_df,id_df)

In [None]:
t_df,p_df,pot_df,id_df = convert_root(root_filenames_Set5,id_tree=False)
ev_df = create_event_df(t_df,p_df)
write_hdf("/Users/wketchum/Data/MicroBooNE/FakeData2020/Set5/mcana_dfs_25Feb21.h5",
          t_df,p_df,pot_df,ev_df)

In [None]:
write_hdf("/Users/wketchum/Data/MicroBooNE/FakeData2020/Set5/mcana_dfs.h5",
          t_df,p_df,pot_df,ev_df)

In [None]:
t_df,p_df,pot_df,id_df = convert_root(root_filenames_Set7)
ev_df = create_event_df(t_df,p_df)
write_hdf("/Users/wketchum/Data/MicroBooNE/FakeData2020/Set7/mcana_dfs_25Feb21.h5",
          t_df,p_df,pot_df,ev_df)

In [15]:
t_df,p_df,pot_df,id_df = convert_root(root_filenames_Set8)
ev_df = create_event_df(t_df,p_df)
write_hdf("/Users/wketchum/Data/MicroBooNE/FakeData2020/Set8/mcana_dfs_19May21.h5",
          t_df,p_df,pot_df,ev_df,id_df)

Processing 23066 files
File /Users/wketchum/Data/MicroBooNE/FakeData2020/Set8/Run3a/AA/sampler_hist_344.root, trees not found.
	File /Users/wketchum/Data/MicroBooNE/FakeData2020/Set8/Run3a/AA/sampler_hist_344.root, No ID Tree. Skipping....
	Processed 500 files. 6705 events processed.
File /Users/wketchum/Data/MicroBooNE/FakeData2020/Set8/Run1/AB/sampler_hist_1095.root, trees not found.
	File /Users/wketchum/Data/MicroBooNE/FakeData2020/Set8/Run1/AB/sampler_hist_1095.root, No ID Tree. Skipping....
	Processed 1000 files. 12666 events processed.
	Processed 1500 files. 18369 events processed.
	Processed 2000 files. 24292 events processed.
	Processed 2500 files. 30542 events processed.
	Processed 3000 files. 36933 events processed.
	Processed 3500 files. 43423 events processed.
File /Users/wketchum/Data/MicroBooNE/FakeData2020/Set8/Run1/AA/sampler_hist_6120.root, trees not found.
	File /Users/wketchum/Data/MicroBooNE/FakeData2020/Set8/Run1/AA/sampler_hist_6120.root, No ID Tree. Skipping....

File /Users/wketchum/Data/MicroBooNE/FakeData2020/Set8/Run3b/AA/sampler_hist_3376.root, trees not found.
	File /Users/wketchum/Data/MicroBooNE/FakeData2020/Set8/Run3b/AA/sampler_hist_3376.root, No ID Tree. Skipping....
File /Users/wketchum/Data/MicroBooNE/FakeData2020/Set8/Run3b/AA/sampler_hist_3398.root, trees not found.
	File /Users/wketchum/Data/MicroBooNE/FakeData2020/Set8/Run3b/AA/sampler_hist_3398.root, No ID Tree. Skipping....
	Processed 22000 files. 296285 events processed.
File /Users/wketchum/Data/MicroBooNE/FakeData2020/Set8/Run3b/AA/sampler_hist_268.root, trees not found.
	File /Users/wketchum/Data/MicroBooNE/FakeData2020/Set8/Run3b/AA/sampler_hist_268.root, No ID Tree. Skipping....
	Processed 22500 files. 303274 events processed.
File /Users/wketchum/Data/MicroBooNE/FakeData2020/Set8/Run3b/AA/sampler_hist_8649.root, trees not found.
	File /Users/wketchum/Data/MicroBooNE/FakeData2020/Set8/Run3b/AA/sampler_hist_8649.root, No ID Tree. Skipping....
File /Users/wketchum/Data/Mi

  entrypoints.init_all()
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->bytes,key->block2_values] [items->Index(['process', 'endprocess'], dtype='object')]

  pytables.to_hdf(
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block3_values] [items->Index(['process', 'endprocess', 'process_lep', 'endprocess_lep', 'process_p1',
       'endprocess_p1', 'process_p2', 'endprocess_p2', 'process_p3',
       'endprocess_p3', 'process_p4', 'endprocess_p4', 'process_pi0',
       'endprocess_pi0', 'process_gamma', 'endprocess_gamma'],
      dtype='object')]

  pytables.to_hdf(
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->bytes,key->block1_values] [items->Index(['dataset_old'], dtype='object')]

  pytables.to_hdf(


In [None]:
pot_df = pot_df.sort_index()
pot_df.loc[:8000]

In [None]:
id_df["run_new"].unique()