In [1]:
import time; 

#io
import os
import json

#data
import numpy as np
import pandas as pd



## Hit level

In [2]:
# Detector/source labels; the position of a label in this list is its numeric type code.
types = ["ECAL","HCAL","IT","OT","TT","VELOR1","VELOR2","VELOPHI1","VELOPHI2","MUON","MUON2"]
type_code = {c:i for i,c in enumerate(types)}

def collect_hits(evt,types_dict = type_code):
    """Collect all hits of a json event dictionary into one array.

    Each output row is ``[x, y, z, type_code]``. The coordinates are read from
    the per-detector hit lists as follows:

    * ``ECAL``/``HCAL``: elements 1..3 of each hit.
    * ``TT``/``OT``/``IT``: elements 0..2 of each hit.
    * ``MUON`` (optional key): two rows per hit. The first row uses the
      even-index elements (0, 2, 4) and is tagged ``MUON``; the second row adds
      the odd-index elements (1, 3, 5) to them and is tagged ``MUON2``
      (presumably the layout is x, dx, y, dy, z, dz -- TODO confirm).
    * ``VELOR``/``VELOPHI``: elements 0..2 of each hit, emitted twice, once
      with the ``<name>1`` code and once with the ``<name>2`` code.
      NOTE(review): both rows carry identical coordinates -- confirm intended.

    Args:
        evt: event dictionary holding a list of hits (each hit a list) under
            the detector names above.
        types_dict: mapping from source label to numeric type code.

    Returns:
        np.ndarray with one row per collected hit and four columns.

    Raises:
        ValueError: (from ``np.vstack``) if the event contains no hits at all.
    """
    arrays = []

    # calorimeters: coordinates are stored at positions 1..3
    for cal in ("ECAL", "HCAL"):
        cal_rows = [hit[1:4] + [types_dict[cal]] for hit in evt[cal]]
        arrays.append(np.array(cal_rows))

    # tracking stations: coordinates are stored at positions 0..2
    for tracker in ("TT", "OT", "IT"):
        tracker_rows = [hit[0:3] + [types_dict[tracker]] for hit in evt[tracker]]
        arrays.append(np.array(tracker_rows))

    # muon detector: skip when the key is absent *or* the list is empty; an
    # empty list would yield a 1-D array and break the column assignment below
    if "MUON" in evt and len(evt["MUON"]) != 0:
        muon_first = np.array([hit[::2] + [types_dict["MUON"]] for hit in evt["MUON"]])
        arrays.append(muon_first)

        # pad the offsets with a placeholder so shapes match; the type column
        # of the shifted points is overwritten right after the addition
        muon_offsets = np.array([hit[1::2] + [0] for hit in evt["MUON"]])
        muon_second = muon_first + muon_offsets
        muon_second[:, -1] = types_dict["MUON2"]
        arrays.append(muon_second)

    # velo detector: every hit is emitted under both the "1" and the "2" code
    for velo in ("VELOR", "VELOPHI"):
        for suffix in ("1", "2"):
            velo_rows = [hit[:3] + [types_dict[velo + suffix]] for hit in evt[velo]]
            arrays.append(np.array(velo_rows))

    # compose a single array; a real list (not a lazy filter object) is needed
    # because np.vstack requires a sequence on Python 3
    non_empty = [a for a in arrays if len(a) != 0]
    points = np.vstack(non_empty)
    return points

In [3]:
import pandas
def extract_hits(evt):
    """Return the hits of an event as a DataFrame, or False if it has no ECAL hits.

    The frame has the columns ``X``, ``Y``, ``Z`` and ``source``; ``source``
    holds the label from ``types`` that corresponds to the numeric code
    produced by ``collect_hits``.
    """
    # events without any ECAL hit are rejected with the False sentinel
    if not len(evt["ECAL"]):
        return False

    points = collect_hits(evt, type_code)
    frame = pandas.DataFrame(points, columns=['X', 'Y', 'Z', 'source'])

    # translate numeric type codes back into their string labels
    codes = frame["source"].values.astype(int)
    frame["source"] = np.array(types)[codes]
    return frame

In [4]:
def extract_tracks(evt):
    """Build a DataFrame with one row per entry of ``evt["PARTICLES"]``.

    Every particle dictionary is copied, its ``"track"`` entry (a sequence of
    ``(x, y, z)`` triples) is removed and replaced by the flat columns
    ``x0, y0, z0, x1, y1, z1, ...``. The input dictionaries are not modified.
    """
    rows = []
    for particle in evt["PARTICLES"]:
        row = dict(particle)  # shallow copy so the caller's dict keeps its track
        points = row.pop("track")
        for idx, point in enumerate(points):
            px, py, pz = point
            suffix = str(idx)
            row['x' + suffix] = px
            row['y' + suffix] = py
            row['z' + suffix] = pz
        rows.append(row)

    return pandas.DataFrame(rows)
            

In [5]:
def preprocess(json_folder,outdir_tracks,outdir_hits,n_shards=1,shard_i=0,omit_tracks=False):
    """Convert the JSON events of a folder into per-event CSV files.

    Every ``*.json`` file of this shard is loaded and passed to
    ``extract_hits``. Unless that returns ``False`` the hits are written to
    ``<outdir_hits>/<eventid>.hits.csv`` and, unless ``omit_tracks`` is set,
    the result of ``extract_tracks`` is written to
    ``<outdir_tracks>/<eventid>.tracks.csv``. ``eventid`` is the file name up
    to its first dot. Files that cannot be read or parsed are skipped.

    Args:
        json_folder: directory containing the ``*.json`` event files.
        outdir_tracks: output directory for track CSVs (created if missing).
        outdir_hits: output directory for hit CSVs (created if missing).
        n_shards: total number of shards the file list is split into.
        shard_i: index of the shard handled by this call; it takes every
            ``n_shards``-th file starting at position ``shard_i``.
        omit_tracks: when true, only the hit CSVs are written.

    Returns:
        None. A progress line is printed after every 100 written events.
    """
    # os.listdir order is arbitrary; sorting makes the shard assignment stable
    # so that separate calls with different shard_i see disjoint file sets
    names = sorted(fname for fname in os.listdir(json_folder) if fname.endswith(".json"))
    names = names[shard_i::n_shards]

    for outdir in (outdir_hits, outdir_tracks):
        _ensure_dir(outdir)

    # shuffle the processing order; names (not a separate path array) are
    # permuted so each event id always stays paired with its own file
    order = np.random.permutation(len(names))
    names = [names[k] for k in order]

    itr = 0
    for name in names:
        eventid = name.split(".")[0]#runnumber_eventnumber
        path = os.path.join(json_folder, name)
        try:
            with open(path) as handle:
                evt = json.load(handle)
        except (IOError, OSError, ValueError):
            # best effort: unreadable or malformed event files are skipped
            continue

        hit_df = extract_hits(evt)
        if hit_df is False:
            continue

        hit_df.to_csv(os.path.join(outdir_hits, eventid + ".hits.csv"))

        if not omit_tracks:
            track_df = extract_tracks(evt)
            track_df.to_csv(os.path.join(outdir_tracks, eventid + ".tracks.csv"))

        itr += 1
        if itr % 100 == 0:
            # single-argument form prints identically on Python 2 and 3
            print("%d events" % itr)


def _ensure_dir(path):
    """Create ``path`` (including parents) unless it already is a directory."""
    try:
        os.makedirs(path)
    except OSError:
        # already existing (possibly created concurrently) is fine; anything
        # else would make the later CSV writes fail, so surface it here
        if not os.path.isdir(path):
            raise

In [6]:
# The string literal below is never executed as shell; it only holds rm/tar/mv
# command lines. NOTE(review): presumably kept as a reference for unpacking the
# background/signal JSON archives -- confirm before relying on these paths.
"""
rm -rf /mnt/KSfinder/mc_bg/bg_jsons_*
tar -xvf /root/bg_down.tar.gz -C /mnt/KSfinder/mc_bg/
mv /mnt/KSfinder/mc_bg/bg_jsons_all/ /mnt/KSfinder/mc_bg/bg_jsons_down/
tar -xvf /root/bg_up.tar.gz -C /mnt/KSfinder/mc_bg/
tar -xvf /root/sig_jsns.tar.gz -C /mnt/KSfinder/mc_sig/
"""

'\nrm -rf /mnt/KSfinder/mc_bg/bg_jsons_*\ntar -xvf /root/bg_down.tar.gz -C /mnt/KSfinder/mc_bg/\nmv /mnt/KSfinder/mc_bg/bg_jsons_all/ /mnt/KSfinder/mc_bg/bg_jsons_down/\ntar -xvf /root/bg_up.tar.gz -C /mnt/KSfinder/mc_bg/\ntar -xvf /root/sig_jsns.tar.gz -C /mnt/KSfinder/mc_sig/\n'

In [None]:
# sklearn.externals.joblib only exists in old scikit-learn releases; prefer the
# standalone joblib package and fall back for legacy environments.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# One preprocess task per shard of the bg_up JSON folder. Hits and tracks share
# one output directory; their files differ by the .hits.csv/.tracks.csv suffix.
n_shards = 1
tasks = [
    joblib.delayed(preprocess)(
        json_folder="../data/jsons/bg_up",
        outdir_hits="../data/bg_up",
        outdir_tracks="../data/bg_up",
        n_shards=n_shards,
        shard_i=i
    )
    for i in range(n_shards)
]

# n_jobs=1: the tasks are executed one after another
_ = joblib.Parallel(n_jobs=1)(tasks)


In [None]:
# sklearn.externals.joblib only exists in old scikit-learn releases; prefer the
# standalone joblib package and fall back for legacy environments.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# One preprocess task per shard of the bg_down JSON folder. Hits and tracks share
# one output directory; their files differ by the .hits.csv/.tracks.csv suffix.
n_shards = 1
tasks = [
    joblib.delayed(preprocess)(
        json_folder="../data/jsons/bg_down",
        outdir_hits="../data/bg_down",
        outdir_tracks="../data/bg_down",
        n_shards=n_shards,
        shard_i=i
    )
    for i in range(n_shards)
]

# n_jobs=1: the tasks are executed one after another
_ = joblib.Parallel(n_jobs=1)(tasks)
