# About

This is a base solution of PID.

In [1]:
%matplotlib inline
import pandas
import numpy
import matplotlib.pyplot as plt
import root_numpy
import os

# Get files https

In [2]:
def get_files_https(files):
    """Translate '<path>:<tree name>' entries into (http url, tree name) pairs.

    The local prefix "/r02/lhcb/jonesc/ANNPID/" of every path is replaced by
    the public mirror "http://www.hep.phy.cam.ac.uk/~jonesc/lhcb/PID/".
    Paths that do not contain the prefix are passed through unchanged.
    Despite the function name, the produced URLs use plain http.

    Parameters
    ----------
    files : iterable of str (or bytes)
        Entries of the form '<file path>:<tree name>'. Byte strings, such as
        those produced by numpy.loadtxt(dtype='S') under Python 3, are decoded.

    Returns
    -------
    list of (str, str)
        One (file url, tree name) tuple per input entry, in input order.

    Raises
    ------
    ValueError
        If an entry contains no ':' separator.
    """
    files_http = []

    for one_file in files:

        # Under Python 2 bytes is str, so this branch only fires on Python 3.
        if isinstance(one_file, bytes) and not isinstance(one_file, str):
            one_file = one_file.decode()

        # Split on the LAST colon only: the path part may itself contain
        # colons (URL scheme, port, drive letter), the tree name does not.
        one_file_path, one_file_tree_name = one_file.rsplit(':', 1)

        one_file_http = one_file_path.replace("/r02/lhcb/jonesc/ANNPID/",
                                              "http://www.hep.phy.cam.ac.uk/~jonesc/lhcb/PID/")

        files_http.append((one_file_http, one_file_tree_name))

    return files_http

In [3]:
# Text file listing the training ntuples; get_files_https expects every entry
# to look like '<file path>:<tree name>'. Lines starting with '#' are skipped.
path_files_txt = '../../../data/configs/training/MC2015Sim09Dev03-TrainingFiles-Mixture-Cambridge.txt'
files = numpy.loadtxt(path_files_txt, dtype='S', delimiter='\n', comments='#')

# Map the local paths onto their http mirror.
files_http = get_files_https(files)

# Parenthesised form prints the same under Python 2 and also parses under Python 3.
print(len(files_http))
files_http[:5]

552


[('http://www.hep.phy.cam.ac.uk/~jonesc/lhcb/PID/ProtoParticlePIDtuples/MC/Sim09Devx/Reco15x/Dev03/ANNPID.1.root',
  'ANNPID/DecayTree'),
 ('http://www.hep.phy.cam.ac.uk/~jonesc/lhcb/PID/ProtoParticlePIDtuples/MC/Sim09Devx/Reco15x/Dev03/ANNPID.2.root',
  'ANNPID/DecayTree'),
 ('http://www.hep.phy.cam.ac.uk/~jonesc/lhcb/PID/ProtoParticlePIDtuples/MC/Sim09Devx/Reco15x/Dev03/ANNPID.3.root',
  'ANNPID/DecayTree'),
 ('http://www.hep.phy.cam.ac.uk/~jonesc/lhcb/PID/ProtoParticlePIDtuples/MC/Sim09Devx/Reco15x/Dev03/ANNPID.4.root',
  'ANNPID/DecayTree'),
 ('http://www.hep.phy.cam.ac.uk/~jonesc/lhcb/PID/ProtoParticlePIDtuples/MC/Sim09Devx/Reco15x/Dev03/ANNPID.5.root',
  'ANNPID/DecayTree')]

# Get numbers of particles

In [4]:
def get_number_particles(files_http, particles_pdg, selection, log_file_name='get_number_particles.log',
                         max_attempts=None):
    """Count, file by file, how many selected tracks belong to each particle type.

    For every (url, tree name) pair the 'MCParticleType' branch is read with
    root_numpy.root2array using `selection`, and the entries whose absolute
    value equals each code in `particles_pdg` are counted. Progress and
    failures are written to a log file.

    Parameters
    ----------
    files_http : sequence of (str, str)
        (file url, tree name) pairs.
    particles_pdg : sequence of int
        Particle codes compared against abs(MCParticleType).
    selection : str
        Selection expression forwarded to root_numpy.root2array.
    log_file_name : str
        Path of the log file; it is overwritten.
    max_attempts : int or None
        Maximum number of read attempts per file. None (the default) keeps
        retrying until the file is read, as before. When the limit is hit the
        counts of that file are set to NaN and the scan continues.

    Returns
    -------
    pandas.DataFrame
        One row per input file: a count column per particle code (named by
        str(code)) plus the 'http' and 'tree_name' columns.
    """
    numbers_particles = numpy.zeros((len(files_http), len(particles_pdg)))

    # The context manager guarantees the log is closed even if the scan is
    # interrupted or an unexpected error propagates.
    with open(log_file_name, 'w') as log:
        log.write('Particles pdgs: ' + str(particles_pdg) + '\n')
        log.write('Selection: ' + selection + '\n')
        log.flush()

        for num, (one_file_https, one_file_tree) in enumerate(files_http):

            attempt = 0
            while True:
                attempt += 1
                try:
                    mc_particle_type = root_numpy.root2array(filenames=one_file_https,
                                                             treename=one_file_tree,
                                                             branches='MCParticleType',
                                                             selection=selection)
                except Exception as error:
                    # `Exception` (not a bare except) lets KeyboardInterrupt and
                    # SystemExit stop the retry loop.
                    log.write(one_file_https + ' is not readed.' + '\n')
                    log.write('Error: ' + repr(error) + '\n')
                    log.flush()

                    if max_attempts is not None and attempt >= max_attempts:
                        # NaN marks "unknown" so the row is not mistaken for a
                        # file that genuinely holds no tracks.
                        numbers_particles[num, :] = numpy.nan
                        log.write('Giving up on ' + one_file_https + '\n')
                        log.flush()
                        break
                    continue

                # Counting happens outside the guarded read, so a programming
                # error here surfaces instead of being retried forever.
                abs_particle_type = numpy.abs(mc_particle_type)
                for num_pdg, one_pdg in enumerate(particles_pdg):
                    numbers_particles[num, num_pdg] = (abs_particle_type == one_pdg).sum()

                log.write(str(num) + '. ' + one_file_https + '\n')
                log.write('Numbers of particles: ' + str(numbers_particles[num, :]) + '\n')
                log.flush()
                break

    numbers_particles_df = pandas.DataFrame(data=numbers_particles, columns=[str(i) for i in particles_pdg])
    # Plain lists keep this working for an empty `files_http`, where
    # numpy.array(files_http) is 1-D and cannot be sliced by column.
    numbers_particles_df['http'] = [one_pair[0] for one_pair in files_http]
    numbers_particles_df['tree_name'] = [one_pair[1] for one_pair in files_http]

    return numbers_particles_df

In [5]:
# Cut expression that the helper functions forward to the `selection=` argument
# of root_numpy.root2array. The backslash continuations sit inside the string
# literal, so the leading spaces of each continued line are part of the string.
# NOTE(review): the TrackType-3 cut presumably keeps "Long" tracks, matching the
# directory name below — confirm.
selection = '(!HasMC || MCVertexType==1 || MCVertexType==2) && \
             (TrackLikelihood > -100.0) && \
             (TrackP > 0) && \
             (TrackPt > 0) && (abs(TrackType-3) < 0.1)'

# Directory holding the cached per-file counts, the generated samples and their logs.
dir_path = '../../../data/MC2015Sim09Dev03/Generation1/Long'
# Particle codes compared against abs(MCParticleType) when counting tracks.
# NOTE(review): 0 presumably labels tracks without a matched MC particle — confirm.
particles_pdg = [11,13,211,321,2212,0]

In [6]:
# log_file_name = dir_path + '/get_number_particles.log'
# numbers_particles_df = get_number_particles(files_http, 
#                                             particles_pdg, 
#                                             selection, 
#                                             log_file_name=log_file_name)

# numbers_particles_df.head()

In [7]:
# Load the per-file particle counts from the CSV cache instead of re-running
# get_number_particles; the first CSV column is used as the row index.
numbers_particles_df = pandas.read_csv(dir_path + '/number_particles.csv', index_col=0)
# Uncomment to (re)write the cache after recomputing the counts:
#numbers_particles_df.to_csv(dir_path + '/number_particles.csv')
numbers_particles_df.head()

Unnamed: 0,11,13,211,321,2212,0,http,tree_name
0,1419,23390,285801,42435,32373,60454,http://www.hep.phy.cam.ac.uk/~jonesc/lhcb/PID/...,ANNPID/DecayTree
1,2476,3928,317760,69607,31487,70250,http://www.hep.phy.cam.ac.uk/~jonesc/lhcb/PID/...,ANNPID/DecayTree
2,15424,23188,1584021,241895,166204,359059,http://www.hep.phy.cam.ac.uk/~jonesc/lhcb/PID/...,ANNPID/DecayTree
3,11810,123150,1494633,266713,158086,337539,http://www.hep.phy.cam.ac.uk/~jonesc/lhcb/PID/...,ANNPID/DecayTree
4,12206,128919,1509125,255125,153070,330710,http://www.hep.phy.cam.ac.uk/~jonesc/lhcb/PID/...,ANNPID/DecayTree


# Train generation

In [8]:
from sklearn.cross_validation import train_test_split
import gc
import sys

def _load_readed_files(readed_files_txt):
    """Return the names recorded as already processed, creating an empty record if missing."""
    if not os.path.exists(readed_files_txt):
        open(readed_files_txt, 'w').close()

    readed = []
    with open(readed_files_txt) as record:
        for line in record:
            # Same conventions as the previous loadtxt call: '#' starts a
            # comment and blank lines are ignored.
            name = line.split('#', 1)[0].strip()
            if name:
                readed.append(name)
    return readed


def _save_readed_files(readed_files_txt, readed):
    """Rewrite the record of processed files, one name per line."""
    with open(readed_files_txt, 'w') as record:
        record.write('\n'.join(readed))


def _count_saved_tracks(file_path, particles):
    """Count the tracks per particle already stored in the output CSV (zeros if unreadable)."""
    zeros = dict((pdg, 0) for pdg in particles)
    if not os.path.exists(file_path):
        return zeros

    try:
        saved = pandas.read_csv(file_path, usecols=['MCParticleType'])
        abs_types = numpy.abs(saved.MCParticleType.values)
        return dict((pdg, int((abs_types == int(pdg)).sum())) for pdg in particles)
    except Exception:
        # An unreadable or malformed output file is treated as empty, as before.
        return zeros


def _pick_track_index(candidates, number_take):
    """Pick `number_take` index labels out of `candidates`, clamped to what is available."""
    n_available = len(candidates)
    if number_take <= 0 or n_available == 0:
        return []
    if number_take >= n_available:
        # train_test_split refuses to leave the complementary part empty.
        return list(candidates)

    taken, _ = train_test_split(candidates, train_size=number_take, random_state=42)
    return list(taken)


def get_sample(numbers_particles, n_tracks, selection, file_path, log_path, readed_files_txt,
               max_attempts=None):
    """Build a CSV sample holding about `n_tracks` tracks of every particle type.

    Each source file contributes, per particle type, the share
    n_tracks / (total count of that type over all listed files) of its own
    count. Selected rows are appended to `file_path`; the names of processed
    files are stored in `readed_files_txt` so an interrupted run can be resumed.

    Parameters
    ----------
    numbers_particles : pandas.DataFrame
        Per-file counts: one column per particle code plus 'http' and
        'tree_name' columns.
    n_tracks : int
        Target number of tracks per particle type.
    selection : str
        Selection expression forwarded to root_numpy.root2array.
    file_path : str
        Output CSV; rows are appended, the header is written on creation.
    log_path : str
        Log file; created with a header if missing, otherwise appended to.
    readed_files_txt : str
        Text file with one processed source file per line.
    max_attempts : int or None
        Maximum number of attempts per source file. None (the default) keeps
        retrying until the file is processed, as before. When the limit is hit
        the file is skipped and NOT recorded as processed.

    Returns
    -------
    int
        Always 1.
    """
    # Estimate how many tracks of each particle should be taken from each file
    particles = numbers_particles.columns.drop(['http', 'tree_name'])
    part = 1. * n_tracks / numbers_particles[particles].sum()

    # Create the log with its header on first use, append on later runs
    is_new_log = not os.path.exists(log_path)
    log = open(log_path, 'w' if is_new_log else 'a')

    try:
        if is_new_log:
            log.write('Particles pdgs: ' + str(particles) + '\n')
            log.write('Selection: ' + selection + '\n')
            log.write('Number of tracks: ' + str(n_tracks) + '\n')
            log.flush()

        # Source files handled by previous runs
        readed = _load_readed_files(readed_files_txt)

        # Tracks already stored in the output file
        numbers_per_particle = _count_saved_tracks(file_path, particles)

        for index in numbers_particles.index:

            file_http = numbers_particles.loc[index, 'http']
            tree_name = numbers_particles.loc[index, 'tree_name']

            # Was the file read before?
            if file_http in readed:
                continue

            attempt = 0
            while True:
                attempt += 1
                try:
                    branches = numpy.array(root_numpy.list_branches(file_http, treename=tree_name))
                    branches = branches[branches != 'piplus_OWNPV_COV_']

                    data_array = root_numpy.root2array(filenames=file_http,
                                                       treename=tree_name,
                                                       branches=branches,
                                                       selection=selection)

                    data = pandas.DataFrame(data=data_array, columns=branches)

                    log.write(file_http + '\n')
                    log.flush()

                    p_type = numpy.abs(data['MCParticleType'].values)
                    data_iter_index = []
                    taken_here = {}

                    for one_particle in particles:

                        candidates = data.index[p_type == int(one_particle)]

                        expected = part[one_particle] * numbers_particles.loc[index, one_particle]
                        # A particle with zero total count gives inf/nan here;
                        # nothing can be taken in that case.
                        number_take = int(round(expected)) if numpy.isfinite(expected) else 0

                        picked = _pick_track_index(candidates, number_take)
                        data_iter_index += picked
                        taken_here[one_particle] = len(picked)

                    data_iter = data.loc[data_iter_index]
                    data_iter.to_csv(file_path, mode='a', header=not os.path.exists(file_path))

                except Exception as error:
                    # `Exception` (not a bare except) lets KeyboardInterrupt
                    # stop the loop; the cause is logged for diagnosis.
                    log.write('Unexpected error \n')
                    log.write(file_http + ': ' + repr(error) + '\n')
                    log.flush()

                    if max_attempts is not None and attempt >= max_attempts:
                        log.write('Giving up on ' + file_http + '\n')
                        log.flush()
                        break

                else:
                    # Bookkeeping runs only after the rows are on disk and is
                    # not retried, so counts are never added twice and rows
                    # are never appended twice because of a bookkeeping error.
                    for one_particle in particles:
                        numbers_per_particle[one_particle] += taken_here[one_particle]

                    del data_iter, data, data_array
                    gc.collect()

                    readed.append(file_http)
                    _save_readed_files(readed_files_txt, readed)

                    log.write('Tracks selected: ' + str(numbers_per_particle) + '\n')
                    log.flush()
                    break

    finally:
        log.close()

    return 1

In [None]:
# Source files on even-numbered rows of the count table feed the training sample.
index_sel = (numbers_particles_df.index.values % 2) == 0
numbers_particles_train = numbers_particles_df.loc[index_sel]

get_sample(numbers_particles=numbers_particles_train,
           n_tracks=1000000,
           selection=selection,
           file_path=dir_path + '/data_train.csv',
           log_path=dir_path + '/get_sample_train.log',
           readed_files_txt=dir_path + '/readed_train.log')

In [None]:
# Reload two columns of the generated training sample and show its row count.
# NOTE(review): with `usecols` given, index_col=0 presumably refers to the first
# of the *selected* columns, so one of them becomes the index instead of staying
# a data column — confirm against the pandas docs. The row count is unaffected.
data_train = pandas.read_csv(dir_path + '/data_train.csv', index_col=0, usecols=['MCParticleType', 'TrackType'])
len(data_train)

# Eval generation

In [None]:
# Source files on odd-numbered rows of the count table feed the evaluation sample.
index_sel = (numbers_particles_df.index.values % 2) == 1
numbers_particles_eval = numbers_particles_df.loc[index_sel]

get_sample(numbers_particles=numbers_particles_eval,
           n_tracks=1000000,
           selection=selection,
           file_path=dir_path + '/data_eval.csv',
           log_path=dir_path + '/get_sample_eval.log',
           readed_files_txt=dir_path + '/readed_eval.log')

In [None]:
# Reload the particle-type column of the generated evaluation sample and show its row count.
# NOTE(review): with `usecols` given, index_col=0 presumably turns the only
# selected column into the index, leaving a frame without data columns —
# confirm against the pandas docs. The row count is unaffected.
data_eval = pandas.read_csv(dir_path + '/data_eval.csv', index_col=0, usecols=['MCParticleType'])
len(data_eval)