-
Notifications
You must be signed in to change notification settings - Fork 6
/
preprocessing.py
260 lines (209 loc) · 8.89 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
from __future__ import absolute_import, division, print_function
import os
import numpy as np
import matplotlib.pyplot as plt
from root_numpy import root2array
def conv_root_to_np(save_path, name, files, treename=None, branches=None):
    """Read ROOT trees with ``root2array`` and store the result with ``np.save``.

    The files are converted first; afterwards ``save_path`` is created if it
    is not an existing directory and the array is written to
    ``save_path + '/' + name``.

    Arguments
    ---------
    save_path (str):
        Path to the directory the array will be saved in.
    name (str):
        File name of the saved array inside ``save_path``.
    files (str or list(str)):
        The ROOT file(s) that will be converted into one structured array.
    treename (str, optional (default=None)):
        Name of the tree to convert.
    branches (str or list(str), optional (default=None)):
        Branch names to include as columns of the array. If None, all
        branches will be included.
    """
    # Convert before touching the file system, so a failed conversion does
    # not leave an empty output directory behind.
    tree_data = root2array(files, treename, branches)

    target_dir_exists = os.path.isdir(save_path)
    if not target_dir_exists:
        os.makedirs(save_path)

    np.save('/'.join((save_path, name)), tree_data)
class GetVariables(object):
    """Select events of one category and write train/val/test arrays.

    Signal and background structured arrays are loaded from ``.npy`` files,
    reduced to the requested variables, filtered by category, labelled
    (signal 1.0, background 0.0) and split into train/val/test sets. Each row
    of a saved array is laid out as ``[label, variables..., weight]``.

    Attributes
    ----------
    save_in : str
        Directory the processed arrays are saved to: the ``save_in``
        constructor argument joined with the category name.
    _category : str
        The category events have to belong to in order to be kept (a key of
        ``_CATEGORY_CUTS`` or ``'all'``).
    _vars : list(str)
        Names of the branches used as input variables.
    _weights : list(str)
        Names of the branches that are multiplied to get the event weight.
    """

    # Vector-like branches; only the first _N_JETS entries of these are kept.
    _JET_BRANCHES = frozenset([
        'CSV', 'Jet_CSV', 'Jet_CosThetaStar_Lepton', 'Jet_CosTheta_cm',
        'Jet_Deta_Jet1', 'Jet_Deta_Jet2', 'Jet_Deta_Jet3', 'Jet_Deta_Jet4',
        'Jet_E', 'Jet_Eta', 'Jet_Flav', 'Jet_GenJet_Eta', 'Jet_GenJet_Pt',
        'Jet_M', 'Jet_PartonFlav', 'Jet_Phi', 'Jet_PileUpID', 'Jet_Pt',
    ])
    _N_JETS = 4

    # name: (jet comparison, n_jets, b-tag comparison, n_btags). Every
    # category additionally requires exactly one loose and one tight lepton.
    # NOTE(review): '62' accepts N_BTM >= 2 and therefore overlaps with '63'
    # and '64'; the other categories suggest '== 2' may have been intended.
    # Kept as in the original - confirm before changing.
    _CATEGORY_CUTS = {
        '43': (np.equal, 4, np.equal, 3),
        '44': (np.equal, 4, np.equal, 4),
        '53': (np.equal, 5, np.equal, 3),
        '54': (np.equal, 5, np.greater_equal, 4),
        '62': (np.greater_equal, 6, np.greater_equal, 2),
        '63': (np.greater_equal, 6, np.equal, 3),
        '64': (np.greater_equal, 6, np.greater_equal, 4),
    }

    def __init__(self, variablelist, weightlist, category, save_in):
        """Store the configuration and create the output directory.

        Arguments
        ---------
        variablelist : list(str)
            Branches to use as input variables.
        weightlist : list(str)
            Branches whose product is the event weight.
        category : str
            Name of the category to select.
        save_in : str
            Parent directory; arrays are written to ``save_in/category``.
        """
        self._category = category
        # create dir where arrays will be saved in
        self.save_in = os.path.join(save_in, category)
        if not os.path.isdir(self.save_in):
            os.makedirs(self.save_in)
            print('created directory {}'.format(self.save_in))
        self._vars = variablelist
        self._weights = weightlist

    def run(self, sig_paths, bkg_paths):
        """Process signal and background files and save the split arrays.

        Arguments
        ---------
        sig_paths : list(str)
            Paths of the ``.npy`` files holding signal events.
        bkg_paths : list(str)
            Paths of the ``.npy`` files holding background events.
        """
        # load structured arrays
        structured_sig = self._load_array(sig_paths)
        structured_bkg = self._load_array(bkg_paths)

        # get all variables from self._vars from the structured array as a
        # 2d np.array
        sig = self._get_vars(structured_sig, self._vars)
        bkg = self._get_vars(structured_bkg, self._vars)

        sig['weights'] = self._get_unnormalized_weights(structured_sig)
        bkg['weights'] = self._get_unnormalized_weights(structured_bkg)

        sig = self._get_category(sig, structured_sig)
        bkg = self._get_category(bkg, structured_bkg)

        sig['labels'] = self._get_labels(sig['data'].shape[0], 1.0)
        bkg['labels'] = self._get_labels(bkg['data'].shape[0], 0.0)

        self._save_array(sig, bkg)

    def _load_array(self, path_list):
        """Load all structured arrays given in the path list and stack them
        along axis 0.

        Arguments
        ---------
        path_list : list(str)
            Paths of the ``.npy`` files to load.

        Returns
        -------
        array : numpy structured array
            Concatenation of all loaded arrays.
        """
        # NOTE(review): if the saved arrays contain object-dtype fields,
        # recent NumPy versions refuse to load them unless
        # allow_pickle=True is passed. Only enable that for trusted files.
        arrays = [np.load(path) for path in path_list]
        return np.concatenate(arrays, axis=0)

    def _get_vars(self, structured_array, var_list):
        """Copy the requested branches into a plain 2d numpy ndarray.

        If a branch is vector-like (listed in ``_JET_BRANCHES``), only its
        first four entries are kept and become four columns named
        ``<branch>_1`` ... ``<branch>_4``.

        Arguments
        ---------
        structured_array : numpy structured array
            Events to read the branches from.
        var_list : list(str)
            Names of the branches to extract.

        Returns
        -------
        data_dict : dict
            ``'data'`` holds the ndarray with one row per event, ``'vars'``
            the list of column names in the same order.
        """
        columns = []
        names = []
        for var in var_list:
            if var in self._JET_BRANCHES:
                # only keep the leading entries of the jet vector
                jets = [jet[:self._N_JETS] for jet in structured_array[var]]
                columns.append(np.vstack(jets))
                names.extend('{}_{}'.format(var, i)
                             for i in range(1, self._N_JETS + 1))
            else:
                columns.append(structured_array[var].reshape(-1, 1))
                names.append(var)
        return {'data': np.hstack(columns), 'vars': names}

    def _get_unnormalized_weights(self, structured_array):
        """Calculate the weight for each event.

        The weight of an event is the product of all branches listed in
        ``self._weights``. No normalization is applied here; the weights are
        normalized per subset in ``_split_array``.

        Arguments
        ---------
        structured_array : numpy structured array
            Structured array converted from ROOT file.

        Returns
        -------
        weights : numpy ndarray
            An array of shape (-1,1) filled with the weight of each event.
        """
        factors = self._get_vars(structured_array, self._weights)
        return np.prod(factors['data'], axis=1).reshape(-1, 1)

    def _get_labels(self, n_events, label):
        """Create labels.

        Arguments
        ---------
        n_events : int
            Number labels to create.
        label : float
            Label.

        Returns
        -------
        label : numpy ndarray
            A numpy ndarray of shape (n_events, 1) filled with the label.
        """
        return np.full(shape=(n_events, 1), fill_value=label)

    def _get_category(self, data_dict, structured_array):
        """Keep only the events that belong to ``self._category``.

        Arguments
        ---------
        data_dict : dict
            Dictionary filled with event variables and corresponding weights.
        structured_array : numpy structured array
            Structured array converted from ROOT file; provides the lepton,
            jet and b-tag counts the selection is based on.

        Returns
        -------
        keep_dict : dict
            Same layout as ``data_dict`` (keys ``'data'``, ``'weights'``,
            ``'vars'``), restricted to the selected events.
        """
        # Evaluate the selection for all events at once instead of looping
        # over them in Python.
        keep = np.asarray(
            self._check_category(structured_array['N_LooseLeptons'],
                                 structured_array['N_TightLeptons'],
                                 structured_array['N_Jets'],
                                 structured_array['N_BTagsM'],
                                 self._category),
            dtype=bool)
        return {'data': data_dict['data'][keep],
                'weights': data_dict['weights'][keep],
                'vars': data_dict['vars']}

    def _check_category(self, N_LL, N_TL, N_J, N_BTM, name):
        """Return whether events belong to the category ``name``.

        Works element-wise: scalars give a single bool, arrays give a boolean
        array of the same shape.

        Arguments
        ---------
        N_LL (int or ndarray): N_LooseLeptons
        N_TL (int or ndarray): N_TightLeptons
        N_J (int or ndarray): N_Jets
        N_BTM (int or ndarray): N_BTagsM
        name (str): Category name; a key of ``_CATEGORY_CUTS`` or ``'all'``.

        Raises
        ------
        KeyError
            If ``name`` is not a known category.
        """
        if name == 'all':
            passed = np.ones(np.shape(N_LL), dtype=bool)
        else:
            jet_cmp, n_jets, btag_cmp, n_btags = self._CATEGORY_CUTS[name]
            passed = (np.equal(N_LL, 1) & np.equal(N_TL, 1)
                      & jet_cmp(N_J, n_jets) & btag_cmp(N_BTM, n_btags))
        # Scalar input: hand back a plain Python bool, as callers expect.
        return passed if np.ndim(passed) else bool(passed)

    def _split_array(self, array, train_frac=0.5, val_frac=0.1):
        """Split events into train/val/test and normalize the weights.

        Arguments
        ---------
        array : numpy ndarray
            2d array with one event per row; the last column holds the
            weights. It is modified in place, because the returned subsets
            are views into it.
        train_frac : float, optional (default=0.5)
            Fraction of the events used for training.
        val_frac : float, optional (default=0.1)
            Fraction of the events used for validation. The remaining events
            form the test set.

        Returns
        -------
        subsets : list(numpy ndarray)
            ``[train, val, test]``; within each subset the weights sum to 1.
        """
        num_evts = array.shape[0]
        num_train_evts = int(train_frac * num_evts)
        num_val_evts = int(val_frac * num_evts)

        # split array in [train, val, test]
        subsets = np.split(array, [num_train_evts,
                                   num_train_evts + num_val_evts])

        # normalize weights for each subset
        for subset in subsets:
            total = np.sum(subset[:, -1])
            # An empty subset or all-zero weights cannot be normalized;
            # leave them untouched instead of producing NaNs.
            if total != 0:
                subset[:, -1] /= total
        return subsets

    def _save_array(self, sig, bkg):
        """Stack data and save the arrays to ``self.save_in``.

        Writes ``vars.txt`` (one variable name per line) as well as
        ``train.npy``, ``val.npy`` and ``test.npy``. In each array the signal
        rows come first, followed by the background rows.

        Arguments
        ---------
        sig : dict
            Dictionary containing signal events (keys ``'labels'``,
            ``'data'``, ``'weights'``, ``'vars'``).
        bkg : dict
            Dictionary containing background events (same keys).
        """
        # write variable names to file
        with open(os.path.join(self.save_in, 'vars.txt'), 'w') as f:
            f.writelines(var + '\n' for var in sig['vars'])

        sig_sets = self._split_array(
            np.hstack((sig['labels'], sig['data'], sig['weights'])))
        bkg_sets = self._split_array(
            np.hstack((bkg['labels'], bkg['data'], bkg['weights'])))

        for file_name, sig_part, bkg_part in zip(
                ('train.npy', 'val.npy', 'test.npy'), sig_sets, bkg_sets):
            np.save(os.path.join(self.save_in, file_name),
                    np.vstack((sig_part, bkg_part)))