# Validate the features generated with Spark Workflow vs ROOT

## Load the features from the generated Parquet file

In [139]:
import numpy as np

# Number of digits numpy shows after the decimal point when printing arrays,
# so that the arrays printed further down are short enough to compare by eye.
PRINT_PRECISION = 5
np.set_printoptions(precision=PRINT_PRECISION)

In [140]:
# Location of the Parquet dataset holding the features to validate.
features_path = "file:/Users/vk/data/ML_MP_JR/ttbar_lepFilter_13TeV/ttbar_features_950.parquet"

# Read it into a Spark DataFrame through the notebook's sqlContext.
features = (
    sqlContext.read
    .format("parquet")
    .load(features_path)
)

In [141]:
# Number of rows in the Parquet feature set; the bare name on the last line
# makes the notebook display the value.
n_feature_rows = features.count()
n_feature_rows

357

In [142]:
# Preview the leading rows of the DataFrame; 20 is show()'s default row limit,
# written out here so the size of the preview is explicit.
features.show(n=20)

+--------------------+--------------------+
|           hfeatures|           lfeatures|
+--------------------+--------------------+
|[373.637100219726...|[WrappedArray(126...|
|[200.109180450439...|[WrappedArray(112...|
|[267.100757598876...|[WrappedArray(70....|
|[235.573299407958...|[WrappedArray(114...|
|[268.110122680664...|[WrappedArray(163...|
|[154.009853363037...|[WrappedArray(34....|
|[43.8011703491210...|[WrappedArray(52....|
|[355.839591979980...|[WrappedArray(192...|
|[498.655937194824...|[WrappedArray(150...|
|[405.600200653076...|[WrappedArray(125...|
|[0.0, 22.88990783...|[WrappedArray(28....|
|[323.521884918212...|[WrappedArray(182...|
|[83.6020126342773...|[WrappedArray(25....|
|[215.075908660888...|[WrappedArray(70....|
|[333.586856842041...|[WrappedArray(57....|
|[153.549125671386...|[WrappedArray(83....|
|[522.556297302246...|[WrappedArray(173...|
|[226.487812042236...|[WrappedArray(72....|
|[0.0, 46.76742172...|[WrappedArray(364...|
|[232.679992675781...|[WrappedAr

In [143]:
#
# Take the first rows of the Spark output and convert them into numpy arrays.
#
n_sample_rows = 10
sample_features = features.take(n_sample_rows)

# One entry per sampled row, taken from the `hfeatures` column.
sample_hfeatures = np.asarray([row.hfeatures for row in sample_features])

# `lfeatures` is a nested sequence per row (it shows up as WrappedArray in the
# DataFrame preview), so each row is converted on its own before stacking.
sample_lfeatures = np.asarray([np.asarray(row.lfeatures) for row in sample_features])

# Single-argument print(...) behaves the same under Python 2 and Python 3,
# whereas the bare `print x` statement is a SyntaxError on Python 3.
# The recorded run printed (10, 14) and (10, 801, 19).
print(sample_hfeatures.shape)
print(sample_lfeatures.shape)

(10, 14)
(10, 801, 19)


In [144]:
# Inspect the `hfeatures` vector of the first sampled row.
# print(...) with one argument is valid on both Python 2 and Python 3.
print(sample_hfeatures[0, :])

[  3.73637e+02   1.40337e+02   2.67839e+00   4.41126e+04   3.00000e+00
   2.00000e+00   1.24368e+02   1.74327e-01  -1.76712e+00   2.24426e-02
   0.00000e+00   0.00000e+00   1.00000e+00   1.00000e+00]


## Read the hdf5 file and obtain the pre-generated features

In [145]:
import h5py

In [146]:
# HDF5 file holding the pre-generated reference features.
hdf5FileName = "/Users/vk/data/ML_MP_JR/ttbar_lepFilter_13TeV/ttbar_lepFilter_13TeV_950.h5"

# Open explicitly read-only. Without a mode argument, h5py releases before 3.0
# open the file read/write and create it when it does not exist, so a mistyped
# path would silently produce an empty file and the reference data would be
# held with a writable handle. This notebook only reads from it.
f = h5py.File(hdf5FileName, "r")

In [147]:
# Read both datasets fully into memory, dropping index 0 along the last axis.
# NOTE(review): presumably that leading entry is a column the Spark output
# does not carry (e.g. an event index or label) -- TODO confirm.
h5_lfeatures = f["Particles"][..., 1:]
h5_hfeatures = f["HLF"][:, 1:]

# Single-argument print(...) works identically on Python 2 and Python 3.
# The recorded run printed (3616, 801, 19) and (3616, 14).
print(h5_lfeatures.shape)
print(h5_hfeatures.shape)

(3616, 801, 19)
(3616, 14)


## Compare the arrays directly

In [148]:
# HDF5 side: first entry along axis 1 of row 1 of the "Particles" data.
# NOTE(review): comparing by row index assumes both sources store rows in the
# same order -- TODO confirm.
print(h5_lfeatures[1, 0, :])

[  1.12746e+02   5.68923e+01  -2.28131e+01   9.46277e+01   6.12958e+01
   1.21881e+00  -3.81357e-01   0.00000e+00   0.00000e+00   0.00000e+00
   0.00000e+00   0.00000e+00   2.62018e-02   0.00000e+00   0.00000e+00
   0.00000e+00   0.00000e+00   1.00000e+00  -1.00000e+00]


In [149]:
# Spark side: the same [1, 0, :] slice taken from the sampled Parquet rows,
# printed for a side-by-side check against the HDF5 values.
print(sample_lfeatures[1, 0, :])

[  1.12746e+02   5.68923e+01  -2.28131e+01   9.46277e+01   6.12958e+01
   1.21881e+00  -3.81357e-01   0.00000e+00   0.00000e+00   0.00000e+00
   0.00000e+00   0.00000e+00   2.62018e-02   0.00000e+00   0.00000e+00
   0.00000e+00   0.00000e+00   1.00000e+00  -1.00000e+00]


In [152]:
# HDF5 side: row 1 of the "HLF" data, to compare with sample_hfeatures[1, :].
# NOTE(review): in the recorded outputs element 3 is 1.82790e+04 here but
# 7.56226e+03 in the Spark row; every other element agrees.
print(h5_hfeatures[1, :])

[  2.00109e+02   3.62160e+01  -2.73218e+00   1.82790e+04   3.00000e+00
   2.00000e+00   6.12958e+01   1.21881e+00  -3.81357e-01   0.00000e+00
   0.00000e+00   2.62018e-02  -1.00000e+00   0.00000e+00]


In [151]:
# Spark side: row 1 of the sampled `hfeatures`, to compare with h5_hfeatures[1, :].
# NOTE(review): in the recorded outputs element 3 is 7.56226e+03 here but
# 1.82790e+04 in the HDF5 row; every other element agrees. This mismatch
# is not explained anywhere in the notebook and needs investigating.
print(sample_hfeatures[1, :])

[  2.00109e+02   3.62160e+01  -2.73218e+00   7.56226e+03   3.00000e+00
   2.00000e+00   6.12958e+01   1.21881e+00  -3.81357e-01   0.00000e+00
   0.00000e+00   2.62018e-02  -1.00000e+00   0.00000e+00]
