## Set up Spark

In [1]:
import findspark
# Tell findspark where the cluster's Spark installation lives before any
# pyspark import is attempted.
findspark.init(spark_home='/usr/hdp/spark/')

In [2]:
# Settings read by the SparkSession builder cell further down.
application_name = "HLF-preparation"  # passed to .appName(...)
master = "yarn"                       # value for spark.master
num_executors = 10                    # value for spark.executor.instances
num_cores = 4                         # value for spark.executor.cores

In [3]:
from pyspark.sql import SparkSession
import os 

# Environment seen by the processes Spark launches from this kernel:
# the Python installation to use and the extra spark-root package to pull in.
os.environ.update({
    "PYTHONHOME": "/afs/cern.ch/work/m/migliori/public/anaconda2",
    "PYTHONPATH": "/afs/cern.ch/work/m/migliori/public/anaconda2/lib/python2.7/site-packages",
    "PYSPARK_SUBMIT_ARGS": "--packages org.diana-hep:spark-root_2.11:0.1.16 pyspark-shell",
})

# Build (or reuse, if one already exists) the session with the settings
# chosen in the configuration cell. Numeric settings are passed as strings.
spark = (
    SparkSession.builder
    .appName(application_name)
    .config("spark.pyspark.python",
            "/afs/cern.ch/work/m/migliori/public/anaconda2/bin/python")
    .config("spark.master", master)
    .config("spark.executor.cores", str(num_cores))
    .config("spark.executor.instances", str(num_executors))
    .getOrCreate()
)

In [4]:
# Register the local helper module with the SparkContext so it is shipped
# with the tasks. The read/convert cell further down does
# `from Utils_functions import *` and applies `convert` inside an RDD map
# (presumably `convert` is defined in that module -- confirm).
spark.sparkContext.addPyFile('Utils_functions.py')

In [5]:
# Bare expression: display the session object as this cell's output.
spark

## Read and convert the samples

Create the vectors containing the low-level and high-level features (the `lfeatures` and `hfeatures` columns).

In [31]:
%%time
from pyspark.sql.functions import lit
from Utils_functions import *

## labels: qcd=0, tt=1, wjets=2

requiredColumns = ["EFlowTrack", "MuonTight_size", "Electron_size",
                   "EFlowNeutralHadron", "EFlowPhoton", "Electron",
                   "MuonTight", "MissingET", "Jet"]

qcd = spark.read.format("org.dianahep.sparkroot.experimental") \
                .load('data_root/qcd*.root') \
                .select(requiredColumns) \
                .rdd \
                .map(convert) \
                .filter(lambda row: len(row) > 0) \
                .toDF() \
                .withColumn("label", lit(0))
        
tt = spark.read.format("org.dianahep.sparkroot.experimental") \
                .load('data_root/tt*.root') \
                .select(requiredColumns) \
                .rdd \
                .map(convert) \
                .filter(lambda row: len(row) > 0) \
                .toDF() \
                .withColumn("label", lit(1))
        
wjets = spark.read.format("org.dianahep.sparkroot.experimental") \
                .load('data_root/wjets*.root') \
                .select(requiredColumns) \
                .rdd \
                .map(convert) \
                .filter(lambda row: len(row) > 0) \
                .toDF() \
                .withColumn("label", lit(2))

CPU times: user 76.9 ms, sys: 23 ms, total: 99.9 ms
Wall time: 28.2 s


In [32]:
# Number of QCD rows left after conversion and filtering; %time reports the cost.
%time qcd.count()

CPU times: user 8.8 ms, sys: 9.73 ms, total: 18.5 ms
Wall time: 49 s


447

In [33]:
# Number of tt rows left after conversion and filtering; %time reports the cost.
%time tt.count()

CPU times: user 73.1 ms, sys: 39 ms, total: 112 ms
Wall time: 9min 34s


28522

In [34]:
# Number of W+jets rows left after conversion and filtering; %time reports the cost.
%time wjets.count() 

CPU times: user 65.8 ms, sys: 44 ms, total: 110 ms
Wall time: 9min 48s


29909

In [36]:
# Stack the three labelled samples into one DataFrame.
# NOTE: DataFrame.union pairs columns by position, not by name; the three
# inputs come from the same pipeline, so their layouts should match.
%time data = qcd.union(tt).union(wjets)

CPU times: user 543 µs, sys: 1.7 ms, total: 2.25 ms
Wall time: 6.92 ms


In [37]:
# Print the column names and types of the combined DataFrame.
data.printSchema()

root
 |-- hfeatures: vector (nullable = true)
 |-- lfeatures: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: double (containsNull = true)
 |-- label: integer (nullable = false)



Save the combined dataset as a Parquet file

In [38]:
%time data.write.parquet("more_data.parquet", mode="overwrite")

CPU times: user 81.1 ms, sys: 36.2 ms, total: 117 ms
Wall time: 9min 49s


## Load the DataFrame

In [39]:
%%time
## Load the dataset
loaded_dataset = spark.read.format("parquet").load("more_data.parquet")
print(loaded_dataset.count())

The history saving thread hit an unexpected error (DatabaseError('database disk image is malformed',)).History will not be written to the database.
58878
CPU times: user 4.85 ms, sys: 2.02 ms, total: 6.87 ms
Wall time: 2.78 s
