In [1]:
from __future__ import print_function

import ROOT

Welcome to JupyROOT 6.19/01


In [3]:
# Location of the input datasets: either a local directory or, as here, an EOS area.
samplesBasePath = (
    "root://eospublic.cern.ch/"
    "/eos/opendata/cms/upload/stefan"
    "/AOD2NanoAODOutreachTool"
    "/HiggsTauTauNanoAODOutreachAnalysis/"
)

# Datasets to process. Each name maps to the file "<samplesBasePath><name>.root".
sampleNames = [
    "GluGluToHToTauTau", "VBF_HToTauTau",
    "DYJetsToLL", "TTbar",
    "W1JetsToLNu", "W2JetsToLNu", "W3JetsToLNu",
    "Run2012B_TauPlusX", "Run2012C_TauPlusX",
]

# Destination on EOS for the repartitioned datasets.
# NOTE(review): this is a user-specific EOS path; adapt it before re-running elsewhere.
out_base_path = (
    "root://eosuser.cern.ch/"
    "/eos/user/v/vpadulan/higgsTauTauAnalysis"
    "/benchmarks/skim-pyrdf-spark/16ex1core/repartitioned-data/"
)

In [4]:
# Datasets with fewer clusters than this are rewritten to out_base_path with
# larger-granularity clustering (at most `min_clusters` clusters of equal size,
# the last one possibly shorter).
min_clusters = 16

# Name of the tree stored in every input file.
treename = "Events"


def list_clusters(tree):
    """Return the cluster boundaries of ``tree`` as a list of (start, end) tuples.

    The boundaries are obtained by repeatedly calling the iterator returned by
    ``tree.GetClusterIterator(0)`` until the reported start entry reaches the
    number of entries in the tree. ``end`` of one cluster is the ``start`` of
    the next. An empty tree yields an empty list.
    """
    entries = tree.GetEntriesFast()
    next_boundary = tree.GetClusterIterator(0)
    start = next_boundary()

    clusters = []
    while start < entries:
        end = next_boundary()
        clusters.append((start, end))
        start = end
    return clusters


for name in sampleNames:
    print("Repartitioning dataset: \n{}:".format(name))
    filename = samplesBasePath + name + ".root"

    f = ROOT.TFile.Open(filename)
    # TFile.Open signals failure with a null/zombie object rather than an exception.
    if not f or f.IsZombie():
        raise IOError("Could not open input file: {}".format(filename))
    try:
        t = f.Get(treename)
        if not t:
            raise RuntimeError(
                "Tree '{}' not found in file: {}".format(treename, filename))
        entries = t.GetEntriesFast()
        clusters = list_clusters(t)
    finally:
        # Always release the (possibly remote) file handle before moving on;
        # the tree object must not be used past this point.
        f.Close()

    numclusters = len(clusters)
    print("{} clusters.\n".format(numclusters))
    if numclusters >= min_clusters:
        # Already fine-grained enough: nothing to rewrite.
        continue

    # Ceiling division, so that no more than `min_clusters` clusters are needed
    # to hold all entries. The entry count read from the tree above is reused
    # instead of running an extra RDataFrame event loop just to count entries.
    entries_per_cluster = (entries + min_clusters - 1) // min_clusters

    # Options for the output file; the autoflush value sets the cluster size.
    rsops = ROOT.ROOT.RDF.RSnapshotOptions(
        "RECREATE",           # mode
        ROOT.ROOT.kZLIB,      # compression algorithm
        1,                    # compression level
        entries_per_cluster,  # autoflush, number of events per cluster
        99,                   # split level of output tree
        False                 # lazy: write the snapshot immediately
    )

    out_filename = out_base_path + name + ".root"
    # Empty column selection ("") writes all columns of the input tree.
    ROOT.RDataFrame(treename, filename).Snapshot(treename, out_filename, "", rsops)

Repartitioning dataset: 
GluGluToHToTauTau:
8 clusters.

Repartitioning dataset: 
VBF_HToTauTau:
9 clusters.

Repartitioning dataset: 
DYJetsToLL:
349 clusters.

Repartitioning dataset: 
TTbar:
134 clusters.

Repartitioning dataset: 
W1JetsToLNu:
398 clusters.

Repartitioning dataset: 
W2JetsToLNu:
453 clusters.

Repartitioning dataset: 
W3JetsToLNu:
243 clusters.

Repartitioning dataset: 
Run2012B_TauPlusX:
436 clusters.

Repartitioning dataset: 
Run2012C_TauPlusX:
588 clusters.



In [4]:
# Sanity check: print the cluster boundaries of one repartitioned output file.
# NOTE(review): "ZZTo2e2mu" is not among the sampleNames processed in this
# notebook, so this file must already exist at out_base_path - confirm, or
# point this check at one of the datasets written by the repartitioning cell.
# NOTE(review): `treename` is expected to be defined by the repartitioning cell.
check_filename = out_base_path + "ZZTo2e2mu.root"

f = ROOT.TFile.Open(check_filename)
# TFile.Open signals failure with a null/zombie object rather than an exception.
if not f or f.IsZombie():
    raise IOError("Could not open file: {}".format(check_filename))
try:
    t = f.Get(treename)
    if not t:
        raise RuntimeError(
            "Tree '{}' not found in file: {}".format(treename, check_filename))

    entries = t.GetEntriesFast()
    it = t.GetClusterIterator(0)
    start = it()

    # Collect (start, end) entry ranges; each call to the iterator returns the
    # first entry of the next cluster.
    clusters = []
    while start < entries:
        end = it()
        clusters.append((start, end))
        start = end
finally:
    # Release the file handle once the boundaries have been copied into Python.
    f.Close()

print(clusters)
print(len(clusters))

[(0, 93591), (93591, 187182), (187182, 280773), (280773, 374364), (374364, 467955), (467955, 561546), (561546, 655137), (655137, 748728), (748728, 842319), (842319, 935910), (935910, 1029501), (1029501, 1123092), (1123092, 1216683), (1216683, 1310274), (1310274, 1403865), (1403865, 1497445)]
16
