In [6]:
import os
import torch
import uproot
import glob
import torchvision
import numpy as np
from collections import OrderedDict
from torch import nn
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.datasets import MNIST
from torchvision.utils import save_image

from torch.utils.data import Dataset, DataLoader
print(uproot.__version__) # Need latest uproot v3.7.1 for LazzyArrays

3.7.1


## A class for loading data from ROOT files using uproot

It should be generic for all kinds of flat trees.
Lazy arrays are very new in uproot; their performance needs more testing.

In [17]:

class P2L1NTP(Dataset):
    """Dataset serving one flattened event per index from ROOT flat trees via uproot.

    For every branch listed in ``features`` the per-event array is truncated or
    zero-padded to a fixed length; the pieces are then concatenated, in the
    iteration order of ``features``, into a single 1-D numpy array.

    Parameters
    ----------
    dir_name : str
        Glob pattern (a plain path works too) selecting the input ROOT files.
    features : mapping of str -> int
        Branch name -> number of values kept for that branch in each event.
        Required, despite the ``None`` default kept for interface compatibility.
    tree_name : str
        Path of the tree inside each file.
    sequence_length : int
        Stored on the instance; not used by this class itself.
    verbose : bool
        Accepted for interface compatibility; currently unused.

    Raises
    ------
    ValueError
        If ``features`` is ``None`` or empty.
    FileNotFoundError
        If ``dir_name`` matches no file.
    """

    def __init__(self, dir_name, features = None,
                 tree_name="l1PhaseIITree/L1PhaseIITree",
                 sequence_length=50, verbose=False):
        if not features:
            raise ValueError(
                "features must be a non-empty mapping of branch name -> length")
        self.tree_name = tree_name
        self.features = features
        self.sequence_length = sequence_length
        # glob returns files in arbitrary order; sort so that a given index
        # always refers to the same event from run to run.
        self.file_names = sorted(glob.glob(dir_name))
        if not self.file_names:
            raise FileNotFoundError(
                "no input files match {!r}".format(dir_name))
        # Filled on the first len() call; counting entries opens the files.
        self._num_entries = None
        ## A cache will be needed in case we train with >1 epoch.
        ## Having issues with it, reported in https://github.com/scikit-hep/uproot/issues/296
        #self.cache = uproot.cache.ArrayCache(1024**2)
        #self.upTree = uproot.lazyarrays(self.file_names, self.tree_name, self.features.keys(), cache=self.cache)
        self.upTree = uproot.lazyarrays(self.file_names, self.tree_name, self.features.keys())

    def __len__(self):
        """Return the total number of entries summed over all input files."""
        # Count once and remember the result instead of re-reading the files
        # on every call.
        if self._num_entries is None:
            self._num_entries = uproot.numentries(
                self.file_names, self.tree_name, total=True)
        return self._num_entries

    def __getitem__(self, idx):
        """Return event ``idx`` as one 1-D array of fixed-length branch slices."""
        event = self.upTree[idx]
        pieces = []
        for branch, length in self.features.items():
            values = event[branch]
            if len(values) >= length:
                # Too many objects: keep only the first `length`.
                fixed = values[:length]
            else:
                # Too few objects: append zeros up to `length`.
                fixed = np.pad(values, (0, length - len(values)),
                               'constant', constant_values=0)
            pieces.append(fixed)
        return np.concatenate(pieces, axis=0)
        ## If inputs are same length, reshape
        #return np.vstack(reflatnp).reshape((-1,),order='F')

## Set input files

Physics objects are given as a map from the variable (branch) name in the ROOT file to the expected number of objects.

The class truncates, or pads with 0, to the expected length.

In [18]:
# Input ntuple locations (sg_files is defined here but not used in this cell).
bg_files = "/uscms_data/d2/lpctrig/benwu/AutoEncoderSample/Phaes2L1Ntuple/NeutrinoGun_E_10GeV_V7_5_2_MERGED.root"
sg_files = "/uscms_data/d2/lpctrig/benwu/AutoEncoderSample/Phaes2L1Ntuple/VBF_HToInvisible_M125_14TeV_pythia8_PU200_V7_4_2.root"

# Branch name -> number of values kept per event; every branch keeps 12.
# The order of this list is the order of the pieces in the flattened event.
PhysicsObt = OrderedDict(
    (branch, 12)
    for branch in (
        "jetEt", "jetEta", "jetPhi",
        "tauEt", "tauEta", "tauPhi",
        "EGEt", "EGEta", "EGPhi",
        "globalMuonPt", "globalMuonEta", "globalMuonPhi",
    )
)

# Build the dataset on the first file and check the dtype of one event.
bg = P2L1NTP(dir_name=bg_files, features=PhysicsObt)
gg = bg[1]
print(gg.dtype)


float64


In [None]:
# Training hyper-parameters.
num_epochs = 1
batch_size = 1280
learning_rate = 1e-3

# Pinned (page-locked) host memory only speeds up host-to-GPU copies, so ask
# for it only when CUDA is present; on a CPU-only host it just triggers a warning.
dataloader = DataLoader(bg, batch_size=batch_size,
                        pin_memory=torch.cuda.is_available(), shuffle=False)

class autoencoder(nn.Module):
    """Fully connected autoencoder: 144 inputs -> 3-value bottleneck -> 144 outputs.

    The encoder narrows 144 -> 1280 -> 64 -> 12 -> 3 with in-place ReLU between
    the linear layers (none after the last one). The decoder mirrors those
    widths and ends in Tanh, so every output value lies in [-1, 1].
    NOTE(review): inputs outside [-1, 1] cannot be reconstructed exactly through
    the final Tanh -- confirm the inputs are scaled accordingly.
    """

    def __init__(self):
        super().__init__()
        widths = (144, 1280, 64, 12, 3)

        # Encoder: Linear + ReLU per step, then drop the trailing ReLU so the
        # bottleneck is a plain linear output.
        enc_layers = []
        for n_in, n_out in zip(widths[:-1], widths[1:]):
            enc_layers += [nn.Linear(n_in, n_out), nn.ReLU(True)]
        enc_layers.pop()
        self.encoder = nn.Sequential(*enc_layers)

        # Decoder: same widths in reverse, with the final activation being
        # Tanh instead of ReLU.
        mirrored = widths[::-1]
        dec_layers = []
        for n_in, n_out in zip(mirrored[:-1], mirrored[1:]):
            dec_layers += [nn.Linear(n_in, n_out), nn.ReLU(True)]
        dec_layers[-1] = nn.Tanh()
        self.decoder = nn.Sequential(*dec_layers)

    def forward(self, x):
        """Encode ``x`` to the bottleneck and decode it back."""
        return self.decoder(self.encoder(x))


# Put the model on the GPU when one is available, otherwise keep it on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = autoencoder().to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(
    model.parameters(), lr=learning_rate, weight_decay=1e-5)

for epoch in range(num_epochs):
    for data in dataloader:
        # Cast each batch to float32 (the default parameter dtype of the
        # model) and move it to the model's device. Plain tensors are used
        # here; the Variable wrapper is no longer needed for autograd.
        img = data.float().to(device)
        # ===================forward=====================
        # The autoencoder is trained to reproduce its own input.
        output = model(img)
        loss = criterion(output, img)
        # ===================backward====================
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # ===================log========================
    # Report the loss of the last batch of the epoch. The loss is a 0-dim
    # tensor, so it is read with .item(); indexing it with [0] raises.
    print('epoch [{}/{}], loss:{:.4f}'.format(epoch + 1, num_epochs, loss.item()))

torch.save(model.state_dict(), './sim_autoencoder.pth')

## Testing cells below



In [16]:
# Reproduction of the cache problem referenced in
# https://github.com/scikit-hep/uproot/issues/296: the same lazy read is done
# first without and then with an ArrayCache.
bg_files = "/uscms_data/d2/lpctrig/benwu/AutoEncoderSample/Phaes2L1Ntuple/NeutrinoGun_E_10GeV_V7_5_2_MERGED.root"
# Without a cache: reading entry 1 prints the jetEt values.
nocache_data = uproot.lazyarrays(bg_files, "l1PhaseIITree/L1PhaseIITree", PhysicsObt.keys(), entrysteps=100)
print(nocache_data[1]["jetEt"])
# With a cache limited to 1024**2: the captured output of this cell ends in
# "ValueError: value too large".
# NOTE(review): possibly the cache limit is smaller than a single array that
# has to be stored -- TODO confirm by retrying with a larger limit.
cache = uproot.cache.ArrayCache(1024**2)
data = uproot.lazyarrays(bg_files, "l1PhaseIITree/L1PhaseIITree", PhysicsObt.keys(), entrysteps=100, cache=cache)
print(data[1]["jetEt"])

[139.000001 123.500001 108.500001 100.000001  98.500001  97.500001
  94.500001  90.000001  88.000001  87.500001  83.500001  82.500001]


ValueError: value too large

### In case you don't have uproot, you can use my local copy in LPC

Or, just "pip3 install uproot"

In [None]:
import sys

# Site-packages directory holding a local copy of uproot. It must be put on
# sys.path before `import uproot` runs for it to have any effect.
_LPC_SITE_PACKAGES = "/uscms/home/benwu/.local/lib/python3.6/site-packages/"
# Drop any earlier occurrence first so that re-running this cell keeps exactly
# one entry, still at the front of the search path.
sys.path[:] = [entry for entry in sys.path if entry != _LPC_SITE_PACKAGES]
sys.path.insert(0, _LPC_SITE_PACKAGES)
print(sys.path)

In [10]:
# Shuffle by indexing instead of copies.
def random_dataloaders(dataset, train_frac, valid_frac, batch_size):
    """Split ``dataset`` at random into a training and a validation DataLoader.

    A single random permutation of the dataset indices is drawn. The first
    ``int(len(dataset) * train_frac)`` indices go to the training loader and the
    next ``int(len(dataset) * valid_frac)`` to the validation loader, so the two
    subsets never overlap. No data is copied: each loader draws from
    ``dataset`` through a ``SubsetRandomSampler`` over its own indices.

    Parameters
    ----------
    dataset : object with ``__len__`` and ``__getitem__``
        The data to split.
    train_frac, valid_frac : float
        Fractions of the dataset, each in [0, 1], with a sum of at most 1.
    batch_size : int
        Batch size used by both loaders.

    Returns
    -------
    (train_loader, valid_loader) : tuple of torch.utils.data.DataLoader

    Raises
    ------
    ValueError
        If a fraction lies outside [0, 1] or the fractions sum to more than 1.
    """
    if not (0.0 <= train_frac <= 1.0 and 0.0 <= valid_frac <= 1.0):
        raise ValueError("train_frac and valid_frac must each lie in [0, 1]")
    # Small tolerance so that e.g. 0.7 + 0.3 is not rejected by rounding error.
    if train_frac + valid_frac > 1.0 + 1e-9:
        raise ValueError("train_frac + valid_frac must not exceed 1")

    n_total = len(dataset)
    # Slice bounds must be integers; a float bound raises TypeError.
    n_train = int(n_total * train_frac)
    n_valid = int(n_total * valid_frac)

    # Plain Python ints, so the dataset's __getitem__ never receives a tensor.
    indices = torch.randperm(n_total).tolist()
    train_indices = indices[:n_train]
    valid_indices = indices[n_train:n_train + n_valid]

    # Pinned host memory is only useful for host-to-GPU copies.
    pin_memory = torch.cuda.is_available()
    train_loader = torch.utils.data.DataLoader(
        dataset, pin_memory=pin_memory, batch_size=batch_size,
        sampler=torch.utils.data.SubsetRandomSampler(train_indices))
    valid_loader = torch.utils.data.DataLoader(
        dataset, pin_memory=pin_memory, batch_size=batch_size,
        sampler=torch.utils.data.SubsetRandomSampler(valid_indices))

    return train_loader, valid_loader