In [1]:
import math
import random
import numpy as np
import pandas as pd
import root_numpy as rnp
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset, DataLoader
from tqdm import tnrange

# Base directory of the input ROOT files; the loader functions append
# '/box/...', '/evt/...' and '/dpm/...' to it.
path = 'path/to/your/data'

# Branches read from tree 't1' of every file. The order of this list is the
# column order of the resulting feature tables.
branchs = [
    'Zmomentum', 'Momentum', 'Theta', 'Position',
    'MvdDEDX', 'MvdHits',
    'SttMeanDEDX', 'SttHits',
    'GemHits',
    'TofStopTime', 'TofM2', 'TofTrackLength', 'TofQuality',
    'DrcThetaC', 'DrcQuality',
    'DiscThetaC', 'DiscQuality',
    'RichThetaC', 'RichQuality',
    'EmcRawEnergy', 'EmcCalEnergy', 'EmcQuality',
    'EmcNumberOfCrystals', 'EmcNumberOfBumps', 'EmcModule',
    'EmcZ20', 'EmcZ53', 'EmcLat', 'EmcE1', 'EmcE9', 'EmcE25',
    'MuoNumberOfLayers', 'MuoHits',
    'DegreesOfFreedom', 'FitStatus', 'ChiSquared',
]

Welcome to JupyROOT 6.18/00


In [2]:
# Number of leading rows kept from each box file (get_evt_data keeps twice as
# many). Setting this to -1 is intended to mean "use all available data".
# NOTE(review): a negative value only selects every row if the loaders treat it
# specially; a plain `[0:-1]` slice would drop the last row.
amount = 50
def get_box_data(path=path, branch_names=branchs):
    """Load the box samples into a single labelled feature table.

    Reads tree ``'t1'`` from the ten ``box_500k_*.root`` files under
    ``path + '/box'`` (five particle species and their anti-particles), keeps
    the leading ``amount`` rows of each file (module-level setting, read at
    call time) and stacks them in file order.

    Args:
        path: Directory that contains the ``box`` sub-folder.
        branch_names: List of branch names to read. Must contain
            ``'EmcCalEnergy'`` and ``'Momentum'``.

    Returns:
        pandas.DataFrame with one column per branch, followed by ``'E/p'``
        (EmcCalEnergy / Momentum) and, as the last column, ``'labels'``:
        0 electron, 1 pion, 2 muon, 3 kaon, 4 proton. Anti-particles share
        the label of their particle. Rows containing NaN are dropped.
    """
    # (file stem, class label), listed in the order the rows are stacked.
    samples = [
        ('electrons', 0), ('pions', 1), ('muons', 2), ('kaons', 3), ('protons', 4),
        ('anti_electrons', 0), ('anti_pions', 1), ('anti_muons', 2),
        ('anti_kaons', 3), ('anti_protons', 4),
    ]

    # A negative `amount` means "take everything". Slicing with [0:-1] would
    # silently drop the last row instead.
    stop = None if amount < 0 else amount

    feature_parts = []
    label_parts = []
    for stem, label in samples:
        # TODO: `path` must be adjusted to where the data is.
        records = rnp.root2array(path + '/box/box_500k_' + stem + '.root', 't1', branch_names)
        rows = rnp.rec2array(records)[0:stop, :]
        feature_parts.append(rows)
        # Labels are created right next to the rows they describe, so X and y
        # are always concatenated in the same order.
        label_parts.append(label * np.ones(rows.shape[0]))

    X = np.concatenate(feature_parts)
    y = np.concatenate(label_parts)

    # The label travels through a temporary column so that 'labels' ends up as
    # the last column, after the derived 'E/p' feature.
    df_ = pd.DataFrame(np.hstack((X, y.reshape(y.shape[0], -1))), columns=branch_names + ['temp'])
    df_['E/p'] = df_.loc[:, 'EmcCalEnergy'] / df_.loc[:, 'Momentum']
    df_['labels'] = df_.loc[:, 'temp']
    df_ = df_.drop(['temp'], axis=1)
    df_ = df_.dropna()
    return df_

def get_evt_data(path=path, branch_names=branchs):
    """Load the evt samples into a single labelled feature table.

    Reads tree ``'t1'`` from the five ``evt_500k_*.root`` files under
    ``path + '/evt'`` and keeps the leading ``2 * amount`` rows of each file
    (``amount`` is the module-level setting, read at call time).

    Args:
        path: Directory that contains the ``evt`` sub-folder.
        branch_names: List of branch names to read. Must contain
            ``'EmcCalEnergy'`` and ``'Momentum'``.

    Returns:
        pandas.DataFrame with one column per branch, followed by ``'E/p'``
        (EmcCalEnergy / Momentum) and, as the last column, ``'labels'``:
        0 electron, 1 pion, 2 muon, 3 kaon, 4 proton. Rows containing NaN are
        dropped.
    """
    # (file stem, class label), listed in the order the rows are stacked.
    samples = [('electrons', 0), ('pions', 1), ('muons', 2), ('kaons', 3), ('protons', 4)]

    # A negative `amount` means "take everything". Doubling it and slicing
    # with [0:-2] would silently drop the last two rows instead.
    stop = None if amount < 0 else amount * 2

    feature_parts = []
    label_parts = []
    for stem, label in samples:
        records = rnp.root2array(path + '/evt/evt_500k_' + stem + '.root', 't1', branch_names)
        rows = rnp.rec2array(records)[0:stop, :]
        feature_parts.append(rows)
        label_parts.append(label * np.ones(rows.shape[0]))

    X = np.concatenate(feature_parts)
    y = np.concatenate(label_parts)

    # The label travels through a temporary column so that 'labels' ends up as
    # the last column, after the derived 'E/p' feature.
    df_ = pd.DataFrame(np.hstack((X, y.reshape(y.shape[0], -1))), columns=branch_names + ['temp'])
    df_['E/p'] = df_.loc[:, 'EmcCalEnergy'] / df_.loc[:, 'Momentum']
    df_['labels'] = df_.loc[:, 'temp']
    df_ = df_.drop(['temp'], axis=1)
    df_ = df_.dropna()
    return df_

def get_dpm_data(path=path, branch_names=branchs):
    """Load the dpm background samples into a single labelled feature table.

    Reads tree ``'t1'`` from the five ``dpmbkg_1M_*.root`` files under
    ``path + '/dpm'`` and keeps a fixed number of leading rows per file
    (17000 each, 15000 for protons).

    Args:
        path: Directory that contains the ``dpm`` sub-folder.
        branch_names: List of branch names to read. Must contain
            ``'EmcCalEnergy'`` and ``'Momentum'``.

    Returns:
        pandas.DataFrame with one column per branch, followed by ``'E/p'``
        (EmcCalEnergy / Momentum) and, as the last column, ``'labels'``:
        0 electron, 1 pion, 2 muon, 3 kaon, 4 proton. Rows containing NaN are
        dropped.
    """
    # (file stem, class label, leading rows kept), in stacking order.
    plan = (
        ('electrons', 0, 17000),
        ('pions', 1, 17000),
        ('muons', 2, 17000),
        ('kaons', 3, 17000),
        ('protons', 4, 15000),
    )

    chunks = []
    targets = []
    for stem, label, n_rows in plan:
        records = rnp.root2array(path + '/dpm/dpmbkg_1M_' + stem + '.root', 't1', branch_names)
        kept = rnp.rec2array(records)[0:n_rows, :]
        chunks.append(kept)
        targets.append(label * np.ones(kept.shape[0]))

    X = np.concatenate(chunks)
    y = np.concatenate(targets)

    # Append the labels as a temporary trailing column, add the derived
    # feature, then re-add the labels so they end up as the last column.
    stacked = np.hstack((X, y.reshape(y.shape[0], -1)))
    table = pd.DataFrame(stacked, columns=branch_names + ['temp'])
    table['E/p'] = table.loc[:, 'EmcCalEnergy'] / table.loc[:, 'Momentum']
    table['labels'] = table.loc[:, 'temp']
    return table.drop(['temp'], axis=1).dropna()

In [3]:
%%time 
# Load the three sample families with the default `path` and branch list.
# Each call returns a DataFrame whose last column is 'labels'.
Box = get_box_data()
Evt = get_evt_data()
Dpm = get_dpm_data()

CPU times: user 25.1 s, sys: 1.32 s, total: 26.4 s
Wall time: 26.3 s


In [8]:
# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
class DataGenerator(Dataset):
    '''In-memory dataset built from the box, evt and dpm feature tables.

    Each table is a DataFrame whose last column holds the class label and
    whose remaining columns are the features. The tables are stacked in the
    order dpm, evt, box and converted once, at construction time, to tensors
    on `device` (float32 features, int64 labels).
    '''

    def __init__(self, box=None, evt=None, dpm=None, transform=None):
        '''
        Constructor

        box, evt, dpm : DataFrames with the label in the last column.
                        All three are required.
        transform     : optional callable applied to the feature tensor of a
                        sample every time it is fetched; None disables it.

        Raises ValueError if any of the three tables is missing.
        '''
        self.box         = box
        self.evt         = evt
        self.dpm         = dpm
        self.transform   = transform

        self.__X, self.__y = self.__CombineAll()

    def __CombineAll(self):
        '''Stack the three tables and return (features, labels) tensors.'''
        # Stacking order is dpm, evt, box.
        tables = (self.dpm, self.evt, self.box)
        if any(table is None for table in tables):
            raise ValueError('Datasets must be provided: box=? evt=? dpm=?')

        # Every column except the last is a feature; the last is the label.
        features = np.vstack([np.array(table.iloc[:, 0:-1]) for table in tables])
        labels = np.hstack([np.array(table.iloc[:, -1]) for table in tables])

        features = torch.from_numpy(features).to(device=device, dtype=torch.float32)
        labels = torch.from_numpy(labels).to(device=device, dtype=torch.long)

        return features, labels

    def __len__(self):
        '''Number of samples.'''
        return len(self.__X)

    def __getitem__(self, idx):
        '''Return (features, label) for `idx`, applying `transform` if set.'''
        features = self.__X[idx]
        # `transform` used to be stored but never applied.
        if self.transform is not None:
            features = self.transform(features)
        return features, self.__y[idx]

    def getX(self):
        '''Return the full (untransformed) feature tensor.'''
        return self.__X

    def gety(self):
        '''Return the full label tensor.'''
        return self.__y

In [9]:
# Combine the Box, Evt and Dpm tables into one torch Dataset (no transform).
dataset = DataGenerator(box=Box, evt=Evt, dpm=Dpm, transform=None)

In [15]:
# The dataset keeps all samples as in-memory tensors, possibly on the GPU.
# Worker processes should not return CUDA tensors, so multi-process loading is
# only enabled when the data lives on the CPU.
dataloader = DataLoader(dataset, batch_size=4096, shuffle=True,
                        num_workers=0 if dataset.getX().is_cuda else 4)

In [16]:
# Network architecture: 37 input features -> 5 classes.
# Fully connected layers with ReLU and BatchNorm (Dropout after the first two),
# ending in LogSoftmax, so the outputs are log-probabilities.
net = nn.Sequential(nn.Linear(37, 500),
                    nn.ReLU(),
                    nn.Dropout(p=0.25),
                    nn.BatchNorm1d(500),
                    nn.Linear(500, 400),
                    nn.ReLU(),
                    nn.Dropout(p=0.25),
                    nn.BatchNorm1d(400),
                    nn.Linear(400, 300),
                    nn.ReLU(),
                    nn.BatchNorm1d(300),
                    nn.Linear(300, 200),
                    nn.ReLU(),
                    nn.BatchNorm1d(200),
                    nn.Linear(200, 100),
                    nn.ReLU(),
                    nn.BatchNorm1d(100),
                    nn.Linear(100, 50),
                    nn.ReLU(),
                    nn.Linear(50, 5),
                    nn.LogSoftmax(dim=1))

# The dataset tensors are created on `device`; the model has to live on the
# same device or the forward pass fails on a GPU machine. This is done here,
# before any optimizer is built from net.parameters().
net = net.to(device)

print(net)

Sequential(
  (0): Linear(in_features=37, out_features=500, bias=True)
  (1): ReLU()
  (2): Dropout(p=0.25, inplace=False)
  (3): BatchNorm1d(500, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (4): Linear(in_features=500, out_features=400, bias=True)
  (5): ReLU()
  (6): Dropout(p=0.25, inplace=False)
  (7): BatchNorm1d(400, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (8): Linear(in_features=400, out_features=300, bias=True)
  (9): ReLU()
  (10): BatchNorm1d(300, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (11): Linear(in_features=300, out_features=200, bias=True)
  (12): ReLU()
  (13): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (14): Linear(in_features=200, out_features=100, bias=True)
  (15): ReLU()
  (16): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (17): Linear(in_features=100, out_features=50, bias=True)
  (18): ReLU()
  (19): Linear(in_fea

In [17]:
# create a loss and optimizer
optimizer = torch.optim.Adam(net.parameters(), lr=0.001)
# `net` ends in LogSoftmax, so its outputs are already log-probabilities.
# NLLLoss is the loss that expects exactly that; CrossEntropyLoss would apply
# log-softmax a second time.
criterion = nn.NLLLoss()

In [18]:
# run the main training loop
total_step = len(dataloader)
kEpochs = 3

# Make sure Dropout and BatchNorm are in training mode, even if this cell is
# re-run after the model was switched to evaluation mode.
net.train()
for epoch in range(kEpochs):
    for i_batch, (inputs, labels) in enumerate(dataloader):
        optimizer.zero_grad()
        # Call the module itself rather than .forward() so registered hooks run.
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
              .format(epoch+1, kEpochs, i_batch+1, total_step, loss.item()))

Epoch [1/3], Step [1/21], Loss: 1.6217
Epoch [1/3], Step [2/21], Loss: 1.5748
Epoch [1/3], Step [3/21], Loss: 1.5503
Epoch [1/3], Step [4/21], Loss: 1.5612
Epoch [1/3], Step [5/21], Loss: 1.5353
Epoch [1/3], Step [6/21], Loss: 1.5160
Epoch [1/3], Step [7/21], Loss: 1.5134
Epoch [1/3], Step [8/21], Loss: 1.5433
Epoch [1/3], Step [9/21], Loss: 1.4868
Epoch [1/3], Step [10/21], Loss: 1.5006
Epoch [1/3], Step [11/21], Loss: 1.4837
Epoch [1/3], Step [12/21], Loss: 1.4807
Epoch [1/3], Step [13/21], Loss: 1.4675
Epoch [1/3], Step [14/21], Loss: 1.4632
Epoch [1/3], Step [15/21], Loss: 1.4529
Epoch [1/3], Step [16/21], Loss: 1.5003
Epoch [1/3], Step [17/21], Loss: 1.4553
Epoch [1/3], Step [18/21], Loss: 1.4520
Epoch [1/3], Step [19/21], Loss: 1.4413
Epoch [1/3], Step [20/21], Loss: 1.4273
Epoch [1/3], Step [21/21], Loss: 1.4162
Epoch [2/3], Step [1/21], Loss: 1.4175
Epoch [2/3], Step [2/21], Loss: 1.4557
Epoch [2/3], Step [3/21], Loss: 1.4109
Epoch [2/3], Step [4/21], Loss: 1.4118
Epoch [2/3], 

In [21]:
# Test the model
# NOTE: this iterates the same `dataloader` that the training cell uses, so
# the reported number is accuracy on the training data, not on a held-out set.
was_training = net.training
# Evaluation mode: Dropout is disabled and BatchNorm uses its running
# statistics instead of per-batch statistics.
net.eval()
try:
    # In test phase, we don't need to compute gradients (for memory efficiency)
    with torch.no_grad():
        correct = 0
        total = 0
        for inputs, labels in dataloader:
            outputs = net(inputs)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
finally:
    # Restore whatever mode the model was in so later cells are unaffected.
    net.train(was_training)

print('Accuracy: {:.2f} %'.format(100 * correct / total))

yhat  tensor([3, 2, 1,  ..., 2, 0, 3])
y     tensor([4, 2, 0,  ..., 0, 3, 1])
yhat  tensor([3, 3, 2,  ..., 2, 2, 2])
y     tensor([3, 1, 1,  ..., 0, 2, 2])
yhat  tensor([0, 4, 2,  ..., 1, 3, 3])
y     tensor([1, 0, 3,  ..., 3, 1, 1])
yhat  tensor([2, 1, 0,  ..., 3, 3, 3])
y     tensor([2, 0, 1,  ..., 2, 1, 2])
yhat  tensor([3, 1, 2,  ..., 2, 4, 2])
y     tensor([4, 1, 0,  ..., 2, 4, 0])
yhat  tensor([2, 0, 3,  ..., 4, 3, 2])
y     tensor([2, 1, 1,  ..., 4, 4, 2])
yhat  tensor([3, 4, 3,  ..., 2, 0, 2])
y     tensor([1, 3, 4,  ..., 3, 1, 2])
yhat  tensor([3, 2, 1,  ..., 1, 3, 0])
y     tensor([0, 1, 1,  ..., 2, 1, 0])
yhat  tensor([4, 2, 0,  ..., 2, 4, 2])
y     tensor([3, 0, 0,  ..., 0, 4, 2])
yhat  tensor([0, 2, 0,  ..., 2, 4, 0])
y     tensor([0, 2, 0,  ..., 1, 3, 0])
yhat  tensor([3, 0, 2,  ..., 3, 0, 2])
y     tensor([1, 0, 3,  ..., 3, 0, 2])
yhat  tensor([4, 0, 3,  ..., 1, 3, 0])
y     tensor([3, 0, 0,  ..., 3, 1, 0])
yhat  tensor([3, 3, 0,  ..., 2, 4, 4])
y     tensor([3, 3, 0,  .