In [None]:
import numpy as np
import pandas as pd
from pandas import read_csv
import math
import matplotlib.pyplot as plt
import glob

# Create training dataset from flat pT PixelAV datasets

In [None]:
# Global variables
# pT boundary: the labelling code in this notebook puts |pT| > threshold into class 0 and
# splits the remaining events by sign into classes 1 (negative) and 2 (non-negative).
threshold = 0.1
sensor_geom = "100x25"

# Dataset names; they select the parquet directories read below and the output folder.
train_dataset_name = "dataset_5s"  # for train datasets
test_dataset_name = "dataset_4s"  # for location of test (physical pT) datasets
dataset_savedir = "dataset_5s"  # for save loc of final datasets

In [None]:
# Directory with the unflipped training parquets for the chosen dataset and sensor geometry, e.g.
# /location/of/parquets/smartpixels/dataset_2s/dataset_2s_50x12P5_parquets/unflipped
train_subdir = train_dataset_name+'_'+sensor_geom+'_parquets/unflipped/'
dirtrain = '/location/of/parquets/smartpixels/'+train_dataset_name+'/'+train_subdir

# Read a single labels file and print its first and last rows as a quick sanity check.
dftrain = pd.read_parquet(dirtrain+'labels_d16401.parquet')
for preview in (dftrain.head(), dftrain.tail()):
    print(preview)

In [None]:
# Load the labels/recon2D parquet pairs of the training set and stack them into two frames.
trainlabels = []
trainrecons = []

suffix = 16400  # files are numbered from suffix+1, i.e. labels_d16401 / recon2D_d16401
# Count every parquet file in the directory. The counter is deliberately not called `iter`
# so the Python builtin of that name stays usable.
n_train_files = sum(1 for _ in glob.iglob(dirtrain+'*.parquet'))
print(n_train_files," files present in directory.")

# A third of the file count is used as the number of file indices to read.
# NOTE(review): this assumes exactly three parquet files per index, numbered contiguously
# from suffix+1 — confirm against the directory layout.
n_train_indices = n_train_files//3
if n_train_indices == 0:
    # Without this check pd.concat fails with an unhelpful "No objects to concatenate".
    raise FileNotFoundError("Expected at least 3 parquet files in "+dirtrain+" but found "+str(n_train_files))

for i in range(n_train_indices):
    file_id = str(suffix+i+1)
    trainlabels.append(pd.read_parquet(dirtrain+'labels_d'+file_id+'.parquet'))
    trainrecons.append(pd.read_parquet(dirtrain+'recon2D_d'+file_id+'.parquet'))

# ignore_index gives both frames a fresh 0..N-1 index so rows can be matched by position.
trainlabels_csv = pd.concat(trainlabels, ignore_index=True)
trainrecons_csv = pd.concat(trainrecons, ignore_index=True)

# Count the training events per class with vectorized comparisons (no per-row Python loop).
#   class 0: |pT| > threshold
#   class 1: -threshold <= pT < 0
#   class 2: 0 <= pT <= threshold
# pT == 0 is counted as class 2 so that these counts agree with the class labels assigned
# when the feature table is built (that code also uses 0 <= pT). Anything matching none of
# the ranges (e.g. NaN) is reported as iter_rem.
pt_train = trainlabels_csv['pt']
iter_0 = int((pt_train.abs() > threshold).sum())
iter_1 = int(((pt_train >= -1*threshold) & (pt_train < 0)).sum())
iter_2 = int(((pt_train >= 0) & (pt_train <= threshold)).sum())
iter_rem = len(pt_train) - iter_0 - iter_1 - iter_2
print("iter_0: ",iter_0)
print("iter_1: ",iter_1)
print("iter_2: ",iter_2)
print("iter_rem: ",iter_rem)

# Figure 1: pT spectrum of every training event.
pt_train_all = trainlabels_csv['pt']
plt.hist(pt_train_all, bins=100)
plt.title('pT of all events')
plt.show()

# Figure 2: only the events whose |pT| lies above the threshold (class 0).
train_high_pt = abs(pt_train_all) > threshold
plt.hist(pt_train_all[train_high_pt], bins=100)
plt.title('pT of Class 0 events')
plt.show()

# Figure 3: events within the threshold, drawn on shared axes — non-negative pT first,
# negative pT second.
train_low_pos = (0 <= pt_train_all) & (pt_train_all <= threshold)
train_low_neg = (-1*threshold <= pt_train_all) & (pt_train_all < 0)
for train_low_mask in (train_low_pos, train_low_neg):
    plt.hist(pt_train_all[train_low_mask], bins=50)
plt.title('pT of Class 1+2 events')
plt.show()

# Per-class sample size: the smaller of the class 1 / class 2 counts, rounded down to a
# multiple of 1000. If class 0 has fewer than twice that many events, use half of the
# class 0 count (itself rounded down to a multiple of 1000) instead.
events_per_class = 1000*(min(iter_1, iter_2)//1000)
if iter_0 < 2*events_per_class:
    events_per_class = 500*(iter_0//1000)
number_of_events = int(events_per_class)
print("Number of events: ",number_of_events)


In [None]:
import sys
# Raise NumPy's summarization threshold to the largest integer so printed arrays are shown
# in full instead of being abbreviated.
np.set_printoptions(threshold=sys.maxsize)
def sumRow(X):
    """Sum each row of ``X`` and return the sums as a single NumPy array.

    Parameters
    ----------
    X : array-like
        Sequence of rows (for example a 2-D array). Every row is reduced with
        ``np.sum(row, axis=0)``.

    Returns
    -------
    numpy.ndarray
        One entry per row of ``X``. An empty ``X`` yields an empty array instead of
        raising ``UnboundLocalError``.
    """
    if type(X) is np.ndarray and X.ndim >= 2:
        # Summing along axis 1 equals summing every X[k] along its axis 0, in one call.
        return X.sum(axis=1)
    # Generic path (lists, 1-D input, ragged rows): reduce row by row and build the
    # result array once rather than on every iteration.
    return np.array([np.sum(row, axis=0) for row in X])
# Build the training feature table in one vectorized pass:
#   * every recon2D row is reshaped to a 13x21 array and each of its 13 rows is summed
#     over the 21 entries, giving 13 feature columns (named 0..12);
#   * y-local, the class label and pT are appended as extra columns.
n_train_events = len(trainrecons_csv)
if n_train_events != len(trainlabels_csv):
    # A length mismatch means events and labels no longer line up row by row, so stop
    # here rather than silently pairing up only the leading rows.
    raise ValueError("recon2D and labels frames differ in length: "+str(n_train_events)+" vs "+str(len(trainlabels_csv)))

# Shape (events, 13, 21) -> sum over the last axis -> shape (events, 13).
train_profiles = trainrecons_csv.to_numpy().reshape(n_train_events, 13, 21).sum(axis=2)

# float64 / int64 are requested explicitly so the column dtypes do not depend on how the
# parquet files stored the values.
train_pt_values = trainlabels_csv['pt'].to_numpy(dtype='float64')
train_ylocal_values = trainlabels_csv['y-local'].to_numpy(dtype='float64')

# Class labels: 0 for |pT| > threshold, 1 for -threshold <= pT < 0, 2 for 0 <= pT <= threshold.
# The first matching condition wins; events matching none (e.g. NaN pT) keep the label -1.
train_cls_values = np.select(
    [
        np.abs(train_pt_values) > threshold,
        (train_pt_values >= -1*threshold) & (train_pt_values < 0),
        (train_pt_values >= 0) & (train_pt_values <= threshold),
    ],
    [0, 1, 2],
    default=-1,
).astype('int64')

train_meta = pd.DataFrame({'y-local': train_ylocal_values, 'cls': train_cls_values, 'pt': train_pt_values})
traindf_all = pd.concat([pd.DataFrame(train_profiles), train_meta], axis=1)
print(traindf_all.head())

In [None]:
# Per-class sample size and the seeds of the three shuffles performed below.
totalsize = number_of_events
random_seed0 = 10#11
random_seed1 = 13#14
random_seed2 = 19#20

# Shuffle 1: reorder the whole table, then split it by class label.
traindf_all = traindf_all.sample(frac=1, random_state=random_seed0).reset_index(drop=True)
# traindf_all.to_csv(dataset_savedir+'/'+'/FullTrainData_'+sensor_geom+'_0P'+str(threshold - int(threshold))[2:]+'thresh.csv', index=False)
train_cls_column = traindf_all['cls']
traindfcls0 = traindf_all.loc[train_cls_column==0]
traindfcls1 = traindf_all.loc[train_cls_column==1]
traindfcls2 = traindf_all.loc[train_cls_column==2]
for class_frame in (traindfcls0, traindfcls1, traindfcls2):
    print(class_frame.shape)
print(traindfcls2.head())

# Balance the classes: class 0 keeps at most 2*totalsize rows, classes 1 and 2 at most
# totalsize rows each.
traindfcls0 = traindfcls0.iloc[:2*totalsize]
traindfcls1 = traindfcls1.iloc[:totalsize]
traindfcls2 = traindfcls2.iloc[:totalsize]
print(traindfcls2.head())

# Shuffle 2 (within each class), then merge the classes and apply shuffle 3 to the result.
traincls0, traincls1, traincls2 = [
    class_frame.sample(frac = 1, random_state=random_seed1)
    for class_frame in (traindfcls0, traindfcls1, traindfcls2)
]
train = pd.concat([traincls0, traincls1, traincls2], axis=0)
train = train.sample(frac=1, random_state=random_seed2)

for part in (traincls0, traincls1, traincls2, train):
    print(part.shape)

# Split off the class label and pT columns; the remaining columns stay in `train`.
trainlabel = train['cls']
trainpt = train['pt']
train = train.drop(['cls', 'pt'], axis=1)

for part in (train, trainlabel, trainpt):
    print(part.shape)

# The three output files share one geometry/threshold tag in their names.
train_file_tag = sensor_geom+'_0P'+str(threshold - int(threshold))[2:]+'thresh.csv'
train.to_csv(dataset_savedir+'/FullPrecisionInputTrainSet_'+train_file_tag, index=False)
trainlabel.to_csv(dataset_savedir+'/TrainSetLabel_'+train_file_tag, index=False)
trainpt.to_csv(dataset_savedir+'/TrainSetPt_'+train_file_tag, index=False)

# Create test datasets from physical PixelAV dataset

In [None]:
# Directory with the unflipped test parquets for the chosen dataset and sensor geometry, e.g.
# /location/of/parquets/smartpixels/dataset_2s/dataset_2s_50x12P5_parquets/unflipped
test_subdir = test_dataset_name+'_'+sensor_geom+'_parquets/unflipped/'
dirtest = '/location/of/parquets/smartpixels/'+test_dataset_name+'/'+test_subdir

# Read a single labels file and print its first and last rows as a quick sanity check.
dftest = pd.read_parquet(dirtest+'labels_d16401.parquet')
for preview in (dftest.head(), dftest.tail()):
    print(preview)

In [None]:
# Load the labels/recon2D parquet pairs of the test set and stack them into two frames.
testlabels = []
testrecons = []

suffix = 16400  # files are numbered from suffix+1, i.e. labels_d16401 / recon2D_d16401
# Count every parquet file in the directory. The counter is deliberately not called `iter`
# so the Python builtin of that name stays usable.
n_test_files = sum(1 for _ in glob.iglob(dirtest+'*.parquet'))
print(n_test_files," files present in directory.")

# A third of the file count is used as the number of file indices to read.
# NOTE(review): this assumes exactly three parquet files per index, numbered contiguously
# from suffix+1 — confirm against the directory layout.
n_test_indices = n_test_files//3
if n_test_indices == 0:
    # Without this check pd.concat fails with an unhelpful "No objects to concatenate".
    raise FileNotFoundError("Expected at least 3 parquet files in "+dirtest+" but found "+str(n_test_files))

for i in range(n_test_indices):
    file_id = str(suffix+i+1)
    testlabels.append(pd.read_parquet(dirtest+'labels_d'+file_id+'.parquet'))
    testrecons.append(pd.read_parquet(dirtest+'recon2D_d'+file_id+'.parquet'))

# ignore_index gives both frames a fresh 0..N-1 index so rows can be matched by position.
testlabels_csv = pd.concat(testlabels, ignore_index=True)
testrecons_csv = pd.concat(testrecons, ignore_index=True)

# Count the test events per class with vectorized comparisons (no per-row Python loop).
#   class 0: |pT| > threshold
#   class 1: -threshold <= pT < 0
#   class 2: 0 <= pT <= threshold
# pT == 0 is counted as class 2 so that these counts agree with the class labels assigned
# when the feature table is built (that code also uses 0 <= pT). Anything matching none of
# the ranges (e.g. NaN) is reported as iter_rem.
pt_test = testlabels_csv['pt']
iter_0 = int((pt_test.abs() > threshold).sum())
iter_1 = int(((pt_test >= -1*threshold) & (pt_test < 0)).sum())
iter_2 = int(((pt_test >= 0) & (pt_test <= threshold)).sum())
iter_rem = len(pt_test) - iter_0 - iter_1 - iter_2
print("iter_0: ",iter_0)
print("iter_1: ",iter_1)
print("iter_2: ",iter_2)
print("iter_rem: ",iter_rem)

# Figure 1: pT spectrum of every test event.
pt_test_all = testlabels_csv['pt']
plt.hist(pt_test_all, bins=100)
plt.title('pT of all events')
plt.show()

# Figure 2: only the events whose |pT| lies above the threshold (class 0).
test_high_pt = abs(pt_test_all) > threshold
plt.hist(pt_test_all[test_high_pt], bins=100)
plt.title('pT of Class 0 events')
plt.show()

# Figure 3: events within the threshold, drawn on shared axes — non-negative pT first,
# negative pT second.
test_low_pos = (0 <= pt_test_all) & (pt_test_all <= threshold)
test_low_neg = (-1*threshold <= pt_test_all) & (pt_test_all < 0)
for test_low_mask in (test_low_pos, test_low_neg):
    plt.hist(pt_test_all[test_low_mask], bins=50)
plt.title('pT of Class 1+2 events')
plt.show()

# Per-class sample size: the smaller of the class 1 / class 2 counts, rounded down to a
# multiple of 1000. If class 0 has fewer than twice that many events, use half of the
# class 0 count (itself rounded down to a multiple of 1000) instead.
events_per_class = 1000*(min(iter_1, iter_2)//1000)
if iter_0 < 2*events_per_class:
    events_per_class = 500*(iter_0//1000)
number_of_events = int(events_per_class)
print("Number of events: ",number_of_events)


In [None]:
# Raise NumPy's summarization threshold so printed arrays are shown in full.
# NOTE: `sys` is not imported in this cell; it is only available because the training
# feature cell earlier in the notebook runs `import sys`.
np.set_printoptions(threshold=sys.maxsize)
def sumRow(X):
    """Sum each row of ``X`` and return the sums as a single NumPy array.

    Parameters
    ----------
    X : array-like
        Sequence of rows (for example a 2-D array). Every row is reduced with
        ``np.sum(row, axis=0)``.

    Returns
    -------
    numpy.ndarray
        One entry per row of ``X``. An empty ``X`` yields an empty array instead of
        raising ``UnboundLocalError``.
    """
    if type(X) is np.ndarray and X.ndim >= 2:
        # Summing along axis 1 equals summing every X[k] along its axis 0, in one call.
        return X.sum(axis=1)
    # Generic path (lists, 1-D input, ragged rows): reduce row by row and build the
    # result array once rather than on every iteration.
    return np.array([np.sum(row, axis=0) for row in X])
# Build the test feature table in one vectorized pass:
#   * every recon2D row is reshaped to a 13x21 array and each of its 13 rows is summed
#     over the 21 entries, giving 13 feature columns (named 0..12);
#   * y-local, the class label and pT are appended as extra columns.
n_test_events = len(testrecons_csv)
if n_test_events != len(testlabels_csv):
    # A length mismatch means events and labels no longer line up row by row, so stop
    # here rather than silently pairing up only the leading rows.
    raise ValueError("recon2D and labels frames differ in length: "+str(n_test_events)+" vs "+str(len(testlabels_csv)))

# Shape (events, 13, 21) -> sum over the last axis -> shape (events, 13).
test_profiles = testrecons_csv.to_numpy().reshape(n_test_events, 13, 21).sum(axis=2)

# float64 / int64 are requested explicitly so the column dtypes do not depend on how the
# parquet files stored the values.
test_pt_values = testlabels_csv['pt'].to_numpy(dtype='float64')
test_ylocal_values = testlabels_csv['y-local'].to_numpy(dtype='float64')

# Class labels: 0 for |pT| > threshold, 1 for -threshold <= pT < 0, 2 for 0 <= pT <= threshold.
# The first matching condition wins; events matching none (e.g. NaN pT) keep the label -1.
test_cls_values = np.select(
    [
        np.abs(test_pt_values) > threshold,
        (test_pt_values >= -1*threshold) & (test_pt_values < 0),
        (test_pt_values >= 0) & (test_pt_values <= threshold),
    ],
    [0, 1, 2],
    default=-1,
).astype('int64')

test_meta = pd.DataFrame({'y-local': test_ylocal_values, 'cls': test_cls_values, 'pt': test_pt_values})
testdf_all = pd.concat([pd.DataFrame(test_profiles), test_meta], axis=1)
print(testdf_all.head())

In [None]:
# Sample size and seeds; only random_seed0 is used by the active code in this cell, the
# others belong to the disabled class-balancing code kept below.
totalsize = number_of_events#227000
random_seed0 = 10#11
random_seed1 = 13#14
random_seed2 = 19#20

# All output files of this cell share one geometry/threshold tag in their names.
test_file_tag = sensor_geom+'_0P'+str(threshold - int(threshold))[2:]+'thresh.csv'

# Shuffle the whole table once and store it, features and targets together.
testdf_all = testdf_all.sample(frac=1, random_state=random_seed0).reset_index(drop=True)
testdf_all.to_csv(dataset_savedir+'/'+'/FullTestData_'+test_file_tag, index=False)

# Class balancing is disabled for the test set (every event is kept):
# testdfcls0 = testdf_all.loc[testdf_all['cls']==0]
# testdfcls1 = testdf_all.loc[testdf_all['cls']==1]
# testdfcls2 = testdf_all.loc[testdf_all['cls']==2]
# print(testdfcls0.shape)
# print(testdfcls1.shape)
# print(testdfcls2.shape)
# print(testdfcls2.head())
# testdfcls0 = testdfcls0.iloc[:2*totalsize]
# testdfcls1 = testdfcls1.iloc[:totalsize]
# testdfcls2 = testdfcls2.iloc[:totalsize]
# print(testdfcls2.head())

# testcls0 = testdfcls0.sample(frac = 1, random_state=random_seed1)
# testcls1 = testdfcls1.sample(frac = 1, random_state=random_seed1)
# testcls2 = testdfcls2.sample(frac = 1, random_state=random_seed1)
# test = pd.concat([testcls0, testcls1, testcls2], axis=0)

# test = test.sample(frac=1, random_state=random_seed2)
test = testdf_all
# print(testcls0.shape)
# print(testcls1.shape)
# print(testcls2.shape)
print(test.shape)

# Split off the class label and pT columns; the remaining columns stay in `test`.
testlabel = test['cls']
testpt = test['pt']
test = test.drop(['cls', 'pt'], axis=1)

for part in (test, testlabel, testpt):
    print(part.shape)

test.to_csv(dataset_savedir+'/FullPrecisionInputTestSet_'+test_file_tag, index=False)
testlabel.to_csv(dataset_savedir+'/TestSetLabel_'+test_file_tag, index=False)
testpt.to_csv(dataset_savedir+'/TestSetPt_'+test_file_tag, index=False)

In [None]:
# plotdata = pd.concat([traincls0['pt'], testcls0['pt']], axis=0)
# #print(plt.hist(traincls0['pt'], bins=100))
# #print(plt.hist(testcls0['pt'], bins=100))
# plt.hist(traincls0['pt'], bins=100)
# plt.hist(testpt, bins=100)
# plt.xticks(np.arange(-5, 5, 1))
# plt.show()

In [None]:
# Dataset 2s
# 0.1 threshold =  288000 total events for test/train!?
# 0.125 thresh  = 432000 total events for test/train
# 0.15 thresh   = 588000 total events for test/train!?
# 0.175 thresh  = 748000 total events for test/train
# 0.2 threshold =  908000 total events for test/train
# 0.3 threshold = 1536000 total events for test/train!?
# 0.4 threshold = 1998000 total events for test/train
# 0.5 threshold = 1566000 total events for test/train
# 100x25x100 and dataset4s_50x12.5 follow the numbers above. Not sure about 50x25, which I believe used the new random seeds.
# For 100x25x150 (and maybe for the new random seeds): 0.1=284000, 0.15=576000, 0.2=896000, 0.3=1516000, 0.4=1898000, 0.5=1550000

# Dataset 1s
# 0.1 threshold =  288000 total events for test/train
# 0.15 threshold = 588000 total events for test/train
# 0.2 threshold =  908000 total events for test/train
# 0.3 threshold = 1536000 total events for test/train
# 0.4 threshold = 1916000 total events for test/train
# 0.5 threshold = 1484000 total events for test/train

# Dataset 3s
# 0.1 threshold =  52000 total events for train
# 0.15 threshold = 96000 total events for train
# 0.2 threshold =  136000 total events for train
# 0.3 threshold =  216000 total events for train
# 0.4 threshold =  296000 total events for train
# 0.5 threshold =  376000 total events for train
# For 100x25x150, the stats were lower by 4k events at all pT boundaries except 0.1.