# Extract data from output files

### Code to extract timing information from output files of Lbann code
March 9, 2020

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import subprocess as sp
import os
import glob
import itertools

In [2]:
%matplotlib widget

## Extract training times

In [3]:
def f_get_output(fname):
    ''' Use grep to get run times from output files
    '''
    cmd='grep "run time" {0}| grep "training"'.format(fname)
    op1=sp.check_output(cmd,shell=True).decode('utf-8').split('\n')[:-1]
    # print(op1)

    cmd='grep "run time" {0}| grep "validation"'.format(fname)
    op2=sp.check_output(cmd,shell=True).decode('utf-8').split('\n')[:-1]
    
    return op1,op2

def f_get_run_times(op_arr):
    '''
    Get run times from output file out.log
    '''
    run_times=np.array([np.float(i.split(':')[-1][:-1]) for i in op_arr])
    return run_times


def f_store_run_times(fname):
    
    ### Get output 
    op1,op2=f_get_output(fname+'/out.log')
    ### Get arrays from outputs for training and validation
    arr1=f_get_run_times(op1)
    arr2=f_get_run_times(op2)
    ### Print times
    size=len(arr1) ### Number of epochs
    ### Compute mean and errors of times
    train_mean,train_err=np.mean(arr1),np.std(arr1)/np.sqrt(size)
    val_mean,val_err=np.mean(arr2),np.std(arr2)/np.sqrt(size)
    
    
    ### Extract processor info from the file name
    try:
#         ## For slurm files
#         lst=fname.split('/')[-1].split('.')[0].split('_')
# #         print(lst)
#         nodes,procs=int(lst[1]),int(lst[2].split('-')[0])
    
        # For out.log files
        lst=fname.split('_')
#         print(lst)
        batch,nodes,procs=int(lst[4].split('bsize')[-1]),int(lst[6]),int(lst[7])
    
    except Exception as e:
        print(e,fle)
        nodes,procs,cpus=None,None,None
        pass
    
    job_strg='%s_%s_%s'%(nodes,procs,batch)
    
    keys=['train_arr','val_arr','train_mean','train_err','val_mean','val_err','num_epochs','nodes','GPUs_per_node','batchsize','job_strg']
    values=[arr1,arr2,train_mean,train_err,val_mean,val_err,size,nodes,procs,batch,job_strg]
    info_dict=dict(zip(keys,values))
    
    return info_dict

In [4]:
# fname=main_dir+'20200309_161923_exagan/out.log'
# f_store_run_times(fname)

In [5]:
### Code for directly using slurm files
# main_dir='/global/cfs/cdirs/m3363/vayyar/cosmogan_data/results_data/'
# df=pd.DataFrame([])

# for fle in glob.glob(main_dir+'slurm-scaling_*_*.out'):
# #     print(fle)
#     info_dict=f_store_run_times(fle)
#     df=df.append(info_dict,ignore_index=True)
    
# df=df.sort_values(by=['nodes','GPUs_per_node']).reset_index(drop=True)
# df

In [6]:
main_dir='/global/cscratch1/sd/vpa/proj/cosmogan/results_dir/128square/scaling_runs/'
df=pd.DataFrame([])

for fle in glob.glob(main_dir+'*scaling*'):
#     print(fle)

    info_dict=f_store_run_times(fle)
    df=df.append(info_dict,ignore_index=True)
    
df=df.sort_values(by=['nodes','GPUs_per_node']).reset_index(drop=True)

df['num_GPUs']=df.GPUs_per_node*df.nodes
df['batch_samples_per_GPU']=(df.batchsize/df.num_GPUs).astype(int)
df

Unnamed: 0,GPUs_per_node,batchsize,job_strg,nodes,num_epochs,train_arr,train_err,train_mean,val_arr,val_err,val_mean,num_GPUs,batch_samples_per_GPU
0,1.0,32.0,1_1_32,1.0,10.0,"[359.143, 354.098, 354.01, 355.164, 357.03, 35...",0.551521,356.1952,"[15.6804, 15.4904, 15.1138, 16.7692, 15.5531, ...",0.163812,15.5513,1.0,32
1,4.0,128.0,1_4_128,1.0,10.0,"[109.946, 106.336, 104.856, 105.27, 105.423, 1...",0.444558,105.9669,"[4.88709, 4.62119, 4.63423, 4.72335, 4.72887, ...",0.058385,4.676315,4.0,32
2,8.0,64.0,1_8_64,1.0,10.0,"[79.5875, 76.1539, 76.2804, 78.7019, 76.6649, ...",0.444776,77.87435,"[2.82619, 2.97278, 3.79764, 2.86661, 2.96543, ...",0.216331,3.410702,8.0,8
3,8.0,256.0,1_8_256,1.0,10.0,"[59.2155, 54.1355, 53.9806, 55.1209, 54.3788, ...",0.497785,54.84403,"[2.85722, 2.41639, 2.46879, 2.47476, 2.77964, ...",0.046266,2.536589,8.0,32
4,8.0,32.0,1_8_32,1.0,10.0,"[117.838, 111.049, 113.513, 115.824, 114.904, ...",0.628472,114.0449,"[4.4348, 4.46372, 4.15406, 4.15757, 4.64013, 4...",0.255027,4.682148,8.0,4
5,8.0,1024.0,1_8_1024,1.0,10.0,"[52.2489, 44.9361, 46.0965, 43.5757, 43.7856, ...",0.818162,45.10415,"[3.96834, 1.93799, 1.9347, 1.94431, 3.38007, 1...",0.22355,2.288528,8.0,128
6,8.0,512.0,1_8_512,1.0,10.0,"[51.7851, 47.3257, 46.464, 45.7134, 46.7922, 4...",0.557007,47.06034,"[3.64415, 1.96741, 2.11442, 1.95557, 2.3897, 1...",0.162875,2.256876,8.0,64
7,8.0,2048.0,1_8_2048,1.0,10.0,"[63.1145, 41.8487, 44.9689, 41.5233, 45.2813, ...",1.889516,45.77109,"[6.69585, 2.00216, 1.98863, 1.98551, 1.98065, ...",0.508531,2.747454,8.0,256
8,2.0,128.0,2_2_128,2.0,10.0,"[106.761, 103.506, 104.514, 105.436, 103.888, ...",0.307315,104.5503,"[4.66266, 4.32728, 4.29564, 4.07905, 3.99203, ...",0.099529,4.443819,4.0,32
9,4.0,256.0,2_4_256,2.0,10.0,"[55.2396, 53.0037, 53.7125, 54.0469, 51.1769, ...",0.408644,52.65502,"[2.36805, 2.05706, 2.05694, 2.06412, 2.40571, ...",0.189731,2.471418,8.0,32


In [7]:
col_list=['job_strg','batchsize','num_GPUs','batch_samples_per_GPU','train_mean','train_err']
df[col_list]

Unnamed: 0,job_strg,batchsize,num_GPUs,batch_samples_per_GPU,train_mean,train_err
0,1_1_32,32.0,1.0,32,356.1952,0.551521
1,1_4_128,128.0,4.0,32,105.9669,0.444558
2,1_8_64,64.0,8.0,8,77.87435,0.444776
3,1_8_256,256.0,8.0,32,54.84403,0.497785
4,1_8_32,32.0,8.0,4,114.0449,0.628472
5,1_8_1024,1024.0,8.0,128,45.10415,0.818162
6,1_8_512,512.0,8.0,64,47.06034,0.557007
7,1_8_2048,2048.0,8.0,256,45.77109,1.889516
8,2_2_128,128.0,4.0,32,104.5503,0.307315
9,2_4_256,256.0,8.0,32,52.65502,0.408644


In [8]:
markers=itertools.cycle(('*','s','D','h','.','+'))


In [17]:
### Scaling plot 
df_temp=df[(df.batch_samples_per_GPU==32)].sort_values(by='num_GPUs')

plt.figure()
for i,j in df_temp.iterrows():
    x,y,yerr=j.GPUs_per_node*j.nodes,j.train_mean,j.train_err
    plt.errorbar(x=x,y=y,yerr=yerr,label=j.job_strg,markersize=10,marker=next(markers))
    
### Comparison with expected scaling
x=np.linspace(1,16,num=50)
plt.plot(x,356.0/x,color='y',label='y=356/x')
plt.xlabel('total GPUs')
plt.ylabel('Training time \nin seconds')
plt.legend()
plt.title('Plot of Training time vs total number of GPUs ')

# plt.savefig('scalingplot1.png')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Text(0.5, 1.0, 'Plot of Training time vs total number of GPUs ')

In [18]:
df_temp[col_list]

Unnamed: 0,job_strg,batchsize,num_GPUs,batch_samples_per_GPU,train_mean,train_err
0,1_1_32,32.0,1.0,32,356.1952,0.551521
1,1_4_128,128.0,4.0,32,105.9669,0.444558
8,2_2_128,128.0,4.0,32,104.5503,0.307315
3,1_8_256,256.0,8.0,32,54.84403,0.497785
9,2_4_256,256.0,8.0,32,52.65502,0.408644
12,4_2_256,256.0,8.0,32,59.80198,0.690984
10,2_8_512,512.0,16.0,32,25.85925,0.623332
13,4_4_512,512.0,16.0,32,25.42578,0.412116


In [12]:
### Variation with batch size
df_temp=df[(df.nodes==1.0)&(df.GPUs_per_node==8.0)].sort_values(by='batchsize')
plt.figure()
for i,j in df_temp.iterrows():
    x,y,yerr=j.batchsize,j.train_mean,j.train_err
    plt.errorbar(x=x,y=y,yerr=yerr,label=j.job_strg,markersize=8,marker=next(markers))
    
# ### Comparison with expected scaling
# x=np.linspace(16,1024,num=50)
# plt.plot(x,(120*16)/x,color='y',label='y=350/x')
plt.xlabel('batch size')
plt.ylabel('Training time \nin seconds')
# plt.xscale('log')
plt.xticks([2**i for i in range(4,12)])
plt.title('Training time vs batch-size')
plt.legend()


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<matplotlib.legend.Legend at 0x2aaad8fc2250>

In [15]:
df_temp[col_list]

Unnamed: 0,job_strg,batchsize,num_GPUs,batch_samples_per_GPU,train_mean,train_err
4,1_8_32,32.0,8.0,4,114.0449,0.628472
2,1_8_64,64.0,8.0,8,77.87435,0.444776
3,1_8_256,256.0,8.0,32,54.84403,0.497785
6,1_8_512,512.0,8.0,64,47.06034,0.557007
5,1_8_1024,1024.0,8.0,128,45.10415,0.818162
7,1_8_2048,2048.0,8.0,256,45.77109,1.889516


In [None]:
# ### Plot individual times
# plt.figure()
# y=df[df.job_strg=='2_8_512'].train_arr.values[0]
# plt.plot(y,linestyle='',marker='*')
# plt.axhline(np.mean(y),color='y')

28.8