# Extract data from output files
### Analyze the output from a single LBANN run
March 9, 2020 \
April 6, 2020: Store files in order of epochs. \
April 21, 2020: Add Jupyter widgets to compare pixel intensity plots. \
July 30, 2020: Perform analysis with stored histograms.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

import subprocess as sp
import os
import glob
import sys

import itertools
import time

from ipywidgets import *

In [2]:
%matplotlib widget

In [3]:
sys.path.append('/global/u1/v/vpa/project/jpt_notebooks/Cosmology/Cosmo_GAN/repositories/lbann_cosmogan/3_analysis')
from modules_image_analysis import *

In [4]:
### Transformation functions for image pixel values
def f_transform(x):
    '''
    Scale pixel values with s = 2x/(x+4) - 1.
    A small constant (1e-8) is added to the denominator.
    Works on scalars and numpy arrays (element-wise).
    '''
    numerator = 2. * x
    denominator = x + 4. + 1e-8
    return numerator / denominator - 1.

def f_invtransform(s):
    '''
    Map scaled values back to pixel values with x = 4(1+s)/(1-s).
    A small constant (1e-8) is added to the denominator.
    Works on scalars and numpy arrays (element-wise).
    '''
    scaled = 4. * (1. + s)
    return scaled / (1. - s + 1e-8)

## Extract folder

In [5]:
# Candidate parent directories that hold the run results, keyed by a short name.
dict1={'scratch':'/global/cscratch1/sd/vpa/proj/cosmogan/results_dir/128square/',
    'proj':'/global/cfs/cdirs/m3363/vayyar/cosmogan_data/results_data/'}

# Select widget over the keys of dict1; u.result is the path for the chosen key.
u=interactive(lambda x: dict1[x], x=Select(options=dict1.keys()))
display(u)


interactive(children=(Select(description='x', options=('scratch', 'proj'), value='scratch'), Output()), _dom_c…

In [6]:
# Parent directory picked in the Select widget `u`.
parent_dir=u.result
# Last path component of every entry in parent_dir whose name starts with '20'.
dir_lst=[i.split('/')[-1] for i in glob.glob(parent_dir+'20*')]
# Dropdown over those names; the chosen name is available as w.result.
w=interactive(lambda x: x, x=Dropdown(options=dir_lst))
display(w)

interactive(children=(Dropdown(description='x', options=('20200911_083711_bsize64_spec_test_128_nospec', '2020…

In [7]:
# Run directory = parent directory + name chosen in the dropdown `w`.
# Plain string concatenation: relies on parent_dir ending with '/'.
result=w.result
main_dir=parent_dir+result
print(main_dir)

/global/cscratch1/sd/vpa/proj/cosmogan/results_dir/128square/20201030_072640_bsize64_no_adv


## Extract metrics info from log file

In [8]:
def f_extract_info(fname):
    '''
    Extract training and validation metrics from an LBANN out.log file.

    Every line that contains a category name ('training' or 'validation')
    and a metric tag is collected; the text after the last ':' on that line
    is taken as the value.

    Arguments:
        fname: path to the log file.
    Returns:
        pandas DataFrame with one row per matching log line and the columns
        train_obj, train_dreal, train_dfake, train_gen, train_spec, val_obj,
        val_dreal, val_dfake, val_gen, val_spec (floats), train_time, val_time
        (floats, trailing 's' removed) and train_batch_stats, val_batch_stats
        (raw strings). A float or time column whose metric is absent from the
        log is left out and a message is printed; an absent batch-stats
        metric gives a NaN column.
    Raises:
        OSError if the file cannot be opened.
        ValueError if a time or batch-stats column has a different number of
        entries than the columns already stored.
    '''
    strg_lst=['objective','d_real','d_fake','gen','spec_loss','run time','mini-batch']
    categories=['training','validation']
    dict1={category+'_'+strg:[] for category in categories for strg in strg_lst}

    # Read the file once and do plain substring matching. This selects the same
    # lines as `grep category | grep tag` (the tags hold no regex metacharacters)
    # without building a shell command from the file name.
    # newline='\n' keeps any '\r' untouched, as grep would.
    with open(fname,encoding='utf-8',newline='\n') as f:
        for line in f:
            line=line.rstrip('\n')
            for category in categories:
                if category not in line:
                    continue
                for strg in strg_lst:
                    if strg in line:
                        dict1[category+'_'+strg].append(line.split(':')[-1])

    df=pd.DataFrame([])
    key_lst=['training_objective', 'training_d_real', 'training_d_fake', 'training_gen', 'training_spec_loss','validation_objective', 'validation_d_real', 'validation_d_fake', 'validation_gen','validation_spec_loss']
    col_list=['train_obj','train_dreal','train_dfake','train_gen','train_spec','val_obj','val_dreal','val_dfake','val_gen','val_spec']
    for col,key in zip(col_list,key_lst):
        values=dict1[key]
        if not values:
            # Best effort: some runs do not log every metric (e.g. spec_loss).
            print('No "{0}" entries found in {1}; skipping column {2}'.format(key,fname,col))
            continue
        try:
            # Builtin float instead of np.float, which NumPy 1.24 removed.
            df[col]=np.array(values).astype(float)
        except ValueError as e:
            # Unparseable number or length mismatch with the stored columns.
            print('Skipping column {0}: {1}'.format(col,e))

    ### Need to remove the trailing 's' in the timings
    for col,key in zip(['train_time','val_time'],['training_run time','validation_run time']):
        stripped=[i.strip() for i in dict1[key]]
        values=[i[:-1] if i.endswith('s') else i for i in stripped]
        if not values:
            print('No "{0}" entries found in {1}; skipping column {2}'.format(key,fname,col))
            continue
        df[col]=np.array(values).astype(float)

    for col,key in zip(['train_batch_stats','val_batch_stats'],['training_mini-batch','validation_mini-batch']):
        values=dict1[key]
        # Keep the column even when the metric is absent (filled with NaN).
        df[col]=np.array(values) if values else np.nan

    return df

def f_plot_metrics(df,col_list=['train_obj']):
    '''
    Draw the chosen dataframe columns in one new figure.
    Each column is plotted against the dataframe index as unconnected points,
    with its own marker (markers repeat after six columns) and a legend entry.
    '''
    plt.figure()

    marker_cycle=itertools.cycle(('o','*','H','D','.','x'))
    for col_name,mark in zip(col_list,marker_cycle):
        plt.plot(df[col_name],linestyle='',marker=mark,label=col_name)

    plt.legend()
    plt.xlabel('Epoch')
    

In [9]:
# Parse the metrics from the run's out.log.
# glob returns a list; [0] raises IndexError when out.log is not present in main_dir.
strg=main_dir+'/out.log'
df_metrics=f_extract_info(glob.glob(strg)[0])

In [10]:
# Show the first rows of the extracted metrics table.
df_metrics.head()

Unnamed: 0,train_obj,train_dreal,train_dfake,train_gen,train_spec,val_obj,val_dreal,val_dfake,val_gen,val_spec,train_time,val_time,train_batch_stats,val_batch_stats
0,40.802,0.076274,0.091411,7.97377,5.07929,40.4323,0.06228,0.061174,0.146947,5.0386,174.956,18.6701,"0.0495697s mean, 3.40507s max, 0.018013s min,...","0.0476893s mean, 1.80061s max, 0.0133379s min..."
1,40.3618,0.062311,0.061893,5.09521,5.0297,40.3294,0.055859,0.055527,0.057676,5.02725,183.664,14.4007,"0.0520436s mean, 1.04747s max, 0.0277937s min...","0.0367715s mean, 0.7699s max, 0.0142252s min,..."
2,40.3187,0.061565,0.059525,4.92271,5.0247,40.3386,0.060451,0.059045,0.063521,5.02738,169.156,13.9162,"0.0479191s mean, 0.7856s max, 0.0194172s min,...","0.0355324s mean, 0.417156s max, 0.011705s min..."
3,40.3005,0.060006,0.060363,4.86138,5.02252,40.3004,0.061581,0.061746,0.043963,5.02213,170.507,12.5547,"0.0483025s mean, 1.01236s max, 0.0186408s min...","0.0320475s mean, 0.900836s max, 0.0118082s mi..."
4,40.2807,0.055643,0.05529,4.84553,5.02122,40.2946,0.059355,0.05669,0.012427,5.02232,174.757,13.3806,"0.0495121s mean, 0.96741s max, 0.0278907s min...","0.0341637s mean, 0.471293s max, 0.0119462s mi..."


In [11]:
# Offer every column except those whose name ends with 'stats' (raw strings, not plottable).
col_list=df_metrics.columns[~df_metrics.columns.str.endswith('stats')]
# Multi-select widget; f_plot_metrics is called with the selection when the user triggers it.
interact_manual(f_plot_metrics,col_list=SelectMultiple(options=col_list),df=fixed(df_metrics))



interactive(children=(SelectMultiple(description='col_list', options=('train_obj', 'train_dreal', 'train_dfake…

<function __main__.f_plot_metrics(df, col_list=['train_obj'])>

## Extract data from stored dataframe

In [12]:
### Load data
# Read the pickled dataframe stored in the run directory.
df=pd.read_pickle(main_dir+'/df_processed.pkle')
# Store epoch and step as integers.
df[['epoch','step']]=df[['epoch','step']].astype(int)
df['label']=df.epoch.astype(str)+'-'+df.step.astype(str) # Add label column for plotting

In [13]:
# Size of the table (rows, columns) followed by its first rows.
print(df.shape)
df.head()


(105480, 20)


Unnamed: 0,epoch,step,img_type,fname,chi_1a,chi_1b,chi_1c,chi_1,chi_2,chi_imgvar,chi_spec1,chi_spec2,num_large,num_vlarge,hist_val,hist_err,hist_bin_centers,spec_val,spec_err,label
0,0,0,train_gen,/global/cscratch1/sd/vpa/proj/cosmogan/results...,188.919404,4.75252,0.013036,193.68496,12.402488,26.13253,17729.620094,20681580.0,0,0,"[0.00021404027994051573, 0.05836540469577179, ...","[5.26696921090502e-05, 0.00262919138735563, 0....","[-1.031746031584782, -0.6161616169043975, -0.3...","[2266.70275271032, 1044.6298010309226, 826.267...","[68.58417320455962, 37.496178009155265, 32.861...",0-0
1,0,1,train_gen,/global/cscratch1/sd/vpa/proj/cosmogan/results...,238.507563,5.333524,0.013186,243.854272,14.271373,31.088954,8926.736292,11049890.0,0,0,"[5.0693750512227395e-05, 0.02997636799661518, ...","[1.2111174322443983e-05, 0.0016252022154385672...","[-1.031746031584782, -0.6161616169043975, -0.3...","[5380.415806778707, 1882.6894395637264, 1255.5...","[173.33113652735864, 60.374580587056485, 39.78...",0-1
2,0,2,train_gen,/global/cscratch1/sd/vpa/proj/cosmogan/results...,306.525973,6.977621,0.013186,313.51678,16.628523,35.783426,5553.641825,8492108.0,0,0,"[7.510185261070727e-06, 0.013816863319699692, ...","[3.635852805382416e-06, 0.0007983249998404476,...","[-1.031746031584782, -0.6161616169043975, -0.3...","[12367.983792795334, 3665.5671864400974, 2255....","[301.26794961431574, 97.75754271703867, 64.377...",0-2
3,0,3,train_gen,/global/cscratch1/sd/vpa/proj/cosmogan/results...,407.756221,10.432857,0.013186,418.202263,19.775777,39.63778,4773.594189,11001510.0,0,0,"[0.0, 0.0050806403238357585, 0.128989845624795...","[0.0, 0.0003324106489253007, 0.002421003735140...","[-1.031746031584782, -0.6161616169043975, -0.3...","[21902.86684275791, 6149.352097958418, 3553.02...","[524.5239781223627, 152.36700498473846, 83.203...",0-3
4,0,4,train_gen,/global/cscratch1/sd/vpa/proj/cosmogan/results...,558.621431,17.987106,0.013186,576.621723,24.191412,44.686587,5043.314563,13932650.0,0,0,"[0.0, 0.0009736418739058081, 0.061062097601746...","[0.0, 7.408091242751353e-05, 0.001451086926648...","[-1.031746031584782, -0.6161616169043975, -0.3...","[32610.217919546645, 8960.136380770708, 5103.8...","[638.5630870288279, 174.40569575426963, 113.26...",0-4


### View best epochs

#### Locations with best chi_sqr

In [14]:
chi_sqr_keys=['epoch','step','chi_1a','chi_1b','chi_1c','chi_1','chi_2','chi_imgvar','chi_spec1','chi_spec2']
# Row label of the minimum of each chi-square column (every key after 'epoch' and 'step').
# idxmin is called without axis=1: a Series has a single axis and current pandas rejects axis=1.
inds=np.array([df[key].idxmin() for key in chi_sqr_keys[2:]])
# One output row per chi-square column, in the order of chi_sqr_keys[2:];
# a row repeats when it holds the minimum of several columns.
df.loc[inds,chi_sqr_keys]

Unnamed: 0,epoch,step,chi_1a,chi_1b,chi_1c,chi_1,chi_2,chi_imgvar,chi_spec1,chi_spec2
335,0,335,8.501149,10678.354143,0.009205,10686.864497,139.695985,45.541814,13.488987,2128461.0
0,0,0,188.919404,4.75252,0.013036,193.68496,12.402488,26.13253,17729.620094,20681580.0
23,0,23,427.92509,4040.671493,0.007937,4468.60452,84.483457,126.359903,771.070322,18682810.0
0,0,0,188.919404,4.75252,0.013036,193.68496,12.402488,26.13253,17729.620094,20681580.0
0,0,0,188.919404,4.75252,0.013036,193.68496,12.402488,26.13253,17729.620094,20681580.0
100634,28,100634,15.846818,14438.491124,0.013186,14454.351129,209.857797,4.826868,5.446888,81710820.0
17065,4,17065,21.642005,13249.915085,0.013186,13271.570277,189.563377,12.592422,4.795904,6111066.0
145,0,145,15.710029,9412.134922,0.494352,9428.339304,125.418594,94.308695,13.577529,626214.1


In [15]:
def f_get_best_chisqr_models(df,cutoff=0.2):
    '''
    Pick the models whose chi-squares are low in every category at once.

    Arguments:
        df: dataframe with (at least) the numeric columns 'chi_1' and 'chi_spec1'.
        cutoff: quantile in [0,1] used as threshold for each column
                (default 0.2, i.e. the lowest 20 percent).
    Returns:
        Dataframe with the rows of df for which both 'chi_1' and 'chi_spec1'
        are strictly below their own cutoff-quantile. All columns are kept.
    '''
    chi_sqr_keys=['chi_1','chi_spec1']
    # Take the quantile of the chi-square columns only: the full table also holds
    # string and list columns, for which a quantile is undefined.
    thresholds=df[chi_sqr_keys].quantile(q=cutoff)

    # Row-wise: True only when every chi-square column is below its threshold.
    below_all=(df[chi_sqr_keys] < thresholds).all(axis=1)

    return df[below_all]

In [16]:
# Keep the rows that pass the 0.5-quantile cutoff, then report how many remain.
df_sliced=f_get_best_chisqr_models(df,cutoff=0.5)
print(df_sliced.shape)

(31709, 20)


In [17]:
# Five rows of the selection with the smallest chi_1 (ties broken by chi_spec1).
df_sliced.sort_values(by=['chi_1','chi_spec1'])[['epoch','step','chi_1','chi_spec1']].head(5)

Unnamed: 0,epoch,step,chi_1,chi_spec1
1233,0,1233,8734.705439,6.142266
1646,0,1646,9100.211169,6.057698
3611,1,3611,9126.174529,5.162732
1519,0,1519,9255.177282,6.009847
2299,0,2299,9293.971427,5.781311


In [18]:
### Plot chi-sqr values
# Plot three chi-square columns of the selection against step.
# NOTE(review): both style='.' and marker='*' are passed — confirm which marker is intended.
df_sliced.plot(x="step", y=["chi_1", "chi_imgvar", "chi_spec1"],style='.',marker='*')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<matplotlib.axes._subplots.AxesSubplot at 0x2aaadf379048>

### High Pixel images

In [19]:
### Plot number of very high pixel images
plt.figure()
# Restrict to the rows of type 'train_gen' once and reuse the selection for both axes.
df_train_gen=df[df.img_type=='train_gen']
plt.plot(df_train_gen.step,df_train_gen.num_vlarge,linestyle='',marker='*')
plt.xlabel('Steps in Epochs')
plt.ylabel('Number of very large pixel images from a batch of images')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Text(0, 0.5, 'Number of very large pixel images from a batch of images')

In [20]:
# df[df.num_vlarge>10].tail()

## Compare samples

In [21]:
# df_sliced.sort_values(by=['chi_spec1','chi_1']).head(5)
# df_sliced.sort_values(by=['chi_1','chi_spec1']).head(5)

In [28]:
# Two best rows when ranking by chi_1 first, then two best when ranking by chi_spec1 first.
ranked_by_chi1=df_sliced.sort_values(by=['chi_1','chi_spec1'])
ranked_by_spec=df_sliced.sort_values(by=['chi_spec1','chi_1'])

# Flat list of the four row labels (a label may appear twice).
best_idx=list(itertools.chain(ranked_by_chi1.head(2).index,ranked_by_spec.head(2).index))
print(best_idx)

[1233, 1646, 17065, 19957]


In [32]:
# Hand-picked row labels to inspect.
# NOTE(review): the first assignment is overwritten by the next line and has no effect.
best_idx=np.arange(40130,40135).astype(int)
best_idx=[100634,17065]
df_best=df.loc[best_idx]
# (epoch, step) of every selected row.
print([(df_best.loc[idx].epoch,df_best.loc[idx].step) for idx in best_idx])
df_best

[(28, 100634), (4, 17065)]


Unnamed: 0,epoch,step,img_type,fname,chi_1a,chi_1b,chi_1c,chi_1,chi_2,chi_imgvar,chi_spec1,chi_spec2,num_large,num_vlarge,hist_val,hist_err,hist_bin_centers,spec_val,spec_err,label
100634,28,100634,train_gen,/global/cscratch1/sd/vpa/proj/cosmogan/results...,15.846818,14438.491124,0.013186,14454.351129,209.857797,4.826868,5.446888,81710820.0,0,0,"[0.022468596754808362, 0.012468516845836183, 0...","[1.8628202344514676e-06, 4.106824642190773e-06...","[-1.031746031584782, -0.6161616169043975, -0.3...","[66643.16493225098, 46131.23129272461, 33550.9...","[11.868603777924712, 6.807562746768907, 4.8901...",28-100634
17065,4,17065,train_gen,/global/cscratch1/sd/vpa/proj/cosmogan/results...,21.642005,13249.915085,0.013186,13271.570277,189.563377,12.592422,4.795904,6111066.0,0,0,"[0.02097970252680108, 0.015041291736884257, 0....","[3.448953438735106e-05, 3.8291704352752016e-05...","[-1.031746031584782, -0.6161616169043975, -0.3...","[69439.77669525146, 46262.48702697754, 33519.4...","[40.96777767704382, 18.74720983984627, 15.8040...",4-17065


In [29]:
# Rows of the selection for the labels in best_idx, with their (epoch, step) pairs.
df_best=df_sliced.loc[best_idx]
print([(df_best.loc[idx].epoch,df_best.loc[idx].step) for idx in best_idx])
df_best

[(0, 1233), (0, 1646), (4, 17065), (5, 19957)]


Unnamed: 0,epoch,step,img_type,fname,chi_1a,chi_1b,chi_1c,chi_1,chi_2,chi_imgvar,chi_spec1,chi_spec2,num_large,num_vlarge,hist_val,hist_err,hist_bin_centers,spec_val,spec_err,label
1233,0,1233,train_gen,/global/cscratch1/sd/vpa/proj/cosmogan/results...,20.326478,8714.365775,0.013186,8734.705439,120.674105,34.142874,6.142266,2890788.0,0,0,"[0.017938077496067427, 0.018151044873541, 0.01...","[3.616104973333107e-05, 6.656473307701632e-05,...","[-1.031746031584782, -0.6161616169043975, -0.3...","[80042.68093109131, 48069.597332763675, 33042....","[117.79682520485683, 74.2415925352025, 53.6527...",0-1233
1646,0,1646,train_gen,/global/cscratch1/sd/vpa/proj/cosmogan/results...,17.177538,9083.020446,0.013186,9100.211169,126.228545,32.426835,6.057698,3572045.0,0,0,"[0.019064605285228034, 0.017389833953941926, 0...","[3.0876219184607585e-05, 6.254894035937549e-05...","[-1.031746031584782, -0.6161616169043975, -0.3...","[84637.69305419922, 49571.25069732667, 34465.5...","[157.9988083042326, 78.38822487065389, 45.0606...",0-1646
17065,4,17065,train_gen,/global/cscratch1/sd/vpa/proj/cosmogan/results...,21.642005,13249.915085,0.013186,13271.570277,189.563377,12.592422,4.795904,6111066.0,0,0,"[0.02097970252680108, 0.015041291736884257, 0....","[3.448953438735106e-05, 3.8291704352752016e-05...","[-1.031746031584782, -0.6161616169043975, -0.3...","[69439.77669525146, 46262.48702697754, 33519.4...","[40.96777767704382, 18.74720983984627, 15.8040...",4-17065
19957,5,19957,train_gen,/global/cscratch1/sd/vpa/proj/cosmogan/results...,22.388823,13133.668428,0.013186,13156.070437,189.519817,11.126015,4.931669,8726183.0,0,0,"[0.021430313642465323, 0.013094007989072644, 0...","[1.7455880401643642e-05, 5.411747214082419e-05...","[-1.031746031584782, -0.6161616169043975, -0.3...","[68311.12114715576, 46774.15375823976, 33732.7...","[41.317014255248324, 19.035190457245605, 13.69...",5-19957


## Plot pixel intensity and spectrum

In [30]:
def f_plot_hist_spec_combined(df,dict_bkg):
    '''
    Plot pixel-intensity histograms and spectra of several dataframe rows
    together with a reference (background) sample.

    Arguments:
        df: dataframe whose rows provide hist_bin_centers, hist_val, hist_err,
            spec_val, spec_err, epoch and step.
        dict_bkg: dictionary with the keys 'hist_bin_centers', 'hist_val',
            'hist_err', 'spec_val' and 'spec_err' for the reference sample.
    Creates a new 6x6 figure with two subplots: left the histograms (bin
    centers passed through f_invtransform), right the spectra against the
    array position. Returns nothing.
    The arithmetic on spec_val/spec_err assumes numpy arrays — TODO confirm.
    '''

    fig=plt.figure(figsize=(6,6))
    ax1=fig.add_subplot(121)
    ax2=fig.add_subplot(122)
    # One marker per row; the marker string is cycled when there are more rows than markers.
    for (i,row),marker in zip(df.iterrows(),itertools.cycle('>^*sDHPdpx_')):

        x1=row.hist_bin_centers
        y1=row.hist_val
        yerr1=row.hist_err
        x1=f_invtransform(x1)

        y2=row.spec_val
        yerr2=row.spec_err
        x2=np.arange(len(y2))

        # Legend entry: <row label>_<epoch>_<step>
        label='{0}_{1}_{2}'.format(i,row.epoch,row.step)
        ax1.errorbar(x1,y1,yerr1,marker=marker,markersize=5,linestyle='',label=label)
    #     ax2.errorbar(x2,y2,yerr2,marker=marker,markersize=5,linestyle='',label='{0}-{1}'.format(epoch,step))

        # Spectrum as a dotted line with a shaded +/- error band.
        ax2.fill_between(x2, y2 - yerr2, y2 + yerr2, alpha=0.4)
        ax2.plot(x2, y2, marker=marker, linestyle=':',label=label)

    ### Plot input data
    x,y,yerr=dict_bkg['hist_bin_centers'],dict_bkg['hist_val'],dict_bkg['hist_err']
    x=f_invtransform(x)
    ax1.errorbar(x, y,yerr,color='k',linestyle='-',label='bkgnd')   

    # Reference spectrum is drawn as a black error band only (no line, no label).
    y,yerr=dict_bkg['spec_val'],dict_bkg['spec_err']
    x=np.arange(len(y))
    ax2.fill_between(x, y - yerr, y + yerr, color='k',alpha=0.8)

    # NOTE(review): plt.legend() acts on the current axes only (presumably ax2) — confirm ax1 needs none.
    plt.legend()
    # plt.yscale('log')
    # NOTE(review): newer Matplotlib releases name this keyword 'linthresh' and reject 'linthreshx' — confirm against the installed version.
    ax1.set_xscale('symlog',linthreshx=50)
    ax1.set_yscale('log')
    ax2.set_yscale('log')
    
    
    
def f_compute_hist_spect(sample,bins):
    '''
    Pixel-intensity histogram and spectrum of a set of images.
    Input : image array and histogram bins
    Output: dictionary with the keys hist_val, hist_err, hist_bin_centers
            (from f_batch_histogram) and spec_val, spec_err (from f_compute_spectrum).
    '''
    ### Normalised histogram over the sample
    hist_val,hist_err,hist_centers=f_batch_histogram(sample,bins=bins,norm=True,hist_range=None)
    ### Spectrum over the sample, without plotting
    spec_val,spec_err=f_compute_spectrum(sample,plot=False)

    return {
        'hist_val':hist_val,
        'hist_err':hist_err,
        'hist_bin_centers':hist_centers,
        'spec_val':spec_val,
        'spec_err':spec_err,
    }


### Extract validation data
fname='/global/cfs/cdirs/m3363/vayyar/cosmogan_data/raw_data/128_square/dataset_2_smoothing_200k/norm_1_train_val.npy'
# Memory-map the file, keep index 0 along axis 1 (presumably the channel axis) and
# use entries 200-299 of the first 4000, i.e. 100 images.
s_val=np.load(fname,mmap_mode='r')[:4000][:,0,:,:][200:300]
print(s_val.shape)

# Unit-width bins up to 20.5, width 5 up to 100.5, width 50 up to 1000.5, plus one last edge at 2000.
bins=np.concatenate([np.array([-0.5]),np.arange(0.5,20.5,1),np.arange(20.5,100.5,5),np.arange(100.5,1000.5,50),np.array([2000])]) #bin edges to use
bins=f_transform(bins)   ### scale to (-1,1) 
### Compute histogram and spectrum of raw data 
dict_val=f_compute_hist_spect(s_val,bins)

(100, 128, 128)


In [33]:
# Compare the selected rows with the validation sample.
f_plot_hist_spec_combined(df_best,dict_val)


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [None]:
# chi_1 and chi_spec1 as lines for the steps strictly between 40100 and 40150.
df[(df.step>40100)& (df.step<40150)].plot(kind='line',x='step',y=['chi_1','chi_spec1'])

## View image block

In [None]:
def f_get_img(df,epoch,step):
    '''
    Load the images stored for a given epoch and step.
    Takes the first dataframe row matching both epoch and step, loads the
    .npy file named in its fname column and returns index 0 along axis 1.
    Raises IndexError when no row matches.
    '''
    is_match=(df.epoch==epoch)&(df.step==step)
    first_file=df[is_match].fname.values[0]
    return np.load(first_file)[:,0,:,:]

# Images stored for epoch 11, step 40129.
img_arr=f_get_img(df,11,40129)

In [None]:
# Pass the first 18 loaded images to f_plot_grid (6 columns).
f_plot_grid(img_arr[:18],cols=6,fig_size=(10,6))

In [None]:
# s_val holds 100 images (it is built from a [200:300] slice), so indices must stay
# below 100; show the first 18, the same count as the generated-image grid.
f_plot_grid(s_val[:18],cols=6,fig_size=(10,6))

In [None]:
# df_sliced[df_sliced.epoch==4]['step'].values

In [None]:
# Chi-squares and very-large-pixel counts for epoch 18, steps 15975 to 15983 (inclusive).
# NOTE(review): in the rows shown earlier step keeps growing across epochs
# (e.g. step 17065 at epoch 4), so this epoch/step combination may select nothing — confirm.
df[(df.epoch==18)&(df.step>=15975)&(df.step<=15983)][['epoch','step','chi_1','chi_spec1','num_vlarge']]