# Extract data from output files
### Analyze the output from a single LBANN run
March 9, 2020 \
April 6, 2020: Major edit to store files in order of epochs \
April 21, 2020: Major edit, added jupyter widgets to compare pixel intensity plots \
May 8, 2020: Major edit, using all images for a given batch

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import subprocess as sp
import os
import glob
import sys

import time
from scipy import fftpack
# from ipywidgets import interact, interact_manual,fixed, SelectMultiple, IntText, IntSlider, FloatSlider,SelectionSlider,BoundedIntText
from ipywidgets import *

In [2]:
%matplotlib widget

In [3]:
sys.path.append('/global/u1/v/vpa/project/jpt_notebooks/Cosmology/Cosmo_GAN/LBANN/lbann_cosmogan/3_analysis/')
from modules_image_analysis import *

[NbConvertApp] Converting notebook modules_image_analysis.ipynb to script
[NbConvertApp] Writing 14928 bytes to modules_image_analysis.py


In [4]:
### Transformation functions for image pixel values
def f_transform(x):
    '''
    Map pixel values x to s = 2*x/(x + 4) - 1.
    The 1e-8 offset keeps the denominator non-zero when x == -4.
    Works on scalars and on numpy arrays (element-wise).
    '''
    numerator = 2. * x
    denominator = x + 4. + 1e-8
    return numerator / denominator - 1.

def f_invtransform(s):
    '''
    Map transformed values s back to x = 4*(1 + s)/(1 - s).
    The 1e-8 offset keeps the denominator non-zero when s == 1.
    Works on scalars and on numpy arrays (element-wise).
    '''
    numerator = 4. * (1. + s)
    denominator = 1. - s + 1e-8
    return numerator / denominator

### Modules for Extraction

In [5]:
def f_get_files_df_sorted():
    '''
    Module to create Dataframe with filenames for each epoch and step
    Sorts by image type, epoch and step

    Reads the module-level variable main_dir (directory prefix, expected to end with '/')
    and globs it for the four kinds of dumped .npy files.

    Returns a DataFrame with columns epoch, fname, img_type, step (one row per file).
    epoch and step are integers parsed from the text following the last 'epoch' / 'step'
    in the file path, up to the next '-'.
    If no file matches, an empty DataFrame with the same columns is returned.
    '''

    keys=['train_gen','train_input','val_gen','val_input']
    file_strg_lst=['model0-training*-gen_img*-output0.npy','model0-training*-inp_img*-output0.npy','model0-validation*-gen_img*-output0.npy','model0-validation*-inp_img*-output0.npy']

    ## Get the list of .npy files for each image type in the dump_outs folder
    files_dict={}
    for key,file_strg in zip(keys,file_strg_lst):
        files_dict[key]=glob.glob(main_dir+file_strg)
        if len(files_dict[key])>1000 :
            print('Warning the number of files is very large. Possibility of memory overload')

    t1=time.time()
    ### Collect one record per file, then build the DataFrame in a single step
    ### (appending to a DataFrame row by row is quadratic and DataFrame.append no longer exists in pandas 2)
    rows=[]
    for key in keys:
        files_arr=files_dict[key]  # List of files for this image type
        print(key,len(files_arr))
        for fname in files_arr:
            ### Extract the Epoch number and step number from the file name
            rows.append({'img_type':key,
                         'epoch':int(fname.split('epoch')[-1].split('-')[0]),
                         'step':int(fname.split('step')[-1].split('-')[0]),
                         'fname':fname})

    ### Explicit columns so that an empty file list still gives a sortable DataFrame
    df_files=pd.DataFrame(rows,columns=['epoch','fname','img_type','step'])
    ## Sort values
    df_files=df_files.sort_values(by=['img_type','epoch','step']).reset_index(drop=True)
    t2=time.time()
    print("Time for Sorting",t2-t1)

    return df_files


def f_filter_epoch(df_input,num_sliced=1):
    '''
    Get just the last few stored step images for each epoch

    df_input : DataFrame with at least the columns img_type and epoch.
    num_sliced : number of trailing rows kept for each (img_type, epoch) pair.
                 Should be >= 1: with 0 the slice [-0:] selects every row.

    "Last" refers to the row order of df_input, so the input is expected to be
    sorted by step within each epoch already.
    Only the image types train_gen, train_input, val_gen, val_input are kept.
    Returns a new DataFrame with a fresh 0..n-1 index; if nothing is selected,
    an empty DataFrame with the columns of df_input is returned.
    '''
    print('Extracting last %s steps of each epoch'%(num_sliced))
    pieces=[]
    for key in ['train_gen','train_input','val_gen','val_input']:
        ### For each type of images, get list of epochs
        df1=df_input[df_input.img_type==key]
        epochs=np.unique(df1.epoch.values).astype(int)

        for epoch in epochs:### Extract the last few steps in each epoch
            df2=df1[df1.epoch==epoch]
            pieces.append(df2.iloc[-num_sliced:])

    ### pd.concat raises on an empty list, so handle the nothing-selected case explicitly
    if not pieces:
        return df_input.iloc[0:0].reset_index(drop=True)

    ### Single concatenation instead of growing a DataFrame inside the loop
    return pd.concat(pieces).reset_index(drop=True)


def f_get_images_df(df_files):
    '''
    Read dataframe with file names, read files and create new dataframe with images as numpy arrays

    df_files : DataFrame with the columns fname (path of a .npy file) and img_type.
    Returns a copy of df_files with an extra column images; each entry is the
    3D array produced by f_row for that file. df_files itself is not modified.
    '''


    def f_row(df_row):
        '''
        Load the .npy file of one row and return its images as a 3D array.
        img_type ending with 'input': the last axis is reshaped to (size,size), with size=sqrt(length of last axis)
        img_type ending with 'gen'  : index 0 of the second axis is taken from a 4D array
        Any other img_type raises SystemError.
        '''
        fname,key=df_row.fname,df_row.img_type
        a1=np.load(fname)
        if key.endswith('input'):
            ### np.int was removed from numpy (1.24), the builtin int does the same truncation
            size=int(np.sqrt(a1.shape[-1])) ### Extract size of images (=128)
            batch_size=a1.shape[0] ### Length of first axis (presumably images per file)
            samples=a1.reshape(batch_size,size,size)
        elif key.endswith('gen') : samples=a1[:,0,:,:]
        else : raise SystemError('Unknown image type %s for file %s'%(key,fname))

        return samples

    t1=time.time()
    ##### Create new Dataframe with sorted images
    df=df_files.copy()
    ### Fill an object array element by element so that each cell holds one full image array,
    ### independent of how pandas would interpret a list of equally shaped arrays
    images=np.empty(len(df),dtype=object)
    for count,row in enumerate(df.itertuples(index=False)):
        images[count]=f_row(row)
    df['images']=images
    t2=time.time()

    print("Time for Reading images",t2-t1)

    return df
    

def f_get_samples(df,img_type,start_epoch=0,end_epoch=None):
    '''
    Module to extract images for a range of epochs given a dataframe

    df : DataFrame with the columns epoch, img_type and images (one array per row)
    img_type : value of the img_type column to select
    start_epoch, end_epoch : epoch range, both ends inclusive.
                             If end_epoch is None, start_epoch+1 is used (i.e. two epochs).

    Returns the selected image arrays stacked along the first axis with np.vstack.
    Raises ValueError if no row matches the selection.
    '''
    if end_epoch is None: end_epoch=start_epoch+1

    mask=(df.epoch>=start_epoch) & (df.epoch<=end_epoch) & (df.img_type==img_type)
    arr=df[mask].images.values
    ### np.vstack would fail on an empty selection with an unhelpful message
    if len(arr)==0:
        raise ValueError('No images found for img_type=%s in epochs %s-%s'%(img_type,start_epoch,end_epoch))

    return np.vstack(list(arr))


## Extract image data 

In [6]:
### Folder names of the available runs.
### Each assignment overwrites the previous one, so only the last line takes effect.
fldr_name='20200413_095840_exagan'
fldr_name='20200423_122631_exagan_modified_paddding'
fldr_name='20200424_083456_exagan_modified_padding_2'
fldr_name='20200506_121613_exagan_200k_samples'
fldr_name='20200513_121910_peters_dataset'


### dump_outs directory of the selected run. Read as a global by f_get_files_df_sorted.
main_dir='/global/cfs/cdirs/m3363/vayyar/cosmogan_data/results_data/{0}/dump_outs/'.format(fldr_name)
print(main_dir)


/global/cfs/cdirs/m3363/vayyar/cosmogan_data/results_data/20200513_121910_peters_dataset/dump_outs/


In [7]:

### Get dataframe with file names, sorted by image type, epoch and step
### (f_get_files_df_sorted reads the global main_dir)
df_files=f_get_files_df_sorted()

### Keep only the last stored step (num_sliced=1) of each epoch, for every image type.
df_files=f_filter_epoch(df_files,num_sliced=1)

#############################################################
### Load the .npy file of every remaining row and store the array in a new column 'images'
df=f_get_images_df(df_files)
print(df.shape)

# ### Filter to keep just one step per epoch
# df=f_filter_epoch(df,1)

train_gen 902
train_input 902
val_gen 226
val_input 226
Time for Sorting 6.468547582626343
Extracting last 1 steps of each epoch
Time for Reading images 11.022383213043213
(240, 5)


In [8]:
df_files.head(10)

Unnamed: 0,epoch,fname,img_type,step
0,0.0,/global/cfs/cdirs/m3363/vayyar/cosmogan_data/r...,train_gen,1230.0
1,1.0,/global/cfs/cdirs/m3363/vayyar/cosmogan_data/r...,train_gen,2460.0
2,2.0,/global/cfs/cdirs/m3363/vayyar/cosmogan_data/r...,train_gen,3690.0
3,3.0,/global/cfs/cdirs/m3363/vayyar/cosmogan_data/r...,train_gen,4920.0
4,4.0,/global/cfs/cdirs/m3363/vayyar/cosmogan_data/r...,train_gen,6150.0
5,5.0,/global/cfs/cdirs/m3363/vayyar/cosmogan_data/r...,train_gen,7380.0
6,6.0,/global/cfs/cdirs/m3363/vayyar/cosmogan_data/r...,train_gen,8610.0
7,7.0,/global/cfs/cdirs/m3363/vayyar/cosmogan_data/r...,train_gen,9840.0
8,8.0,/global/cfs/cdirs/m3363/vayyar/cosmogan_data/r...,train_gen,11070.0
9,9.0,/global/cfs/cdirs/m3363/vayyar/cosmogan_data/r...,train_gen,12300.0


In [9]:
### Report every row whose image array does not hold exactly 128 images
for row_idx,row in df.iterrows():
    if row.images.shape[0]==128:
        continue
    print(row_idx,row.epoch,row.step,row.img_type,row.images.shape)


## Explore epochs

In [64]:
### One entry per row of df with img_type 'train_gen'; each entry is that row's image array
samples=df[(df.img_type=='train_gen')].images.values

## Find the region without very high pixel values


In [65]:
### For every entry of samples, count the images whose maximum pixel value exceeds the cutoff
cutoff=0.9966
num_large=np.zeros(samples.shape)

for count,ee in enumerate(samples):
    img_max=np.amax(ee,axis=(1,2)) # maximum of each image
    num_large[count]=np.count_nonzero(img_max>cutoff)

### Split the entries into those without and those with high pixel images
zero_idx=np.flatnonzero(num_large==0.0)
other_idx=np.flatnonzero(num_large>0.0)

print("Epochs with no high pixel images",zero_idx)

### Red stars: no image above the cutoff, blue diamonds: at least one
plt.figure()
plt.plot(zero_idx,num_large[zero_idx],linestyle='',marker='*',color='r')
plt.plot(other_idx,num_large[other_idx],linestyle='',marker='D',color='b')

Epochs with no high pixel images [ 0  1  2  3  4  6  7  9 10 11 12 13 14 15 16 17 19 20 21 22 23 24 25 26
 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51
 52 53 54 55 56 57 58 59]


  from ipykernel import kernelapp as app


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

[<matplotlib.lines.Line2D at 0x2aab22dcd240>]

In [66]:
### Plot the maximum across samples for each epoch

def f_plot_max_values(samples,cutoff=0.994):
    '''
    Plot the maximum pixel value of every entry of samples against its index.
    Entries below cutoff are drawn as red stars, entries at or above cutoff as blue diamonds.
    A dashed horizontal line marks the cutoff and the y-axis is fixed to (0.9,1.0).
    '''
    ### Maximum over all pixels of each entry
    peak_vals=np.array([np.max(entry) for entry in samples])
    below_idx=np.flatnonzero(peak_vals<cutoff)
    above_idx=np.flatnonzero(peak_vals>=cutoff)

    plt.figure()
    for idx,marker,color in ((below_idx,'*','r'),(above_idx,'D','b')):
        plt.plot(idx,peak_vals[idx],linestyle='',marker=marker,color=color)

    plt.axhline(y=cutoff,linestyle='--',color='k')
    plt.ylim(0.9,1.0)

f_plot_max_values(samples,0.9966)  ### The full dataset has a max value of just over 0.9966


  


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [67]:

# samples=f_invtransform(f_get_samples(df,'val_gen',epoch,epoch+1))
# ### Recomputing validation histogram to match bins with generated image
# samples_input=f_invtransform(f_get_samples(df,'train_input',0,60))
                       
# samples.shape,samples_input.shape

In [25]:
### Computing the chi-sqr, including high pixel values. 
# This takes time as both histograms are recomputed for each epoch,
# because the histogram range depends on the maximum of the generated images

num_epochs=60
chi_sqr=np.zeros(num_epochs)
chi_sqr2=np.zeros(num_epochs)

### The input samples do not depend on the epoch: load and rescale them only once
samples_input=f_invtransform(f_get_samples(df,'train_input',0,60))
max_input=np.max(samples_input)

for epoch in range(0,num_epochs):
    print(epoch)
    ### NOTE(review): f_get_samples is inclusive at both ends, so this selects epochs epoch and epoch+1 - confirm this is intended
    samples=f_invtransform(f_get_samples(df,'val_gen',epoch,epoch+1))
    ### Common histogram range covering both the generated and the input images
    max_val=np.max([np.max(samples),max_input])
    ### Recomputing input histogram to match bins with generated image
    val_hist,val_err=f_pixel_intensity(samples_input,plot=False,normalize=False,bins=100,hist_range=(0,max_val))
    val_dr=val_hist.copy()
    val_dr[val_dr<=0.]=1.0    ### Avoiding division by zero for zero bins
    
    gen_hist,gen_err=f_pixel_intensity(samples,plot=False,normalize=False,bins=100,hist_range=(0,max_val))
    sq_diff=(gen_hist-val_hist)**2
    chi_sqr[epoch]=np.sum(np.divide(sq_diff,val_dr)) ###  sum((Obs-Val)^2/(Val))
    ### NOTE(review): this subtracts val_hist instead of dividing by it - confirm the intended definition
    chi_sqr2[epoch]=np.sum(sq_diff-val_hist)


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59


In [23]:
### Computing the chi-sqr, ignoring high pixel values
# the histogram range is determined by the input maximum value only,
# so the input histogram is computed once, outside the loop

num_epochs=60
chi_sqr=np.zeros(num_epochs)
chi_sqr2=np.zeros(num_epochs)

samples_input=f_invtransform(f_get_samples(df,'train_input',0,60))
max_val=np.max(samples_input)
val_hist,val_err=f_pixel_intensity(samples_input,plot=False,normalize=False,bins=100,hist_range=(0,max_val))
val_dr=val_hist.copy()
val_dr[val_dr<=0.]=1.0    ### Avoiding division by zero for zero bins

for epoch in range(0,num_epochs):
    ### NOTE(review): f_get_samples is inclusive at both ends, so this selects epochs epoch and epoch+1 - confirm this is intended
    samples=f_invtransform(f_get_samples(df,'val_gen',epoch,epoch+1))
    ### Histogram of the generated images, with the same bins and hist_range arguments as the input histogram
    gen_hist,gen_err=f_pixel_intensity(samples,plot=False,normalize=False,bins=100,hist_range=(0,max_val))
    sq_diff=(gen_hist-val_hist)**2
    chi_sqr[epoch]=np.sum(np.divide(sq_diff,val_dr)) ###  sum((Obs-Val)^2/(Val))
    ### NOTE(review): this subtracts val_hist instead of dividing by it - confirm the intended definition
    chi_sqr2[epoch]=np.sum(sq_diff-val_hist)

In [24]:
### Plot the chi-sqr value of each epoch (array index = epoch)
plt.figure()
plt.plot(chi_sqr,marker='o')
# plt.plot(chi_sqr2,marker='*')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

[<matplotlib.lines.Line2D at 0x2aaade1ad7b8>]

In [26]:
### Plot the chi-sqr value of each epoch (array index = epoch)
plt.figure()
plt.plot(chi_sqr,marker='o')
# plt.plot(chi_sqr2,marker='*')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

[<matplotlib.lines.Line2D at 0x2aaade34ada0>]

## Explore image samples

In [28]:
# f_pixel_intensity(samples2,'s2',normalize=True,mode='simple',bins=50)
# f_compare_pixel_intensity([samples2[20:60],samples4,['s2','s4'],normalize=normalize,log_scale=log_scale, mode=mode,bins=bins)
# f_compute_spectrum(samples2)
# f_compare_spectrum([samples2[20:60],samples4],['s2','s4'])

In [29]:
def f_widget_individual(df,img_type='val_gen',idx_range=(0,50),Fig_type='pixel',normalize=True,log_scale=True,rescale=True,mode='avg'):
    '''
    Module to plot pixel intensity or power spectrum for a given sample set of images
    Options for normalization, log-scale, and rescale
    Rescale converts image pixel values from (-1,1) to the original pixel range
    2 Fig_type: pixel-> pixel intensity and spectrum -> power spectrum

    df : DataFrame with the columns epoch, img_type and images
    img_type : image type to select from df
    idx_range : (start,end) epoch range passed to f_get_samples (both ends inclusive)
    normalize, mode : passed on to f_pixel_intensity (used only for Fig_type='pixel')
    log_scale : passed on to the plotting function
    If the requested range contains no images, all epochs present in df are used instead.
    If that also fails, the error is printed and nothing is plotted.
    Any other Fig_type value plots nothing.
    '''


    start,end=idx_range[0],idx_range[1]
    print('Index Range %s - %s'%(start,end))
    try :
        sliced_arr=f_get_samples(df,img_type=img_type,start_epoch=start,end_epoch=end)
    except ValueError:
        ### f_get_samples raises ValueError when the selection is empty (np.vstack of nothing)
        print('Input indices %s %s are invalid.\nUsing full array'%(start,end))
        start,end=0,'end'
        try :
            sliced_arr=f_get_samples(df,img_type=img_type,start_epoch=df.epoch.min(),end_epoch=df.epoch.max())
        except ValueError as e:
            ### Nothing to plot for this image type; stop here instead of failing later on an undefined array
            print(e)
            return
    if rescale: ### Converting from pixel intensity range (-1,1) to original range
        sliced_arr=f_invtransform(sliced_arr)
    print('Array size used',sliced_arr.shape)

    plot_label=img_type+': {0}-{1}'.format(str(start),str(end))
    if Fig_type=='pixel':
        f_pixel_intensity(sliced_arr,label=plot_label,normalize=normalize,log_scale=log_scale,mode=mode)
    elif Fig_type=='spectrum':
        f_compute_spectrum(sliced_arr,label=plot_label,log_scale=log_scale)


In [30]:
### Manually triggered widget around f_widget_individual.
### df and img_type are fixed; Fig_type, mode and the epoch range (idx_range) are interactive controls.
interact_manual(f_widget_individual,df=fixed(df),img_type=fixed('val_gen'),
                Fig_type=ToggleButtons(options=['pixel','spectrum']),mode=['avg','simple'],
                idx_range=IntRangeSlider(value=(0,60),min=0,max=80,step=1),)

interactive(children=(IntRangeSlider(value=(0, 60), description='idx_range', max=80), ToggleButtons(descriptio…

<function __main__.f_widget_individual(df, img_type='val_gen', idx_range=(0, 50), Fig_type='pixel', normalize=True, log_scale=True, rescale=True, mode='avg')>

## Compare samples

In [31]:
def f_widget_compare(sample_names,sample_dict,Fig_type='pixel',rescale=True,log_scale=True,bins=25,mode='avg',normalize=True):
    '''
    Module to make widget plots for pixel intensity or spectrum comparison for multiple sample sets

    sample_names : keys of sample_dict to compare; also used as plot labels
    sample_dict : dictionary mapping a name to its image array
    Fig_type : 'pixel' -> f_compare_pixel_intensity, 'spectrum' -> f_compare_spectrum
    rescale : if True, f_invtransform is applied to every selected array (sample_dict is not modified)
    normalize, mode, bins : passed on to f_compare_pixel_intensity (used only for Fig_type='pixel')
    log_scale : passed on to the plotting function
    Raises AssertionError for any other Fig_type.
    '''
    ### Validate first, and report the value that is actually being checked
    assert Fig_type in ['pixel','spectrum'],"Invalid Fig_type %s"%(Fig_type)

    label_list=list(sample_names)
    img_list=[sample_dict[key] for key in label_list]

    if rescale:
        img_list=[f_invtransform(img) for img in img_list]

    if Fig_type=='pixel':
        f_compare_pixel_intensity(img_lst=img_list,label_lst=label_list,normalize=normalize,log_scale=log_scale, mode=mode,bins=bins)
    elif Fig_type=='spectrum':
        f_compare_spectrum(img_lst=img_list,label_lst=label_list,log_scale=log_scale)


#### Compare different epochs

In [63]:
# img_list,labels_list=f_get_sample_epochs(df,'val_gen',10)

### Collect generated validation images for a few epoch ranges, labelled 'start:end'
img_list,labels_list=[],[]
dict_samples={}
for start,end in [(0,4),(17,20),(25,27),(34,37),(44,51),(53,59)]:
    label='%s:%s'%(str(start),str(end))
    images=f_get_samples(df,'val_gen',start,end)
    labels_list.append(label)
    img_list.append(images)
    dict_samples[label]=images

### Compare with input
dict_samples['val input']=f_get_samples(df,img_type='val_input')
interact_manual(f_widget_compare,sample_dict=fixed(dict_samples),
                sample_names=SelectMultiple(options=dict_samples.keys()),
                Fig_type=ToggleButtons(options=['pixel','spectrum']),bins=IntText(value=50),mode=['avg','simple'])

interactive(children=(SelectMultiple(description='sample_names', options=('0:4', '17:20', '25:27', '34:37', '4…

<function __main__.f_widget_compare(sample_names, sample_dict, Fig_type='pixel', rescale=True, log_scale=True, bins=25, mode='avg', normalize=True)>

#### Compare image types

In [49]:
### Available options : keys=['train_gen','train_input','val_gen','val_input']
### Epoch ranges are inclusive at both ends (see f_get_samples)
start,end=35,46
samples1=f_get_samples(df,'val_gen',start,end)
samples2=f_get_samples(df,'val_input',0,60)
samples3=f_get_samples(df,'train_gen',start,end)
### No range given: f_get_samples defaults to start_epoch=0 and end_epoch=start_epoch+1
samples4=f_get_samples(df,'train_input')

### Largest pixel value among the selected generated validation images
print(np.max(samples1))

0.99780107


In [50]:
### Manually triggered comparison widget; the keys of dict_samples are the selectable sample names
dict_samples={'s1':samples1, 's2': samples2, 's3': samples3, 's4':samples4}
interact_manual(f_widget_compare,sample_dict=fixed(dict_samples),
                sample_names=SelectMultiple(options=dict_samples.keys()),
                Fig_type=ToggleButtons(options=['pixel','spectrum']),bins=IntText(value=50),mode=['avg','simple'])

interactive(children=(SelectMultiple(description='sample_names', options=('s1', 's2', 's3', 's4'), value=()), …

<function __main__.f_widget_compare(sample_names, sample_dict, Fig_type='pixel', rescale=True, log_scale=True, bins=25, mode='avg', normalize=True)>

## Compare different sample sets

#### Compare lbann images with input and keras code images

In [58]:
### Load images from keras code
# img_keras='/global/cfs/cdirs/dasrepo/vpa/cosmogan/data/computed_data/exagan1/run_100k_samples_35epochs/models/gen_imgs.npy'
# img_keras='/global/cfs/cdirs/dasrepo/vpa/cosmogan/data/computed_data/exagan1/run_200k_samples_24epochs/models/gen_imgs.npy'
img_keras='/global/cfs/cdirs/m3363/vayyar/cosmogan_data/results_from_other_code/exagan1/run_200k_samples_peter_dataset_20_epochs/models/gen_imgs.npy'

a1=np.load(img_keras)
### Full slice over three axes: the array is used as stored
s_keras=a1[:,:,:]

### Load validation samples
img_raw='/global/cfs/cdirs/m3363/vayyar/cosmogan_data/raw_data/very_large_dataset_val.npy'
a1=np.load(img_raw)
### Take index 0 of the last axis, apply f_transform to the whole array, then keep the first 3000 images
s_raw=f_transform(a1[:,:,:,0])[:3000]

print(s_raw.shape,s_keras.shape)

(3000, 128, 128) (3000, 128, 128)


In [61]:
### Load the generated validation images dumped by LBANN for a chosen set of epochs
# parent_dir='/global/cfs/cdirs/m3363/vayyar/cosmogan_data/results_data/20200506_121613_exagan_200k_samples/'
parent_dir='/global/cfs/cdirs/m3363/vayyar/cosmogan_data/results_data/20200513_121910_peters_dataset/'

### One list of matching file names per epoch
ff=[glob.glob(parent_dir+'dump_outs/model0-validation-epoch{}-*gen_img*.npy'.format(epoch)) for epoch in [34,37]]

### Flatten the per-epoch lists into a single list of files
f_list=[]
for epoch_files in ff:
    f_list.extend(epoch_files)
print(len(f_list))

### Take index 0 of the second axis of every file and stack all images along the first axis
arr=[]
for fname in f_list:
    arr.append(np.load(fname)[:,0,:,:])
s_lbann=np.vstack(arr)
print(s_lbann.shape,np.max(s_lbann))

8
(1024, 128, 128) 0.9998577


In [62]:
### Sample sets to compare: LBANN generated images, keras generated images and transformed raw validation images
dict_samples={'lbann':s_lbann, 'keras':s_keras,'raw': s_raw}

### Manually triggered comparison widget; bins is selectable from 10 to 190 in steps of 10
interact_manual(f_widget_compare,sample_dict=fixed(dict_samples),
                sample_names=SelectMultiple(options=dict_samples.keys()),
                Fig_type=ToggleButtons(options=['pixel','spectrum']),
                bins=SelectionSlider(options=np.arange(10,200,10),value=50),
                mode=['avg','simple'])

interactive(children=(SelectMultiple(description='sample_names', options=('lbann', 'keras', 'raw'), value=()),…

<function __main__.f_widget_compare(sample_names, sample_dict, Fig_type='pixel', rescale=True, log_scale=True, bins=25, mode='avg', normalize=True)>