# Extract data from output files
### Analyze the output from a single LBANN run
March 9, 2020 \
April 6, 2020 : Major edit to store files in order of epochs \
April 21, 2020: Major edit, added jupyter widgets to compare pixel intensity plots

May 8, 2020: Major edit, using all images for a given batch



In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import subprocess as sp
import os
import glob
import sys

import time
from scipy import fftpack
# from ipywidgets import interact, interact_manual,fixed, SelectMultiple, IntText, IntSlider, FloatSlider,SelectionSlider,BoundedIntText
from ipywidgets import *

In [2]:
%matplotlib widget

In [3]:
sys.path.append('/global/u1/v/vpa/project/jpt_notebooks/Cosmology/Cosmo_GAN/LBANN/lbann_cosmogan/3_analysis/')
from modules_image_analysis import *

[NbConvertApp] Converting notebook modules_image_analysis.ipynb to script
[NbConvertApp] Writing 15103 bytes to modules_image_analysis.py


In [4]:
### Transformation functions for image pixel values
def f_transform(x):
    return 2.*x/(x + 4. + 1e-8) - 1.

def f_invtransform(s):
    return 4.*(1. + s)/(1. - s + 1e-8)

### Modules for Extraction

In [5]:
def f_get_files_df_sorted():
    '''
    Module to create Dataframe with filenames for each epoch and step
    Sorts by step and epoch
    '''
    
    ## Get images files and .npy arrays for each image in dump_outs folder
    t1=time.time()
    files_dict={}
    keys=['train_gen','train_input','val_gen','val_input']
    file_strg_lst=['model0-training*-gen_img*-output0.npy','model0-training*-inp_img*-output0.npy','model0-validation*-gen_img*-output0.npy','model0-validation*-inp_img*-output0.npy']
    for key,file_strg in zip(keys,file_strg_lst):
        files_dict[key]=np.array(glob.glob(main_dir+file_strg))
        if files_dict[key].shape[0]>1000 : 
            print('Warning the number of files is very large. Possibility of memory overload')

    df_files=pd.DataFrame([])
    dict1={}
    t1=time.time()
    ### First get sorted Dataframe with file names
    for key in keys: 
        files_arr=files_dict[key]  # Get array of files
        print(key,len(files_arr))
        for fname in files_arr:
            ### Extract the Epoch number and step number from the file name
            dict1['img_type']=key
            dict1['epoch']=np.int32(fname.split('epoch')[-1].split('-')[0])
            dict1['step']=np.int64(fname.split('step')[-1].split('-')[0])
            dict1['fname']=fname

            df_files=df_files.append(dict1,ignore_index=True)
    ## Sort values
    df_files=df_files.sort_values(by=['img_type','epoch','step']).reset_index(drop=True)
    # df_files
    t2=time.time()
    print("Time for Sorting",t2-t1)
    
    return df_files


def f_filter_epoch(df_input,num_sliced=1):
    '''
    Get just the last few stored step images for each epoch
    '''
    print('Extracting last %s steps of each epoch'%(num_sliced))
    df_output=pd.DataFrame([])
    for key in ['train_gen','train_input','val_gen','val_input']: 
        ### For each type of images, get list of epochs
        df1=df_input[df_input.img_type==key]
        epochs=np.unique(df1.epoch.values).astype(int)

        for epoch in epochs:### Extract the last few steps in each epoch
            df2=df1[df1.epoch==epoch]
            df_output=df_output.append(df2.iloc[-num_sliced:])  
    
    return df_output.reset_index(drop=True)


def f_get_images_df(df_files):
    '''
    Read dataframe with file names, read files and create new dataframe with images as numpy arrays
    Also computes number of images with intensity beyond a cutoff
    '''
    
    def f_row(df_row):
        '''
        Extract image
        '''
        fname,key=df_row.fname,df_row.img_type
        a1=np.load(fname)
        if key.endswith('input'): 
            size=np.int(np.sqrt(a1.shape[-1])) ### Extract size of images (=128)
            batch_size=a1.shape[0] ### Number of batches
            samples=a1.reshape(batch_size,size,size)
        elif key.endswith('gen') : samples=a1[:,0,:,:]
        else : raise SystemError

        return samples
    
    def f_high_pixel(df_row,cutoff=0.9966):
        '''
        Get number of images with a pixel about max cut-off value
        '''
        max_arr=np.amax(df_row.images,axis=(1,2))
        num_large=max_arr[max_arr>cutoff].shape[0]

        return num_large
    
    t1=time.time()
    ##### Create new Dataframe with sorted images
    df=df_files.copy()
    df['images']=df.apply(lambda row: f_row(row), axis=1)
    t2=time.time()
    print("Time for Reading images",t2-t1)
    
    ### Store the number of images with large pixel value
    cutoff=0.9966
    df['num_large']=df.apply(lambda row: f_high_pixel(row,cutoff), axis=1)
    
    return df
    

def f_get_sample_epochs(df,img_type,start_epoch=None,end_epoch=None):
    '''
    Module to extract images for a range of epochs given a dataframe
    '''
    if start_epoch==None and end_epoch==None:
        max_epoch=np.int(np.max(df.epoch.values))
        print(max_epoch)
        start_epoch=0; end_epoch=max_epoch
#     if end_epoch==None: end_epoch=start_epoch+1
    
    arr=df[(df.epoch>=start_epoch) & (df.epoch<=end_epoch) & (df.img_type==img_type)].images.values
    arr=np.vstack(arr)
    
    return arr


## Extract image data 

In [6]:
fldr_name='20200423_122631_exagan_modified_paddding'
fldr_name='20200424_083456_exagan_modified_padding_2'
fldr_name='20200506_121613_exagan_200k_samples'
fldr_name='20200513_121910_peters_dataset'

main_dir='/global/cfs/cdirs/m3363/vayyar/cosmogan_data/results_data/{0}/dump_outs/'.format(fldr_name)
print(main_dir)


/global/cfs/cdirs/m3363/vayyar/cosmogan_data/results_data/20200513_121910_peters_dataset/dump_outs/


In [7]:
### Get dataframe with file names, sorted by epoch and step
df_files=f_get_files_df_sorted()

### Slice out rows to keep only the last few steps for each epoch.
# df_files=f_filter_epoch(df_files,num_sliced=2)

#############################################################
### Read images one by one into a numpy array and create a new DataFrame
df=f_get_images_df(df_files)
print(df.shape)

# ### Filter to keep just one step per epoch
# df=f_filter_epoch(df,1)

train_gen 902
train_input 902
val_gen 226
val_input 226
Time for Sorting 8.253874778747559
Time for Reading images 268.7634389400482
(2256, 6)


In [8]:
# df_files.head(20)
df[['epoch','step','img_type','fname']].iloc[[0,1,-2,-1]]

Unnamed: 0,epoch,step,img_type,fname
0,0.0,0.0,train_gen,/global/cfs/cdirs/m3363/vayyar/cosmogan_data/r...
1,0.0,82.0,train_gen,/global/cfs/cdirs/m3363/vayyar/cosmogan_data/r...
2254,59.0,18368.0,val_input,/global/cfs/cdirs/m3363/vayyar/cosmogan_data/r...
2255,59.0,18450.0,val_input,/global/cfs/cdirs/m3363/vayyar/cosmogan_data/r...


## Chi-square

In [9]:
def f_compute_chisqr(df):
    ''' Compute chi-sqr of pixel intensity histogram for each row
    Uses the module f_pixel_intensity to compute histograms
    '''
    
    def f_chisqr(df_row,val_hist,val_err,max_val=2000):
        ''' Compute chi-sqr of rows wrt to input data'''
        
        val_dr=val_hist.copy()
        val_dr[val_dr<=0.]=1.0    ### Avoiding division by zero for zero bins
#         print(val_dr)
        sample=f_invtransform(df_row.images)[0]
        ### Compute pixel histogram for row   ### !!! Ensure both pixel histograms have save bins and normalization !!! ###
        gen_hist,gen_err=f_pixel_intensity(sample,plot=False,normalize=True,bins=50,hist_range=(0,max_val),mode='avg')
        ### Compute chi-sqr
        sq_diff=(gen_hist-val_hist)**2
        ###  chi_sqr :: sum((Obs-Val)^2/(Val))
        chi_sqr1=np.sum(np.divide(sq_diff[:15],val_dr[:15]**2))
        chi_sqr2=np.sum(np.divide(sq_diff[:15],1.0))
        chi_sqr3=np.sum(gen_err[:15])/np.sum(val_err[:15])  ## measures total spread in histograms wrt to input data
        
        return chi_sqr1,chi_sqr2,chi_sqr3
    
    ### Get pixel histogram of all input data
    samples_input=f_invtransform(f_get_sample_epochs(df,'train_input'))    
    max_val=np.max(samples_input)
    val_hist,val_err=f_pixel_intensity(samples_input,plot=False,normalize=True,bins=50,hist_range=(0,max_val),mode='avg')
    del samples_input
    
    ### Get chi-sqr for each row (step-epoch) for generated data
    chi_sqrs=df.apply(lambda row: f_chisqr(row,val_hist=val_hist,val_err=val_err,max_val=2000), axis=1).values
    chi_vals=np.array(list(zip(*chi_sqrs)))  ## transposing list of list 

    df['chi_sqr1'],df['chi_sqr2'],df['chi_sqr3']=chi_vals[0],chi_vals[1],chi_vals[2]
    print(type(chi_sqrs))
    
    return df

t1=time.time()
# df1=f_compute_chisqr(df.loc[[0,1,2,3,100,200]])
df1=f_compute_chisqr(df)
t2=time.time()
print("Time to compute chi-sqr",t2-t1)

59
<class 'numpy.ndarray'>
Time to compute chi-sqr 148.16715288162231


In [10]:
df1[['epoch','step','img_type','num_large','chi_sqr1','chi_sqr2','chi_sqr3']]

Unnamed: 0,epoch,step,img_type,num_large,chi_sqr1,chi_sqr2,chi_sqr3
0,0.0,0.0,train_gen,0,13.982354,0.000041,1.030463e+01
1,0.0,82.0,train_gen,0,14.041752,0.000041,1.767216e-11
2,0.0,164.0,train_gen,0,14.041752,0.000041,1.767216e-11
3,0.0,246.0,train_gen,0,14.041752,0.000041,1.767216e-11
4,0.0,328.0,train_gen,0,14.041752,0.000041,1.767216e-11
5,0.0,410.0,train_gen,3,13.617518,0.000041,4.072878e+01
6,0.0,492.0,train_gen,0,13.924775,0.000041,2.060926e+01
7,0.0,574.0,train_gen,20,52.797790,0.000042,1.201373e+02
8,0.0,656.0,train_gen,93,3779.737756,0.000043,2.319822e+02
9,0.0,738.0,train_gen,30,1281.740083,0.000043,2.216389e+02


In [11]:
fig=plt.figure(figsize=(10,3))
# for img_type in ['val_input','val_gen','train_input','train_gen']:
for img_type in ['train_gen','train_input']:
# for img_type in ['val_gen','val_input']:
    df_temp=df1[df1.img_type==img_type]
    print(df_temp.shape)
    fig.add_subplot(1,3,1)
    plt.plot(df_temp.epoch.values,df_temp['chi_sqr1'].values,linestyle='-',marker='*',label=img_type)
    plt.title('chisqr1')
    
    fig.add_subplot(1,3,2)
    plt.plot(df_temp.epoch.values,df_temp['chi_sqr2'].values,linestyle='-',marker='*',label=img_type)
    plt.title('chisqr2')

    fig.add_subplot(1,3,3)
    plt.plot(df_temp.epoch.values,df_temp['chi_sqr3'].values,linestyle='-',marker='*',label=img_type)
    plt.title('Deviation in histograms')

plt.xlabel('Epoch')
plt.legend()
plt.tight_layout()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

(902, 9)
(902, 9)


  import sys
  # This is added back by InteractiveShellApp.init_path()
  from ipykernel import kernelapp as app


In [None]:
# df_temp.plot('step','chi_sqr')
df_temp[(df_temp.step<=66000) & (df_temp.step>56000)][['epoch','step']]

In [None]:
# df_temp.loc[df_temp['chi_sqr'].idxmin()][['epoch','step','chi_sqr','num_large']]

In [14]:
### Plot number of high pixel images
# df.plot('epoch','num_large',kind='scatter')
plt.figure()
plt.plot(df[df.img_type=='val_gen'].step,df[df.img_type=='val_gen'].num_large,linestyle='',marker='*')
plt.xlabel('Steps in Epochs')
plt.ylabel('Number of large pixel images from a batch set of 128 images')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Text(0, 0.5, 'Number of large pixel images from a batch set of 128 images')

## Explore image samples

In [None]:
# f_pixel_intensity(samples2,'s2',normalize=True,mode='simple',bins=50)1
# f_compare_pixel_intensity([samples2[20:60],samples4,['s2','s4'],normalize=normalize,log_scale=log_scale, mode=mode,bins=bins)
# f_compute_spectrum(samples2)
# f_compare_spectrum([samples2[20:60],samples4],['s2','s4'])

In [15]:
def f_widget_individual(df,img_type='val_gen',idx_range=(0,50),Fig_type='pixel',normalize=True,log_scale=True,rescale=True,mode='avg'):
    '''
    Module to plot pixel intensity or power spectrum for a given sample set of images
    Options for normalization, log-scal, and rescale
    Rescale converts image pixel values from (-1,1) to the original pixel range
    2 Fig_type: pixel-> pixel intensity and spectrum -> power spectrum
    '''
    
    start,end=idx_range[0],idx_range[1]
    print('Index Range %s - %s'%(start,end))
    
    try :
        sliced_arr=f_get_sample_epochs(df,img_type=img_type,start_epoch=start,end_epoch=end)
        if sliced_arr.shape[0]<1:
            print('Input indices %s %s are invalid.\nUsing full array'%(start,end))
            start0,end=0,'end'
            sliced_arr=f_get_sample_epochs(df,img_type=img_type)
    except Exception as e:
        print(e)

    ### Crop out large pixel values
    sliced_arr=np.array([arr for arr in sliced_arr if np.max(arr)<=0.994])

    if rescale: ### Converting from pixel intensity range (-1,1) to original range
        sliced_arr=f_invtransform(sliced_arr)
    print('Array size used',sliced_arr.shape)
    
    if Fig_type=='pixel':
        f_pixel_intensity(sliced_arr,label=img_type+': {0}-{1}'.format(str(start),str(end)),normalize=normalize,log_scale=log_scale,mode=mode)
    elif Fig_type=='spectrum':
        f_compute_spectrum(sliced_arr,label=img_type+': {0}-{1}'.format(str(start),str(end)),log_scale=log_scale)


In [16]:
interact_manual(f_widget_individual,df=fixed(df),img_type=fixed('val_gen'),
                Fig_type=ToggleButtons(options=['pixel','spectrum']),mode=['avg','simple'],
                idx_range=IntRangeSlider(value=(0,60),min=0,max=80,step=1),)

interactive(children=(IntRangeSlider(value=(0, 60), description='idx_range', max=80), ToggleButtons(descriptio…

<function __main__.f_widget_individual(df, img_type='val_gen', idx_range=(0, 50), Fig_type='pixel', normalize=True, log_scale=True, rescale=True, mode='avg')>

## Compare samples

In [17]:

def f_widget_compare(sample_names,sample_dict,Fig_type='pixel',rescale=True,log_scale=True,bins=25,mode='avg',normalize=True):
    '''
    Module to make widget plots for pixel intensity or spectrum comparison for multiple sample sets
    '''
    
    ### Crop out large pixel values
    for key in sample_names:
        print(sample_dict[key].shape)
        sample_dict[key]=np.array([arr for arr in sample_dict[key] if np.max(arr)<=0.994])
        print(sample_dict[key].shape)
    
    img_list=[sample_dict[key] for key in sample_names]
    label_list=list(sample_names)
        
    hist_range=(0,0.996)
    
    if rescale: 
        for count,img in enumerate(img_list):
            img_list[count]=f_invtransform(img)
        hist_range=(0,2000)

    
    assert Fig_type in ['pixel','spectrum'],"Invalid mode %s"%(mode)
    
    if Fig_type=='pixel':
        f_compare_pixel_intensity(img_lst=img_list,label_lst=label_list,normalize=normalize,log_scale=log_scale, mode=mode,bins=bins,hist_range=hist_range)
    elif Fig_type=='spectrum':
        f_compare_spectrum(img_lst=img_list,label_lst=label_list,log_scale=log_scale)



#### Compare different epochs

In [None]:
# img_list,labels_list=f_get_sample_epochs(df,'val_gen',10)

img_list,labels_list=[],[]
# for epoch_range in [(0,4),(17,20),(25,27),(34,37),(44,51),(53,59)]:
for epoch_range in [(i,i+2) for i in range(0,60,2)]:
    start,end=epoch_range[0],epoch_range[1]
    img_list.append(f_get_sample_epochs(df,'val_gen',start,end))
    labels_list.append('%s:%s'%(str(start),str(end)))

dict_samples=dict.fromkeys(labels_list)
for key,val in zip(labels_list,img_list): dict_samples[key]=val

### Compare with input
dict_samples['val input']=f_get_sample_epochs(df,img_type='val_input')
interact_manual(f_widget_compare,sample_dict=fixed(dict_samples),
                sample_names=SelectMultiple(options=dict_samples.keys()),
                Fig_type=ToggleButtons(options=['pixel','spectrum']),bins=IntText(value=50),mode=['avg','simple'])

#### Compare image types

In [None]:
### Available options : keys=['train_gen','train_input','val_gen','val_input']
start,end=57,59
samples1=f_get_sample_epochs(df,'val_gen',start,end)
samples2=f_get_sample_epochs(df,'val_input',0,60)
samples3=f_get_sample_epochs(df,'train_gen',start,end)
samples4=f_get_sample_epochs(df,'train_input')

print(np.max(samples1))

In [None]:
dict_samples={'s1':samples1, 's2': samples2, 's3': samples3, 's4':samples4}
interact_manual(f_widget_compare,sample_dict=fixed(dict_samples),
                sample_names=SelectMultiple(options=dict_samples.keys()),
                Fig_type=ToggleButtons(options=['pixel','spectrum']),bins=IntText(value=50),mode=['avg','simple'])