# Extract data from output files
### Analyze the output from a single LBANN run
March 9, 2020 \
April 6, 2020 : to store files in order of epochs \
April 21, 2020: added jupyter widgets to compare pixel intensity plots \
May 8, 2020: using all images for a given batch \
May 29, 2020: Modified for new update of LBANN. File names of images changed, so new extraction code. Also added code for computing chi-squared. \



In [5]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import subprocess as sp
import os
import glob
import sys

import itertools
import time
from scipy import fftpack
# from ipywidgets import interact, interact_manual,fixed, SelectMultiple, IntText, IntSlider, FloatSlider,SelectionSlider,BoundedIntText
from ipywidgets import *

In [6]:
%matplotlib widget

In [7]:
sys.path.append('/global/u1/v/vpa/project/jpt_notebooks/Cosmology/Cosmo_GAN/repositories/lbann_cosmogan/3_analysis')
from modules_image_analysis import *

[NbConvertApp] Converting notebook modules_image_analysis.ipynb to script
[NbConvertApp] Writing 15990 bytes to modules_image_analysis.py


In [8]:
### Transformation functions for image pixel values
def f_transform(x):
    return 2.*x/(x + 4. + 1e-8) - 1.

def f_invtransform(s):
    return 4.*(1. + s)/(1. - s + 1e-8)

In [9]:
# ### Other transformatino functinos
# ### Transformation functions for image pixel values

# def f_transform_new(x):
#     if x<=50:
#         a=0.03; b=-1.0
#         return a*x+b
#     elif x>50: 
#         a=0.5/np.log(300)
#         b=0.5-a*np.log(50)
#         return a*np.log(x)+b

# def f_invtransform_new(y):
#     if y<=0.5:
#         a=0.03;b=-1.0
#         return (y-b)/a
#     elif y>0.5: 
#         a=0.5/np.log(300)
#         b=0.5-a*np.log(50)
#         return np.exp((y-b)/a)
    

# def f_transform(x):
#     return np.vectorize(f_transform_new)(x)

# def f_invtransform(s):
#     return np.vectorize(f_invtransform_new)(s)

# f_transform_new(2000)

### Modules for Extraction

In [10]:
def f_get_files_df_sorted():
    '''
    Module to create Dataframe with filenames for each epoch and step
    Sorts by step and epoch
    '''
    
    ## Get images files and .npy arrays for each image in dump_outs folder
    t1=time.time()
    files_dict={}
    keys=['train_gen','train_input','val_gen','val_input']
#     file_strg_lst=['model0-training*-gen_img*-output0.npy','model0-training*-inp_img*-output0.npy','model0-validation*-gen_img*-output0.npy','model0-validation*-inp_img*-output0.npy']
    file_strg_lst=['sgd.training*_gen_img*_output0.npy','sgd.training*_inp_img*_output0.npy','sgd.validation*_gen_img*_output0.npy','sgd.validation*_inp_img*_output0.npy']

    for key,file_strg in zip(keys,file_strg_lst):
        files_dict[key]=np.array(glob.glob(main_dir+file_strg))
        if files_dict[key].shape[0]>1000 : 
            print('Warning the number of files is very large. Possibility of memory overload')
    
    df_files=pd.DataFrame([])
    dict1={}
    t1=time.time()
    ### First get sorted Dataframe with file names
    for key in keys:
        files_arr=files_dict[key]  # Get array of files
        print(key,len(files_arr))
        for fname in files_arr:
            ### Extract the Epoch number and step number from the file name
            dict1['img_type']=key
            dict1['epoch']=np.int32(fname.split('epoch')[-1].split('.')[1])
            dict1['step']=np.int64(fname.split('step')[-1].split('.')[1].split('_')[0])
            dict1['fname']=fname
            
            df_files=df_files.append(dict1,ignore_index=True)
    ## Sort values
    df_files=df_files.sort_values(by=['img_type','epoch','step']).reset_index(drop=True)
    # df_files
    t2=time.time()
    print("Time for Sorting",t2-t1)
    
    return df_files


def f_filter_epoch(df_input,num_sliced=1):
    '''
    Get just the last few stored step images for each epoch
    '''
    print('Extracting last %s steps of each epoch'%(num_sliced))
    df_output=pd.DataFrame([])
    for key in ['train_gen','train_input','val_gen','val_input']: 
        ### For each type of images, get list of epochs
        df1=df_input[df_input.img_type==key]
        epochs=np.unique(df1.epoch.values).astype(int)

        for epoch in epochs:### Extract the last few steps in each epoch
            df2=df1[df1.epoch==epoch]
            df_output=df_output.append(df2.iloc[-num_sliced:])  
    
    return df_output.reset_index(drop=True)


def f_get_images_df(df_files):
    '''
    Read dataframe with file names, read files and create new dataframe with images as numpy arrays
    Also computes number of images with intensity beyond a cutoff
    '''
    
    def f_row(df_row):
        '''
        Extract image
        '''
        fname,key=df_row.fname,df_row.img_type
        a1=np.load(fname)
        if key.endswith('input'): 
            size=np.int(np.sqrt(a1.shape[-1])) ### Extract size of images (=128)
            batch_size=a1.shape[0] ### Number of batches
            samples=a1.reshape(batch_size,size,size)
        elif key.endswith('gen') : samples=a1[:,0,:,:]
        else : raise SystemError

        return samples
    
    def f_high_pixel(df_row,cutoff=0.9966):
        '''
        Get number of images with a pixel about max cut-off value
        '''
        max_arr=np.amax(df_row.images,axis=(1,2))
        num_large=max_arr[max_arr>cutoff].shape[0]

        return num_large
    
    t1=time.time()
    ##### Create new Dataframe with sorted images
    df=df_files.copy()
    df['images']=df.apply(lambda row: f_row(row), axis=1)
    t2=time.time()
    print("Time for Reading images",t2-t1)
    
    ### Store the number of images with large pixel value
    cutoff=0.9966
    df['num_large']=df.apply(lambda row: f_high_pixel(row,cutoff), axis=1)
    
    return df
    

def f_get_sample_epochs(df,img_type,start_epoch=None,end_epoch=None):
    '''
    Module to extract images for a range of epochs given a dataframe
    '''
    if start_epoch==None and end_epoch==None:
        max_epoch=np.int(np.max(df.epoch.values))
#         print(max_epoch)
        start_epoch=0; end_epoch=max_epoch
#     if end_epoch==None: end_epoch=start_epoch+1
    
    arr=df[(df.epoch>=start_epoch) & (df.epoch<=end_epoch) & (df.img_type==img_type)].images.values
    arr=np.vstack(arr)
    
    return arr


def f_compute_chisqr(df):
    ''' Compute chi-sqr of pixel intensity histogram for each row
    Uses the module f_pixel_intensity to compute histograms
    '''
    
    def f_chisqr(df_row,val_hist,val_err,max_val=2000):
        ''' Compute chi-sqr of rows wrt to input data'''
        
        val_dr=val_hist.copy()
#         val_dr[val_dr<=0.]=1.0    ### Avoiding division by zero for zero bins
        
        sample=df_row.images ### Get all images in a batch
#         sample=f_invtransform(df_row.images)
#         print(np.max(sample),np.min(sample))
        ### Compute pixel histogram for row   ### !!Both pixel histograms MUST have save bins and normalization!
        gen_hist,gen_err=f_pixel_intensity(sample,plot=False,normalize=True,bins=200,hist_range=(0,max_val),mode='avg')
        ### Compute chi-sqr
        ### Used in keras code : np.sum(np.divide(np.power(valhist - samphist, 2.0), valhist))
        ###  chi_sqr :: sum((Obs-Val)^2/(Val))
        sq_diff=(gen_hist-val_hist)**2
        idx=150
#         print(gen_hist)
        chi_sqr1=np.sum(np.divide(sq_diff[:idx],val_dr[:idx]))
        chi_sqr2=np.sum(np.divide(sq_diff[:idx],1.0))
        chi_sqr3=np.sum(gen_err[:idx])/np.sum(val_err[:idx])  ## measures total spread in histograms wrt to input data
#         gen_hist,val_hist,val_dr=gen_hist[:-5],val_hist[:-5],val_dr[:-5]
        chi_sqr1=np.sum(np.divide(np.power(gen_hist[:idx] - val_hist[:idx], 2.0), val_dr[:idx]))
        chi_sqr2=np.sum(np.divide(np.power(gen_hist[:idx] - val_hist[:idx], 2.0), 1.0))
        chi_sqr3=np.sum(gen_err[:idx])/np.sum(val_err[:idx])  ## measures total spread in histograms wrt to input data
#         chi_sqr3=0.0
        
        return chi_sqr1,chi_sqr2,chi_sqr3
    
    ### Get pixel histogram of all input data
    samples_input=f_get_sample_epochs(df,'train_input')
    max_val=np.max(samples_input)
    val_hist,val_err=f_pixel_intensity(samples_input,plot=False,normalize=True,bins=200,hist_range=(0,max_val),mode='avg')
    del samples_input
    
    ### Get chi-sqr for each row (step-epoch) for generated data
    chi_sqrs=df.apply(lambda row: f_chisqr(row,val_hist=val_hist,val_err=val_err,max_val=1), axis=1).values
    chi_vals=np.array(list(zip(*chi_sqrs)))  ## transposing list of list 

    df['chi_sqr1'],df['chi_sqr2'],df['chi_sqr3']=chi_vals[0],chi_vals[1],chi_vals[2]
    
    return df


## Extract image data 

In [11]:
# fldr_name='20200529_111342_seed3273_80epochs'
fldr_name='20200611_083500_exagan'
fldr_name='20200612_113211_exagan'
fldr_name='20200615_085410_batchsize_256_exagan'
fldr_name='20200616_071748_batchsize_512_exagan'

main_dir='/global/cfs/cdirs/m3363/vayyar/cosmogan_data/results_data/{0}/dump_outs/trainer0/model0/'.format(fldr_name)
print(main_dir)


/global/cfs/cdirs/m3363/vayyar/cosmogan_data/results_data/20200616_071748_batchsize_512_exagan/dump_outs/trainer0/model0/


In [12]:
### Get dataframe with file names, sorted by epoch and step
df_files=f_get_files_df_sorted()

### Slice out rows to keep only the last few steps for each epoch.
df_files=f_filter_epoch(df_files,num_sliced=4)

#############################################################
### Read images one by one into a numpy array and create a new DataFrame
df_full=f_get_images_df(df_files)
print(df_full.shape)

# ### Filter to keep just one step per epoch
# df_full=f_filter_epoch(df_full,1)

train_gen 1588
train_input 1588
val_gen 400
val_input 400
Time for Sorting 12.935046672821045
Extracting last 4 steps of each epoch
Time for Reading images 89.94568276405334
(1280, 6)


In [13]:
# df_files.head(20)
df_full[['epoch','step','img_type','fname','num_large']].iloc[[0,1,-2,-1]]

Unnamed: 0,epoch,step,img_type,fname,num_large
0,0.0,320.0,train_gen,/global/cfs/cdirs/m3363/vayyar/cosmogan_data/r...,138
1,0.0,340.0,train_gen,/global/cfs/cdirs/m3363/vayyar/cosmogan_data/r...,179
1278,79.0,7960.0,val_input,/global/cfs/cdirs/m3363/vayyar/cosmogan_data/r...,0
1279,79.0,7980.0,val_input,/global/cfs/cdirs/m3363/vayyar/cosmogan_data/r...,0


In [21]:
arr=df_full[df_full.img_type=='train_input'].images.values

In [24]:
arr.shape

(320,)

## Chi-square

In [10]:
def f_compute_chisqr(df):
    ''' Compute chi-sqr of pixel intensity histogram for each row
    Uses the module f_pixel_intensity to compute histograms
    '''
    
    def f_chisqr(df_row,val_hist,val_err,bins,max_val=2000):
        ''' Compute chi-sqr of rows wrt to input data'''
        
        val_dr=val_hist.copy()
        val_dr[val_dr<=0.]=1.0    ### Avoiding division by zero for zero bins
        
        ### Get all images in a batch
#         sample=df_row.images
        sample=f_invtransform(df_row.images)
#         print(np.max(sample),np.min(sample))

        ### Compute pixel histogram for row   ### !!Both pixel histograms MUST have same bins and normalization!
        gen_hist,gen_err=f_pixel_intensity(sample,plot=False,normalize=True,bins=bins,hist_range=(0,max_val),mode='avg')
        ### Compute chi-sqr
        ### Used in keras code : np.sum(np.divide(np.power(valhist - samphist, 2.0), valhist))
        ###  chi_sqr :: sum((Obs-Val)^2/(Val))
        sq_diff=(gen_hist-val_hist)**2
        idx=-5  # Choosing the number of histograms to use

        chi_sqr1=np.sum(np.divide(sq_diff[:idx],val_dr[:idx]))
        chi_sqr2=np.sum(np.divide(sq_diff[:idx],1.0))
        chi_sqr3=np.sum(gen_err[:idx])/np.sum(val_err[:idx])  ## measures total spread in histograms wrt to input data
#         gen_hist,val_hist,val_dr=gen_hist[:-5],val_hist[:-5],val_dr[:-5]
#         chi_sqr1=np.sum(np.divide(np.power(gen_hist[:idx] - val_hist[:idx], 2.0), val_dr[:idx]))
#         chi_sqr2=np.sum(np.divide(np.power(gen_hist[:idx] - val_hist[:idx], 2.0), 1.0))
#         chi_sqr3=np.sum(gen_err[:idx])/np.sum(val_err[:idx])  ## measures total spread in histograms wrt to input data
#         chi_sqr3=0.0
        
        return chi_sqr1,chi_sqr2,chi_sqr3
    
    
    bins=np.concatenate([np.array([-0.5]),np.arange(0.5,20.5,1),np.arange(20.5,100.5,5),np.arange(100.5,1000.5,50),np.array([2000])]) #bin edges to use

    bins=f_transform(bins)
#     bins=200
    ### Get pixel histogram of all input data
    samples_input=f_get_sample_epochs(df,'train_input',0,2)
    max_val=np.max(samples_input)
    val_hist,val_err=f_pixel_intensity(samples_input,plot=False,normalize=True,bins=bins,hist_range=(0,max_val),mode='avg')
    del samples_input
    
    ### Get chi-sqr for each row (step-epoch) for generated data
    chi_sqrs=df.apply(lambda row: f_chisqr(row,val_hist=val_hist,val_err=val_err,bins=bins,max_val=1), axis=1).values
    chi_vals=np.array(list(zip(*chi_sqrs)))  ## transposing list of list 

    df['chi_sqr1'],df['chi_sqr2'],df['chi_sqr3']=chi_vals[0],chi_vals[1],chi_vals[2]
    
    return df

t1=time.time()
# df1=f_compute_chisqr(df_full.loc[[0,1,2,3,100,200]])
df_full=f_compute_chisqr(df_full)
t2=time.time()
print("Time to compute chi-sqr",t2-t1)

Time to compute chi-sqr 133.47150421142578


In [11]:
df_full[['chi_sqr1','chi_sqr2','chi_sqr3']].describe()

Unnamed: 0,chi_sqr1,chi_sqr2,chi_sqr3
count,1280.0,1280.0,1280.0
mean,1627.323315,64.047269,3.052714
std,72.094378,2.905558,3.14847
min,1016.547222,39.194695,1.654931e-13
25%,1590.884724,62.500931,3.122079e-13
50%,1656.09177,65.211627,2.257582
75%,1683.127739,66.312099,5.843843
max,1683.127739,66.312099,13.04639


In [12]:
df=df_full.copy()

In [None]:
# Get row with min chi-sqr
# df.loc[df.chi_sqr1.idxmin(axis=1)][['epoch','step','chi_sqr1','chi_sqr2','chi_sqr3','img_type']]

## Slicing output 
### Values in the range for chisqr1 and chisqr3
def f_get_best_epochs(df):
    ''' Get quantiles for type=train_gen and choose  best epochs '''
    img_type='train_gen'
    c1=df[df.img_type==img_type].quantile(q=0.5,axis=0) 
    c3=df[df.img_type==img_type].quantile(q=0.5,axis=0)
    # print(c1,c3)

    df_sliced=df[(df.chi_sqr1<c1.chi_sqr1) & (df.chi_sqr3>c3.chi_sqr3)][['epoch','step','chi_sqr1','chi_sqr2','chi_sqr3','img_type']]

    return df_sliced



df_sliced=f_get_best_epochs(df_full)

In [14]:
df_sliced[df_sliced.img_type=='val_gen']

Unnamed: 0,epoch,step,chi_sqr1,chi_sqr2,chi_sqr3,img_type
640,0.0,41.0,1223.198753,47.998551,10.138766,val_gen
641,0.0,82.0,1225.436104,48.025875,10.771612,val_gen
642,0.0,123.0,1222.139610,47.957314,10.505947,val_gen
643,0.0,164.0,1218.129174,47.888383,10.709677,val_gen
644,1.0,246.0,1442.765734,56.787555,7.542819,val_gen
645,1.0,287.0,1445.540515,56.867837,7.683764,val_gen
646,1.0,328.0,1444.621044,56.839896,7.482073,val_gen
647,1.0,369.0,1447.523788,57.020926,7.607327,val_gen
648,2.0,451.0,1465.820699,57.711418,7.698641,val_gen
649,2.0,492.0,1474.356576,57.658038,7.970780,val_gen


In [15]:
def f_plot_epochs(df,mode='epoch'):
    
    fig=plt.figure(figsize=(10,3))
#     for img_type in ['val_input','val_gen','train_input','train_gen']:
#     for img_type in ['train_gen','train_input']:
#     for img_type in ['val_gen','val_input']:
    for img_type in ['train_gen']:
        df_temp=df[df.img_type==img_type]

        if mode=='epoch': x=df_temp.epoch.values
        elif mode=='step': x=df_temp.step.values
                
        fig.add_subplot(1,3,1)
        plt.plot(x,df_temp['chi_sqr1'].values,linestyle='-',marker='*',label=img_type)
        plt.title('chisqr1')

        fig.add_subplot(1,3,2)
        plt.plot(x,df_temp['chi_sqr2'].values,linestyle='-',marker='*',label=img_type)
        plt.title('chisqr2')

        fig.add_subplot(1,3,3)
        plt.plot(x,df_temp['chi_sqr3'].values,linestyle='-',marker='*',label=img_type)
        plt.title('Deviation in histograms')

    plt.xlabel(mode)
    plt.legend()
    plt.tight_layout()
    
f_plot_epochs(df_sliced,mode='step')
f_plot_epochs(df_sliced,mode='epoch')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

### High Pixel images

In [16]:
### Plot number of high pixel images
# df.plot('epoch','num_large',kind='scatter')
# df=df_full.copy()
plt.figure()
plt.plot(df[df.img_type=='val_gen'].step,df[df.img_type=='val_gen'].num_large,linestyle='',marker='*')
plt.xlabel('Steps in Epochs')
plt.ylabel('Number of large pixel images from a batch set of 128 images')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Text(0, 0.5, 'Number of large pixel images from a batch set of 128 images')

In [17]:
df[(df.num_large>0) &(df.img_type=='val_gen')][['epoch','step','num_large']]

Unnamed: 0,epoch,step,num_large
640,0.0,41.0,148
641,0.0,82.0,125
642,0.0,123.0,142
643,0.0,164.0,137
644,1.0,246.0,26
645,1.0,287.0,30
646,1.0,328.0,25
647,1.0,369.0,21
732,23.0,4633.0,2
733,23.0,4674.0,1


## Explore image samples

In [None]:
def f_widget_individual(df,img_type='val_gen',idx_range=(0,50),Fig_type='pixel',normalize=True,log_scale=True,rescale=True,mode='avg'):
    '''
    Module to plot pixel intensity or power spectrum for a given sample set of images
    Options for normalization, log-scal, and rescale
    Rescale converts image pixel values from (-1,1) to the original pixel range
    2 Fig_type: pixel-> pixel intensity and spectrum -> power spectrum
    '''
    
    start,end=idx_range[0],idx_range[1]
    print('Index Range %s - %s'%(start,end))
    
    try :
        sliced_arr=f_get_sample_epochs(df,img_type=img_type,start_epoch=start,end_epoch=end)
        if sliced_arr.shape[0]<1:
            print('Input indices %s %s are invalid.\nUsing full array'%(start,end))
            start0,end=0,'end'
            sliced_arr=f_get_sample_epochs(df,img_type=img_type)
    except Exception as e:
        print(e)

    ### Crop out large pixel values
    sliced_arr=np.array([arr for arr in sliced_arr if np.max(arr)<=0.994])

    if rescale: ### Converting from pixel intensity range (-1,1) to original range
        sliced_arr=f_invtransform(sliced_arr)
    print('Array size used',sliced_arr.shape)
    
    if Fig_type=='pixel':
        f_pixel_intensity(sliced_arr,label=img_type+': {0}-{1}'.format(str(start),str(end)),normalize=normalize,log_scale=log_scale,mode=mode)
    elif Fig_type=='spectrum':
        f_compute_spectrum(sliced_arr,label=img_type+': {0}-{1}'.format(str(start),str(end)),log_scale=log_scale)



In [None]:
interact_manual(f_widget_individual,df=fixed(df),img_type=fixed('val_gen'),
                Fig_type=ToggleButtons(options=['pixel','spectrum']),mode=['avg','simple'],
                idx_range=IntRangeSlider(value=(0,60),min=0,max=80,step=1),)

## View image block

In [None]:
def f_plot_grid(arr,cols=16,fig_size=(15,5)):
    ''' Plot a grid of images
    '''
    size=arr.shape[0]    
    rows=int(np.ceil(size/cols))
    print(rows,cols)
    
    fig,axarr=plt.subplots(rows,cols,figsize=fig_size, gridspec_kw = {'wspace':0, 'hspace':0})
    if rows==1: axarr=np.reshape(axarr,(rows,cols))
    if cols==1: axarr=np.reshape(axarr,(rows,cols))
    
    for i in range(min(rows*cols,size)):
        row,col=int(i/cols),i%cols
        try: 
            axarr[row,col].imshow(arr[i],origin='lower',interpolation='nearest',cmap='cool', extent = [0, 128, 0, 128])
        # Drop axis label
        except Exception as e:
            print('Exception:',e)
            pass
        temp=plt.setp([a.get_xticklabels() for a in axarr[:-1,:].flatten()], visible=False)
        temp=plt.setp([a.get_yticklabels() for a in axarr[:,1:].flatten()], visible=False)
    
#     fig.subplots_adjust(wspace=0.00,hspace=0.000)
#     fig.tight_layout()

img_arr=f_get_sample_epochs(df,'val_input',46,48)[20:50,:,:]
f_plot_grid(img_arr,cols=6,fig_size=(10,5))


## Compare samples

In [24]:
def f_compare_pixel_intensity(img_lst,label_lst=['img1','img2'],bkgnd_arr=[],log_scale=True, normalize=True, mode='avg',bins=25, hist_range=None):
    '''
    Module to compute and plot histogram for pixel intensity of images
    Has 2 modes : simple and avg
    simple mode: No errors. Just flatten the input image array and compute histogram of full data
    avg mode(Default) : 
        - Compute histogram for each image in the image array
        - Compute errors across each histogram 
        
    bkgnd_arr : histogram of this array is plotting with +/- sigma band
    '''
    
    norm=normalize # Whether to normalize the histogram
    
    def f_batch_histogram(img_arr,bins,norm,hist_range):
        ''' Compute histogram statistics for a batch of images'''
        
        ## Extracting the range. This is important to ensure that the different histograms are compared correctly
        if hist_range==None : ulim,llim=np.max(img_arr),np.min(img_arr)
        else: ulim,llim=hist_range[1],hist_range[0]
#         print(ulim,llim)
        ### array of histogram of each image
        hist_arr=np.array([np.histogram(arr.flatten(), bins=bins, range=(llim,ulim), density=norm) for arr in img_arr]) ## range is important
        hist=np.stack(hist_arr[:,0]) # First element is histogram array
#         print(hist.shape)

        bin_list=np.stack(hist_arr[:,1]) # Second element is bin value 
        ### Compute statistics over histograms of individual images
        mean,err=np.mean(hist,axis=0),np.std(hist,axis=0)/np.sqrt(hist.shape[0])
        bin_edges=bin_list[0]
        centers = (bin_edges[:-1] + bin_edges[1:]) / 2
#         print(bin_edges,centers)

        return mean,err,centers
    
    plt.figure()
    
    ## Plot background distribution
    if len(bkgnd_arr):
        if mode=='simple':
            hist, bin_edges = np.histogram(bkgnd_arr.flatten(), bins=bins, density=norm, range=hist_range)
            centers = (bin_edges[:-1] + bin_edges[1:]) / 2
            plt.errorbar(centers, hist, color='k',marker='*',linestyle=':', label='bkgnd')

        elif mode=='avg':
            ### Compute histogram for each image. 
            mean,err,centers=f_batch_histogram(bkgnd_arr,bins,norm,hist_range)
            plt.plot(centers,mean,linestyle=':',color='k',label='bkgnd')
            plt.fill_between(centers, mean - err, mean + err, color='k', alpha=0.4)
    
    ### Plot the rest of the datasets
    for img,label,mrkr in zip(img_lst,label_lst,itertools.cycle('>^_*sDH')):     
        if mode=='simple':
            hist, bin_edges = np.histogram(img.flatten(), bins=bins, density=norm, range=hist_range)
            centers = (bin_edges[:-1] + bin_edges[1:]) / 2
            plt.errorbar(centers, hist, fmt=mrkr+'-', label=label)

        elif mode=='avg':
            ### Compute histogram for each image. 
            mean,err,centers=f_batch_histogram(img,bins,norm,hist_range)
#             print('Centers',centers)
            plt.errorbar(centers,mean,yerr=err,fmt=mrkr+'-',label=label)

    if log_scale: 
        plt.yscale('log')
        plt.xscale('symlog',linthreshx=50)

    plt.legend()
    plt.xlabel('Pixel value')
    plt.ylabel('Counts')
    plt.title('Pixel Intensity Histogram')
    

In [39]:
  
def f_compare_spectrum(img_lst,label_lst=['img1','img2'],bkgnd_arr=[],log_scale=True):
    '''
    Compare the spectrum of 2 sets of images: 
    img_lst contains the set of images arrays, Each is of the form (num_images,height,width)
    label_lst contains the labels used in the plot
    '''
    plt.figure()
    
    
    ## Plot background distribution
    if len(bkgnd_arr):
        Pk= f_batch_spectrum(bkgnd_arr)
        mean,err = np.mean(Pk, axis=0),np.std(Pk, axis=0)/np.sqrt(Pk.shape[0])
        k=np.arange(len(mean))
        plt.plot(k, mean,color='k',linestyle='-',label='bkgnd')    
        plt.fill_between(k, mean - err, mean + err, color='k',alpha=0.8)
    
    
    for img_arr,label,mrkr in zip(img_lst,label_lst,itertools.cycle('>^_*sDH')): 
        Pk= f_batch_spectrum(img_arr)
        mean,err = np.mean(Pk, axis=0),np.std(Pk, axis=0)/np.sqrt(Pk.shape[0])

        k=np.arange(len(mean))
#         print(mean.shape,std.shape)
        plt.fill_between(k, mean - err, mean + err, alpha=0.4)
        plt.plot(k, mean, marker=mkrk, linestyle=':',label=label)

    if log_scale: plt.yscale('log')
    plt.ylabel(r'$P(k)$')
    plt.xlabel(r'$k$')
    plt.title('Power Spectrum')
    plt.legend()  

In [20]:
# img_list=[f_invtransform(f_get_sample_epochs(df,'val_gen',40,41))];label_list=['img1']
# f_compare_pixel_intensity(img_lst=img_list,label_lst=label_list,normalize=False,log_scale=True, mode='avg',bins=bin_edges,hist_range=None,bkgnd_arr=img_list[0])

In [40]:

def f_widget_compare(sample_names,sample_dict,Fig_type='pixel',rescale=True,log_scale=True,bins=25,mode='avg',normalize=True,bkgnd=[]):
    '''
    Module to make widget plots for pixel intensity or spectrum comparison for multiple sample sets
    '''
    
    ### Crop out large pixel values
    for key in sample_names:
        print(sample_dict[key].shape)
        sample_dict[key]=np.array([arr for arr in sample_dict[key] if np.max(arr)<=0.994])
        print(sample_dict[key].shape)
    
    img_list=[sample_dict[key] for key in sample_names]
    label_list=list(sample_names)
    
    
    bins=np.concatenate([np.array([-0.5]),np.arange(0.5,20.5,1),np.arange(20.5,100.5,5),np.arange(100.5,1000.5,50),np.array([2000])]) #bin edges to use
    
    if rescale: 
        for count,img in enumerate(img_list):
            img_list[count]=f_invtransform(img)
        if len(bkgnd): bkgnd=f_invtransform(bkgnd)
        hist_range=(0,2000)
    else:
        bins=f_transform(bins)
        hist_range=(-1,0.996)
    assert Fig_type in ['pixel','spectrum'],"Invalid mode %s"%(mode)
    
    if Fig_type=='pixel':
#         f_compare_pixel_intensity(img_lst=img_list,label_lst=label_list,normalize=normalize,log_scale=log_scale, mode=mode,bins=bins,hist_range=hist_range)
        f_compare_pixel_intensity(img_lst=img_list,label_lst=label_list,normalize=normalize,log_scale=log_scale, mode=mode,bins=bins,hist_range=hist_range,bkgnd_arr=bkgnd)

    elif Fig_type=='spectrum':
        f_compare_spectrum(img_lst=img_list,label_lst=label_list,log_scale=log_scale,bkgnd_arr=bkgnd)



#### Compare different epochs

In [41]:
# img_list,labels_list=f_get_sample_epochs(df,'val_gen',10)

img_list,labels_list=[],[]
# for epoch_range in [(8,10),(10,12),(14,16),(60,62),(72,74)]:
# for epoch_range in [(20,22),(46,48),(51,52),(66,68),(40,50),(50,60),(60,70),(70,80)]:
# for epoch_range in [(i,i+2) for i in range(60,75,2)]:
for epoch_range in [(i,i+1) for i in range(0,20,1)]:
    start,end=epoch_range[0],epoch_range[1]
    img_list.append(f_get_sample_epochs(df,'val_gen',start,end))
    labels_list.append('%s:%s'%(str(start),str(end)))

dict_samples=dict.fromkeys(labels_list)
for key,val in zip(labels_list,img_list): dict_samples[key]=val

### Compare with input
dict_samples['val input']=f_get_sample_epochs(df,img_type='val_input',start_epoch=0,end_epoch=10)
# dict_samples['keras']=s_keras

# bkgnd=[]
bkgnd=f_get_sample_epochs(df,img_type='val_input',start_epoch=0,end_epoch=10)

interact_manual(f_widget_compare,sample_dict=fixed(dict_samples),
                sample_names=SelectMultiple(options=dict_samples.keys()),
                Fig_type=ToggleButtons(options=['pixel','spectrum']),bins=IntText(value=50),mode=['avg','simple'],bkgnd=fixed(bkgnd))


interactive(children=(SelectMultiple(description='sample_names', options=('0:1', '1:2', '2:3', '3:4', '4:5', '…

<function __main__.f_widget_compare(sample_names, sample_dict, Fig_type='pixel', rescale=True, log_scale=True, bins=25, mode='avg', normalize=True, bkgnd=[])>

In [35]:
df[(df.img_type=='val_gen')&(df.epoch<15)&(df.epoch>5)][['epoch','step','chi_sqr1','chi_sqr2','chi_sqr3']]

Unnamed: 0,epoch,step,chi_sqr1,chi_sqr2,chi_sqr3
664,6.0,1230.0,1487.22669,58.505518,7.452888
665,6.0,1271.0,1488.269355,58.486375,7.394805
666,6.0,1312.0,1490.654224,58.5457,7.488657
667,6.0,1353.0,1488.193889,58.480138,7.232227
668,7.0,1435.0,1499.792482,58.920728,7.936032
669,7.0,1476.0,1494.620605,58.831218,8.281618
670,7.0,1517.0,1500.773846,58.962213,8.41337
671,7.0,1558.0,1493.623299,58.780974,8.435929
672,8.0,1640.0,1483.028567,58.181048,7.891587
673,8.0,1681.0,1482.017844,58.190258,8.180366


#### Compare image types

In [None]:
# ### Available options : keys=['train_gen','train_input','val_gen','val_input']
# # start,end=70,75
# start,end=46,47
# samples1=f_get_sample_epochs(df,'val_gen',start,end)
# samples2=f_get_sample_epochs(df,'val_input',0,10)
# samples3=f_get_sample_epochs(df,'train_gen',start,end)
# samples4=f_get_sample_epochs(df,'train_input',0,4)

# print(np.max(samples1))

# dict_samples={'s1':samples1, 's2': samples2, 's3': samples3, 's4':samples4}
# interact_manual(f_widget_compare,sample_dict=fixed(dict_samples),
#                 sample_names=SelectMultiple(options=dict_samples.keys()),
#                 Fig_type=ToggleButtons(options=['pixel','spectrum']),bins=IntText(value=50),mode=['avg','simple'])

In [None]:
fname='/global/cfs/cdirs/m3363/vayyar/cosmogan_data/results_data/20200615_085410_batchsize_256_exagan/dump_outs/trainer0/model0/sgd.training.epoch.1.step.984_inp_img_output0.npy'
# fname='/global/cfs/cdirs/m3363/vayyar/cosmogan_data/results_data/20200615_085410_batchsize_256_exagan/dump_outs/trainer0/model0/sgd.training.epoch.1.step.984_gen_img_instance1_activation_output0.npy'
a1=np.load(fname)
a1.shape

In [1]:
845*2

1690

In [2]:
size=105060  # Esimated number of *total* samples. Used to estimate save_interval
data_pct,val_ratio=1.0,0.2 # Percentage of data to use, % of data for validation



for batchsize in [64,128,512,1028]: 
    save_interval=int(size*val_ratio/(2.0*batchsize))
    print(batchsize,save_interval)

64 164
128 82
512 20
1028 10


In [4]:
105060*0.2/1028

20.439688715953306