# Extract data from output files
### Analyze the output from a single LBANN run
March 9, 2020 \
April 6, 2020 : Major edit to store files in order of epochs \
April 21, 2020: Major edit, added jupyter widgets to compare pixel intensity plots


In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import subprocess as sp
import os
import glob
import sys

import time
from scipy import fftpack
# from ipywidgets import interact, interact_manual,fixed, SelectMultiple, IntText, IntSlider, FloatSlider,SelectionSlider,BoundedIntText
from ipywidgets import *

In [2]:
%matplotlib widget

In [3]:
sys.path.append('/global/u1/v/vpa/project/jpt_notebooks/Cosmology/Cosmo_GAN/LBANN/lbann_cosmogan/3_analysis/')
from modules_image_analysis import *

[NbConvertApp] Converting notebook modules_image_analysis.ipynb to script
[NbConvertApp] Writing 14763 bytes to modules_image_analysis.py


In [4]:
### Transformation functions for image pixel values
def f_transform(x):
    return 2.*x/(x + 4. + 1e-8) - 1.

def f_invtransform(s):
    return 4.*(1. + s)/(1. - s + 1e-8)

In [5]:
def f_get_samples(df,key):
    '''
    Extract array of samples from the DataFrame with images
    Images are of two types:
    1. *_gen have shape (64,1,128,128)
    2. *_input have shape (64,16384)
    '''
    
    keys=['train_gen','train_input','val_gen','val_input']
    assert key in keys,"Given key %s is not the the list of keys %s"%(key,keys)
    
    lst=df[df.type==key]['image'].values
    
    if key.endswith('input'):
        size=np.int(np.sqrt(lst[0].shape[-1])) ### Extract size of images (=128)
        samples=np.array([ii[0,:].reshape(size,size) for ii in lst])
    else : 
        samples=np.array([ii[0,0,:,:] for ii in lst])
    
    return samples


def f_filter_epoch(df_input,num_sliced=1):
    '''
    Get just the last few stored step images for each epoch
    '''
    print('Extracting last %s steps of each epoch'%(num_sliced))
    df_output=pd.DataFrame([])
    for key in ['train_gen','train_input','val_gen','val_input']: 
        ### For each type of images, get list of epochs
        df1=df_input[df_input.type==key]
        epochs=np.unique(df1.epoch.values).astype(int)

        for epoch in epochs:### Extract the last few steps in each epoch
            df2=df1[df1.epoch==epoch]
            df_output=df_output.append(df2.iloc[-num_sliced:])  
    
    return df_output.reset_index(drop=True)



## Extract image data 

In [6]:
fldr_name='20200316_112134_exagan'
fldr_name='20200406_080207_exagan_with_mcr'
fldr_name='20200407_093719_exagan_no_mcr'
fldr_name='20200409_084926_exagan_no_mcr'
fldr_name='20200409_083646_exagan_with_mcr'
fldr_name='20200413_095840_exagan'

fldr_name='20200421_055139_exagan'
fldr_name='20200421_130545_exagan'
fldr_name='20200423_071820_exagan_v0_works'
fldr_name='20200423_122631_exagan_modified_paddding'
fldr_name='20200424_083456_exagan_modified_padding_2'
fldr_name='20200506_121613_exagan_200k_samples'
### Code for set of runs
# f_list=['20200401_125919_exagan_0.1_1','20200401_130321_exagan_0.1_4',
#         '20200401_130907_exagan_0.3_1','20200401_130646_exagan_0.3_4']
# fldr_name=f_list[0]


main_dir='/global/cfs/cdirs/m3363/vayyar/cosmogan_data/results_data/{0}/dump_outs/'.format(fldr_name)
print(main_dir)


/global/cfs/cdirs/m3363/vayyar/cosmogan_data/results_data/20200506_121613_exagan_200k_samples/dump_outs/


In [7]:
## Extracted generated image from keras cosmogan code

fname='/global/cfs/cdirs/dasrepo/vpa/cosmogan/data/computed_data/exagan1/run1/gen_imgs.npy'
a1=np.load(fname)
print(a1.shape)

(100, 2, 128, 128)


In [8]:

## Get images files and .npy arrays for each image in dump_outs folder
files_dict={}
keys=['train_gen','train_input','val_gen','val_input']
file_strg_lst=['model0-training*-gen_img*-output0.npy','model0-training*-inp_img*-output0.npy','model0-validation*-gen_img*-output0.npy','model0-validation*-inp_img*-output0.npy']
for key,file_strg in zip(keys,file_strg_lst):
    files_dict[key]=np.array(glob.glob(main_dir+file_strg))
    if files_dict[key].shape[0]>1000 : 
        print('Warning the number of files is very large. Possibility of memory overload')

df_files=pd.DataFrame([])
dict1={}
t1=time.time()
### First get sorted Dataframe with file names
for key in keys: 
    files_arr=files_dict[key]  # Get array of files
    print(key,len(files_arr))
    for fname in files_arr:
        ### Extract the Epoch number and step number from the file name
        dict1['type']=key
        dict1['epoch']=np.int32(fname.split('epoch')[-1].split('-')[0])
        dict1['step']=np.int64(fname.split('step')[-1].split('-')[0])
        dict1['fname']=fname
        
        df_files=df_files.append(dict1,ignore_index=True)
## Sort values
df_files=df_files.sort_values(by=['type','epoch','step']).reset_index(drop=True)
# df_files
print("Sorting done")
t2=time.time()
print("Time for Sorting",t2-t1)

#############################################################
### Slice out rows to keep only the last 2 steps for each epoch.
df_files=f_filter_epoch(df_files,num_sliced=2)
t3=time.time()

### Then read images one by one into a numpy array and create a new DataFrame
sorted_fnames=df_files.fname.values
### Read images one by one. This is time-consuming.
### Deliberately kept as list because some of the input arrays have different dimensions, causing creation of array of arrays in some cases
images=[np.load(fname) for fname in sorted_fnames]  

##### Create new Dataframe with sorted images
df_full=pd.DataFrame([])
df_full['image']=images
t4=time.time()
for col in ['epoch','step','type','fname']: df_full[col]=df_files[col].values
    
print("Extraction done")
print("Time for filtering",t3-t2)
print("Time for Reading images",t4-t3)
df=df_full.copy()
print(df.shape)


train_gen 962
train_input 962
val_gen 241
val_input 241
Sorting done
Time for Sorting 8.824666500091553
Extracting last 2 steps of each epoch
Extraction done
Time for filtering 0.6494908332824707
Time for Reading images 22.71859884262085
(480, 5)


In [9]:
### Filter to keep just one image per epoch
df=f_filter_epoch(df,1)

Extracting last 1 steps of each epoch


In [10]:
df_files.head(10)

Unnamed: 0,epoch,fname,step,type
0,0.0,/global/cfs/cdirs/m3363/vayyar/cosmogan_data/r...,1230.0,train_gen
1,0.0,/global/cfs/cdirs/m3363/vayyar/cosmogan_data/r...,1312.0,train_gen
2,1.0,/global/cfs/cdirs/m3363/vayyar/cosmogan_data/r...,2542.0,train_gen
3,1.0,/global/cfs/cdirs/m3363/vayyar/cosmogan_data/r...,2624.0,train_gen
4,2.0,/global/cfs/cdirs/m3363/vayyar/cosmogan_data/r...,3854.0,train_gen
5,2.0,/global/cfs/cdirs/m3363/vayyar/cosmogan_data/r...,3936.0,train_gen
6,3.0,/global/cfs/cdirs/m3363/vayyar/cosmogan_data/r...,5166.0,train_gen
7,3.0,/global/cfs/cdirs/m3363/vayyar/cosmogan_data/r...,5248.0,train_gen
8,4.0,/global/cfs/cdirs/m3363/vayyar/cosmogan_data/r...,6478.0,train_gen
9,4.0,/global/cfs/cdirs/m3363/vayyar/cosmogan_data/r...,6560.0,train_gen


## Extract samples 

In [11]:
### Available options : keys=['train_gen','train_input','val_gen','val_input']
samples1=f_get_samples(df,'train_input')
print(samples1.shape)
samples2=f_get_samples(df,'val_gen')
print(samples2.shape)

samples3=f_get_samples(df,'train_gen')
print(samples3.shape)
samples4=f_get_samples(df,'val_input')
print(samples4.shape)

(60, 128, 128)
(60, 128, 128)
(60, 128, 128)
(60, 128, 128)


## Find the region without very high pixel values


In [12]:
def f_plot_max_values(samples,cutoff=0.994):
    '''
    Make a plot of max values of images of a given set of sample images
    cutoff used to discard high values
    '''
    ### Get max pixel values of images
    max_values=np.array([np.max(i) for i in samples])
    ### Less than cutoff
    lesser_idx=np.where(max_values<cutoff)[0]
    higher_idx=np.where(max_values>=cutoff)[0]
    
    plt.figure()
    plt.plot(lesser_idx,max_values[lesser_idx],linestyle='',marker='*',color='r')
    plt.plot(higher_idx,max_values[higher_idx],linestyle='',marker='D',color='b')

    plt.axhline(y=cutoff,linestyle='--',color='k')
    plt.ylim(0.9,1.0)
    
f_plot_max_values(samples2,0.9966)  ### The full dataset has a max value of just over 0.9966
# f_plot_max_values(samples2,0.9945)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

## Explore image samples

In [13]:
# f_pixel_intensity(samples2,'s2',normalize=True,mode='simple',bins=50)
# f_compare_pixel_intensity([samples2[20:60],samples4,['s2','s4'],normalize=normalize,log_scale=log_scale, mode=mode,bins=bins)
# f_compute_spectrum(samples2)
# f_compare_spectrum([samples2[20:60],samples4],['s2','s4'])

In [14]:
def f_widget_individual(arr,label,idx_range=(0,50),Fig_type='pixel',normalize=True,log_scale=True,rescale=True,mode='simple'):
    '''
    Module to plot pixel intensity or power spectrum for a given sample set of images
    Options for normalization, log-scal, and rescale
    Rescale converts image pixel values from (-1,1) to the original pixel range
    2 Fig_type: pixel-> pixel intensity and spectrum -> power spectrum
    '''
    
    start,end=idx_range[0],idx_range[1]
    print('Index Range %s - %s'%(start,end))
    try :
        sliced_arr=arr[start:end]
        if sliced_arr.shape[0]<1:
            print('Input indices %s %s are invalid.\nUsing full array'%(start,end))
            start0,end=0,'end'
            sliced_arr=arr[:]
    except Exception as e:
        print(e)
    if rescale: ### Converting from pixel intensity range (-1,1) to original range
        sliced_arr=f_invtransform(sliced_arr)
    print('Array size used',sliced_arr.shape)
    
    if Fig_type=='pixel':
        f_pixel_intensity(sliced_arr,label=label+': {0}-{1}'.format(str(start),str(end)),normalize=normalize,log_scale=log_scale,mode=mode)
    elif Fig_type=='spectrum':
        f_compute_spectrum(sliced_arr,label=label+': {0}-{1}'.format(str(start),str(end)),log_scale=log_scale)


In [15]:
interact_manual(f_widget_individual,arr=fixed(samples2),label=fixed('s1'),
                Fig_type=ToggleButtons(options=['pixel','spectrum']),mode=['avg','simple'],
                idx_range=IntRangeSlider(value=(30,70),min=0,max=200,step=1),)

interactive(children=(IntRangeSlider(value=(30, 70), description='idx_range', max=200), ToggleButtons(descript…

<function __main__.f_widget_individual(arr, label, idx_range=(0, 50), Fig_type='pixel', normalize=True, log_scale=True, rescale=True, mode='simple')>

## Compare samples

In [16]:
def f_widget_compare(sample_names,sample_dict,Fig_type='pixel',rescale=True,log_scale=True,bins=25,mode='avg',normalize=True):
    '''
    Module to make widget plots for pixel intensity or spectrum comparison for multiple sample sets
    '''
    img_list=[sample_dict[key] for key in sample_names]
    label_list=list(sample_names)
    
    if rescale: 
        for count,img in enumerate(img_list):
            img_list[count]=f_invtransform(img)
    
    assert Fig_type in ['pixel','spectrum'],"Invalid mode %s"%(mode)
    
    if Fig_type=='pixel':
        f_compare_pixel_intensity(img_lst=img_list,label_lst=label_list,normalize=normalize,log_scale=log_scale, mode=mode,bins=bins)
    elif Fig_type=='spectrum':
        f_compare_spectrum(img_lst=img_list,label_lst=label_list,log_scale=log_scale)
        

def f_get_sample_epochs(samples):
    ''' Module to get list of different epoch slices. Useful to identify training stability'''
    
    size=samples.shape[0]
    img_list,labels_list=[],[]
    for i in range(0,size,10):
        i1,i2=i,i+10
        img_list.append(samples[i1:i2])
#         img_list.append(f_invtransform(samples[i1:i2]))

        labels_list.append('%s:%s'%(str(i1),str(i2)))
    img_list.append(samples)
    labels_list.append('0:end')
    
    return img_list,labels_list

### Compare different epochs

In [17]:
img_list,labels_list=f_get_sample_epochs(samples3)
dict_samples=dict.fromkeys(labels_list)
for key,val in zip(labels_list,img_list): dict_samples[key]=val


### Compare with input
dict_samples['s4']=samples4
interact_manual(f_widget_compare,sample_dict=fixed(dict_samples),
                sample_names=SelectMultiple(options=dict_samples.keys()),
                Fig_type=ToggleButtons(options=['pixel','spectrum']),bins=IntText(value=50),mode=['avg','simple'])

interactive(children=(SelectMultiple(description='sample_names', options=('0:10', '10:20', '20:30', '30:40', '…

<function __main__.f_widget_compare(sample_names, sample_dict, Fig_type='pixel', rescale=True, log_scale=True, bins=25, mode='avg', normalize=True)>

### Compare different sample sets

In [20]:
dict_samples={'s1':samples1, 's2':samples2[35:60],
              's3':samples3[30:60], 's4':samples4}

interact_manual(f_widget_compare,sample_dict=fixed(dict_samples),
                sample_names=SelectMultiple(options=dict_samples.keys()),
                Fig_type=ToggleButtons(options=['pixel','spectrum']),
                bins=SelectionSlider(options=np.arange(10,200,10),value=50),
                mode=['avg','simple'])

interactive(children=(SelectMultiple(description='sample_names', options=('s1', 's2', 's3', 's4'), value=()), …

<function __main__.f_widget_compare(sample_names, sample_dict, Fig_type='pixel', rescale=True, log_scale=True, bins=25, mode='avg', normalize=True)>

### Plot grid of intensity histograms

In [None]:
# f_plot_intensity_grid(samples2[40:80][::5],cols=6)
# f_plot_intensity_grid(f_invtransform(samples2[22:52][::3]),cols=6)