# Extract data from output files
### Analyze the output from a single LBANN run
March 9, 2020

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import subprocess as sp
import os
import glob
import sys

import time
from scipy import fftpack

In [2]:
sys.path.append('/global/u1/v/vpa/project/jpt_notebooks/Cosmology/Cosmo_GAN/LBANN/lbann_cosmogan/3_analysis/')
from modules_image_analysis import *

[NbConvertApp] Converting notebook modules_image_analysis.ipynb to script
[NbConvertApp] Writing 8884 bytes to modules_image_analysis.py


In [3]:
### Transformation functions for image pixel values
def f_transform(x):
    '''
    Squash pixel values: s = 2*x/(x + 4) - 1, with a 1e-5 offset added to the denominator.
    Works elementwise on anything supporting arithmetic (scalars, numpy arrays).
    '''
    numerator=2.*x
    denominator=x + 4. + 1e-5
    return numerator/denominator - 1.

def f_invtransform(s):
    '''
    Approximate inverse of f_transform: x = 4*(1 + s)/(1 - s), with a 1e-5 offset
    added to the denominator so that s = 1 does not divide by zero.
    Works elementwise on anything supporting arithmetic (scalars, numpy arrays).
    '''
    numerator=4.*(1. + s)
    denominator=1. - s + 1e-5
    return numerator/denominator

In [4]:

def f_get_samples(df,key):
    '''
    Extract an array of 2D image samples of one type from the DataFrame with images.

    Images are of two types:
    1. *_gen have shape (64,1,128,128)
    2. *_input have shape (64,16384)
    Only the first image of each stored array (index 0 along the leading axis) is used.

    Arguments:
        df  : DataFrame with a 'type' column and an 'image' column holding numpy arrays
        key : one of 'train_gen','train_input','val_gen','val_input'
    Returns:
        numpy array with one 2D image per matching row of df.
        An empty array is returned when no row matches the key.
    Raises:
        AssertionError if key is not one of the allowed keys.
    '''
    
    keys=['train_gen','train_input','val_gen','val_input']
    assert key in keys,"Given key %s is not in the list of keys %s"%(key,keys)
    
    lst=df[df.type==key]['image'].values
    
    ### Nothing stored for this key: return an empty array instead of failing on lst[0]
    if len(lst)==0:
        return np.array([])
    
    if key.endswith('input'):
        ### Flattened images: recover the side length from the last dimension (=128 for 16384 pixels)
        ### Builtin int is used because np.int no longer exists in recent numpy versions
        size=int(np.sqrt(lst[0].shape[-1]))
        samples=np.array([ii[0,:].reshape(size,size) for ii in lst])
    else : 
        samples=np.array([ii[0,0,:,:] for ii in lst])
    
    return samples

## Extract image data 

In [5]:
# Candidate result folders, in the order they were listed; the last entry is the one analysed.
_run_folders=('20200316_112134_exagan',
              '20200331_131011_exagan',
              '20200403_132121_exagan',
              '20200406_080207_exagan')
fldr_name=_run_folders[-1]

# f_list=['20200401_125919_exagan_0.1_1','20200401_130321_exagan_0.1_4',
#         '20200401_130907_exagan_0.3_1','20200401_130646_exagan_0.3_4']
# fldr_name=f_list[0]

# Folder holding the dumped .npy outputs of the selected run
main_dir='/global/cfs/cdirs/m3363/vayyar/cosmogan_data/results_data/{0}/dump_outs/'.format(fldr_name)
print(main_dir)


/global/cfs/cdirs/m3363/vayyar/cosmogan_data/results_data/20200406_080207_exagan/dump_outs/


In [6]:

## Get image files (.npy arrays) for each image type in the dump_outs folder
## Produces:
##   files_dict : image type -> array of matching file names
##   df_files   : one row per file (type, epoch, step, fname), sorted by type, epoch, step
##   df         : same rows as df_files plus an 'image' column with the loaded arrays
files_dict={}
keys=['train_gen','train_input','val_gen','val_input']
file_strg_lst=['model0-training*-gen_img*-output0.npy','model0-training*-inp_img*-output0.npy','model0-validation*-gen_img*-output0.npy','model0-validation*-inp_img*-output0.npy']
for key,file_strg in zip(keys,file_strg_lst):
    files_dict[key]=np.array(glob.glob(main_dir+file_strg))
    if files_dict[key].shape[0]>1000 : 
        print('Warning the number of files is very large. Possibility of memory overload')

dict1={}
t1=time.time()
### First get sorted Dataframe with file names
### Rows are collected in a list and the DataFrame is built once:
### DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call
file_records=[]
for key in keys: 
    files_arr=files_dict[key]  # Get array of files
    print(key,len(files_arr))
    for fname in files_arr:
        ### Extract the Epoch number and step number from the file name
        ### (the text between 'epoch' / 'step' and the next '-')
        dict1={'type':key,
               'epoch':int(fname.split('epoch')[-1].split('-')[0]),
               'step':int(fname.split('step')[-1].split('-')[0]),
               'fname':fname}
        file_records.append(dict1)
### Explicit columns keep the sort below valid even when no files were found
df_files=pd.DataFrame(file_records,columns=['type','epoch','step','fname'])
## Sort values
df_files=df_files.sort_values(by=['type','epoch','step']).reset_index(drop=True)
# df_files
print("Sorting done")
t2=time.time()
### Then read images one by one into a numpy array and create a new DataFrame
sorted_fnames=df_files.fname.values
### Read images one by one. This is time-consuming.
### Deliberately kept as list because some of the input arrays have different dimensions, causing creation of array of arrays in some cases
images=[np.load(fname) for fname in sorted_fnames]  

##### Create new Dataframe with sorted images
df=pd.DataFrame([])
df['image']=images
t3=time.time()
for col in ['epoch','step','type','fname']: df[col]=df_files[col].values

    
# df['epoch']=df_files.epoch.values
# df['step']=df_files.step.values
# df['type']=df_files.type.values
# df['image'].values[0].shape
print("Extraction done")
### Timings: image loading time, then file listing/sorting time
print(t3-t2,t2-t1)

train_gen 1847
train_input 1847
val_gen 208
val_input 208
Sorting done
Extraction done
152.9934754371643 19.918874740600586


## Extract samples 

In [7]:
# df_files.head(30)

In [None]:
### Available options : keys=['train_gen','train_input','val_gen','val_input']
### Extract one array of 2D images per image type from df and print its shape
### samples1 : training inputs, samples2 : validation generated images
samples1=f_get_samples(df,'train_input')
print(samples1.shape)
samples2=f_get_samples(df,'val_gen')
print(samples2.shape)

### samples3 : training generated images, samples4 : validation inputs
samples3=f_get_samples(df,'train_gen')
print(samples3.shape)
samples4=f_get_samples(df,'val_input')
print(samples4.shape)

## Compare images

In [None]:
### Pixel intensity of training inputs (samples1) and validation generated images (samples2)
### f_pixel_intensity / f_compare_pixel_intensity are not defined in this notebook;
### presumably they come from the modules_image_analysis star import -- verify there
f_pixel_intensity(samples1,normalize=False)
f_pixel_intensity(samples2,normalize=False)

### Both sets in a single call, labelled 'input' and 'generated', this time with normalize=True
f_compare_pixel_intensity(samples1,samples2,label1='input',label2='generated',normalize=True)
# plt.savefig('comparison_intensity.png')


In [None]:

def f_plot_intensity_grid(arr,cols=5):
    '''
    Plot the pixel-value histogram of every image in arr, one subplot per image.

    Arguments:
        arr  : array of images, indexed along the first axis
        cols : number of subplot columns (1 <= cols <= number of images)
    Subplots are filled row by row. Panels left over in the last row stay empty.
    Each histogram uses 25 bins with raw counts (no normalization).
    Raises:
        AssertionError if cols is less than 1 or larger than the number of images.
    '''
    
    num=arr.shape[0]
    assert cols<=num, "cols %s greater than array size %s"%(cols,num)
    assert cols>=1, "cols %s must be at least 1"%(cols)
    
    ### Smallest number of rows that holds all images (integer ceiling of num/cols)
    rows=(num+cols-1)//cols
#     print("Plotting %s images" %(rows*cols))
    ### squeeze=False keeps axarr 2D even for a single row or a single column
    fig,axarr=plt.subplots(rows,cols,figsize=(12,12),constrained_layout=True,squeeze=False)
    ### axarr.flat runs row by row, so image i lands on row i//cols, column i%cols
    for img_arr,ax in zip(arr,axarr.flat):
        ### Get histogram
        hist, bin_edges = np.histogram(img_arr.flatten(), bins=25, density=False)
        centers = (bin_edges[:-1] + bin_edges[1:]) / 2
        ax.errorbar(centers,hist,fmt='o-')

### Histogram grid for 20 validation generated images (indices 20 to 39), 8 panels per row
f_plot_intensity_grid(samples2[20:40],cols=8)

In [None]:
# f_pixel_intensity(f_invtransform(samples3[400:]),normalize=False)

In [None]:
### Pixel intensity after mapping values back through f_invtransform
### samples1 : training inputs, samples2 : validation generated images
f_pixel_intensity(f_invtransform(samples1),normalize=False)
### NOTE(review): this is the only call in the cell with normalize=True -- confirm it is intended
f_pixel_intensity(f_invtransform(samples2),normalize=True)
### Same plot for consecutive slices of samples2: [0,30), [30,60), [60,90) and the rest
f_pixel_intensity(f_invtransform(samples2[:30]),normalize=False)
f_pixel_intensity(f_invtransform(samples2[30:60]),normalize=False)
f_pixel_intensity(f_invtransform(samples2[60:90]),normalize=False)
f_pixel_intensity(f_invtransform(samples2[90:]),normalize=False)

In [None]:
### Compare training inputs (samples1) with validation generated images (samples2)
### f_compare_spectrum is not defined in this notebook; presumably it comes from
### the modules_image_analysis star import -- verify there
f_compare_spectrum(samples1,samples2,label1='input',label2='generated')
# ? f_compare_spectrum

### Inspect individual sample arrays

In [None]:
### Call f_compute_spectrum separately on training inputs and validation generated images
### f_compute_spectrum is not defined in this notebook; presumably it comes from
### the modules_image_analysis star import -- verify there
f_compute_spectrum(samples1)
f_compute_spectrum(samples2)