# Extract data from output files
### Analyze the output from a single LBANN run
March 9, 2020 \
April 6, 2020 : to store files in order of epochs \
April 21, 2020: added jupyter widgets to compare pixel intensity plots \
May 8, 2020: using all images for a given batch \
May 29, 2020: Modified for new update of LBANN. File names of images changed, so new extraction code. Also added code for computing chi-squared. \
June 17, 2020: Removed train_inp, train_gen and val_inp to reduce memory overhead. From now on, the code only analyzes val_gen \
June 26, 2020: Added gathering of steps and new chi-square quantities.\
July 1, 2020: Switched back to storing mainly train_gen with large steps (10 steps saved for 256 batchsize).\
July 29, 2020: Perform analysis without storing images. Store histograms

In [1]:
import numpy as np
import matplotlib.pyplot as plt
# import seaborn as sns
import pandas as pd

import subprocess as sp
import os
import glob
import sys

import itertools
import time


In [2]:
from pandarallel import pandarallel

In [3]:
# %matplotlib widget

In [4]:
sys.path.append('/global/u1/v/vpa/project/jpt_notebooks/Cosmology/Cosmo_GAN/repositories/lbann_cosmogan/3_analysis')
from modules_image_analysis import *

In [5]:
### Transformation functions for image pixel values
def f_transform(x):
    '''
    Forward transformation of pixel values : s = 2x/(x+4) - 1
    The 1e-8 term in the denominator keeps it non-zero at x = -4
    Works on scalars and on numpy arrays (element-wise)
    '''
    numerator=2.*x
    denominator=x + 4. + 1e-8
    return numerator/denominator - 1.

def f_invtransform(s):
    '''
    Inverse of the pixel transformation : x = 4(1+s)/(1-s)
    The 1e-8 term in the denominator keeps it non-zero at s = 1
    Works on scalars and on numpy arrays (element-wise)
    '''
    numerator=4.*(1. + s)
    denominator=1. - s + 1e-8
    return numerator/denominator

### Modules for Extraction

In [17]:
def f_get_sorted_df(main_dir):
    
    '''
    Module to create Dataframe with filenames for each epoch and step
    Sorts by image type, epoch and step

    Input : main_dir : results folder of a run. Files are searched in <main_dir>/dump_outs/trainer0/model0/
    Output: Dataframe with columns ['epoch','step','img_type','fname'], one row per matching file.
            If no file matches, an empty Dataframe with the same columns is returned.
    '''
    def f_get_info_from_fname(fname):
        ''' Parse the file name (the file is not opened) and return dictionary with epoch, step'''
        ### The values are the tokens that follow the last 'epoch.' and 'step.' in the name
        ### NOTE(review): assumes names of the form ...epoch.<E>.step.<S>_... -- confirm against the LBANN dump format
        dict1={}
        dict1['epoch']=np.int32(fname.split('epoch')[-1].split('.')[1])
        dict1['step']=np.int64(fname.split('step')[-1].split('.')[1].split('_')[0])
        return dict1
    
    t1=time.time()
    ### get list of file names
    fldr_loc=main_dir+'/dump_outs/trainer0/model0/'
    ### Only train_gen is collected. Patterns previously used for the other image types:
    ###   'train_input':'sgd.training*_inp_img*_output0.npy'
    ###   'val_gen'    :'sgd.validation*_gen_img*_output0.npy'
    ###   'val_input'  :'sgd.validation*_inp_img*_output0.npy'
    file_strg_dict={'train_gen':'sgd.training*_gen_img*_output0.npy'}
    keys=['train_gen']

    ### Plain lists instead of repeated np.append (which copies the full array on every call)
    files_lst,img_lst=[],[]
    for key in keys:
        print(key)
        files=glob.glob(fldr_loc+file_strg_dict[key])
        files_lst.extend(files)
        img_lst.extend([key]*len(files))

    print('Number of files',len(files_lst))

    col_order=['epoch','step','img_type','fname']
    ### Nothing found (wrong folder or run without dumps) : return an empty table instead of failing on row 0
    if not files_lst:
        return pd.DataFrame(columns=col_order)

    ### Create dataframe
    df_files=pd.DataFrame()
    df_files['img_type']=np.array(img_lst)
    df_files['fname']=np.array(files_lst).astype(str)

    ### Add epoch and step columns parsed from each file name
    info_lst=[f_get_info_from_fname(fname) for fname in files_lst]
    for col in ['epoch','step']:
        df_files[col]=[info[col] for info in info_lst]

    df_files=df_files.sort_values(by=['img_type','epoch','step']).reset_index(drop=True) ### sort df by epoch and step
    
    t2=time.time()
    print("time for sorting",t2-t1)

    return df_files[col_order]


In [8]:
def f_compute_hist_spect(sample,bins):
    ''' Compute pixel intensity histogram and spectrum for a batch of images
    Input : Image arrays and histogram bin edges
    Output: dictionary with 5 entries :
            'hist_val','hist_err','hist_bin_centers' : the three outputs of f_batch_histogram (normalized histogram)
            'spec_val','spec_err' : the two outputs of f_compute_spectrum
    '''
    ### Pixel histogram of the batch
    hist_val,hist_err,bin_centers=f_batch_histogram(sample,bins=bins,norm=True,hist_range=None)
    ### Spectrum of the batch (no plotting)
    spec_val,spec_err=f_compute_spectrum(sample,plot=False)

    return {
        'hist_val':hist_val,
        'hist_err':hist_err,
        'hist_bin_centers':bin_centers,
        'spec_val':spec_val,
        'spec_err':spec_err,
    }

def f_get_images(fname,img_type):
    '''
    Extract images using file name
    Input : fname    : path of the .npy file to load
            img_type : image type string. Must end with 'input' or 'gen'
    Output: array of images with 3 axes (image index, row, column)
    Raises SystemError if img_type has neither ending
    '''
    a1=np.load(fname)
    if img_type.endswith('input'):
        ### Images are stored flattened along the last axis : recover the side length of the square image
        ### Builtin int is used here : the np.int alias was removed in NumPy 1.24
        size=int(np.sqrt(a1.shape[-1]))
        batch_size=a1.shape[0] ### Number of images in the file
        samples=a1.reshape(batch_size,size,size)
    elif img_type.endswith('gen'):
        ### Keep index 0 of the second axis (presumably a single channel -- TODO confirm)
        samples=a1[:,0,:,:]
    else:
        raise SystemError("img_type must end with 'input' or 'gen', got {0!r}".format(img_type))
    
    return samples
    
def f_high_pixel(images,cutoff=0.9966):
    '''
    Get number of images that have at least one pixel above the cut-off value
    Input : images : array with 3 axes (image index, row, column), cutoff : threshold (strictly greater than)
    Output: integer count
    '''
    ### Largest pixel of every image, then count how many exceed the threshold
    per_image_max=np.amax(images,axis=(1,2))
    return int(np.count_nonzero(per_image_max>cutoff))

def f_compute_chisqr(dict_val,dict_sample):
    '''
    Compute chi-square values for sample w.r.t reference (input) images
    Input : 2 dictionaries (reference, sample) with keys 'hist_val','hist_err','spec_val','spec_err'
    Output: dictionary with keys
        chi_1a, chi_1b, chi_1c : histogram chi-square over the first 30%, middle 40% and last 30% of the bins
        chi_1      : histogram chi-square over all bins
        chi_2      : sum of squared histogram differences (no denominator)
        chi_imgvar : summed sample histogram errors divided by summed reference histogram errors
        chi_spec1  : squared spectrum difference / sample spectrum squared, summed over the first 60 entries
        chi_spec2  : squared spectrum difference / sample spectrum error squared, summed over the first 60 entries

    !! Both pixel histograms MUST have same bins and normalization !!
    Histogram chi-square :: sum((Obs-Val)^2/(Val)), as in the keras code :
        np.sum(np.divide(np.power(valhist - samphist, 2.0), valhist))
    '''
    ref_hist=dict_val['hist_val']

    ### Denominator : copy of the reference histogram with empty bins set to 1 to avoid division by zero
    denom=ref_hist.copy()
    denom[denom<=0.]=1.0

    hist_sq_diff=(ref_hist-dict_sample['hist_val'])**2

    ### Bin ranges : small, medium, large pixel values and the full histogram
    n_bins=len(ref_hist)
    lo,hi=int(n_bins*0.3),int(n_bins*0.7)
    bin_ranges={
        'chi_1a':slice(0,lo),
        'chi_1b':slice(lo,hi),
        'chi_1c':slice(hi,None),
        'chi_1':slice(0,None),
    }
    chisqr_dict={name:np.sum(np.divide(hist_sq_diff[rng],denom[rng])) for name,rng in bin_ranges.items()}

    ### Number of histogram bins to use for the next two quantities. None = all. Eg : -5 to skip last 5 bins
    hist_stop=None
    chisqr_dict['chi_2']=np.sum(np.divide(hist_sq_diff[:hist_stop],1.0)) ## chi-sqr without denominator division
    ## measures total spread in histograms wrt to input data
    chisqr_dict['chi_imgvar']=np.sum(dict_sample['hist_err'][:hist_stop])/np.sum(dict_val['hist_err'][:hist_stop])

    ### Spectral chi-squares use only the first 60 spectrum entries
    spec_stop=60
    spec_sq_diff=(dict_val['spec_val']-dict_sample['spec_val'])**2
    ### normalized by the sample spectrum
    chisqr_dict['chi_spec1']=np.sum(spec_sq_diff[:spec_stop]/dict_sample['spec_val'][:spec_stop]**2)
    ### normalized by the sample spectrum error
    chisqr_dict['chi_spec2']=np.sum(spec_sq_diff[:spec_stop]/dict_sample['spec_err'][:spec_stop]**2)

    return chisqr_dict

    
def f_get_computed_dict(fname,img_type,bins,dict_val):
    '''
    Process one file : load its images and compute all summary quantities
    Input : fname, img_type : passed to f_get_images
            bins            : histogram bin edges
            dict_val        : histogram/spectrum dictionary of the reference images
    Output: dictionary holding, in this order, the chi-square values, 'num_large'
            and the histogram and spectrum arrays of the sample
    '''
    images=f_get_images(fname,img_type)
    ### Number of images with a pixel above 0.9898 (note : not the default cutoff of f_high_pixel)
    num_large=f_high_pixel(images,cutoff=0.9898)
    ### Histogram and spectrum of the batch (dictionary of 5 numpy arrays)
    dict_sample=f_compute_hist_spect(images,bins)

    ### Start from the chi-squares w.r.t the reference and append the remaining entries
    result=dict(f_compute_chisqr(dict_val,dict_sample))
    result['num_large']=num_large
    result.update(dict_sample)

    return result

## Extract image data 

In [14]:
### Run folder to analyze. Keep exactly one assignment active : a second active line silently overrides the first.
### The trailing '/' keeps main_dir usable as a prefix for file names.
# fldr_name='20200718_114324_batchsize_512'
# fldr_name='20200718_135530_batchsize_256'
# fldr_name='20200725_204329_batchsize_256'
# fldr_name='20200725_172458_batchsize_64'
# fldr_name='20200803_055550_batchsize_256'
fldr_name='20200804_152954_batchsize_256/'

main_dir='/global/cfs/cdirs/m3363/vayyar/cosmogan_data/results_data/{0}'.format(fldr_name)
print(main_dir)

/global/cfs/cdirs/m3363/vayyar/cosmogan_data/results_data/20200725_172458_batchsize_64/


In [21]:
### Get dataframe with file names, sorted by epoch and step
### One row per dumped image file found under main_dir; columns : epoch, step, img_type, fname
df_files=f_get_sorted_df(main_dir)
print(df_files.shape)

train_gen
63440 63440
time for sorting 3.073791027069092
(63440, 4)


In [20]:
### Load the reference images that the generated images are compared against
### The file is memory-mapped; the first 8000 entries are selected, keeping index 0 of the second axis
fname='/global/cfs/cdirs/m3363/vayyar/cosmogan_data/raw_data/128_square/dataset_2_smoothing_200k/norm_1_train_val.npy'
s_val=np.load(fname,mmap_mode='r')[:8000,0,:,:]
print(s_val.shape)

(8000, 128, 128)


### Compute 

In [11]:
t1=time.time()

### Bin edges to use for the pixel histogram, defined in original pixel units :
### unit-width bins up to 20.5, then widths of 5 and 50, and a last edge at 2000
bins=np.concatenate([
    np.array([-0.5]),
    np.arange(0.5,20.5,1),
    np.arange(20.5,100.5,5),
    np.arange(100.5,1000.5,50),
    np.array([2000]),
])
# bins=np.concatenate([np.array([-0.5]),np.arange(0.5,20.5,5),np.arange(20.5,100.5,20),np.arange(100.5,1000.5,100),np.array([2000])]) #bin edges to use

### transform=False : images are in transformed space (-1,1), so the bin edges are converted to the same space
transform=False
if not transform:
    bins=f_transform(bins)   ### scale to (-1,1)

### Reference histogram and spectrum, computed once and reused for every file
dict_val=f_compute_hist_spect(s_val,bins)

### Serial CPU test

In [None]:
### Work on an independent copy of the first 10 rows
df=df_files.head(10).copy()

t2=time.time()
### One dictionary of computed quantities per file (row)
dict1=df.apply(
    lambda row: f_get_computed_dict(fname=row.fname,img_type='train_gen',bins=bins,dict_val=dict_val),
    axis=1,
)
keys=dict1[0].keys()
### Convert the per-row dicts to one list per quantity
dict_list={key:[row_dict[key] for row_dict in dict1] for key in keys}
### Add one column per quantity to the Dataframe
for key in dict_list:
    df[key]=dict_list[key]

t3=time.time()
print("Time ",t3-t2)

In [None]:
### Preview the first 5 rows of the processed dataframe
df.head(5)

### Parallel CPU test
Using pandarallel : https://stackoverflow.com/questions/26784164/pandas-multiprocessing-apply


In [16]:
### Work on an independent copy of the first 20 rows
df=df_files.head(20).copy()

### pandarallel must be initialized before parallel_apply is available on the Dataframe
pandarallel.initialize(progress_bar=True)

t2=time.time()
### Same computation as the serial test, with the rows distributed over the pandarallel workers
dict1=df.parallel_apply(
    lambda row: f_get_computed_dict(fname=row.fname,img_type='train_gen',bins=bins,dict_val=dict_val),
    axis=1,
)
keys=dict1[0].keys()
print(keys)
### Convert the per-row dicts to one list per quantity
dict_list={key:[row_dict[key] for row_dict in dict1] for key in keys}
### Add one column per quantity to the Dataframe
for key in dict_list:
    df[key]=dict_list[key]

t3=time.time()
print("Time ",t3-t2)
df.head(5)

INFO: Pandarallel will run on 64 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1), Label(value='0 / 1'))), HBox(c…

dict_keys(['chi_1a', 'chi_1b', 'chi_1c', 'chi_1', 'chi_2', 'chi_imgvar', 'chi_spec1', 'chi_spec2', 'num_large', 'hist_val', 'hist_err', 'hist_bin_centers', 'spec_val', 'spec_err'])
Time  1.446944236755371


Unnamed: 0,epoch,step,img_type,fname,chi_1a,chi_1b,chi_1c,chi_1,chi_2,chi_imgvar,chi_spec1,chi_spec2,num_large,hist_val,hist_err,hist_bin_centers,spec_val,spec_err
0,0,0,train_gen,/global/cfs/cdirs/m3363/vayyar/cosmogan_data/r...,189.955298,4.778167,0.013186,194.746651,12.417616,21.439721,18236.350265,35499750.0,0,"[0.00021028518730998038, 0.058279842228064906,...","[4.0617537070536625e-05, 0.001978581126841826,...","[-1.031746031584782, -0.6161616169043975, -0.3...","[56690758.12141779, 2306.343328230083, 1060.85...","[41895.07346110348, 69.45253457053425, 36.8556..."
1,0,1,train_gen,/global/cfs/cdirs/m3363/vayyar/cosmogan_data/r...,189.319593,6.005049,0.013186,195.337827,12.1706,22.901184,18164.160802,35325050.0,0,"[0.0003285706051718441, 0.06978061805983715, 0...","[4.1581796383402746e-05, 0.0024182402181228547...","[-1.031746031584782, -0.6161616169043975, -0.3...","[56823907.517639935, 2174.3364949659444, 980.9...","[41871.344270795744, 89.13511950909188, 33.405..."
2,0,2,train_gen,/global/cfs/cdirs/m3363/vayyar/cosmogan_data/r...,174.543797,4.840065,0.013186,179.397048,11.733181,22.045882,36208.258443,68420880.0,0,"[0.0002290606504626572, 0.07263958465321514, 0...","[2.8931124514245006e-05, 0.0026939925494827064...","[-1.031746031584782, -0.6161616169043975, -0.3...","[58477676.14610486, 778.594840059377, 658.1762...","[25157.625568626616, 32.80743395783755, 24.049..."
3,0,3,train_gen,/global/cfs/cdirs/m3363/vayyar/cosmogan_data/r...,155.529899,4.09153,0.013186,159.634615,11.026773,17.925114,36735.654257,58854970.0,0,"[0.0020108521036516867, 0.10408231632053087, 0...","[0.00011222143762225864, 0.002870345820699504,...","[-1.031746031584782, -0.6161616169043975, -0.3...","[60850329.67851047, 819.156503082253, 644.8699...","[32182.221657235295, 35.85308697820658, 21.235..."
4,0,4,train_gen,/global/cfs/cdirs/m3363/vayyar/cosmogan_data/r...,138.318907,3.584549,0.013186,141.916641,10.247189,17.516094,15366.733078,29794240.0,0,"[0.006141453997240586, 0.12620168943771828, 0....","[0.0003093124891087808, 0.003087684381940943, ...","[-1.031746031584782, -0.6161616169043975, -0.3...","[63578109.04957635, 3592.258982752144, 1342.44...","[77600.30036902921, 97.77704891208928, 40.8790..."


In [None]:
# from pandarallel import pandarallel
# pandarallel.initialize()
# def func(x):
#     return np.sin(x**2)

# df.parallel_apply(func,axis=1)

### GPU test

In [None]:
# import cudf; print('cuDF Version:', cudf.__version__)

In [None]:
# def f_get_computed_dict_gpu(fname,img_type,num_large,dict_val,bins):
#     '''
#     '''
    
#     ### Get images from file
#     for i,(_fname,_img_type) in enumerate(zip(fname,img_type)):
#         images=f_get_images(_fname,_img_type)    
#         ### Compute high pixel values
#         high_pixel=f_high_pixel(images,cutoff=0.9898)
#         ### Compute spectrum and histograms
# #         dict_sample=f_compute_hist_spect(images,bins) ## list of 4 numpy arrays 
#         ### Compute chi squares
# #         dict_chisqrs=f_compute_chisqr(dict_val,dict_sample)

# #         dict1.update(dict_chisqrs)
# #         dict1.update({'num_large':high_pixel})
# #         dict1.update(dict_sample)
    
#         num_large[i]=high_pixel
    
    
    
# t1=time.time()
# df=df_files.copy().head(50)
# df = cudf.DataFrame.from_pandas(df)

# dict1={}
# t2=time.time()
# df_temp=df.apply_rows(f_get_computed_dict_gpu,
#                     incols=['fname','img_type'],
#                     outcols={'num_large':np.float64},
#                     kwargs={'dict_val':dict_val,'bins':bins})
# # keys=dict1[0].keys()
# # ### Convert list of dicts to dict of lists
# # dict_list={key:[k[key] for k in dict1] for key in keys}
# # ### Add columns to Dataframe
# # for key in dict_list.keys():
# #     df[key]=dict_list[key]
    
# t3=time.time()
# print("Time ",t3-t2)

### Save file

In [None]:
### Save to file
# df.to_csv(main_dir+'df_processed.csv',sep=',',index=False)
### epoch and step are stored as plain integers
df[['epoch','step']]=df[['epoch','step']].astype(int)
### os.path.join gives the right path whether or not main_dir ends with '/'
df.to_pickle(os.path.join(main_dir,'df_processed.pkle'))

In [None]:
### Load data
# df_2=pd.read_csv(main_dir+'df_processed.csv',sep=',')
### os.path.join gives the right path whether or not main_dir ends with '/'
### NOTE : unpickling executes code from the file -- only load pickle files written by this analysis
df_2=pd.read_pickle(os.path.join(main_dir,'df_processed.pkle'))

In [None]:
### Preview the first rows of the reloaded dataframe to check the save/load round trip
df_2.head()