# Code to explore structure of hdf5 data files

June 9, 2020: Adding Gaussian smoothing

In [1]:
import numpy as np
import h5py
import os

import glob
import time

from scipy.ndimage import gaussian_filter   ### For gausian filtering

import matplotlib.pyplot as plt

In [2]:
%matplotlib widget

In [3]:
### Explore the hdf5 file
def f_explore_file(fname):
    '''
    Print a summary of the structure of an hdf5 file.

    The files explored in this notebook have
    Attributes : ['dataset_tag','seed9','universe_tag']
    Keys : ['full', 'namePar', 'physPar', 'redshifts', 'unitPar']

    'full' is an array of shape (512,512,512,4)
    The last index 4 corresponds to red-shift. Eg. 0, 0.5, 1.5, 3.0

    Arguments:
        fname : path of the hdf5 file; it is opened read-only

    Prints the attributes, the keys, the shape of 'full' and the contents of
    'namePar', 'physPar', 'redshifts' and 'unitPar'. Returns None.
    Raises KeyError (from h5py) if one of those keys is absent from the file.
    '''
    ### The context manager closes the file handle even if a key lookup fails
    with h5py.File(fname,'r') as dta:

        ### Attributes
        attrs=dta.attrs
        print('Attributes',[(i,attrs[i]) for i in attrs])

        ### Keys
        keys=dta.keys()
        print('\nKeys',keys)

        print("\nThe key: 'full' ")
        ### Only the shape is read here; the array itself is not loaded
        print('Shape of the array',dta['full'].shape)

        print('\nOther keys')
        for key in ['namePar', 'physPar', 'redshifts', 'unitPar']:
            ### [:] reads the full (small) dataset into memory for printing
            print(key,dta[key][:])

    

#### Sample exploration of files

In [15]:
### NOTE: fname is assigned twice; the second assignment overrides the first,
### so only the cosmoUniverse_2020_08_4parEgrid file is explored below
fname='/global/cfs/cdirs/m3363/www/cosmoUniverse_2020_11_4parE_cGAN/Sg0.5/univ_ics_2019-03_a10582192.hdf5'
fname='/global/cfs/cdirs/m3363/www/cosmoUniverse_2020_08_4parEgrid/Om0.15_Sg0.5_H100.0/univ_ics_2019-03_a16305120.hdf5'
f_explore_file(fname)

Attributes [('dataset_tag', b'4parE'), ('seed9', 16305120), ('universe_tag', b'4parE_33007379-11')]

Keys <KeysViewHDF5 ['full', 'namePar', 'physPar', 'redshifts', 'unitPar']>

The key: 'full' 
Shape of the array (512, 512, 512, 4)

Other keys
namePar [b'Omega_m' b'sigma_8' b'N_spec' b'H_0']
physPar [  0.15   0.5    0.96 100.  ]
redshifts [0.  0.5 1.5 3. ]
unitPar [-0.5        -0.375       0.          0.42857143]


In [5]:
### Ratio of a 512^3 volume to a 64^3 volume, i.e. the number of 64^3 sub-cubes per 512^3 cube
512**3/(64**3)

512.0

### Read in list of files

In [9]:
### Location of hdf5 files
### Directory that holds the hdf5 files
data_dir='/global/project/projectdirs/m3363/www/cosmoUniverse_2019_08_const/'
### Collect every *.hdf5 file directly inside data_dir
### (data_dir already ends with '/', so the joined pattern is data_dir+'*.hdf5')
f_list=glob.glob(os.path.join(data_dir,'*.hdf5'))
### Number of files found
len(f_list)


200

In [14]:
### Shape of one red-shift slice of 'full' (first three axes).
### Read from the dataset's shape metadata so the 512^3 slice is not loaded from disk,
### and use a context manager so the file handle is closed afterwards.
with h5py.File(f_list[0],'r') as h5f:
    print(h5f['full'].shape[:3])

(512, 512, 512)

In [10]:
# for i in f_list[:5]:
#     f_explore_file(i)

### Exploring Gaussian filtering
**Gaussian blurring**: https://en.wikipedia.org/wiki/Gaussian_blur#:~:text=In%20image%20processing%2C%20a%20Gaussian,image%20noise%20and%20reduce%20detail \
**Paper using it**: https://arxiv.org/abs/1801.09070


In [12]:
### Load the whole 'full' dataset into memory as a numpy array.
### np.array copies the data, so the file can be closed as soon as the copy is made.
with h5py.File(fname,'r') as dta:
    arr=np.array(dta['full'])


In [23]:
%timeit filtered_arr=gaussian_filter(arr, sigma=0.5,mode='wrap')

48.9 s ± 1.38 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [40]:
def f_compare_pixel_intensity(img_lst,label_lst=('img1','img2'),bkgnd_arr=None,log_scale=True, normalize=True, mode='avg',bins=25, hist_range=None):
    '''
    Compute and plot pixel-intensity histograms for a list of image arrays.

    Has 2 modes : simple and avg
    simple mode: No errors. Flatten each input array and compute one histogram of the full data
    avg mode(Default) :
        - Compute a histogram for each image in the array (iterating over its first axis)
        - Plot the mean over those histograms, with error = std/sqrt(number of images)

    Arguments:
        img_lst : list of image arrays, one per dataset to plot
        label_lst : legend labels, paired with img_lst by zip (unpaired extras are ignored)
        bkgnd_arr : optional array; its histogram is plotted in black, with a +/- error band in avg mode
        log_scale : if True, both axes are set to log scale
        normalize : passed to np.histogram as density
        mode : 'simple' or 'avg'
        bins : passed to np.histogram as bins
        hist_range : (min,max) range of the histogram. If None, avg mode uses the min and max of
                     each dataset separately and simple mode leaves the range to np.histogram

    Draws on a new matplotlib figure and returns None.
    Raises ValueError if mode is not 'simple' or 'avg'.
    '''

    ### An unknown mode would otherwise silently produce an empty figure
    if mode not in ('simple','avg'):
        raise ValueError("mode must be 'simple' or 'avg', got %r"%(mode,))

    norm=normalize # Whether to normalize the histogram

    def f_batch_histogram(img_arr,bins,norm,hist_range):
        ''' Compute mean, standard error and bin centers of the histograms of a batch of images'''

        ## Extracting the range. This is important to ensure that the different histograms are compared correctly
        if hist_range is None : ulim,llim=np.max(img_arr),np.min(img_arr)
        else: ulim,llim=hist_range[1],hist_range[0]

        ### Histogram of each image. Counts and edges are kept separate: they have
        ### different lengths (bins vs bins+1), so they cannot share one numpy array
        hist_lst,bin_edges=[],None
        for img in img_arr:
            counts,bin_edges=np.histogram(np.ravel(img), bins=bins, range=(llim,ulim), density=norm) ## range is important
            hist_lst.append(counts)
        hist=np.stack(hist_lst)

        ### Compute statistics over histograms of individual images
        mean,err=np.mean(hist,axis=0),np.std(hist,axis=0)/np.sqrt(hist.shape[0])
        ### Bins and range are the same for every image, so any one set of edges gives the centers
        centers = (bin_edges[:-1] + bin_edges[1:]) / 2

        return mean,err,centers

    plt.figure()

    ## Plot background distribution
    if bkgnd_arr is not None:
        if mode=='simple':
            ### np.ravel avoids the extra copy that flatten() would make of a large array
            hist, bin_edges = np.histogram(np.ravel(bkgnd_arr), bins=bins, density=norm, range=hist_range)
            centers = (bin_edges[:-1] + bin_edges[1:]) / 2
            plt.errorbar(centers, hist, color='k',marker='*',linestyle=':', label='bkgnd')

        elif mode=='avg':
            ### Compute histogram for each image.
            mean,err,centers=f_batch_histogram(bkgnd_arr,bins,norm,hist_range)
            plt.plot(centers,mean,linestyle=':',color='k',label='bkgnd')
            plt.fill_between(centers, mean - err, mean + err, color='k', alpha=0.4)

    ### Plot the rest of the datasets
    for img,label in zip(img_lst,label_lst):
        if mode=='simple':
            hist, bin_edges = np.histogram(np.ravel(img), bins=bins, density=norm, range=hist_range)
            centers = (bin_edges[:-1] + bin_edges[1:]) / 2
            plt.errorbar(centers, hist, fmt='o-', label=label)

        elif mode=='avg':
            ### Compute histogram for each image.
            mean,err,centers=f_batch_histogram(img,bins,norm,hist_range)
            plt.errorbar(centers,mean,yerr=err,fmt='o-',label=label)

    if log_scale:
        plt.yscale('log')
        plt.xscale('log')

    plt.legend()
    plt.xlabel('Pixel value')
    plt.ylabel('Counts')
    plt.title('Pixel Intensity Histogram')

In [46]:
### Overlay the pixel-intensity histograms of the raw and filtered arrays.
### mode='simple' histograms each flattened array once, without error bars.
f_compare_pixel_intensity([arr,filtered_arr],label_lst=['raw','filtered'],mode='simple',normalize=True)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [11]:
### fname is re-bound here to a .npy file
fname='/global/cfs/cdirs/m3363/vayyar/cosmogan_data/raw_data/3d_data/dataset5_3dcgan_4univs_64cube_simple_splicing/Om0.3_Sg0.5_H70.0.npy'
### mmap_mode='r' memory-maps the file read-only instead of reading it all into memory
a1=np.load(fname,mmap_mode='r')

In [12]:
### Inspect the shape of the memory-mapped array
print(a1.shape)

(16384, 1, 64, 64, 64)


32.0

In [None]:
512**3/