# Code to extract slices from hdf5 data files

April 7, 2020

In [1]:
import numpy as np
import h5py
import os

import glob
import argparse
import time

In [2]:

### Location of hdf5 files
data_dir='/global/project/projectdirs/dasrepo/vpa/cosmogan/data/raw_data/'
### Extract list of hdf5 files
f_list=glob.glob(data_dir+'*.hdf5')
len(f_list)

19

In [3]:
### Explore the hdf5 file
def f_explore_file(fname):
    '''
    Explore the structure of the hdf5 file
    The Keys are : ['full', 'namePar', 'physPar', 'redshifts', 'unitPar']
    'full' has many elements. Each is a numpy array with shape (512,512,4) 
    The last index 4 corresponds to red-shift. Eg. 0, 0.5, 1.5, 3.0
    '''
    dta=h5py.File(fname,'r') 
    print('Keys',[i for i in dta])
    print('Size of full array',len(dta['full']))
    print('Size of each element',dta['full'][0].shape)
    print(len(dta['full']))
    for key in ['namePar', 'physPar', 'redshifts', 'unitPar']:
        print(key)
        print([i for i in dta[key]])
    
f_explore_file(f_list[0])

Keys ['full', 'namePar', 'physPar', 'redshifts', 'unitPar']
Size of full array 512
Size of each element (512, 512, 4)
512
namePar
[b'Omega_m', b'sigma_8', b'N_spec', b'H_0']
physPar
[0.32781, 0.849941, 0.938285, 57.778255]
redshifts
[0.0, 0.5, 1.5, 3.0]
unitPar
[0.3089974, 0.2080883, -0.0753999, -0.5819879]


In [4]:

def f_get_slices_all_axes(f_list,splice_interval=8):
    '''
    Get 2D slices of 512^3 images along all 3 axes
    splice_interval is the spacing between layers 
    '''
    
    slices = []
    img_dim = 128
    perside = 512//img_dim
    
    for fname in f_list:
        with h5py.File(fname, 'r') as inputdata:
            for i1 in range(perside):
                for i2 in range(perside):
                    # Select slices along planes : xy,yz, zx, for redshift=0 
                    # (128 * 128 images from 512 x 512 images-> 16 images)
                    ## yz axis: 
                    data = inputdata['full'][::splice_interval, i1*img_dim:(i1+1)*img_dim, i2*img_dim:(i2+1)*img_dim, 0]
                    data2=np.transpose(data,(0,1,2)) ### Transpose to get array in the form (samples,128,128)
                    np.random.shuffle(data2) ### Shuffle samples (along first axis)
                    slices.append(np.expand_dims(data2, axis=-1))

                    ## xy axis: 
                    data = inputdata['full'][i1*img_dim:(i1+1)*img_dim,i2*img_dim:(i2+1)*img_dim,::splice_interval,0]
                    data2=np.transpose(data,(2,0,1)) ### Transpose to get array in the form (samples,128,128)
                    np.random.shuffle(data2) ### Shuffle samples (along first axis)
                    slices.append(np.expand_dims(data2, axis=-1))      

                    ## xz axis: 
                    data = inputdata['full'][i1*img_dim:(i1+1)*img_dim,::splice_interval,i2*img_dim:(i2+1)*img_dim,0]
                    data2=np.transpose(data,(1,0,2))  ### Transpose to get array in the form (samples,128,128)
                    np.random.shuffle(data2) ### Shuffle samples (along first axis)
                    slices.append(np.expand_dims(data2, axis=-1))

        print('Sliced %s'%fname)
    slices = np.concatenate(slices)
    print(slices.shape)
    
    return slices


# slices=f_get_slices_all_axes(f_list[:2])

In [11]:
# np.vsplit(data,data.shape[0])[0].shape

In [5]:
t1=time.time()
slices=f_get_slices_all_axes(f_list,4)
t2=time.time()
print('Time taken',t2-t1)

Sliced /global/project/projectdirs/dasrepo/vpa/cosmogan/data/raw_data/univ_ics_2019-03_a1313490.hdf5
Sliced /global/project/projectdirs/dasrepo/vpa/cosmogan/data/raw_data/univ_ics_2019-03_a12530935.hdf5
Sliced /global/project/projectdirs/dasrepo/vpa/cosmogan/data/raw_data/univ_ics_2019-03_a7894967.hdf5
Sliced /global/project/projectdirs/dasrepo/vpa/cosmogan/data/raw_data/univ_ics_2019-03_a8186150.hdf5
Sliced /global/project/projectdirs/dasrepo/vpa/cosmogan/data/raw_data/univ_ics_2019-03_a3610230.hdf5
Sliced /global/project/projectdirs/dasrepo/vpa/cosmogan/data/raw_data/univ_ics_2019-03_a12980342.hdf5
Sliced /global/project/projectdirs/dasrepo/vpa/cosmogan/data/raw_data/univ_ics_2019-03_a16082405.hdf5
Sliced /global/project/projectdirs/dasrepo/vpa/cosmogan/data/raw_data/univ_ics_2019-03_a6202700.hdf5
Sliced /global/project/projectdirs/dasrepo/vpa/cosmogan/data/raw_data/univ_ics_2019-03_a2025313.hdf5
Sliced /global/project/projectdirs/dasrepo/vpa/cosmogan/data/raw_data/univ_ics_2019-03_a

In [6]:
### Save data to files

### Location to store the .npy files generated by this code
dest_dir='/global/project/projectdirs/dasrepo/vpa/cosmogan/data/raw_data/'
file_prefix='large_dataset'

train_index=np.int(0.90*len(slices))
train = slices[:train_index]
val = slices[train_index:]

train_fname = dest_dir+file_prefix+'_train.npy'
print('Saving file %s'%train_fname)
print('shape='+str(train.shape))
np.save(train_fname, train)

val_fname = dest_dir+file_prefix+'_val.npy'
print('Saving file %s'%val_fname)
print('shape='+str(val.shape))
np.save(val_fname, val)

Saving file /global/project/projectdirs/dasrepo/vpa/cosmogan/data/raw_data/large_dataset_train.npy
shape=(105062, 128, 128, 1)
Saving file /global/project/projectdirs/dasrepo/vpa/cosmogan/data/raw_data/large_dataset_val.npy
shape=(11674, 128, 128, 1)


0_slice_universes.py  1_get_slices.ipynb  slice_universes.py
