# Convert ascii to pytables

## prepare conversion of ascii tables into pytables

- author : Sylvie Dagoret-Campagne
- affilication : IJCLab/IN2P3/CNRS
- Creation date : January 22th 2022

## Imports

In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats
import sys,os
sys.path.append('../')
from delight.io import *
from delight.utils import *
from delight.photoz_gp import PhotozGP

In [2]:
from matplotlib.colors import LogNorm

In [3]:
import logging
import coloredlogs
logger = logging.getLogger(__name__)
coloredlogs.install(level='DEBUG', logger=logger,fmt='%(asctime)s,%(msecs)03d %(programname)s %(name)s[%(process)d] %(levelname)s %(message)s')

In [4]:
print(sys.executable)
print(sys.version)
#print(sys.version_info)

/Users/dagoret/anaconda3/bin/python
3.8.8 (default, Apr 13 2021, 12:59:45) 
[Clang 10.0.0 ]


In [5]:
plt.rcParams["figure.figsize"] = (12,6)
plt.rcParams["axes.labelsize"] = 'xx-large'
plt.rcParams['axes.titlesize'] = 'xx-large'
plt.rcParams['xtick.labelsize']= 'xx-large'
plt.rcParams['ytick.labelsize']= 'xx-large'

In [6]:
from delight.io import *

In [7]:
import tables as tb

In [8]:
!ls tmp

[1m[31mdata[m[m                  parametersTest_14.cfg parametersTest_21.cfg
[1m[31mdelight_data[m[m          parametersTest_15.cfg parametersTest_3.cfg
parametersTest.cfg    parametersTest_16.cfg parametersTest_4.cfg
parametersTest_1.cfg  parametersTest_17.cfg parametersTest_5.cfg
parametersTest_10.cfg parametersTest_18.cfg parametersTest_6.cfg
parametersTest_11.cfg parametersTest_19.cfg parametersTest_7.cfg
parametersTest_12.cfg parametersTest_2.cfg  parametersTest_8.cfg
parametersTest_13.cfg parametersTest_20.cfg parametersTest_9.cfg


## configuration

In [9]:
# config file
configdir          = "./tmp"
configfilename     = "parametersTest.cfg"
configfullfilename = os.path.join(configdir,configfilename)
# data
workdir               = "./tmp/delight_data"
filename_galaxies     = "galaxies-fluxredshifts.txt"
fullfilename_galaxies = os.path.join(workdir,filename_galaxies)

In [10]:
paramFileName = configfullfilename
params        = parseParamFile(paramFileName, verbose=False)

Input parameter file: ./tmp/parametersTest.cfg
Parameters read:
>  rootDir              ./
>  bands_directory      ./tmp/data/FILTERS
>  bandNames            DC2LSST_u DC2LSST_g DC2LSST_r DC2LSST_i DC2LSST_z DC2LSST_y
>  numCoefs             15
>  bands_fmt            res
>  bands_verbose        True
>  bands_debug          True
>  bands_makeplots      False
>  templates_directory  ./tmp/data/CWW_SEDs
>  sed_fmt              sed
>  lambdaRef            4500.0
>  templates_names      El_B2004a Sbc_B2004a Scd_B2004a SB3_B2004a SB2_B2004a Im_B2004a ssp_25Myr_z008 ssp_5Myr_z008
>  p_t                  [0.27   0.26   0.25   0.069  0.021  0.11   0.0061 0.0079]
>  p_z_t                [0.23 0.39 0.33 0.31 1.1  0.34 1.2  0.14]
>  training_numChunks   1
>  training_paramFile   ./tmp/delight_data/galaxies-gpparams.txt
>  training_catFile     ./tmp/delight_data/galaxies-fluxredshifts.txt
>  training_referenceBand DC2LSST_i
>  training_bandOrder   DC2LSST_u DC2LSST_u_var DC2LSST_g DC2LSST_g_var DC

# Delight Parameters

In [11]:
DL = approx_DL()
redshiftDistGrid, redshiftGrid, redshiftGridGP = createGrids(params)
numZ = redshiftGrid.size

# Locate which columns of the catalog correspond to which bands.

bandIndices, bandNames, bandColumns, bandVarColumns, redshiftColumn,refBandColumn = readColumnPositions(params, prefix="target_")

In [12]:
print(bandIndices)

[0 1 2 3 4 5]


In [13]:
print(bandNames)

['DC2LSST_u' 'DC2LSST_g' 'DC2LSST_r' 'DC2LSST_i' 'DC2LSST_z' 'DC2LSST_y']


In [14]:
print(bandColumns)

[ 0  2  4  6  8 10]


In [15]:
print(bandVarColumns)

[ 1  3  5  7  9 11]


In [16]:
print(redshiftColumn)

12


In [17]:
print(refBandColumn)

4


# Training

In [18]:
nbcol_train = len(params['training_bandOrder'])
nbcol_train

13

In [19]:
numThreads = 1
threadNum = 0

numObjectsTraining = np.sum(1 for line in open(params['training_catFile']))

msg= 'Number of Training Objects ' + str(numObjectsTraining)
logger.info(msg)

firstLine = int(threadNum * numObjectsTraining / numThreads)
lastLine = int(min(numObjectsTraining,(threadNum + 1) * numObjectsTraining / numThreads))
numLines = lastLine - firstLine

msg ='Thread ' +  str(threadNum) + ' , analyzes lines ' + str(firstLine) + ' , to ' + str(lastLine)
logger.info(msg)

  numObjectsTraining = np.sum(1 for line in open(params['training_catFile']))
2022-01-23 14:46:05,876 ipykernel_launcher.py __main__[76528] INFO Number of Training Objects 3755
2022-01-23 14:46:05,878 ipykernel_launcher.py __main__[76528] INFO Thread 0 , analyzes lines 0 , to 3755


In [20]:
crossValidate_flag = params['training_crossValidate']
crossValidate_flag

False

In [21]:
trainingDataIter1 = getDataFromFile(params, firstLine, lastLine,prefix="training_", getXY=True,CV=crossValidate_flag)
loc = 0
for z, normedRefFlux,bands, fluxes, fluxesVar,bandsCV, fluxesCV, fluxesVarCV,X, Y, Yvar in trainingDataIter1:
    print(loc,"\t z",z,"\t Nflux",normedRefFlux,"\t bands",bands,"\t fluxes",fluxes,"\t fluxesVar",fluxesVar,
          "\t bandsCV",bandsCV,"\t fluxesCV", fluxesCV,"\t fluxesVarCV", fluxesVarCV,"\t X",X,"\t Y",Y,"\t Yvar",Yvar)
    
    if loc>2:
        break
    loc += 1

0 	 z 0.02043498739444738 	 Nflux 3031650.7320414316 	 bands [0 1 2 3 4 5] 	 fluxes [1349.60090537 3647.50826794 4841.59244168 5544.0759444  5751.77741451
 5981.23969488] 	 fluxesVar [ 46.38848211 332.9087944  586.39517919 768.95409146 827.86598966
 895.94544786] 	 bandsCV None 	 fluxesCV None 	 fluxesVarCV None 	 X [[0.00000000e+00 2.04349874e-02 2.60222573e+11]
 [1.00000000e+00 2.04349874e-02 2.60222573e+11]
 [2.00000000e+00 2.04349874e-02 2.60222573e+11]
 [3.00000000e+00 2.04349874e-02 2.60222573e+11]
 [4.00000000e+00 2.04349874e-02 2.60222573e+11]
 [5.00000000e+00 2.04349874e-02 2.60222573e+11]] 	 Y [[1349.60090537]
 [3647.50826794]
 [4841.59244168]
 [5544.0759444 ]
 [5751.77741451]
 [5981.23969488]] 	 Yvar [[ 46.38848211]
 [332.9087944 ]
 [586.39517919]
 [768.95409146]
 [827.86598966]
 [895.94544786]]
1 	 z 0.01936132305993654 	 Nflux 81129.16941288905 	 bands [0 1 2 3 4 5] 	 fluxes [ 50.13226396 115.50270113 135.77265951 148.36348784 152.33461248
 155.26331547] 	 fluxesVar [0.229

## Create a pytable

- https://www.pytables.org/usersguide/introduction.html

In [22]:
# data
workdir               = "./tmp/delight_data"
filename_galaxies_h5     = "galaxies-fluxredshifts.h5"
fullfilename_galaxies_h5 = os.path.join(workdir,filename_galaxies_h5)

In [23]:
# Open a file in "w"rite mode
fileh = tb.open_file(fullfilename_galaxies_h5, mode = "w")

In [24]:
# Get the HDF5 root group
root = fileh.root

In [25]:
#Create the groups
group = fileh.create_group(root, "group")

### data structure

In [26]:
class Galaxy(tb.IsDescription):
    idnumber       = tb.Int64Col()      # Signed 64-bit integer
    redshift       = tb.Float32Col()    # float32 redshift
    normedRedfFlux = tb.Float32Col()    # float32 normalized Flux
    bands          = tb.Int32Col(shape = (len(bandNames),)) 
    fluxes         = tb.Float32Col(shape = (len(bandNames),)) 
    fluxesVar      = tb.Float32Col(shape = (len(bandNames),)) 
    if crossValidate_flag:
        bandsCV        = tb.Int32Col(shape = (len(bandNames),)) 
        fluxesCV       = tb.Float32Col(shape = (len(bandNames),)) 
        fluxesCVVar    = tb.Float32Col(shape = (len(bandNames),)) 
    X              = tb.Float32Col(shape = (len(bandNames),3))   # Gaussian Process X
    Y              = tb.Float32Col(shape = (len(bandNames),1))   # Gaussian Process Y
    Yvar           = tb.Float32Col(shape = (len(bandNames),1))   # Gaussian Process Yvar

In [27]:
# Create 1 table in group
table = fileh.create_table(group, "table", Galaxy)

In [28]:
# Get the record object associated with the table:
row = table.row

In [29]:
trainingDataIter1 = getDataFromFile(params, firstLine, lastLine,prefix="training_", getXY=True,CV=crossValidate_flag)
loc = 0
for z, normedRefFlux,bands, fluxes, fluxesVar,bandsCV, fluxesCV, fluxesVarCV,X, Y, Yvar in trainingDataIter1:
       
    row['idnumber']       = loc
    row['redshift']       = z
    row['normedRedfFlux'] = normedRefFlux
    row['bands']          = bands
    row['fluxes']         = fluxes
    row['fluxesVar']      = fluxesVar
    if crossValidate_flag:
        row['bandsCV']        = bandsCV
        row['fluxesCV']       = fluxsCV
        row['fluxesVarCV']    = fluxsVarCV
    row['X']              = X
    row['Y']              = Y
    row['Yvar']           = Yvar
        # This injects the Record values
    row.append()
    loc += 1
    
 # Flush the table buffers
table.flush()

# Finally, close the file (this also will flush all the remaining buffers!)
fileh.close()    