In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

In [2]:
# Only the object IDs are needed from the image archive. np.load on an .npz
# returns a lazily-read archive that keeps the file open, so it is used as a
# context manager to make sure the handle is closed once `ids` has been read.
with np.load('../../downloading_data/images.npz') as img_fl:
    ids = img_fl['ids']

In [3]:
def mag_to_flx(mag):
    """
    Convert AB magnitudes to flux densities.

    Input: an AB magnitude (scalar, np array or pd Series)
    Output: the corresponding flux in microjansky (muJy), same shape as the input
    """
    # Constant factor: the AB zero point (48.60) combined with the 10**29 scaling to muJy
    zero_point_scale = 10**(29-(48.60/2.5))
    # Magnitude-dependent factor
    relative_flux = 10**(-mag/2.5)
    return zero_point_scale * relative_flux

def read_lephare_spectrum_file(id):
    """
    Load the LePHARE output spectrum file belonging to the given ID.

    The file name is built from the last 9 characters of str(id), i.e.
    'output_spectra/Id<last 9 characters>.spec'.

    Returns a list with one entry per line of the file; each entry is the
    list of whitespace-separated tokens on that line (empty for blank lines).
    """
    spec_path = f'output_spectra/Id{str(id)[-9:]}.spec'
    with open(spec_path,'r') as spec_file:
        raw_text = spec_file.read()
    return [line.split() for line in raw_text.split('\n')]

def get_lephare_spectrum(id, i): # i from 0 to 2
    """
    Extract one of the LePHARE best fit spectra for an object.

    Inputs:
        id: the object ID, passed on to read_lephare_spectrum_file
        i: an integer in [0,1,2] selecting the best fit spectrum for:
            0: Galaxy
            1: Quasar
            2: Star
    Outputs: wavelengths, and the selected spectrum in muJy (converted from
        magnitudes with mag_to_flx). Entries whose magnitude is >= 100 or
        exactly 0 are removed from both arrays.
    """
    rows = read_lephare_spectrum_file(id)
    # Skip the first 171 lines and the final line; what remains is parsed as a numeric table
    table = np.array(rows[171:-1]).astype('float32')

    # The spectra are stacked one after another: a decrease in the first
    # column (wavelength) marks the first row of the next spectrum.
    wavelength_steps = np.diff(table[:, 0])
    block_starts = np.where(wavelength_steps < 0)[0] + 1
    blocks = np.split(table, block_starts, axis=0)

    chosen = blocks[i]
    x, y_mag = chosen[:, 0], chosen[:, 1]

    # Clipping ceilinged SED values
    keep = (y_mag < 100) & (y_mag != 0)
    return x[keep], mag_to_flx(y_mag[keep])

The wavelength and flux arrays generated by the above function are not all the same size. This is not just because some values have been clipped: LePHARE seems to always output spectra of non-standard lengths (so that's just great). In order to store the spectra in a nice way, and because we're only interested in a certain wavelength range anyway, we choose a standardised set of wavelength values between 0.4 $\mu$m and 5 $\mu$m and interpolate the LePHARE spectra onto it.

In [4]:
# Common wavelength grid onto which every LePHARE spectrum is resampled
num_samples = 1000
wavelength_samples = np.linspace(4e3, 5e4, num_samples)

# ID x Spectrum Type (G/Q/S) x wavelength
spectrum_shape = (len(ids), 3, num_samples)
spectrum_array = np.zeros(spectrum_shape)

for i, idd in tqdm(enumerate(ids), total=len(ids)):
    # obj_num selects the best fit spectrum: 0 = Galaxy, 1 = Quasar, 2 = Star
    for obj_num in range(3):
        wavs, flxs = get_lephare_spectrum(idd, obj_num)
        # Linear interpolation of the best fit spectrum onto the common grid
        flux_samples = np.interp(wavelength_samples, wavs, flxs)
        spectrum_array[i, obj_num, :] = flux_samples

100%|██████████████████████████████████████████████████████████████████████████████| 6690/6690 [05:15<00:00, 21.20it/s]


In [5]:
# Column names assigned to the whitespace-separated fields of lephare_output.out
lephare_columns = [
    'COADD_OBJECT_ID',
    'Z_BEST', 'Z_BEST68_LOW', 'Z_BEST68_HIGH', 'Z_ML',
    'CHI_BEST', 'MOD_BEST', 'MAG_ABS_BEST', 'PDZ_BEST',
    'SCALE_BEST', 'DIST_MOD_BEST', 'NBAND_USED',
    'Z_SEC', 'CHI_SEC', 'MOD_SEC', 'AGE_SEC',
    'Z_QSO', 'CHI_QSO', 'MOD_QSO', 'MAG_ABS_QSO', 'DIST_MOD_QSO',
    'MOD_STAR', 'CHI_STAR',
    'CONTEXT', 'ZSPEC',
]

with open('lephare_output.out','r') as f:
    lephare_output_text = f.read()

# The first 55 lines are skipped (presumably the file header - TODO confirm);
# every remaining line is split on whitespace into one table row.
lephare_rows = [line.split() for line in lephare_output_text.split('\n')[55:]]

# The final row is dropped (presumably the empty row left by a trailing
# newline - TODO confirm). All values, including the index, stay as strings.
lephare_df = (
    pd.DataFrame(lephare_rows, columns=lephare_columns)
    .iloc[:-1]
    .set_index('COADD_OBJECT_ID')
)
lephare_df

Unnamed: 0_level_0,Z_BEST,Z_BEST68_LOW,Z_BEST68_HIGH,Z_ML,CHI_BEST,MOD_BEST,MAG_ABS_BEST,PDZ_BEST,SCALE_BEST,DIST_MOD_BEST,...,AGE_SEC,Z_QSO,CHI_QSO,MOD_QSO,MAG_ABS_QSO,DIST_MOD_QSO,MOD_STAR,CHI_STAR,CONTEXT,ZSPEC
COADD_OBJECT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
931684428,0.0000,0.0000,0.0000,-99.0000,0.227919E+07,6,-13.705,100.000,0.324420E-01,0.000000E+00,...,-.990000E+02,0.3600,0.942194E+06,5,-25.169,0.414170E+02,53,0.362753E+07,511.0000,-.990000E+02
556041733,0.0000,0.0000,0.0000,-99.0000,0.487865E+07,6,-13.376,100.000,0.197160E-01,0.000000E+00,...,-.990000E+02,0.3600,0.335500E+07,21,-24.505,0.414170E+02,249,0.553548E+07,511.0000,-.990000E+02
979002481,0.0000,0.0000,0.0000,-99.0000,0.461533E+07,6,-13.591,100.000,0.240236E-01,0.000000E+00,...,-.990000E+02,0.4000,0.313324E+07,21,-25.016,0.416860E+02,18,0.358855E+07,511.0000,-.990000E+02
995403105,0.3643,0.3600,0.3600,-99.0000,0.475196E+07,6,-25.512,100.000,0.157847E+16,0.414473E+02,...,-.990000E+02,1.2800,0.443516E+07,28,-29.570,0.447649E+02,85,0.639085E+07,511.0000,-.990000E+02
979097465,0.3576,0.3600,0.3600,-99.0000,0.239603E+07,6,-25.100,100.000,0.700016E+15,0.413998E+02,...,-.990000E+02,0.0400,0.242060E+07,17,-19.882,0.362358E+02,51,0.337972E+07,511.0000,-.990000E+02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
887791189,0.8467,0.8379,0.8442,0.8403,0.372530E+03,1,-22.925,100.000,0.178580E+06,0.436556E+02,...,-.990000E+02,5.8800,0.460257E+02,21,-26.258,0.487571E+02,189,0.130568E+02,510.0000,-.990000E+02
697986932,0.7860,0.7960,0.8007,0.7997,0.702270E+03,1,-22.712,100.000,0.137892E+06,0.434569E+02,...,-.990000E+02,5.8800,0.488564E+02,14,-26.073,0.487571E+02,190,0.167626E+02,510.0000,-.990000E+02
11682001,1.0668,1.0700,1.0820,1.0752,0.772800E+02,1,-24.321,100.000,0.614753E+06,0.442753E+02,...,-.990000E+02,1.4000,0.605848E+02,10,-24.971,0.450055E+02,178,0.631042E+03,511.0000,-.990000E+02
959369723,1.2766,1.2685,1.2882,1.2770,0.123348E+03,2,-24.467,100.000,0.328572E+06,0.447578E+02,...,-.990000E+02,1.2400,0.108058E+03,10,-24.422,0.446796E+02,167,0.863850E+03,511.0000,-.990000E+02


The above DataFrame has 7102 objects, some of which don't have images: only 6690 objects have 5-band imaging, and those are the ones listed in `ids`.

In [6]:
# lephare_df is indexed by strings: the lookup key for each ID is its last
# 9 characters, with any leading zeros removed by the round trip through int().
lephare_keys = [str(int(str(idd)[-9:])) for idd in ids]

# Select the matching rows in the order of `ids`, then re-label them with the
# original IDs from `ids`.
matched_values = lephare_df.loc[lephare_keys].to_numpy()
lephare_df_with_imgs = pd.DataFrame(matched_values,
                                    columns=lephare_df.columns,
                                    index=ids)
lephare_df_with_imgs.to_csv('lephare_output_data.csv')
lephare_df_with_imgs

Unnamed: 0,Z_BEST,Z_BEST68_LOW,Z_BEST68_HIGH,Z_ML,CHI_BEST,MOD_BEST,MAG_ABS_BEST,PDZ_BEST,SCALE_BEST,DIST_MOD_BEST,...,AGE_SEC,Z_QSO,CHI_QSO,MOD_QSO,MAG_ABS_QSO,DIST_MOD_QSO,MOD_STAR,CHI_STAR,CONTEXT,ZSPEC
870313315,0.7017,0.6573,0.7271,0.6928,0.189933E+03,1,-22.533,100.000,0.114667E+06,0.431548E+02,...,-.990000E+02,5.8400,0.219392E+02,14,-26.297,0.487399E+02,193,0.168248E+02,510.0000,-.990000E+02
870331581,0.7411,0.7143,0.7681,0.7415,0.153618E+03,1,-22.972,100.000,0.171943E+06,0.433001E+02,...,-.990000E+02,5.5200,0.293205E+02,24,-27.162,0.485972E+02,212,0.330628E+02,510.0000,-.990000E+02
870480016,0.6614,0.6386,0.6839,0.6620,0.270960E+03,1,-22.773,100.000,0.142328E+06,0.429979E+02,...,-.990000E+02,5.8000,0.821098E+02,21,-27.015,0.487225E+02,214,0.429503E+02,511.0000,-.990000E+02
870527456,0.9602,0.9557,0.9643,0.9600,0.175423E+03,1,-23.814,100.000,0.397690E+06,0.439927E+02,...,-.990000E+02,5.9200,0.101707E+03,17,-27.933,0.487743E+02,182,0.392456E+02,510.0000,-.990000E+02
870548042,0.8273,0.8366,0.8408,0.8399,0.784365E+03,1,-23.743,100.000,0.358389E+06,0.435938E+02,...,-.990000E+02,5.8800,0.705955E+02,21,-27.156,0.487571E+02,187,0.662338E+02,510.0000,-.990000E+02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1700117776,1.1943,1.1906,1.2052,1.1965,0.811343E+02,2,-24.153,100.000,0.244856E+06,0.445787E+02,...,-.990000E+02,1.0000,0.109160E+03,2,-23.692,0.441017E+02,217,0.639927E+03,511.0000,-.990000E+02
1700212535,1.2145,1.1983,1.2107,1.2053,0.220035E+03,2,-24.585,100.000,0.379695E+06,0.446238E+02,...,-.990000E+02,1.2000,0.292356E+03,10,-24.669,0.445915E+02,216,0.145681E+04,511.0000,-.990000E+02
1700230002,0.8702,0.8784,0.8806,-99.0000,0.207618E+04,1,-24.066,100.000,0.487545E+06,0.437291E+02,...,-.990000E+02,5.6000,0.216203E+03,24,-27.670,0.486336E+02,187,0.728934E+02,511.0000,-.990000E+02
1700237805,1.2527,1.2365,1.2557,1.2486,0.202257E+03,2,-24.520,100.000,0.356043E+06,0.447070E+02,...,-.990000E+02,1.2400,0.152747E+03,10,-24.619,0.446796E+02,167,0.117834E+04,511.0000,-.990000E+02


In [7]:
# One row per ID, one column per fit type (G/Q/S): a 6690x3 array
chi2_columns = ['CHI_BEST','CHI_QSO','CHI_STAR']
chi2s = lephare_df_with_imgs.loc[ids, chi2_columns].to_numpy().astype('float32')

In [8]:
# Bundle the IDs, the common wavelength grid, the resampled spectra and the
# chi2 values into a single compressed archive.
lephare_arrays = {
    'ids': ids,
    'wavelengths': wavelength_samples,
    'spectra': spectrum_array,
    'chi2s': chi2s,
}
np.savez_compressed('lephare_spectra_chi2s.npz', **lephare_arrays)