In [None]:
import glob
import os

import numpy as np
import pandas as pd

from my_functions import *

In [None]:
# Filter configuration shared by the functions below.
# NOTE(review): presumably one central wavelength per filter -- it is indexed
# by filter number (and with -2) in nice_lya_search; confirm in my_functions.
w_central = central_wavelength()
# Widths for filter indices 0..59; used as a full width (halved) when
# converting a filter edge to a redshift in nice_lya_search.
nb_fwhm_Arr = nb_fwhm(range(60))
# Lya wavelength used to turn an observed wavelength into a redshift
# (z = w_obs / w_lya - 1). Presumably rest-frame, in Angstrom -- same units
# as w_central are assumed.
w_lya = 1215.67

In [None]:
## Load the SF and QSO catalogs

def _load_catalog(dirname, n_bands=60):
    """Read every ``data*`` CSV in ``dirname`` and split out fluxes and errors.

    Parameters
    ----------
    dirname : str
        Directory prefix (with trailing slash); files matching
        ``dirname + 'data*'`` are read in sorted order and concatenated.
    n_bands : int
        Number of flux columns. Columns ``1 .. n_bands`` are taken as fluxes
        and columns ``n_bands + 1 .. 2 * n_bands`` as their errors.

    Returns
    -------
    catalog : pandas.DataFrame
        The concatenated catalog with a fresh integer index.
    flx, err : numpy.ndarray
        Flux and error blocks, transposed to shape ``(n_bands, n_rows)``.

    Raises
    ------
    FileNotFoundError
        If no file matches the pattern (instead of the opaque
        "No objects to concatenate" error raised by ``pd.concat``).
    """
    pattern = dirname + 'data*'
    files = sorted(glob.glob(pattern))
    if not files:
        raise FileNotFoundError(f'No catalog files match {pattern!r}')

    catalog = pd.concat(
        [pd.read_csv(name) for name in files], axis=0, ignore_index=True
    )

    # Convert once and slice twice; the frame can be large.
    values = catalog.to_numpy()
    flx = values[:, 1 : n_bands + 1].T
    err = values[:, n_bands + 1 : 2 * n_bands + 1].T
    return catalog, flx, err


## SF catalog
data, sf_flx, sf_err = _load_catalog(
    '/home/alberto/almacen/Source_cats/LAE_10deg_z2-4_0/'
)

## QSO catalog
data_qso, qso_flx, qso_err = _load_catalog(
    '/home/alberto/almacen/Source_cats/QSO_100000_0/'
)

# Catalog Lya luminosities, used later as the training tags.
sf_L = data['L_lya'].to_numpy()
qso_L = data_qso['L_lya'].to_numpy()

In [None]:
def nice_lya_search(flx, err, L_lya, mag_min, mag_max):
    """Select Lya candidates and estimate their redshift and line luminosity.

    Parameters
    ----------
    flx, err : 2-D arrays
        Fluxes and errors; the second axis is the source axis
        (``flx.shape[1]`` sources). Presumably one row per filter, in the
        same order as ``w_central`` -- confirm against my_functions.
    L_lya : 1-D array
        Per-source catalog value; only sources with ``L_lya > 0`` are kept.
    mag_min, mag_max : float
        Exclusive bounds on the magnitude computed from ``flx[-2]``.

    Returns
    -------
    nice_lya : boolean array
        Sources inside the redshift window, with ``L_lya > 0`` and inside
        the magnitude bin.
    z_Arr : array
        ``z_NB`` of the detected line for sources whose Lya entry is not
        ``-1``; zero for the rest.
    L_Arr : array
        Third output of ``EW_L_NB``.
    """
    # --- Lya line search (continuum estimated with IGM_T_correct=True) ---
    lya_cont, lya_cont_err = estimate_continuum(flx, err, IGM_T_correct=True)
    lya_candidates = is_there_line(flx, err, lya_cont, lya_cont_err, 20)
    lya_lines, lya_cont_lines, _ = identify_lines(
        lya_candidates, flx, err, first=True, return_line_width=True
    )
    lya_lines = np.array(lya_lines)

    # --- Other lines (continuum estimated with IGM_T_correct=False) ---
    # NOTE: `other_lines` is not used by the active selection below; it is
    # only an input of the disabled nice_lya_select(...) cut.
    other_cont, other_cont_err = estimate_continuum(
        flx, err, IGM_T_correct=False
    )
    other_candidates = is_there_line(
        flx, err, other_cont, other_cont_err, 400, obs=True
    )
    other_lines = identify_lines(other_candidates, flx, err)

    # --- Redshift: filled in only where a Lya line was found (entry != -1) ---
    n_sources = flx.shape[1]
    z_Arr = np.zeros(n_sources)
    has_lya = np.where(lya_lines != -1)
    z_Arr[has_lya] = z_NB(np.array(lya_cont_lines)[has_lya])

    # --- Redshift window spanned by filters nb_min..nb_max ---
    nb_min, nb_max = 3, 20
    blue_edge = w_central[nb_min] - nb_fwhm_Arr[nb_min] * 0.5
    red_edge = w_central[nb_max] + nb_fwhm_Arr[nb_max] * 0.5
    z_min = blue_edge / w_lya - 1
    z_max = red_edge / w_lya - 1
    z_cut = (z_Arr > z_min) & (z_Arr < z_max)

    # --- Magnitude from the second-to-last row; NaNs are sent to 99 ---
    mag = flux_to_mag(flx[-2], w_central[-2])
    mag[np.isnan(mag)] = 99.
    in_mag_bin = (mag > mag_min) & (mag < mag_max)

    # The stricter nice_lya_select(lya_lines, other_lines, flx, err,
    # <Lya continuum>, z_Arr) cut is currently disabled; the selection relies
    # on the redshift window, L_lya > 0 and the magnitude bin only.
    nice_lya = z_cut & (L_lya > 0) & in_mag_bin

    # --- Line luminosity ---
    _, _, L_Arr, _, _, _ = EW_L_NB(
        flx, err, lya_cont, lya_cont_err, z_Arr, lya_lines, N_nb=0
    )

    return nice_lya, z_Arr, L_Arr

In [None]:
def sample_sources(flx, err, L_lya, mag_min, mag_max, N_samples=500_000):
    """Build a sample of selected sources from noisy realisations of a catalog.

    Each iteration perturbs ``flx`` with Gaussian noise scaled by ``err``,
    runs ``nice_lya_search`` on the perturbed photometry and keeps the sources
    it selects. Iterations repeat until at least ``N_samples`` sources have
    been collected; ``N_samples`` of them are then drawn at random.

    Parameters
    ----------
    flx, err : 2-D arrays
        Catalog fluxes and errors, sources along the second axis.
    L_lya : 1-D array
        Per-source catalog value, passed through to the output unchanged.
    mag_min, mag_max : float
        Magnitude bin forwarded to ``nice_lya_search``.
    N_samples : int
        Number of sources in the returned sample.

    Returns
    -------
    out_flx, out_err : 2-D arrays with ``N_samples`` columns
        Perturbed fluxes and the corresponding catalog errors.
    out_z, out_L : 1-D arrays
        Redshift and luminosity returned by ``nice_lya_search`` for the
        perturbed photometry.
    out_L_lya : 1-D array
        Catalog ``L_lya`` of the selected sources.
    """
    # Collect per-iteration chunks and join them once at the end; re-stacking
    # the growing arrays every iteration copies them again and again.
    flx_chunks, err_chunks = [], []
    z_chunks, L_chunks, L_lya_chunks = [], [], []
    n_selected = 0
    n_iter = 0
    while True:
        n_iter += 1
        print(f'n_iter = {n_iter}')
        this_flx = flx + err * np.random.normal(size=err.shape)

        this_nice_lya, this_z_Arr, this_L_Arr = nice_lya_search(
            this_flx, err, L_lya, mag_min, mag_max
        )

        # Always store the *perturbed* fluxes: z and L were measured on them.
        # (The first batch used to store the unperturbed `flx`.)
        flx_chunks.append(this_flx[:, this_nice_lya])
        err_chunks.append(err[:, this_nice_lya])
        z_chunks.append(this_z_Arr[this_nice_lya])
        L_chunks.append(this_L_Arr[this_nice_lya])
        L_lya_chunks.append(L_lya[this_nice_lya])

        n_selected += len(z_chunks[-1])
        print(f'Sampled {n_selected} / {N_samples}')

        if n_selected >= N_samples:
            break

    out_flx = np.hstack(flx_chunks)
    out_err = np.hstack(err_chunks)
    out_z = np.concatenate(z_chunks)
    out_L = np.concatenate(L_chunks)
    out_L_lya = np.concatenate(L_lya_chunks)

    # Shuffle and trim to exactly N_samples. At least N_samples sources are
    # available here, so draw without replacement: the default (with
    # replacement) would duplicate some sources and drop others.
    randomize = np.random.choice(len(out_L), N_samples, replace=False)

    out_flx = out_flx[:, randomize]
    out_err = out_err[:, randomize]
    out_z = out_z[randomize]
    out_L = out_L[randomize]
    out_L_lya = out_L_lya[randomize]

    return out_flx, out_err, out_z, out_L, out_L_lya

In [None]:
def ensemble_dataset(mag_min, mag_max):
    """Assemble the feature matrix and tags for one magnitude bin.

    Samples the SF catalog and then the QSO catalog with ``sample_sources``
    and stacks them with the QSO sources first.

    Returns
    -------
    dataset : 2-D array, one row per source
        Columns, in order: the first 55 flux rows, the last 4 flux rows,
        |err / flx| for those same 55 + 4 rows, the luminosity returned by
        the line search, and its redshift.
    L_tags : 1-D array
        Catalog ``L_lya`` of every source, in the same row order.
    """
    # Draw order matters for the random stream: SF first, then QSO.
    sf_sample = sample_sources(sf_flx, sf_err, sf_L, mag_min, mag_max)
    qso_sample = sample_sources(qso_flx, qso_err, qso_L, mag_min, mag_max)

    sf_flx_data, sf_err_data, sf_z_data, sf_L_data, sf_L_Lya_data = sf_sample
    qso_flx_data, qso_err_data, qso_z_data, qso_L_data, qso_L_Lya_data = \
        qso_sample

    # QSOs go first in every combined array.
    pm_flx = np.hstack((qso_flx_data, sf_flx_data))
    pm_err = np.hstack((qso_err_data, sf_err_data))
    z_Arr = np.concatenate((qso_z_data, sf_z_data))
    L_Arr = np.concatenate((qso_L_data, sf_L_data))
    L_tags = np.concatenate((qso_L_Lya_data, sf_L_Lya_data))

    # Row selections of the photometry used as features.
    first_rows = slice(None, 55)
    last_rows = slice(-4, None)

    feature_columns = [
        pm_flx[first_rows].T,
        pm_flx[last_rows].T,
        np.abs(pm_err[first_rows].T / pm_flx[first_rows].T),
        np.abs(pm_err[last_rows].T / pm_flx[last_rows].T),
        L_Arr.reshape(-1, 1),
        z_Arr.reshape(-1, 1),
    ]
    dataset = np.hstack(feature_columns)

    return dataset, L_tags

In [14]:
# Magnitude bins: the i-th entry of each list gives the (min, max) of bin i.
mag_min_list = [15, 23, 23.5]
mag_max_list = [23, 23.5, 24]
# zip() would silently drop bins if the two lists got out of step.
assert len(mag_min_list) == len(mag_max_list)

# Make sure the output directory exists *before* the (very long) sampling
# starts, so the run cannot fail at the final write.
os.makedirs('MLmodels', exist_ok=True)

for mag_min, mag_max in zip(mag_min_list, mag_max_list):
    dataset, L_tags = ensemble_dataset(mag_min, mag_max)

    # One features file and one tags file per magnitude bin.
    pd.DataFrame(dataset).to_csv(f'MLmodels/dataset_mag{mag_min}-{mag_max}_test.csv')
    pd.DataFrame(L_tags).to_csv(f'MLmodels/tags_mag{mag_min}-{mag_max}_test.csv')

n_iter = 1


  return -2.5 * np.log10(f * w**2/c * 1e-8) - 48.60
  EW_nb_Arr = flambda / cont / (1 + z_Arr)
  EW_nb_e = flambda_e / cont / (1 + z_Arr)
  L_Arr = np.log10(flambda * 4*np.pi * dL ** 2)


Sampled 5282 / 500000
n_iter = 2
Sampled 10694 / 500000
n_iter = 3
Sampled 16112 / 500000
n_iter = 4
Sampled 21585 / 500000
n_iter = 5
Sampled 26966 / 500000
n_iter = 6
Sampled 32359 / 500000
n_iter = 7
Sampled 37788 / 500000
n_iter = 8
Sampled 43175 / 500000
n_iter = 9
Sampled 48584 / 500000
n_iter = 10
Sampled 54024 / 500000
n_iter = 11
Sampled 59344 / 500000
n_iter = 12
Sampled 64820 / 500000
n_iter = 13
Sampled 70210 / 500000
n_iter = 14
Sampled 75557 / 500000
n_iter = 15
Sampled 80988 / 500000
n_iter = 16
Sampled 86410 / 500000
n_iter = 17
Sampled 91835 / 500000
n_iter = 18
Sampled 97097 / 500000
n_iter = 19
Sampled 102441 / 500000
n_iter = 20
Sampled 107802 / 500000
n_iter = 21
Sampled 113210 / 500000
n_iter = 22
Sampled 118539 / 500000
n_iter = 23
Sampled 123910 / 500000
n_iter = 24
Sampled 129330 / 500000
n_iter = 25
Sampled 134775 / 500000
n_iter = 26
Sampled 140176 / 500000
n_iter = 27
Sampled 145628 / 500000
n_iter = 28
Sampled 150935 / 500000
n_iter = 29
Sampled 156178 / 50