In [None]:
from my_functions import *

import numpy as np

import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV, KFold

import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams.update({'font.size': 12})

import pickle

In [None]:
# Module-level constants read by the cells below (prepare_dataset and
# plot_contours index into w_central).
# central_wavelength / nb_fwhm are not defined in this notebook; presumably
# they come from the `my_functions` star import -- TODO confirm.
w_central = central_wavelength()
# nb_fwhm evaluated for indices 0..59 (presumably one value per filter -- TODO confirm).
nb_fwhm_Arr = nb_fwhm(range(60))
# Lyman-alpha wavelength (presumably rest-frame, in Angstrom -- TODO confirm units).
w_lya = 1215.67

In [None]:
def prepare_dataset(regname):
    '''
    Load the train / test tables for `regname`, convert the flux columns
    with `flux_to_mag`, min-max scale, and compress with PCA.

    Author's description of the initial features (kept for reference;
    NOTE(review): the listed items add up to 116, not 120 -- confirm):
    - The fluxes of the first 53 NBs
    - The errors of the first 53 NBs
    - 4 BB fluxes
    - 4 BB errors
    - The estimated L
    - The estimated z

    Parameters
    ----------
    regname : str
        Tag inserted into every input / output file name (e.g. 'mag15-23').

    Returns
    -------
    NNdata_L_input : 1-D array, training labels (NaN replaced by 0)
    NNdata_train   : 2-D array, scaled + PCA-transformed training features
    NNdata_test    : 2-D array, test features passed through the same
                     scaler and PCA (both fitted on the training set only)
    NNlabels_train : same array as NNdata_L_input
    NNlabels_test  : 1-D array, test labels
    NNdata_L_Arr   : 1-D array, second-to-last column of the raw test CSV

    Side effects
    ------------
    Pickles the fitted MinMaxScaler and PCA to
    MLmodels/RF{regname}_QSO-SF_scaler.sav and
    MLmodels/RF{regname}_QSO-SF_pca.sav, and prints the training-matrix
    shape before and after the column selection.
    '''
    dataset_dir = 'MLmodels/datasets'

    # The data set is the nice_lya sample.
    # Column 0 of each CSV is dropped (presumably the saved index -- TODO confirm).
    # .copy(): these matrices are modified in place below, and
    # DataFrame.to_numpy() may return a (possibly read-only) view.
    NNdata = pd.read_csv(
        f'{dataset_dir}/dataset_{regname}_5fold_nice_train.csv'
    ).to_numpy()[:, 1:].copy()
    NNdata_L_input = pd.read_csv(
        f'{dataset_dir}/tags_{regname}_5fold_nice_train.csv'
    ).to_numpy()[:, 1:]

    # Parse the test table once; both the feature matrix and the L column
    # are cut from it as independent copies.
    test_table = pd.read_csv(f'{dataset_dir}/dataset_{regname}_test.csv').to_numpy()
    NNdata_L_Arr = test_table[:, -2].reshape(-1,).copy()
    NNdata_test = test_table[:, 1:].copy()
    NNlabels_test = pd.read_csv(
        f'{dataset_dir}/tags_{regname}_test.csv'
    ).to_numpy()[:, 1:]
    print(NNdata.shape)

    # Take the relative fluxes to the selected one: every NB column becomes
    # (magnitude - magnitude of the reference band picked via NB_z of the
    # last column); the 4 BB columns are converted to plain magnitudes.
    NB_lya_position = NB_z(NNdata[:, -1].reshape(-1,))
    for i, nb in enumerate(NB_lya_position):
        NNdata[i, 2:55] = (
            flux_to_mag(NNdata[i, 2:55], w_central[2:55])
            - flux_to_mag(NNdata[i, 2:55][nb - 2], w_central[nb - 2])
        )
        NNdata[i, 55 : 55 + 4] = flux_to_mag(NNdata[i, 55 : 55 + 4], w_central[-4:])

    # NOTE(review): the test loop is NOT symmetric with the training loop.
    # Here `nb` is already shifted by 2 in enumerate(...) and shifted by 2
    # again when indexing (net nb - 4, vs. nb - 2 above), and the NB block
    # uses w_central[:53] instead of w_central[2:55]. Behaviour is left
    # untouched -- confirm which variant is intended.
    NB_lya_position = NB_z(NNdata_test[:, -1].reshape(-1,))
    for i, nb in enumerate(NB_lya_position - 2):
        NNdata_test[i, :53] = (
            flux_to_mag(NNdata_test[i, :53], w_central[:53])
            - flux_to_mag(NNdata_test[i, :53][nb - 2], w_central[nb - 2])
        )
        NNdata_test[i, 53 : 53 + 4] = flux_to_mag(NNdata_test[i, 53 : 53 + 4], w_central[-4:])

    # Keep the 53 NB columns, the 4 BB columns and everything after the two
    # columns that follow the BBs; the first two columns are dropped too.
    # NOTE(review): the test matrix is not trimmed, so it must already have
    # this layout or mms.transform below will reject it -- confirm.
    NNdata = np.hstack(
        [
            NNdata[:, 2:55],
            NNdata[:, 55 : 55 + 4],
            NNdata[:, 55 + 4 + 2:]
        ]
    )

    print(NNdata.shape)

    N_sources_NN = NNdata.shape[0]

    # Shuffle data (currently disabled: identity permutation)
    # shuffle_idx = np.random.permutation(np.arange(N_sources_NN))
    shuffle_idx = np.arange(N_sources_NN)
    NNdata = NNdata[shuffle_idx]

    NNdata_L_input = NNdata_L_input[shuffle_idx].reshape(-1,)
    NNdata_L_input[np.isnan(NNdata_L_input)] = 0

    # NaN features are replaced by the sentinel 99.
    NNdata[np.isnan(NNdata)] = 99.
    NNdata_test[np.isnan(NNdata_test)] = 99.

    # Rescale data (scaler fitted on the training set only)
    mms = MinMaxScaler()
    mms.fit(NNdata)
    NNdata = mms.transform(NNdata)
    NNdata_test = mms.transform(NNdata_test)
    with open(f'MLmodels/RF{regname}_QSO-SF_scaler.sav', 'wb') as file:
        pickle.dump(mms, file)

    # Apply PCA, keeping enough components for 95% of the variance
    pca = PCA(n_components=0.95, svd_solver='full')

    pca.fit(NNdata)
    with open(f'MLmodels/RF{regname}_QSO-SF_pca.sav', 'wb') as file:
        pickle.dump(pca, file)
    NNdata = pca.transform(NNdata)
    NNdata_test = pca.transform(NNdata_test)

    NNdata_train = NNdata
    NNlabels_train = NNdata_L_input

    NNlabels_train = NNlabels_train.reshape(-1,)
    NNlabels_test = NNlabels_test.reshape(-1,)

    return NNdata_L_input, NNdata_train, NNdata_test, NNlabels_train, NNlabels_test, NNdata_L_Arr

# Tag that selects the dataset files and names every saved artefact
# (scaler, PCA, regressor) in this notebook.
regname = 'mag15-23'
# Build the train / test matrices; this call also writes the fitted scaler
# and PCA pickles to MLmodels/.
NNdata_L_input, NNdata_train, NNdata_test, NNlabels_train, NNlabels_test, NNdata_L_Arr = prepare_dataset(regname)

In [None]:
import sys
# Size of NNdata_train as reported by sys.getsizeof, in units of 1e6 bytes,
# multiplied by 10.
# NOTE(review): the factor 10 is not explained here -- possibly one copy per
# worker for the n_jobs=10 grid search; confirm.
(sys.getsizeof(NNdata_train)) / 1e6 * 10

In [None]:
def do_grid_search(random_state=0):
    '''
    Exhaustive 5-fold grid search over RandomForestRegressor
    hyper-parameters on the module-level NNdata_train / NNlabels_train.

    Parameters
    ----------
    random_state : int, optional
        Seed used both for the fold assignment and for the forest, so that
        repeated searches are comparable.

    Returns
    -------
    dict
        The best parameter combination found (`GridSearchCV.best_params_`).
    '''
    # Create the parameter grid based on the results of random search
    param_grid = {
        'bootstrap': [False],
        'max_depth': [50, 75, 100, 125, 150],
        'max_features': [0.3],
        'min_samples_leaf': [3, 4, 5, 6],
        'min_samples_split': [4, 5, 6],
        'n_estimators': [100]
    }
    # Create a base model
    rf = RandomForestRegressor(random_state=random_state)

    # The training matrix is ordered (prepare_dataset keeps the file order,
    # with the first half of the rows flagged as QSO there), so contiguous
    # unshuffled folds would each hold out a single class. Shuffle the folds.
    folds = KFold(5, shuffle=True, random_state=random_state)

    # Instantiate the grid search model
    grid_search = GridSearchCV(
        estimator=rf, param_grid=param_grid,
        cv=folds, n_jobs=10, pre_dispatch='2*n_jobs',
        verbose=3
    )

    grid_search.fit(NNdata_train, NNlabels_train)

    return grid_search.best_params_

# best_params = do_grid_search()

In [None]:
# The regressor
# Hand-set hyper-parameters for the forest (plus runtime options
# `verbose` / `n_jobs`, which are forwarded to the estimator as well).
best_params = {
    'bootstrap': False,
    'max_depth': 300,
    'max_features': 0.3,
    'min_samples_leaf': 6,
    'min_samples_split': 5,
    'n_estimators': 200,
    'verbose': True,
    'n_jobs': -1
}
reg = RandomForestRegressor(**best_params)
# Overrides the n_estimators given in best_params (200 -> 400 trees).
reg.set_params(n_estimators=400, n_jobs=-1)

# Train it on the TRAINING split. The scaler and PCA were fitted on this
# split, and the evaluation cells below predict / score on NNdata_test, so
# fitting on NNdata_test would make those numbers in-sample.
reg.fit(NNdata_train, NNlabels_train)

# with open(f'MLmodels/RF{regname}_QSO-SF_regressor.sav', 'rb') as file:
#     reg = pickle.load(file)

In [None]:
# Display the hyper-parameter dict defined in the training cell.
# Note: this is the dict as written, not necessarily the estimator's final
# settings (reg.set_params may have overridden entries afterwards).
best_params

In [None]:
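# Disabled: train vs. validation score for every grid-search candidate.
# NOTE: cannot run as written -- `grid_search` is local to do_grid_search()
# (only best_params_ is returned), and 'mean_train_score' is only present in
# cv_results_ when GridSearchCV is built with return_train_score=True.
# fig, ax = plt.subplots(figsize=(7, 6))

# ax.plot(grid_search.cv_results_['mean_train_score'], marker='s')
# ax.plot(grid_search.cv_results_['mean_test_score'], marker='s')

# plt.show()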

In [None]:
# Predict on the scaled + PCA-transformed test matrix.
L_Arr_pred = reg.predict(NNdata_test)

# Re-read the untransformed test table (first CSV column dropped); the raw
# columns are needed because NNdata_test now lives in PCA space.
NNdata_test_raw = pd.read_csv(f'MLmodels/datasets/dataset_{regname}_test.csv').to_numpy()[:, 1:]

# Magnitude computed from raw column 56 at wavelength 6750.
# NOTE(review): prepare_dataset treats raw columns 53:57 as the 4 BBs paired
# with w_central[-4:], so column 56 would be the last BB; whether 6750
# matches that filter is not visible here -- confirm.
test_mag = flux_to_mag(NNdata_test_raw[:, 56], 6750)
test_mask = (test_mag < 25)

# reg.score is the R^2 of the prediction. The test score is restricted to
# sources with test_mag < 25, the train score uses every training row.
# NOTE(review): the test score is only out-of-sample if `reg` was not fitted
# on NNdata_test -- check the training cell.
print(f'Test score = {reg.score(NNdata_test[test_mask], NNlabels_test[test_mask])}')
print(f'Train score = {reg.score(NNdata_train, NNlabels_train)}')

In [None]:
# Persist the fitted regressor next to the scaler / PCA artefacts.
regressor_path = f'MLmodels/RF{regname}_QSO-SF_regressor.sav'
with open(regressor_path, 'wb') as fh:
    pickle.dump(reg, fh)

In [None]:
def _enclosing_levels(Z, fractions=(0.997, 0.954, 0.683), n_steps=10000):
    '''
    Histogram heights whose super-level sets {Z > h} contain the requested
    fractions of the total counts in Z (one level per entry of `fractions`).
    Returns None when the histogram is empty.
    '''
    total = np.sum(Z)
    if total == 0:
        return None

    # Candidate heights, scanned from the peak downwards so the enclosed
    # fraction is non-decreasing (np.interp needs increasing x values).
    thresholds = np.linspace(np.amin(Z), np.amax(Z), n_steps)[::-1]

    # top_sums[k] = sum of the k largest bins; n_above[i] = number of bins
    # strictly above thresholds[i]. Equivalent to summing Z[Z > h] for each
    # threshold, without the Python loop.
    z_sorted = np.sort(Z, axis=None)
    top_sums = np.concatenate(([0.], np.cumsum(z_sorted[::-1])))
    n_above = z_sorted.size - np.searchsorted(z_sorted, thresholds, side='right')
    enclosed = top_sums[n_above] / total

    return [np.interp(frac, enclosed, thresholds) for frac in fractions]


def _draw_density_contours(ax, L_real, L_retrieved, **contour_kwargs):
    '''
    Draw the 3-, 2- and 1-sigma (99.7 / 95.4 / 68.3 %) density contours of
    (L_real, L_retrieved) on `ax`. Does nothing if no point falls in range.
    '''
    edges = np.linspace(41, 47, 30)
    Z, x, y = np.histogram2d(L_real, L_retrieved, bins=(edges, edges))

    levels = _enclosing_levels(Z)
    if levels is None:
        return

    x_centers = 0.5 * (x[1:] + x[:-1])
    y_centers = 0.5 * (y[1:] + y[:-1])
    ax.contour(x_centers, y_centers, Z.T, levels=levels, **contour_kwargs)


def plot_contours(is_qso, maskk, title='', nb_c=-3):
    '''
    Plot retrieved vs. real log L density contours for the test set.

    Reads the module-level arrays L_lya_test (real values), L_Arr_pred
    (regressor output) and NNdata_L_Arr, plus w_central and w_lya.

    Three contour sets are drawn, each at the 68.3 / 95.4 / 99.7 % levels:
    - blue  (C0): regressor output for sources with is_qso & maskk
    - orange(C1): regressor output for sources with ~is_qso & maskk
    - black dashed: NNdata_L_Arr for every source in maskk
    A 1:1 line and a horizontal luminosity limit derived from the 5-sigma
    depth of filter `nb_c` are overlaid.

    Parameters
    ----------
    is_qso : bool array
        Per-source flag splitting the sample into the two coloured sets.
    maskk : bool array
        Per-source selection applied to all three contour sets.
    title : str, optional
        Figure title; omitted when empty.
    nb_c : int, optional
        Filter index used for the detection limit (indexes w_central and
        the stacked depth table).
    '''
    fig, ax = plt.subplots(figsize=(7, 6))

    sel = is_qso & maskk
    _draw_density_contours(
        ax, L_lya_test[sel].reshape(-1,), L_Arr_pred[sel], colors='C0'
    )

    sel = ~is_qso & maskk
    _draw_density_contours(
        ax, L_lya_test[sel].reshape(-1,), L_Arr_pred[sel], colors='C1'
    )

    # All selected sources, regardless of class, against NNdata_L_Arr.
    sel = (is_qso | ~is_qso) & maskk
    _draw_density_contours(
        ax, L_lya_test[sel].reshape(-1,), NNdata_L_Arr[sel],
        colors='k', linestyles='--'
    )

    x = np.linspace(40, 48, 100)
    ax.plot(x, x, linestyle='--', color='red', label='1:1')

    # Raw strings: '\l' is not a valid escape sequence in a normal literal.
    ax.set_ylabel(r'Retrieved $\log L$', fontsize=15)
    ax.set_xlabel(r'Real $\log L$', fontsize=15)

    ax.set_ylim((41, 47))
    ax.set_xlim((41, 47))

    if len(title) > 0:
        ax.set_title(title, fontsize=20)

    # Detection limit: second column of the NB depth table stacked on top of
    # the BB depth table, indexed by filter.
    detec_lim = np.vstack(
        (
            pd.read_csv('csv/5sigma_depths_NB.csv', header=None),
            pd.read_csv('csv/5sigma_depths_BB.csv', header=None)
        )
    )[:, 1]

    # NOTE(review): the factor 3 applied to the depth flux is not explained
    # in this notebook -- confirm its meaning.
    flambda_lim = mag_to_flux(detec_lim[nb_c], w_central[nb_c]) * 3

    ew0_lim = 20 # A
    z = w_central[nb_c] / w_lya - 1
    Fline_lim = ew0_lim * flambda_lim * (1 + z)
    dL = cosmo.luminosity_distance(z).to(u.cm).value
    L_lim = np.log10(Fline_lim * 4*np.pi * dL**2)

    ax.axhline(L_lim, ls='--', color='green', label='L limit')

    ax.legend(fontsize=15)
    # plt.savefig(f'/home/alberto/Desktop/{title}')
    plt.show()

In [None]:
# Class flag for the test set: the first half of the rows is flagged True,
# the rest False.
n_test = len(NNlabels_test)
is_qso_test = np.arange(n_test) < int(n_test / 2)

# Real values used on the x axis of plot_contours.
L_lya_test = NNlabels_test

# Filter to inspect, and its human-readable tag for the title.
nb_c = 7
ftags = load_filter_tags()

# Per-source filter index from the last raw column; keep only sources that
# fall in filter nb_c.
NB_lya_position = NB_z(NNdata_test_raw[:, -1].reshape(-1,))
maskk = NB_lya_position == nb_c

plot_contours(is_qso_test, maskk, title=f'{regname}, {ftags[nb_c]}', nb_c=nb_c)

In [None]:
# Learning curve: refit the forest on 1/5, 2/5, ... 5/5 of the TRAINING set
# and score each fit on its own subset and on the held-out test set.
batch_size = len(NNdata_train) // 5
train_sizes = np.array([batch_size * (j + 1) for j in range(5)]).astype(int)
train_score = np.empty(5)
test_score = np.empty(5)

# Use a separate estimator with the same settings so the fitted `reg`
# (already scored and pickled above) is not overwritten by these refits.
curve_reg = RandomForestRegressor(**reg.get_params())
curve_reg.set_params(n_jobs=-1)

# The training rows are stored in file order (prepare_dataset does not
# shuffle), so plain prefixes would give unrepresentative small subsets.
# Draw the nested subsets from a fixed random permutation instead.
subset_order = np.random.default_rng(0).permutation(len(NNdata_train))

for k in range(5):
    subset = subset_order[: train_sizes[k]]

    curve_reg.fit(NNdata_train[subset], NNlabels_train[subset])

    train_score[k] = curve_reg.score(
        NNdata_train[subset],
        NNlabels_train[subset]
    )
    test_score[k] = curve_reg.score(
        NNdata_test,
        NNlabels_test
    )

In [None]:
# Learning-curve figure: score as a function of the number of training
# samples used for the fit.
fig, ax = plt.subplots(figsize=(7, 6))

# train_sizes holds absolute sample counts (multiples of batch_size), so the
# x axis is labelled as a size, not a fraction.
x_ticks = train_sizes
ax.plot(x_ticks, train_score, marker='s', label='Training score')
ax.plot(x_ticks, test_score, marker='s', label='Test score')

ax.legend(fontsize=15)

ax.set_ylabel('Score', fontsize=15)
ax.set_xlabel('Training set size', fontsize=15)
ax.set_title(f'Random Forest, {regname}', fontsize=20)

ax.tick_params(labelsize=10)

plt.show()

In [None]:
# Indices of the (up to) 23 most important input features of the forest,
# most important first. The inputs are PCA components, so these are
# component indices rather than original columns.
a = reg.feature_importances_
np.argsort(a)[::-1][:23]

In [None]:
# Reload the PCA written by prepare_dataset (unpickling executes code from
# the file, so only load artefacts produced by this notebook).
pca_path = f'MLmodels/RF{regname}_QSO-SF_pca.sav'
with open(pca_path, 'rb') as fh:
    pca = pickle.load(fh)

In [None]:
# Weights of the first principal component over the scaled input features
# (row 0 of the fitted PCA's components_ matrix).
pca.components_[0]