In [None]:
from my_functions import *

import numpy as np

import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV

import pickle

In [None]:
# Lyman-alpha wavelength (presumably rest-frame, in Angstrom -- TODO confirm units)
w_lya = 1215.67

# Filter metadata returned by helpers that are not defined in this notebook
# (presumably provided by the `my_functions` star import -- verify).
w_central = central_wavelength()
nb_fwhm_Arr = nb_fwhm(range(60))

In [None]:
# Feature layout of the input table before PCA (120 columns in total):
#   - the fluxes of the first 55 NBs
#   - the errors of the first 55 NBs
#   - 4 BB fluxes
#   - 4 BB errors
#   - the estimated L
#   - the estimated z
# NOTE(review): the log below is applied to columns [0, 59). Under the ordering
# listed above that would be 55 NB fluxes + 4 NB errors, not the NB + BB
# fluxes -- confirm the real column order of the CSV.

# Fixed seed so that the shuffle (and therefore the train/test split) is the
# same on every Restart & Run All.
SEED = 42
rng = np.random.default_rng(SEED)

# The data set is the nice_lya sample; the first CSV column is dropped.
NNdata = pd.read_csv('MLmodels/dataset100_000.csv').to_numpy()[:, 1:]
NNdata_L_input = pd.read_csv('MLmodels/tags100_000.csv').to_numpy()[:, 1:]

N_sources_NN = NNdata.shape[0]

# Not read anywhere else in the visible cells; kept in case other cells use it.
is_qso = np.ones(N_sources_NN).astype(bool)

# QSO flag per source, sized from the data instead of a hard-coded 100000.
# NOTE(review): assumes the first 50000 rows of the CSV are the QSOs -- confirm.
N_QSO = 50000
NNdata_is_qso = np.ones(N_sources_NN).astype(bool)
NNdata_is_qso[N_QSO:] = False

# Shuffle features, labels and flags with the same permutation
shuffle_idx = rng.permutation(N_sources_NN)
NNdata = NNdata[shuffle_idx]
NNdata_L_input = NNdata_L_input[shuffle_idx]
NNdata_is_qso = NNdata_is_qso[shuffle_idx]

# Missing labels are replaced by 0
NNdata_L_input[np.isnan(NNdata_L_input)] = 0

# Take logs of the first 59 columns. Negative inputs give NaN and zeros give
# -inf; both are mapped to the -99 sentinel, large values are capped at 99.
NNdata[:, :55 + 4] = np.log10(NNdata[:, :55 + 4])

NNdata[np.isnan(NNdata)] = -99.
NNdata[np.isneginf(NNdata)] = -99.  # log10(0); would make the scaler fail
NNdata[NNdata > 99.] = 99.

# Rescale data
# NOTE(review): the scaler and the PCA are fitted on the full sample, i.e.
# including the rows that later become the test set. Consider fitting them on
# the training rows only to avoid information leaking into the test score.
mms = MinMaxScaler()
mms.fit(NNdata)
NNdata = mms.transform(NNdata)
with open('MLmodels/RF_QSO-SF_scaler.sav', 'wb') as file:
    pickle.dump(mms, file)

# Apply PCA, keeping the components that explain 95% of the variance
pca = PCA(n_components=0.95, svd_solver='full')

pca.fit(NNdata)
with open('MLmodels/RF_QSO-SF_pca.sav', 'wb') as file:
    pickle.dump(pca, file)

NNdata = pca.transform(NNdata)

# Split dataset. shuffle=False: the data were already shuffled above, so the
# test set is simply the last 20% of the rows.
NNdata_train, NNdata_test, NNlabels_train, NNlabels_test =\
    train_test_split(NNdata, NNdata_L_input, test_size=0.2, shuffle=False)

# Flatten the (N, 1) label columns into 1-D arrays
NNlabels_train = NNlabels_train.reshape(-1,)
NNlabels_test = NNlabels_test.reshape(-1,)

N_train = len(NNlabels_train)
N_test = len(NNlabels_test)

In [None]:
# Hyper-parameter grid to scan exhaustively, based on the results of a
# previous random search.
param_grid = dict(
    bootstrap=[False, True],
    max_depth=[40, 50, 60, 100],
    max_features=[0.1, 0.2, 0.3, 0.4, 0.5],
    min_samples_leaf=[2, 3, 4, 5, 10],
    min_samples_split=[3, 4, 5, 10],
    n_estimators=[150],
)

# Base model whose parameters are overridden by each grid point
rf = RandomForestRegressor()

# 5-fold cross-validated grid search; train scores are stored as well
grid_search = GridSearchCV(
    rf,
    param_grid,
    cv=5,
    n_jobs=-1,
    verbose=3,
    return_train_score=True,
)

grid_search.fit(NNdata_train, NNlabels_train)

In [None]:
# True labels of the held-out test set, read by plot_contours().
# NNlabels_test already holds only the test rows, so it must not be sliced
# with N_train again: NNlabels_test[N_train:] is an empty array because
# N_train is larger than the test-set length.
L_lya_test = NNlabels_test

# The regressor
# Previously hard-coded hyper-parameters, kept for reference:
# best_params = {
#     'bootstrap': False,
#     'max_depth': 50,
#     'max_features': 'sqrt',
#     'min_samples_leaf': 3,
#     'min_samples_split': 4,
#     'n_estimators': 200,
#     'verbose': True,
#     'n_jobs': -1
# }
best_params = grid_search.best_params_
reg = RandomForestRegressor(**best_params)
# The grid search used 150 trees; the final model is trained with 500.
reg.set_params(n_estimators=500)

# Train it
reg.fit(NNdata_train, NNlabels_train)

In [None]:
# Predictions for the held-out test set (read later by plot_contours)
L_Arr_pred = reg.predict(NNdata_test)

# Score of the fitted regressor on the same test set
test_score = reg.score(NNdata_test, NNlabels_test)
print(f'Score = {test_score}')

In [None]:
# Persist the trained regressor to disk
regressor_path = 'MLmodels/RF_QSO-SF_regressor.sav'
with open(regressor_path, mode='wb') as fout:
    pickle.dump(reg, fout)

In [None]:
def _sigma_contour_levels(Z, N_bins=10000):
    """Return the histogram heights enclosing 99.7%, 95.4% and 68.3% of the counts.

    Parameters
    ----------
    Z : ndarray
        2D histogram of counts.
    N_bins : int, optional
        Number of trial heights scanned between ``Z.min()`` and ``Z.max()``.

    Returns
    -------
    list of float or None
        Contour levels ordered from the 3-sigma to the 1-sigma height, or
        ``None`` when the histogram contains no counts.
    """
    total = np.sum(Z)
    if total == 0:
        # Nothing to enclose; dividing by zero would only produce NaN levels.
        return None

    # Trial heights from the highest to the lowest, so that the enclosed
    # fraction grows along the array as np.interp requires.
    H_Arr = np.linspace(np.amin(Z), np.amax(Z), N_bins)[::-1]
    fact_up_Arr = np.array([np.sum(Z[Z > H]) for H in H_Arr]) / total

    # 3 sigma, 2 sigma, 1 sigma
    return [np.interp(frac, fact_up_Arr, H_Arr) for frac in (0.997, 0.954, 0.683)]


def plot_contours(is_qso, title='', nb_c=-3):
    """Plot retrieved vs. real log L contours for the two test populations.

    Reads the notebook globals ``L_lya_test`` (true labels), ``L_Arr_pred``
    (predictions), ``w_central`` and ``w_lya``, plus the two
    ``csv/5sigma_depths_*.csv`` files.

    Parameters
    ----------
    is_qso : array-like of bool
        Mask over the test set; True entries are drawn in 'C0' and the
        complementary entries in 'C1'.
    title : str, optional
        Figure title; nothing is drawn when empty.
    nb_c : int, optional
        Filter index used to compute the luminosity limit line.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    fig, ax = plt.subplots(figsize=(7, 6))

    # Guarantee a boolean mask so that `~` is a logical negation
    is_qso = np.asarray(is_qso, dtype=bool)
    L_bins = np.linspace(41, 47, 30)

    # Same contour recipe for both populations, only mask and colour differ
    for mask, color in ((is_qso, 'C0'), (~is_qso, 'C1')):
        Z, x, y = np.histogram2d(
            L_lya_test[mask].reshape(-1,), L_Arr_pred[mask],
            bins=(L_bins, L_bins)
        )

        levels = _sigma_contour_levels(Z)
        if levels is None:
            # No source of this population falls inside the histogram range
            continue

        x_centers = 0.5 * (x[1:] + x[:-1])
        y_centers = 0.5 * (y[1:] + y[:-1])

        ax.contour(x_centers, y_centers, Z.T, levels=levels, colors=color)

    one_to_one = np.linspace(40, 48, 100)
    ax.plot(one_to_one, one_to_one, linestyle='--', color='red', label='1:1')

    # Raw strings: '\l' is not a valid escape sequence in a normal literal
    ax.set_ylabel(r'Retrieved $\log L$', fontsize=15)
    ax.set_xlabel(r'Real $\log L$', fontsize=15)

    ax.set_ylim((41, 47))
    ax.set_xlim((41, 47))

    if len(title) > 0:
        ax.set_title(title, fontsize=20)

    # Detection limit: second column of the NB and BB depth tables, stacked
    detec_lim = np.vstack(
        (
            pd.read_csv('csv/5sigma_depths_NB.csv', header=None),
            pd.read_csv('csv/5sigma_depths_BB.csv', header=None)
        )
    )[:, 1]

    # NOTE(review): the factor 3 is not explained in the notebook -- confirm.
    flambda_lim = mag_to_flux(detec_lim[nb_c], w_central[nb_c]) * 3

    # Luminosity of a line with rest-frame EW = ew0_lim at the redshift that
    # places w_lya at the central wavelength of filter nb_c.
    ew0_lim = 20 # A
    z = w_central[nb_c] / w_lya - 1
    Fline_lim = ew0_lim * flambda_lim * (1 + z)
    dL = cosmo.luminosity_distance(z).to(u.cm).value
    L_lim = np.log10(Fline_lim * 4*np.pi * dL**2)

    ax.axhline(L_lim, ls='--', color='green', label='L limit')

    ax.legend(fontsize=15)
    # plt.savefig(f'/home/alberto/Desktop/{title}')
    plt.show()

In [None]:
# QSO flags of the test rows. The split was done with shuffle=False, so the
# test set is everything after the first N_train rows; slicing with N_train
# avoids hard-coding the test-set size.
plot_contours(NNdata_is_qso[N_train:])

In [None]:
from sklearn.model_selection import learning_curve

In [None]:
reg.set_params(verbose=1)
# learning_curve returns (train_sizes_abs, train_scores, test_scores).
# The labels are flattened to 1-D, as was done for the train/test labels,
# instead of passing the (N, 1) column array.
a = learning_curve(reg, NNdata, NNdata_L_input.reshape(-1,))

In [None]:
fig, ax = plt.subplots(figsize=(7, 6))

# a = (train_sizes_abs, train_scores, test_scores); the score arrays have one
# row per training size and one column per CV fold.
train_sizes, train_scores, test_scores = a[:3]

# Average over the folds (axis=1). Dividing the sum by the number of training
# sizes is only right when it happens to equal the number of folds.
ax.plot(train_sizes, train_scores.mean(axis=1), marker='s', label='train')
ax.plot(train_sizes, test_scores.mean(axis=1), marker='s', label='test')

ax.legend(fontsize=15)

ax.set_xlabel('Number of training samples', fontsize=15)
ax.set_ylabel('score', fontsize=15)

plt.show()