In [None]:
from my_functions import *

import numpy as np

import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV, KFold

import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams.update({'font.size': 12})

import time

import pickle

In [None]:
# Module-level constants shared by the cells below. `central_wavelength` and
# `nb_fwhm` are not defined in this notebook -- presumably they come from the
# star import of `my_functions` (TODO confirm).
w_central = central_wavelength()  # indexed by filter position inside prepare_dataset
nb_fwhm_Arr = nb_fwhm(range(60))  # evaluated for indices 0..59; not used in the cells shown here
w_lya = 1215.67  # presumably the Lyman-alpha rest-frame wavelength in Angstrom; not used in the cells shown here

In [None]:
def _fluxes_to_relative_mags(data):
    '''
    Convert, in place, the flux columns of `data` to magnitudes.

    For every row:
    - columns 0-52 (NB fluxes) become magnitudes relative to the magnitude
      of one reference NB of the same row, chosen from the row's last
      column through `NB_z`;
    - columns 53-56 (BB fluxes) become plain magnitudes.

    NOTE(review): the reference index is offset by 2 twice (once in the
    `enumerate` argument, once when indexing), i.e. it is `NB_z(...) - 4`,
    and a negative result silently wraps around. This is kept exactly as
    in the original code -- TODO confirm the double offset is intended.
    '''
    NB_lya_position = NB_z(data[:, -1].reshape(-1,))
    for i, nb in enumerate(NB_lya_position - 2):
        ref = nb - 2
        # Both magnitudes are computed from the original fluxes before the
        # row is overwritten.
        nb_mags = flux_to_mag(data[i, :53], w_central[:53])
        ref_mag = flux_to_mag(data[i, :53][ref], w_central[ref])
        data[i, :53] = nb_mags - ref_mag
        data[i, 53 : 53 + 4] = flux_to_mag(data[i, 53 : 53 + 4], w_central[-4:])


def prepare_dataset(regname):
    '''
    Load the train/test sets of region `regname` and build the model inputs.

    Columns of the dataset CSVs as used here (after dropping the first CSV
    column):
    - 0-52: NB fluxes
    - 53-56: BB fluxes
    - second to last: returned untouched (test set only) as `NNdata_L_Arr`
    - last: passed to `NB_z` to select the reference NB
    The original note listed 53 NB fluxes, 53 NB errors, 4 BB fluxes,
    4 BB errors, the estimated L and the estimated z, "TOTAL = 120"; those
    items add up to 116 -- TODO confirm the real column count.

    Processing steps:
    1. fluxes -> magnitudes (NBs relative to a reference NB);
    2. non-finite features -> 0, NaN training labels -> 0;
    3. MinMaxScaler, then PCA keeping 95% of the variance.

    The scaler and the PCA are fitted on the TRAINING set only and then
    applied to both sets, so that no test-set statistics leak into the
    features (the previous version fitted both on the test set).

    Side effects: prints the test-set shape twice and writes
    `MLmodels/RF{regname}_QSO-SF_scaler.sav` and
    `MLmodels/RF{regname}_QSO-SF_pca.sav`.

    Returns
    -------
    (NNdata_L_input, NNdata_train, NNdata_test, NNlabels_train,
     NNlabels_test, NNdata_L_Arr)
        `NNdata_L_input` and `NNlabels_train` hold the same training labels.
    '''
    dataset_dir = 'MLmodels/datasets'

    NNdata = pd.read_csv(f'{dataset_dir}/dataset_{regname}_train.csv').to_numpy()[:, 1:]
    NNdata_L_input = pd.read_csv(f'{dataset_dir}/tags_{regname}_train.csv').to_numpy()[:, 1:]

    # Parse the test CSV once. The second-to-last column is copied so that the
    # in-place edits of `NNdata_test` below cannot alter it.
    test_table = pd.read_csv(f'{dataset_dir}/dataset_{regname}_test.csv').to_numpy()
    NNdata_L_Arr = test_table[:, -2].copy()
    NNdata_test = test_table[:, 1:]
    NNlabels_test = pd.read_csv(f'{dataset_dir}/tags_{regname}_test.csv').to_numpy()[:, 1:]
    print(NNdata_test.shape)

    # Take the fluxes relative to the selected NB, as magnitudes
    _fluxes_to_relative_mags(NNdata)
    _fluxes_to_relative_mags(NNdata_test)

    print(NNdata_test.shape)

    # Rows are kept in file order (the shuffle was an identity permutation).
    NNdata_L_input = NNdata_L_input.reshape(-1,).copy()
    NNdata_L_input[np.isnan(NNdata_L_input)] = 0

    # The magnitude conversion can produce inf/NaN; neutralise them.
    NNdata[~np.isfinite(NNdata)] = 0
    NNdata_test[~np.isfinite(NNdata_test)] = 0

    # Rescale data (fit on the training set only)
    mms = MinMaxScaler()
    mms.fit(NNdata)
    NNdata = mms.transform(NNdata)
    NNdata_test = mms.transform(NNdata_test)
    with open(f'MLmodels/RF{regname}_QSO-SF_scaler.sav', 'wb') as file:
        pickle.dump(mms, file)

    # Apply PCA (fit on the training set only)
    pca = PCA(n_components=0.95, svd_solver='full')
    pca.fit(NNdata)
    with open(f'MLmodels/RF{regname}_QSO-SF_pca.sav', 'wb') as file:
        pickle.dump(pca, file)
    NNdata_train = pca.transform(NNdata)
    NNdata_test = pca.transform(NNdata_test)

    NNlabels_train = NNdata_L_input
    NNlabels_test = NNlabels_test.reshape(-1,)

    return NNdata_L_input, NNdata_train, NNdata_test, NNlabels_train, NNlabels_test, NNdata_L_Arr

# regname = 'mag23-24'
# NNdata_L_input, NNdata_train, NNdata_test, NNlabels_train, NNlabels_test, NNdata_L_Arr = prepare_dataset(regname)

In [None]:
def do_grid_search(NNdata_test, NNlabels_test):
    '''
    Exhaustive hyper-parameter search for a RandomForestRegressor.

    Despite their names, the two arguments are simply the feature matrix
    and the target vector the search is cross-validated on.

    Every combination of the grid below is scored with 3-fold
    cross-validation, using all available cores.

    NOTE(review): the folds are contiguous (KFold without shuffling) and no
    random seed is set, so results depend on row order and vary between
    runs.

    Returns the dictionary of the best-scoring parameters.
    '''
    # Candidate values, narrowed down from an earlier random search
    search_space = dict(
        bootstrap=[False],
        max_depth=[80, 100, 125],
        max_features=[0.3],
        min_samples_leaf=list(range(3, 9)),
        min_samples_split=[3, 4, 5, 6, 7, 10, 20, 40, 50],
        n_estimators=[100],
    )

    searcher = GridSearchCV(
        estimator=RandomForestRegressor(),
        param_grid=search_space,
        cv=KFold(3),
        n_jobs=-1,
        pre_dispatch='2*n_jobs',
        verbose=1,
    )

    # `fit` returns the fitted search object itself
    return searcher.fit(NNdata_test, NNlabels_test).best_params_

# best_params = do_grid_search(NNdata_train, NNlabels_train)

In [None]:
def grid_search_and_train(regname):
    '''
    Tune, train, evaluate and save the regressor of region `regname`.

    Steps:
    1. build the train/test arrays with `prepare_dataset`;
    2. pick the hyper-parameters with `do_grid_search` on the training set;
    3. refit a 400-tree forest with those parameters on the training set;
    4. print the R^2 score on the test sources with magnitude < 25
       ("Test score") and on the training set ("Train score");
    5. pickle the model to `MLmodels/RF{regname}_QSO-SF_regressor.sav`.

    The previous version ran the search and the fit on the test arrays, so
    the printed "Test score" was an in-sample score and "Train score" was
    computed on data the model had never seen. The model is now fitted on
    the training arrays only, which makes both labels truthful.
    '''
    print(f'#### {regname} ####')
    _, NNdata_train, NNdata_test, NNlabels_train, NNlabels_test, _ = prepare_dataset(regname)

    # Hyper-parameters are selected on training data only
    best_params = do_grid_search(NNdata_train, NNlabels_train)

    print('Best params:')
    print(best_params)

    reg = RandomForestRegressor(**best_params)
    # The search uses 100 trees for speed; the final model uses 400
    reg.set_params(n_estimators=400, n_jobs=-1)
    reg.fit(NNdata_train, NNlabels_train)

    # The untransformed test fluxes are needed to apply the magnitude cut
    NNdata_test_raw = pd.read_csv(f'MLmodels/datasets/dataset_{regname}_test.csv').to_numpy()[:, 1:]

    # Column 56 is the last of the four BB flux columns (53-56) handled in
    # prepare_dataset. NOTE(review): 6750 is the wavelength handed to
    # flux_to_mag -- TODO confirm it is the right value for this band.
    test_mag = flux_to_mag(NNdata_test_raw[:, 56], 6750)
    test_mask = (test_mag < 25)

    print(f'Test score = {reg.score(NNdata_test[test_mask], NNlabels_test[test_mask])}')
    print(f'Train score = {reg.score(NNdata_train, NNlabels_train)}')

    with open(f'MLmodels/RF{regname}_QSO-SF_regressor.sav', 'wb') as file:
        pickle.dump(reg, file)

# Train and save one regressor per magnitude region, brightest first
for regname in ('mag15-23', 'mag23-23.5'):
    grid_search_and_train(regname)