In [None]:
import sys
sys.path.insert(0, '..')

from paus_utils import w_central, z_NB

from jpasLAEs.utils import flux_to_mag, bin_centers

import pickle

import numpy as np

from load_paus_mocks import load_mock_dict

# from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor
# from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn import model_selection

import matplotlib.pyplot as plt
import matplotlib
# Global matplotlib setting: base font size used by every figure drawn after
# this point in the notebook.
matplotlib.rcParams.update({'font.size': 12})

In [None]:
# --- Configuration: which pre-computed mock dictionary to load ---------------
# NOTE(review): `savedir` is an absolute, user-specific path; the notebook will
# only run on a machine that has this directory.
field_name = 'W3'
savedir = '/home/alberto/almacen/PAUS_data/LF_corrections'
nb_min = 0
nb_max = 18

# The pickle is expected to hold a dict keyed by mock name.
# NOTE(review): pickle.load executes arbitrary code from the file -- only load
# files you produced yourself.
mock_path = f'{savedir}/mock_dict_{field_name}_nb{nb_min}-{nb_max}.pkl'
with open(mock_path, 'rb') as mock_file:
    mock_dict = pickle.load(mock_file)

# Discard the mocks that are not used in this notebook (KeyError if one of them
# is missing from the file).
for unused_mock in ('SFG', 'GAL'):
    del mock_dict[unused_mock]
# del mock_dict['QSO_cont']

In [None]:
# Get the minimum number of candidates to set the set length.
#
# For every mock, count the entries flagged in 'nice_lya_0'. For the QSO mocks
# only the entries whose NB-derived redshift lies within `z_tolerance` of zspec
# are counted. `set_len` is the smallest of those counts.
# NOTE(review): `set_len` is only referenced by the commented-out random
# sub-sampling in the next cell -- confirm whether it is still needed.
z_tolerance = 0.12
qso_mock_names = ('QSO_LAEs_loL', 'QSO_LAEs_hiL', 'QSO_cont')

N_candidates_list = []
for mock_name, mock in mock_dict.items():
    nice_lya = np.asarray(mock['nice_lya_0'])

    if mock_name in qso_mock_names:
        # Only the QSO mocks need the redshift-agreement mask.
        z_phot = z_NB(mock['lya_NB'])
        nice_z = np.abs(np.asarray(mock['zspec']) - z_phot) < z_tolerance
        n_candidates = np.sum(nice_lya[nice_z])
    else:
        n_candidates = np.sum(nice_lya)

    # Vectorised sum (the builtin sum() walks the array element by element);
    # stored as a plain int so the printed list stays readable.
    N_candidates_list.append(int(n_candidates))

set_len = min(N_candidates_list)
print(N_candidates_list)
print(f'{set_len=}')

In [None]:
# Make the set for each class.
#
# Feature matrix layout (one row per selected source):
#   cols  0-39 : mock['flx'] rows 0-39, scaled by 1e17   ("NBs")
#   col   40   : mock['r_mag']
#   cols 41-45 : mock['flx'] rows 40-44, scaled by 1e17  ("BBs")
#   col   46   : mock['lya_NB']
#
# A source is selected when its NB-derived redshift lies within `z_tolerance`
# of zspec and its r_mag is positive.
z_tolerance = 0.12

# Per-mock pieces are collected in lists and concatenated once after the loop
# (instead of re-allocating the full arrays on every iteration).
set_chunks = []
rmag_chunks = []
zspec_chunks = []
L_chunks = []
zphot_chunks = []
nice_z_list = []

for mock_name, mock in mock_dict.items():
    zspec_mock = np.asarray(mock['zspec'])
    z_phot = z_NB(mock['lya_NB'])
    nice_z = np.abs(zspec_mock - z_phot) < z_tolerance

    # np.random.seed(299792458)
    # selection = np.random.choice(np.arange(mock_len)[nice_lya], set_len,
    #                              replace=False)
    selection = np.flatnonzero(nice_z & (mock['r_mag'] > 0))

    set_chunks.append(np.hstack([
        mock['flx'][:40, selection].T * 1e17,  # NBs
        mock['r_mag'][selection].reshape(-1, 1),
        mock['flx'][40:45, selection].T * 1e17,  # BBs
        mock['lya_NB'][selection].reshape(-1, 1),
    ]))

    # Magnitude computed from flux row -4 with the matching central wavelength.
    # NOTE(review): presumably this is the r band -- confirm against the
    # filter ordering of mock['flx'] / w_central.
    rmag_chunks.append(flux_to_mag(mock['flx'][-4, selection], w_central[-4]))
    zspec_chunks.append(zspec_mock[selection])
    L_chunks.append(mock['L_lya'][selection])
    zphot_chunks.append(z_phot[selection])

    # All True by construction: `selection` only holds indices where nice_z is
    # True. Kept because later cells may rely on the list.
    nice_z_list.append(nice_z[selection])

if not set_chunks:
    raise ValueError('mock_dict is empty: there is nothing to build the '
                     'train/test set from')

tt_set = np.vstack(set_chunks)
rmag = np.concatenate(rmag_chunks)
zspec = np.concatenate(zspec_chunks)
L_Arr = np.concatenate(L_chunks)
zphot = np.concatenate(zphot_chunks)

# Labels are z_spec
labels = zspec

In [None]:
def preprocess_features(features, mag_scale=30):
    """Return a normalised copy of a feature matrix built by the set-making cell.

    The same transformation must be applied to the training set, the test set
    and any data later fed to the saved regressor, so it lives in one place.

    Parameters
    ----------
    features : ndarray of shape (n_sources, 47)
        Columns 0-39 are the NB fluxes, column 40 is r_mag, columns 41-45 are
        the BB fluxes and column 46 is lya_NB.
    mag_scale : float, default 30
        Divisor applied to columns 40 and 46.

    Returns
    -------
    ndarray
        New array of the same shape and dtype; the input is left untouched.
    """
    scaled = features.copy()
    # Each source's NB fluxes are divided by their sum over the 40 NBs ...
    scaled[:, :40] /= np.sum(scaled[:, :40], axis=1, keepdims=True)
    # ... and likewise its BB fluxes by their sum over the 5 BBs.
    scaled[:, 41:46] /= np.sum(scaled[:, 41:46], axis=1, keepdims=True)
    # NOTE(review): a source whose fluxes sum to zero yields NaN/inf here.
    scaled[:, 40] /= mag_scale
    scaled[:, 46] /= mag_scale
    return scaled


# Train/Test split
split_seed = 299792458
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    tt_set, labels, test_size=0.2, random_state=split_seed)

## Pre-processing ##
x_train = preprocess_features(x_train)
x_test = preprocess_features(x_test)

## Scaler (disabled alternative to the fixed mag_scale division)
# scaler = MinMaxScaler()
# Apply scaling only to fluxes
# scaler.fit(x_train[:, :46])
# x_train[:, :46] = scaler.transform(x_train[:, :46])
# x_test[:, :46] = scaler.transform(x_test[:, :46])

# PCA
# pca = PCA(n_components=0.99, svd_solver='full')

# pca.fit(x_train)
# x_train = pca.transform(x_train)
# x_test = pca.transform(x_test)

print(x_train.shape)

In [None]:
def do_grid_search(algorithm, search_mode='random', x=None, y=None):
    """Run a 3-fold cross-validated hyper-parameter search.

    Parameters
    ----------
    algorithm : {'nn', 'rf'}
        'nn' tunes an ``MLPRegressor``, 'rf' a ``RandomForestRegressor``.
    search_mode : {'random', 'grid'}, default 'random'
        'grid' evaluates every combination of the grid with ``GridSearchCV``;
        'random' samples combinations with ``RandomizedSearchCV`` (using its
        default number of iterations).
    x, y : array-like, optional
        Features and targets to fit on. When omitted, the notebook-level
        ``x_train`` and ``y_train`` are used.

    Returns
    -------
    dict
        ``best_params_`` of the fitted search object.

    Raises
    ------
    ValueError
        If ``algorithm`` or ``search_mode`` is not one of the supported values.
    """
    # Validate both arguments up front so a typo fails immediately rather than
    # after the estimator and grid have been set up.
    if algorithm not in ('nn', 'rf'):
        raise ValueError(
            f"Model not known: {algorithm!r} (expected 'nn' or 'rf')")
    if search_mode not in ('grid', 'random'):
        raise ValueError(
            f"Unknown search_mode {search_mode!r} (expected 'grid' or 'random')")

    # Candidate hyper-parameter values for the chosen estimator.
    if algorithm == 'nn':
        param_grid = {
            'hidden_layer_sizes': [(60, 60), (60, 60, 60),
                                   (40, 40, 20),
                                   (60, 40, 60)],
            'solver': ['adam'],
            'alpha': [1e-3, 1e-4, 1e-5],
            'batch_size': [50, 100, 250],
            'learning_rate': ['adaptive', 'constant'],
            'max_iter': [10000]
        }
        model = MLPRegressor()
    else:
        param_grid = {
            'random_state': [22],
            'n_estimators': [50, 100, 200],
            'bootstrap': [True, False],
            'max_depth': [20, 50, 70, 100],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        }
        model = RandomForestRegressor()

    # Instantiate the search object.
    if search_mode == 'grid':
        grid_search = GridSearchCV(
            estimator=model, param_grid=param_grid,
            cv=3, n_jobs=-1, pre_dispatch='2*n_jobs',
            verbose=3,
        )
    else:
        grid_search = RandomizedSearchCV(
            estimator=model, param_distributions=param_grid,
            cv=3, n_jobs=-1, pre_dispatch='2*n_jobs',
            verbose=3,
        )

    # Fall back to the notebook's training set when no data is passed in.
    features = x_train if x is None else x
    targets = y_train if y is None else y
    grid_search.fit(features, targets)

    return grid_search.best_params_

# Which estimator to tune ('nn' or 'rf') and how to explore its grid
# ('grid' or 'random'); both strings are interpreted by do_grid_search.
model = 'rf'
search_mode = 'grid'

# NOTE: with model='rf' and search_mode='grid' this is an exhaustive search over
# 216 parameter combinations x 3 CV folds (648 fits) and can take a long time.
best_params = do_grid_search(model, search_mode=search_mode)
# Presumably the best parameters found in earlier runs; uncomment (and comment
# out the search above) to skip the search -- TODO confirm they are current.
# if model == 'nn':
#     best_params = {'solver': 'adam', 'max_iter': 10000, 'learning_rate': 'adaptive', 'hidden_layer_sizes': (40, 40, 20), 'batch_size': 50, 'alpha': 0.001}
# elif model == 'rf':
#     best_params = {'random_state': 22, 'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_depth': 20, 'bootstrap': False}
print(best_params)

In [None]:
# Refit the chosen estimator on the full training set with the best
# hyper-parameters, then report the score on the train and test sets.
if model == 'nn':
    cl_best = MLPRegressor(**best_params)
elif model == 'rf':
    cl_best = RandomForestRegressor(**best_params)
else:
    # Without this branch an unknown `model` would leave `cl_best` undefined,
    # or silently reuse one left over from a previous run of this cell.
    raise ValueError(f"Model not known: {model!r} (expected 'nn' or 'rf')")

cl_best.fit(x_train, y_train)
test_score = cl_best.score(x_test, y_test)
train_score = cl_best.score(x_train, y_train)
print(f'Score\n\nTrain: {train_score:0.3f}\nTest: {test_score:0.3f}')

In [None]:
# Predict test: values returned by the fitted regressor for the held-out test
# features (compared against y_test in the plotting cell at the end).
pred_test = cl_best.predict(x_test)

In [None]:
import os

# Save the regressor
# NOTE(review): the file name says "NN" even when `model` is 'rf' -- confirm
# what the code that loads this file expects before renaming it.
# NOTE(review): only the estimator is stored; whoever loads it must apply the
# same feature pre-processing as the train/test cell before calling predict.
save_dir = '/home/alberto/almacen/PAUS_data/ML_z_reg'
os.makedirs(save_dir, exist_ok=True)
regressor_path = f'{save_dir}/z_fit_NN_reg.sav'
with open(regressor_path, 'wb') as out_file:
    pickle.dump(cl_best, out_file)
# with open(f'{save_dir}/source_scaler_z_fit.sav', 'wb') as file:
#     pickle.dump(scaler, file)

In [None]:
# Split the auxiliary per-source arrays with the same test size and seed as the
# feature/label split. A single call shuffles all four arrays with the same
# permutation, so they are guaranteed to stay aligned with each other.
(rmag_train, rmag_test,
 zspec_train, zspec_test,
 L_Arr_train, L_Arr_test,
 zphot_train, zphot_test) = model_selection.train_test_split(
    rmag, zspec, L_Arr, zphot, test_size=0.2, random_state=split_seed)

# The labels are zspec, so the zspec test split must reproduce y_test exactly;
# anything else means the auxiliary arrays are out of step with x_test/y_test.
assert np.array_equal(zspec_test, y_test), \
    'auxiliary split is not aligned with the feature/label split'

In [None]:
# Redshift residuals on the test set: the NB-derived estimate (zphot) versus
# the regressor prediction, both relative to the true label (zspec).
nb_residual = zphot_test - y_test
ml_residual = pred_test - y_test

fig, ax = plt.subplots()

ax.scatter(y_test, nb_residual, s=0.1, label=r'$z_\mathrm{NB}$')
ax.scatter(y_test, ml_residual, s=0.1, label=r'$z_\mathrm{pred}$')
ax.axhline(0, c='k')

# Mean and standard deviation of the residuals: regressor first, then NB.
print(np.mean(ml_residual))
print(np.std(ml_residual))
print()
print(np.mean(nb_residual))
print(np.std(nb_residual))

ax.set_ylim(-0.2, 0.2)
ax.set_xlabel(r'$z_\mathrm{spec}$')
ax.set_ylabel(r'$z - z_\mathrm{spec}$')
# Enlarge the legend markers: the scatter points themselves are tiny (s=0.1).
ax.legend(markerscale=20)

plt.show()