In [None]:
from my_functions import *

import numpy as np

import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams.update({'font.size': 12})

import pickle

import dask.array as da
import dask.dataframe as dd
from dask_ml.preprocessing import MinMaxScaler
from dask_ml.decomposition import IncrementalPCA
from dask.distributed import Client

import joblib

In [None]:
# Filter/survey constants shared by the cells below.
# central_wavelength and nb_fwhm are not defined in this notebook; presumably
# they come from the star import of my_functions -- TODO confirm.
w_central = central_wavelength()
# nb_fwhm is called with range(60); presumably one entry per filter index -- TODO confirm
nb_fwhm_Arr = nb_fwhm(range(60))
# Presumably the Lyman-alpha rest-frame wavelength in Angstrom -- TODO confirm units
w_lya = 1215.67

In [None]:
# The initial features are:
# - The fluxes of the first 55 NBs
# - The errors of the first 55 NBs
# - 4 BB fluxes
# - 4 BB errors
# - The estimated L
# - The estimated z
# TOTAL = 120 features
# (PCA to be applied below)
#
# NOTE(review): the log10 below is applied to the first 55 + 4 columns; that
# only covers "all fluxes" if the table stores NB and BB fluxes first -- confirm
# against the column order of the CSV.

# The data set is the nice_lya sample
NNdata = dd.read_csv(
    'MLmodels/datasets/dataset_mag0_2_000_000.csv',
    usecols=range(1, 121)
).to_dask_array(lengths=True)
NNdata_L_input = dd.read_csv(
    'MLmodels/datasets/tags_mag0_2_000_000.csv',
    usecols=[1]
).to_dask_array(lengths=True)

# Rechunk
chunksize = '200 MiB'
NNdata = da.rechunk(NNdata, chunks=chunksize)
NNdata_L_input = da.rechunk(NNdata_L_input, chunks=chunksize)

N_sources_NN = NNdata.shape[0]

# Not read again in this cell; kept so the notebook namespace is unchanged.
is_qso = da.ones(N_sources_NN, chunks=chunksize).astype(bool)

# Shuffle data
# NOTE(review): no seed is set, so the permutation (and therefore the
# train/test split below) differs between runs.
shuffle_idx = da.random.permutation(da.arange(N_sources_NN))
NNdata = da.rechunk(NNdata[shuffle_idx], chunks=chunksize)

# Labels follow the same permutation; missing labels are replaced by 0
NNdata_L_input = NNdata_L_input[shuffle_idx].reshape(-1,)
NNdata_L_input[da.isnan(NNdata_L_input)] = 0

# Class flag: first half of the (unshuffled) table is True, second half False,
# then permuted exactly like the features.
NNdata_is_qso = da.ones(N_sources_NN).astype(bool)
NNdata_is_qso[int(N_sources_NN / 2):] = False
NNdata_is_qso = NNdata_is_qso[shuffle_idx]

# Take logs
NNdata[:, :55 + 4] = da.log10(NNdata[:, :55 + 4])

# log10 of a negative value is NaN and log10(0) is -inf; map both to the
# lower sentinel and cap large values so the scaler only sees finite numbers.
NNdata[da.isnan(NNdata)] = -99.
NNdata[NNdata < -99.] = -99.
NNdata[NNdata > 99.] = 99.

# Rechunk
NNdata = da.rechunk(NNdata, chunks=chunksize)
NNdata_L_input = da.rechunk(NNdata_L_input, chunks=chunksize)

# Rescale data
client = Client(processes=False)

mms = MinMaxScaler()
with joblib.parallel_backend('dask'):
    mms.fit(NNdata)
NNdata = mms.transform(NNdata)
with open('MLmodels/RFmag0_QSO-SF_scaler.sav', 'wb') as file:
    pickle.dump(mms, file)

# Rechunk for the PCA: every chunk spans all columns, and the row chunk size
# is chosen automatically (one-row chunks would create one task per source).
NNdata = da.rechunk(NNdata, chunks={0: 'auto', 1: -1})

# Apply PCA
pca = IncrementalPCA(n_components=60, svd_solver='auto')

with joblib.parallel_backend('dask'):
    pca.fit(NNdata)
with open('MLmodels/RFmag0_QSO-SF_pca.sav', 'wb') as file:
    pickle.dump(pca, file)

NNdata = pca.transform(NNdata)

# Rechunk
NNdata = da.rechunk(NNdata, chunks=chunksize)

# Split dataset (no extra shuffling: the rows were already permuted above,
# so the test set is the trailing block of rows)
NNdata_train, NNdata_test, NNlabels_train, NNlabels_test =\
    train_test_split(NNdata, NNdata_L_input, test_size=0.2, shuffle=False)

NNlabels_train = NNlabels_train.reshape(-1,)
NNlabels_test = NNlabels_test.reshape(-1,)

N_train = len(NNlabels_train)
N_test = len(NNlabels_test)

In [None]:
# NOTE(review): this cell repeats the load / shuffle / scale steps of the
# previous cell with a different chunking for the PCA fit. It draws a new
# shuffle_idx and rebuilds NNdata_is_qso, so after running it those arrays no
# longer line up with any train/test split produced from an earlier shuffle.

# The data set is the nice_lya sample
NNdata = dd.read_csv(
    'MLmodels/datasets/dataset_mag0_2_000_000.csv',
    usecols=range(1, 121)
).to_dask_array(lengths=True)
NNdata_L_input = dd.read_csv(
    'MLmodels/datasets/tags_mag0_2_000_000.csv',
    usecols=[1]
).to_dask_array(lengths=True)

# Rechunk
chunksize = '200 MiB'
NNdata = da.rechunk(NNdata, chunks=chunksize)
NNdata_L_input = da.rechunk(NNdata_L_input, chunks=chunksize)

N_sources_NN = NNdata.shape[0]

# Not read again in this cell; kept so the notebook namespace is unchanged.
is_qso = da.ones(N_sources_NN, chunks=chunksize).astype(bool)

# Shuffle data (unseeded: the permutation differs between runs)
shuffle_idx = da.random.permutation(da.arange(N_sources_NN))
NNdata = da.rechunk(NNdata[shuffle_idx], chunks=chunksize)

# Labels follow the same permutation; missing labels are replaced by 0
NNdata_L_input = NNdata_L_input[shuffle_idx].reshape(-1,)
NNdata_L_input[da.isnan(NNdata_L_input)] = 0

# Class flag: first half True, second half False, permuted like the features
NNdata_is_qso = da.ones(N_sources_NN).astype(bool)
NNdata_is_qso[int(N_sources_NN / 2):] = False
NNdata_is_qso = NNdata_is_qso[shuffle_idx]

# Take logs
NNdata[:, :55 + 4] = da.log10(NNdata[:, :55 + 4])

# log10 of a negative value is NaN and log10(0) is -inf; map both to the
# lower sentinel and cap large values so the scaler only sees finite numbers.
NNdata[da.isnan(NNdata)] = -99.
NNdata[NNdata < -99.] = -99.
NNdata[NNdata > 99.] = 99.

# Rechunk
NNdata = da.rechunk(NNdata, chunks=chunksize)
NNdata_L_input = da.rechunk(NNdata_L_input, chunks=chunksize)

# Rescale data
client = Client(processes=False)

mms = MinMaxScaler()
with joblib.parallel_backend('dask'):
    mms.fit(NNdata)
NNdata = mms.transform(NNdata)
with open('MLmodels/RFmag0_QSO-SF_scaler.sav', 'wb') as file:
    pickle.dump(mms, file)

# Rechunk into (about) five row blocks, each spanning every column.
# Integer division keeps the chunk size an int; max() guards tiny inputs.
rows_per_chunk = max(1, N_sources_NN // 5)
NNdata = da.rechunk(NNdata, chunks=(rows_per_chunk, NNdata.shape[1]))

# Apply PCA
pca = IncrementalPCA(n_components=60, svd_solver='auto')

print('Lets fitttttt')
with joblib.parallel_backend('dask'):
    pca.fit(NNdata)

In [9]:
# Second PCA fit on the current NNdata, selecting svd_solver='full' rather
# than 'auto'. This rebinds `pca`, replacing any estimator created earlier.
pca = IncrementalPCA(svd_solver='full', n_components=60)

print('Lets fitttttt')

# Run the fit with joblib routed through the dask backend
with joblib.parallel_backend('dask'):
    pca.fit(NNdata)

In [None]:
# Create the parameter grid based on the results of random search 
# param_grid = {
#     'bootstrap': [False],
#     'max_depth': [100, 200, 300],
#     'max_features': [0.3],
#     'min_samples_leaf': [2],
#     'min_samples_split': [4],
#     'n_estimators': [150]
# }
# # Create a base model
# rf = RandomForestRegressor()
# # Instantiate the grid search model
# grid_search = GridSearchCV(
#     estimator=rf, param_grid=param_grid, 
#     cv=5, n_jobs=15, verbose=3, return_train_score=True
# )

# grid_search.fit(NNdata_train, NNlabels_train)

In [None]:
# Alias for the held-out labels; the plotting cells read it under this name.
L_lya_test = NNlabels_test

# Hyper-parameters of the random forest (hand-set; the grid-search result can
# be substituted through the commented line below).
best_params = dict(
    bootstrap=False,
    max_depth=100,
    max_features=0.3,
    min_samples_leaf=2,
    min_samples_split=4,
    n_estimators=200,
    verbose=True,
    n_jobs=-1,
)
# best_params = grid_search.best_params_

# Build the regressor, then pin the number of trees explicitly so it stays at
# 200 even when best_params is taken from somewhere else.
reg = RandomForestRegressor(**best_params).set_params(n_estimators=200)

# Train it, with joblib routed through the dask backend
with joblib.parallel_backend('dask'):
    reg.fit(NNdata_train, NNlabels_train)

In [None]:
# fig, ax = plt.subplots(figsize=(7, 6))

# ax.plot(grid_search.cv_results_['mean_train_score'], marker='s')
# ax.plot(grid_search.cv_results_['mean_test_score'], marker='s')

# plt.show()

In [None]:
# Predictions on the held-out set; also used later by plot_contours
L_Arr_pred = reg.predict(NNdata_test)

# Report the estimator's score on the test split first, then on the train split
test_score = reg.score(NNdata_test, NNlabels_test)
print(f'Test score = {test_score}')
train_score = reg.score(NNdata_train, NNlabels_train)
print(f'Train score = {train_score}')

# Cell output: the predicted values
L_Arr_pred

In [None]:
# Persist the trained regressor next to the scaler and PCA pickles.
# NOTE: only unpickle this file from a trusted location -- pickle executes code on load.
with open('MLmodels/RFmag0_QSO-SF_regressor.sav', mode='wb') as file:
    pickle.dump(reg, file)

In [None]:
def _sigma_contour_levels(Z, n_levels=10000):
    """Histogram heights whose super-level sets hold 99.7 %, 95.4 % and 68.3 % of the counts.

    Scans `n_levels` thresholds from max(Z) down to min(Z), records the
    fraction of all counts lying in bins strictly above each threshold, and
    interpolates the thresholds at the three target fractions.

    Returns the levels in increasing order ([3 sigma, 2 sigma, 1 sigma]), or
    None when the histogram is empty (no level can be defined).
    """
    total = np.sum(Z)
    if total == 0:
        return None

    # Thresholds in decreasing order, so the enclosed fraction is non-decreasing
    # as np.interp requires for its x-coordinates.
    heights = np.linspace(np.amin(Z), np.amax(Z), n_levels)[::-1]
    enclosed = np.array([np.sum(Z[Z > h]) for h in heights]) / total

    return [
        np.interp(0.997, enclosed, heights),  # 3 sigma
        np.interp(0.954, enclosed, heights),  # 2 sigma
        np.interp(0.683, enclosed, heights),  # 1 sigma
    ]


def _draw_population_contours(ax, selection, color):
    """Draw the 1/2/3 sigma contours of real vs. retrieved log L for one population.

    `selection` is a boolean mask applied to the module-level L_lya_test and
    L_Arr_pred arrays. Nothing is drawn when the selection is empty.
    """
    edges = np.linspace(41, 47, 30)
    Z, x, y = np.histogram2d(
        L_lya_test[selection].reshape(-1,), L_Arr_pred[selection],
        bins=(edges, edges)
    )

    levels = _sigma_contour_levels(Z)
    if levels is None:
        return

    x_centers = 0.5 * (x[1:] + x[:-1])
    y_centers = 0.5 * (y[1:] + y[:-1])

    ax.contour(x_centers, y_centers, Z.T, levels=levels, colors=color)


def plot_contours(is_qso, title='', nb_c=-3):
    """Plot retrieved vs. real log L contours for the two populations of the test set.

    Reads the module-level arrays L_lya_test (real values) and L_Arr_pred
    (predictions), and also w_central, w_lya, mag_to_flux, cosmo and u.

    Parameters
    ----------
    is_qso : array-like of bool
        Mask aligned with L_lya_test / L_Arr_pred. Entries that are True are
        drawn in colour 'C0', the complement in 'C1'. Anything convertible by
        np.asarray (e.g. a dask array) is accepted.
    title : str, optional
        Figure title; omitted when empty.
    nb_c : int, optional
        Index used to pick the entry of w_central and of the stacked
        detection-limit table for the horizontal "L limit" line.

    Side effects
    ------------
    Reads 'csv/5sigma_depths_NB.csv' and 'csv/5sigma_depths_BB.csv' and shows
    the figure with plt.show().
    """
    # Materialise the mask once so that indexing and `~` act on a NumPy bool array
    is_qso = np.asarray(is_qso, dtype=bool)

    fig, ax = plt.subplots(figsize=(7, 6))

    _draw_population_contours(ax, is_qso, 'C0')
    _draw_population_contours(ax, ~is_qso, 'C1')

    # 1:1 reference line
    one_to_one = np.linspace(40, 48, 100)
    ax.plot(one_to_one, one_to_one, linestyle='--', color='red', label='1:1')

    # Raw strings: '\l' is not a valid escape sequence in a normal literal
    ax.set_ylabel(r'Retrieved $\log L$', fontsize=15)
    ax.set_xlabel(r'Real $\log L$', fontsize=15)

    ax.set_ylim((41, 47))
    ax.set_xlim((41, 47))

    if len(title) > 0:
        ax.set_title(title, fontsize=20)

    # Detec lim

    # Second column of the NB table stacked on top of the BB table
    detec_lim = np.vstack(
        (
            pd.read_csv('csv/5sigma_depths_NB.csv', header=None),
            pd.read_csv('csv/5sigma_depths_BB.csv', header=None)
        )
    )[:, 1]

    # NOTE(review): the meaning of the factor 3 is not visible here -- confirm
    flambda_lim = mag_to_flux(detec_lim[nb_c], w_central[nb_c]) * 3

    ew0_lim = 20 # A
    z = w_central[nb_c] / w_lya - 1
    Fline_lim = ew0_lim * flambda_lim * (1 + z)
    dL = cosmo.luminosity_distance(z).to(u.cm).value
    L_lim = np.log10(Fline_lim * 4*np.pi * dL**2)

    ax.axhline(L_lim, ls='--', color='green', label='L limit')

    ax.legend(fontsize=15)
    plt.show()

In [None]:
# Materialise the held-out labels as a NumPy array for plot_contours
L_lya_test = np.array(L_lya_test)

# The split was made with shuffle=False, so the test rows are the N_test rows
# that follow the N_train training rows. Slice the class mask with the actual
# split sizes instead of re-deriving "one fifth" of the sample, which can be
# off by one when the sample size is not a multiple of 5.
plot_contours(NNdata_is_qso[N_train:N_train + N_test])

In [None]:
from sklearn.model_selection import learning_curve

In [None]:
# Set the regressor's verbosity to 1 before the repeated fits
reg.set_params(verbose=1)

# `a` holds the tuple returned by learning_curve; the plotting cell reads
# a[0], a[1] and a[2] from it.
a = learning_curve(estimator=reg, X=NNdata, y=NNdata_L_input)

In [None]:
# Learning curve: mean score over the cross-validation folds for each
# training-set size returned by learning_curve.
fig, ax = plt.subplots(figsize=(7, 6))

# a[0]: training-set sizes (absolute sample counts)
# a[1], a[2]: train / test scores, one row per size and one column per CV fold
x_ticks = a[0]

# Average over the fold axis. Dividing the row sums by len(x_ticks) was only
# correct while the number of sizes happened to equal the number of folds.
ax.plot(x_ticks, a[1].mean(axis=1), marker='s', label='Training score')
ax.plot(x_ticks, a[2].mean(axis=1), marker='s', label='Test score')

ax.legend(fontsize=15)

ax.set_ylabel('Score', fontsize=15)
# The x values are sample counts, not fractions of the training set
ax.set_xlabel('Number of training samples', fontsize=15)

plt.show()