<img align="left" src = figs/logos/logo-IJCLab_v1.png height=40, style="padding: 10px"> 
<b>PhotoZ estimation with scikit learn Machine learning </b> <br>
Last verified to run on 2022-11-18 with LSST Science Pipelines release w_2022_40 <br>
Contact authors: Sylvie Dagoret-Campagne (DP0 Delegate) <br>
Target audience: DP0 delegates member <br>

**Credit:** Originally developed by Sylvie Dagoret-Campagne in the framework provided by Rubin DP0.1 (reference DP0.1 tutorials)

Acknowledgement: to the Rubin Engagement team ..;

# Learning Objectives : Compare PhotoZ estimators performances using simple Machine Learning algorithms from scikit learn.

Three typical regressors in scikit learn are evaluated and compared together. 
No optimisation is performed in this notebook. This will be the subject of another complementary notebook.


**Note:** : 
-All plots are made with Holoview.
- **Better select the maximum of CPU (4 CPU on RSP)**

### Imports

In [1]:
# Import general python packages
import numpy as np
import re
import pandas as pd
import pickle
from pandas.testing import assert_frame_equal
import os
import errno
import shutil
import getpass
import datetime
# Import the Rubin TAP service utilities
from lsst.rsp import get_tap_service, retrieve_query

# LSST Science Pipelines (Stack) packages
import lsst.daf.butler as dafButler
import lsst.afw.display as afwDisplay
import lsst.geom as geom
import lsst.afw.coord as afwCoord
#afwDisplay.setDefaultBackend('matplotlib')

#
from lsst import skymap

# Astropy
from astropy import units as u
from astropy.table import Table
from astropy.coordinates import SkyCoord
from astropy.units.quantity import Quantity
from astropy.visualization import (MinMaxInterval, SqrtStretch,ZScaleInterval,PercentileInterval,
                                   ImageNormalize,imshow_norm)
from astropy.visualization.stretch import SinhStretch, LinearStretch,AsinhStretch,LogStretch


# Bokeh for interactive visualization
import bokeh
from bokeh.io import output_file, output_notebook, show
from bokeh.layouts import gridplot
from bokeh.models import ColumnDataSource, CDSView, GroupFilter, HoverTool
from bokeh.plotting import figure
from bokeh.transform import factor_cmap

import holoviews as hv
from holoviews import streams, opts
from holoviews.operation.datashader import rasterize
from holoviews.operation.datashader import datashade, dynspread
from holoviews.plotting.util import process_cmap

import datashader as dsh


# Set the maximum number of rows to display from pandas
pd.set_option('display.max_rows', 20)


# Set the holoviews plotting library to be bokeh
# You will see the holoviews + bokeh icons displayed when the library is loaded successfully
hv.extension('bokeh')
#hv.extension('bokeh', 'matplotlib')
#hv.extension('matplotlib')

# Display bokeh plots inline in the notebook
output_notebook()

In [2]:
# What versions of bokeh and holoviews nd datashader are we working with?
# This is important when referring to online documentation as
# APIs can change between versions.
print("Bokeh version: " + bokeh.__version__)
print("Holoviews version: " + hv.__version__)
print("Datashader version: " + dsh.__version__)

Bokeh version: 3.3.3
Holoviews version: 1.18.1
Datashader version: 0.16.0


In [3]:
#  What version of the Stack are we using?
! echo $IMAGE_DESCRIPTION
! eups list -s | grep lsst_distrib

Weekly 2024_04
lsst_distrib          g4213664e8e+b08e1c1b0b 	current w_2024_04 setup


In [4]:
# allow for matplotlib to create inline plots in our notebook
import matplotlib.pyplot as plt      # imports matplotlib.pyplot as plt
from matplotlib.colors import Normalize

import warnings                      # imports the warnings library
import gc                            # imports python's garbage collector

# Ignore warnings
from astropy.units import UnitsWarning
warnings.simplefilter("ignore", category=UnitsWarning)

In [5]:
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

In [6]:
warnings.filterwarnings("ignore")

In [7]:
# Set up some plotting defaults:

params = {'axes.labelsize': 28,
          'font.size': 24,
          'legend.fontsize': 14,
          'xtick.major.width': 3,
          'xtick.minor.width': 2,
          'xtick.major.size': 12,
          'xtick.minor.size': 6,
          'xtick.direction': 'in',
          'xtick.top': True,
          'lines.linewidth': 3,
          'axes.linewidth': 3,
          'axes.labelweight': 3,
          'axes.titleweight': 3,
          'ytick.major.width': 3,
          'ytick.minor.width': 2,
          'ytick.major.size': 12,
          'ytick.minor.size': 6,
          'ytick.direction': 'in',
          'ytick.right': True,
          'figure.figsize': [18, 10],
          'figure.facecolor': 'White'
          }

plt.rcParams.update(params)

## Configurations and initialisation

### Holoview Configuration

In [8]:
HV_CURVE_SINGLE_WIDTH  = 400
HV_CURVE_SINGLE_HEIGHT = 350
HV_CURVE_MULTI_WIDTH  = 300
HV_CURVE_MULTI_HEIGHT = 300
HV_CURVE_MULTI_FRAME_WIDTH = 300
HV_CURVE_MULTI_COLS   = 3

In [9]:
NBINS_HISTO = 50

In [10]:
HV_HISTO_SINGLE_WIDTH  = 600
HV_HISTO_SINGLE_HEIGHT = 600
HV_HISTO_MULTI_WIDTH  = 300
HV_HISTO_MULTI_HEIGHT = 300
HV_HISTO_MULTI_FRAME_WIDTH = 300
HV_HISTO_MULTI_COLS   = 3

In [11]:
HV_IMAGE_SINGLE_WIDTH  = 400
HV_IMAGE_SINGLE_HEIGHT = 400
HV_IMAGE_SINGLE_FRAME_WIDTH = 600
HV_IMAGE_MULTI_WIDTH  = 400
HV_IMAGE_MULTI_HEIGHT = 400
HV_IMAGE_MULTI_FRAME_WIDTH = 300
HV_IMAGE_MULTI_COLS   = 3

## Notebook Configuration

#### Setup paths

In [12]:
# username
myusername=getpass.getuser()

In [13]:
# temporary folders if necessary
NBDIR       = 'photoz_dp02'                            # relative path for this notebook output
TMPTOPDIR   = "/scratch"                               # always write some output in /scratch, never in user HOME 
TMPUSERDIR  = os.path.join(TMPTOPDIR,myusername)       # defines the path of user outputs in /scratch 
TMPNBDIR    = os.path.join(TMPUSERDIR,NBDIR)           # output path for this particular notebook
FLAG_READ_DATAFRAMEFROMDISK = True                     # must read data from disk

In [14]:
filename_result=f'fluxesredshift_result.pkl'
fullfilename_result=os.path.join(TMPNBDIR,filename_result)

### Selection flags

- put boolean flags here to avoid execution of some sections

In [15]:
# Show plots on redshift distribution
FLAG_SHOW_TRUE_REDSHIFT_DISTRIB = True

In [16]:
# Show plots to check the photometry selected for photoz
# For a pure demo on photoZ, this section can be skipped
FLAG_SHOW_PHOTOMETRY_DETECTION = True

# START HERE

## Read input data

In [17]:
if FLAG_READ_DATAFRAMEFROMDISK and os.path.exists(fullfilename_result):
    sql_results = pd.read_pickle(fullfilename_result)

In [18]:
data = sql_results

In [19]:
data.head()

Unnamed: 0,mt_id_truth_type,mt_match_objectId,ts_ra,ts_dec,ts_truth_type,ts_mag_r,ts_is_pointsource,ts_redshift,ts_flux_u,ts_flux_g,...,obj_r_cModelFlux,obj_i_cModelFlux,obj_z_cModelFlux,obj_y_cModelFlux,obj_u_cModelMag,obj_g_cModelMag,obj_r_cModelMag,obj_i_cModelMag,obj_z_cModelMag,obj_y_cModelMag
0,7939458371_1,1651413688361450903,61.842387,-36.524608,1,23.649401,0,0.951227,901.932007,893.60498,...,2062.920998,3226.156045,5167.70294,5953.202104,23.544281,23.413564,23.113794,22.628287,22.116756,21.963123
1,7940601535_1,1651413688361451151,61.887583,-36.523576,1,23.4786,0,1.14267,1177.569946,1248.369995,...,1555.517213,2007.930923,3577.964091,4435.082726,23.65467,23.55367,23.420313,23.143128,22.51591,22.282746
2,7938042672_1,1651413688361451083,61.949615,-36.527649,1,23.2017,0,0.576006,573.070984,878.140015,...,1943.691304,3379.122989,3879.091103,4958.231052,24.590838,24.023397,23.178432,22.57799,22.428175,22.161683
3,7937797456_1,1651413688361451150,61.887049,-36.525188,1,22.4175,0,0.442611,399.596985,1157.119995,...,4398.087847,6030.980909,6976.460761,7867.026482,25.030332,23.661947,22.29184,21.94903,21.790912,21.660473
4,7944880657_1,1651413688361451062,61.924857,-36.527904,1,24.007401,0,1.78489,648.911987,738.971008,...,752.674832,937.97664,1608.215672,2100.901486,24.924957,24.45548,24.208482,23.96952,23.384139,23.093986


In [20]:
del data['mt_id_truth_type']
del data['mt_match_objectId']
del data['ts_ra']
del data['ts_dec']
del data['ts_truth_type']
del data['ts_mag_r']
del data['ts_is_pointsource']
del data['ts_flux_u']
del data['ts_flux_g']
del data['ts_flux_r']
del data['ts_flux_i']
del data['ts_flux_z']
del data['ts_flux_y']
del data['obj_coord_ra'] 	
del data['obj_coord_dec'] 	
del data['obj_u_cModelFlux'] 	
del data['obj_g_cModelFlux'] 	
del data['obj_r_cModelFlux'] 	
del data['obj_i_cModelFlux'] 	
del data['obj_z_cModelFlux']	
del data['obj_y_cModelFlux']

In [21]:
# for shorter names
data.rename(columns={"obj_u_cModelMag": "mag_u", "obj_g_cModelMag": "mag_g","obj_r_cModelMag": "mag_r",
                     "obj_i_cModelMag": "mag_i", "obj_z_cModelMag": "mag_z","obj_y_cModelMag": "mag_y",
                     "ts_redshift":"redshift"
                    },inplace=True)

In [22]:
data

Unnamed: 0,redshift,mag_u,mag_g,mag_r,mag_i,mag_z,mag_y
0,0.951227,23.544281,23.413564,23.113794,22.628287,22.116756,21.963123
1,1.142670,23.654670,23.553670,23.420313,23.143128,22.515910,22.282746
2,0.576006,24.590838,24.023397,23.178432,22.577990,22.428175,22.161683
3,0.442611,25.030332,23.661947,22.291840,21.949030,21.790912,21.660473
4,1.784890,24.924957,24.455480,24.208482,23.969520,23.384139,23.093986
...,...,...,...,...,...,...,...
56173,1.181640,24.478454,24.677930,24.415583,24.230565,23.764589,23.400803
56174,0.701075,25.402165,24.514681,23.352261,22.587281,22.316529,22.082402
56175,0.714574,23.760397,23.768756,23.193998,22.345509,21.853552,21.621882
56176,0.824669,24.096307,23.557741,22.409664,21.378226,20.745974,20.503063


In [23]:
# drop NA
data = data.dropna()

In [24]:
len(data)

55564

### add color

In [25]:
pd.options.mode.chained_assignment = None  # default='warn'
data["umg"]=data["mag_u"]- data["mag_g"]
data["gmr"]=data["mag_g"]- data["mag_r"]
data["rmi"]=data["mag_r"]- data["mag_i"]
data["imz"]=data["mag_i"]- data["mag_z"]
data["zmy"]=data["mag_z"]- data["mag_y"]

# Check input data

## Redshifts distribution

In [26]:
(count, z_bin) = np.histogram(data.redshift, bins=NBINS_HISTO)
z_distribution = hv.Histogram((z_bin, count)).opts(title=f"redshift distribution",color='darkblue', xlabel='redshift', fontscale=1.2,height=HV_HISTO_SINGLE_HEIGHT, width=HV_HISTO_SINGLE_WIDTH,tools=['hover'])
z_distribution

# Machine Learning for Photo-Z estimation

## Requirements from LSST Science book:
(https://www.lsst.org/sites/default/files/docs/sciencebook/SB_3.pdf)

Photometric redshifts for LSST will be applied and calibrated over the redshift range $0 < z < 4$
for galaxies to $r  \simeq 27.5$. 
For the majority of science cases, such as weak lensing and BAO, a subset
of galaxies with $i < 25.3$ will be used. For this high S/N gold standard subset over the
redshift interval, $0 < z < 3$, the photometric redshift requirements are:

- The root-mean-square scatter in photometric redshifts, $ \sigma_z/(1+z)$, must be smaller than 0.05, with a goal of 0.02.
- The fraction of $3\sigma $  outliers at all redshifts must be below 10%.
- The bias in $e_z = (z_{photo}−z_{spec})/(1+z_{spec})$ 
must be below 0.003 (or 0.01 for combined,analyses of weak lensing and baryon acoustic oscillations); 
- The uncertainty in  $\sigma_z/(1+z)$ must also be known to similar accuracy.



### other definitions

- **the photo-z accuracy is the absolute value of the difference between the true and photometric redshifts**.

-  **the photo-z uncertainty is the standard deviation of the true redshifts** 

## Utility functions


- from DE School IV, University of Oxford, July 18, 2016 :  **Jeff Newman - photometric redshifts for LSST**

The tools for PhotoZ evaluation are givenin the notebook and also described in the LSST science book (Performance Chapter) 

### Performances Evaluation lines

In [27]:
#A function that we will call a lot: makes the zphot/zspec plot and calculates key statistics
def plot_lines(zmin=0,zmax=3,zstep=0.05,slope=0.15):
    
    x = np.arange(zmin,zmax,zstep)
    outlier_upper = x + slope*(1+x)
    outlier_lower = x - slope*(1+x)

    curv_bisect=hv.Curve(zip(x,x)).opts(color="red") 
    curv_outupper=hv.Curve(zip(x,outlier_upper)).opts(color="red",line_dash='dashed') 
    curv_outlower=hv.Curve(zip(x,outlier_lower)).opts(color="red",line_dash='dashed') 
    
    layout = curv_bisect * curv_outupper * curv_outlower
    return layout
    
    

In [28]:
plot_lines()

### Statistic lines

In [29]:
def get_stats(z_spec,z_phot,slope=0.15):
    """
    input : 
       - z_spec : spectroscopic redshift or true redshift
       - z_phot : photo-z reedshift
       - slope : slope of line defining the outliers  3 x sigma_z with sigma_z = 5%, so slope = 3 x 0.05 = 0.15 
    """
    
    mask = np.abs((z_phot - z_spec)/(1 + z_spec)) > slope
    notmask = ~mask 
    
    # Standard Deviation of the predicted redshifts compared to the data:
    #-----------------------------------------------------------------
    std_result = np.std((z_phot - z_spec)/(1 + z_spec), ddof=1)
    print('Standard Deviation: %6.4f' % std_result)
    

    # Normalized MAD (Median Absolute Deviation):
    #------------------------------------------
    nmad = 1.48 * np.median(np.abs((z_phot - z_spec)/(1 + z_spec)))
    print('Normalized MAD: %6.4f' % nmad)

    # Percentage of delta-z > 0.15(1+z) outliers:
    #-------------------------------------------
    eta = np.sum(np.abs((z_phot - z_spec)/(1 + z_spec)) > 0.15)/len(z_spec)
    print('Delta z >0.15(1+z) outliers: %6.3f percent' % (100.*eta))
    
    # Median offset (normalized by (1+z); i.e., bias:
    #-----------------------------------------------
    bias = np.median(((z_phot - z_spec)/(1 + z_spec)))
    sigbias=std_result/np.sqrt(0.64*len(z_phot))
    print('Median offset: %6.3f +/- %6.3f' % (bias,sigbias))
    
    
     # overlay statistics with titles left-aligned and numbers right-aligned
    stats_txt = '\n'.join([
        'NMAD  = {:0.2f}'.format(nmad),
        'STDEV = {:0.2f}'.format(std_result),
        'BIAS  = {:0.2f}'.format(bias),
        'ETA   = {:0.2f}'.format(eta)
    ])
    
    
    return nmad,std_result,bias,eta,stats_txt
    

## START ML here

### Prepare Features and Target

Because we want to estimate the performance of photoz estimator itself, not the total performance including intrinsic redshift fluctuations. Thus only average magnitudes data will be used : detected magnitude are dropped. 

In [30]:
target = data["redshift"]

In [31]:
features = data[["mag_u","mag_g","mag_r","mag_i","mag_z","mag_y"]]

- Total number of samples to split in training, validation and test dataset

In [32]:
Ntot = len(target)
Ntot

55564

### Split in training / test set

- speed of the notebook must be tuned with the training sample size

#### number of samples to be used in training

- depending on the required speed of the demo 

In [33]:
Ntrain = 10000
Ntest = Ntot-Ntrain

In [34]:
from sklearn.model_selection import train_test_split

In [35]:
# Test fraction
test_sample_size_fraction=Ntest/Ntot
test_sample_size_fraction

0.820027355841912

In [36]:
# adapt the train dataset size according required running time 
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=test_sample_size_fraction, random_state=0)

In [37]:
X_train.shape

(10000, 6)

In [38]:
X_test.shape

(45564, 6)

** Note**
- because the model fit (training) may be long, we should limit the training dataset size for this demo.

## Regressors definitions

### Regularized Linear model

- Instead of using the LinearRegressor, we start by using the regularized Ridge regressor with the alpha parameter setting the regularization.
- Linear model features should be always normalized,
- For non linearities, we include the possibility to develop the model as a polynomial of features

Scikit-Learn offer to define pipelines of tasks in an easy way:
- PolynomialFeatures() task extend features dataset in powers of thos features up to a power degree,
- StandardScaler() preprocess the features to normalize them,
- Ridge is the regularized version of the LinearRegressor

In [39]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge

ridge_regressor = make_pipeline(PolynomialFeatures(degree=4), StandardScaler(),Ridge(alpha=0.001))

### RandomForest regressor

- RandomForest regressor is an Enssemble regressor of type Bagging (Bootstrap and Aggregation).  

- RandomForest regressor combines the regression of multiple decision tree regressors fitted in parallel 
on bootstrapped samples from the training sample.

- Each individual decision tree is deep, meaning they individually overfit the bootstrapped samples. 

- The aggregation of the overfitting parallel decision tree model reduce the over-fitting.
- For Random Forest, each Decision Tree node feature are drawn randomly. This reduce the error correlation of the various trees.
- From this caracteristics, RandomForest is expected to be one of the best non-linear regressor on column-tabulated datasets.


- RandomForst includes a number of hyper-parameters.
- We use the hyper-parameters chosen byJeff Newmann for the DE-School at Oxford 2016.

In [64]:
from sklearn.ensemble import RandomForestRegressor
#randomforest_regressor = RandomForestRegressor(n_estimators = 50, max_depth = 30, max_features = 'auto')
randomforest_regressor = RandomForestRegressor(n_estimators = 50, max_depth = 30)

### Gradient Boosting Regressor

The INRIA MOOC (2022) on scikit-Learn ( Machine learning in Python with scikit-learn: https://lms.fun-mooc.fr/courses/course-v1:inria+41026+session02/info)
recommend the histogram-binned version of GradientBoostingRegressor, expected to have a good balance between underfitting and overfitting.

- Boosting Regressor perform shallow Decision trees (underfitting) fit sequencially. The subsequent Decision Tree fitting improve the fit quality.
Among the set of bossting regressors, Gradient Boosting Regressor is expected to decrease the bias but avoid overfitting. Among them the Histogram Gradient Boosting regressor is expected to run faster 

- To increase the fitting time, we allow here early stopping

In [41]:
from sklearn.ensemble import HistGradientBoostingRegressor
#from sklearn.preprocessing import KBinsDiscretizer
#discretizer = KBinsDiscretizer(n_bins=64, encode="ordinal", strategy="quantile")
#histogram_gradient_boosting_regressor = make_pipeline(discretizer, HistGradientBoostingRegressor(max_iter=30))
histogram_gradient_boosting_regressor =  HistGradientBoostingRegressor(max_iter=1_000,early_stopping=True)

## Evaluation metrics

In [42]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

from sklearn.metrics import make_scorer
scoring = {'r2': make_scorer(r2_score),'mae': make_scorer(mean_absolute_error),'mse': make_scorer(mean_squared_error)}

## Definition of the $R^2$ score for a regression (not a classification)

- see https://en.wikipedia.org/wiki/Coefficient_of_determination

- The R^2 score is also called the coefficient of determination

$$
R^2 = 1 - \frac{SSR}{SST} = \frac{SSE}{SST}
$$

where:

- $SST = \sum_i (y_i - \overline{y})^2$ , for total sum of squares,
- $SSR = \sum_i (y_i - \hat{y}_i)^2$ , for the residual sum of squares,
- $SSE = SST - SSR $, is the variance explained


The fit is good when $SSR \simeq 0$, thus when $R^2$ score is 1

- another interpretation is $R^2 = corr(y_i, \hat{y}_i)^2$

## Use of cross-validation

- We use cross-validation to select a sub-sample of galaxies from the complete training sample and train the model with this subset.
- This sub-sampling is repeated several times (n_split=5).

The interest of this multi-subsampling is to have a set of almost similar but slightly different fitted models from which we can derive several predictions for a test sample, thus an average predicted value and its variation (or a PDF)

In [43]:
from sklearn.model_selection import cross_validate

In [44]:
from sklearn.model_selection import ShuffleSplit

In [45]:
# We don't know of the galaxies are ordered by redshift or come randomly. Thus we activate a pre-random-shuffling in the training dataset. 
cv = ShuffleSplit(n_splits=5, test_size=.80, random_state=0)

## Ridge model

- The cross_validate function performs fit on n_splits models from n_splits random subsamples
- The smaple is previously randomized
- The evaluation metric is given by the scoring (The INRIA MOOC use this coring="neg_mean_absolute_error"),
- The n_splits fitted models are retuned (to be able to make n_split prediction for a single test sample)

### Cross validation

In [46]:
%%time
t1 = datetime.datetime.now()
cv_results = cross_validate(ridge_regressor,X_train,y_train,cv=cv,scoring=scoring,return_estimator=True)
t2 = datetime.datetime.now()
deltat = (t2-t1).total_seconds() 
print(f"Ridge CV : elapsed time {deltat:.2f} sec")

Ridge CV : elapsed time 0.26 sec
CPU times: user 559 ms, sys: 372 ms, total: 931 ms
Wall time: 264 ms


In [47]:
df_cv_results = pd.DataFrame(cv_results)
df_cv_results

Unnamed: 0,fit_time,score_time,estimator,test_r2,test_mae,test_mse
0,0.027426,0.029396,"(PolynomialFeatures(degree=4), StandardScaler(...",0.620792,0.178575,0.103651
1,0.018495,0.027638,"(PolynomialFeatures(degree=4), StandardScaler(...",0.71192,0.17436,0.080297
2,0.018107,0.03924,"(PolynomialFeatures(degree=4), StandardScaler(...",0.690834,0.176396,0.085686
3,0.017594,0.028412,"(PolynomialFeatures(degree=4), StandardScaler(...",0.577838,0.178375,0.116163
4,0.018917,0.032344,"(PolynomialFeatures(degree=4), StandardScaler(...",0.685932,0.174262,0.08711


### estimation

In [48]:
# Choose one of the n_splits model, but all predictions for all estimators could be calculated (average and rms)
y_pred = cv_results["estimator"][0].predict(X_test)

### Performances

In [49]:
coords = zip(y_test,y_pred)
points = hv.Points(coords).opts(tools=['box_select', 'lasso_select'])

In [50]:
nmad,std_result,bias,eta,stats_txt1 = get_stats(y_test.values,y_pred)

Standard Deviation: 0.1653
Normalized MAD: 0.0776
Delta z >0.15(1+z) outliers: 18.721 percent
Median offset: -0.001 +/-  0.001


In [51]:
# Create a holoviews object to hold and plot data
# Create the linked streams instance
boundsxy = (0, 0, 0, 0)
box = streams.BoundsXY(source=points, bounds=boundsxy)
bounds = hv.DynamicMap(lambda bounds: hv.Bounds(bounds), streams=[box])

# Apply the datashader
p1 = dynspread(datashade(points, cmap="Viridis"))
p1 = p1.opts(width=HV_HISTO_SINGLE_WIDTH, height=HV_HISTO_SINGLE_HEIGHT,
    padding=0.05, show_grid=True,
    xlim=(0, 3), ylim=(0, 3.0),
    xlabel="z-spec", ylabel="z-phot",title="Ridge Regressor")

In [52]:
p1 * plot_lines() * hv.Text(0.5, 2.5, stats_txt1)

### performance metrics

In [53]:
msg_r2   = f"R2 score : \t\t {df_cv_results['test_r2'].mean():.3f} +/-  {df_cv_results['test_r2'].std():.3f}"
msg_mae  = f"MAE mean absolute error : \t {df_cv_results['test_mae'].mean():.3f} +/-  {df_cv_results['test_mae'].std():.3f}"
msg_rmsq = f"Root MSE error : \t\t {np.sqrt(df_cv_results['test_mse'].mean()):.3f} +/-  {np.sqrt(df_cv_results['test_mse'].std()):.3f}"

In [54]:
print(msg_r2)
print(msg_mae)
print(msg_rmsq)

R2 score : 		 0.657 +/-  0.056
MAE mean absolute error : 	 0.176 +/-  0.002
Root MSE error : 		 0.308 +/-  0.122


## Random Forest

- take the hyper-parameter of the DE school

### Select smaller training sample

- faster model fit

In [55]:
Ntrain = 8000
Ntest = Ntot-Ntrain
test_sample_size_fraction=Ntest/Ntot
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=test_sample_size_fraction, random_state=0)

In [59]:
X_train

Unnamed: 0,mag_u,mag_g,mag_r,mag_i,mag_z,mag_y
20846,24.238046,24.191747,23.719659,23.018097,22.750064,22.705056
48397,25.199966,24.729087,23.939738,23.116417,22.403343,22.135842
24201,25.128702,24.340519,23.854733,23.537867,23.448133,23.305779
21579,25.236629,24.764087,24.060958,23.643156,23.366110,23.211667
9646,24.938153,24.940533,24.700087,24.712424,23.781548,23.552881
...,...,...,...,...,...,...
46412,23.453401,23.625122,23.312155,22.745038,22.341317,22.145967
52998,23.549713,23.722823,23.820888,23.871670,23.465885,23.261725
43089,24.070401,23.968724,23.658184,23.012074,22.361847,22.446018
44052,25.686378,24.859971,23.672944,22.636371,22.320999,21.974828


In [61]:
y_train

20846    0.787099
48397    1.003950
24201    0.179241
21579    0.421729
9646     1.022580
           ...   
46412    0.971580
52998    1.193520
43089    0.960035
44052    0.734073
2766     0.542115
Name: redshift, Length: 8000, dtype: float32

### training

In [56]:
#cv_results = cross_validate(randomforest_regressor ,X_train,y_train,cv=cv,scoring=scoring,return_estimator=True)

In [65]:
%%time
# We simply use the fit method, not the cross_validate to accelerate the demo 
t1 = datetime.datetime.now()
randomforest_regressor.fit(X_train,y_train)
t2 = datetime.datetime.now()
deltat = (t2-t1).total_seconds() 
print(f"RandomForest : elapsed time {deltat:.2f} sec")

RandomForest : elapsed time 2.69 sec
CPU times: user 2.69 s, sys: 1.25 ms, total: 2.69 s
Wall time: 2.69 s


### Estimate

In [66]:
y_pred =  randomforest_regressor.predict(X_test)

### Performances

In [67]:
nmad,std_result,bias,eta,stats_txt2= get_stats(y_test.values,y_pred)

Standard Deviation: 0.1057
Normalized MAD: 0.0447
Delta z >0.15(1+z) outliers:  7.356 percent
Median offset:  0.003 +/-  0.001


In [68]:
coords = zip(y_test,y_pred)
points = hv.Points(coords).opts(tools=['box_select', 'lasso_select'])

In [69]:
boundsxy = (0, 0, 0, 0)
box = streams.BoundsXY(source=points, bounds=boundsxy)
bounds = hv.DynamicMap(lambda bounds: hv.Bounds(bounds), streams=[box])

# Apply the datashader
p2 = dynspread(datashade(points, cmap="Viridis"))
p2 = p2.opts(width=HV_HISTO_SINGLE_WIDTH, height=HV_HISTO_SINGLE_HEIGHT,
    padding=0.05, show_grid=True,
    xlim=(0, 3), ylim=(0, 3.0),
    xlabel="z-spec", ylabel="z-phot",title="Random Forest Regressor")

In [70]:
p2 * plot_lines() *  hv.Text(0.5, 2.5, stats_txt2)

#### Performance metrics in scikit learn

In [None]:
r2  = r2_score(y_pred,y_test)
mae = mean_absolute_error(y_pred,y_test)
mse = mean_squared_error(y_pred,y_test)

In [None]:
msg_r2   = f"R2 score : \t\t {r2:.3f}"
msg_mae  = f"MAE mean absolute error : \t {mae:.3f}"
msg_rmsq = f"Root MSE error : \t\t {np.sqrt(mse):.3f}"

In [None]:
print(msg_r2)
print(msg_mae)
print(msg_rmsq)

## Histogram Gradient Boosting regressor

- No particular optimisation of hyper parameters performed here 

### Select smaller training sample

- faster model fit

In [None]:
Ntrain = 10000
Ntest = Ntot-Ntrain
test_sample_size_fraction=Ntest/Ntot
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=test_sample_size_fraction, random_state=0)

### training (model fit)

In [None]:
%%time
# We simply use the fit method, not the cross_validate to accelerate the demo 
t1 = datetime.datetime.now()
histogram_gradient_boosting_regressor.fit(X_train,y_train) 
t2 = datetime.datetime.now()
deltat = (t2-t1).total_seconds() 
print(f"Histogram Gradient Boosting : elapsed time {deltat:.2f} sec")

### Estimation

In [None]:
y_pred =  histogram_gradient_boosting_regressor.predict(X_test)

### Performances

In [None]:
coords = zip(y_test,y_pred)
points = hv.Points(coords).opts(tools=['box_select', 'lasso_select'])

In [None]:
nmad,std_result,bias,eta,stats_txt3 = get_stats(y_test.values,y_pred)

In [None]:
boundsxy = (0, 0, 0, 0)
box = streams.BoundsXY(source=points, bounds=boundsxy)
bounds = hv.DynamicMap(lambda bounds: hv.Bounds(bounds), streams=[box])

# Apply the datashader
p3 = dynspread(datashade(points, cmap="Viridis"))
p3 = p3.opts(width=HV_HISTO_SINGLE_WIDTH, height=HV_HISTO_SINGLE_HEIGHT,
    padding=0.05, show_grid=True,
    xlim=(0, 3), ylim=(0, 3.0),
    xlabel="z-spec", ylabel="z-phot",title="Histogram Gradient Boosting Regressor")

In [None]:
p3 * plot_lines() *  hv.Text(0.5, 2.5, stats_txt3)

#### Performance metrics in scikit learn

In [None]:
r2  = r2_score(y_pred,y_test)
mae = mean_absolute_error(y_pred,y_test)
mse = mean_squared_error(y_pred,y_test)

In [None]:
msg_r2   = f"R2 score : \t\t {r2:.3f}"
msg_mae  = f"MAE mean absolute error : \t {mae:.3f}"
msg_rmsq = f"Root MSE error : \t\t {np.sqrt(mse):.3f}"

In [None]:
print(msg_r2)
print(msg_mae)
print(msg_rmsq)