# Periodograms of atmospheric-parameter time series from MERRA-2

- author Sylvie Dagoret-Campagne
- affiliation : IJCLab
- creation date 2025-10-24 :
- last update : 2025-02-24
- Kernel @usdf **w_2024_50**
- Office emac : mamba_py311
- Home emac : base (conda)
- laptop : conda_py311

## Doc on periodograms
https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.periodogram.html

In [None]:
from platform import python_version
print(python_version())

In [None]:
import warnings
warnings.resetwarnings()
warnings.simplefilter('ignore')

In [None]:
from platform import python_version
print(python_version())

In [None]:
import os

In [None]:
# Directory where the figures are stored (created if it does not exist yet)
pathfigs = "figsPeriodogramsAtmosphereFomMerra2"
# exist_ok=True replaces the exists()/makedirs() pair and removes its
# check-then-create race
os.makedirs(pathfigs, exist_ok=True)
# Extension appended to the name of each saved figure
figtype = ".pdf"

In [None]:
import numpy as np
from numpy.linalg import inv
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
from mpl_toolkits.axes_grid1 import make_axes_locatable
from matplotlib.colors import LogNorm,SymLogNorm
from matplotlib.patches import Circle,Annulus
from astropy.visualization import ZScaleInterval
props = dict(boxstyle='round', facecolor="white", alpha=0.1)
#props = dict(boxstyle='round')

import matplotlib.colors as colors
import matplotlib.cm as cmx

import matplotlib.ticker                         # here's where the formatter is
from matplotlib.ticker import (MultipleLocator, FormatStrFormatter,
                               AutoMinorLocator)

from matplotlib.gridspec import GridSpec

from astropy.visualization import (MinMaxInterval, SqrtStretch,ZScaleInterval,PercentileInterval,
                                   ImageNormalize,imshow_norm)
from astropy.visualization.stretch import SinhStretch, LinearStretch,AsinhStretch,LogStretch

from astropy.io import fits
from astropy.wcs import WCS
from astropy import units as u
from astropy import constants as c

from astropy.coordinates.earth import EarthLocation
from datetime import datetime
from pytz import timezone

from scipy import interpolate
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KDTree, BallTree
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import (RBF, ExpSineSquared,
                                              RationalQuadratic, WhiteKernel)

import pandas as pd
pd.set_option("display.max_columns", None)
pd.set_option('display.max_rows', 100)

import matplotlib.ticker                         # here's where the formatter is
import os
import re
import pandas as pd
import pickle
from collections import OrderedDict

plt.rcParams["figure.figsize"] = (4,3)
plt.rcParams["axes.labelsize"] = 'xx-large'
plt.rcParams['axes.titlesize'] = 'xx-large'
plt.rcParams['xtick.labelsize']= 'xx-large'
plt.rcParams['ytick.labelsize']= 'xx-large'

import scipy
from scipy.optimize import curve_fit,least_squares


# new color correction model
import pickle
from scipy.interpolate import RegularGridInterpolator

In [None]:
from astropy.modeling import models

In [None]:
from numpy.random import lognormal

In [None]:
from matplotlib.ticker import (MultipleLocator, FormatStrFormatter,
                               AutoMinorLocator)
from astropy.visualization import (MinMaxInterval, SqrtStretch,ZScaleInterval,PercentileInterval,
                                   ImageNormalize,imshow_norm)
from astropy.visualization.stretch import SinhStretch, LinearStretch,AsinhStretch,LogStretch

from astropy.time import Time
from astropy.timeseries import TimeSeries

In [None]:
# Remove to run faster the notebook
#import ipywidgets as widgets
#%matplotlib widget

In [None]:
from importlib.metadata import version

In [None]:
# wavelength bin colors
#jet = plt.get_cmap('jet')
#cNorm = mpl.colors.Normalize(vmin=0, vmax=NSED)
#scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=jet)
#all_colors = scalarMap.to_rgba(np.arange(NSED), alpha=1)

In [None]:
np.__version__

In [None]:
pd.__version__

In [None]:
from scipy import signal

In [None]:
def convertNumToDatestr(num):
    """Convert a date packed as the number YYYYMMDD into a pandas datetime.

    Parameters
    ----------
    num : int
        Date encoded as ``year*10_000 + month*100 + day`` (e.g. ``20240115``).

    Returns
    -------
    Result of ``pd.to_datetime`` applied to the zero-padded ``YYYY-MM-DD`` string.
    """
    # Peel the fields off starting from the most significant one
    year, month_and_day = divmod(num, 10_000)
    month, day = divmod(month_and_day, 100)

    # Zero-pad each field to its nominal width before joining with dashes
    fields = ((year, 4), (month, 2), (day, 2))
    return pd.to_datetime("-".join(str(value).zfill(width) for value, width in fields))

In [None]:
def pdf_lognormal(x,a0,mu,sigma):
    """Log-normal probability density multiplied by an amplitude.

    Evaluates ``a0 * exp(-(ln(x) - mu)**2 / (2 sigma**2)) / (x sigma sqrt(2 pi))``.

    Parameters
    ----------
    x : float or numpy array
        Point(s) where the density is evaluated.
    a0 : float
        Multiplicative amplitude applied to the density.
    mu : float
        Location parameter, compared with ``ln(x)``.
    sigma : float
        Width parameter of the Gaussian in ``ln(x)``.

    Returns
    -------
    Scaled density, with the same shape as ``x``.
    """
    log_deviation = np.log(x) - mu
    gaussian_term = np.exp(-log_deviation**2 / (2 * sigma**2))
    normalisation = x * sigma * np.sqrt(2 * np.pi)
    return a0*(gaussian_term / normalisation)

Reference for the Gaussian-process sample plotting helper below: https://scikit-learn.org/stable/auto_examples/gaussian_process/plot_gpr_prior_posterior.html

In [None]:
def plot_gpr_samples(gpr_model, n_samples, ax, x, label):
    """Draw sample functions, the mean and the one-sigma band of a Gaussian process.

    With an unfitted model the samples come from the prior distribution; with
    a fitted model they come from the posterior. Each sample is a whole
    function evaluated on ``x``.

    Parameters
    ----------
    gpr_model : `GaussianProcessRegressor`
        A :class:`~sklearn.gaussian_process.GaussianProcessRegressor` model.
    n_samples : int
        The number of samples to draw from the Gaussian process distribution.
    ax : matplotlib axis
        The matplotlib axis where to plot the samples.
    x : numpy array
        Abscissa values; reshaped into a single column before being passed
        to the model.
    label : str
        Legend label given to the first sample curve only.
    """
    column = x.reshape(-1, 1)

    mean_curve, std_curve = gpr_model.predict(column, return_std=True)
    draws = gpr_model.sample_y(column, n_samples)

    # One dashed curve per draw; only the first carries the legend entry
    for rank, curve in enumerate(draws.T):
        legend_kw = {"label": label} if rank == 0 else {}
        ax.plot(x, curve, linestyle="--", alpha=0.7, **legend_kw)

    ax.plot(x, mean_curve, color="black", label="Mean")
    lower, upper = mean_curve - std_curve, mean_curve + std_curve
    ax.fill_between(x, lower, upper, alpha=0.1, color="black", label=r"$\pm$ 1 std. dev.")

## Configuration

In [None]:
observing_location = EarthLocation.of_site('Rubin Observatory')
tz = timezone('America/Santiago')

### MERRA2

In [None]:
filename_m2 = "../../SpectroMerra2/MerradataMerged/Merge_inst1_2d_asm_Nx_M2I1NXASM-2021-2024.csv"
filename_m2b = "../../SpectroMerra2/MerradataMerged/Merge_tavg1_2d_aer_Nx_M2T1NXAER-2021-2024.csv"

In [None]:
df_m = pd.read_csv(filename_m2)
df_mb = pd.read_csv(filename_m2b)

In [None]:
Nm = len(df_m)
Nmb = len(df_mb)
print("Number of points :: ",Nm,Nmb)

In [None]:
df_mb.columns

In [None]:
TMIN = pd.to_datetime(df_m.time.min())
TMAX = pd.to_datetime(df_m.time.max())

### Convert to MJD

In [None]:
df_m["mjd"] = Time(pd.to_datetime(df_m.time.values)).mjd
df_mb["mjd"] = Time(pd.to_datetime(df_mb.time.values)).mjd

In [None]:
mjd_zoom_start = Time("2024-01-01").mjd
mjd_zoom_stop = Time("2025-06-30").mjd

In [None]:
mjd_obs_start = df_m["mjd"].min() 
mjd_obs_stop = df_m["mjd"].max() 

## PWV

In [None]:
from matplotlib.dates import DateFormatter

# Two stacked panels of the TQV column: versus calendar date (top), versus MJD (bottom)
date_form = DateFormatter("%y-%m")

fig = plt.figure(figsize=(18,10))
gs = GridSpec(2, 1, figure=fig)
ax1 = fig.add_subplot(gs[0])
ax2 = fig.add_subplot(gs[1])

leg1 = ax1.get_legend()
leg2 = ax2.get_legend()

# Summary statistics displayed in the text box of the top panel
data = df_m.TQV.values
mean = np.mean(data)
median = np.median(data)
std = np.std(data)
textstr = (
    f"Expected max-range for PWV : \n"
    f"- average : {mean:.2f} mm\n"
    f"- median : {median:.2f} mm\n"
    f"- sigma : {std:.2f} mm"
)

# Top panel; the date formatter is installed after the datetime data are plotted
ax1.plot(pd.to_datetime(df_m.Time.values), data, c="b", lw=0.5, label="Merra2")
ax1.xaxis.set_major_formatter(date_form)
ax1.set(xlabel="time", ylabel="PWV (mm)", title="Precipitable water vapor from Merra2")
ax1.legend()
ax1.text(0.05, 0.95, textstr, transform=ax1.transAxes, fontsize=14,
         verticalalignment='top', bbox=props)

# Bottom panel: same series against MJD
ax2.plot(df_m.mjd, data, c="b", lw=0.5, label="Merra2")
ax2.set(xlabel="time (MJD)", ylabel="PWV (mm)")
ax2.legend()

figname = f"{pathfigs}/pwv_allpoints_merra2{figtype}"
fig.savefig(figname)
plt.show()


In [None]:
fig,ax =plt.subplots(1,1,figsize=(6,4))

hist_sampling = np.diff(df_m.mjd)
ax.hist(hist_sampling,bins=100,range=(0,0.2))
ax.set_title("sampling time for PWV")
ax.set_xlabel('day')
plt.show()

In [None]:
ts = hist_sampling.mean()
fs = 1/ts
x = df_m.TQV.values

In [None]:
print("sampling time in minutes",ts*24*60)

In [None]:
# Periodogram of the PWV series: linear frequency axis, logarithmic power axis.
fig,ax =plt.subplots(1,1,figsize=(10,6))
# fs is built from sampling intervals in days, so f is in 1/day. With the
# default 'density' scaling the result is a power per unit frequency,
# i.e. mm**2 per (1/day) = mm**2 * day.
f, Pxx_den = signal.periodogram(x, fs)
ax.semilogy(f, Pxx_den,color="b")
ax.set_ylim([1e-10, 1e4])
ax.set_xlabel('frequency [1/days]')
ax.set_ylabel('PSD [mm**2 day]')
ax.set_title("Periodogram for PWV at Merra2")
plt.show()

In [None]:
# Periodogram of the PWV series on log-log axes.
fig,ax =plt.subplots(1,1,figsize=(10,6))
# fs is built from sampling intervals in days, so f is in 1/day. With the
# default 'density' scaling the result is a power per unit frequency,
# i.e. mm**2 per (1/day) = mm**2 * day.
f, Pxx_den = signal.periodogram(x, fs)
# The first bin is the zero frequency, which cannot be placed on a
# logarithmic axis: plot from the second bin on.
ax.loglog(f[1:], Pxx_den[1:],color="b")
ax.set_ylim([1e-10, 1e4])
ax.set_xlabel('frequency [1/days]')
ax.set_ylabel('PSD [mm**2 day]')
ax.set_title("Periodogram for PWV at Merra2")
plt.show()

In [None]:
# Deliberate stop: this always raises AssertionError, so a "Run All" halts here.
# NOTE(review): the Gaussian-process sections below appear to come from another
# notebook; check that every name they use is defined before removing this guard.
assert False

## Ozone

In [None]:
from matplotlib.dates import DateFormatter
#date_form = DateFormatter("%y-%m-%dT%H:%M")
date_form = DateFormatter("%y-%m")

fig = plt.figure(figsize=(18,10))
gs = GridSpec(2, 1,figure=fig)

ax1 = fig.add_subplot(gs[0])
ax2 = fig.add_subplot(gs[1])
        
leg1=ax1.get_legend()
leg2=ax2.get_legend()


ax1.plot(pd.to_datetime(df_m.Time.values), df_m.TO3.values,c="r",lw=0.5,label="Merra2")
ax1.set_xlabel("time")
ax1.xaxis.set_major_formatter(date_form)
ax1.set_title("Ozone from Merra2")
#ax1.legend()
ax1.set_ylabel("Ozone (DU)")
#ax.set_xlim(TMIN,TMAX)

data = df_m.TO3.values
mean = np.mean(data)
median = np.median(data)
std = np.std(data)
textstr = "\n".join((f"Expected range for Ozone : ",
                     f"- average : {mean:.2f} DU",
                     f"- median : {median:.2f} DU",
                     f"- sigma : {std:.2f} DU",     
                    ))
ax1.text(0.05, 0.95, textstr, transform=ax1.transAxes, fontsize=14,verticalalignment='top', bbox=props)

ax2.plot(df_m.mjd, df_m.TO3.values,c="r",lw=0.5,label="Merra2")
ax2.set_xlabel("time (MJD)")
ax2.legend()
ax2.set_ylabel("Ozone (DU)")


figname =f"{pathfigs}/ozone_allpoints_merra2"+figtype
fig.savefig(figname)
plt.show()


### Define the kernels

In [None]:
long_term_trend_kernel = 5.0**2 * RBF(length_scale=50.0)

seasonal_kernel = (
    2.0**2
    * RBF(length_scale=1000.0)
    * ExpSineSquared(length_scale=365.25, periodicity=1.0, periodicity_bounds="fixed")
)

irregularities_kernel = 2.0**2 * RationalQuadratic(length_scale=10.0, alpha=1.0)

noise_kernel = 0.1**2 * RBF(length_scale=1.) + WhiteKernel(
    noise_level=1.**2, noise_level_bounds=(1e-5, 1e5)
)

In [None]:
ozone_kernel = (
    long_term_trend_kernel + seasonal_kernel + irregularities_kernel + noise_kernel
    #seasonal_kernel + irregularities_kernel + noise_kernel
    #seasonal_kernel + irregularities_kernel 
)

In [None]:
# Fit the Gaussian process on a random subsample of the ozone (TO3) column.
# The subsample is drawn here because `index_selected` is only built further
# down the notebook, and from the aerosol table rather than from df_m.
# A separate name is used so the aerosol selection is never overwritten.
n_o3 = len(df_m)
index_selected_o3 = np.sort(
    np.random.choice(np.arange(n_o3), size=min(1000, n_o3), replace=False)
)

x = df_m.mjd.values[index_selected_o3]
X = x.reshape(-1, 1)
y = df_m.TO3.values[index_selected_o3]
y_mean = y.mean()

# The mean is subtracted by hand (normalize_y=False); the prediction cell adds it back
gaussian_process = GaussianProcessRegressor(kernel=ozone_kernel, normalize_y=False)
gaussian_process.fit(X, y - y_mean)

### Prediction

In [None]:
# Prediction grid: one point every 2 days over the df_m time span, extended by
# 360 days past the last sample (same convention as the aerosol sections).
# mjd_min / mjd_max were not defined anywhere before this cell.
mjd_min = df_m.mjd.values.min()
mjd_max = df_m.mjd.values.max() + 360

x_test = np.arange(start=mjd_min, stop=mjd_max,step=2)
X_test = x_test.reshape(-1,1)
mean_y_pred, std_y_pred = gaussian_process.predict(X_test, return_std=True)
# The fit was done on mean-subtracted data: restore the offset
mean_y_pred += y_mean

In [None]:
# Ozone samples and Gaussian-process prediction: full time span (top) and
# zoom on [mjd_zoom_start, mjd_zoom_stop] (bottom).
fig = plt.figure(figsize=(18,10),layout="constrained")
gs = GridSpec(2, 1,figure=fig)
ax1 = fig.add_subplot(gs[0])
ax2 = fig.add_subplot(gs[1])

panels = (
    (ax1, "tab:red", "Fit Ozone with Gaussian process"),
    (ax2, "tab:blue", "Time-Zoom on Fit Ozone with Gaussian process"),
)
for panel, gp_color, panel_title in panels:
    # The line style is given once, by keyword: combining it with a '-' format
    # string is a redundant definition that matplotlib warns about (the
    # keyword takes precedence, so the drawing is unchanged).
    panel.plot(x, y, color="k", linestyle="dashed", label="M2 Measurements")
    panel.plot(x_test, mean_y_pred, color=gp_color, alpha=1.0, label="Gaussian process")
    # One-sigma band around the predicted mean
    panel.fill_between(
        X_test.ravel(),
        mean_y_pred - std_y_pred,
        mean_y_pred + std_y_pred,
        color="tab:red",
        alpha=0.2,
    )
    panel.legend()
    panel.set_ylabel("Ozone (DU)")
    panel.set_xlabel("mjd")
    panel.set_title(panel_title)

ax2.set_xlim(mjd_zoom_start,mjd_zoom_stop)

figname =f"{pathfigs}/ozone_fitgp_merra2"+figtype
fig.savefig(figname)
plt.show()


### Interpretation of hyper-parameters


- model of CO2 : https://scikit-learn.org/stable/auto_examples/gaussian_process/plot_gpr_co2.html#sphx-glr-auto-examples-gaussian-process-plot-gpr-co2-py
- see the different types of kernels here : https://scikit-learn.org/stable/modules/gaussian_process.html#gp-kernels and implementation here  https://scikit-learn.org/stable/auto_examples/gaussian_process/plot_gpr_prior_posterior.html

In [None]:
gaussian_process.kernel_

In [None]:
### pickup kernels one by one
# NOTE(review): the numerical values look copied from a printed
# `gaussian_process.kernel_`; confirm they still match the current fit.
the_kernel_a = 0.0931**2 * RBF(length_scale=1e-05)
label_a = "gpr_a : 0.0931**2 * RBF(length_scale=1e-05)"

the_kernel_b = 0.0209**2 * RBF(length_scale=1e+05) * ExpSineSquared(length_scale=42.9, periodicity=1)
label_b = "gpr_b : 0.0209**2 * RBF(length_scale=1e+05) * ExpSineSquared(length_scale=42.9, periodicity=1)"

the_kernel_c = 16.3**2 * RationalQuadratic(alpha=0.11, length_scale=0.687)
label_c = "gpr_c : 16.3**2 * RationalQuadratic(alpha=0.11, length_scale=0.687)"

the_kernel_d = 15.4**2 * RBF(length_scale=83.6)
label_d = "gpr_d : 15.4**2 * RBF(length_scale=83.6)"

the_kernel_e =  WhiteKernel(noise_level=0.0448)
label_e= "gpr_e : WhiteKernel(noise_level=0.0448)"

### generate gaussian process from each kernel
gpr_a = GaussianProcessRegressor(kernel=the_kernel_a, random_state=0)
gpr_b = GaussianProcessRegressor(kernel=the_kernel_b, random_state=0)
gpr_c = GaussianProcessRegressor(kernel=the_kernel_c, random_state=0)
gpr_d = GaussianProcessRegressor(kernel=the_kernel_d, random_state=0)
gpr_e = GaussianProcessRegressor(kernel=the_kernel_e, random_state=0)

# Abscissa for the samples: one point per day over the df_m time span.
# Built here because `mjd_obs_range` is only defined further down the notebook.
mjd_range_o3 = np.arange(df_m["mjd"].min(), df_m["mjd"].max(), 1)

fig, axs = plt.subplots(nrows=5, sharex=True, sharey=True, figsize=(10, 8),layout="constrained")
n_samples=3

# plot prior: the regressors are not fitted, so the samples come from the prior
components = zip(
    axs,
    (gpr_a, gpr_b, gpr_c, gpr_d, gpr_e),
    (label_a, label_b, label_c, label_d, label_e),
)
for panel, gpr, gpr_label in components:
    plot_gpr_samples(gpr, n_samples=n_samples, ax=panel, x=mjd_range_o3, label=gpr_label)
    panel.legend()

axs[0].set_title("Kernel functions for Ozone")
figname =f"{pathfigs}/ozone_kernelcomponents_merra2"+figtype
fig.savefig(figname)
plt.show()


## Aerosol VAOD

In [None]:
from matplotlib.dates import DateFormatter
#date_form = DateFormatter("%y-%m-%dT%H:%M")
date_form = DateFormatter("%y-%m")

fig = plt.figure(figsize=(18,10))
gs = GridSpec(2, 1,figure=fig)
ax1 = fig.add_subplot(gs[0])
ax2 = fig.add_subplot(gs[1])
        
leg1=ax1.get_legend()
leg2=ax2.get_legend()


ax1.plot(pd.to_datetime(df_mb.Time.values), df_mb.TOTEXTTAU.values,c="g",lw=0.5,label="Merra2")
ax1.set_xlabel("time")
ax1.xaxis.set_major_formatter(date_form)
ax1.set_title("VAOD from Merra2")
ax1.legend()
ax1.set_ylabel("VAOD")
#ax.set_xlim(TMIN,TMAX)
data = df_mb.TOTEXTTAU.values
mean = np.mean(data)
median = np.median(data)
std = np.std(data)
textstr = "\n".join((f"Expected max-range for VAOD : ",
                     f"- average : {mean:.3f}",
                     f"- median : {median:.3f}",
                     f"- sigma : {std:.3f}",     
                    ))
ax1.text(0.05, 0.95, textstr, transform=ax1.transAxes, fontsize=14,verticalalignment='top', bbox=props)

ax2.plot(df_mb.mjd, df_mb.TOTEXTTAU.values,c="g",lw=0.5,label="Merra2")
ax2.set_xlabel("time (MJD)")
ax2.legend()
ax2.set_ylabel("VAOD")


figname =f"{pathfigs}/vaod_allpoints_merra2"+figtype
fig.savefig(figname)
plt.show()


### Define the kernels

In [None]:
long_term_trend_kernel = 5.0**2 * RBF(length_scale=50.0)

seasonal_kernel = (
    2.0**2
    * RBF(length_scale=1000.0)
    * ExpSineSquared(length_scale=365.25, periodicity=1.0, periodicity_bounds="fixed")
)

irregularities_kernel = 2.0**2 * RationalQuadratic(length_scale=10.0, alpha=1.0)

noise_kernel = 0.1**2 * RBF(length_scale=1.) + WhiteKernel(
    noise_level=1.**2, noise_level_bounds=(1e-5, 1e5)
)

In [None]:
vaod_kernel = (
    long_term_trend_kernel + seasonal_kernel + irregularities_kernel + noise_kernel
    #seasonal_kernel + irregularities_kernel + noise_kernel
    #seasonal_kernel + irregularities_kernel 
)

### Make a subsample

In [None]:
# Random subsample (without replacement) of the row indices of the aerosol table
a = np.arange(0,Nmb ,1)
#b = np.random.choice(a, size=10000,replace=False)
# The size is capped at Nmb: drawing without replacement raises an error when
# more items are requested than are available.
b = np.random.choice(a, size=min(1000, Nmb),replace=False)

In [None]:
unique, counts = np.unique(b, return_counts=True)

In [None]:
index_selected = np.sort(b)

### Fit the gaussian process

In [None]:
x = df_mb.mjd.values[index_selected]
X = x.reshape(-1, 1)
y = df_mb.TOTEXTTAU.values[index_selected]
y_mean = y.mean()

gaussian_process = GaussianProcessRegressor(kernel=vaod_kernel, normalize_y=False)
gaussian_process.fit(X, y - y_mean)

## Prediction

In [None]:
mjd_min_b = df_mb.mjd.values.min()
mjd_max_b = df_mb.mjd.values.max() + 360

In [None]:
x_test = np.arange(start=mjd_min_b, stop=mjd_max_b,step=2)
X_test = x_test.reshape(-1,1)
mean_y_pred, std_y_pred = gaussian_process.predict(X_test, return_std=True)
mean_y_pred += y_mean

In [None]:
# VAOD samples and Gaussian-process prediction: full time span (top) and
# zoom on [mjd_zoom_start, mjd_zoom_stop] (bottom).
fig = plt.figure(figsize=(18,10),layout="constrained")
gs = GridSpec(2, 1,figure=fig)
ax1 = fig.add_subplot(gs[0])
ax2 = fig.add_subplot(gs[1])

panels = (
    (ax1, "tab:green", "Fit Aerosol VAOD with Gaussian process"),
    (ax2, "tab:blue", "Time-Zoom on Fit VAOD with Gaussian process"),
)
for panel, gp_color, panel_title in panels:
    # The line style is given once, by keyword: combining it with a '-' format
    # string is a redundant definition that matplotlib warns about (the
    # keyword takes precedence, so the drawing is unchanged).
    panel.plot(x, y, color="k", linestyle="dashed", label="M2 Measurements")
    panel.plot(x_test, mean_y_pred, color=gp_color, alpha=1.0, label="Gaussian process")
    # One-sigma band around the predicted mean
    panel.fill_between(
        X_test.ravel(),
        mean_y_pred - std_y_pred,
        mean_y_pred + std_y_pred,
        color="tab:green",
        alpha=0.2,
    )
    panel.legend()
    panel.set_ylabel("VAOD")
    panel.set_xlabel("mjd")
    panel.set_title(panel_title)

ax2.set_xlim(mjd_zoom_start,mjd_zoom_stop)

figname =f"{pathfigs}/aervaod_fitgp_merra2"+figtype
fig.savefig(figname)
plt.show()


### Interpretation of hyper-parameters


- model of CO2 : https://scikit-learn.org/stable/auto_examples/gaussian_process/plot_gpr_co2.html#sphx-glr-auto-examples-gaussian-process-plot-gpr-co2-py
- see the different types of kernels here : https://scikit-learn.org/stable/modules/gaussian_process.html#gp-kernels and implementation here  https://scikit-learn.org/stable/auto_examples/gaussian_process/plot_gpr_prior_posterior.html

In [None]:
gaussian_process.kernel_

In [None]:
mjd_obs_start = df_mb["mjd"].min() 
mjd_obs_stop = df_mb["mjd"].max() 
mjd_obs_range = np.arange(mjd_obs_start,mjd_obs_stop,1)

In [None]:
### pickup kernels one by one
the_kernel_a = 0.00316**2 * RBF(length_scale=1e+05)
label_a = "gpr_a : 0.00316**2 * RBF(length_scale=1e+05)"

the_kernel_b =  0.00762**2 * RBF(length_scale=7.64e+04) * ExpSineSquared(length_scale=0.84, periodicity=1)
label_b = "gpr_b :  0.00762**2 * RBF(length_scale=7.64e+04) * ExpSineSquared(length_scale=0.84, periodicity=1)"

the_kernel_c = 0.00316**2 * RationalQuadratic(alpha=1.51, length_scale=1e+05)
label_c = "gpr_c : 0.00316**2 * RationalQuadratic(alpha=1.51, length_scale=1e+05)"                                         
                                          
the_kernel_d = 0.0355**2 * RBF(length_scale=0.507)
label_d = "gpr_d : 0.0355**2 * RBF(length_scale=0.507)"
                                           
the_kernel_e =  WhiteKernel(noise_level=4.57e-05)
label_e= "gpr_e : WhiteKernel(noise_level=4.57e-05)"

### generate gaussian process from each kernel
gpr_a = GaussianProcessRegressor(kernel=the_kernel_a, random_state=0)
gpr_b = GaussianProcessRegressor(kernel=the_kernel_b, random_state=0)
gpr_c = GaussianProcessRegressor(kernel=the_kernel_c, random_state=0)
gpr_d = GaussianProcessRegressor(kernel=the_kernel_d, random_state=0)
gpr_e = GaussianProcessRegressor(kernel=the_kernel_e, random_state=0)

fig, axs = plt.subplots(nrows=5, sharex=True, sharey=True, figsize=(10, 8),layout="constrained")
n_samples=3

# plot prior
plot_gpr_samples(gpr_a, n_samples = n_samples, ax=axs[0], x=mjd_obs_range,label=label_a)
axs[0].legend()
plot_gpr_samples(gpr_b, n_samples = n_samples, ax=axs[1], x=mjd_obs_range, label=label_b)
axs[1].legend()
plot_gpr_samples(gpr_c, n_samples = n_samples, ax=axs[2], x=mjd_obs_range,label=label_c)
axs[2].legend()
plot_gpr_samples(gpr_d, n_samples = n_samples, ax=axs[3], x=mjd_obs_range,label=label_d)
axs[3].legend()
plot_gpr_samples(gpr_e, n_samples = n_samples, ax=axs[4], x=mjd_obs_range,label=label_e)
axs[4].legend()

axs[0].set_title("Kernel functions for VAOD")
figname =f"{pathfigs}/vaod_kernelcomponents_merra2"+figtype
fig.savefig(figname)
plt.show()


## Aerosol Angstrom

In [None]:
from matplotlib.dates import DateFormatter
#date_form = DateFormatter("%y-%m-%dT%H:%M")
date_form = DateFormatter("%y-%m")

# Two stacked panels of the TOTANGSTR column: versus calendar date (top),
# versus MJD (bottom)
fig = plt.figure(figsize=(18,10))
gs = GridSpec(2, 1,figure=fig)

ax1 = fig.add_subplot(gs[0])
ax2 = fig.add_subplot(gs[1])

leg1=ax1.get_legend()
leg2=ax2.get_legend()


ax1.plot(pd.to_datetime(df_mb.Time.values), df_mb.TOTANGSTR.values,c="purple",lw=0.5,label="Merra2")
ax1.set_xlabel("time")
ax1.xaxis.set_major_formatter(date_form)
ax1.set_title("VAOD_Angstrom from Merra2")
ax1.legend()
ax1.set_ylabel("VAOD_Angstrom")
#ax.set_xlim(TMIN,TMAX)

# Summary statistics displayed in the text box of the top panel
data = df_mb.TOTANGSTR.values
mean = np.mean(data)
median = np.median(data)
std = np.std(data)
textstr = "\n".join((f"Expected max-range for VAOD-Angstrom : ",
                     f"- average : {mean:.3f}",
                     f"- median : {median:.3f}",
                     f"- sigma : {std:.3f}",
                    ))
ax1.text(0.05, 0.95, textstr, transform=ax1.transAxes, fontsize=14,verticalalignment='top', bbox=props)


ax2.plot(df_mb.mjd, df_mb.TOTANGSTR.values,c="purple",lw=0.5,label="Merra2")
ax2.set_xlabel("time (MJD)")
ax2.legend()
# This panel shows TOTANGSTR, like the top one: the label was a copy-paste
# leftover ("VAOD") from the VAOD section.
ax2.set_ylabel("VAOD_Angstrom")



figname =f"{pathfigs}/vaodangstrom_allpoints_merra2"+figtype
fig.savefig(figname)
plt.show()

### Define kernels

In [None]:
long_term_trend_kernel = 5.0**2 * RBF(length_scale=50.0)

seasonal_kernel = (
    2.0**2
    * RBF(length_scale=1000.0)
    * ExpSineSquared(length_scale=365.25, periodicity=1.0, periodicity_bounds="fixed")
)

irregularities_kernel = 2.0**2 * RationalQuadratic(length_scale=10.0, alpha=1.0)

noise_kernel = 0.1**2 * RBF(length_scale=1.) + WhiteKernel(
    noise_level=1.**2, noise_level_bounds=(1e-5, 1e5)
)

In [None]:
angstrom_kernel = (
    long_term_trend_kernel + seasonal_kernel + irregularities_kernel + noise_kernel
    #seasonal_kernel + irregularities_kernel + noise_kernel
    #seasonal_kernel + irregularities_kernel 
)

### Fit the gaussian process model

In [None]:
x = df_mb.mjd.values[index_selected]
X = x.reshape(-1, 1)
y = df_mb.TOTANGSTR.values[index_selected]
y_mean = y.mean()

gaussian_process = GaussianProcessRegressor(kernel=angstrom_kernel, normalize_y=False)
gaussian_process.fit(X, y - y_mean)

### Prediction

In [None]:
x_test = np.arange(start=mjd_min_b, stop=mjd_max_b,step=2)
X_test = x_test.reshape(-1,1)
mean_y_pred, std_y_pred = gaussian_process.predict(X_test, return_std=True)
mean_y_pred += y_mean

In [None]:
# Angstrom-exponent samples and Gaussian-process prediction: full time span
# (top) and zoom on [mjd_zoom_start, mjd_zoom_stop] (bottom).
fig = plt.figure(figsize=(18,10),layout="constrained")
gs = GridSpec(2, 1,figure=fig)
ax1 = fig.add_subplot(gs[0])
ax2 = fig.add_subplot(gs[1])

panels = (
    (ax1, "tab:purple", "Fit Aerosol Angstrom exponent with Gaussian process"),
    (ax2, "tab:blue", "Time-Zoom on Angstrom exponent with Gaussian process"),
)
for panel, gp_color, panel_title in panels:
    # The line style is given once, by keyword: combining it with a '-' format
    # string is a redundant definition that matplotlib warns about (the
    # keyword takes precedence, so the drawing is unchanged).
    panel.plot(x, y, color="k", linestyle="dashed", label="M2 Measurements")
    panel.plot(x_test, mean_y_pred, color=gp_color, alpha=1.0, label="Gaussian process")
    # One-sigma band around the predicted mean
    panel.fill_between(
        X_test.ravel(),
        mean_y_pred - std_y_pred,
        mean_y_pred + std_y_pred,
        color="tab:purple",
        alpha=0.2,
    )
    panel.legend()
    panel.set_ylabel("Angstrom")
    panel.set_xlabel("mjd")
    panel.set_title(panel_title)

ax2.set_xlim(mjd_zoom_start,mjd_zoom_stop)

figname =f"{pathfigs}/aerangstrom_fitgp_merra2"+figtype
fig.savefig(figname)
plt.show()


### Interpretation of hyper-parameters


- model of CO2 : https://scikit-learn.org/stable/auto_examples/gaussian_process/plot_gpr_co2.html#sphx-glr-auto-examples-gaussian-process-plot-gpr-co2-py
- see the different types of kernels here : https://scikit-learn.org/stable/modules/gaussian_process.html#gp-kernels and implementation here  https://scikit-learn.org/stable/auto_examples/gaussian_process/plot_gpr_prior_posterior.html

In [None]:
gaussian_process.kernel_

In [None]:
### pickup kernels one by one
the_kernel_a = 0.00316**2 * RBF(length_scale=4.65e+04)
label_a = "gpr_a : 0.00316**2 * RBF(length_scale=4.65e+04)"

the_kernel_b =  0.035**2 * RBF(length_scale=1.31e+03) * ExpSineSquared(length_scale=1.34, periodicity=1)
label_b = "gpr_b : 0.035**2 * RBF(length_scale=1.31e+03) * ExpSineSquared(length_scale=1.34, periodicity=1)"

the_kernel_c = 0.122**2 * RationalQuadratic(alpha=0.184, length_scale=0.37)
label_c = "gpr_c : 0.122**2 * RationalQuadratic(alpha=0.184, length_scale=0.37)"                                         
                                          
the_kernel_d =  0.0436**2 * RBF(length_scale=0.447) 
label_d = "gpr_d : 0.0436**2 * RBF(length_scale=0.447) "
                                           
the_kernel_e =  WhiteKernel(noise_level=0.000249)
label_e= "gpr_e : WhiteKernel(noise_level=0.000249)"

### generate gaussian process from each kernel
gpr_a = GaussianProcessRegressor(kernel=the_kernel_a, random_state=0)
gpr_b = GaussianProcessRegressor(kernel=the_kernel_b, random_state=0)
gpr_c = GaussianProcessRegressor(kernel=the_kernel_c, random_state=0)
gpr_d = GaussianProcessRegressor(kernel=the_kernel_d, random_state=0)
gpr_e = GaussianProcessRegressor(kernel=the_kernel_e, random_state=0)

fig, axs = plt.subplots(nrows=5, sharex=True, sharey=True, figsize=(10, 8),layout="constrained")
n_samples=3

# plot prior
plot_gpr_samples(gpr_a, n_samples = n_samples, ax=axs[0], x=mjd_obs_range,label=label_a)
axs[0].legend()
plot_gpr_samples(gpr_b, n_samples = n_samples, ax=axs[1], x=mjd_obs_range, label=label_b)
axs[1].legend()
plot_gpr_samples(gpr_c, n_samples = n_samples, ax=axs[2], x=mjd_obs_range,label=label_c)
axs[2].legend()
plot_gpr_samples(gpr_d, n_samples = n_samples, ax=axs[3], x=mjd_obs_range,label=label_d)
axs[3].legend()
plot_gpr_samples(gpr_e, n_samples = n_samples, ax=axs[4], x=mjd_obs_range,label=label_e)
axs[4].legend()


axs[0].set_title("Kernel functions for Angstrom component")
figname =f"{pathfigs}/angstrom_kernelcomponents_merra2"+figtype
fig.savefig(figname)
plt.show()