# Fit Atmosphere time sequences with Gaussian Processes Periodic variations from Merra2

- author Sylvie Dagoret-Campagne
- affiliation : IJCLab
- creation date 2025-10-20 :
- last update : 2025-02-25 : Discuss with ChatGPT to do GP fit
- last update : 2025-02-26 : Do GP periodic fit first
- last update : 2025-03-04 : Save Gaussian process parameters and plot the kernel in a function; be careful with ozone, whose kernel is not positive definite, hence an alpha parameter is used for regularisation
- Kernel @usdf **w_2024_50**
- Office emac : mamba_py311
- Home emac : base (conda)
- laptop : conda_py311

**Goal** : Fit the variations of the Merra2 parameters that impact the transmission

- CO2 fit : https://scikit-learn.org/stable/auto_examples/gaussian_process/plot_gpr_co2.html#sphx-glr-auto-examples-gaussian-process-plot-gpr-co2-py

- Kernels : https://scikit-learn.org/stable/modules/gaussian_process.html#gp-kernels

In [None]:
from platform import python_version
print(python_version())

In [None]:
import warnings
warnings.resetwarnings()
warnings.simplefilter('ignore')

In [None]:
from platform import python_version
print(python_version())

In [None]:
import os

In [None]:
# Output directory and file extension for the figures saved by this notebook.
pathfigs = "figsFitGPPerAtmosphereFomMerra2"
# exist_ok=True replaces the check-then-create pattern: no race between the
# test and the creation, and re-running the cell is a no-op.
os.makedirs(pathfigs, exist_ok=True)
figtype = ".png"

In [None]:
# Output directory and file extension for the data files (fitted models,
# residual tables) written by this notebook.
pathdata = "dataFitGPPerAtmosphereFomMerra2"
# exist_ok=True replaces the check-then-create pattern: no race between the
# test and the creation, and re-running the cell is a no-op.
os.makedirs(pathdata, exist_ok=True)
datatype = ".csv"

In [None]:
import numpy as np
from numpy.linalg import inv
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
from mpl_toolkits.axes_grid1 import make_axes_locatable
from matplotlib.colors import LogNorm,SymLogNorm
from matplotlib.patches import Circle,Annulus
from astropy.visualization import ZScaleInterval
props = dict(boxstyle='round', facecolor="white", alpha=0.1)
#props = dict(boxstyle='round')

import matplotlib.colors as colors
import matplotlib.cm as cmx

import matplotlib.ticker                         # here's where the formatter is
from matplotlib.ticker import (MultipleLocator, FormatStrFormatter,
                               AutoMinorLocator)

from matplotlib.gridspec import GridSpec

from astropy.visualization import (MinMaxInterval, SqrtStretch,ZScaleInterval,PercentileInterval,
                                   ImageNormalize,imshow_norm)
from astropy.visualization.stretch import SinhStretch, LinearStretch,AsinhStretch,LogStretch

from astropy.io import fits
from astropy.wcs import WCS
from astropy import units as u
from astropy import constants as c

from astropy.coordinates.earth import EarthLocation
from datetime import datetime
from pytz import timezone

from scipy import interpolate
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KDTree, BallTree

import pandas as pd
pd.set_option("display.max_columns", None)
pd.set_option('display.max_rows', 100)

import matplotlib.ticker                         # here's where the formatter is
import os
import re
import pandas as pd
import pickle
from collections import OrderedDict

plt.rcParams["figure.figsize"] = (4,3)
plt.rcParams["axes.labelsize"] = 'xx-large'
plt.rcParams['axes.titlesize'] = 'xx-large'
plt.rcParams['xtick.labelsize']= 'xx-large'
plt.rcParams['ytick.labelsize']= 'xx-large'

import scipy
from scipy.optimize import curve_fit,least_squares


props = dict(boxstyle='round', facecolor='white', alpha=0.5)

In [None]:
# Remove to run faster the notebook
import ipywidgets as widgets
%matplotlib widget

In [None]:
from astropy.modeling import models

In [None]:
from numpy.random import lognormal

In [None]:
from matplotlib.ticker import (MultipleLocator, FormatStrFormatter,
                               AutoMinorLocator)
from astropy.visualization import (MinMaxInterval, SqrtStretch,ZScaleInterval,PercentileInterval,
                                   ImageNormalize,imshow_norm)
from astropy.visualization.stretch import SinhStretch, LinearStretch,AsinhStretch,LogStretch

from astropy.time import Time
from astropy.timeseries import TimeSeries

import pickle

In [None]:
# Remove to run faster the notebook
import ipywidgets as widgets
%matplotlib widget

In [None]:
from importlib.metadata import version

In [None]:
# wavelength bin colors
#jet = plt.get_cmap('jet')
#cNorm = mpl.colors.Normalize(vmin=0, vmax=NSED)
#scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=jet)
#all_colors = scalarMap.to_rgba(np.arange(NSED), alpha=1)

In [None]:
np.__version__

In [None]:
pd.__version__

In [None]:
from sklearn.gaussian_process.kernels import RBF
from sklearn.gaussian_process.kernels import ExpSineSquared
from sklearn.gaussian_process.kernels import RationalQuadratic
from sklearn.gaussian_process.kernels import WhiteKernel
from sklearn.gaussian_process.kernels import ConstantKernel

In [None]:
from sklearn.gaussian_process import GaussianProcessRegressor

In [None]:
from scipy.fftpack import fft, fftfreq

In [None]:
# Time scales used as kernel periods and FFT markers, all expressed in days
# (DAY is the unit).
DAY = 1.
WEEK = 7*DAY
YEAR = 365.25
MONTH = YEAR/12.      # one twelfth of a year
QUARTER = YEAR/4.     # three months
MONTHS4 = YEAR/3.     # four months
MONTHS6 = YEAR/2.     # six months

In [None]:
# Figure sizes passed to matplotlib `figsize`, as (width, height) pairs.
FIGXSIZE_1, FIGYSIZE_1 = 14, 8
FIGXSIZE_0, FIGYSIZE_0 = 14, 5

In [None]:
def convertNumToDatestr(num):
    """Convert a date packed as the number YYYYMMDD into a pandas timestamp.

    Parameters
    ----------
    num : int
        Date encoded as ``year * 10_000 + month * 100 + day``.

    Returns
    -------
    Result of ``pd.to_datetime`` applied to the zero-padded
    ``YYYY-MM-DD`` string built from ``num``.
    """
    # Peel off the year, then split the remaining MMDD part.
    year, month_day = divmod(num, 10_000)
    month, day = divmod(month_day, 100)

    fields = ((year, 4), (month, 2), (day, 2))
    datestr = "-".join(str(value).zfill(width) for value, width in fields)
    return pd.to_datetime(datestr)

In [None]:
def pdf_lognormal(x,a0,mu,sigma):
    """Evaluate a log-normal density scaled by an amplitude.

    Parameters
    ----------
    x : float or array-like
        Point(s) where the function is evaluated; ``np.log(x)`` is taken.
    a0 : float
        Multiplicative amplitude applied to the density.
    mu, sigma : float
        Mean and standard deviation of ``log(x)``.

    Returns
    -------
    ``a0 * exp(-(log(x) - mu)**2 / (2 sigma**2)) / (x sigma sqrt(2 pi))``.
    """
    log_deviation = np.log(x) - mu
    gaussian_part = np.exp(-log_deviation**2 / (2 * sigma**2))
    normalisation = x * sigma * np.sqrt(2 * np.pi)
    return a0*(gaussian_part / normalisation)

https://scikit-learn.org/stable/auto_examples/gaussian_process/plot_gpr_prior_posterior.html

In [None]:
def plot_gpr_samples(gpr_model, n_samples, ax, x, label):
    """Draw sample functions, the mean and a 1-sigma band of a GP model.

    An unfitted model yields samples from the prior, a fitted one samples
    from the posterior; each sample is a whole function evaluated on ``x``.

    Parameters
    ----------
    gpr_model : `GaussianProcessRegressor`
        A :class:`~sklearn.gaussian_process.GaussianProcessRegressor` model.
    n_samples : int
        Number of functions drawn with ``gpr_model.sample_y``.
    ax : matplotlib axis
        Axis receiving the curves.
    x : numpy array
        1-D abscissa; reshaped to a column before being given to the model.
    label : str
        Legend label, attached to the first sampled curve only.
    """
    inputs = x.reshape(-1, 1)

    mean, std = gpr_model.predict(inputs, return_std=True)
    draws = gpr_model.sample_y(inputs, n_samples)

    # One dashed line per sampled function; only the first one is labelled
    # so the legend holds a single entry for all the samples.
    for rank, draw in enumerate(draws.T):
        legend_entry = {"label": label} if rank == 0 else {}
        ax.plot(x, draw, linestyle="--", alpha=0.7, **legend_entry)

    ax.plot(x, mean, color="black", label="Mean")
    ax.fill_between(
        x,
        mean - std,
        mean + std,
        alpha=0.1,
        color="black",
        label=r"$\pm$ 1 std. dev.",
    )

In [None]:
def fourier_analysis(dates, values, ax, mode = "logxlogy",title="Analyse de Fourier - Spectre des fréquences",
                    xlabel="Fréquence (cycles par jour)",ylabel="Amplitude",label="Amplitude spectrale"):
    """Plot the one-sided FFT amplitude spectrum of a time series on ``ax``.

    The mean is subtracted from ``values``, the FFT is divided by
    ``sqrt(N)`` and the modulus of the first ``N // 2`` bins is drawn.
    Vertical markers are added for the reference cycles (year, 6/4/3 months,
    month, week, day, half day) and for the Nyquist frequency.

    Parameters
    ----------
    dates : array-like
        Sample times. Their mean spacing is used as the sampling period, so
        frequencies are in cycles per unit of ``dates``.
    values : array-like
        Samples of the signal, same length as ``dates``.
    ax : matplotlib axis
        Axis receiving the plot.
    mode : str
        One of "logxliny", "logxlogy", "linxlogy", "linxliny"; selects the
        x/y axis scales. Any other value leaves the scales unchanged.
    title, xlabel, ylabel : str
        Axis decorations.
    label : str
        Legend label of the spectrum curve.
    """
    # Centre the data on their mean
    values_centered = values - np.mean(values)

    n_points = len(dates)
    # Sampling period: mean spacing of `dates` (the FFT itself treats the
    # samples as evenly spaced).
    sampling_period = np.mean(np.diff(dates))

    # Nyquist frequency (Shannon limit)
    f_nyquist = 1 / (2 * sampling_period)

    # Fourier transform and associated frequencies
    fft_values = fft(values_centered) / np.sqrt(n_points)
    freqs = fftfreq(n_points, sampling_period)

    # Only the first half of the spectrum is kept (symmetry)
    half = n_points // 2
    positive_freqs = freqs[:half]
    positive_fft_values = np.abs(fft_values[:half])

    ax.plot(positive_freqs, positive_fft_values, 'ob-', ms=5, label=label)

    # (x scale, y scale) for each supported mode
    axis_scales = {
        "logxliny": ("log", "linear"),
        "logxlogy": ("log", "log"),
        "linxlogy": ("linear", "log"),
        "linxliny": ("linear", "linear"),
    }
    if mode in axis_scales:
        xscale, yscale = axis_scales[mode]
        ax.set_xscale(xscale)
        ax.set_yscale(yscale)

    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.set_title(title)

    # Reference cycles as (frequency, color, linestyle, legend label).
    # Every entry is a frequency, i.e. the inverse of a period; the one-day
    # marker is written 1/DAY so it stays right if DAY is ever rescaled.
    cycle_markers = (
        (1/YEAR, 'r', '-', "Cycle : 365 days - 1 year"),
        (1/MONTHS6, 'r', '--', "Cycle : 182.6 days - 6 months"),
        (1/MONTHS4, 'r', ':', "Cycle : 121.7 days - 4 months"),
        (1/QUARTER, 'r', '-.', "Cycle : 91.3 days - 3 months"),
        (1/MONTH, 'r', ':', "Cycle : 30.4 days - 1 month"),
        (1/WEEK, 'purple', '--', "Cycle : 7 days - 1 week"),
        (1/DAY, 'purple', '-', "Cycle : 1 day "),
        (1./(0.5*DAY), 'purple', '-.', "Cycle : 0.5 day "),
    )
    for frequency, color, linestyle, marker_label in cycle_markers:
        ax.axvline(frequency, color=color, linestyle=linestyle, label=marker_label)

    ax.axvline(f_nyquist, color='g', linestyle='--', label=f"Nyquist frequency({f_nyquist:.3f} cycles/days)")
    ax.legend(bbox_to_anchor=(1.05, 1.05),fontsize=12)

# Usage: call the function with your own data and an existing axis
# fourier_analysis(dates, values, ax)

In [None]:
def GetPWVPeriodicKernel_merra2():
    """Build the periodic kernel used to fit the MERRA-2 PWV time series.

    The returned kernel is the sum of four periodic terms (1 year, 6 months,
    4 months, 3 months). Each term is an ``ExpSineSquared`` kernel with
    fixed period and length scale, multiplied by a ``ConstantKernel``
    bounded to (0.1, 10).

    Returns
    -------
    kernel
        ``1 year + 6 months + 4 months + 3 months`` sum of periodic kernels.
    """

    def frozen_periodic(amplitude, length_scale, period):
        # Period and length scale are fixed; only the ConstantKernel factor
        # carries bounds.
        shape = ExpSineSquared(length_scale=length_scale, periodicity=period,
                               length_scale_bounds="fixed", periodicity_bounds="fixed")
        return ConstantKernel(amplitude, (0.1, 10.0)) * shape

    # Multi-scale seasonality
    periodic_1year_kernel = frozen_periodic(3.0, 10*YEAR, YEAR)
    periodic_6months_kernel = frozen_periodic(2.5, 20*MONTHS6, MONTHS6)
    periodic_4months_kernel = frozen_periodic(2.5, 30*MONTHS4, MONTHS4)
    periodic_3months_kernel = frozen_periodic(2.5, 40*QUARTER, QUARTER)

    # --- Components kept for experimentation; none enters the returned kernel ---
    # Long-term trend
    long_term_trend_kernel = ConstantKernel(.5, (0.0, 10.0)) * RBF(length_scale=365.0)
    # Slowly varying envelope that could modulate the periodic terms
    seasonal_enveloppe = RBF(length_scale=YEAR, length_scale_bounds=(0.5*YEAR, 10*YEAR))
    # Small irregular fluctuations
    irregularities_kernel = ConstantKernel(1.0, (0.0, 10.0)) * RationalQuadratic(length_scale=DAY, alpha=1.0)
    # Noise and local variations (a WhiteKernel term was also tried)
    noise_kernel = ConstantKernel(1.0, (0., 10.0)) * RBF(length_scale=1.0)
    # Alternatives tried: seasonal_enveloppe * (sum of periodic terms), and
    # long_term_trend_kernel + seasonal + irregularities_kernel + noise_kernel.

    return (
        periodic_1year_kernel
        + periodic_6months_kernel
        + periodic_4months_kernel
        + periodic_3months_kernel
    )


In [None]:
def GetOzonePeriodicKernel_merra2():
    """Build the periodic kernel used to fit the MERRA-2 ozone time series.

    Only the one-year periodic term is returned: an ``ExpSineSquared``
    kernel with fixed period (``YEAR``) and fixed length scale
    (``15*YEAR``), multiplied by a ``ConstantKernel`` starting at 200 with
    bounds (0, 500).

    NOTE(review): the lower bound 0 of the ConstantKernel presumably maps to
    -inf in scikit-learn's log-transformed hyperparameter space — confirm
    this is intended.

    Returns
    -------
    kernel
        The one-year periodic kernel.
    """
    # Yearly seasonality: the only component of the returned kernel
    yearly_shape = ExpSineSquared(length_scale=15*YEAR, periodicity=YEAR,
                                  periodicity_bounds="fixed", length_scale_bounds="fixed")
    periodic_1year_kernel = ConstantKernel(200, (0., 500.0)) * yearly_shape

    # --- Components kept for experimentation; none enters the returned kernel ---
    # Shorter periodic terms (their period and length scale are NOT fixed)
    periodic_6months_kernel = ConstantKernel(300, (0., 500.0)) * ExpSineSquared(length_scale=20*MONTHS6, periodicity=MONTHS6)
    periodic_3months_kernel = ConstantKernel(300, (0., 500.0)) * ExpSineSquared(length_scale=40*QUARTER, periodicity=QUARTER)
    # Long-term trend
    long_term_trend_kernel = ConstantKernel(3, (0., 10.0)) * RBF(length_scale=YEAR)
    # Slowly varying envelope that could modulate the periodic terms
    seasonal_enveloppe = RBF(length_scale=YEAR, length_scale_bounds=(0.5*YEAR, 10*YEAR))
    # Small irregular fluctuations
    irregularities_kernel = ConstantKernel(2, (0, 5.0)) * RationalQuadratic(length_scale=MONTH, alpha=1.0)
    # Noise and local variations (a WhiteKernel term was also tried)
    noise_kernel = ConstantKernel(1, (0, 5.0)) * RBF(length_scale=1.0)
    # Alternatives tried: seasonal_enveloppe * (sum of periodic terms), and
    # long_term_trend_kernel + seasonal + irregularities_kernel + noise_kernel.

    return periodic_1year_kernel


In [None]:
def GetVAODPeriodicKernel_merra2():
    """Build the periodic kernel used to fit the MERRA-2 VAOD time series.

    The returned kernel is the sum of five periodic terms (1 year, 6 months,
    4 months, 3 months, 1 month). Each term is an ``ExpSineSquared`` kernel
    with fixed period and length scale, multiplied by a ``ConstantKernel``
    starting at 0.1 with bounds (0, 2).

    NOTE(review): the lower bound 0 of the ConstantKernel presumably maps to
    -inf in scikit-learn's log-transformed hyperparameter space — confirm
    this is intended.

    Returns
    -------
    kernel
        ``1 year + 6 months + 4 months + 3 months + 1 month`` sum of
        periodic kernels.
    """

    def frozen_periodic(length_scale, period):
        # Period and length scale are fixed; only the ConstantKernel factor
        # carries bounds.
        shape = ExpSineSquared(length_scale=length_scale, periodicity=period,
                               length_scale_bounds="fixed", periodicity_bounds="fixed")
        return ConstantKernel(0.1, (0, 2.)) * shape

    # Multi-scale seasonality
    periodic_1year_kernel = frozen_periodic(10*YEAR, YEAR)
    periodic_6months_kernel = frozen_periodic(20*MONTHS6, MONTHS6)
    periodic_4months_kernel = frozen_periodic(30*MONTHS4, MONTHS4)
    periodic_3months_kernel = frozen_periodic(40*QUARTER, QUARTER)
    # NOTE(review): the other terms use a length scale of 10 years
    # (n * period); `4+12*MONTH` breaks that pattern — confirm it is not a typo.
    periodic_1months_kernel = frozen_periodic(4+12*MONTH, MONTH)

    # --- Components kept for experimentation; none enters the returned kernel ---
    # Long-term trend
    long_term_trend_kernel = ConstantKernel(3, (0., 10.0)) * RBF(length_scale=YEAR)
    # Slowly varying envelope that could modulate the periodic terms
    seasonal_enveloppe = RBF(length_scale=YEAR, length_scale_bounds=(0.5*YEAR, 5*YEAR))
    # Small irregular fluctuations
    irregularities_kernel = ConstantKernel(0.1, (0, 5.0)) * RationalQuadratic(length_scale=MONTH, alpha=1.0)
    # Noise and local variations (a WhiteKernel term was also tried)
    noise_kernel = ConstantKernel(0.1, (0, 5.0)) * RBF(length_scale=1.0)
    # Alternatives tried: seasonal_enveloppe * (sum of periodic terms), and
    # long_term_trend_kernel + seasonal + irregularities_kernel + noise_kernel.

    return (
        periodic_1year_kernel
        + periodic_6months_kernel
        + periodic_4months_kernel
        + periodic_3months_kernel
        + periodic_1months_kernel
    )


In [None]:
def GetAngstromPeriodicKernel_merra2():
    """Build the periodic kernel used to fit the MERRA-2 Angstrom time series.

    The returned kernel is the sum of five periodic terms (1 year, 6 months,
    4 months, 3 months, 1 month). Each term is an ``ExpSineSquared`` kernel
    with fixed period and length scale, multiplied by a ``ConstantKernel``
    starting at 0.1 with bounds (0, 4).

    NOTE(review): the lower bound 0 of the ConstantKernel presumably maps to
    -inf in scikit-learn's log-transformed hyperparameter space — confirm
    this is intended.

    Returns
    -------
    kernel
        ``1 year + 6 months + 4 months + 3 months + 1 month`` sum of
        periodic kernels.
    """

    def frozen_periodic(length_scale, period):
        # Period and length scale are fixed; only the ConstantKernel factor
        # carries bounds.
        shape = ExpSineSquared(length_scale=length_scale, periodicity=period,
                               length_scale_bounds="fixed", periodicity_bounds="fixed")
        return ConstantKernel(0.1, (0, 4.)) * shape

    # Multi-scale seasonality
    periodic_1year_kernel = frozen_periodic(10*YEAR, YEAR)
    periodic_6months_kernel = frozen_periodic(20*MONTHS6, MONTHS6)
    periodic_4months_kernel = frozen_periodic(30*MONTHS4, MONTHS4)
    periodic_3months_kernel = frozen_periodic(40*QUARTER, QUARTER)
    # NOTE(review): the other terms use a length scale of 10 years
    # (n * period); `4+12*MONTH` breaks that pattern — confirm it is not a typo.
    periodic_1months_kernel = frozen_periodic(4+12*MONTH, MONTH)

    # --- Components kept for experimentation; none enters the returned kernel ---
    # Long-term trend
    long_term_trend_kernel = ConstantKernel(3, (0., 4.0)) * RBF(length_scale=YEAR)
    # Slowly varying envelope that could modulate the periodic terms
    seasonal_enveloppe = RBF(length_scale=YEAR, length_scale_bounds=(0.5*YEAR, 5*YEAR))
    # Small irregular fluctuations
    irregularities_kernel = ConstantKernel(0.1, (0, 5.0)) * RationalQuadratic(length_scale=MONTH, alpha=1.0)
    # Noise and local variations (a WhiteKernel term was also tried)
    noise_kernel = ConstantKernel(0.1, (0, 5.0)) * RBF(length_scale=1.0)
    # Alternatives tried: seasonal_enveloppe * (sum of periodic terms), and
    # long_term_trend_kernel + seasonal + irregularities_kernel + noise_kernel.

    return (
        periodic_1year_kernel
        + periodic_6months_kernel
        + periodic_4months_kernel
        + periodic_3months_kernel
        + periodic_1months_kernel
    )


## Configuration

In [None]:
observing_location = EarthLocation.of_site('Rubin Observatory')
tz = timezone('America/Santiago')

### MERRA2 files

In [None]:
filename_m2 = "../../SpectroMerra2/MerradataMerged/Merge_inst1_2d_asm_Nx_M2I1NXASM-2021-2024.csv"
filename_m2b = "../../SpectroMerra2/MerradataMerged/Merge_tavg1_2d_aer_Nx_M2T1NXAER-2021-2024.csv"

In [None]:
df_m = pd.read_csv(filename_m2)
df_mb = pd.read_csv(filename_m2b)

In [None]:
Nm = len(df_m)
Nmb = len(df_mb)
print("Number of points :: ",Nm,Nmb)

In [None]:
df_mb.columns

In [None]:
TMIN = pd.to_datetime(df_m.time.min())
TMAX = pd.to_datetime(df_m.time.max())

### Convert in MJD

In [None]:
df_m["mjd"] = Time(pd.to_datetime(df_m.time.values)).mjd
df_mb["mjd"] = Time(pd.to_datetime(df_mb.time.values)).mjd

In [None]:
mjd_zoom_start = Time("2024-01-01").mjd
mjd_zoom_stop = Time("2025-06-30").mjd

In [None]:
mjd_obs_start = df_m["mjd"].min() 
mjd_obs_stop = df_m["mjd"].max() 

# Start analysis

## Analysis of PWV

In [None]:
from matplotlib.dates import DateFormatter

# Tick format of the calendar-time axis: two-digit year and month.
date_form = DateFormatter("%y-%m")

fig = plt.figure(figsize=(12,6),layout="constrained")
gs = GridSpec(2, 1,figure=fig)
ax1 = fig.add_subplot(gs[0])
ax2 = fig.add_subplot(gs[1])

# Top panel: PWV versus calendar time.
# NOTE(review): this reads the column `Time`, whereas TMIN/TMAX and the MJD
# conversion read `time` — confirm both columns exist in the CSV.
ax1.plot(pd.to_datetime(df_m.Time.values), df_m.TQV.values,c="b",lw=0.5,label="Merra2")
ax1.set_xlabel("time")
ax1.xaxis.set_major_formatter(date_form)
ax1.set_title("Precipitable water vapor from Merra2")
ax1.legend()
ax1.set_ylabel("PWV (mm)")

# Summary statistics of the full PWV series, shown in a text box.
data = df_m.TQV.values
mean = np.mean(data)
median = np.median(data)
std = np.std(data)
textstr = "\n".join(("Expected max-range for PWV : ",
                     f"- average : {mean:.2f} mm",
                     f"- median : {median:.2f} mm",
                     f"- sigma : {std:.2f} mm",
                    ))
ax1.text(0.05, 0.95, textstr, transform=ax1.transAxes, fontsize=14,verticalalignment='top', bbox=props)

# Bottom panel: same series versus MJD.
ax2.plot(df_m.mjd, df_m.TQV.values,c="b",lw=0.5,label="Merra2")
ax2.set_xlabel("time (MJD)")
ax2.legend()
ax2.set_ylabel("PWV (mm)")

figname =f"{pathfigs}/pwv_allpoints_merra2"+figtype
fig.savefig(figname)
plt.show()


### Define the kernels for PWV

In [None]:
pwv_kernel = GetPWVPeriodicKernel_merra2()

### Make a subsample

In [None]:
NSAMP = 4000
# Seeded generator: the subsample, and therefore the fitted model and the
# residuals saved later, are identical from one run to the next.
rng = np.random.default_rng(12345)
a = np.arange(0,Nm ,1)
# Sampling without replacement raises if more points are requested than
# available, so the size is capped at Nm.
b = rng.choice(a, size=min(NSAMP, Nm), replace=False)
# Sorted so that the selected points stay in time order.
index_selected = np.sort(b)

In [None]:
unique, counts = np.unique(b, return_counts = True)

### Index selected and Fit with gaussian process

In [None]:
# all values without sampling
x_full = df_m.mjd.values
X_full = x_full.reshape(-1, 1)
y_full = df_m.TQV.values

In [None]:
# Amplitude spectrum of the full PWV series. The x axis of
# fourier_analysis is a frequency (built with fftfreq), not a duration,
# so it is labelled accordingly.
fig,ax = plt.subplots(1,1,figsize=(12,4),layout="constrained")
fourier_analysis(x_full,y_full,ax=ax ,mode= "logxliny",
                 title = "PWV : absolute  FFT",
                 xlabel="Frequency (cycles per day)",
                 ylabel=" mm",
                 label="FFT")
figname =f"{pathfigs}/pwv_FFTdata_merra2"+figtype
fig.savefig(figname)
plt.show()

### Subset of values chosen randomly to be fitted on

In [None]:
# subset of values choosen randomly to be fitted on 
x = df_m.mjd.values[index_selected]
X = x.reshape(-1, 1)
y = df_m.TQV.values[index_selected]
y_median = np.median(y)
y_mean = np.mean(y)

### Fit GP

In [None]:
# Alternative kept for reference: let scikit-learn normalise y internally.
#gaussian_process = GaussianProcessRegressor(kernel=pwv_kernel, normalize_y= True)
#gaussian_process.fit(X, y)

# Fit on the median-subtracted PWV subsample, without internal normalisation;
# the median is added back to the predictions in the cells below.
gaussian_process = GaussianProcessRegressor(kernel=pwv_kernel, normalize_y= False)
gaussian_process.fit(X, y-y_median)

In [None]:
txtstr_kernel = f"{gaussian_process.kernel_}"
txtstr_kernel = "\n + ".join(txtstr_kernel.split("+ "))

In [None]:
txtstr_kernel

### Save Gaussian Kernel

In [None]:
# Save the fitted PWV Gaussian-process regressor to the data directory with pickle.
# NOTE(review): the file name says "wthnorm" although the regressor is built
# with normalize_y=False — confirm the naming.

gpparams_model_filename = 'model_gpperiodic_pwv_wthnorm.pkl'
gpparams_model_fullfilename = os.path.join(pathdata,gpparams_model_filename)

with open(gpparams_model_fullfilename,'wb') as f:
    pickle.dump(gaussian_process,f)


In [None]:
# load
with open(gpparams_model_fullfilename, 'rb') as f:
    gp_regressor = pickle.load(f)
    print(gp_regressor)

In [None]:
fig,ax = plt.subplots(1,1,figsize=(5,5))
# Show the covariance matrix of the *fitted* kernel on the training inputs:
# `kernel_` holds the optimised hyperparameters, whereas `kernel` is the
# initial kernel given to the constructor.
ax.imshow(gp_regressor.kernel_(X),origin="lower",cmap="rainbow")
plt.show()

### Prediction on subsample

In [None]:
mjd_min = df_m.mjd.values.min()
mjd_max = df_m.mjd.values.max() + YEAR

In [None]:
x_test = np.arange(start=mjd_min, stop=mjd_max,step=2)
X_test = x_test.reshape(-1,1)
mean_y_pred, std_y_pred = gaussian_process.predict(X_test, return_std=True)

In [None]:
mean_y_pred += y_median

In [None]:
fig = plt.figure(figsize=(12,6),layout="constrained")
gs = GridSpec(2, 1,figure=fig)
ax1 = fig.add_subplot(gs[0])
ax2 = fig.add_subplot(gs[1])

# Both panels show the same curves; the lower one is zoomed in time below.
# The line style is given once, by keyword: the former '-' format string
# conflicted with linestyle="dashed" (the keyword already took precedence).
for panel, panel_title in (
    (ax1, "Fit PWV with Gaussian process (subsample)"),
    (ax2, "Time-Zoom on Fit PWV with Gaussian process"),
):
    panel.plot(x,y,color="k",linestyle="dashed", label="M2 Measurements")
    panel.plot(x_test,mean_y_pred,color="tab:blue", lw=3 ,alpha=1.0, label="Gaussian process")
    # +/- 1 sigma band of the prediction
    panel.fill_between(
        X_test.ravel(),
        mean_y_pred - std_y_pred,
        mean_y_pred + std_y_pred,
        color="tab:blue",
        alpha=0.2,
    )
    panel.legend()
    panel.set_ylabel("PWV (mm)")
    panel.set_xlabel("mjd")
    panel.set_title(panel_title)
    panel.text(0.1, 0.95, txtstr_kernel, transform=panel.transAxes, fontsize=16,verticalalignment='top', bbox=props)

ax2.set_xlim(mjd_zoom_start,mjd_zoom_stop)

figname =f"{pathfigs}/pwv_fitgp_merra2"+figtype
fig.savefig(figname)
plt.show()


### Residuals on the whole statistics

In [None]:
mean_yfull_pred, std_yfull_pred = gaussian_process.predict(X_full, return_std=True)
mean_yfull_pred += y_median

In [None]:
residuals = y_full -  mean_yfull_pred

In [None]:
stat_mean = np.mean(residuals)
stat_med = np.median(residuals)
stat_std = np.std(residuals)

In [None]:
txtstr_stat = [f"mean = {stat_mean:.2f} mm ", f"median = {stat_med:.2f} mm",f"std = {stat_std:.2f} mm"]
txtstr_stat = "\n".join(txtstr_stat)

In [None]:
fig = plt.figure(figsize=(12,6),layout="constrained")
gs = GridSpec(2, 1,figure=fig)
ax1 = fig.add_subplot(gs[0])
ax2 = fig.add_subplot(gs[1])

# Top panel: full data set and GP prediction with its +/- 1 sigma band.
# Line styles are given once, by keyword: the former '-' format string was
# redundant with the linestyle keyword (which already took precedence).
ax1.plot(x_full,y_full,color="k",linestyle="dashed", label="M2 Measurements")
ax1.plot(x_full,mean_yfull_pred,color="tab:blue", lw=3 ,alpha=1.0, label="Gaussian process")
ax1.fill_between(
    X_full.ravel(),
    mean_yfull_pred - std_yfull_pred,
    mean_yfull_pred + std_yfull_pred,
    color="tab:blue",
    alpha=0.2,
)
ax1.legend()

ax1.set_ylabel("PWV (mm)")
ax1.set_xlabel("mjd")
ax1.set_title("Fit PWV with Gaussian process")
ax1.text(0.1, 0.95, txtstr_kernel, transform=ax1.transAxes, fontsize=16,verticalalignment='top', bbox=props)

# Bottom panel: residuals (data - prediction) and the predicted std band
# centred on zero.
ax2.plot(x_full,residuals,color="k",linestyle="solid", label="Residuals")
ax2.fill_between(
    X_full.ravel(),
    - std_yfull_pred,
    std_yfull_pred,
    color="tab:blue",
    alpha=0.2,
)
ax2.legend()

ax2.set_ylabel("PWV residuals (mm)")
ax2.set_xlabel("mjd")
ax2.set_title("Residuals on Fit PWV with Gaussian process")
ax2.axhline(0,color="tab:blue",linewidth=3)
ax2.text(0.01, 0.95, txtstr_stat, transform=ax2.transAxes, fontsize=16,verticalalignment='top', bbox=props)

figname =f"{pathfigs}/pwv_fitgpresiduals_merra2"+figtype
fig.savefig(figname)
plt.show()


In [None]:
# Histogram of the PWV residuals with their summary statistics.
fig,ax = plt.subplots(1,1,figsize=(6,4),layout="constrained")
ax.hist(residuals,bins=50,facecolor="tab:blue")
ax.set_title("Residuals to PWV GP periodic model")
ax.text(0.45, 0.95, txtstr_stat, transform=ax.transAxes, fontsize=12,verticalalignment='top', bbox=props)
# Raw string: "\D" is an invalid escape sequence in a normal string literal.
ax.set_xlabel(r"$\Delta PWV$ (mm)")
figname =f"{pathfigs}/pwvres_histdata_merra2"+figtype
fig.savefig(figname)
plt.show()

In [None]:
# Amplitude spectrum of the PWV fit residuals. The x axis of
# fourier_analysis is a frequency (built with fftfreq), not a duration,
# so it is labelled accordingly.
fig,ax = plt.subplots(1,1,figsize=(12,4),layout="constrained")
fourier_analysis(x_full,residuals,ax=ax ,mode= "logxliny",
                 title = "PWV periodic-GP fit residuals absolute  FFT",
                 xlabel="Frequency (cycles per day)",
                 ylabel=" mm",
                 label="FFT residuals")
figname =f"{pathfigs}/pwvres_FFTdata_merra2"+figtype
fig.savefig(figname)
plt.show()

### Save residuals data in a file

In [None]:
df_out = pd.DataFrame({'mjd': x_full, "res" : residuals})
datafilename = f"{pathdata}/pwv_fitgpresiduals_merra2"+datatype
df_out.to_csv(datafilename)

## Ozone

In [None]:
from matplotlib.dates import DateFormatter

# Tick format of the calendar-time axis: two-digit year and month.
date_form = DateFormatter("%y-%m")

fig = plt.figure(figsize=(12,6),layout="constrained")
gs = GridSpec(2, 1,figure=fig)
ax1 = fig.add_subplot(gs[0])
ax2 = fig.add_subplot(gs[1])

# Top panel: ozone versus calendar time (no legend on this panel).
# NOTE(review): this reads the column `Time`, whereas TMIN/TMAX and the MJD
# conversion read `time` — confirm both columns exist in the CSV.
ax1.plot(pd.to_datetime(df_m.Time.values), df_m.TO3.values,c="r",lw=0.5,label="Merra2")
ax1.set_xlabel("time")
ax1.xaxis.set_major_formatter(date_form)
ax1.set_title("Ozone from Merra2")
ax1.set_ylabel("Ozone (DU)")

# Summary statistics of the full ozone series, shown in a text box.
data = df_m.TO3.values
mean = np.mean(data)
median = np.median(data)
std = np.std(data)
textstr = "\n".join(("Expected range for Ozone : ",
                     f"- average : {mean:.2f} DU",
                     f"- median : {median:.2f} DU",
                     f"- sigma : {std:.2f} DU",
                    ))
ax1.text(0.05, 0.95, textstr, transform=ax1.transAxes, fontsize=14,verticalalignment='top', bbox=props)

# Bottom panel: same series versus MJD.
ax2.plot(df_m.mjd, df_m.TO3.values,c="r",lw=0.5,label="Merra2")
ax2.set_xlabel("time (MJD)")
ax2.legend()
ax2.set_ylabel("Ozone (DU)")

figname =f"{pathfigs}/ozone_allpoints_merra2"+figtype
fig.savefig(figname)
plt.show()


### Define the kernels

In [None]:
#ozone_kernel = full_kernel
ozone_kernel = GetOzonePeriodicKernel_merra2()

### FFT over the full Ozone dataset

In [None]:
# all values without sampling
x_full = df_m.mjd.values
X_full = x_full.reshape(-1, 1)
y_full = df_m.TO3.values

In [None]:
# Amplitude spectrum of the full ozone series. The x axis of
# fourier_analysis is a frequency (built with fftfreq), not a duration,
# so it is labelled accordingly.
fig,ax = plt.subplots(1,1,figsize=(12,4),layout="constrained")
fourier_analysis(x_full,y_full,ax=ax ,mode= "logxliny",
                 title = "Ozone : absolute  FFT (full sample)",
                 xlabel="Frequency (cycles per day)",
                 ylabel="DU",
                 label="FFT")
figname =f"{pathfigs}/ozone_FFTdata_merra2"+figtype
fig.savefig(figname)
plt.show()

### Subsampling

In [None]:
NSAMP = 4000
# Seeded generator: the subsample, and therefore the fitted model, is
# identical from one run to the next.
rng = np.random.default_rng(12345)
a = np.arange(0,Nm ,1)
# Sampling without replacement raises if more points are requested than
# available, so the size is capped at Nm.
b = rng.choice(a, size=min(NSAMP, Nm), replace=False)
# Sorted so that the selected points stay in time order.
index_selected = np.sort(b)
print(index_selected)

In [None]:
x = df_m.mjd.values[index_selected]
X = x.reshape(-1, 1)
y = df_m.TO3.values[index_selected]
y_mean = y.mean()

In [None]:
y_mean

In [None]:
y

In [None]:
fig,ax = plt.subplots(1,1,figsize=(FIGXSIZE_0,FIGYSIZE_0))
ax.plot(x,y)
ax.set_title("sampled values for ozone")
plt.show()

### Gaussian Fit

- **Note here I should not ask for internal normalisation**

- **Note: here the alpha parameter helps to obtain a positive-definite kernel matrix. It acts as a regularisation.**

In [None]:
#gaussian_process = GaussianProcessRegressor(kernel=ozone_kernel, normalize_y=False)
gaussian_process = GaussianProcessRegressor(kernel=ozone_kernel,alpha=0.0001,normalize_y=False,random_state=2)
gaussian_process.fit(X, y - y_mean)
                                  

In [None]:
txtstr_kernel = f"{gaussian_process.kernel_}"
txtstr_kernel = "\n + ".join(txtstr_kernel.split("+ "))

### Prediction

In [None]:
x_test = np.arange(start=mjd_min, stop=mjd_max,step=2)
X_test = x_test.reshape(-1,1)
mean_y_pred, std_y_pred = gaussian_process.predict(X_test, return_std=True)
mean_y_pred += y_mean

In [None]:
fig = plt.figure(figsize=(FIGXSIZE_1,FIGYSIZE_1),layout="constrained")
gs = GridSpec(2, 1,figure=fig)
ax1 = fig.add_subplot(gs[0])
ax2 = fig.add_subplot(gs[1])

# Both panels show the same curves; the lower one is zoomed in time below.
# The line style is given once, by keyword: the former '-' format string
# conflicted with linestyle="dashed" (the keyword already took precedence).
for panel, panel_title in (
    (ax1, "Fit Ozone with Gaussian process (subsamples)"),
    (ax2, "Time-Zoom on Fit Ozone with Gaussian process"),
):
    panel.plot(x,y,color="k",linestyle="dashed", label="M2 Measurements")
    panel.plot(x_test,mean_y_pred,color="tab:red", lw=3,alpha=1.0, label="Gaussian process")
    # +/- 1 sigma band of the prediction
    panel.fill_between(
        X_test.ravel(),
        mean_y_pred - std_y_pred,
        mean_y_pred + std_y_pred,
        color="tab:red",
        alpha=0.2,
    )
    panel.legend()
    panel.set_ylabel("Ozone (DU)")
    panel.set_xlabel("mjd")
    panel.set_title(panel_title)
    panel.text(0.1, 0.95, txtstr_kernel, transform=panel.transAxes, fontsize=16,verticalalignment='top', bbox=props)

# Fixed vertical range on the full-range panel, time zoom on the lower one.
ax1.set_ylim(0.,600.)
ax2.set_xlim(mjd_zoom_start,mjd_zoom_stop)

figname =f"{pathfigs}/ozone_fitgp_merra2"+figtype
fig.savefig(figname)
plt.show()


### Save gaussian Kernel parameters

In [None]:
# Pickle the fitted ozone regressor into the data directory
gpparams_model_filename = "model_gpperiodic_ozone_nonorm.pkl"
gpparams_model_fullfilename = os.path.join(pathdata, gpparams_model_filename)

with open(gpparams_model_fullfilename, mode="wb") as f:
    pickle.dump(gaussian_process, f)


In [None]:
# Read the pickled regressor back from the file written above and print it
with open(gpparams_model_fullfilename, mode="rb") as f:
    gp_regressor = pickle.load(f)
print(gp_regressor)

In [None]:
# Covariance matrix of the reloaded model evaluated on the training abscissa X.
# `kernel_` is the kernel holding the hyper-parameters obtained by fit();
# `kernel` is only the prior kernel that was passed to the constructor.
fig, ax = plt.subplots(1, 1, figsize=(5, 5))
ax.imshow(gp_regressor.kernel_(X), origin="lower", cmap="rainbow")
plt.show()

### Residuals on the whole statistics

- Note here the gaussian process is learned without internal normalisation

In [None]:
# GP prediction on every point of the full ozone series; the training mean is added back
mean_yfull_pred, std_yfull_pred = gaussian_process.predict(
    X_full,
    return_std=True,
)
mean_yfull_pred += y_mean

In [None]:
# Residuals of the full series with respect to the GP mean prediction (data - model)
residuals = np.subtract(y_full, mean_yfull_pred)

In [None]:
# Summary statistics of the ozone residuals
stat_mean, stat_med, stat_std = (
    np.mean(residuals),
    np.median(residuals),
    np.std(residuals),
)

In [None]:
# Multi-line annotation with the residual statistics (in DU)
txtstr_stat = "\n".join(
    (
        f"mean = {stat_mean:.2f} DU ",
        f"median = {stat_med:.2f} DU",
        f"std = {stat_std:.2f} DU",
    )
)

In [None]:
# Top panel: full ozone series with the GP prediction and its +/- 1 std band.
# Bottom panel: residuals (data - prediction) with the same band centred on zero.
fig = plt.figure(figsize=(FIGXSIZE_1, FIGYSIZE_1), layout="constrained")
gs = GridSpec(2, 1, figure=fig)
ax1 = fig.add_subplot(gs[0])
ax2 = fig.add_subplot(gs[1])

# The fmt string '-' was dropped from the plot calls: it duplicated the linestyle
# keyword (matplotlib warns and lets the keyword win).
ax1.plot(x_full, y_full, color="k", linestyle="dashed", label="M2 Measurements")
ax1.plot(x_full, mean_yfull_pred, color="tab:red", lw=3, alpha=1.0, label="Gaussian process")
ax1.fill_between(
    X_full.ravel(),
    mean_yfull_pred - std_yfull_pred,
    mean_yfull_pred + std_yfull_pred,
    color="tab:red",
    alpha=0.2,
)
ax1.legend()
ax1.set_ylabel("Ozone (DU)")
ax1.set_xlabel("mjd")
ax1.set_title("Fit Ozone with Gaussian process")
ax1.text(0.1, 0.95, txtstr_kernel, transform=ax1.transAxes, fontsize=16,
         verticalalignment='top', bbox=props)

ax2.plot(x_full, residuals, color="k", linestyle="solid", label="Residuals")
ax2.fill_between(
    X_full.ravel(),
    -std_yfull_pred,
    std_yfull_pred,
    color="tab:red",
    alpha=0.2,
)
ax2.legend()
ax2.set_ylabel("Ozone residuals (DU)")
ax2.set_xlabel("mjd")
ax2.set_title("Residuals on Fit Ozone with Gaussian process")
ax2.axhline(0, color="tab:red", linewidth=3)
ax2.text(0.01, 0.95, txtstr_stat, transform=ax2.transAxes, fontsize=16,
         verticalalignment='top', bbox=props)

figname = f"{pathfigs}/ozone_fitgpresiduals_merra2" + figtype
fig.savefig(figname)
plt.show()


In [None]:
# Histogram of the ozone residuals with the statistics box
fig, ax = plt.subplots(1, 1, figsize=(6, 4), layout="constrained")
ax.hist(residuals, bins=50, facecolor="tab:red")
ax.set_title("Residuals to Ozone GP periodic model")
ax.text(0.45, 0.95, txtstr_stat, transform=ax.transAxes, fontsize=12,
        verticalalignment='top', bbox=props)
# Raw string: "\D" is not a valid escape sequence in an ordinary string literal
ax.set_xlabel(r"$\Delta$ Ozone (DU)")
figname = f"{pathfigs}/ozoneres_histdata_merra2" + figtype
fig.savefig(figname)
plt.show()

In [None]:
# Fourier spectrum of the ozone residuals, saved as a figure
fig, ax = plt.subplots(figsize=(12, 4), layout="constrained")
fourier_analysis(
    x_full,
    residuals,
    ax=ax,
    mode="logxliny",
    title="Ozone periodic-GP fit residuals absolute  FFT",
    xlabel="days",
    ylabel="DU",
    label="FFT residuals",
)
figname = f"{pathfigs}/ozoneres_FFTdata_merra2{figtype}"
fig.savefig(figname)
plt.show()

### Save data file

In [None]:
# Write the ozone residuals and their MJD to a CSV file in the data directory
df_out = pd.DataFrame(dict(mjd=x_full, res=residuals))
datafilename = f"{pathdata}/ozone_fitgpresiduals_merra2{datatype}"
df_out.to_csv(datafilename)

## Aerosol VAOD

In [None]:
from matplotlib.dates import DateFormatter

# Year-month tick labels for the calendar-time axis
date_form = DateFormatter("%y-%m")

# Overview of the VAOD (TOTEXTTAU) series: versus calendar time (top) and versus MJD (bottom).
# The unused get_legend() results that were fetched before anything was drawn have been removed.
fig = plt.figure(figsize=(12, 6))
gs = GridSpec(2, 1, figure=fig)
ax1 = fig.add_subplot(gs[0])
ax2 = fig.add_subplot(gs[1])

ax1.plot(pd.to_datetime(df_mb.Time.values), df_mb.TOTEXTTAU.values, c="g", lw=0.5, label="Merra2")
ax1.set_xlabel("time")
ax1.xaxis.set_major_formatter(date_form)
ax1.set_title("VAOD from Merra2")
ax1.legend()
ax1.set_ylabel("VAOD")

# Summary statistics shown in the text box of the top panel
data = df_mb.TOTEXTTAU.values
mean = np.mean(data)
median = np.median(data)
std = np.std(data)
textstr = "\n".join((
    "Expected max-range for VAOD : ",
    f"- average : {mean:.3f}",
    f"- median : {median:.3f}",
    f"- sigma : {std:.3f}",
))
ax1.text(0.05, 0.95, textstr, transform=ax1.transAxes, fontsize=14,
         verticalalignment='top', bbox=props)

ax2.plot(df_mb.mjd, df_mb.TOTEXTTAU.values, c="g", lw=0.5, label="Merra2")
ax2.set_xlabel("time (MJD)")
ax2.legend()
ax2.set_ylabel("VAOD")

figname = f"{pathfigs}/vaod_allpoints_merra2" + figtype
fig.savefig(figname)
plt.show()


### Define the kernels

In [None]:
# Kernel for the VAOD fit, produced by GetVAODPeriodicKernel_merra2() (defined elsewhere in this notebook)
full_kernel =  GetVAODPeriodicKernel_merra2()

In [None]:
# The VAOD regressor uses the kernel stored in full_kernel unchanged
vaod_kernel = full_kernel

### Full sample

In [None]:
# Complete VAOD series (no subsampling): MJD abscissa and TOTEXTTAU column
x_full = df_mb["mjd"].values
X_full = x_full.reshape(len(x_full), 1)  # one-column 2-D form expected by predict()
y_full = df_mb["TOTEXTTAU"].values

In [None]:
# Fourier spectrum of the complete VAOD series, saved as a figure
fig, ax = plt.subplots(figsize=(12, 4), layout="constrained")
fourier_analysis(
    x_full,
    y_full,
    ax=ax,
    mode="logxliny",
    title="VAOD : absolute  FFT (full sample)",
    xlabel="days",
    ylabel="no-unit",
    label="FFT",
)
figname = f"{pathfigs}/vaod_FFTdata_merra2{figtype}"
fig.savefig(figname)
plt.show()

### Make a subsample

In [None]:
# Draw NSAMP distinct row indices out of the Nmb available ones.
# NOTE(review): no seed is set in this cell, so the selection changes from run to run
# unless numpy's global generator was seeded earlier in the notebook.
NSAMP = 4000
a = np.arange(Nmb)
b = np.random.choice(a, size=NSAMP, replace=False)

In [None]:
# Count how often each index was drawn; b is drawn with replace=False, so every count should be 1.
# NOTE(review): `unique` and `counts` are not used by any later cell visible in this section.
unique, counts = np.unique(b, return_counts=True)

In [None]:
# Sorted copy of the drawn indices, used below to pick the training points
index_selected = np.sort(b)

In [None]:
# Subsampled VAOD points used to train the Gaussian process
x = df_mb["mjd"].values[index_selected]
X = x.reshape(len(x), 1)
y = df_mb["TOTEXTTAU"].values[index_selected]
# No manual offset here: the regressor below is created with normalize_y=True
y_mean = 0.

### Fit the gaussian process

In [None]:
# VAOD regressor with internal target normalisation; y_mean is 0 so the targets are passed as-is
gaussian_process = GaussianProcessRegressor(
    kernel=vaod_kernel,
    normalize_y=True,
)
gaussian_process.fit(X, y - y_mean)

In [None]:
# Text form of the fitted kernel, with one additive term per line, for the plot annotations
txtstr_kernel = "\n + ".join(str(gaussian_process.kernel_).split("+ "))

### Save gaussian kernel

In [None]:
# Pickle the fitted VAOD regressor into the data directory
gpparams_model_filename = "model_gpperiodic_vaod_wthnorm.pkl"
gpparams_model_fullfilename = os.path.join(pathdata, gpparams_model_filename)

with open(gpparams_model_fullfilename, mode="wb") as f:
    pickle.dump(gaussian_process, f)

In [None]:
# Read the pickled regressor back from the file written above and print it
with open(gpparams_model_fullfilename, mode="rb") as f:
    gp_regressor = pickle.load(f)
print(gp_regressor)

In [None]:
# Covariance matrix of the reloaded model evaluated on the training abscissa X.
# `kernel_` is the kernel holding the hyper-parameters obtained by fit();
# `kernel` is only the prior kernel that was passed to the constructor.
fig, ax = plt.subplots(1, 1, figsize=(5, 5))
ax.imshow(gp_regressor.kernel_(X), origin="lower", cmap="rainbow")
plt.show()

### Prediction

In [None]:
# Prediction window: from the first MJD of the series to one YEAR past the last one
mjd_min_b = df_mb["mjd"].values.min()
mjd_max_b = df_mb["mjd"].values.max() + YEAR

In [None]:
# Evaluate the GP on a regular grid (step 2 in MJD) over the prediction window
x_test = np.arange(mjd_min_b, mjd_max_b, 2)
X_test = x_test.reshape(len(x_test), 1)
mean_y_pred, std_y_pred = gaussian_process.predict(X_test, return_std=True)
mean_y_pred += y_mean

In [None]:
# Two stacked panels showing the subsampled VAOD points and the GP prediction:
# the full time range on top, a time zoom below.
fig = plt.figure(figsize=(12, 6), layout="constrained")
gs = GridSpec(2, 1, figure=fig)
ax1 = fig.add_subplot(gs[0])
ax2 = fig.add_subplot(gs[1])

# Content common to both panels.
# Only the linestyle keyword is given: passing the fmt string '-' as well is redundant
# (matplotlib warns about it and the keyword takes precedence anyway).
for panel in (ax1, ax2):
    panel.plot(x, y, color="k", linestyle="dashed", label="M2 Measurements")
    panel.plot(x_test, mean_y_pred, color="tab:green", lw=3, alpha=1.0, label="Gaussian process")
    # Band of +/- one predicted standard deviation around the mean prediction
    panel.fill_between(
        X_test.ravel(),
        mean_y_pred - std_y_pred,
        mean_y_pred + std_y_pred,
        color="tab:green",
        alpha=0.2,
    )
    panel.legend()
    panel.set_ylabel("VAOD")
    panel.set_xlabel("mjd")
    panel.text(0.1, 0.95, txtstr_kernel, transform=panel.transAxes, fontsize=12,
               verticalalignment='top', bbox=props)

# Panel-specific settings
ax1.set_title("Fit Aerosol VAOD with Gaussian process")

ax2.set_title("Time-Zoom on Fit VAOD with Gaussian process")
ax2.set_xlim(mjd_zoom_start, mjd_zoom_stop)

figname = f"{pathfigs}/aervaod_fitgp_merra2" + figtype
fig.savefig(figname)
plt.show()


### Residuals on the whole statistics

In [None]:
# GP prediction on every point of the full VAOD series
mean_yfull_pred, std_yfull_pred = gaussian_process.predict(
    X_full,
    return_std=True,
)
mean_yfull_pred += y_mean

In [None]:
# Residuals of the full series with respect to the GP mean prediction (data - model)
residuals = np.subtract(y_full, mean_yfull_pred)

In [None]:
# Summary statistics of the VAOD residuals
stat_mean, stat_med, stat_std = (
    np.mean(residuals),
    np.median(residuals),
    np.std(residuals),
)

In [None]:
# Multi-line annotation with the VAOD residual statistics.
# Three decimals are used, matching the ".3f" formatting applied to the VAOD
# statistics in the overview plot of this section; two decimals are too coarse
# for a quantity of this magnitude.
txtstr_stat = "\n".join(
    (
        f"mean = {stat_mean:.3f}",
        f"median = {stat_med:.3f}",
        f"std = {stat_std:.3f}",
    )
)

In [None]:
# Top panel: full VAOD series with the GP prediction and its +/- 1 std band.
# Bottom panel: residuals (data - prediction) with the same band centred on zero.
fig = plt.figure(figsize=(12, 6), layout="constrained")
gs = GridSpec(2, 1, figure=fig)
ax1 = fig.add_subplot(gs[0])
ax2 = fig.add_subplot(gs[1])

# The fmt string '-' was dropped from the plot calls: it duplicated the linestyle
# keyword (matplotlib warns and lets the keyword win).
ax1.plot(x_full, y_full, color="k", linestyle="dashed", label="M2 Measurements")
ax1.plot(x_full, mean_yfull_pred, color="tab:green", lw=3, alpha=1.0, label="Gaussian process")
ax1.fill_between(
    X_full.ravel(),
    mean_yfull_pred - std_yfull_pred,
    mean_yfull_pred + std_yfull_pred,
    color="tab:green",
    alpha=0.2,
)
ax1.legend()
ax1.set_ylabel("VAOD")
ax1.set_xlabel("mjd")
ax1.set_title("Fit VAOD with Gaussian process")
ax1.text(0.1, 0.95, txtstr_kernel, transform=ax1.transAxes, fontsize=12,
         verticalalignment='top', bbox=props)

ax2.plot(x_full, residuals, color="k", linestyle="solid", label="Residuals")
ax2.fill_between(
    X_full.ravel(),
    -std_yfull_pred,
    std_yfull_pred,
    color="tab:green",
    alpha=0.2,
)
ax2.legend()
ax2.set_ylabel("VAOD residuals")
ax2.set_xlabel("mjd")
ax2.set_title("Residuals on Fit VAOD with Gaussian process")
ax2.axhline(0, color="tab:green", linewidth=3)
ax2.text(0.01, 0.95, txtstr_stat, transform=ax2.transAxes, fontsize=12,
         verticalalignment='top', bbox=props)

figname = f"{pathfigs}/vaod_fitgpresiduals_merra2" + figtype
fig.savefig(figname)
plt.show()


In [None]:
# Histogram of the VAOD residuals with the statistics box
fig, ax = plt.subplots(1, 1, figsize=(6, 4), layout="constrained")
ax.hist(residuals, bins=50, facecolor="tab:green")
ax.set_title("Residuals to VAOD GP periodic model")
ax.text(0.45, 0.95, txtstr_stat, transform=ax.transAxes, fontsize=12,
        verticalalignment='top', bbox=props)
# Raw string: "\D" is not a valid escape sequence in an ordinary string literal
ax.set_xlabel(r"$\Delta$ VAOD")
figname = f"{pathfigs}/vaodres_histdata_merra2" + figtype
fig.savefig(figname)
plt.show()

In [None]:
# Fourier spectrum of the VAOD residuals, saved as a figure
fig, ax = plt.subplots(figsize=(12, 4), layout="constrained")
fourier_analysis(
    x_full,
    residuals,
    ax=ax,
    mode="logxliny",
    title="VAOD periodic-GP fit residuals absolute  FFT",
    xlabel="days",
    ylabel="",
    label="FFT residuals",
)
figname = f"{pathfigs}/vaodres_FFTdata_merra2{figtype}"
fig.savefig(figname)
plt.show()

### Save data file

In [None]:
# Write the VAOD residuals and their MJD to a CSV file in the data directory
df_out = pd.DataFrame(dict(mjd=x_full, res=residuals))
datafilename = f"{pathdata}/vaod_fitgpresiduals_merra2{datatype}"
df_out.to_csv(datafilename)

## Aerosol Angstrom

In [None]:
from matplotlib.dates import DateFormatter

# Year-month tick labels for the calendar-time axis
date_form = DateFormatter("%y-%m")

# Overview of the Angstrom exponent (TOTANGSTR) series: versus calendar time (top)
# and versus MJD (bottom).
# The unused get_legend() results that were fetched before anything was drawn have been removed.
fig = plt.figure(figsize=(12, 6))
gs = GridSpec(2, 1, figure=fig)
ax1 = fig.add_subplot(gs[0])
ax2 = fig.add_subplot(gs[1])

ax1.plot(pd.to_datetime(df_mb.Time.values), df_mb.TOTANGSTR.values, c="purple", lw=0.5, label="Merra2")
ax1.set_xlabel("time")
ax1.xaxis.set_major_formatter(date_form)
ax1.set_title("VAOD_Angstrom from Merra2")
ax1.legend()
ax1.set_ylabel("VAOD_Angstrom")

# Summary statistics shown in the text box of the top panel
data = df_mb.TOTANGSTR.values
mean = np.mean(data)
median = np.median(data)
std = np.std(data)
textstr = "\n".join((
    "Expected max-range for VAOD-Angstrom : ",
    f"- average : {mean:.3f}",
    f"- median : {median:.3f}",
    f"- sigma : {std:.3f}",
))
ax1.text(0.05, 0.95, textstr, transform=ax1.transAxes, fontsize=14,
         verticalalignment='top', bbox=props)

ax2.plot(df_mb.mjd, df_mb.TOTANGSTR.values, c="purple", lw=0.5, label="Merra2")
ax2.set_xlabel("time (MJD)")
ax2.legend()
# Same quantity as the top panel (the label previously read "VAOD", a copy-paste leftover)
ax2.set_ylabel("VAOD_Angstrom")

figname = f"{pathfigs}/vaodangstrom_allpoints_merra2" + figtype
fig.savefig(figname)
plt.show()

### Define kernels

In [None]:
# Hand-built kernel components for the Angstrom exponent.
# NOTE(review): the `full_kernel` assigned at the end of this cell is reassigned by the
# following cell (GetAngstromPeriodicKernel_merra2()), so the kernels built here do not
# appear to reach the fit — confirm whether this cell is still needed.
# NOTE(review): several ConstantKernel bounds start at 0; scikit-learn presumably handles
# hyper-parameter bounds in log space, where 0 maps to -inf — verify before using these in a fit.

# Long-term trend
long_term_trend_kernel = ConstantKernel(3, (0., 4.0)) * RBF(length_scale=YEAR)

# Periodic components with fixed period and fixed length scale; only the amplitude is left free
periodic_1year_kernel =  ConstantKernel(0.1, (0, 4.)) * ExpSineSquared(length_scale= 10*YEAR, periodicity= YEAR,
                                                                           length_scale_bounds="fixed",periodicity_bounds="fixed")
periodic_6months_kernel = ConstantKernel(0.1, (0, 4.)) * ExpSineSquared(length_scale= 20*MONTHS6,periodicity=MONTHS6,
                                                                            length_scale_bounds="fixed",periodicity_bounds="fixed") 
periodic_3months_kernel = ConstantKernel(0.1, (0, 4.)) * ExpSineSquared(length_scale= 40*QUARTER, periodicity=QUARTER,
                                                                            length_scale_bounds="fixed",periodicity_bounds="fixed")
periodic_4months_kernel = ConstantKernel(0.1, (0, 4.)) * ExpSineSquared(length_scale= 30*MONTHS4, periodicity=MONTHS4,
                                                                            length_scale_bounds="fixed",periodicity_bounds="fixed")

# NOTE(review): `4+12*MONTH` does not follow the N*period pattern of the length scales above —
# confirm whether a product (e.g. 4*12*MONTH) was intended.
periodic_1months_kernel = ConstantKernel(0.1, (0, 4.)) * ExpSineSquared(length_scale= 4+12*MONTH, periodicity=MONTH,
                                                                            length_scale_bounds="fixed",periodicity_bounds="fixed")

# RBF envelope; only referenced by the commented-out alternatives below
seasonal_enveloppe = RBF(length_scale=YEAR, length_scale_bounds=(0.5*YEAR, 5*YEAR))

# Multi-scale seasonality: plain sum of the five periodic components
seasonal_kernel = (
    #seasonal_enveloppe * ( periodic_1year_kernel + periodic_6months_kernel + periodic_3months_kernel)
    #seasonal_enveloppe * ( periodic_1year_kernel )
    periodic_1year_kernel + periodic_6months_kernel  +  periodic_4months_kernel +periodic_3months_kernel + periodic_1months_kernel
)


# Small irregular fluctuations (not included in full_kernel below)
irregularities_kernel = ConstantKernel(0.1, (0, 5.0)) * RationalQuadratic(length_scale=MONTH, alpha=1.0)

# Noise and local variations (not included in full_kernel below)
#noise_kernel = ConstantKernel(1.0, (0., 10.0)) * RBF(length_scale=1.0) + WhiteKernel(noise_level=1.0**2)
noise_kernel =  ConstantKernel(0.1, (0, 5.0))  * RBF(length_scale=1.0) 

# Total kernel: only the seasonal part is kept
#full_kernel = long_term_trend_kernel + seasonal_kernel + irregularities_kernel + noise_kernel
full_kernel = seasonal_kernel 


In [None]:
# Replace full_kernel with the kernel produced by GetAngstromPeriodicKernel_merra2()
# (defined elsewhere in this notebook); this is the one assigned to angstrom_kernel below.
full_kernel = GetAngstromPeriodicKernel_merra2()

In [None]:
# Kernel passed to the Angstrom GaussianProcessRegressor below
angstrom_kernel = full_kernel

### Full sample

In [None]:
# Complete Angstrom-exponent series (no subsampling): MJD abscissa and TOTANGSTR column
x_full = df_mb["mjd"].values
X_full = x_full.reshape(len(x_full), 1)  # one-column 2-D form expected by predict()
y_full = df_mb["TOTANGSTR"].values

### FFT over full sample

In [None]:
# Fourier spectrum of the complete Angstrom-exponent series, saved as a figure
fig, ax = plt.subplots(figsize=(12, 4), layout="constrained")
fourier_analysis(
    x_full,
    y_full,
    ax=ax,
    mode="logxliny",
    title="Aerosol-Angstrom : absolute  FFT (full sample)",
    xlabel="days",
    ylabel="",
    label="FFT",
)
figname = f"{pathfigs}/angstrom_FFTdata_merra2{figtype}"
fig.savefig(figname)
plt.show()

### subsample

In [None]:
# Subsampled Angstrom points used to train the Gaussian process.
# index_selected is not redrawn here: the value already present in the notebook is reused.
x = df_mb["mjd"].values[index_selected]
X = x.reshape(len(x), 1)
y = df_mb["TOTANGSTR"].values[index_selected]
# No manual offset here: the regressor below is created with normalize_y=True
y_mean = 0.

### Fit the gaussian process model

In [None]:
# Angstrom regressor with internal target normalisation; y_mean is 0 so the targets are passed as-is
gaussian_process = GaussianProcessRegressor(
    kernel=angstrom_kernel,
    normalize_y=True,
)
gaussian_process.fit(X, y - y_mean)

In [None]:
# Text form of the fitted kernel, with one additive term per line, for the plot annotations
txtstr_kernel = "\n + ".join(str(gaussian_process.kernel_).split("+ "))

### Save parameters

In [None]:
# Pickle the fitted Angstrom regressor into the data directory
gpparams_model_filename = "model_gpperiodic_angstrom_wthnorm.pkl"
gpparams_model_fullfilename = os.path.join(pathdata, gpparams_model_filename)

with open(gpparams_model_fullfilename, mode="wb") as f:
    pickle.dump(gaussian_process, f)


In [None]:
# Read the pickled regressor back from the file written above and print it
with open(gpparams_model_fullfilename, mode="rb") as f:
    gp_regressor = pickle.load(f)
print(gp_regressor)

In [None]:
# Covariance matrix of the reloaded model evaluated on the training abscissa X.
# `kernel_` is the kernel holding the hyper-parameters obtained by fit();
# `kernel` is only the prior kernel that was passed to the constructor.
fig, ax = plt.subplots(1, 1, figsize=(5, 5))
ax.imshow(gp_regressor.kernel_(X), origin="lower", cmap="rainbow")
plt.show()

### Prediction

In [None]:
# Evaluate the GP on a regular grid (step 2 in MJD) between mjd_min_b and mjd_max_b
x_test = np.arange(mjd_min_b, mjd_max_b, 2)
X_test = x_test.reshape(len(x_test), 1)
mean_y_pred, std_y_pred = gaussian_process.predict(X_test, return_std=True)
mean_y_pred += y_mean

In [None]:
# Two stacked panels showing the subsampled Angstrom points and the GP prediction:
# the full time range on top, a time zoom below.
fig = plt.figure(figsize=(12, 6), layout="constrained")
gs = GridSpec(2, 1, figure=fig)
ax1 = fig.add_subplot(gs[0])
ax2 = fig.add_subplot(gs[1])

# Content common to both panels.
# Only the linestyle keyword is given: passing the fmt string '-' as well is redundant
# (matplotlib warns about it and the keyword takes precedence anyway).
for panel in (ax1, ax2):
    panel.plot(x, y, color="k", linestyle="dashed", label="M2 Measurements")
    panel.plot(x_test, mean_y_pred, color="tab:purple", lw=3, alpha=1.0, label="Gaussian process")
    # Band of +/- one predicted standard deviation around the mean prediction
    panel.fill_between(
        X_test.ravel(),
        mean_y_pred - std_y_pred,
        mean_y_pred + std_y_pred,
        color="tab:purple",
        alpha=0.2,
    )
    panel.legend()
    panel.set_ylabel("Angstrom")
    panel.set_xlabel("mjd")
    panel.text(0.1, 0.95, txtstr_kernel, transform=panel.transAxes, fontsize=12,
               verticalalignment='top', bbox=props)

# Panel-specific settings
ax1.set_title("Fit Aerosol Angstrom exponent with Gaussian process")

ax2.set_title("Time-Zoom on Angstrom exponent with Gaussian process")
ax2.set_xlim(mjd_zoom_start, mjd_zoom_stop)

figname = f"{pathfigs}/aerangstrom_fitgp_merra2" + figtype
fig.savefig(figname)
plt.show()


### Residuals on the whole statistics

In [None]:
# GP prediction on every point of the full Angstrom series
mean_yfull_pred, std_yfull_pred = gaussian_process.predict(
    X_full,
    return_std=True,
)
mean_yfull_pred += y_mean

In [None]:
# Residuals of the full series with respect to the GP mean prediction (data - model)
residuals = np.subtract(y_full, mean_yfull_pred)

In [None]:
# Summary statistics of the Angstrom residuals
stat_mean, stat_med, stat_std = (
    np.mean(residuals),
    np.median(residuals),
    np.std(residuals),
)

In [None]:
# Multi-line annotation with the Angstrom residual statistics
txtstr_stat = "\n".join(
    (
        f"mean = {stat_mean:.2f}",
        f"median = {stat_med:.2f}",
        f"std = {stat_std:.2f}",
    )
)

In [None]:
# Top panel: full Angstrom series with the GP prediction and its +/- 1 std band.
# Bottom panel: residuals (data - prediction) with the same band centred on zero.
fig = plt.figure(figsize=(12, 6), layout="constrained")
gs = GridSpec(2, 1, figure=fig)
ax1 = fig.add_subplot(gs[0])
# NOTE(review): sharey=ax1 ties the residual panel to the y-range of the data panel —
# confirm this is intended.
ax2 = fig.add_subplot(gs[1], sharey=ax1)

# The fmt string '-' was dropped from the plot calls: it duplicated the linestyle
# keyword (matplotlib warns and lets the keyword win).
ax1.plot(x_full, y_full, color="k", linestyle="dashed", label="M2 Measurements")
ax1.plot(x_full, mean_yfull_pred, color="tab:purple", lw=3, alpha=1.0, label="Gaussian process")
ax1.fill_between(
    X_full.ravel(),
    mean_yfull_pred - std_yfull_pred,
    mean_yfull_pred + std_yfull_pred,
    color="tab:purple",
    alpha=0.2,
)
ax1.legend()
ax1.set_ylabel("Angstrom exponent")
ax1.set_xlabel("mjd")
ax1.set_title("Fit Angstrom with Gaussian process")
ax1.text(0.3, 0.5, txtstr_kernel, transform=ax1.transAxes, fontsize=12,
         verticalalignment='top', bbox=props)

ax2.plot(x_full, residuals, color="k", linestyle="solid", label="Residuals")
ax2.fill_between(
    X_full.ravel(),
    -std_yfull_pred,
    std_yfull_pred,
    color="tab:purple",
    alpha=0.2,
)
ax2.legend()
ax2.set_ylabel("Angstrom residuals")
ax2.set_xlabel("mjd")
ax2.set_title("Residuals on Fit Angstrom exponent with Gaussian process")
ax2.axhline(0, color="tab:purple", linewidth=3)
ax2.text(0.01, 0.95, txtstr_stat, transform=ax2.transAxes, fontsize=12,
         verticalalignment='top', bbox=props)

figname = f"{pathfigs}/angstrom_fitgpresiduals_merra2" + figtype
fig.savefig(figname)
plt.show()


In [None]:
# Histogram of the Angstrom residuals with the statistics box
fig, ax = plt.subplots(1, 1, figsize=(6, 4), layout="constrained")
ax.hist(residuals, bins=50, facecolor="tab:purple")
# Title spelling corrected ("Angstrol" -> "Angstrom")
ax.set_title("Residuals to Angstrom GP periodic model")
ax.text(0.45, 0.95, txtstr_stat, transform=ax.transAxes, fontsize=12,
        verticalalignment='top', bbox=props)
# Raw string: "\D" is not a valid escape sequence in an ordinary string literal
ax.set_xlabel(r"$\Delta$ Angstrom")
figname = f"{pathfigs}/angstromres_histdata_merra2" + figtype
fig.savefig(figname)
plt.show()

In [None]:
# Fourier spectrum of the Angstrom residuals, saved as a figure
fig, ax = plt.subplots(figsize=(12, 4), layout="constrained")
fourier_analysis(
    x_full,
    residuals,
    ax=ax,
    mode="logxliny",
    title="Angstrom periodic-GP fit residuals absolute  FFT",
    xlabel="days",
    ylabel="",
    label="FFT residuals",
)
figname = f"{pathfigs}/angstromres_FFTdata_merra2{figtype}"
fig.savefig(figname)
plt.show()

### save datafile

In [None]:
# Write the Angstrom residuals and their MJD to a CSV file in the data directory
df_out = pd.DataFrame(dict(mjd=x_full, res=residuals))
datafilename = f"{pathdata}/angstrom_fitgpresiduals_merra2{datatype}"
df_out.to_csv(datafilename)