# Discrete Covariance Function On residuals from time sequences from Merra2

- author Sylvie Dagoret-Campagne
- affiliation : IJCLab
- creation date 2025-03-08 
- last update : 2025-03-08
- Kernel @usdf **w_2024_50**
- Office emac : mamba_py311
- Home emac : base (conda)
- laptop : conda_py311

**Goal** : Fit the variation of the Merra2 parameters that impact the transmission

- CO2 fit : https://scikit-learn.org/stable/auto_examples/gaussian_process/plot_gpr_co2.html#sphx-glr-auto-examples-gaussian-process-plot-gpr-co2-py

- Kernels : https://scikit-learn.org/stable/modules/gaussian_process.html#gp-kernels

In [None]:
# Report the version of the Python interpreter running this notebook.
from platform import python_version
print(python_version())

In [None]:
import warnings
# Clear the currently registered warning filters, then ignore every warning
# for the rest of the session.
warnings.resetwarnings()
warnings.simplefilter('ignore')

In [None]:
# Same interpreter version report as in the first code cell (repeated).
from platform import python_version
print(python_version())

In [None]:
import os,glob

In [None]:
# Directory where the figures are stored
pathfigs = "figsDCTResidualsAtmosphereFomMerra22"
# exist_ok=True makes the call a no-op when the directory is already there,
# without the race of a separate exists() check.
os.makedirs(pathfigs, exist_ok=True)
# File extension appended to every saved figure name
figtype = ".png"

In [None]:
# Directory where the data produced by this notebook are stored
pathdata = "dataDCTResidualsAtmosphereFromMerra2"
datatype = ".csv"

# Sub-directories exchanged with pyzdcf: the time curves it reads and the
# results it writes
dcf_path_input = os.path.join(pathdata, "dcf_timecurves")
dcf_path_output = os.path.join(pathdata, "dcf_results")

# exist_ok=True makes each call a no-op when the directory already exists,
# without the race of a separate exists() check.
for directory in (pathdata, dcf_path_input, dcf_path_output):
    os.makedirs(directory, exist_ok=True)

In [None]:
# Directory where the input residuals (CSV files) are stored
inputpathdata = "dataFitGPPerAtmosphereFromMerra2"
if not os.path.isdir(inputpathdata):
    # The residuals are an input of this notebook: stop early if they are missing.
    raise FileNotFoundError(f"Missing input data on residuals: {inputpathdata}")

filessearch_str = os.path.join(inputpathdata, "*.csv")
# glob returns the files in arbitrary order; sorting keeps the tag -> file
# association built from this list reproducible from one run to the next.
filelist = sorted(glob.glob(filessearch_str))

In [None]:
# Map each atmospheric parameter tag to the first file of filelist whose path
# contains that tag; a tag without any matching file gets no entry.
filename_residuals = {}
for tag in ["pwv", "ozone","vaod", "angstrom"]:
    first_match = next((path for path in filelist if tag in path), None)
    if first_match is not None:
        filename_residuals[tag] = first_match
filename_residuals

In [None]:
import numpy as np
from numpy.linalg import inv
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
from mpl_toolkits.axes_grid1 import make_axes_locatable
from matplotlib.colors import LogNorm,SymLogNorm
from matplotlib.patches import Circle,Annulus
from astropy.visualization import ZScaleInterval
props = dict(boxstyle='round', facecolor="white", alpha=0.1)
#props = dict(boxstyle='round')

import matplotlib.colors as colors
import matplotlib.cm as cmx

import matplotlib.ticker                         # here's where the formatter is
from matplotlib.ticker import (MultipleLocator, FormatStrFormatter,
                               AutoMinorLocator)

from matplotlib.gridspec import GridSpec

from astropy.visualization import (MinMaxInterval, SqrtStretch,ZScaleInterval,PercentileInterval,
                                   ImageNormalize,imshow_norm)
from astropy.visualization.stretch import SinhStretch, LinearStretch,AsinhStretch,LogStretch

from astropy.io import fits
from astropy.wcs import WCS
from astropy import units as u
from astropy import constants as c

from astropy.coordinates.earth import EarthLocation
from datetime import datetime
from pytz import timezone

from scipy import interpolate
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KDTree, BallTree

import pandas as pd
pd.set_option("display.max_columns", None)
pd.set_option('display.max_rows', 100)

import matplotlib.ticker                         # here's where the formatter is
import os
import re
import pandas as pd
import pickle
from collections import OrderedDict

plt.rcParams["figure.figsize"] = (4,3)
plt.rcParams["axes.labelsize"] = 'xx-large'
plt.rcParams['axes.titlesize'] = 'xx-large'
plt.rcParams['xtick.labelsize']= 'xx-large'
plt.rcParams['ytick.labelsize']= 'xx-large'

import scipy
from scipy import stats
from scipy.optimize import curve_fit,least_squares

# https://www.statsmodels.org/dev/generated/statsmodels.tsa.stattools.adfuller.html

props = dict(boxstyle='round', facecolor='white', alpha=0.5)

In [None]:
# Remove to run faster the notebook
import ipywidgets as widgets
%matplotlib widget

In [None]:
from astropy.modeling import models

In [None]:
from numpy.random import lognormal

In [None]:
from matplotlib.ticker import (MultipleLocator, FormatStrFormatter,
                               AutoMinorLocator)
from astropy.visualization import (MinMaxInterval, SqrtStretch,ZScaleInterval,PercentileInterval,
                                   ImageNormalize,imshow_norm)
from astropy.visualization.stretch import SinhStretch, LinearStretch,AsinhStretch,LogStretch

from astropy.time import Time
from astropy.timeseries import TimeSeries
from statsmodels.tsa.stattools import adfuller, kpss

In [None]:
# Remove to run faster the notebook
import ipywidgets as widgets
%matplotlib widget

In [None]:
from importlib.metadata import version

In [None]:
# wavelength bin colors
#jet = plt.get_cmap('jet')
#cNorm = mpl.colors.Normalize(vmin=0, vmax=NSED)
#scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=jet)
#all_colors = scalarMap.to_rgba(np.arange(NSED), alpha=1)

In [None]:
# Display the numpy version in use
np.__version__

In [None]:
# Display the pandas version in use
pd.__version__

In [None]:
from astropy.timeseries import LombScargle

In [None]:
from sklearn.gaussian_process.kernels import RBF
from sklearn.gaussian_process.kernels import ExpSineSquared
from sklearn.gaussian_process.kernels import RationalQuadratic
from sklearn.gaussian_process.kernels import WhiteKernel
from sklearn.gaussian_process.kernels import ConstantKernel
from sklearn.gaussian_process.kernels import Matern
from sklearn.gaussian_process.kernels import Kernel, Hyperparameter

In [None]:
from sklearn.gaussian_process import GaussianProcessRegressor
from scipy.special import gamma
from scipy.stats import levy_stable,cauchy, laplace, norm

In [None]:
from astropy.modeling import models, fitting
from pyzdcf import pyzdcf

In [None]:
# Reference time scales, expressed in units of DAY
DAY = 1.
WEEK = 7*DAY
YEAR = 365.25
# Fractions of a year: 1/12, 1/4, 1/3 and 1/2
MONTH, QUARTER, MONTHS4, MONTHS6 = (YEAR/divisor for divisor in (12., 4., 3., 2.))

In [None]:
# Default figure dimensions as (width, height) pairs; the "_0" pair is passed
# as figsize for the time-series plots of this notebook.
FIGXSIZE_1, FIGYSIZE_1 = 14, 8
FIGXSIZE_0, FIGYSIZE_0 = 14, 5

In [None]:
def ComputeZDCF(filename_in, df_pwv_curve, minpts=20, num_mc=100):
    """
    Compute the auto-correlation curve of a time sequence with pyzdcf.

    The curve is first written, without header nor index, as a CSV file in
    the module-level directory ``dcf_path_input``; pyzdcf then reads it from
    there and writes its own output files in ``dcf_path_output``.

    Parameters
    ----------
    filename_in : str
        Name (not path) of the temporary CSV file holding the input curve.
    df_pwv_curve : pandas.DataFrame
        Table with 3 columns, in this order: (time, value, sigma).
        The time unit (days, hours, minutes, ...) has to be chosen by the
        caller, outside this function.
    minpts : int, optional
        Minimum number of points per bin (0 is a flag for the pyzdcf default
        value of 11).
    num_mc : int, optional
        Number of Monte Carlo simulations used for the error estimation.

    Returns
    -------
    pandas.DataFrame
        The table returned by pyzdcf. The rest of this notebook reads its
        columns "tau", "-sig(tau)", "+sig(tau)", "dcf", "-err(dcf)" and
        "+err(dcf)".
    """
    full_filename_in = os.path.join(dcf_path_input, filename_in)
    df_pwv_curve.to_csv(full_filename_in, index=False, header=False)

    # parameters for pyzdcf
    params_dcf = dict(
        autocf=True,             # Autocorrelation (T) or cross-correlation (F)
        prefix='acf',            # Output files prefix
        uniform_sampling=False,  # Uniform sampling?
        omit_zero_lags=False,    # Omit zero lag points?
        minpts=minpts,           # Min. num. of points per bin
        num_MC=num_mc,           # Num. of Monte Carlo simulations for error estimation
        lc1_name=filename_in,    # Name of the first light curve file
        lc2_name=filename_in,    # Name of the second light curve file (required only for CCF)
    )

    # compute the ZDCF; joining with "" appends the trailing path separator
    # that the directory arguments were given in the original call
    dcf_df = pyzdcf(input_dir=os.path.join(dcf_path_input, ""),
                    output_dir=os.path.join(dcf_path_output, ""),
                    intr=False,
                    parameters=params_dcf,
                    sep=',',
                    sparse='auto',
                    verbose=False)
    return dcf_df

## Configuration

In [None]:
# Lower bound (MJD) of the analysed time window: only samples with
# mjd > tmin_select are kept further down.
tmin_select = 59500

### Residuals

# Start analysis

## Analysis of PWV

In [None]:
# Path of the residual file registered under the "pwv" tag
full_filename = filename_residuals["pwv"]
print(full_filename)

In [None]:
# Load the residual table, using the first CSV column as the index
df = pd.read_csv(full_filename,index_col=0)
N = len(df)  # number of rows read

In [None]:
# Time stamps ("mjd" column) and residuals ("res" column) as numpy arrays
x_full = df["mjd"].values
y_full = df["res"].values
# Same time stamps reshaped as a column vector of shape (n, 1)
X_full = x_full.reshape(-1, 1)

### Continue in GP Fit if irregularities

In [None]:

# Indexes of the samples inside the time window (tmin_select, tmax_select]
tmax_select = x_full.max()
# tmax_select is the last time stamp itself, so the upper bound has to be
# inclusive: a strict "<" would silently drop the final sample.
good_indexes_forresiduals = np.where(np.logical_and(x_full > tmin_select, x_full <= tmax_select))[0]

In [None]:
# Histogram of the PWV residuals located inside the selected time window
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(4, 3))
residuals_in_window = y_full[good_indexes_forresiduals]
ax.hist(residuals_in_window, bins=200, facecolor="tab:blue")
ax.set_xlabel("mm")
ax.set_title("input residuals PWV")

In [None]:
# Full time sequence of the PWV residuals, saved as a figure in pathfigs
fig = plt.figure(figsize=(FIGXSIZE_0, FIGYSIZE_0), layout="constrained")
gs = GridSpec(1, 1, figure=fig)
ax1 = fig.add_subplot(gs[0])

ax1.plot(x_full, y_full, c="b", lw=0.5, label="Merra2")
ax1.set_xlabel("time (MJD)")
ax1.legend()
ax1.set_ylabel("PWV residuals (mm)")
ax1.set_title("PWV residuals after GP-Periodic removal")
# Dotted vertical lines mark both ends of the selected time window
ax1.axvline(tmin_select, color="k", ls=':')
ax1.axvline(tmax_select, color="k", ls=':')

figname = f"{pathfigs}/pwv_resGPper_timeseqall_merra2" + figtype
fig.savefig(figname)
plt.show()


In [None]:
# Keep only the samples later than tmin_select
df = df[df.mjd > tmin_select]
N = len(df)
NSAMP = 10000  # size of an optional random subsample; not applied, see below
index_range = np.arange(N)
# Every remaining sample is used. The former random draw of NSAMP indexes was
# overwritten right after being computed, and np.random.choice(...,
# replace=False) raises ValueError as soon as N < NSAMP, so it is not performed.
index_selected = index_range

In [None]:
# Constant uncertainty attached to every PWV residual in the ZDCF input table
# (presumably in mm, like the residuals -- TODO confirm)
SIGMA_PWVREPEATABILITY = 0.26

In [None]:
# Build the 3-column table (time, residual, sigma) passed to ComputeZDCF.
tstart = df["mjd"].min()
tstop = df["mjd"].max()
# assign() returns new tables, which avoids setting a column on a slice of df
# (chained assignment).
df_dcf_in = (
    df[["mjd", "res"]]
    .assign(t_day=lambda d: d["mjd"] - tstart)[["t_day", "res"]]  # time since first sample
    .assign(sig_pwv=SIGMA_PWVREPEATABILITY)                       # same error on each point
    .iloc[index_selected]
)

In [None]:
# Write the PWV table in the ZDCF input directory and compute its auto-correlation
df_dcf_out = ComputeZDCF("dcf_in_pwvres_merra2.csv",df_dcf_in, minpts=20)

In [None]:
# Lags and correlation values of the ZDCF output, plus their lower/upper
# uncertainties stacked as 2-row arrays for the errorbar plot
x = df_dcf_out["tau"].to_numpy()
y = df_dcf_out["dcf"].to_numpy()
xerr = df_dcf_out[["-sig(tau)", "+sig(tau)"]].to_numpy().T
yerr = df_dcf_out[["-err(dcf)", "+err(dcf)"]].to_numpy().T

In [None]:
# ZDCF of the PWV residuals as a function of tau, with error bars on both axes
fig, ax = plt.subplots(1, 1, figsize=(12, 5), layout="constrained")
marker_style = dict(marker='o', mfc='red', mec='blue', ms=2, mew=2)
errorbar_style = dict(ecolor="k", elinewidth=2, capsize=2, uplims=True, lolims=True)
ax.errorbar(x, y, xerr=xerr, yerr=yerr, linewidth=0.5, **marker_style, **errorbar_style)
ax.grid()
ax.set_ylim(-1, 1)
ax.set_title("Discrete covariance function on PWV residuals in Merra2")
ax.set_xlabel("Time (days)")
ax.set_ylabel("DCF (no units)")

## Analysis of Ozone

In [None]:
# Path of the residual file registered under the "ozone" tag
full_filename = filename_residuals["ozone"]
print(full_filename)

In [None]:
# Load the residual table, using the first CSV column as the index
df = pd.read_csv(full_filename,index_col=0)
N = len(df)  # number of rows read

In [None]:
# Time stamps ("mjd" column) and residuals ("res" column) as numpy arrays
x_full = df["mjd"].values
y_full = df["res"].values
# Same time stamps reshaped as a column vector of shape (n, 1)
X_full = x_full.reshape(-1, 1)

### Continue on fit

In [None]:
# Indexes of the samples inside the time window (tmin_select, tmax_select]
tmin_select = 59500
tmax_select = x_full.max()
# tmax_select is the last time stamp itself, so the upper bound has to be
# inclusive: a strict "<" would silently drop the final sample.
good_indexes_forresiduals = np.where(np.logical_and(x_full > tmin_select, x_full <= tmax_select))[0]

In [None]:
# Histogram of the ozone residuals located inside the selected time window
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(4, 3))
residuals_in_window = y_full[good_indexes_forresiduals]
ax.hist(residuals_in_window, bins=200, facecolor="tab:red")
ax.set_xlabel("DU")
ax.set_title("input residuals Ozone")

In [None]:
# Full time sequence of the ozone residuals, saved as a figure in pathfigs
fig = plt.figure(figsize=(FIGXSIZE_0, FIGYSIZE_0), layout="constrained")
gs = GridSpec(1, 1, figure=fig)
ax1 = fig.add_subplot(gs[0])

ax1.plot(x_full, y_full, c="r", lw=0.5, label="Merra2")
ax1.set_xlabel("time (MJD)")
ax1.legend()
ax1.set_ylabel("Ozone residuals (DU)")
ax1.set_title("Ozone residuals after GP-Periodic removal")
# Dotted vertical lines mark both ends of the selected time window
ax1.axvline(tmin_select, color="k", ls=':')
ax1.axvline(tmax_select, color="k", ls=':')

figname = f"{pathfigs}/ozone_resGPper_timeseqall_merra2" + figtype
fig.savefig(figname)
plt.show()

In [None]:
# Constant uncertainty intended for the ozone residuals
# (presumably in DU, like the residuals -- TODO confirm)
SIGMA_OZONEREPEATABILITY = 5.

In [None]:
# Keep only the samples later than tmin_select
df = df[df.mjd > tmin_select]
N = len(df)
NSAMP = 10000  # size of an optional random subsample; not applied, see below
index_range = np.arange(N)
# Every remaining sample is used. The former random draw of NSAMP indexes was
# overwritten right after being computed, and np.random.choice(...,
# replace=False) raises ValueError as soon as N < NSAMP, so it is not performed.
index_selected = index_range

In [None]:
# Build the 3-column table (time, residual, sigma) passed to ComputeZDCF.
tstart = df["mjd"].min()
tstop = df["mjd"].max()
# The error column must carry the ozone uncertainty: the PWV constant that was
# used here before was a copy-paste error.
# assign() returns new tables, which avoids setting a column on a slice of df
# (chained assignment).
df_dcf_in = (
    df[["mjd", "res"]]
    .assign(t_day=lambda d: d["mjd"] - tstart)[["t_day", "res"]]  # time since first sample
    .assign(sig_ozone=SIGMA_OZONEREPEATABILITY)                   # same error on each point
    .iloc[index_selected]
)

In [None]:
# Write the ozone table in the ZDCF input directory and compute its auto-correlation
df_dcf_out = ComputeZDCF("dcf_in_ozoneres_merra2.csv",df_dcf_in, minpts=20)

In [None]:
# Lags and correlation values of the ZDCF output, plus their lower/upper
# uncertainties stacked as 2-row arrays for the errorbar plot
x = df_dcf_out["tau"].to_numpy()
y = df_dcf_out["dcf"].to_numpy()
xerr = df_dcf_out[["-sig(tau)", "+sig(tau)"]].to_numpy().T
yerr = df_dcf_out[["-err(dcf)", "+err(dcf)"]].to_numpy().T

In [None]:
# ZDCF of the ozone residuals as a function of tau, with error bars on both axes
fig, ax = plt.subplots(1, 1, figsize=(12, 5), layout="constrained")
marker_style = dict(marker='o', mfc='red', mec='red', ms=2, mew=2)
errorbar_style = dict(ecolor="k", elinewidth=2, capsize=2, uplims=True, lolims=True)
ax.errorbar(x, y, xerr=xerr, yerr=yerr, linewidth=0.5, **marker_style, **errorbar_style)
ax.grid()
ax.set_ylim(-1, 1)
ax.set_title("Discrete covariance function on Ozone residuals in Merra2")
ax.set_xlabel("Time (days)")
ax.set_ylabel("DCF (no units)")

## Aerosol VAOD

In [None]:
# Path of the residual file registered under the "vaod" tag
full_filename = filename_residuals["vaod"]
print(full_filename)

In [None]:
# Load the residual table, using the first CSV column as the index
df = pd.read_csv(full_filename,index_col=0)
N = len(df)  # number of rows read

In [None]:
# Time stamps ("mjd" column) and residuals ("res" column) as numpy arrays
x_full = df["mjd"].values
y_full = df["res"].values
# Same time stamps reshaped as a column vector of shape (n, 1)
X_full = x_full.reshape(-1, 1)

In [None]:
# Indexes of the samples inside the time window (tmin_select, tmax_select]
tmin_select = 59500
tmax_select = x_full.max()
# tmax_select is the last time stamp itself, so the upper bound has to be
# inclusive: a strict "<" would silently drop the final sample.
good_indexes_forresiduals = np.where(np.logical_and(x_full > tmin_select, x_full <= tmax_select))[0]

In [None]:
# Histogram of the VAOD residuals located inside the selected time window
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(4, 3))
residuals_in_window = y_full[good_indexes_forresiduals]
ax.hist(residuals_in_window, bins=200, facecolor="tab:green")
ax.set_xlabel("VAOD")
ax.set_title("input residuals VAOD")

In [None]:
# Full time sequence of the VAOD residuals, saved as a figure in pathfigs
fig = plt.figure(figsize=(FIGXSIZE_0, FIGYSIZE_0), layout="constrained")
gs = GridSpec(1, 1, figure=fig)
ax1 = fig.add_subplot(gs[0])

ax1.plot(x_full, y_full, c="g", lw=0.5, label="Merra2")
ax1.set_xlabel("time (MJD)")
ax1.legend()
ax1.set_ylabel("VAOD residuals")
ax1.set_title("VAOD residuals after GP-Periodic removal")
# Dotted vertical lines mark both ends of the selected time window
ax1.axvline(tmin_select, color="k", ls=':')
ax1.axvline(tmax_select, color="k", ls=':')

figname = f"{pathfigs}/vaod_resGPper_timeseqall_merra2" + figtype
fig.savefig(figname)
plt.show()

In [None]:
# Constant uncertainty attached to every VAOD residual in the ZDCF input table
SIGMA_VAODREPEATABILITY = 0.01

In [None]:
# Keep only the samples later than tmin_select
df = df[df.mjd > tmin_select]
N = len(df)
NSAMP = 10000  # size of an optional random subsample; not applied, see below
index_range = np.arange(N)
# Every remaining sample is used. The former random draw of NSAMP indexes was
# overwritten right after being computed, and np.random.choice(...,
# replace=False) raises ValueError as soon as N < NSAMP, so it is not performed.
index_selected = index_range

In [None]:
# Build the 3-column table (time, residual, sigma) passed to ComputeZDCF.
tstart = df["mjd"].min()
tstop = df["mjd"].max()
# The error column is named after VAOD (it was labelled "sig_ozone" by
# copy-paste); the name is not written to the CSV file, only the values are.
# assign() returns new tables, which avoids setting a column on a slice of df
# (chained assignment).
df_dcf_in = (
    df[["mjd", "res"]]
    .assign(t_day=lambda d: d["mjd"] - tstart)[["t_day", "res"]]  # time since first sample
    .assign(sig_vaod=SIGMA_VAODREPEATABILITY)                     # same error on each point
    .iloc[index_selected]
)

In [None]:
# Write the VAOD table in the ZDCF input directory and compute its auto-correlation
df_dcf_out = ComputeZDCF("dcf_in_vaod_merra2.csv",df_dcf_in, minpts=20)

In [None]:
# Lags and correlation values of the ZDCF output, plus their lower/upper
# uncertainties stacked as 2-row arrays for the errorbar plot
x = df_dcf_out["tau"].to_numpy()
y = df_dcf_out["dcf"].to_numpy()
xerr = df_dcf_out[["-sig(tau)", "+sig(tau)"]].to_numpy().T
yerr = df_dcf_out[["-err(dcf)", "+err(dcf)"]].to_numpy().T

In [None]:
# ZDCF of the VAOD residuals as a function of tau, with error bars on both axes
fig, ax = plt.subplots(1, 1, figsize=(12, 5), layout="constrained")
marker_style = dict(marker='o', mfc='red', mec='green', ms=2, mew=2)
errorbar_style = dict(ecolor="k", elinewidth=2, capsize=2, uplims=True, lolims=True)
ax.errorbar(x, y, xerr=xerr, yerr=yerr, linewidth=0.5, **marker_style, **errorbar_style)
ax.grid()
ax.set_ylim(-1, 1)
ax.set_title("Discrete covariance function on VAOD residuals in Merra2")
ax.set_xlabel("Time (days)")
ax.set_ylabel("DCF (no units)")

## Aerosol Angstrom

In [None]:
# Path of the residual file registered under the "angstrom" tag
full_filename = filename_residuals["angstrom"]
print(full_filename)

In [None]:
# Load the residual table, using the first CSV column as the index
df = pd.read_csv(full_filename,index_col=0)
N = len(df)  # number of rows read

In [None]:
# Time stamps ("mjd" column) and residuals ("res" column) as numpy arrays
x_full = df["mjd"].values
y_full = df["res"].values
# Same time stamps reshaped as a column vector of shape (n, 1)
X_full = x_full.reshape(-1, 1)

In [None]:
# Indexes of the samples inside the time window (tmin_select, tmax_select]
tmin_select = 59500
tmax_select = x_full.max()
# tmax_select is the last time stamp itself, so the upper bound has to be
# inclusive: a strict "<" would silently drop the final sample.
good_indexes_forresiduals = np.where(np.logical_and(x_full > tmin_select, x_full <= tmax_select))[0]

In [None]:
# Histogram of the Angstrom residuals located inside the selected time window
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(4, 3))
residuals_in_window = y_full[good_indexes_forresiduals]
ax.hist(residuals_in_window, bins=200, facecolor="tab:purple")
ax.set_xlabel("angstrom")
ax.set_title("input residuals Angstrom")

In [None]:
# Full time sequence of the Angstrom residuals, saved as a figure in pathfigs
fig = plt.figure(figsize=(FIGXSIZE_0, FIGYSIZE_0), layout="constrained")
gs = GridSpec(1, 1, figure=fig)
ax1 = fig.add_subplot(gs[0])

ax1.plot(x_full, y_full, c="purple", lw=0.5, label="Merra2")
ax1.set_xlabel("time (MJD)")
ax1.legend()
ax1.set_ylabel("Angstrom residuals")
ax1.set_title("Angstrom residuals after GP-Periodic removal")
# Dotted vertical lines mark both ends of the selected time window
ax1.axvline(tmin_select, color="k", ls=':')
ax1.axvline(tmax_select, color="k", ls=':')

figname = f"{pathfigs}/angstrom_resGPper_timeseqall_merra2" + figtype
fig.savefig(figname)
plt.show()

In [None]:
# Constant uncertainty attached to every Angstrom residual in the ZDCF input table
SIGMA_ANGSTROMREPEATABILITY = 0.001

In [None]:
# Keep only the samples later than tmin_select
df = df[df.mjd > tmin_select]
N = len(df)
NSAMP = 10000  # size of an optional random subsample; not applied, see below
index_range = np.arange(N)
# Every remaining sample is used. The former random draw of NSAMP indexes was
# overwritten right after being computed, and np.random.choice(...,
# replace=False) raises ValueError as soon as N < NSAMP, so it is not performed.
index_selected = index_range

In [None]:
# Build the 3-column table (time, residual, sigma) passed to ComputeZDCF.
tstart = df["mjd"].min()
tstop = df["mjd"].max()
# The error column is named after the Angstrom exponent (it was labelled
# "sig_ozone" by copy-paste); the name is not written to the CSV file, only
# the values are.
# assign() returns new tables, which avoids setting a column on a slice of df
# (chained assignment).
df_dcf_in = (
    df[["mjd", "res"]]
    .assign(t_day=lambda d: d["mjd"] - tstart)[["t_day", "res"]]  # time since first sample
    .assign(sig_angstrom=SIGMA_ANGSTROMREPEATABILITY)             # same error on each point
    .iloc[index_selected]
)

In [None]:
# Write the Angstrom table in the ZDCF input directory and compute its auto-correlation
df_dcf_out = ComputeZDCF("dcf_in_angstrom_merra2.csv",df_dcf_in, minpts=20)

In [None]:
# Lags and correlation values of the ZDCF output, plus their lower/upper
# uncertainties stacked as 2-row arrays for the errorbar plot
x = df_dcf_out["tau"].to_numpy()
y = df_dcf_out["dcf"].to_numpy()
xerr = df_dcf_out[["-sig(tau)", "+sig(tau)"]].to_numpy().T
yerr = df_dcf_out[["-err(dcf)", "+err(dcf)"]].to_numpy().T

In [None]:
# ZDCF of the Angstrom residuals as a function of tau, with error bars on both axes
fig, ax = plt.subplots(1, 1, figsize=(12, 5), layout="constrained")
marker_style = dict(marker='o', mfc='red', mec='purple', ms=2, mew=2)
errorbar_style = dict(ecolor="k", elinewidth=2, capsize=2, uplims=True, lolims=True)
ax.errorbar(x, y, xerr=xerr, yerr=yerr, linewidth=0.5, **marker_style, **errorbar_style)
ax.grid()
ax.set_ylim(-1, 1)
ax.set_title("Discrete covariance function on Angstrom residuals in Merra2")
ax.set_xlabel("Time (days)")
ax.set_ylabel("DCF (no units)")