In [None]:
import os
import warnings

import intake
from easygems import healpix as egh

import numpy as np
import xarray as xr
import pandas as pd

import scipy.stats

import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import cartopy.feature as cf

warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
# Analysis period: (start, end) date strings, later expanded into a time slice.
time = ('2020-04-01','2020-04-30')

# Candidate 10x10 analysis boxes, one 4-element array per region.
# NOTE(review): presumably [lon_min, lon_max, lat_min, lat_max] in degrees —
# confirm against what egh.isel_extent expects.
domains10x10 = {
    region: np.array(corners)
    for region, corners in (
        ("peruvian",    [-90, -80, -20, -10]),
        ("namibian",    [0, 10, -20, -10]),
        ("californian", [-130, -120, 20, 30]),
        ("canarian",    [-35, -25, 15, 25]),
    )
}

# Box actually analysed: the selected region widened by `margin` on every side
# (the sign pattern lowers the two minima and raises the two maxima).
# A margin of 0 leaves the box unchanged.
margin = 0
map_domain = domains10x10['namibian'] + margin*np.array([-1,1,-1,1])

In [None]:
# Dataset specification

# Key of the catalog sub-tree to read from.
hknode = 'EU'

# Open the hackathon catalog and keep only the sub-catalog for `hknode`.
catalog_url = "https://digital-earths-global-hackathon.github.io/catalog/catalog.yaml"
cat = intake.open_catalog(catalog_url)[hknode]

# Per-simulation loading recipe, keyed by the short name used throughout.
#   id     : entry name looked up in the catalog
#   opt    : keyword arguments passed to that catalog entry
#   rnm    : rename mapping applied to the loaded dataset so all models
#            share the same names (pressure, cell, lwp, lwc)
#   vunits : (optional) pressure units to attach when the loaded pressure
#            coordinate carries no 'units' attribute
simulations = dict(
    IFS=dict(
        id='ifs_tco3999-ng5_rcbmf_cf',
        opt=dict(zoom=11, time='PT1H'),
        rnm=dict(level='pressure', value='cell', clwvi='lwp'),
        vunits='hPa',
    ),
    ICON=dict(
        id='icon_d3hp003',
        opt=dict(zoom=11, time='PT6H', time_method='inst'),
        rnm=dict(qall='lwc'),
    ),
    NICAM=dict(
        id='nicam_gl11',
        opt=dict(zoom=9, time='PT6H'),
        rnm=dict(lev='pressure', qall='lwc'),
    ),
    UM=dict(
        id='um_glm_n2560_RAL3p3',
        opt=dict(zoom=10, time='PT3H'),
        rnm=dict(clw='lwc'),
    ),
)

In [None]:
# Load datasets

# One lazily-loaded dataset per simulation, keyed by simulation name.
ds = {}

for name, params in simulations.items():
    # Open the catalog entry with its options, harmonise variable/dimension
    # names, and let easygems attach coordinates to the result.
    ds[name] = cat[params['id']](**params['opt']).to_dask() \
        .rename(params['rnm']).pipe(egh.attach_coords)

    # Later cells read the pressure 'units' attribute, so fill it in from the
    # simulation spec when the file itself does not provide one.
    if 'pressure' in ds[name] and 'units' not in ds[name]['pressure'].attrs:
        if 'vunits' not in params:
            # 'vunits' is optional in the spec; fail with an explicit message
            # instead of a bare KeyError('vunits').
            raise KeyError(
                name+": pressure coordinate has no 'units' attribute and the "
                "simulation spec defines no 'vunits' fallback"
            )
        ds[name]['pressure'].attrs['units'] = params['vunits']

    # Progress marker: dataset opened.
    print(name)

In [None]:
%%time

# Select time period and region

# Same time window for every dataset.
period = slice(*time)

for name in list(ds):
    # Cell indices falling inside the analysis box, as returned by easygems.
    in_domain = egh.isel_extent(ds[name], map_domain)
    # Replace the stored dataset by its time/region subset.
    ds[name] = ds[name].sel(time=period).isel(cell=in_domain)

In [None]:
%%time

# Derive relevant parameters: lwp and wamax

# Pressure bounds (in Pa) of the layer searched for the maximum of wa.
WAMAX_P_TOP = 900e2
WAMAX_P_BOTTOM = 1000e2

for name in ds.keys():

    # Factor converting this dataset's pressure values to Pa. Any unit string
    # other than 'hPa' is treated as Pa. None means there is no pressure
    # dimension to work with.
    if 'pressure' in ds[name].dims:
        if ds[name]['pressure'].attrs['units'] == 'hPa':
            pfactor = 100
        else:
            pfactor = 1
    else:
        pfactor = None

    need_lwp = 'lwp' not in ds[name] and 'lwc' in ds[name]
    need_wamax = 'wamax' not in ds[name] and 'wa' in ds[name]

    # Both derivations need a pressure axis. Fail with a clear message rather
    # than scaling LWP by zero or dividing the slice bounds by zero.
    if pfactor is None and (need_lwp or need_wamax):
        raise ValueError(
            name+': no pressure dimension, cannot derive lwp/wamax'
        )

    if need_lwp:
        # Integrate lwc over pressure (sorted ascending), divide by g = 9.81
        # and convert the pressure increment to Pa.
        ds[name]['lwp'] = ds[name]['lwc'].sortby('pressure') \
            .integrate('pressure')/9.81*pfactor
        print(name+' LWP integration')

    if need_wamax:
        # Maximum of wa over the levels between 900 and 1000 hPa, with the
        # bounds expressed in the dataset's own pressure units.
        ds[name]['wamax'] = ds[name]['wa'].sortby('pressure') \
            .sel(pressure=slice(WAMAX_P_TOP/pfactor,WAMAX_P_BOTTOM/pfactor)).max(dim='pressure')
        print(name+' wa max <900hPa')

In [None]:
%%time

# Calculate simple stats

# For every dataset and variable, reduce over the 'cell' dimension and store
# the results next to the source variable as <var>_mean/_std/_skw/_hom.
# NOTE(review): scipy.stats.skew propagates NaN by default, whereas xarray's
# mean/std skip NaN for float data — confirm the fields are NaN-free.
for name, dset in ds.items():
    for var in ('lwp', 'wamax'):
        field = dset[var]
        dset[var+'_mean'] = field.mean(dim='cell')
        dset[var+'_std']  = field.std(dim='cell')
        dset[var+'_skw']  = field.reduce(scipy.stats.skew, dim='cell')
        # Homogeneity measure used here: mean divided by standard deviation.
        dset[var+'_hom']  = dset[var+'_mean'] / dset[var+'_std']
        print(name+' '+var)

In [None]:
# Read EarthCare file

# Only the timestamp and the three LWP statistics are read; columns are
# renamed to match the names used for the model datasets.
ec_columns = ('date_time','lwp_mean','lwp_std','lwp_skew')
ec_stats = pd.read_csv('./stats_earthcare_April2025.csv', usecols=ec_columns)
ec_stats = ec_stats.rename(columns={'date_time':'time', 'lwp_skew':'lwp_skw'})

# Same homogeneity measure as for the models: mean over standard deviation.
ec_stats['lwp_hom'] = ec_stats['lwp_mean']/ec_stats['lwp_std']

# Parse the timestamps, then move them to the year 2020 so they share a time
# axis with the model period.
ec_times = pd.to_datetime(ec_stats['time'], format='%Y%m%dT%H%M%SZ')
ec_stats['time'] = ec_times.apply(lambda stamp: stamp.replace(year=2020))

# Store the table, in chronological order, alongside the model datasets.
ds['EC'] = ec_stats.sort_values(by='time', ascending=True)

In [None]:
# Plot timeseries

plot_path = "./figures/"
# savefig does not create missing directories, so make sure the target exists.
os.makedirs(plot_path, exist_ok=True)

# Suffixes of the statistics to plot, one panel each.
stats = ['_mean','_std','_skw','_hom']

variables = ['lwp','wamax']
labels = ['LWP [kg/m2]','Max w <900hPa [m/s]']

for var, lab in zip(variables,labels):
    Npanel = len(stats)
    fig, axs = plt.subplots(Npanel,1, figsize=(12,3*Npanel), sharex=True, constrained_layout=True)

    # First line drawn for each dataset, keyed by dataset name. The legend is
    # built from these explicit handles so that every label is paired with
    # its own line, even when a dataset (e.g. EC for wamax) draws nothing.
    legend_lines = {}

    for ax, stat in zip(axs,stats):
        for name in ds.keys():
            if name=='EC':
                # The EC table only holds LWP statistics; skip missing columns.
                if var+stat not in ds[name]:
                    continue
                drawn = ax.plot(ds[name].time, ds[name][var+stat], marker='o', color='black', label=name)
            else:
                drawn = ax.plot(ds[name].time, ds[name][var+stat], label=name)
            legend_lines.setdefault(name, drawn[0])
        ax.set_ylabel(lab+stat.replace('_',' '))
        ax.grid()
        ax.autoscale(enable=None,tight=True)

    # One shared legend below the panels, listing only datasets actually drawn.
    fig.legend(handles=list(legend_lines.values()), labels=list(legend_lines.keys()),
               bbox_to_anchor=(0.5,0), loc='upper center', ncol=max(len(legend_lines),1))

    fig.savefig(plot_path+'timeseries_'+var,bbox_inches='tight',dpi=300)

In [None]:
# Plot histograms

plot_path = "./figures/"
# savefig does not create missing directories, so make sure the target exists.
os.makedirs(plot_path, exist_ok=True)

variables = ['lwp','wamax']
labels = ['LWP [kg/m2]','Max w <900hPa [m/s]']
# Bin edges per variable, in the same order as `variables`.
binedges = [np.arange(0,0.4,0.01), np.arange(0,0.5,0.01)]

for var, be, lab in zip(variables,binedges,labels):

    fig, ax = plt.subplots(figsize=(8,6))
    # Only the model simulations are drawn (the EC table has no per-cell data).
    for name in simulations.keys():
        # All time steps and cells are pooled into one sample per model.
        # With density=True the curve is normalised over the values that fall
        # inside the bin range.
        ax.hist( np.ravel(ds[name][var]), bins=be, density=True, histtype='step', label=name)
    ax.legend()
    ax.grid()
    ax.set_ylabel('PDF')
    ax.set_xlabel(lab)
    ax.set_yscale('log')
    ax.set_ylim((1e-1,1e2))

    fig.savefig(plot_path+'histogram_'+var,bbox_inches='tight',dpi=300)