# Imports

In [1]:
import numpy as np
import pandas as pd
import xarray as xr
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from weatherbench2 import config
from weatherbench2.evaluation import evaluate_in_memory, evaluate_with_beam
from weatherbench2 import metrics
from weatherbench2.visualization import set_wb2_style
from tqdm import tqdm

In [2]:
# Seed NumPy's legacy global RNG; the np.random.choice draws in the ensemble cells use this global state.
np.random.seed(seed=0)

In [3]:
# Call the style helper imported from weatherbench2.visualization
# (presumably configures Matplotlib defaults for WeatherBench 2 figures - confirm).
set_wb2_style()

In [4]:
# Global Matplotlib text settings: base font size 28, STIX fonts for text and mathtext.
plt.rcParams.update({
    'font.size': 28,
    'mathtext.fontset': 'stix',
    'font.family': 'STIXGeneral',
})

# 64x32

## calculate errors

In [None]:
%%time
# Run the WeatherBench 2 evaluation script with the `spatial_temporal_bias` config for
# Pangu 850 hPa temperature against ERA5, init times 2018-01-01 .. 2021-12-21.
# NOTE(review): the output goes to an absolute path under /mnt/sda/Data2/wb2/, while the
# cells below read 'Results_CLS/64x32/Errors/...' relative to the working directory -
# confirm the notebook is run from /mnt/sda/Data2/wb2.
!python weatherbench2/scripts/evaluate.py \
    --forecast_path=gs://weatherbench2/datasets/pangu/2018-2022_0012_64x32_equiangular_conservative.zarr \
    --obs_path=gs://weatherbench2/datasets/era5/1959-2022-6h-64x32_equiangular_conservative.zarr \
    --climatology_path=gs://weatherbench2/datasets/era5-hourly-climatology/1990-2019_6h_64x32_equiangular_conservative.zarr \
    --time_start=2018-01-01 \
    --time_stop=2021-12-21 \
    --variables=temperature \
    --levels=850 \
    --eval_configs=spatial_temporal_bias \
    --output_dir=/mnt/sda/Data2/wb2/Results_CLS/64x32/Errors/ \
    --output_file_prefix=pangu_vs_era5_2018_2021_ \
    --input_chunks=init_time=128 \
    --use_beam=True \
    --runner=DirectRunner

In [5]:
# Open the Pangu bias file produced by the evaluation run above.
# The same file is opened again (renamed and subset) at the start of the next section.
pangu_errors_T850 = xr.open_dataset('Results_CLS/64x32/Errors/pangu_vs_era5_2018_2021_spatial_temporal_bias.nc')

## calculate ensemble forecasts (random)

In [9]:
%%time
# Load forecast and errors datasets
pangu_forecast_T850 = xr.open_zarr('gs://weatherbench2/datasets/pangu/2018-2022_0012_64x32_equiangular_conservative.zarr')['temperature'].sel(time=slice('2020-01-01','2020-12-31'), level=850)#.rename({'time': 'init_time'})
pangu_errors_T850 = xr.open_dataset('Results_CLS/64x32/Errors/pangu_vs_era5_2018_2021_spatial_temporal_bias.nc').rename({'lead_time': 'prediction_timedelta'}).sel(metric='bias', level=850)
pangu_errors_T850_train = xr.concat([pangu_errors_T850['temperature'].sel(init_time=slice('2018-01-01','2019-12-31')), pangu_errors_T850['temperature'].sel(init_time=slice('2021-01-01','2021-12-21'))], dim='init_time')

t = pangu_errors_T850_train.sizes['init_time']

# Initialize an empty list to store ensemble forecasts
ensemble_forecast_datasets = []

# Iterate over all time values in pangu_forecast_T850
for time in pangu_forecast_T850.time:
    
    # Select random errors
    random_indices = np.random.choice(t, size=10, )
    sampled_errors = (
        pangu_errors_T850_train.isel(init_time=random_indices)
        .drop(['valid_time', 'time'])
        .reset_index('init_time', drop=True)
        .rename({'init_time': 'number'})
        .assign_coords(number=np.arange(len(random_indices)))
    )
    
    # Create an ensemble for each time by adding errors
    forecast_datasets = []
    for i in sampled_errors.number:
        modified_forecast = pangu_forecast_T850.sel(time=time) + sampled_errors.sel(number=i)
        forecast_datasets.append(modified_forecast)

    # Concatenate individual forecasts to create an ensemble for the current time
    ensemble_forecast = xr.concat(forecast_datasets, dim='number')
    
    # Append the ensemble forecast to the list
    ensemble_forecast_datasets.append(ensemble_forecast)

# Concatenate all ensemble forecasts along the 'time' dimension
ensemble_forecast_T850 = xr.concat(ensemble_forecast_datasets, dim='time')

CPU times: user 44.7 s, sys: 924 ms, total: 45.6 s
Wall time: 45.7 s


In [10]:
# turn from dataarray to dataset
# NOTE(review): this rebinds ensemble_forecast_T850 to the converted object, so the cell
# appears intended to run once right after the ensemble cell - confirm before re-running.
ensemble_forecast_T850 = ensemble_forecast_T850.to_dataset()

In [11]:
# Remove the existing `level` entry and re-add `level` as a length-1 array holding 850
# along a `level` dimension. drop_vars replaces the deprecated Dataset.drop.
ensemble_forecast_T850 = (
    ensemble_forecast_T850
    .drop_vars('level')
    .assign(level=xr.DataArray([850], dims='level'))
)

In [18]:
# save as zarr file (mode='w' overwrites an existing store)
# The probabilistic evaluation cell below reads this store via --forecast_path, so the
# write has to be part of a top-to-bottom run.
# NOTE(review): ensemble_forecast_T850 is rebound by later sections; only run this cell
# directly after the Pangu random-ensemble cells above.
ensemble_forecast_T850.to_zarr('Results_CLS/64x32/Errors/pangu_ensemble_forecast_T850.zarr', mode='w')

<xarray.backends.zarr.ZarrStore at 0x7f6d5f61ccf0>

## evaluate ensemble forecast

In [None]:
# Evaluate the locally stored Pangu error ensemble (T850) against ERA5 with the
# `probabilistic` config. No --time_start/--time_stop are passed, so the evaluation
# period is whatever the script uses by default (presumably 2020 - confirm).
!python weatherbench2/scripts/evaluate.py \
  --forecast_path=Results_CLS/64x32/Errors/pangu_ensemble_forecast_T850.zarr \
  --obs_path=gs://weatherbench2/datasets/era5/1959-2022-6h-64x32_equiangular_conservative.zarr \
  --climatology_path=gs://weatherbench2/datasets/era5-hourly-climatology/1990-2019_6h_64x32_equiangular_conservative.zarr \
  --output_dir=Results_CLS/64x32/Errors/probabilistic/ \
  --output_file_prefix=pangu_error_ens_vs_era5_2020_ \
  --input_chunks=init_time=16 \
  --eval_configs=probabilistic \
  --variables=temperature \
  --levels=850 \
  --use_beam=True \
  --fanout=24 \
  --runner=DirectRunner

## evaluate deterministic forecast

In [None]:
# Evaluate the original deterministic Pangu forecast (T850) against ERA5 with the
# `custom_deterministic` config, as the baseline for the error ensembles.
# NOTE(review): `custom_deterministic` is assumed to be defined in the local copy of
# evaluate.py - confirm.
!python weatherbench2/scripts/evaluate.py \
  --forecast_path=gs://weatherbench2/datasets/pangu/2018-2022_0012_64x32_equiangular_conservative.zarr \
  --obs_path=gs://weatherbench2/datasets/era5/1959-2022-6h-64x32_equiangular_conservative.zarr \
  --climatology_path=gs://weatherbench2/datasets/era5-hourly-climatology/1990-2019_6h_64x32_equiangular_conservative.zarr \
  --output_dir=Results_CLS/64x32/Errors/deterministic/ \
  --output_file_prefix=pangu_vs_era5_2020_ \
  --input_chunks=init_time=128 \
  --eval_configs=custom_deterministic \
  --variables=temperature \
  --levels=850 \
  --use_beam=True \
  --fanout=24 \
  --runner=DirectRunner

## calculate ensemble forecast (init_time dependent)

In [11]:
# Build day-of-year-dependent Pangu error ensembles for 2020 T850: each forecast init
# time gets 10 historical bias fields whose init date lies within +/- `window` days of
# the forecast's day of year. One zarr store is written per window length.
pangu_forecast_T850 = (
    xr.open_zarr('gs://weatherbench2/datasets/pangu/2018-2022_0012_64x32_equiangular_conservative.zarr')['temperature']
    .sel(time=slice('2020-01-01', '2020-12-31'), level=850)
)
pangu_errors_T850 = (
    xr.open_dataset('Results_CLS/64x32/Errors/pangu_vs_era5_2018_2021_spatial_temporal_bias.nc')
    .rename({'lead_time': 'prediction_timedelta'})
    .sel(metric='bias', level=850)
)
# Error pool: every init time outside the 2020 target year.
pangu_errors_T850_train = xr.concat(
    [
        pangu_errors_T850['temperature'].sel(init_time=slice(None, '2019-12-31')),
        pangu_errors_T850['temperature'].sel(init_time=slice('2021-01-01', None)),
    ],
    dim='init_time',
)

windows = [5,15,30]

# Period used to wrap day-of-year distances around the turn of the year. 366 is used
# because the 2020 target year is a leap year (day of year runs up to 366).
DAYS_IN_YEAR = 366

# Get the day of the year for each time in the forecast
day_of_year = pangu_forecast_T850['time.dayofyear']
# Day of year of every candidate error; identical for all windows and init times
train_day_of_year = pangu_errors_T850_train['init_time.dayofyear'].values

for window in windows:
    # Initialize an empty list to store ensemble forecasts
    ensemble_forecast_datasets = []

    # Iterate over all time values in pangu_forecast_T850
    for time, doy in zip(pangu_forecast_T850.time, day_of_year):

        # Candidate errors within +/- `window` days of the forecast's day of year.
        # The distance is circular, so early-January forecasts also see late-December
        # errors (and vice versa) instead of a one-sided, truncated window.
        day_offset = np.abs(train_day_of_year - int(doy))
        day_offset = np.minimum(day_offset, DAYS_IN_YEAR - day_offset)
        valid_indices = np.flatnonzero(day_offset <= window)
        random_indices = np.random.choice(valid_indices, size=10)

        sampled_errors = (
            pangu_errors_T850_train.isel(init_time=random_indices)
            # drop_vars is the non-deprecated replacement for DataArray.drop
            .drop_vars(['valid_time', 'time'])
            .reset_index('init_time', drop=True)
            .rename({'init_time': 'number'})
            .assign_coords(number=np.arange(len(random_indices)))
        )

        # Create an ensemble for each time by adding errors
        forecast_at_time = pangu_forecast_T850.sel(time=time)
        forecast_datasets = [
            forecast_at_time + sampled_errors.sel(number=i)
            for i in sampled_errors.number
        ]

        # Concatenate individual forecasts to create an ensemble for the current time
        ensemble_forecast = xr.concat(forecast_datasets, dim='number')

        # Append the ensemble forecast to the list
        ensemble_forecast_datasets.append(ensemble_forecast)

    print('Done:',window)

    # Concatenate all ensemble forecasts along the 'time' dimension, turn to dataset and assign 'level' as index
    ensemble_forecast_T850 = xr.concat(ensemble_forecast_datasets, dim='time')
    ensemble_forecast_T850 = ensemble_forecast_T850.to_dataset()
    ensemble_forecast_T850 = ensemble_forecast_T850.drop_vars('level').assign(level=xr.DataArray([850], dims='level'))
    ensemble_forecast_T850.to_zarr(f'Results_CLS/64x32/Errors/pangu_ensemble_forecast_timedependent_{window}_T850.zarr',mode='w')

Done: 5
Done: 15
Done: 30


In [None]:
# Probabilistic evaluation of the Pangu day-of-year ensemble with window = 5 against ERA5.
!python weatherbench2/scripts/evaluate.py \
  --forecast_path=Results_CLS/64x32/Errors/pangu_ensemble_forecast_timedependent_5_T850.zarr \
  --obs_path=gs://weatherbench2/datasets/era5/1959-2022-6h-64x32_equiangular_conservative.zarr \
  --climatology_path=gs://weatherbench2/datasets/era5-hourly-climatology/1990-2019_6h_64x32_equiangular_conservative.zarr \
  --output_dir=Results_CLS/64x32/Errors/probabilistic/ \
  --output_file_prefix=pangu_error_ens_timedependent_5_vs_era5_2020_T850_ \
  --input_chunks=init_time=128 \
  --eval_configs=probabilistic \
  --variables=temperature \
  --levels=850 \
  --use_beam=True \
  --fanout=24 \
  --runner=DirectRunner

In [None]:
# Probabilistic evaluation of the Pangu day-of-year ensemble with window = 15 against ERA5.
!python weatherbench2/scripts/evaluate.py \
  --forecast_path=Results_CLS/64x32/Errors/pangu_ensemble_forecast_timedependent_15_T850.zarr \
  --obs_path=gs://weatherbench2/datasets/era5/1959-2022-6h-64x32_equiangular_conservative.zarr \
  --climatology_path=gs://weatherbench2/datasets/era5-hourly-climatology/1990-2019_6h_64x32_equiangular_conservative.zarr \
  --output_dir=Results_CLS/64x32/Errors/probabilistic/ \
  --output_file_prefix=pangu_error_ens_timedependent_15_vs_era5_2020_T850_ \
  --input_chunks=init_time=128 \
  --eval_configs=probabilistic \
  --variables=temperature \
  --levels=850 \
  --use_beam=True \
  --fanout=24 \
  --runner=DirectRunner

In [None]:
# Probabilistic evaluation of the Pangu day-of-year ensemble with window = 30 against ERA5.
!python weatherbench2/scripts/evaluate.py \
  --forecast_path=Results_CLS/64x32/Errors/pangu_ensemble_forecast_timedependent_30_T850.zarr \
  --obs_path=gs://weatherbench2/datasets/era5/1959-2022-6h-64x32_equiangular_conservative.zarr \
  --climatology_path=gs://weatherbench2/datasets/era5-hourly-climatology/1990-2019_6h_64x32_equiangular_conservative.zarr \
  --output_dir=Results_CLS/64x32/Errors/probabilistic/ \
  --output_file_prefix=pangu_error_ens_timedependent_30_vs_era5_2020_T850_ \
  --input_chunks=init_time=128 \
  --eval_configs=probabilistic \
  --variables=temperature \
  --levels=850 \
  --use_beam=True \
  --fanout=24 \
  --runner=DirectRunner

## HRES

In [None]:
%%time
# Run the `spatial_temporal_bias` evaluation for HRES 850 hPa temperature against the
# hres_t0 store, init times 2016-01-01 .. 2022-12-21.
# NOTE(review): the output goes to an absolute path under /mnt/sda/Data2/wb2/, while the
# cells below read 'Results_CLS/64x32/Errors/...' relative to the working directory.
!python weatherbench2/scripts/evaluate.py \
    --forecast_path=gs://weatherbench2/datasets/hres/2016-2022-0012-64x32_equiangular_conservative.zarr \
    --obs_path=gs://weatherbench2/datasets/hres_t0/2016-2022-6h-64x32_equiangular_conservative.zarr \
    --climatology_path=gs://weatherbench2/datasets/era5-hourly-climatology/1990-2019_6h_64x32_equiangular_conservative.zarr \
    --time_start=2016-01-01 \
    --time_stop=2022-12-21 \
    --variables=temperature \
    --levels=850 \
    --eval_configs=spatial_temporal_bias \
    --output_dir=/mnt/sda/Data2/wb2/Results_CLS/64x32/Errors/ \
    --output_file_prefix=hres_vs_analysis_2016_2022_temperature_ \
    --input_chunks=init_time=128 \
    --use_beam=True \
    --runner=DirectRunner

In [110]:
# HRES bias fields at 850 hPa, with the lead-time axis renamed to match the forecast store.
hres_errors_T850 = (
    xr.open_dataset('Results_CLS/64x32/Errors/hres_vs_analysis_2016_2022_temperature_spatial_temporal_bias.nc')
    .rename({'lead_time': 'prediction_timedelta'})
    .sel(metric='bias', level=850)
)

In [42]:
%%time
# Load forecast and errors datasets
hres_forecast_T850 = xr.open_zarr('gs://weatherbench2/datasets/hres/2016-2022-0012-64x32_equiangular_conservative.zarr')['temperature'].sel(time=slice('2020-01-01','2020-12-31'), level=850)#.rename({'time': 'init_time'})
hres_errors_T850 = xr.open_dataset('Results_CLS/64x32/Errors/hres_vs_analysis_2016_2022_temperature_spatial_temporal_bias.nc').rename({'lead_time': 'prediction_timedelta'}).sel(metric='bias', level=850)
hres_errors_T850_train = xr.concat([hres_errors_T850['temperature'].sel(init_time=slice('2016-01-01','2019-12-31')), hres_errors_T850['temperature'].sel(init_time=slice('2022-01-01','2022-12-21'))], dim='init_time')

t = hres_errors_T850_train.sizes['init_time']

# Initialize an empty list to store ensemble forecasts
ensemble_forecast_datasets = []

# Iterate over all time values in hres_forecast_T850
for time in hres_forecast_T850.time:
    
    # Select random errors
    random_indices = np.random.choice(t, size=10)
    sampled_errors = (
        hres_errors_T850_train.isel(init_time=random_indices)
        .drop(['valid_time', 'time'])
        .reset_index('init_time', drop=True)
        .rename({'init_time': 'number'})
        .assign_coords(number=np.arange(len(random_indices)))
    )
    
    # Create an ensemble for each time by adding errors
    forecast_datasets = []
    for i in sampled_errors.number:
        modified_forecast = hres_forecast_T850.sel(time=time) + sampled_errors.sel(number=i)
        forecast_datasets.append(modified_forecast)

    # Concatenate individual forecasts to create an ensemble for the current time
    ensemble_forecast = xr.concat(forecast_datasets, dim='number')
    
    # Append the ensemble forecast to the list
    ensemble_forecast_datasets.append(ensemble_forecast)

# Concatenate all ensemble forecasts along the 'time' dimension
ensemble_forecast_T850 = xr.concat(ensemble_forecast_datasets, dim='time')
ensemble_forecast_T850 = ensemble_forecast_T850.to_dataset()
ensemble_forecast_T850 = ensemble_forecast_T850.drop('level').assign(level=xr.DataArray([850], dims='level'))

CPU times: user 44.4 s, sys: 419 ms, total: 44.8 s
Wall time: 44.9 s


In [43]:
# Write the HRES error ensemble to a local zarr store (mode='w' overwrites an existing
# store); the probabilistic evaluation below reads it via --forecast_path.
ensemble_forecast_T850.to_zarr('Results_CLS/64x32/Errors/hres_ensemble_forecast_T850.zarr',mode='w')

<xarray.backends.zarr.ZarrStore at 0x7fca5067ee40>

In [None]:
# Probabilistic evaluation of the HRES error ensemble (T850) against the hres_t0 store.
!python weatherbench2/scripts/evaluate.py \
  --forecast_path=Results_CLS/64x32/Errors/hres_ensemble_forecast_T850.zarr \
  --obs_path=gs://weatherbench2/datasets/hres_t0/2016-2022-6h-64x32_equiangular_conservative.zarr \
  --climatology_path=gs://weatherbench2/datasets/era5-hourly-climatology/1990-2019_6h_64x32_equiangular_conservative.zarr \
  --output_dir=Results_CLS/64x32/Errors/probabilistic/ \
  --output_file_prefix=hres_error_ens_vs_analysis_2020_T850_ \
  --input_chunks=init_time=16 \
  --eval_configs=probabilistic \
  --variables=temperature \
  --levels=850 \
  --use_beam=True \
  --fanout=24 \
  --runner=DirectRunner

In [None]:
# Evaluate the original deterministic HRES forecast (T850) against the hres_t0 store
# with the `custom_deterministic` config, as the baseline for the error ensembles.
!python weatherbench2/scripts/evaluate.py \
  --forecast_path=gs://weatherbench2/datasets/hres/2016-2022-0012-64x32_equiangular_conservative.zarr \
  --obs_path=gs://weatherbench2/datasets/hres_t0/2016-2022-6h-64x32_equiangular_conservative.zarr \
  --climatology_path=gs://weatherbench2/datasets/era5-hourly-climatology/1990-2019_6h_64x32_equiangular_conservative.zarr \
  --output_dir=Results_CLS/64x32/Errors/deterministic/ \
  --output_file_prefix=hres_vs_analysis_2020_T850_ \
  --input_chunks=init_time=128 \
  --eval_configs=custom_deterministic \
  --variables=temperature \
  --levels=850 \
  --use_beam=True \
  --fanout=24 \
  --runner=DirectRunner

In [6]:
# Build day-of-year-dependent HRES error ensembles for 2020 T850: each forecast init
# time gets 10 historical bias fields whose init date lies within +/- `window` days of
# the forecast's day of year. One zarr store is written per window length.
hres_forecast_T850 = (
    xr.open_zarr('gs://weatherbench2/datasets/hres/2016-2022-0012-64x32_equiangular_conservative.zarr')['temperature']
    .sel(time=slice('2020-01-01', '2020-12-31'), level=850)
)
hres_errors_T850 = (
    xr.open_dataset('Results_CLS/64x32/Errors/hres_vs_analysis_2016_2022_temperature_spatial_temporal_bias.nc')
    .rename({'lead_time': 'prediction_timedelta'})
    .sel(metric='bias', level=850)
)
# Error pool: every init time outside 2020.
# NOTE(review): the random HRES ensemble cell uses 2016-2019 + 2022 (2021 excluded),
# whereas this pool includes 2021 - confirm which split is intended.
hres_errors_T850_train = xr.concat(
    [
        hres_errors_T850['temperature'].sel(init_time=slice(None, '2019-12-31')),
        hres_errors_T850['temperature'].sel(init_time=slice('2021-01-01', None)),
    ],
    dim='init_time',
)

windows = [5,15,30]

# Period used to wrap day-of-year distances around the turn of the year. 366 is used
# because the 2020 target year is a leap year (day of year runs up to 366).
DAYS_IN_YEAR = 366

# Get the day of the year for each time in the forecast
day_of_year = hres_forecast_T850['time.dayofyear']
# Day of year of every candidate error; identical for all windows and init times
train_day_of_year = hres_errors_T850_train['init_time.dayofyear'].values

for window in windows:
    # Initialize an empty list to store ensemble forecasts
    ensemble_forecast_datasets = []

    # Iterate over all time values in hres_forecast_T850
    for time, doy in zip(hres_forecast_T850.time, day_of_year):

        # Candidate errors within +/- `window` days of the forecast's day of year.
        # The distance is circular, so early-January forecasts also see late-December
        # errors (and vice versa) instead of a one-sided, truncated window.
        day_offset = np.abs(train_day_of_year - int(doy))
        day_offset = np.minimum(day_offset, DAYS_IN_YEAR - day_offset)
        valid_indices = np.flatnonzero(day_offset <= window)
        random_indices = np.random.choice(valid_indices, size=10)

        sampled_errors = (
            hres_errors_T850_train.isel(init_time=random_indices)
            # drop_vars is the non-deprecated replacement for DataArray.drop
            .drop_vars(['valid_time', 'time'])
            .reset_index('init_time', drop=True)
            .rename({'init_time': 'number'})
            .assign_coords(number=np.arange(len(random_indices)))
        )

        # Create an ensemble for each time by adding errors
        forecast_at_time = hres_forecast_T850.sel(time=time)
        forecast_datasets = [
            forecast_at_time + sampled_errors.sel(number=i)
            for i in sampled_errors.number
        ]

        # Concatenate individual forecasts to create an ensemble for the current time
        ensemble_forecast = xr.concat(forecast_datasets, dim='number')

        # Append the ensemble forecast to the list
        ensemble_forecast_datasets.append(ensemble_forecast)

    print('Done:',window)

    # Concatenate all ensemble forecasts along the 'time' dimension, turn to dataset and assign 'level' as index
    ensemble_forecast_T850 = xr.concat(ensemble_forecast_datasets, dim='time')
    ensemble_forecast_T850 = ensemble_forecast_T850.to_dataset()
    ensemble_forecast_T850 = ensemble_forecast_T850.drop_vars('level').assign(level=xr.DataArray([850], dims='level'))
    ensemble_forecast_T850.to_zarr(f'Results_CLS/64x32/Errors/hres_ensemble_forecast_timedependent_{window}_T850.zarr',mode='w')

Done: 5
Done: 15
Done: 30


In [None]:
# Probabilistic evaluation of the HRES day-of-year ensemble with window = 5.
# NOTE(review): the output prefix says `vs_era5`, but --obs_path points at the hres_t0
# store (the random HRES ensemble uses the prefix `..._vs_analysis_...`).
!python weatherbench2/scripts/evaluate.py \
  --forecast_path=Results_CLS/64x32/Errors/hres_ensemble_forecast_timedependent_5_T850.zarr \
  --obs_path=gs://weatherbench2/datasets/hres_t0/2016-2022-6h-64x32_equiangular_conservative.zarr \
  --climatology_path=gs://weatherbench2/datasets/era5-hourly-climatology/1990-2019_6h_64x32_equiangular_conservative.zarr \
  --output_dir=Results_CLS/64x32/Errors/probabilistic/ \
  --output_file_prefix=hres_error_ens_timedependent_5_vs_era5_2020_T850_ \
  --input_chunks=init_time=32 \
  --eval_configs=probabilistic \
  --variables=temperature \
  --levels=850 \
  --use_beam=True \
  --fanout=24 \
  --runner=DirectRunner

In [None]:
# Probabilistic evaluation of the HRES day-of-year ensemble with window = 15.
# NOTE(review): the output prefix says `vs_era5`, but --obs_path points at the hres_t0 store.
!python weatherbench2/scripts/evaluate.py \
  --forecast_path=Results_CLS/64x32/Errors/hres_ensemble_forecast_timedependent_15_T850.zarr \
  --obs_path=gs://weatherbench2/datasets/hres_t0/2016-2022-6h-64x32_equiangular_conservative.zarr \
  --climatology_path=gs://weatherbench2/datasets/era5-hourly-climatology/1990-2019_6h_64x32_equiangular_conservative.zarr \
  --output_dir=Results_CLS/64x32/Errors/probabilistic/ \
  --output_file_prefix=hres_error_ens_timedependent_15_vs_era5_2020_T850_ \
  --input_chunks=init_time=32 \
  --eval_configs=probabilistic \
  --variables=temperature \
  --levels=850 \
  --use_beam=True \
  --fanout=24 \
  --runner=DirectRunner

In [None]:
# Probabilistic evaluation of the HRES day-of-year ensemble with window = 30.
# NOTE(review): the output prefix says `vs_era5`, but --obs_path points at the hres_t0 store.
!python weatherbench2/scripts/evaluate.py \
  --forecast_path=Results_CLS/64x32/Errors/hres_ensemble_forecast_timedependent_30_T850.zarr \
  --obs_path=gs://weatherbench2/datasets/hres_t0/2016-2022-6h-64x32_equiangular_conservative.zarr \
  --climatology_path=gs://weatherbench2/datasets/era5-hourly-climatology/1990-2019_6h_64x32_equiangular_conservative.zarr \
  --output_dir=Results_CLS/64x32/Errors/probabilistic/ \
  --output_file_prefix=hres_error_ens_timedependent_30_vs_era5_2020_T850_ \
  --input_chunks=init_time=32 \
  --eval_configs=probabilistic \
  --variables=temperature \
  --levels=850 \
  --use_beam=True \
  --fanout=24 \
  --runner=DirectRunner

## GraphCast

In [None]:
%%time
# Run the `spatial_temporal_bias` evaluation for the GraphCast "2018" store
# (T850 against ERA5), init times 2017-11-16 .. 2019-01-31.
# NOTE(review): the output goes to an absolute path under /mnt/sda/Data2/wb2/, while the
# cells below read 'Results_CLS/64x32/Errors/...' relative to the working directory.
!python weatherbench2/scripts/evaluate.py \
    --forecast_path=gs://weatherbench2/datasets/graphcast/2018/date_range_2017-11-16_2019-02-01_12_hours-64x32_equiangular_conservative.zarr \
    --obs_path=gs://weatherbench2/datasets/era5/1959-2023_01_10-6h-64x32_equiangular_conservative.zarr \
    --climatology_path=gs://weatherbench2/datasets/era5-hourly-climatology/1990-2019_6h_64x32_equiangular_conservative.zarr \
    --time_start=2017-11-16 \
    --time_stop=2019-01-31 \
    --variables=temperature \
    --levels=850 \
    --eval_configs=spatial_temporal_bias \
    --output_dir=/mnt/sda/Data2/wb2/Results_CLS/64x32/Errors/ \
    --output_file_prefix=graphcast_vs_era5_2018_2019_temperature_ \
    --input_chunks=init_time=128 \
    --use_beam=True \
    --runner=DirectRunner

In [None]:
%%time
# Run the `spatial_temporal_bias` evaluation for the GraphCast "2020" store
# (T850 against ERA5), init times 2019-11-16 .. 2021-01-31.
# NOTE(review): the output goes to an absolute path under /mnt/sda/Data2/wb2/, while the
# cells below read 'Results_CLS/64x32/Errors/...' relative to the working directory.
!python weatherbench2/scripts/evaluate.py \
    --forecast_path=gs://weatherbench2/datasets/graphcast/2020/date_range_2019-11-16_2021-02-01_12_hours-64x32_equiangular_conservative.zarr \
    --obs_path=gs://weatherbench2/datasets/era5/1959-2023_01_10-6h-64x32_equiangular_conservative.zarr \
    --climatology_path=gs://weatherbench2/datasets/era5-hourly-climatology/1990-2019_6h_64x32_equiangular_conservative.zarr \
    --time_start=2019-11-16 \
    --time_stop=2021-01-31 \
    --variables=temperature \
    --levels=850 \
    --eval_configs=spatial_temporal_bias \
    --output_dir=/mnt/sda/Data2/wb2/Results_CLS/64x32/Errors/ \
    --output_file_prefix=graphcast_vs_era5_2020_2021_temperature_ \
    --input_chunks=init_time=128 \
    --use_beam=True \
    --runner=DirectRunner

In [18]:
# GraphCast bias fields at 850 hPa: the two evaluation outputs are joined along
# init_time and the lead-time axis is renamed to match the forecast store.
graphcast_errors_T850 = (
    xr.concat(
        [
            xr.open_dataset('Results_CLS/64x32/Errors/graphcast_vs_era5_2018_2019_temperature_spatial_temporal_bias.nc'),
            xr.open_dataset('Results_CLS/64x32/Errors/graphcast_vs_era5_2020_2021_temperature_spatial_temporal_bias.nc'),
        ],
        dim='init_time',
    )
    .rename({'lead_time': 'prediction_timedelta'})
    .sel(metric='bias', level=850)
)

In [21]:
# Build a 10-member error-dressed GraphCast ensemble for 2020 T850: each forecast init
# time gets 10 historical bias fields, drawn at random (with replacement) from the init
# times outside 2020, added on top of the deterministic forecast.
# Relies on graphcast_errors_T850 from the previous cell.
graphcast_forecast_T850 = (
    xr.open_zarr('gs://weatherbench2/datasets/graphcast/2020/date_range_2019-11-16_2021-02-01_12_hours-64x32_equiangular_conservative.zarr')['temperature']
    .sel(time=slice('2020-01-01', '2020-12-31'), level=850)
)

# Error pool: every init time outside the 2020 target year.
graphcast_errors_T850_train = xr.concat(
    [
        graphcast_errors_T850['temperature'].sel(init_time=slice(None, '2019-12-31')),
        graphcast_errors_T850['temperature'].sel(init_time=slice('2021-01-01', None)),
    ],
    dim='init_time',
)

t = graphcast_errors_T850_train.sizes['init_time']

# One ensemble (10 members) per forecast init time
ensemble_forecast_datasets = []

for time in graphcast_forecast_T850.time:

    # Indices come from NumPy's global RNG, so they depend on the seed cell and on
    # every random draw executed before this cell.
    random_indices = np.random.choice(t, size=10)
    sampled_errors = (
        graphcast_errors_T850_train.isel(init_time=random_indices)
        # drop_vars is the non-deprecated replacement for DataArray.drop
        .drop_vars(['valid_time', 'time'])
        .reset_index('init_time', drop=True)
        .rename({'init_time': 'number'})
        .assign_coords(number=np.arange(len(random_indices)))
    )

    # Select the forecast once per init time instead of once per member
    forecast_at_time = graphcast_forecast_T850.sel(time=time)
    forecast_datasets = [
        forecast_at_time + sampled_errors.sel(number=i)
        for i in sampled_errors.number
    ]

    # Stack the members along 'number' and keep the result for this init time
    ensemble_forecast = xr.concat(forecast_datasets, dim='number')
    ensemble_forecast_datasets.append(ensemble_forecast)

# Concatenate all ensemble forecasts along the 'time' dimension, convert to a Dataset
# and re-add `level` as a length-1 entry holding 850
ensemble_forecast_T850 = xr.concat(ensemble_forecast_datasets, dim='time')
ensemble_forecast_T850 = ensemble_forecast_T850.to_dataset()
ensemble_forecast_T850 = ensemble_forecast_T850.drop_vars('level').assign(level=xr.DataArray([850], dims='level'))

In [22]:
%%time
# Write the GraphCast error ensemble to a local zarr store (mode='w' overwrites an
# existing store); the probabilistic evaluation below reads it via --forecast_path.
ensemble_forecast_T850.to_zarr('Results_CLS/64x32/Errors/graphcast_ensemble_forecast_T850.zarr',mode='w')

CPU times: user 10min 35s, sys: 2min 51s, total: 13min 27s
Wall time: 10min 9s


<xarray.backends.zarr.ZarrStore at 0x7f4563113190>

In [None]:
%%time
# Probabilistic evaluation of the GraphCast error ensemble (T850) against ERA5.
# Unlike most other evaluation cells, no --fanout is passed here.
!python weatherbench2/scripts/evaluate.py \
    --forecast_path=Results_CLS/64x32/Errors/graphcast_ensemble_forecast_T850.zarr \
    --obs_path=gs://weatherbench2/datasets/era5/1959-2023_01_10-6h-64x32_equiangular_conservative.zarr \
    --climatology_path=gs://weatherbench2/datasets/era5-hourly-climatology/1990-2019_6h_64x32_equiangular_conservative.zarr \
    --variables=temperature \
    --levels=850 \
    --eval_configs=probabilistic \
    --output_dir=Results_CLS/64x32/Errors/probabilistic/ \
    --output_file_prefix=graphcast_error_ens_vs_era5_2020_T850_ \
    --input_chunks=init_time=16 \
    --use_beam=True \
    --runner=DirectRunner

In [None]:
# Evaluate the original deterministic GraphCast forecast (T850) against ERA5.
# NOTE(review): this cell uses --eval_configs=deterministic, whereas the Pangu and HRES
# baselines use `custom_deterministic`; it also reads the 1959-2022 ERA5 store, while the
# GraphCast bias and random-ensemble evaluations use the 1959-2023_01_10 store - confirm
# both are intended.
!python weatherbench2/scripts/evaluate.py \
  --forecast_path=gs://weatherbench2/datasets/graphcast/2020/date_range_2019-11-16_2021-02-01_12_hours-64x32_equiangular_conservative.zarr \
  --obs_path=gs://weatherbench2/datasets/era5/1959-2022-6h-64x32_equiangular_conservative.zarr \
  --climatology_path=gs://weatherbench2/datasets/era5-hourly-climatology/1990-2019_6h_64x32_equiangular_conservative.zarr \
  --output_dir=Results_CLS/64x32/Errors/deterministic/ \
  --output_file_prefix=graphcast_vs_era5_2020_ \
  --input_chunks=init_time=128 \
  --eval_configs=deterministic \
  --variables=temperature \
  --levels=850 \
  --use_beam=True \
  --fanout=24 \
  --runner=DirectRunner

In [21]:
# Build day-of-year-dependent GraphCast error ensembles for 2020 T850: each forecast
# init time gets 10 historical bias fields whose init date lies within +/- `window` days
# of the forecast's day of year. One zarr store is written per window length.
graphcast_forecast_T850 = (
    xr.open_zarr('gs://weatherbench2/datasets/graphcast/2020/date_range_2019-11-16_2021-02-01_12_hours-64x32_equiangular_conservative.zarr')['temperature']
    .sel(time=slice('2020-01-01', '2020-12-31'), level=850)
)
graphcast_errors_T850 = (
    xr.concat(
        [
            xr.open_dataset('Results_CLS/64x32/Errors/graphcast_vs_era5_2018_2019_temperature_spatial_temporal_bias.nc'),
            xr.open_dataset('Results_CLS/64x32/Errors/graphcast_vs_era5_2020_2021_temperature_spatial_temporal_bias.nc'),
        ],
        dim='init_time',
    )
    .rename({'lead_time': 'prediction_timedelta'})
    .sel(metric='bias', level=850)
)
# Error pool: every init time outside the 2020 target year.
graphcast_errors_T850_train = xr.concat(
    [
        graphcast_errors_T850['temperature'].sel(init_time=slice(None, '2019-12-31')),
        graphcast_errors_T850['temperature'].sel(init_time=slice('2021-01-01', None)),
    ],
    dim='init_time',
)

windows = [5,15,30]

# Period used to wrap day-of-year distances around the turn of the year. 366 is used
# because the 2020 target year is a leap year (day of year runs up to 366).
DAYS_IN_YEAR = 366

# Get the day of the year for each time in the forecast
day_of_year = graphcast_forecast_T850['time.dayofyear']
# Day of year of every candidate error; identical for all windows and init times
train_day_of_year = graphcast_errors_T850_train['init_time.dayofyear'].values

for window in windows:
    # Initialize an empty list to store ensemble forecasts
    ensemble_forecast_datasets = []

    # Iterate over all time values in graphcast_forecast_T850
    for time, doy in zip(graphcast_forecast_T850.time, day_of_year):

        # Candidate errors within +/- `window` days of the forecast's day of year.
        # The distance is circular, so early-January forecasts also see late-December
        # errors (and vice versa) instead of a one-sided, truncated window.
        day_offset = np.abs(train_day_of_year - int(doy))
        day_offset = np.minimum(day_offset, DAYS_IN_YEAR - day_offset)
        valid_indices = np.flatnonzero(day_offset <= window)
        random_indices = np.random.choice(valid_indices, size=10)

        sampled_errors = (
            graphcast_errors_T850_train.isel(init_time=random_indices)
            # drop_vars is the non-deprecated replacement for DataArray.drop
            .drop_vars(['valid_time', 'time'])
            .reset_index('init_time', drop=True)
            .rename({'init_time': 'number'})
            .assign_coords(number=np.arange(len(random_indices)))
        )

        # Create an ensemble for each time by adding errors
        forecast_at_time = graphcast_forecast_T850.sel(time=time)
        forecast_datasets = [
            forecast_at_time + sampled_errors.sel(number=i)
            for i in sampled_errors.number
        ]

        # Concatenate individual forecasts to create an ensemble for the current time
        ensemble_forecast = xr.concat(forecast_datasets, dim='number')

        # Append the ensemble forecast to the list
        ensemble_forecast_datasets.append(ensemble_forecast)

    print('Done:',window)

    # Concatenate all ensemble forecasts along the 'time' dimension, turn to dataset and assign 'level' as index
    ensemble_forecast_T850 = xr.concat(ensemble_forecast_datasets, dim='time')
    ensemble_forecast_T850 = ensemble_forecast_T850.to_dataset()
    ensemble_forecast_T850 = ensemble_forecast_T850.drop_vars('level').assign(level=xr.DataArray([850], dims='level'))
    ensemble_forecast_T850.to_zarr(f'Results_CLS/64x32/Errors/graphcast_ensemble_forecast_timedependent_{window}_T850.zarr',mode='w')

Done: 5
Done: 15
Done: 30


In [None]:
# Probabilistic evaluation of the GraphCast day-of-year ensemble with window = 5 against ERA5.
!python weatherbench2/scripts/evaluate.py \
  --forecast_path=Results_CLS/64x32/Errors/graphcast_ensemble_forecast_timedependent_5_T850.zarr \
  --obs_path=gs://weatherbench2/datasets/era5/1959-2022-6h-64x32_equiangular_conservative.zarr \
  --climatology_path=gs://weatherbench2/datasets/era5-hourly-climatology/1990-2019_6h_64x32_equiangular_conservative.zarr \
  --output_dir=Results_CLS/64x32/Errors/probabilistic/ \
  --output_file_prefix=graphcast_error_ens_timedependent_5_vs_era5_2020_T850_ \
  --input_chunks=init_time=128 \
  --eval_configs=probabilistic \
  --variables=temperature \
  --levels=850 \
  --use_beam=True \
  --fanout=24 \
  --runner=DirectRunner

In [None]:
# Probabilistic evaluation of the GraphCast day-of-year ensemble with window = 15 against ERA5.
!python weatherbench2/scripts/evaluate.py \
  --forecast_path=Results_CLS/64x32/Errors/graphcast_ensemble_forecast_timedependent_15_T850.zarr \
  --obs_path=gs://weatherbench2/datasets/era5/1959-2022-6h-64x32_equiangular_conservative.zarr \
  --climatology_path=gs://weatherbench2/datasets/era5-hourly-climatology/1990-2019_6h_64x32_equiangular_conservative.zarr \
  --output_dir=Results_CLS/64x32/Errors/probabilistic/ \
  --output_file_prefix=graphcast_error_ens_timedependent_15_vs_era5_2020_T850_ \
  --input_chunks=init_time=128 \
  --eval_configs=probabilistic \
  --variables=temperature \
  --levels=850 \
  --use_beam=True \
  --fanout=24 \
  --runner=DirectRunner

In [None]:
# Probabilistic evaluation of the GraphCast day-of-year ensemble with window = 30 against ERA5.
!python weatherbench2/scripts/evaluate.py \
  --forecast_path=Results_CLS/64x32/Errors/graphcast_ensemble_forecast_timedependent_30_T850.zarr \
  --obs_path=gs://weatherbench2/datasets/era5/1959-2022-6h-64x32_equiangular_conservative.zarr \
  --climatology_path=gs://weatherbench2/datasets/era5-hourly-climatology/1990-2019_6h_64x32_equiangular_conservative.zarr \
  --output_dir=Results_CLS/64x32/Errors/probabilistic/ \
  --output_file_prefix=graphcast_error_ens_timedependent_30_vs_era5_2020_T850_ \
  --input_chunks=init_time=128 \
  --eval_configs=probabilistic \
  --variables=temperature \
  --levels=850 \
  --use_beam=True \
  --fanout=24 \
  --runner=DirectRunner

# 50 Member

## Pangu

In [3]:
%%time
# Build a 50-member pseudo-ensemble for the 2020 Pangu temperature forecasts at
# level 850 by adding randomly drawn bias fields to each deterministic forecast.

# Load forecast and errors datasets
pangu_forecast_T850 = (
    xr.open_zarr('gs://weatherbench2/datasets/pangu/2018-2022_0012_64x32_equiangular_conservative.zarr')['temperature']
    .sel(time=slice('2020-01-01', '2020-12-31'), level=850)
)
pangu_errors_T850 = (
    xr.open_dataset('Results_CLS/64x32/Errors/pangu_vs_era5_2018_2021_spatial_temporal_bias.nc')
    .rename({'lead_time': 'prediction_timedelta'})
    .sel(metric='bias', level=850)
)
# Sampling pool: bias fields from init times before and after the 2020 forecast
# year, so no 2020 error is added to a 2020 forecast.
pangu_errors_T850_train = xr.concat(
    [
        pangu_errors_T850['temperature'].sel(init_time=slice('2018-01-01', '2019-12-31')),
        pangu_errors_T850['temperature'].sel(init_time=slice('2021-01-01', '2021-12-21')),
    ],
    dim='init_time',
)

t = pangu_errors_T850_train.sizes['init_time']

# One ensemble (all members) per forecast initialisation time
ensemble_forecast_datasets = []

for time in pangu_forecast_T850.time:

    # Draw 50 error fields (np.random.choice samples with replacement by default).
    # Uses the global NumPy RNG, so results depend on the RNG state at run time.
    random_indices = np.random.choice(t, size=50)
    sampled_errors = (
        pangu_errors_T850_train.isel(init_time=random_indices)
        .drop_vars(['valid_time', 'time'])  # drop_vars replaces the deprecated .drop
        .reset_index('init_time', drop=True)
        .rename({'init_time': 'number'})
        .assign_coords(number=np.arange(len(random_indices)))
    )

    # Broadcasting adds every sampled error to the forecast in a single operation;
    # the transpose puts `number` first, i.e. the layout produced by concatenating
    # the members along a new `number` dimension.
    ensemble_forecast = (pangu_forecast_T850.sel(time=time) + sampled_errors).transpose('number', ...)

    ensemble_forecast_datasets.append(ensemble_forecast)

# Concatenate all ensemble forecasts along the 'time' dimension
ensemble_forecast_T850 = xr.concat(ensemble_forecast_datasets, dim='time')
ensemble_forecast_T850 = ensemble_forecast_T850.to_dataset()
# Replace the scalar `level` coordinate with a length-1 `level` variable holding 850.
ensemble_forecast_T850 = ensemble_forecast_T850.drop_vars('level').assign(level=xr.DataArray([850], dims='level'))

CPU times: user 3min 31s, sys: 2.5 s, total: 3min 34s
Wall time: 3min 44s


In [9]:
%%time
# Write the current `ensemble_forecast_T850` to the Pangu Zarr store; mode='w'
# overwrites an existing store. The variable name is reused by the HRES and
# GraphCast cells, so run this right after the Pangu ensemble cell.
ensemble_forecast_T850.to_zarr('Results_CLS/64x32/Errors/pangu_50_ensemble_forecast_T850.zarr',mode='w')

CPU times: user 49min 10s, sys: 14min 23s, total: 1h 3min 34s
Wall time: 44min 51s


<xarray.backends.zarr.ZarrStore at 0x7fa7bfca6890>

In [None]:
# Run the `probabilistic` eval config on the stored 50-member Pangu ensemble
# (temperature, level 850) against the ERA5 obs path; results are written to
# Results_CLS/64x32/Errors/probabilistic/ with the prefix given below.
!python weatherbench2/scripts/evaluate.py \
  --forecast_path=Results_CLS/64x32/Errors/pangu_50_ensemble_forecast_T850.zarr \
  --obs_path=gs://weatherbench2/datasets/era5/1959-2022-6h-64x32_equiangular_conservative.zarr \
  --climatology_path=gs://weatherbench2/datasets/era5-hourly-climatology/1990-2019_6h_64x32_equiangular_conservative.zarr \
  --output_dir=Results_CLS/64x32/Errors/probabilistic/ \
  --output_file_prefix=pangu_error_ens_50_vs_era5_2020_ \
  --input_chunks=init_time=16 \
  --eval_configs=probabilistic \
  --variables=temperature \
  --levels=850 \
  --use_beam=True \
  --fanout=24 \
  --runner=DirectRunner

## HRES

In [4]:
%%time
# Build a 50-member pseudo-ensemble for the 2020 HRES temperature forecasts at
# level 850 by adding randomly drawn bias fields to each deterministic forecast.

# Load forecast and errors datasets
hres_forecast_T850 = (
    xr.open_zarr('gs://weatherbench2/datasets/hres/2016-2022-0012-64x32_equiangular_conservative.zarr')['temperature']
    .sel(time=slice('2020-01-01', '2020-12-31'), level=850)
)
hres_errors_T850 = (
    xr.open_dataset('Results_CLS/64x32/Errors/hres_vs_analysis_2016_2022_temperature_spatial_temporal_bias.nc')
    .rename({'lead_time': 'prediction_timedelta'})
    .sel(metric='bias', level=850)
)
# Sampling pool: bias fields from 2016-2019 plus 2022.
# NOTE(review): 2021 is left out as well as the 2020 forecast year — confirm this is intended.
hres_errors_T850_train = xr.concat(
    [
        hres_errors_T850['temperature'].sel(init_time=slice('2016-01-01', '2019-12-31')),
        hres_errors_T850['temperature'].sel(init_time=slice('2022-01-01', '2022-12-21')),
    ],
    dim='init_time',
)

t = hres_errors_T850_train.sizes['init_time']

# One ensemble (all members) per forecast initialisation time
ensemble_forecast_datasets = []

for time in hres_forecast_T850.time:

    # Draw 50 error fields (np.random.choice samples with replacement by default).
    # Uses the global NumPy RNG, so results depend on the RNG state at run time.
    random_indices = np.random.choice(t, size=50)
    sampled_errors = (
        hres_errors_T850_train.isel(init_time=random_indices)
        .drop_vars(['valid_time', 'time'])  # drop_vars replaces the deprecated .drop
        .reset_index('init_time', drop=True)
        .rename({'init_time': 'number'})
        .assign_coords(number=np.arange(len(random_indices)))
    )

    # Broadcasting adds every sampled error to the forecast in a single operation;
    # the transpose puts `number` first, i.e. the layout produced by concatenating
    # the members along a new `number` dimension.
    ensemble_forecast = (hres_forecast_T850.sel(time=time) + sampled_errors).transpose('number', ...)

    ensemble_forecast_datasets.append(ensemble_forecast)

# Concatenate all ensemble forecasts along the 'time' dimension
ensemble_forecast_T850 = xr.concat(ensemble_forecast_datasets, dim='time')
ensemble_forecast_T850 = ensemble_forecast_T850.to_dataset()
# Replace the scalar `level` coordinate with a length-1 `level` variable holding 850.
ensemble_forecast_T850 = ensemble_forecast_T850.drop_vars('level').assign(level=xr.DataArray([850], dims='level'))

CPU times: user 3min 32s, sys: 3.36 s, total: 3min 35s
Wall time: 3min 47s


In [5]:
# Write the current `ensemble_forecast_T850` to the HRES Zarr store; mode='w'
# overwrites an existing store. The variable name is reused by the Pangu and
# GraphCast cells, so run this right after the HRES ensemble cell.
ensemble_forecast_T850.to_zarr('Results_CLS/64x32/Errors/hres_50_ensemble_forecast_T850.zarr',mode='w')

<xarray.backends.zarr.ZarrStore at 0x7f34ef147a50>

In [None]:
# Run the `probabilistic` eval config on the stored 50-member HRES ensemble
# (temperature, level 850). Unlike the Pangu/GraphCast runs, the obs path here
# is the hres_t0 dataset rather than ERA5.
!python weatherbench2/scripts/evaluate.py \
  --forecast_path=Results_CLS/64x32/Errors/hres_50_ensemble_forecast_T850.zarr \
  --obs_path=gs://weatherbench2/datasets/hres_t0/2016-2022-6h-64x32_equiangular_conservative.zarr \
  --climatology_path=gs://weatherbench2/datasets/era5-hourly-climatology/1990-2019_6h_64x32_equiangular_conservative.zarr \
  --output_dir=Results_CLS/64x32/Errors/probabilistic/ \
  --output_file_prefix=hres_error_ens_50_vs_analysis_2020_T850_ \
  --input_chunks=init_time=16 \
  --eval_configs=probabilistic \
  --variables=temperature \
  --levels=850 \
  --use_beam=True \
  --fanout=24 \
  --runner=DirectRunner

In [None]:
# Evaluate the NeuralGCM ensemble (neuralgcm_ens 2020 dataset) against ERA5 with
# the `probabilistic` eval config. The ensemble dimension is named `realization`
# in this dataset, and no --levels filter is passed.
# NOTE(review): output_dir is an absolute path without the `Errors/` component
# used by the other probabilistic runs — confirm downstream cells read from here.
!python weatherbench2/scripts/evaluate.py \
  --forecast_path=gs://weatherbench2/datasets/neuralgcm_ens/2020-64x32_equiangular_conservative.zarr \
  --obs_path=gs://weatherbench2/datasets/era5/1959-2022-6h-64x32_equiangular_conservative.zarr  \
  --climatology_path=gs://weatherbench2/datasets/era5-hourly-climatology/1990-2019_6h_64x32_equiangular_conservative.zarr \
  --output_dir=/mnt/sda/Data2/wb2/Results_CLS/64x32/probabilistic/ \
  --output_file_prefix=neuralgcm_ens_vs_era5_2020_ \
  --input_chunks=init_time=16 \
  --eval_configs=probabilistic \
  --variables=geopotential,temperature,specific_humidity,wind_speed \
  --ensemble_dim=realization \
  --use_beam=True \
  --fanout=24 \
  --runner=DirectRunner

## GraphCast

In [24]:
# Build a 50-member pseudo-ensemble for the 2020 GraphCast temperature forecasts
# at level 850 by adding randomly drawn error fields to each deterministic forecast.
graphcast_forecast_T850 = (
    xr.open_zarr('gs://weatherbench2/datasets/graphcast/2020/date_range_2019-11-16_2021-02-01_12_hours-64x32_equiangular_conservative.zarr')['temperature']
    .sel(time=slice('2020-01-01', '2020-12-31'), level=850)
)

# NOTE(review): `graphcast_errors_T850` is not defined in this cell; it has to
# exist in the kernel already (presumably loaded by an earlier cell — confirm).
# Sampling pool: error fields from every init time outside the 2020 forecast year.
graphcast_errors_T850_train = xr.concat(
    [
        graphcast_errors_T850['temperature'].sel(init_time=slice(None, '2019-12-31')),
        graphcast_errors_T850['temperature'].sel(init_time=slice('2021-01-01', None)),
    ],
    dim='init_time',
)

t = graphcast_errors_T850_train.sizes['init_time']

# One ensemble (all members) per forecast initialisation time
ensemble_forecast_datasets = []

for time in graphcast_forecast_T850.time:

    # Draw 50 error fields (np.random.choice samples with replacement by default).
    # Uses the global NumPy RNG, so results depend on the RNG state at run time.
    random_indices = np.random.choice(t, size=50)
    sampled_errors = (
        graphcast_errors_T850_train.isel(init_time=random_indices)
        .drop_vars(['valid_time', 'time'])  # drop_vars replaces the deprecated .drop
        .reset_index('init_time', drop=True)
        .rename({'init_time': 'number'})
        .assign_coords(number=np.arange(len(random_indices)))
    )

    # Broadcasting adds every sampled error to the forecast in a single operation;
    # the transpose puts `number` first, i.e. the layout produced by concatenating
    # the members along a new `number` dimension.
    ensemble_forecast = (graphcast_forecast_T850.sel(time=time) + sampled_errors).transpose('number', ...)

    ensemble_forecast_datasets.append(ensemble_forecast)

# Concatenate all ensemble forecasts along the 'time' dimension
ensemble_forecast_T850 = xr.concat(ensemble_forecast_datasets, dim='time')
ensemble_forecast_T850 = ensemble_forecast_T850.to_dataset()
# Replace the scalar `level` coordinate with a length-1 `level` variable holding 850.
ensemble_forecast_T850 = ensemble_forecast_T850.drop_vars('level').assign(level=xr.DataArray([850], dims='level'))

In [25]:
%%time
# Write the current `ensemble_forecast_T850` to the GraphCast Zarr store; mode='w'
# overwrites an existing store. The variable name is reused by the Pangu and
# HRES cells, so run this right after the GraphCast ensemble cell.
ensemble_forecast_T850.to_zarr('Results_CLS/64x32/Errors/graphcast_50_ensemble_forecast_T850.zarr',mode='w')

CPU times: user 49min 34s, sys: 14min 48s, total: 1h 4min 23s
Wall time: 45min 52s


<xarray.backends.zarr.ZarrStore at 0x7f45625f00b0>

In [None]:
# Run the `probabilistic` eval config on the stored 50-member GraphCast ensemble
# (temperature, level 850) against the ERA5 1959-2023_01_10 obs dataset; results
# are written to Results_CLS/64x32/Errors/probabilistic/ with the prefix below.
!python weatherbench2/scripts/evaluate.py \
  --forecast_path=Results_CLS/64x32/Errors/graphcast_50_ensemble_forecast_T850.zarr \
  --obs_path=gs://weatherbench2/datasets/era5/1959-2023_01_10-6h-64x32_equiangular_conservative.zarr \
  --climatology_path=gs://weatherbench2/datasets/era5-hourly-climatology/1990-2019_6h_64x32_equiangular_conservative.zarr \
  --output_dir=Results_CLS/64x32/Errors/probabilistic/ \
  --output_file_prefix=graphcast_error_ens_50_vs_era5_2020_T850_ \
  --input_chunks=init_time=16 \
  --eval_configs=probabilistic \
  --variables=temperature \
  --levels=850 \
  --use_beam=True \
  --fanout=24 \
  --runner=DirectRunner

# T2M

## Pangu

In [None]:
%%time
# Pangu: run the `spatial_temporal_bias` eval config for 2m_temperature against
# ERA5 over 2018-01-01..2022-12-31. The resulting file is the error source for
# the Pangu T2M ensemble.
# NOTE(review): --time_stop is 2022-12-31 but the output prefix says 2018_2021 —
# confirm which period is intended.
!python weatherbench2/scripts/evaluate.py \
    --forecast_path=gs://weatherbench2/datasets/pangu/2018-2022_0012_64x32_equiangular_conservative.zarr \
    --obs_path=gs://weatherbench2/datasets/era5/1959-2023_01_10-6h-64x32_equiangular_conservative.zarr \
    --climatology_path=gs://weatherbench2/datasets/era5-hourly-climatology/1990-2019_6h_64x32_equiangular_conservative.zarr \
    --time_start=2018-01-01 \
    --time_stop=2022-12-31 \
    --variables=2m_temperature \
    --eval_configs=spatial_temporal_bias \
    --output_dir=/mnt/sda/Data2/wb2/Results_CLS/64x32/Errors/ \
    --output_file_prefix=pangu_vs_era5_2018_2021_2m_temperature_ \
    --input_chunks=init_time=128 \
    --use_beam=True \
    --runner=DirectRunner

In [18]:
# Build a 50-member pseudo-ensemble for the 2020 Pangu 2m_temperature forecasts
# by adding randomly drawn bias fields to each deterministic forecast.

# Load forecast and errors datasets
pangu_forecast_t2m = (
    xr.open_zarr('gs://weatherbench2/datasets/pangu/2018-2022_0012_64x32_equiangular_conservative.zarr')['2m_temperature']
    .sel(time=slice('2020-01-01', '2020-12-31'))
)
pangu_errors_t2m = (
    xr.open_dataset('Results_CLS/64x32/Errors/pangu_vs_era5_2018_2021_2m_temperature_spatial_temporal_bias.nc')
    .rename({'lead_time': 'prediction_timedelta'})
    .sel(metric='bias')
)
# Sampling pool: bias fields from every init time outside the 2020 forecast year.
pangu_errors_t2m_train = xr.concat(
    [
        pangu_errors_t2m['2m_temperature'].sel(init_time=slice(None, '2019-12-31')),
        pangu_errors_t2m['2m_temperature'].sel(init_time=slice('2021-01-01', None)),
    ],
    dim='init_time',
)

t = pangu_errors_t2m_train.sizes['init_time']

# One ensemble (all members) per forecast initialisation time
ensemble_forecast_datasets = []

for time in tqdm(pangu_forecast_t2m.time):

    # Draw 50 error fields (np.random.choice samples with replacement by default).
    # Uses the global NumPy RNG, so results depend on the RNG state at run time.
    random_indices = np.random.choice(t, size=50)
    sampled_errors = (
        pangu_errors_t2m_train.isel(init_time=random_indices)
        .drop_vars(['valid_time', 'time'])  # drop_vars replaces the deprecated .drop
        .reset_index('init_time', drop=True)
        .rename({'init_time': 'number'})
        .assign_coords(number=np.arange(len(random_indices)))
    )

    # Broadcasting adds every sampled error to the forecast in a single operation;
    # the transpose puts `number` first, i.e. the layout produced by concatenating
    # the members along a new `number` dimension.
    ensemble_forecast = (pangu_forecast_t2m.sel(time=time) + sampled_errors).transpose('number', ...)

    ensemble_forecast_datasets.append(ensemble_forecast)

# Concatenate all ensemble forecasts along the 'time' dimension
ensemble_forecast_t2m = xr.concat(ensemble_forecast_datasets, dim='time')
ensemble_forecast_t2m = ensemble_forecast_t2m.to_dataset()

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 732/732 [03:36<00:00,  3.38it/s]


In [22]:
%%time
# Write the current `ensemble_forecast_t2m` to the Pangu T2M Zarr store; mode='w'
# overwrites an existing store. The variable name is reused by the HRES and
# GraphCast T2M cells, so run this right after the Pangu T2M ensemble cell.
ensemble_forecast_t2m.to_zarr('Results_CLS/64x32/Errors/pangu_50_ensemble_forecast_T2M.zarr',mode='w')

CPU times: user 43min 20s, sys: 11min 8s, total: 54min 29s
Wall time: 44min 52s


<xarray.backends.zarr.ZarrStore at 0x7f9d507f2270>

In [None]:
# Run the `probabilistic` eval config on the stored 50-member Pangu T2M ensemble
# (2m_temperature) against the ERA5 obs path; results are written to
# Results_CLS/64x32/Errors/probabilistic/ with the prefix given below.
!python weatherbench2/scripts/evaluate.py \
  --forecast_path=Results_CLS/64x32/Errors/pangu_50_ensemble_forecast_T2M.zarr \
  --obs_path=gs://weatherbench2/datasets/era5/1959-2023_01_10-6h-64x32_equiangular_conservative.zarr \
  --climatology_path=gs://weatherbench2/datasets/era5-hourly-climatology/1990-2019_6h_64x32_equiangular_conservative.zarr \
  --output_dir=Results_CLS/64x32/Errors/probabilistic/ \
  --output_file_prefix=pangu_error_ens_50_T2M_vs_era5_2020_ \
  --input_chunks=init_time=32 \
  --eval_configs=probabilistic \
  --variables=2m_temperature \
  --use_beam=True \
  --fanout=24 \
  --runner=DirectRunner

## HRES

In [None]:
%%time
# HRES: run the `spatial_temporal_bias` eval config for 2m_temperature against
# the hres_t0 obs dataset over 2016-01-01..2022-12-21. The resulting file is the
# error source for the HRES T2M ensemble.
!python weatherbench2/scripts/evaluate.py \
    --forecast_path=gs://weatherbench2/datasets/hres/2016-2022-0012-64x32_equiangular_conservative.zarr \
    --obs_path=gs://weatherbench2/datasets/hres_t0/2016-2022-6h-64x32_equiangular_conservative.zarr \
    --climatology_path=gs://weatherbench2/datasets/era5-hourly-climatology/1990-2019_6h_64x32_equiangular_conservative.zarr \
    --time_start=2016-01-01 \
    --time_stop=2022-12-21 \
    --variables=2m_temperature \
    --eval_configs=spatial_temporal_bias \
    --output_dir=/mnt/sda/Data2/wb2/Results_CLS/64x32/Errors/ \
    --output_file_prefix=hres_vs_analysis_2016_2022_2m_temperature_ \
    --input_chunks=init_time=128 \
    --use_beam=True \
    --runner=DirectRunner

In [24]:
# Build a 50-member pseudo-ensemble for the 2020 HRES 2m_temperature forecasts
# by adding randomly drawn bias fields to each deterministic forecast.

# Load forecast and errors datasets
hres_forecast_t2m = (
    xr.open_zarr('gs://weatherbench2/datasets/hres/2016-2022-0012-64x32_equiangular_conservative.zarr')['2m_temperature']
    .sel(time=slice('2020-01-01', '2020-12-31'))
)
hres_errors_t2m = (
    xr.open_dataset('Results_CLS/64x32/Errors/hres_vs_analysis_2016_2022_2m_temperature_spatial_temporal_bias.nc')
    .rename({'lead_time': 'prediction_timedelta'})
    .sel(metric='bias')
)
# Sampling pool: bias fields from every init time outside the 2020 forecast year.
hres_errors_t2m_train = xr.concat(
    [
        hres_errors_t2m['2m_temperature'].sel(init_time=slice(None, '2019-12-31')),
        hres_errors_t2m['2m_temperature'].sel(init_time=slice('2021-01-01', None)),
    ],
    dim='init_time',
)

t = hres_errors_t2m_train.sizes['init_time']

# One ensemble (all members) per forecast initialisation time
ensemble_forecast_datasets = []

for time in tqdm(hres_forecast_t2m.time):

    # Draw 50 error fields (np.random.choice samples with replacement by default).
    # Uses the global NumPy RNG, so results depend on the RNG state at run time.
    random_indices = np.random.choice(t, size=50)
    sampled_errors = (
        hres_errors_t2m_train.isel(init_time=random_indices)
        .drop_vars(['valid_time', 'time'])  # drop_vars replaces the deprecated .drop
        .reset_index('init_time', drop=True)
        .rename({'init_time': 'number'})
        .assign_coords(number=np.arange(len(random_indices)))
    )

    # Broadcasting adds every sampled error to the forecast in a single operation;
    # the transpose puts `number` first, i.e. the layout produced by concatenating
    # the members along a new `number` dimension.
    ensemble_forecast = (hres_forecast_t2m.sel(time=time) + sampled_errors).transpose('number', ...)

    ensemble_forecast_datasets.append(ensemble_forecast)

# Concatenate all ensemble forecasts along the 'time' dimension
ensemble_forecast_t2m = xr.concat(ensemble_forecast_datasets, dim='time')
ensemble_forecast_t2m = ensemble_forecast_t2m.to_dataset()

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 732/732 [03:22<00:00,  3.62it/s]


In [25]:
%%time
# Write the current `ensemble_forecast_t2m` to the HRES T2M Zarr store; mode='w'
# overwrites an existing store. The variable name is reused by the Pangu and
# GraphCast T2M cells, so run this right after the HRES T2M ensemble cell.
ensemble_forecast_t2m.to_zarr('Results_CLS/64x32/Errors/hres_50_ensemble_forecast_T2M.zarr',mode='w')

CPU times: user 44min 33s, sys: 12min 20s, total: 56min 54s
Wall time: 46min 46s


<xarray.backends.zarr.ZarrStore at 0x7f9c7770a5f0>

In [None]:
# Run the `probabilistic` eval config on the stored 50-member HRES T2M ensemble
# (2m_temperature). As in the HRES T850 run, the obs path is the hres_t0
# dataset rather than ERA5.
!python weatherbench2/scripts/evaluate.py \
  --forecast_path=Results_CLS/64x32/Errors/hres_50_ensemble_forecast_T2M.zarr \
  --obs_path=gs://weatherbench2/datasets/hres_t0/2016-2022-6h-64x32_equiangular_conservative.zarr \
  --climatology_path=gs://weatherbench2/datasets/era5-hourly-climatology/1990-2019_6h_64x32_equiangular_conservative.zarr \
  --output_dir=Results_CLS/64x32/Errors/probabilistic/ \
  --output_file_prefix=hres_error_ens_50_T2M_vs_analysis_2020_ \
  --input_chunks=init_time=64 \
  --eval_configs=probabilistic \
  --variables=2m_temperature \
  --use_beam=True \
  --fanout=24 \
  --runner=DirectRunner

## GraphCast

In [None]:
%%time
# GraphCast 1: run the `spatial_temporal_bias` eval config for 2m_temperature
# on the graphcast/2018 forecast dataset against ERA5, over
# 2017-11-16..2019-01-31. This is the first of the two error files that the
# GraphCast T2M ensemble cell concatenates.
!python weatherbench2/scripts/evaluate.py \
    --forecast_path=gs://weatherbench2/datasets/graphcast/2018/date_range_2017-11-16_2019-02-01_12_hours-64x32_equiangular_conservative.zarr \
    --obs_path=gs://weatherbench2/datasets/era5/1959-2023_01_10-6h-64x32_equiangular_conservative.zarr \
    --climatology_path=gs://weatherbench2/datasets/era5-hourly-climatology/1990-2019_6h_64x32_equiangular_conservative.zarr \
    --time_start=2017-11-16 \
    --time_stop=2019-01-31 \
    --variables=2m_temperature \
    --eval_configs=spatial_temporal_bias \
    --output_dir=/mnt/sda/Data2/wb2/Results_CLS/64x32/Errors/ \
    --output_file_prefix=graphcast_vs_era5_2018_2019_2m_temperature_ \
    --input_chunks=init_time=128 \
    --use_beam=True \
    --runner=DirectRunner

In [None]:
%%time
# GraphCast 2: run the `spatial_temporal_bias` eval config for 2m_temperature
# on the graphcast/2020 forecast dataset against ERA5, over
# 2019-11-16..2021-01-31. This is the second of the two error files that the
# GraphCast T2M ensemble cell concatenates.
!python weatherbench2/scripts/evaluate.py \
    --forecast_path=gs://weatherbench2/datasets/graphcast/2020/date_range_2019-11-16_2021-02-01_12_hours-64x32_equiangular_conservative.zarr \
    --obs_path=gs://weatherbench2/datasets/era5/1959-2023_01_10-6h-64x32_equiangular_conservative.zarr \
    --climatology_path=gs://weatherbench2/datasets/era5-hourly-climatology/1990-2019_6h_64x32_equiangular_conservative.zarr \
    --time_start=2019-11-16 \
    --time_stop=2021-01-31 \
    --variables=2m_temperature \
    --eval_configs=spatial_temporal_bias \
    --output_dir=/mnt/sda/Data2/wb2/Results_CLS/64x32/Errors/ \
    --output_file_prefix=graphcast_vs_era5_2020_2021_2m_temperature_ \
    --input_chunks=init_time=128 \
    --use_beam=True \
    --runner=DirectRunner

In [26]:
# Build a 50-member pseudo-ensemble for the 2020 GraphCast 2m_temperature
# forecasts by adding randomly drawn bias fields to each deterministic forecast.

# Load forecast and errors datasets
graphcast_forecast_t2m = (
    xr.open_zarr('gs://weatherbench2/datasets/graphcast/2020/date_range_2019-11-16_2021-02-01_12_hours-64x32_equiangular_conservative.zarr')['2m_temperature']
    .sel(time=slice('2020-01-01', '2020-12-31'))
)
# The GraphCast errors come from two separate evaluation files, joined along init_time.
graphcast_errors_t2m = (
    xr.concat(
        [
            xr.open_dataset('Results_CLS/64x32/Errors/graphcast_vs_era5_2018_2019_2m_temperature_spatial_temporal_bias.nc'),
            xr.open_dataset('Results_CLS/64x32/Errors/graphcast_vs_era5_2020_2021_2m_temperature_spatial_temporal_bias.nc'),
        ],
        dim='init_time',
    )
    .rename({'lead_time': 'prediction_timedelta'})
    .sel(metric='bias')
)
# Sampling pool: bias fields from every init time outside the 2020 forecast year.
graphcast_errors_t2m_train = xr.concat(
    [
        graphcast_errors_t2m['2m_temperature'].sel(init_time=slice(None, '2019-12-31')),
        graphcast_errors_t2m['2m_temperature'].sel(init_time=slice('2021-01-01', None)),
    ],
    dim='init_time',
)

t = graphcast_errors_t2m_train.sizes['init_time']

# One ensemble (all members) per forecast initialisation time
ensemble_forecast_datasets = []

for time in tqdm(graphcast_forecast_t2m.time):

    # Draw 50 error fields (np.random.choice samples with replacement by default).
    # Uses the global NumPy RNG, so results depend on the RNG state at run time.
    random_indices = np.random.choice(t, size=50)
    sampled_errors = (
        graphcast_errors_t2m_train.isel(init_time=random_indices)
        .drop_vars(['valid_time', 'time'])  # drop_vars replaces the deprecated .drop
        .reset_index('init_time', drop=True)
        .rename({'init_time': 'number'})
        .assign_coords(number=np.arange(len(random_indices)))
    )

    # Broadcasting adds every sampled error to the forecast in a single operation;
    # the transpose puts `number` first, i.e. the layout produced by concatenating
    # the members along a new `number` dimension.
    ensemble_forecast = (graphcast_forecast_t2m.sel(time=time) + sampled_errors).transpose('number', ...)

    ensemble_forecast_datasets.append(ensemble_forecast)

# Concatenate all ensemble forecasts along the 'time' dimension
ensemble_forecast_t2m = xr.concat(ensemble_forecast_datasets, dim='time')
ensemble_forecast_t2m = ensemble_forecast_t2m.to_dataset()

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 732/732 [03:12<00:00,  3.81it/s]


In [27]:
%%time
# Write the current `ensemble_forecast_t2m` to the GraphCast T2M Zarr store;
# mode='w' overwrites an existing store. The variable name is reused by the Pangu
# and HRES T2M cells, so run this right after the GraphCast T2M ensemble cell.
ensemble_forecast_t2m.to_zarr('Results_CLS/64x32/Errors/graphcast_50_ensemble_forecast_T2M.zarr',mode='w')

CPU times: user 38min 58s, sys: 10min 3s, total: 49min 1s
Wall time: 40min 27s


<xarray.backends.zarr.ZarrStore at 0x7f9c1c1f1a50>

In [None]:
# Run the `probabilistic` eval config on the stored 50-member GraphCast T2M
# ensemble (2m_temperature) against the ERA5 obs path; results are written to
# Results_CLS/64x32/Errors/probabilistic/ with the prefix given below.
!python weatherbench2/scripts/evaluate.py \
  --forecast_path=Results_CLS/64x32/Errors/graphcast_50_ensemble_forecast_T2M.zarr \
  --obs_path=gs://weatherbench2/datasets/era5/1959-2023_01_10-6h-64x32_equiangular_conservative.zarr \
  --climatology_path=gs://weatherbench2/datasets/era5-hourly-climatology/1990-2019_6h_64x32_equiangular_conservative.zarr \
  --output_dir=Results_CLS/64x32/Errors/probabilistic/ \
  --output_file_prefix=graphcast_error_ens_50_vs_era5_2020_T2M_ \
  --input_chunks=init_time=64 \
  --eval_configs=probabilistic \
  --variables=2m_temperature \
  --use_beam=True \
  --fanout=24 \
  --runner=DirectRunner