# Generate qsub scripts to compute mean and std per variable

In [2]:
import os
import sys
import yaml
import numpy as np
import xarray as xr

In [3]:
sys.path.insert(0, os.path.realpath('../libs/'))
import preprocess_utils as pu

## 6-hourly mean and std

In [4]:
# Absolute path of the 6-hourly data config; the relative name is resolved
# against the current working directory.
config_name = os.path.realpath('data_config_6h.yml')

# safe_load only builds plain Python objects, so the YAML cannot run code.
with open(config_name) as stream:
    conf = yaml.safe_load(stream)

In [5]:
# Keys of the 'zscore' config section in file order, minus the last three
# entries (save_loc and others), which are settings rather than variables.
# NOTE(review): the positional slice assumes exactly three non-variable keys
# sit at the end of the section; reordering the YAML would break this silently.
varnames = list(conf['zscore'])[:-3]

In [6]:
varnames  # show the variable names left after trimming the trailing config keys

['U',
 'V',
 'T',
 'Q',
 'SP',
 't2m',
 'V500',
 'U500',
 'T500',
 'Z500',
 'Q500',
 'tsi']

In [None]:
# Run the z-score statistics helper for 'tsi' only.
# NOTE(review): the printed pair is presumably "mean - variance"; the second
# number equals the float64 variance computed in the debug section below.
# Confirm against preprocess_utils.zscore_var.
pu.zscore_var(conf, 'tsi')

6431975.416946704 - 59504949632630.39


## debug on tsi (solved using `astype('float64')`)

In [7]:
import matplotlib.pyplot as plt
%matplotlib inline

In [8]:
from glob import glob

def get_forward_data(filename) -> xr.Dataset:
    '''
    Open a NetCDF file or a Zarr store as an xr.Dataset.

    The format is chosen from the file name alone: names ending in '.nc'
    or '.nc4' are opened with xr.open_dataset; anything else is opened
    with xr.open_zarr using consolidated metadata.

    Parameters
    ----------
    filename : str or os.PathLike
        Path to the NetCDF file or Zarr store.

    Returns
    -------
    xr.Dataset
        The opened dataset (not a single DataArray; index it by variable
        name to get one).
    '''
    # Normalise path-like objects to a plain path so the suffix check works
    # for pathlib.Path as well as for strings.
    path = os.fspath(filename)
    if path.endswith(('.nc', '.nc4')):
        return xr.open_dataset(path)
    # No NetCDF suffix: treat the path as a Zarr store.
    return xr.open_zarr(path, consolidated=True)

In [9]:
varname = 'tsi'  # variable whose mean/variance is being debugged in this section

In [10]:
# Every file matching this variable's glob pattern from the config, sorted by path.
filenames = sorted(glob(conf['zscore'][varname]))

# Training years as strings; range() stops before years_range[1].
year_range = conf['zscore']['years_range']
train_years = list(map(str, range(year_range[0], year_range[1])))

# A file counts as a training file when any training year occurs in its path.
# NOTE(review): this is a plain substring match on the whole path, so a
# year-like digit run elsewhere in the path would also match; confirm the
# file naming scheme rules that out.
train_files = [
    path for path in filenames
    if any(year in path for year in train_years)
]

# One opened dataset per training file, in the same order as train_files.
list_ds_train = [get_forward_data(path) for path in train_files]

In [11]:
# Use the variable from the first training dataset as the shape template.
ds_example = list_ds_train[0][varname]
var_shape = ds_example.shape

# Product of the two trailing dimension lengths.
# NOTE(review): presumably the horizontal grid (lat x lon); confirm dimension order.
N_grids = var_shape[-2] * var_shape[-1]

# Two-element float buffer pre-filled with NaN so unset slots are obvious.
mean_std_save = np.full((2,), np.nan)

In [12]:
# Debug on the first training dataset only.
ds = list_ds_train[0]
# Pull out the variable under investigation from that dataset.
ds_subset = ds[varname]

In [13]:
ds  # <--- inspect the dataset: tsi is stored as float32

In [88]:
# <--- solution: promote the float32 variable to float64 before reducing.
# Derived from `ds` directly so re-running this cell always starts from the
# same input.
ds_subset = ds[varname].astype('float64')

`xarray.mean` and `std` with `skipna=False` are correct (the cell below checks this with `var`)

In [89]:
# Variance over all elements with NaN skipping turned off; float() unwraps
# the reduced result to a Python scalar for display.
float(ds_subset.var(skipna=False))

59504949632630.39

`xarray.mean` and `std` with `skipna=True` have problems if not using `float64`

In [90]:
# Same reduction with NaN skipping turned on, for comparison with the
# skipna=False result above.
float(ds_subset.var(skipna=True))

59504949639829.88

What `numpy` would give

In [91]:
# Copy the values into a plain NumPy array and take its variance as a reference.
# ndarray.var does no NaN skipping (that is np.nanvar), so this is the
# counterpart of the skipna=False reduction.
test_tsi = np.array(ds_subset)
test_tsi.var()

59504949632630.39