# Compute residual normalization constant

In [1]:
import os
import sys
import yaml
import time
import numpy as np
import xarray as xr
from glob import glob

In [2]:
from credit.data import get_forward_data

In [3]:
sys.path.insert(0, os.path.realpath('../libs/'))
import preprocess_utils as pu

In [4]:
# Resolve the YAML configuration file (relative to the current working
# directory) to an absolute path and parse it into `conf`.
config_name = os.path.realpath('data_config_1h.yml')

with open(config_name, mode='r') as config_file:
    conf = yaml.safe_load(config_file)

In [4]:
# Variable to process and the level index handed to the helper as `ind_level=`
# (elsewhere in this notebook the same value is used as ds.isel(level=ind_level)).
varname, ind_level = 'U', 14

In [13]:
# Run the residual z-score helper from preprocess_utils for `varname`,
# passing the level index chosen above.
# NOTE(review): what the helper computes and where it writes is defined in
# ../libs/preprocess_utils.py, which is not visible from this notebook.
pu.residual_zscore_var(conf, varname, ind_level=ind_level)

In [13]:
# Second case: no level index is supplied (ind_level=None).
# NOTE(review): presumably 'U500' is already a single-level field — confirm.
varname, ind_level = 'U500', None

In [14]:
# Same helper call as before, now for the variable set in the previous cell
# with ind_level=None.
pu.residual_zscore_var(conf, varname, ind_level=ind_level)

## Compute residual statistics separately for each year

In [5]:
# Re-read the YAML configuration so this section can be run without the
# cells above; the path is resolved against the current working directory.
config_name = os.path.realpath('data_config_1h.yml')

with open(config_name, mode='r') as config_file:
    conf = yaml.safe_load(config_file)

In [6]:
# Variable name handed to the per-year helper in the next cell; it also
# appears in the per-year output file names (see the "Save to ..." lines).
varname = 'tsi'

In [8]:
# Process one year per call. Judging by the recorded output, each call prints
# its progress and saves one per-year .npy file.
# range() excludes its end value, so the last year processed is 2017.
# NOTE(review): flag_float64=True presumably requests float64 arithmetic
# inside the helper — confirm in preprocess_utils.
per_year_options = dict(ind_level=None, flag_float64=True)
for year in range(1979, 2018):
    pu.residual_zscore_var_split_years(conf, varname, year, **per_year_options)

applying np.diff ...
... done
3.992981097915965e-08 - 0.08399907090968746
Save to /glade/campaign/cisl/aiml/ksha/CREDIT/backup_1h_2018_residual_mean_std_tsi_y1979.npy
applying np.diff ...
... done
-4.717411896698478e-08 - 0.08400552463365456
Save to /glade/campaign/cisl/aiml/ksha/CREDIT/backup_1h_2018_residual_mean_std_tsi_y1980.npy
applying np.diff ...
... done
4.007483693310204e-09 - 0.08400406503324591
Save to /glade/campaign/cisl/aiml/ksha/CREDIT/backup_1h_2018_residual_mean_std_tsi_y1981.npy
applying np.diff ...
... done
1.0039195639892813e-08 - 0.08396608843809757
Save to /glade/campaign/cisl/aiml/ksha/CREDIT/backup_1h_2018_residual_mean_std_tsi_y1982.npy
applying np.diff ...



KeyboardInterrupt



In [27]:
# Spot-check: load the per-year result file for 1989.
# NOTE(review): absolute, user-specific path; the merge cell below builds the
# same kind of file name from conf['residual'] instead.
yearly_stats_path = '/glade/campaign/cisl/aiml/ksha/CREDIT/backup_1h_2018_residual_mean_std_tsi_y1989.npy'
data_temp = np.load(yearly_stats_path)

### Update from all yearly results

In [9]:
# Re-read the YAML configuration so the merge step below can be run on its
# own; the path is resolved against the current working directory.
config_name = os.path.realpath('data_config_1h.yml')

with open(config_name, mode='r') as config_file:
    conf = yaml.safe_load(config_file)

In [10]:
# Variables whose per-year statistics are merged below (a single entry for now).
varname_surf = ['tsi']

# Year bounds come from the config; np.arange excludes the stop value, so the
# second entry of `years_range` is not itself part of `years`.
years_range = conf['residual']['years_range']
years = np.arange(start=years_range[0], stop=years_range[1])

In [12]:
def _pool_mean_var(n_new, mean_new, var_new, n_acc, mean_acc, var_acc):
    """Combine the mean and variance of two disjoint groups of samples.

    Implements the pooled sample-variance identity described at
    https://math.stackexchange.com/questions/2971315/how-do-i-combine-standard-deviations-of-two-groups

    Parameters
    ----------
    n_new, mean_new, var_new
        Sample count, mean and variance of the group being added.
    n_acc, mean_acc, var_acc
        Sample count, mean and variance accumulated so far.

    Returns
    -------
    tuple
        ``(mean, variance)`` of the union of both groups.
    """
    n_total = n_new + n_acc
    mean_all = (n_new * mean_new + n_acc * mean_acc) / n_total
    # Spread inside each group, weighted by its degrees of freedom.
    var_within = ((n_new - 1) * var_new + (n_acc - 1) * var_acc) / (n_total - 1)
    # Extra spread caused by the two group means being different.
    var_between = (n_new * n_acc * (mean_new - mean_acc)**2) / n_total / (n_total - 1)
    return mean_all, var_within + var_between


# Merge the per-year statistics of every variable into one overall result.
for varname in varname_surf:

    # Running [mean, variance] for this variable. It starts as NaN rather than
    # uninitialised memory, so an empty `years` cannot leak garbage values.
    # NOTE(review): slot 1 is handled as a variance throughout this cell even
    # though the array and file names say "std" — confirm what downstream expects.
    mean_std_save = np.full((2,), np.nan)

    for i_year, year in enumerate(years):
        save_name = conf['residual']['save_loc'] + 'backup_{}_mean_std_{}_y{}.npy'.format(
            conf['residual']['prefix'], varname, year)

        # Per-year file, read here as: [0] mean, [1] variance, [2] sample count.
        mean_std_N_save = np.load(save_name)

        mean_current_yr = mean_std_N_save[0]
        var_current_yr = mean_std_N_save[1]
        L = mean_std_N_save[2]

        if i_year == 0:
            # First year: nothing to merge with yet, so it seeds the running values.
            mean_std_save[0] = mean_current_yr
            mean_std_save[1] = var_current_yr
            N_samples = L

        else:
            # Fold the current year into the running statistics.
            mean_std_save[0], mean_std_save[1] = _pool_mean_var(
                L, mean_current_yr, var_current_yr,
                N_samples, mean_std_save[0], mean_std_save[1])
            N_samples = N_samples + L

            print('{} - {}'.format(mean_std_save[0], mean_std_save[1]))

    save_name = conf['residual']['save_loc'] + '{}_mean_std_{}.npy'.format(conf['residual']['prefix'], varname)
    print('Save to {}'.format(save_name))
    # Dry run: the write is left disabled; uncomment to persist the merged result.
    # np.save(save_name, mean_std_save)

-3.681739398048566e-09 - 0.08400230218064257
-1.121003872509177e-09 - 0.08400288925806965
1.6671361026513772e-09 - 0.0839936953480684
4.2124762988100274e-10 - 0.0839842079475786
-7.97641204231631e-09 - 0.08397034943415727
-6.248697512438231e-09 - 0.08395794247779552
-4.018932301636456e-09 - 0.08394843207070722
9.291853702540194e-10 - 0.0839431854243798
-8.966570047515398e-10 - 0.08394260497555227
1.6553347610759329e-09 - 0.08394719133579581
2.8836856894128255e-09 - 0.08395098446441296
2.563889562351489e-09 - 0.08395333867575981
-1.301898661150006e-09 - 0.08395373321671372
-1.2241314613656236e-09 - 0.08395229234957925
-1.1108155340644264e-09 - 0.08394929795920945
2.6385410821211495e-10 - 0.08394581576643097
-2.0955292146369012e-09 - 0.08394220099531254
-8.439570221854462e-10 - 0.08393987482282209
1.4278654901968038e-09 - 0.08393980856115625
2.802577436048392e-09 - 0.08394158511283892
1.2870007397053525e-09 - 0.08394436099449792
2.111898396374396e-09 - 0.08394694557015414
1.8749797516660

## `xr.apply_ufunc(np.diff)` vs `np.diff` directly

In [18]:
# Take the first dataset and, when a level index is set, reduce it to that level.
# NOTE(review): `list_ds_train` is not created by any live cell visible in this
# notebook (only inside the commented-out "old blocks" cell), so this cell
# relies on leftover kernel state.
ds = list_ds_train[0]
ds = ds if ind_level is None else ds.isel(level=ind_level)

In [19]:
# Keep only the first 100 entries along `time` (presumably to keep the
# comparison below cheap).
ds = ds.isel(time=slice(100))

In [20]:
# Difference along `time`, computed through xarray. np.diff returns one
# element fewer than it receives, so the result is given its own dimension
# name ('time_diff') instead of reusing 'time'.
var_diff = xr.apply_ufunc(
    np.diff,
    ds[varname],
    input_core_dims=[['time']],
    output_core_dims=[['time_diff']],
    output_dtypes=[ds[varname].dtype],
    dask='allowed',
    vectorize=True,
)

# Wrap the result in a Dataset named after the variable.
ds_unlabelled = var_diff.to_dataset(name='{}_diff'.format(varname))

# Attach an explicit coordinate to the new dimension.
ds_labelled = ds_unlabelled.assign_coords(time_diff=ds_unlabelled['time_diff'])

# apply_ufunc puts core dimensions last; move 'time_diff' back to the front.
ds_out = ds_labelled.transpose("time_diff", "latitude", "longitude")

In [22]:
# Materialise the apply_ufunc result as a plain NumPy array.
# NOTE(review): 'U_diff' is hard-coded; it only matches the name built with
# '{}_diff'.format(varname) in the previous cell when varname == 'U'.
diff1 = np.array(ds_out['U_diff'])

In [24]:
# Reference result: np.diff applied directly to the raw array along axis 0.
# NOTE(review): assumes axis 0 of ds['U'] is `time` — confirm. 'U' is
# hard-coded here, whereas the apply_ufunc cell indexes ds[varname].
diff2 = np.diff(np.array(ds['U']), axis=0)

In [26]:
# Total absolute discrepancy between the two ways of computing the difference.
# Absolute values are summed because signed differences could cancel each
# other and report 0.0 even when the arrays disagree; 0.0 here means every
# element matches.
np.sum(np.abs(diff1 - diff2))

0.0

## Old blocks (commented out, kept for reference)

In [None]:
# filenames = sorted(glob(conf['zscore'][varname]))

# year_range = conf['zscore']['years_range']
# train_years = [str(year) for year in range(year_range[0], year_range[1])]
# train_files = [file for file in filenames if any(year in file for year in train_years)]

# list_ds_train = []

# for fn in train_files:
#     list_ds_train.append(get_forward_data(fn))
    
# # ------------------------------------------------------------------------------------ #
# ds_example = list_ds_train[0][varname]
# var_shape = ds_example.shape

# N_grids = var_shape[-1] * var_shape[-2]
# mean_std_save = np.empty((2,))
# mean_std_save.fill(np.nan)

# for i_fn, ds in enumerate(list_ds_train):
#     # ===================================================================== #
#     # apply np.diff
#     var_diff = xr.apply_ufunc(
#         np.diff,
#         ds[varname],
#         input_core_dims=[['time']],
#         output_core_dims=[['time_diff']],  # Change this to a new dimension name
#         vectorize=True,
#         dask='allowed',
#         output_dtypes=[ds[varname].dtype]
#     )
    
#     ds_out = var_diff.to_dataset(name='{}_diff'.format(varname))
    
#     ds_out = ds_out.assign_coords(
#         time_diff=ds_out['time_diff'])
    
#     ds_out = ds_out.transpose("time_diff", "latitude", "longitude")
    
#     # ===================================================================== #
#     # compute the mean and std from the np.diff result
    
#     ds_subset = ds_out['{}_diff'.format(varname)]
    
#     # get mean and var for the current year
#     mean_current_yr = float(ds_subset.mean())
#     var_current_yr = float(ds_subset.var())
#     L = len(ds_subset) * N_grids
    
#     print('{} - {}'.format(mean_current_yr, var_current_yr))
        
#     if i_fn == 0:
#         # if it is the first year, pass current year to the combined 
#         mean_std_save[0] = mean_current_yr
#         mean_std_save[1] = var_current_yr
#         N_samples = L
        
#     else:
#         # https://math.stackexchange.com/questions/2971315/how-do-i-combine-standard-deviations-of-two-groups
#         mean_new = (L * mean_current_yr + N_samples * mean_std_save[0]) / (L + N_samples)
#         var_new = ((L - 1) * var_current_yr + (N_samples - 1) * mean_std_save[1]) / (L + N_samples - 1)
#         var_new_adjust = (L * N_samples * (mean_current_yr - mean_std_save[0])**2) / (L + N_samples) / (L + N_samples -1)
        
#         mean_std_save[0] = mean_new
#         mean_std_save[1] = var_new + var_new_adjust
#         N_samples = N_samples + L
        
#         print('{} - {}'.format(mean_std_save[0], mean_std_save[1]))

# save_name = conf['zscore']['save_loc'] + '{}_residual_mean_std_{}.npy'.format(conf['zscore']['prefix'], varname)
# print('Save to {}'.format(save_name))
# # np.save(save_name, mean_std_save)

# start_time = time.time()
# main()
# print("--- %s seconds ---" % (time.time() - start_time))