# Check the distribution of zscored variables (TSI and Q)

In [1]:
import numpy as np
import xarray as xr
from glob import glob

In [2]:
from credit.data import get_forward_data

In [3]:
import matplotlib.pyplot as plt
%matplotlib inline

## Check normalized TSI

In [4]:
# Glob all TSI files; each training file is opened with get_forward_data below
# (per the original note this is a lazy open as xr.Dataset).
filenames = sorted(glob('/glade/derecho/scratch/dgagne/credit_solar_1h_0.25deg/*.nc'))
#filenames = sorted(glob('/glade/derecho/scratch/schreck/STAGING/TOTAL_*'))

# Training period. The second entry is exclusive because it is passed to range(),
# so the years covered are 1979 .. 2013.
train_years_range = [1979, 2014]
train_years = [str(yr) for yr in range(*train_years_range)]

# Keep every file whose full path contains one of the training years as a substring.
train_files = [path for path in filenames if any(yr in path for yr in train_years)]

# One dataset per training file, in the same (sorted) order as train_files.
list_ds_train = [get_forward_data(path) for path in train_files]

### How the TSI mean and std were computed

In [5]:
# # loop through files and compute mean and std
# for i_ds, ds in enumerate(list_ds_train):
    
#     print('{}'.format(train_files[i_ds]))
    
#     mean_current_yr = float(ds['tsi'].mean())
#     var_current_yr = float(ds['tsi'].var())
#     L = len(ds['tsi'])*640*1280
    
#     print('{} - {}'.format(mean_current_yr, var_current_yr))
    
#     print('... done')
    
#     if i_ds == 0:
    
#         mean_combined = mean_current_yr
#         var_combined = var_current_yr
#         L_combined = L
        
#     else:
#         print('pooling ...')
#         # https://math.stackexchange.com/questions/2971315/how-do-i-combine-standard-deviations-of-two-groups
#         mean_new = (L*mean_current_yr + L_combined*mean_combined) / (L + L_combined)
#         var_new = ((L-1)*var_current_yr + (L_combined-1)*var_combined) / (L + L_combined -1)
#         var_new_adjust = (L*L_combined*(mean_current_yr -mean_combined)**2) / (L + L_combined) / (L + L_combined -1)
    
#         mean_combined = mean_new
#         var_combined = var_new + var_new_adjust
#         L_combined = L_combined + L

#         print('{} - {}'.format(mean_combined, var_combined))
        
#         print('... done')

In [6]:
# Open the per-variable mean and std files used for the z-scoring below.
# The file names suggest 1-hourly data, 1979-2018, 16 levels, 0.25 deg — not verified here.
NEW_mean = xr.open_dataset('/glade/campaign/cisl/aiml/ksha/CREDIT/mean_1h_1979_2018_16lev_0.25deg.nc')
NEW_std = xr.open_dataset('/glade/campaign/cisl/aiml/ksha/CREDIT/std_1h_1979_2018_16lev_0.25deg.nc')

In [7]:
# Pull the 'tsi' entries out of the statistics datasets as NumPy arrays.
# The values displayed in the next cells are 0-d arrays (one scalar each).
mean_tsi = np.array(NEW_mean['tsi'])
std_tsi = np.array(NEW_std['tsi'])

In [8]:
mean_tsi

array(1056801.76402968)

In [9]:
std_tsi

array(1416275.11762502)

In [10]:
# Z-score the TSI of the second training file (index 1) with the stored mean/std;
# only this one file is checked, not the whole training set.
ds_norm = (list_ds_train[1]['tsi'] - mean_tsi) / std_tsi

In [11]:
float(ds_norm.min())

-0.7461839318275452

In [12]:
float(ds_norm.max())

2.7664783000946045

In [13]:
# ds_norm.plot.hist()

## How about the old TSI? (almost the same result)

In [14]:
# Older solar-radiation file, opened with the same helper as the training files.
TOA_forcing_path = '/glade/derecho/scratch/dgagne/credit_scalers/solar_radiation_2024-03-29_0204.nc' 
ds_old = get_forward_data(TOA_forcing_path)

In [15]:
float(ds_old['tsi'].mean(skipna=False))

988673.375

In [16]:
float(ds_old['tsi'].std(skipna=False))

1325118.875

In [36]:
# Z-score the old TSI with its own mean/std.
# The statistics are computed from ds_old here instead of being hand-copied from
# the outputs of the two previous cells, so they cannot go stale if the file or
# path changes. They are the same reductions (skipna=False, cast to float) that
# those cells display, so the normalized values are unchanged for the current file.
old_mean_tsi = float(ds_old['tsi'].mean(skipna=False))
old_std_tsi = float(ds_old['tsi'].std(skipna=False))
ds_old_norm = (ds_old['tsi'] - old_mean_tsi) / old_std_tsi

In [37]:
float(ds_old_norm.min())

-0.7461016178131104

In [38]:
float(ds_old_norm.max())

2.7680108547210693

In [39]:
# ds_old_norm.plot.hist()

## Check normalized 500 hPa Q

In [18]:
# List the zarr stores matching the pattern and open only the first one (in sorted
# order); the min/max reported below therefore describe that single store.
ERA_files = sorted(glob('/glade/derecho/scratch/wchapman/y_TOTAL*'))
ERA_example = xr.open_zarr(ERA_files[0])

In [19]:
# Stored mean and std for the 'Q500' variable, as NumPy arrays.
mean_q = np.array(NEW_mean['Q500'])
std_q = np.array(NEW_std['Q500'])

In [20]:
# Z-score Q500 of the example store with the stored statistics.
q_norm = (ERA_example['Q500'] - mean_q) / std_q

In [21]:
float(q_norm.min())

-0.7999893426895142

In [22]:
float(q_norm.max())

8.571893692016602

In [23]:
# q_norm.plot.hist()

## Check top-of-atmos Q

In [24]:
# Same glob pattern and open call as in the 500 hPa section; this rebinds
# ERA_files / ERA_example so the section can be run on its own.
ERA_files = sorted(glob('/glade/derecho/scratch/wchapman/y_TOTAL*'))
ERA_example = xr.open_zarr(ERA_files[0])

In [25]:
# Stored mean and std of 'Q' at positional level index 0.
# NOTE(review): index 0 is treated as top-of-atmosphere per the section title —
# confirm the level ordering in the statistics files.
mean_q_toa = np.array(NEW_mean['Q'].isel(level=0))
std_q_toa = np.array(NEW_std['Q'].isel(level=0))

In [26]:
# Z-score Q at level index 0 of the example store.
# NOTE(review): isel is positional, so this assumes the store and the statistics
# files order their levels identically — confirm.
q_toa_norm = (ERA_example['Q'].isel(level=0) - mean_q_toa) / std_q_toa

In [27]:
float(q_toa_norm.min())

-8.435981750488281

In [28]:
float(q_toa_norm.max())

0.8486422896385193

In [29]:
# q_toa_norm.plot.hist()

## Check surface Q

In [30]:
# Same glob pattern and open call as in the 500 hPa section; this rebinds
# ERA_files / ERA_example so the section can be run on its own.
ERA_files = sorted(glob('/glade/derecho/scratch/wchapman/y_TOTAL*'))
ERA_example = xr.open_zarr(ERA_files[0])

In [31]:
# Stored mean and std of 'Q' at positional level index 14.
# NOTE(review): the statistics file name says "16lev"; if 'Q' really has 16 levels,
# index 14 is the second-to-last level, not the last — confirm this is the
# intended "surface" level.
mean_q_surf = np.array(NEW_mean['Q'].isel(level=14))
std_q_surf = np.array(NEW_std['Q'].isel(level=14))

In [32]:
# Z-score Q at level index 14 of the example store.
# NOTE(review): isel is positional, so this assumes the store and the statistics
# files order their levels identically — confirm.
q_surf_norm = (ERA_example['Q'].isel(level=14) - mean_q_surf) / std_q_surf

In [33]:
float(q_surf_norm.min())

-1.1894210577011108

In [34]:
float(q_surf_norm.max())

3.679657459259033

In [35]:
# q_surf_norm.plot.hist()