This notebook loads the original netCDF datasets, preprocesses them, and saves the results as .npy files.

In [1]:
import xarray as xr
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from keras import models
from keras import layers
from keras import callbacks
import keras_tuner as kt
from keras_tuner import HyperModel
from keras_tuner import RandomSearch
import os
import tensorflow_addons as tfa
import sys
import argparse
import glob
import random
from pathlib import Path

2023-06-02 22:16:15.970525: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# v2 vars
# using all variables except:
# 1. state_pmid is not used since it is a function of state_ps (e.g., hyam*p0+hybm*ps)
# 2. Ozone and GHG variables in vars_mli_utls are used only for levels [5,20],
#    where the variance is the largest.

# input variables selected from each mli file without any level clipping
vars_mli      = ['state_t','state_q0001', 'state_q0002', 'state_q0003', 'state_u', 'state_v',
                 'state_ps', 'pbuf_SOLIN','pbuf_LHFLX', 'pbuf_SHFLX',  'pbuf_TAUX', 'pbuf_TAUY', 'pbuf_COSZRS',
                 'cam_in_ALDIF', 'cam_in_ALDIR', 'cam_in_ASDIF', 'cam_in_ASDIR', 'cam_in_LWUP',
                 'cam_in_ICEFRAC', 'cam_in_LANDFRAC', 'cam_in_OCNFRAC', 'cam_in_SNOWHICE', 'cam_in_SNOWHLAND']
# input variables that are clipped to a subset of 'lev' (renamed 'lev2') before use
vars_mli_utls = ['pbuf_ozone', 'pbuf_CH4', 'pbuf_N2O']
# output variables; the 'ptend_*' entries are not read from file but derived in
# the data generator as (mlo state - mli state) / timestep
vars_mlo      = ['ptend_t','ptend_q0001','ptend_q0002','ptend_q0003', 'ptend_u', 'ptend_v',
                 'cam_out_NETSW', 'cam_out_FLWDS', 'cam_out_PRECSC', 'cam_out_PRECC',
                 'cam_out_SOLS', 'cam_out_SOLL', 'cam_out_SOLSD', 'cam_out_SOLLD']

In [3]:
# Normalization / scaling factors, downloaded from
# https://github.com/sungdukyu/E3SM-MMF_baseline/tree/main/norm_factors
mli_mean, mli_min, mli_max = [
    xr.open_dataset(nc_path, engine='netcdf4')
    for nc_path in ('../norm_factors/mli_mean.nc',
                    '../norm_factors/mli_min.nc',
                    '../norm_factors/mli_max.nc')
]
mlo_scale = xr.open_dataset('../norm_factors/mlo_scale.nc', engine='netcdf4')

# Restrict every input-statistics dataset to the variables actually used:
#   * vars_mli are kept on the original 'lev' dimension;
#   * vars_mli_utls are cut to level indices 5..20 (slice end is exclusive) and
#     moved onto a separate, shorter dimension called 'lev2'.
mli_mean, mli_min, mli_max = [
    stats[vars_mli].merge(
        stats[vars_mli_utls].isel(lev=slice(5,21)).rename({'lev':'lev2'})
    )
    for stats in (mli_mean, mli_min, mli_max)
]

In [4]:
# data generator for v2
# (also includes data preprocessing)

# Widths of the flattened input / output feature vectors; they are used as the
# fixed second dimension of the tf.data output shapes in the generator below.
# NOTE(review): 425 = 6*60 + 17 + 3*16 and 368 = 6*60 + 8 would match the
# variable lists if the full profiles have 60 levels -- TODO confirm.
input_length = 425
output_length = 368

def load_nc_dir_with_generator(filelist:list):
    """Build a tf.data.Dataset yielding one preprocessed (input, target) pair per file.

    Each path in ``filelist`` is opened as an input ('.mli.') netCDF file; the
    matching output file is found by replacing '.mli.' with '.mlo.' in the path.
    Per file, the generator:

    1. selects ``vars_mli`` plus ``vars_mli_utls`` clipped to level indices 5..20
       (on a dimension renamed 'lev2');
    2. derives the 'ptend_*' targets as (mlo state - mli state) / 1200 and keeps
       only ``vars_mlo``;
    3. normalizes inputs as (x - mli_mean) / (mli_max - mli_min) and multiplies
       outputs by ``mlo_scale``;
    4. stacks 'ncol' into a 'batch' dimension and flattens all variables along
       a new 'mlvar' dimension.

    The function reads the module-level names ``vars_mli``, ``vars_mli_utls``,
    ``vars_mlo``, ``mli_mean``, ``mli_min``, ``mli_max``, ``mlo_scale``,
    ``input_length`` and ``output_length``.

    Args:
        filelist: paths of the input ('.mli.') netCDF files.

    Returns:
        A ``tf.data.Dataset`` of float64 pairs declared with shapes
        ``(None, input_length)`` and ``(None, output_length)``.
    """
    def gen():
        for file in filelist:
            # Both files are opened as context managers so their handles are
            # released after every iteration; without this, one pair of open
            # netCDF files would accumulate per element of filelist.
            with xr.open_dataset(file, engine='netcdf4') as mli_file, \
                 xr.open_dataset(file.replace('.mli.','.mlo.'), engine='netcdf4') as mlo_file:

                # input read / preprocess #
                # subset ozone, ch4, n2o to level indices 5..20 on 'lev2'
                ds_utls = mli_file[vars_mli_utls]\
                          .isel(lev=slice(5,21)).rename({'lev':'lev2'})
                # combine the regular inputs with the clipped ones
                ds = mli_file[vars_mli].merge(ds_utls)

                # output read / preprocess #
                dso = mlo_file
                # make mlo tendency variables ("ptend_xxxx"):
                for kvar in ['state_t','state_q0001','state_q0002', 'state_q0003', 'state_u', 'state_v']:
                    dso[kvar.replace('state','ptend')] = (dso[kvar] - ds[kvar])/1200 # timestep=1200[sec]
                # remove "state_xxxx" by keeping only the target variables
                dso = dso[vars_mlo]

                # normalization, scaling #
                ds = (ds-mli_mean)/(mli_max-mli_min)
                dso = dso*mlo_scale

                # flatten variables: one row per 'ncol' entry, one column per
                # (variable, level) combination
                ds = ds.stack({'batch':['ncol']})
                ds = ds.to_stacked_array("mlvar", sample_dims=["batch"], name='mli')
                dso = dso.stack({'batch':['ncol']})
                dso = dso.to_stacked_array("mlvar", sample_dims=["batch"], name='mlo')

                # materialize as numpy arrays while the files are still open
                x_arr = ds.values
                y_arr = dso.values

            yield (x_arr, y_arr)

    return tf.data.Dataset.from_generator(gen,
                                          output_types=(tf.float64, tf.float64),
                                          output_shapes=((None,input_length),(None,output_length)),
                                         )

In [5]:
# save train -> npy

# set stride
stride_sample = 7 # prime number to sample all 'tod'

# make sure the output directory exists before the (long) preprocessing starts,
# so a missing folder cannot abort the run at the very end
os.makedirs('./npy_files', exist_ok=True)

# files (train): every file of years 0001-0007 plus January of year 0008
f_mli1 = glob.glob('/pscratch/sd/s/sungduk/hugging/E3SM-MMF_ne4/train/*/E3SM-MMF.mli.000[1234567]-*-*-*.nc')
f_mli2 = glob.glob('/pscratch/sd/s/sungduk/hugging/E3SM-MMF_ne4/train/*/E3SM-MMF.mli.0008-01-*-*.nc')
f_mli = sorted([*f_mli1, *f_mli2])
f_mli = f_mli[::stride_sample]
if not f_mli:
    raise FileNotFoundError('no training .mli. files matched the glob patterns')

# data generator -> npy array
# Each per-file array is cast to float32 as soon as it is produced, so the
# float64 copies are never all held in memory at once. The saved values are
# the same as casting after concatenation.
tds = load_nc_dir_with_generator(f_mli)
x_parts, y_parts = [], []
for x_file, y_file in tds.as_numpy_iterator():
    x_parts.append(np.float32(x_file))
    y_parts.append(np.float32(y_file))
x_true = np.concatenate(x_parts)
y_true = np.concatenate(y_parts)

# to .npy
with open(f'./npy_files/train_input.v2.stride-{stride_sample}.npy', 'wb') as f:
    np.save(f, x_true)
with open(f'./npy_files/train_target.v2.stride-{stride_sample}.npy', 'wb') as f:
    np.save(f, y_true)

2023-06-02 22:16:27.870596: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:267] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2023-06-02 22:16:27.870660: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (nid005763): /proc/driver/nvidia/version does not exist
2023-06-02 22:16:27.877665: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
# save validation -> npy

# set stride
stride_sample = 7 # prime number to sample all 'tod'

# make sure the output directory exists before the (long) preprocessing starts,
# so a missing folder cannot abort the run at the very end
os.makedirs('./npy_files', exist_ok=True)

# files (val): February-December of year 0008 plus January of year 0009
f_mli1 = glob.glob('/pscratch/sd/s/sungduk/hugging/E3SM-MMF_ne4/train/*/E3SM-MMF.mli.0008-0[23456789]-*-*.nc')
f_mli2 = glob.glob('/pscratch/sd/s/sungduk/hugging/E3SM-MMF_ne4/train/*/E3SM-MMF.mli.0008-1[012]-*-*.nc')
f_mli3 = glob.glob('/pscratch/sd/s/sungduk/hugging/E3SM-MMF_ne4/train/*/E3SM-MMF.mli.0009-01-*-*.nc')
f_mli_val = sorted([*f_mli1, *f_mli2, *f_mli3])
f_mli_val = f_mli_val[::stride_sample]
if not f_mli_val:
    raise FileNotFoundError('no validation .mli. files matched the glob patterns')

# data generator -> npy array
# Each per-file array is cast to float32 as soon as it is produced, so the
# float64 copies are never all held in memory at once. The saved values are
# the same as casting after concatenation.
tds = load_nc_dir_with_generator(f_mli_val)
x_parts, y_parts = [], []
for x_file, y_file in tds.as_numpy_iterator():
    x_parts.append(np.float32(x_file))
    y_parts.append(np.float32(y_file))
x_true = np.concatenate(x_parts)
y_true = np.concatenate(y_parts)

# to .npy
with open(f'./npy_files/val_input.v2.stride-{stride_sample}.npy', 'wb') as f:
    np.save(f, x_true)
with open(f'./npy_files/val_target.v2.stride-{stride_sample}.npy', 'wb') as f:
    np.save(f, y_true)

In [7]:
# save validation -> npy (second validation set, stride 6)

# set stride
# NOTE(review): 6 is not prime. Assuming one file per 1200 s timestep (72 per
# day), a stride of 6 divides 72 and therefore revisits the same 12 times of
# day ('tod') instead of cycling through all of them -- confirm this is intended.
stride_sample = 6

# make sure the output directory exists before the (long) preprocessing starts,
# so a missing folder cannot abort the run at the very end
os.makedirs('./npy_files', exist_ok=True)

# files (val): February-December of year 0008 plus January of year 0009
f_mli1 = glob.glob('/pscratch/sd/s/sungduk/hugging/E3SM-MMF_ne4/train/*/E3SM-MMF.mli.0008-0[23456789]-*-*.nc')
f_mli2 = glob.glob('/pscratch/sd/s/sungduk/hugging/E3SM-MMF_ne4/train/*/E3SM-MMF.mli.0008-1[012]-*-*.nc')
f_mli3 = glob.glob('/pscratch/sd/s/sungduk/hugging/E3SM-MMF_ne4/train/*/E3SM-MMF.mli.0009-01-*-*.nc')
f_mli_val = sorted([*f_mli1, *f_mli2, *f_mli3])
f_mli_val = f_mli_val[::stride_sample]
if not f_mli_val:
    raise FileNotFoundError('no validation .mli. files matched the glob patterns')

# data generator -> npy array
# Each per-file array is cast to float32 as soon as it is produced, so the
# float64 copies are never all held in memory at once. The saved values are
# the same as casting after concatenation.
tds = load_nc_dir_with_generator(f_mli_val)
x_parts, y_parts = [], []
for x_file, y_file in tds.as_numpy_iterator():
    x_parts.append(np.float32(x_file))
    y_parts.append(np.float32(y_file))
x_true = np.concatenate(x_parts)
y_true = np.concatenate(y_parts)

# to .npy
with open(f'./npy_files/val_input.v2.stride-{stride_sample}.npy', 'wb') as f:
    np.save(f, x_true)
with open(f'./npy_files/val_target.v2.stride-{stride_sample}.npy', 'wb') as f:
    np.save(f, y_true)