In [10]:
import pandas as pd
from scipy import stats
from matplotlib import cm, colors
from mpl_toolkits.axes_grid1 import ImageGrid
import json
import pickle
import csv
import xarray as xr
import numpy as np
import matplotlib.pyplot as plt
import glob, os
import random
import pickle

import tensorflow as tf
from tensorflow import keras

In [2]:
# Variable names read from the input (mli) files and produced for the output
# (mlo) side. concatenate_arrays() walks these lists in order when the
# normalization factors are flattened.
vars_mli = [
    'state_t',
    'state_q0001',
    'state_ps',
    'pbuf_SOLIN',
    'pbuf_LHFLX',
    'pbuf_SHFLX',
]
vars_mlo = [
    'ptend_t',
    'ptend_q0001',
    'cam_out_NETSW',
    'cam_out_FLWDS',
    'cam_out_PRECSC',
    'cam_out_PRECC',
    'cam_out_SOLS',
    'cam_out_SOLL',
    'cam_out_SOLSD',
    'cam_out_SOLLD',
]

In [3]:
!pwd

/ocean/projects/atm200007p/jlin96/neurips_proj/e3sm_train_npy


In [4]:
# Site-specific absolute paths -- change these for your HPC and repo path.
# The directory paths end with '/' because other cells build file names by
# plain string concatenation (e.g. norm_path + 'mli_mean.nc').

# NOTE(review): presumably the root of the raw training files; it is not
# referenced again in the cells visible here -- confirm before relying on it.
data_path = '/ocean/projects/atm200007p/walrus/for_jerry/train/'
# Directory holding mli_mean.nc, mli_min.nc, mli_max.nc and mlo_scale.nc.
norm_path = '/ocean/projects/atm200007p/jlin96/neurips_proj/mooers_metrics/norm_factors/'
# Grid-description file opened as ne4_grid_info (source of the 'lat'/'lon' variables).
grid_path = '/ocean/projects/atm200007p/jlin96/neurips_proj/mooers_metrics/test_data/E3SM-MMF_ne4_grid-info.orig.nc'
# Directory that train_input.npy / train_target.npy are read from and that
# indextolatlons.pkl is written to.
save_path = '/ocean/projects/atm200007p/jlin96/neurips_proj/e3sm_train_npy/'

In [5]:
# Normalization factors (input mean/min/max and output scale) plus the grid
# description. The loaders defined further down use these to normalize the
# data and to apply their latlim/lonlim options.
mli_mean, mli_min, mli_max, mlo_scale = (
    xr.open_dataset(norm_path + factor_file)
    for factor_file in ('mli_mean.nc', 'mli_min.nc', 'mli_max.nc', 'mlo_scale.nc')
)
ne4_grid_info = xr.open_dataset(grid_path)

In [6]:
def ls(data_path = ""):
    """Return the lines printed by the shell command ``ls <data_path>``.

    The argument is pasted verbatim into a shell command line, so shell
    globbing works (e.g. ``ls('/some/dir/*.nc')``), but paths containing
    spaces or shell metacharacters are NOT quoted.
    NOTE(review): never pass untrusted text to this helper.

    Parameters
    ----------
    data_path : str, optional
        Path or glob pattern handed to ``ls``. The default ``""`` lists the
        current working directory.

    Returns
    -------
    list of str
        One entry per line written to stdout by ``ls``; an empty list when
        ``ls`` prints nothing to stdout.
    """
    command = " ".join(["ls", data_path])
    # Close the pipe deterministically instead of leaving the open handle to
    # the garbage collector.
    with os.popen(command) as pipe:
        return pipe.read().splitlines()

def concatenate_arrays(xrdata, vars):
    """Flatten the selected variables of ``xrdata`` into a single 1-D array.

    Parameters
    ----------
    xrdata : mapping
        Object indexable by variable name whose items expose a ``.values``
        attribute (the notebook passes xarray datasets).
    vars : iterable of str
        Variable names to pull out, in the order they should appear in the
        result.

    Returns
    -------
    numpy.ndarray
        The per-variable values joined end to end; scalar values are promoted
        to length-1 arrays first.
    """
    pieces = []
    for name in vars:
        # atleast_1d lets scalar entries sit alongside array entries
        pieces.append(np.atleast_1d(xrdata[name].values))
    return np.concatenate(pieces)

In [7]:
# Flatten each normalization dataset into one 1-D vector: the input factors
# follow the order of vars_mli, the output scale follows vars_mlo.
mli_mean_npy, mli_min_npy, mli_max_npy = (
    concatenate_arrays(factor_ds, vars_mli)
    for factor_ds in (mli_mean, mli_min, mli_max)
)
mlo_scale_npy = concatenate_arrays(mlo_scale, vars_mlo)

# Debugging

In [11]:
# Module-level [lower, upper] bounds read by showme_i / showme_o. Both bounds
# are applied with strict comparisons (> lower and < upper).
latlim, lonlim = [-999, 999], [-999, 999]

def showme_i(file):
    """Debug helper: open one input file and return its ``vars_mli`` variables.

    The grid's 'lat'/'lon' variables are merged in, then entries outside the
    module-level ``latlim`` / ``lonlim`` bounds (strict inequalities) are
    dropped.

    Parameters
    ----------
    file : str
        Path of the netCDF file to open.

    Returns
    -------
    xarray.Dataset
        The selected variables together with 'lat' and 'lon'.
    """
    grid_coords = ne4_grid_info[['lat','lon']]
    inputs = xr.open_dataset(file, engine='netcdf4')[vars_mli].merge(grid_coords)

    # latitude band first ...
    inside_lat = (inputs['lat'] > latlim[0]) & (inputs['lat'] < latlim[1])
    inputs = inputs.where(inside_lat, drop=True)

    # ... then longitude band, evaluated on the already-cropped dataset
    inside_lon = (inputs['lon'] > lonlim[0]) & (inputs['lon'] < lonlim[1])
    inputs = inputs.where(inside_lon, drop=True)
    return inputs

def showme_o(file):
    """Debug helper: build the output (mlo) variables for one input file.

    Unlike ``load_nc_dir_with_generator_test``, the result is neither scaled
    by ``mlo_scale`` nor stacked into a 2-D array, so the values can be
    inspected directly.

    Parameters
    ----------
    file : str
        Path of an input ('.mli.') netCDF file. The matching output file is
        found by replacing '.mli.' with '.mlo.' in this path.

    Returns
    -------
    xarray.Dataset
        The variables listed in ``vars_mlo``, restricted to the module-level
        ``latlim`` / ``lonlim`` bounds.
    """
    # read mli -- same selection, merge and cropping as showme_i, so reuse it
    # instead of repeating the code
    ds = showme_i(file)

    # read mlo
    dso = xr.open_dataset(file.replace('.mli.','.mlo.'), engine='netcdf4')
    dso = dso.merge(ne4_grid_info[['lat','lon']])
    dso = dso.where((dso['lat']>latlim[0])*(dso['lat']<latlim[1]),drop=True)
    dso = dso.where((dso['lon']>lonlim[0])*(dso['lon']<lonlim[1]),drop=True)

    # make mlo variables: ptend_t and ptend_q0001
    # NOTE(review): 1200 is presumably the time between the mli and mlo states
    # in seconds (the tendencies are labelled per second) -- confirm.
    dso['ptend_t'] = (dso['state_t'] - ds['state_t'])/1200 # T tendency [K/s]
    dso['ptend_q0001'] = (dso['state_q0001'] - ds['state_q0001'])/1200 # Q tendency [kg/kg/s]

    # keep only the output variables; no normalization or stacking here
    return dso[vars_mlo]

def load_nc_dir_with_generator_test(filelist:list, latlim=[-999,999], lonlim=[-999,999]):
    """Build a ``tf.data.Dataset`` yielding one normalized (input, target) pair per file.

    For every path in ``filelist`` the input ('.mli.') file and its matching
    output ('.mlo.') file are opened, cropped to the lat/lon box, converted to
    normalized 2-D arrays and then closed again before the pair is yielded.

    Parameters
    ----------
    filelist : list of str
        Paths of the input ('.mli.') netCDF files. Each output file is found
        by replacing '.mli.' with '.mlo.' in the path.
    latlim, lonlim : sequence of two numbers, optional
        Exclusive (lower, upper) bounds on 'lat' / 'lon'. The defaults are
        read but never modified.

    Returns
    -------
    tf.data.Dataset
        Elements are ``(mli, mlo)`` pairs declared as float64 with shapes
        ``(None, 124)`` and ``(None, 128)``.
    """
    # NOTE(review): presumably the time between the mli and mlo states in
    # seconds (the tendencies are labelled per second) -- confirm.
    dt_seconds = 1200

    def _attach_grid_and_crop(dataset):
        """Merge in the grid's lat/lon and drop entries outside latlim/lonlim."""
        dataset = dataset.merge(ne4_grid_info[['lat','lon']])
        dataset = dataset.where((dataset['lat']>latlim[0])*(dataset['lat']<latlim[1]),drop=True)
        dataset = dataset.where((dataset['lon']>lonlim[0])*(dataset['lon']<lonlim[1]),drop=True)
        return dataset

    def _to_2d(dataset, name):
        """Stack 'ncol' into a 'batch' dimension and flatten all variables along 'mlvar'."""
        # a list (not a set) keeps the dimension order well defined
        stacked = dataset.stack({'batch':['ncol']})
        return stacked.to_stacked_array("mlvar", sample_dims=["batch"], name=name)

    def gen():
        for file in filelist:
            # Both files are closed when the with-block exits, so a long
            # filelist does not accumulate open netCDF handles.
            with xr.open_dataset(file, engine='netcdf4') as mli_file, \
                 xr.open_dataset(file.replace('.mli.','.mlo.'), engine='netcdf4') as mlo_file:

                # read mli
                ds = _attach_grid_and_crop(mli_file[vars_mli])

                # read mlo
                dso = _attach_grid_and_crop(mlo_file)

                # make mlo variables: ptend_t and ptend_q0001
                dso['ptend_t'] = (dso['state_t'] - ds['state_t'])/dt_seconds # T tendency [K/s]
                dso['ptend_q0001'] = (dso['state_q0001'] - ds['state_q0001'])/dt_seconds # Q tendency [kg/kg/s]
                dso = dso[vars_mlo]

                # normalization, scaling
                ds = (ds-mli_mean)/(mli_max-mli_min)
                dso = dso*mlo_scale

                # stack, then pull the data into memory while the files are
                # still open
                mli_values = _to_2d(ds, 'mli').values
                mlo_values = _to_2d(dso, 'mlo').values

            yield (mli_values, mlo_values)

    return tf.data.Dataset.from_generator(
        gen,
        output_types=(tf.float64, tf.float64),
        output_shapes=((None,124),(None,128))
    )

In [12]:
# Read the saved input/target arrays from save_path; np.load opens and closes
# each file itself when it is given a path.
train_input = np.load(save_path + 'train_input.npy')
train_target = np.load(save_path + 'train_target.npy')

In [13]:
train_input.shape

(10091520, 124)

In [14]:
train_target.shape

(10091520, 128)

In [15]:
# Map each grid-column index to its (lat, lon) pair. The coordinate arrays are
# read once (not once per column) and the number of columns comes from the
# grid file itself instead of a hard-coded 384.
latlons = dict(enumerate(zip(ne4_grid_info["lat"].values, ne4_grid_info["lon"].values)))

In [16]:
# Persist the column-index -> (lat, lon) lookup in save_path.
with open(save_path + 'indextolatlons.pkl', mode='wb') as pkl_out:
    pickle.dump(latlons, pkl_out)

In [18]:
def reshape_npy(var_arr, ncol=384, nlev=60):
    """Reshape ``(n_samples * ncol, nlev)`` rows into ``(n_samples, ncol, nlev)``.

    Parameters
    ----------
    var_arr : numpy.ndarray
        Array whose first axis runs over ``n_samples * ncol`` rows and which
        holds ``nlev`` values per row.
    ncol : int, optional
        Number of rows that make up one sample. The default 384 matches the
        number of grid columns indexed elsewhere in this notebook.
    nlev : int, optional
        Size of the last axis of the result (default 60).
        NOTE(review): presumably the number of vertical levels -- confirm.

    Returns
    -------
    numpy.ndarray
        ``var_arr`` reshaped to ``(var_arr.shape[0] // ncol, ncol, nlev)``.

    Raises
    ------
    ValueError
        If the row count is not a multiple of ``ncol``, or if the array size
        does not match the target shape.
    """
    n_rows = var_arr.shape[0]
    # Integer arithmetic avoids the float round trip of int(n_rows / ncol) and
    # gives a clear message when the rows do not split evenly.
    if n_rows % ncol != 0:
        raise ValueError(
            "first dimension ({}) is not a multiple of ncol ({})".format(n_rows, ncol)
        )
    return var_arr.reshape((n_rows // ncol, ncol, nlev))