In [1]:
# general tools
import os
import sys
from glob import glob

# data tools
import time
import h5py
import random
import numpy as np
from random import shuffle

from datetime import datetime, timedelta
import re

# deep learning tools
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import backend
from tensorflow.keras import utils
from tensorflow.keras import Model

# Run every tf.function eagerly instead of as a traced graph.
# NOTE(review): this is normally a debugging aid and slows training down;
# confirm it is still required before a long run.
tf.config.run_functions_eagerly(True)

import keras_unet_collection
from keras_unet_collection import utils as k_utils

# Put the project directories at the front of the module search path so the
# project-local imports below resolve (presumably `namelist` and `data_utils`
# live there; verify).
sys.path.insert(0, '/glade/u/home/ksha/NCAR/')
sys.path.insert(0, '/glade/u/home/ksha/NCAR/libs/')

# NOTE(review): the star import hides which names come from `namelist`.
from namelist import *
import data_utils as du

2023-02-23 13:09:49.955593: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1


In [2]:
def set_seeds(seed):
    """Seed every random number generator used by the notebook.

    Seeds NumPy, TensorFlow and Python's `random` module with `seed`, and
    exports `seed` as the PYTHONHASHSEED environment variable (which only
    affects interpreters started after this call).
    """
    np.random.seed(seed)
    tf.random.set_seed(seed)
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    
def create_model(input_shape=(32, 32, 15)):
    """Build the convolutional feature extractor.

    The network has four stages with 32, 64, 96 and 128 channels. Every stage
    applies two 3x3 'same' convolutions, each followed by batch normalization
    and a GELU activation. Between stages a 2x2, stride-2 'valid' convolution
    halves the spatial size and switches to the next channel count. A global
    max pooling at the end reduces the feature map to one 128-value vector
    per sample.

    Parameters
    ----------
    input_shape : tuple
        Shape of a single input sample, (rows, cols, channels). Previously
        this argument was ignored and (32, 32, 15) was always used; the
        default keeps that behaviour.

    Returns
    -------
    keras Model
        Maps an input of `input_shape` to a vector of length 128.
    """
    channels = [32, 64, 96, 128]

    def conv_bn_gelu(X, n_filters):
        # 3x3 convolution without bias (batch normalization supplies the shift)
        X = keras.layers.Conv2D(n_filters, kernel_size=3, padding='same', use_bias=False)(X)
        X = keras.layers.BatchNormalization(axis=-1)(X)
        return keras.layers.Activation("gelu")(X)

    IN = layers.Input(shape=input_shape)
    X = IN

    # Layers are created in the same order as the hand-unrolled version, so
    # weight lists saved from it still line up with `set_weights`.
    for stage, n_filters in enumerate(channels):
        if stage > 0:
            # learned pooling: strided convolution instead of max/avg pooling
            X = keras.layers.Conv2D(n_filters, kernel_size=2, strides=(2, 2), padding='valid', use_bias=True)(X)

        X = conv_bn_gelu(X, n_filters)
        X = conv_bn_gelu(X, n_filters)

    OUT = keras.layers.GlobalMaxPooling2D()(X)
    model = Model(inputs=IN, outputs=OUT)

    return model

def create_model_head():
    """Build the classification head that sits on top of the feature vector.

    Takes a vector of length 128, applies Dense(64) -> ReLU -> batch
    normalization, and ends with a single sigmoid unit.

    Returns
    -------
    keras Model
        Maps a (128,) vector to one value in (0, 1).
    """
    IN_vec = keras.Input((128,))

    hidden = keras.layers.Dense(64)(IN_vec)
    hidden = keras.layers.Activation("relu")(hidden)
    hidden = keras.layers.BatchNormalization()(hidden)

    # The bias starts at -10, so sigmoid(bias) is about 4.5e-5: before
    # training the output is close to zero unless the kernel term is large.
    output_layer = keras.layers.Dense(
        1,
        activation='sigmoid',
        bias_initializer=keras.initializers.Constant(-10),
    )

    return keras.models.Model(inputs=IN_vec, outputs=output_layer(hidden))

def verif_metric(VALID_target, Y_pred):
    """Print and return the mean squared difference between targets and predictions.

    Both arrays are flattened before they are compared, so they only need to
    hold the same number of elements in the same order.
    """
    residual = VALID_target.ravel() - Y_pred.ravel()
    metric = np.mean(residual**2)

    print('{}'.format(metric))

    return metric

def name_extract(filenames):
    """Split batch file names into training and validation sets by date.

    The fourth-from-last run of digits in each name is read as a day index
    into the date list of the data version the name belongs to ('v4x', 'v4',
    or otherwise v3). The resulting date decides the split:

    * before 2020-07-14             -> training
    * 2020-07-14 up to 2020-12-31   -> validation
    * 2021-01-01 or later           -> dropped (in neither list)

    Parameters
    ----------
    filenames : list of str
        File names (or paths) to classify.

    Returns
    -------
    (filename_train, filename_valid) : tuple of lists
        Names in their original order.

    Raises
    ------
    IndexError
        If a name has fewer than four digit groups, or its day index lies
        outside the date range of its version.
    """
    # first day of the validation period; everything earlier is training data
    date_valid_start = datetime(2020, 7, 14)
    # first day after the validation period; this date and later are dropped
    date_valid_end = datetime(2021, 1, 1)

    # first date covered by each data version
    base_v3 = datetime(2018, 7, 15)
    base_v4 = datetime(2020, 12, 3)
    base_v4x = datetime(2019, 10, 1)

    date_list_v3 = [base_v3 + timedelta(days=d) for d in range(365+365+142)]
    date_list_v4 = [base_v4 + timedelta(days=d) for d in range(365+365+30)]
    date_list_v4x = [base_v4x + timedelta(days=d) for d in range(429)]

    filename_train = []
    filename_valid = []

    for name in filenames:

        # 'v4x' has to be tested first because 'v4' is a substring of it
        if 'v4x' in name:
            date_list = date_list_v4x
        elif 'v4' in name:
            date_list = date_list_v4
        else:
            date_list = date_list_v3

        nums = re.findall(r'\d+', name)
        day_index = int(nums[-4])
        sample_date = date_list[day_index]

        if sample_date < date_valid_start:
            filename_train.append(name)
        elif sample_date < date_valid_end:
            filename_valid.append(name)

    return filename_train, filename_valid

In [3]:

# Indices along the last axis of each batch file that are used as model inputs.
ind_pick_from_batch = list(range(15))
L_vars = len(ind_pick_from_batch)

vers = ['v3', 'v4x', 'v4']
leads = [2, 3, 4, 5, 6, 20, 21, 22, 23]

# File lists keyed by '<version>_lead<lead>'.
filenames_pos, filenames_neg = {}, {}

for ver in vers:
    for lead in leads:
        # The batch files of each version (and, for v3, lead time) live in
        # different directories.
        if ver == 'v4':
            path_ = '/glade/campaign/cisl/aiml/ksha/NCAR_batch_v4_temp/'
        elif ver == 'v3' and lead < 23:
            path_ = '/glade/scratch/ksha/DATA/NCAR_batch_v3/'
        elif ver == 'v3' and lead == 23:
            path_ = '/glade/campaign/cisl/aiml/ksha/NCAR_batch_v3/'
        else:
            path_ = '/glade/campaign/cisl/aiml/ksha/NCAR_batch_v4x/'

        tag = '{}_lead{}'.format(ver, lead)
        pos_files = sorted(glob("{}*pos*lead{}.npy".format(path_, lead)))
        neg_files = sorted(glob("{}*neg_neg_neg*lead{}.npy".format(path_, lead)))

        filenames_pos[tag] = pos_files
        filenames_neg[tag] = neg_files

        print('{}, lead{}, pos: {}, neg: {}'.format(ver, lead, len(pos_files), len(neg_files)))

v3, lead2, pos: 5510, neg: 887822
v3, lead3, pos: 4297, neg: 889035
v3, lead4, pos: 3242, neg: 890090
v3, lead5, pos: 2565, neg: 891795
v3, lead6, pos: 2127, neg: 891205
v3, lead20, pos: 5407, neg: 887925
v3, lead21, pos: 6711, neg: 887649
v3, lead22, pos: 7613, neg: 886747
v3, lead23, pos: 8009, neg: 886351
v4x, lead2, pos: 1995, neg: 348553
v4x, lead3, pos: 1618, neg: 349958
v4x, lead4, pos: 1218, neg: 350358
v4x, lead5, pos: 962, neg: 350614
v4x, lead6, pos: 817, neg: 349731
v4x, lead20, pos: 1777, neg: 344659
v4x, lead21, pos: 2209, neg: 344227
v4x, lead22, pos: 2511, neg: 343925
v4x, lead23, pos: 2690, neg: 343746
v4, lead2, pos: 2174, neg: 402858
v4, lead3, pos: 1724, neg: 403308
v4, lead4, pos: 1349, neg: 403683
v4, lead5, pos: 1034, neg: 403998
v4, lead6, pos: 795, neg: 403209
v4, lead20, pos: 1941, neg: 403091
v4, lead21, pos: 2446, neg: 402586
v4, lead22, pos: 2791, neg: 402241
v4, lead23, pos: 2903, neg: 402129


In [4]:
# Per-(version, lead) file lists, keyed by '<version>_lead<lead>'.
filenames_pos_train, filenames_neg_train = {}, {}
filenames_pos_valid, filenames_neg_valid = {}, {}

# The same files pooled over all versions and leads.
pos_train_all, neg_train_all = [], []
pos_valid_all, neg_valid_all = [], []

for ver in vers:
    for lead in leads:
        tag = '{}_lead{}'.format(ver, lead)

        # date-based split of the positive and the negative files
        pos_train, pos_valid = name_extract(filenames_pos[tag])
        neg_train, neg_valid = name_extract(filenames_neg[tag])

        print('pos train: {} pos valid: {} neg train: {} neg valid {}'.format(
            len(pos_train), len(pos_valid), len(neg_train), len(neg_valid)))

        filenames_pos_train[tag] = pos_train
        filenames_neg_train[tag] = neg_train
        filenames_pos_valid[tag] = pos_valid
        filenames_neg_valid[tag] = neg_valid

        pos_train_all.extend(pos_train)
        neg_train_all.extend(neg_train)
        pos_valid_all.extend(pos_valid)
        neg_valid_all.extend(neg_valid)

save_dir = '/glade/work/ksha/NCAR/'

# Pre-built validation set. 16 grid points are trimmed from both ends of
# axes 1 and 2 of the stored inputs (presumably 64x64 -> 32x32; verify).
with h5py.File(save_dir+'CNN_Validation.hdf', 'r') as h5io:
    VALID_input_32 = h5io['VALID_input_64'][:, 16:-16, 16:-16, ...]
    VALID_target = h5io['VALID_target'][...]

pos train: 4856 pos valid: 654 neg train: 742500 neg valid 145322
pos train: 3787 pos valid: 510 neg train: 743569 neg valid 145466
pos train: 2874 pos valid: 368 neg train: 744482 neg valid 145608
pos train: 2289 pos valid: 276 neg train: 746095 neg valid 145700
pos train: 1885 pos valid: 242 neg train: 745471 neg valid 145734
pos train: 4632 pos valid: 775 neg train: 742724 neg valid 145201
pos train: 5774 pos valid: 937 neg train: 742610 neg valid 145039
pos train: 6589 pos valid: 1024 neg train: 741795 neg valid 144952
pos train: 6952 pos valid: 1057 neg train: 741432 neg valid 144919
pos train: 1452 pos valid: 543 neg train: 225736 neg valid 122817
pos train: 1187 pos valid: 431 neg train: 226001 neg valid 123957
pos train: 898 pos valid: 320 neg train: 226290 neg valid 124068
pos train: 721 pos valid: 241 neg train: 226467 neg valid 124147
pos train: 598 pos valid: 219 neg train: 225562 neg valid 124169
pos train: 1136 pos valid: 641 neg train: 220912 neg valid 123747
pos train: 

In [5]:
# Which part of the network this run trains; it is also part of the name
# under which the training cell saves the model.
flag_train = 'base'

# The complementary part.
# NOTE(review): `flag_weights` is not read anywhere in the visible notebook.
flag_weights = 'base' if flag_train == 'head' else 'head'

model_head = create_model_head()
model_base = create_model(input_shape=(32, 32, 15))

# Chain feature extractor and classification head into one end-to-end model.
IN = layers.Input(shape=(32, 32, 15))

VEC = model_base(IN)
OUT = model_head(VEC)

model_final = Model(inputs=IN, outputs=OUT)

# `learning_rate` replaces the deprecated `lr` alias, which newer Keras
# releases no longer accept.
model_final.compile(loss=keras.losses.BinaryCrossentropy(from_logits=False),
                    optimizer=keras.optimizers.Adam(learning_rate=1e-4))


2023-02-23 13:21:43.980858: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2023-02-23 13:21:43.982358: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2023-02-23 13:21:44.038088: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:8a:00.0 name: Tesla V100-SXM2-32GB computeCapability: 7.0
coreClock: 1.53GHz coreCount: 80 deviceMemorySize: 31.75GiB deviceMemoryBandwidth: 836.37GiB/s
2023-02-23 13:21:44.038121: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1
2023-02-23 13:21:44.103672: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublas.so.10
2023-02-23 13:21:44.103706: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublasLt.so.10
2

In [7]:
# Start from the weights stored under round 2 of the 'base' model.
# (`dummy_loader` presumably returns the weight list of the model saved at
# that path; verify against keras_unet_collection.)
weights_dir = '/glade/work/ksha/NCAR/Keras_models/RE2_vgg_base{}/'.format(2)
W_old = k_utils.dummy_loader(weights_dir)
model_final.set_weights(W_old)

In [8]:
# Score the restored model on the validation arrays. `record_temp` is the
# value the training cell takes as its initial record.
Y_pred = model_final.predict([VALID_input_32])
record_temp = verif_metric(VALID_target=VALID_target, Y_pred=Y_pred)

2023-02-23 13:36:17.002028: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
2023-02-23 13:36:17.002463: I tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 2600000000 Hz
2023-02-23 13:36:17.019329: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudnn.so.7
2023-02-23 13:36:18.332716: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublas.so.10


0.025294339000915635


In [22]:
# save_round: suffix of the saved model name; seeds: value passed to set_seeds().
save_round, seeds = 1, 711

In [24]:
# ========== Training configuration ========== #
min_del = 0    # minimum decrease of the validation metric that counts as an improvement
max_tol = 100  # early stopping patience: epochs without improvement before stopping

epochs = 500
batch_size = 200
L_train = 64   # number of batches per epoch

N_pos = 20                  # positive samples per batch (same for 'base' and 'head' runs)
N_neg = batch_size - N_pos  # negative samples per batch

# Batch buffers, pre-filled with NaN so that an unwritten entry is detectable.
X_batch_32 = np.full((batch_size, 32, 32, L_vars), np.nan)
Y_batch = np.full((batch_size, 1), np.nan)

temp_dir = '/glade/work/ksha/NCAR/Keras_models/'

# =========== Model Section ========== #
key = 'RE2_vgg_{}{}'.format(flag_train, save_round)
model_name = key
model_path = temp_dir+model_name

tol = 0  # consecutive epochs without improvement

filename_pos_train = pos_train_all
filename_neg_train = neg_train_all

# ========== Training loop ========== #
L_pos = len(filename_pos_train)
L_neg = len(filename_neg_train)

# Score to beat, taken from the validation cell.
# NOTE(review): `record_temp` is hidden state from an earlier cell; on a fresh
# kernel that cell must be run first, otherwise this record is stale.
record = record_temp
print("Initial record: {}".format(record))

set_seeds(seeds)

for i in range(epochs):
    start_time = time.time()

    # loop of batch
    for j in range(L_train):
        # Fresh random draw of files for this batch (negatives first, then
        # positives; the call order is kept so the random stream is unchanged).
        ind_neg = du.shuffle_ind(L_neg)
        ind_pos = du.shuffle_ind(L_pos)

        file_pick_neg = [filename_neg_train[ind_temp] for ind_temp in ind_neg[:N_neg]]
        file_pick_pos = [filename_pos_train[ind_temp] for ind_temp in ind_pos[:N_pos]]
        file_pick = file_pick_neg + file_pick_pos

        if len(file_pick) != batch_size:
            raise ValueError(
                'Picked {} files but batch_size is {}; not enough positive or negative training files'.format(
                    len(file_pick), batch_size))

        for k, file_name in enumerate(file_pick):
            data = np.load(file_name)

            # keep the selected channels, trimming 16 points from both ends
            # of axes 1 and 2
            for l, c in enumerate(ind_pick_from_batch):
                X_batch_32[k, ..., l] = data[:, 16:-16, 16:-16, c]

            # the label is encoded in the file name
            if 'pos' in file_name:
                Y_batch[k, :] = 1.0
            elif 'neg_neg_neg' in file_name:
                Y_batch[k, :] = 0.0
            else:
                raise ValueError('Cannot derive a label from file name: {}'.format(file_name))

        # Mix positives and negatives within the batch. The shuffled copies
        # are passed on directly so the buffers themselves are reused as-is.
        ind_ = du.shuffle_ind(batch_size)
        model_final.train_on_batch(X_batch_32[ind_, ...], Y_batch[ind_, :])

    # epoch end operations
    Y_pred = model_final.predict([VALID_input_32])
    record_temp = verif_metric(VALID_target, Y_pred)

    if (record - record_temp > min_del):
        print('Validation loss improved from {} to {}'.format(record, record_temp))
        record = record_temp
        tol = 0
        # save
        print('save to: {}'.format(model_path))
        model_final.save(model_path)
    else:
        print('Validation loss {} NOT improved'.format(record_temp))
        # Stop at once when training has diverged. A NaN score compares False
        # against every threshold, so it needs its own check.
        if record_temp >= 2.0 or not np.isfinite(record_temp):
            print('Early stopping')
            break

        tol += 1
        if tol >= max_tol:
            print('Early stopping')
            break

    # reported for every completed epoch, improved or not
    print("--- %s seconds ---" % (time.time() - start_time))


Initial record: 0.04038640111228885




0.039531235440255545
Validation loss improved from 0.04038640111228885 to 0.039531235440255545
save to: /glade/work/ksha/NCAR/Keras_models/RE2_vgg_base1
INFO:tensorflow:Assets written to: /glade/work/ksha/NCAR/Keras_models/RE2_vgg_base1/assets
--- 317.5968677997589 seconds ---




0.03649287790712893
Validation loss improved from 0.039531235440255545 to 0.03649287790712893
save to: /glade/work/ksha/NCAR/Keras_models/RE2_vgg_base1
INFO:tensorflow:Assets written to: /glade/work/ksha/NCAR/Keras_models/RE2_vgg_base1/assets
--- 313.91480016708374 seconds ---




0.03064535618095853
Validation loss improved from 0.03649287790712893 to 0.03064535618095853
save to: /glade/work/ksha/NCAR/Keras_models/RE2_vgg_base1
INFO:tensorflow:Assets written to: /glade/work/ksha/NCAR/Keras_models/RE2_vgg_base1/assets
--- 386.40570640563965 seconds ---




0.03033785999633634
Validation loss improved from 0.03064535618095853 to 0.03033785999633634
save to: /glade/work/ksha/NCAR/Keras_models/RE2_vgg_base1
INFO:tensorflow:Assets written to: /glade/work/ksha/NCAR/Keras_models/RE2_vgg_base1/assets
--- 466.7732915878296 seconds ---




0.039937552980324496
Validation loss 0.039937552980324496 NOT improved
0.03831762728701112
Validation loss 0.03831762728701112 NOT improved
0.04535715821299042
Validation loss 0.04535715821299042 NOT improved
0.03171125491545084
Validation loss 0.03171125491545084 NOT improved


KeyboardInterrupt: 