#export TF_XLA_FLAGS=--tf_xla_auto_jit=2
#export TFA_XLA_FLAGS=--tf_xla_cpu_global_jit
#export XLA_FLAGS=--xla_hlo_profile
from __future__ import absolute_import, division, print_function, unicode_literals
# to suppress future warning from tf + np 1.17 combination.
import warnings
warnings.filterwarnings('ignore',category=FutureWarning)
#runtimewarning is from powertransformer
warnings.filterwarnings('ignore',category=RuntimeWarning)
epsilon = 1e-5
# libraries for read in data
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from json import JSONDecoder, JSONDecodeError # for reading the JSON data files
import re # for regular expressions
import os # for os related operations
import matplotlib.pyplot as plt
# %matplotlib inline
from scipy.stats import skew
# libraries needed for machine learning
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import PowerTransformer as pt, StandardScaler as ss, MinMaxScaler as mms, RobustScaler as rs, FunctionTransformer as ft, power_transform
from sklearn.compose import ColumnTransformer as ct
from sklearn.pipeline import make_pipeline, Pipeline, FeatureUnion as fu
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import *
import tensorflow as tf
tf.config.optimizer.set_jit(True)
import tensorflow_addons as tfa
from tensorflow.keras.models import Sequential, load_model
#from keras.layers import Dense, Activation, Dropout, TimeDistributed, LSTM, Flatten, Bidirectional
from tensorflow.keras.layers import *
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import backend as K
from tensorflow.keras.callbacks import *
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
import pickle
import multiprocessing as mp
FLOAT_TYPE = 'float32'
K.set_floatx(FLOAT_TYPE)
# all features
feature_names = ['TOTUSJH', 'TOTBSQ', 'TOTPOT', 'TOTUSJZ', 'ABSNJZH', 'SAVNCPP', 'USFLUX', 'TOTFZ', 'MEANPOT', 'EPSZ', 'MEANSHR', 'SHRGT45', 'MEANGAM', 'MEANGBT', 'MEANGBZ', 'MEANGBH', 'MEANJZH', 'TOTFY', 'MEANJZD', 'MEANALP', 'TOTFX', 'EPSY', 'EPSX', 'R_VALUE', 'XR_MAX']
# we select all features
selected_features = feature_names
"""
# By inspecting the histograms of the relevant features, they can be grouped into four categories:
# right skewed with extreme outliers, right skewed without extreme outliers, left skewed with extreme outliers, and non-skewed.
right_skewed_features = ['TOTUSJH', 'TOTBSQ', 'TOTPOT', 'TOTUSJZ', 'ABSNJZH', 'SAVNCPP', 'USFLUX', 'EPSZ', 'MEANSHR', 'MEANGAM', 'MEANGBH', 'MEANJZD']
right_skewed_features_with_ol = ['TOTBSQ', 'TOTPOT', 'TOTUSJZ', 'SAVNCPP', 'USFLUX', 'MEANSHR', 'MEANGAM', 'MEANGBH', 'MEANJZD']
right_skewed_features_without_ol = ['TOTUSJH', 'ABSNJZH', 'EPSZ']
left_skewed_features_with_ol = ['TOTFZ']
non_skewed_features = ['MEANGBT', 'R_VALUE']
selected_features = right_skewed_features + non_skewed_features
# get the indices of the features in each category
indice_right_skewed_with_ol = []
indice_right_skewed_without_ol = []
indice_non_skewed = []
for i in range(0,len(selected_features)):
if selected_features[i] in right_skewed_features_with_ol:
indice_right_skewed_with_ol.append(i)
elif selected_features[i] in right_skewed_features_without_ol:
indice_right_skewed_without_ol.append(i)
elif selected_features[i] in non_skewed_features:
indice_non_skewed.append(i)
scale_params_right_skewed = pd.read_csv('scale_params_right_skewed.csv')
scale_params_right_skewed.set_index('Unnamed: 0', inplace=True)
scale_params_non_skewed = pd.read_csv('scale_params_non_skewed.csv')
scale_params_non_skewed.set_index('Unnamed: 0', inplace=True)
"""
# Functions for reading in data from .json files
def decode_obj(line, pos=0, decoder=JSONDecoder()):
no_white_space_regex = re.compile(r'[^\s]')
while True:
match = no_white_space_regex.search(line, pos)
# line is a long string with data type `str`
if not match:
# if the line is full of white space, get out of this func
return
# pos will be the position for the first non-white-space character in the `line`.
pos = match.start()
try:
            # JSONDecoder().raw_decode(line, pos) returns a tuple (obj, pos):
            # obj is a dict, and pos is the index just past the decoded object,
            # so the loop can keep scanning the same line for further objects.
obj, pos = decoder.raw_decode(line, pos)
# obj = {'id': 1, 'classNum': 1, 'values',feature_dic}
# here feature_dic is a dict with all features.
# its key is feature name as a str
# its value is a dict {"0": float, ..., "59": float}
except JSONDecodeError as err:
print('Oops! something went wrong. Error: {}'.format(err))
        # `yield` (instead of `return`) makes this function a generator:
        # each decoded obj is produced lazily, one at a time, and discarded once consumed.
yield obj
def get_obj_with_last_n_val(line, n):
    # since decode_obj(line) is a generator,
    # next(generator) advances it and returns the first decoded object
obj = next(decode_obj(line)) # type:dict
id = obj['id']
class_label = obj['classNum']
data = pd.DataFrame.from_dict(obj['values']) # type:pd.DataFrame
data.set_index(data.index.astype(int), inplace=True)
last_n_indices = np.arange(0, 60)[-n:]
data = data.loc[last_n_indices]
return {'id': id, 'classType': class_label, 'values': data}
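# With n = 60 (as used below), last_n_indices is simply 0..59 and the full series is kept;
# a smaller n would keep only the most recent n time steps.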
def convert_json_data_to_nparray(data_dir: str, file_name: str, features):
"""
Generates a dataframe by concatenating the last values of each
multi-variate time series. This method is designed as an example
to show how a json object can be converted into a csv file.
:param data_dir: the path to the data directory.
:param file_name: name of the file to be read, with the extension.
:return: the generated dataframe.
"""
fname = os.path.join(data_dir, file_name)
all_df, labels, ids = [], [], []
with open(fname, 'r') as infile: # Open the file for reading
for line in infile: # Each 'line' is one MVTS with its single label (0 or 1).
obj = get_obj_with_last_n_val(line, 60) # obj is a dictionary
# if the classType in the sample is NaN, we do not read in this sample
if np.isnan(obj['classType']):
pass
else:
                # a pd.DataFrame with shape = time_steps x number of features;
                # here time_steps = 60, and the number of features is the length of the list `features`.
                df_selected_features = obj['values'][features]
                # a list of np.array, each with shape = time_steps x number of features.
                # I use a DataFrame here so that the feature names are kept, which we need
                # later for scaling features.
all_df.append(np.array(df_selected_features))
labels.append(obj['classType']) # list of integers, each integer is either 1 or 0
ids.append(obj['id']) # list of integers
return all_df, labels, ids
############################################################
############################################################
def get_obj_with_last_n_val_test(line, n):
    # since decode_obj(line) is a generator,
    # next(generator) advances it and returns the first decoded object
obj = next(decode_obj(line)) # type:dict
id = obj['id']
data = pd.DataFrame.from_dict(obj['values']) # type:pd.DataFrame
data.set_index(data.index.astype(int), inplace=True)
last_n_indices = np.arange(0, 60)[-n:]
data = data.loc[last_n_indices]
return {'id': id, 'values': data}
def convert_json_data_to_nparray_test(data_dir: str, file_name: str, features):
"""
Generates a dataframe by concatenating the last values of each
multi-variate time series. This method is designed as an example
to show how a json object can be converted into a csv file.
:param data_dir: the path to the data directory.
:param file_name: name of the file to be read, with the extension.
:return: the generated dataframe.
"""
fname = os.path.join(data_dir, file_name)
    all_df, ids = [], []
    with open(fname, 'r') as infile:  # Open the file for reading
        for line in infile:  # Each `line` is one unlabeled MVTS.
            obj = get_obj_with_last_n_val_test(line, 60)  # obj is a dictionary
            # a pd.DataFrame with shape = time_steps x number of features;
            # here time_steps = 60, and the number of features is the length of the list `features`.
            df_selected_features = obj['values'][features]
            # a list of np.array, each with shape = time_steps x number of features.
            # I use a DataFrame here so that the feature names are kept, which we need
            # later for scaling features.
            all_df.append(np.array(df_selected_features))
            ids.append(obj['id'])  # list of integers
    return all_df, ids
##################################################################
##################################################################
print('Files contained in the ./input directory include:')
print(os.listdir("./input"))
def diff_func(x):
# x is 2d array
return np.diff(x, axis=0)
#return np.asarray([i - x[0] for i in x])[1:]
def timeseries_detrending(X):
# X is np.array, 3D
X_2D = [*zip(X[i] for i in range(X.shape[0]))]
with mp.Pool() as pool:
X_new = pool.starmap(diff_func, X_2D)
return np.asarray(X_new)
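# Note: np.diff along axis 0 shortens each sample by one row, so detrended samples have
# time_steps - 1 (59 instead of 60) rows; if this is applied, time_steps is recomputed
# from X_train.shape[1] further below before the model is built.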
def detrender(x):
    # x is 2d array
    x = pd.DataFrame(x)
    output_df = x.diff()[1:]
    return output_df.values
#rescaling to range 0-1
def minmax_func(x):
# x is 2d array
return (x - np.nanmin(x, axis=0))/(np.nanmax(x, axis=0) - np.nanmin(x, axis=0) + epsilon)
def timeseries_normalization(X):
# X is np.array, 3D
X_2D = [*zip(X[i] for i in range(X.shape[0]))]
with mp.Pool() as pool:
X_new = pool.starmap(minmax_func, X_2D)
return np.asarray(X_new)
def impute_func(x):
    # x is 2d array; impute missing values column-wise
    #return IterativeImputer().fit_transform(x)
    return SimpleImputer().fit_transform(x)
def timeseries_imputation(X):
# X is np.array, 3D
X_2D = [*zip(X[i] for i in range(X.shape[0]))]
with mp.Pool() as pool:
X_new = pool.starmap(impute_func, X_2D)
return np.asarray(X_new)
def powertransform_func(x):
# x is 2d array
return pt().fit_transform(x)
def timeseries_powertransformation(X):
# X is np.array, 3D
X_2D = [*zip(X[i] for i in range(X.shape[0]))]
with mp.Pool() as pool:
X_new = pool.starmap(powertransform_func, X_2D)
return np.asarray(X_new)
def log_transform(x):
min_vals = np.nanmin(x, axis=0)
x += (abs(min_vals) - min_vals)/2. + epsilon
return np.log1p(x)
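# Worked example of the shift above: if a feature's minimum is -3, the shift is
# (abs(-3) - (-3))/2 + epsilon = 3 + epsilon, so the new minimum is epsilon > 0 and log1p is safe;
# for an all-positive feature the shift reduces to just epsilon.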
def logged_features(x):
x = pd.DataFrame(x)
feature_skew = x.skew(axis=0,skipna=True)
log_features = feature_skew[abs(feature_skew) > 0.9].index
scale_features = [name for name in feature_skew.index if name not in log_features]
return x[log_features].values
def unlogged_features(x):
x = pd.DataFrame(x)
feature_skew = x.skew(axis=0,skipna=True)
log_features = feature_skew[abs(feature_skew) > 0.9].index
scale_features = [name for name in feature_skew.index if name not in log_features]
return x[scale_features].values
p1 = make_pipeline(ft(logged_features, validate=False), ft(log_transform, validate=False))
p2 = make_pipeline(ft(unlogged_features, validate=False))
p3 = fu([("p1",p1), ("p2", p2)])
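# p3 concatenates the two branches column-wise, so its output has the log-transformed
# (high-skew) columns first, followed by the untouched low-skew columns; the column order
# therefore generally differs from the input order.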
def nan_standard_scaler(x):
output_arr = (x - np.nanmean(x))/(np.nanstd(x) + epsilon)
return output_arr
def timeseries_nan_ss(X):
# X is np.array, 3D
X_2D = [*zip(X[i] for i in range(X.shape[0]))]
with mp.Pool() as pool:
X_new = pool.starmap(nan_standard_scaler, X_2D)
return np.asarray(X_new)
def nan_robust_scaler(x):
x = pd.DataFrame(x)
output_df = (x - x.median(skipna=True))/(x.mad(skipna=True) + epsilon)
return output_df.values
def timeseries_nan_rs(X):
# X is np.array, 3D
X_2D = [*zip(X[i] for i in range(X.shape[0]))]
with mp.Pool() as pool:
X_new = pool.starmap(nan_robust_scaler, X_2D)
return np.asarray(X_new)
#pre_preprocessor_per_timestep = ct(transformers=[('log', ft(log_transform, validate=False), list_of_features_to_log)], remainder='passthrough')
#('scale', ss(), scale_features)],
imputer_per_sample = make_pipeline(ft(timeseries_imputation, validate=False))
pt_per_sample = ft(timeseries_powertransformation, validate=False)
nan_ss_per_sample = ft(timeseries_nan_ss, validate=False)
nan_rs_per_sample = ft(timeseries_nan_rs, validate=False)
dt_per_sample = ft(timeseries_detrending, validate=False)
norm_per_sample = ft(timeseries_normalization, validate=False)
preprocessor_per_sample = make_pipeline(nan_rs_per_sample,dt_per_sample)
# if applied, preprocessor_per_sample will first robustly standardize each feature in the given sample, then detrend via row-wise differencing
imputer_per_timestep = SimpleImputer(strategy='median')
nan_ss_per_timestep = ft(nan_standard_scaler, validate=False)
nan_rs_per_timestep = ft(nan_robust_scaler, validate=False)
#dt_per_timestep doesn't make much sense
#dt_per_timestep = ft(detrender, validate=False)
preprocessor_per_timestep = make_pipeline(p3, nan_rs_per_timestep, imputer_per_timestep)#, mms())
# preprocessor_per_timestep first takes the log of features whose skew exceeds a threshold ('p3'), then robustly standardizes each feature, then does median imputation
path_to_data = "./input"
file_id = 'fold1Training'
file_name = file_id+'.json'
#file_name
#file_name_test = "testSet.json"
"""
## Run this commented part only once, so you are able to save the pickled files. Then comment it out.
# Read in all data in a single file
all_input, labels, ids = convert_json_data_to_nparray(path_to_data, file_name, selected_features)
##all_input_test, labels_test, ids_test = convert_json_data_to_nparray(path_to_data, file_name_test, selected_features)
# Change X and y to numpy.array in the correct shape.
X = np.array(all_input)
y = np.array([labels]).T
y = np.squeeze(y).reshape(-1,1)
labels = y.copy()
#print("The shape of X is (sample_size x time_steps x feature_num) = {}.".format(X.shape))
#print("the shape of y is (sample_size x 1) = {}, because it is a binary classification.".format(y.shape))
pickle.dump(X, open(file_id + ".pkl", "wb"))
pickle.dump(y, open(file_id + "_output.pkl", "wb"))
"""
# read from pickle
X = pickle.load(open(file_id + ".pkl", "rb"))
y = pickle.load(open(file_id + "_output.pkl", "rb"))
labels = y.copy()
#X_test = np.array(all_input_test)
#y_test = np.array([labels_test]).T
#print("The shape of X_test is (sample_size x time_steps x feature_num) = {}.".format(X_test.shape))
#print("the shape of y_test is (sample_size x 1) = {}, because it is a binary classification.".format(y_test.shape))
"""
#X = preprocessor_per_sample.fit_transform(X)
#X_test = preprocessor_per_sample.fit_transform(X_test)
# write to pickle
pickle.dump(X, open(file_id + "_modified.pkl", "wb"))
#pickle.dump(X_test, open(file_id_test + "_modified.pkl", "wb"))
pickle.dump(y, open(file_id + "_output_modified.pkl", "wb"))
#pickle.dump(y_test, open(file_id_test + "_output_modified.pkl", "wb"))
# read from pickle
X = pickle.load(open(file_id + "_modified.pkl", "rb"))
#X_test = pickle.load(open(file_id_test + "_modified.pkl", "rb"))
y = pickle.load(open(file_id + "_output_modified.pkl", "rb"))
#y_test = pickle.load(open(file_id_test + "_output_modified.pkl", "rb"))
labels = y.copy()
"""
# Define metrics that do not depend on the imbalance of positive and negative classes in the validation/test set.
# sensitivity = true_positive/(total real positive) = tp/(tp+fn)
# sensitivity is the same as recall
def sensitivity(y_true, y_pred):
y_pred = K.clip(y_pred, 0, 1)
true_positives = K.sum(K.round(y_true * y_pred))
    # K.clip(x, a, b): x is a tensor, a and b are numbers; clip maps any element of x
    # below a to a, and any element of x above b to b.
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    # K.epsilon() is a small positive constant (> 0 and << 1) that avoids division by zero.
sen = recall = true_positives / (possible_positives + K.epsilon())
return sen
# Specificity = true_negative/(total real negative) = tn/(tn+fp)
def specificity(y_true, y_pred):
true_negatives = K.sum(K.round(K.clip((1-y_true) * (1-y_pred), 0, 1)))
possible_negatives = K.sum(K.round(K.clip(1-y_true, 0, 1)))
spec = true_negatives / (possible_negatives + K.epsilon())
return spec
# Precision = true_positives/predicted_positives = tp/(tp+fp)
def precision(y_true, y_pred):
# just in case of hipster activation at the final layer
y_pred = K.clip(y_pred, 0, 1)
true_positives = K.sum(K.round(y_true * y_pred))
predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
prec = true_positives / (predicted_positives + K.epsilon())
return prec
# Informedness = sensitivity + specificity - 1
def informedness(y_true, y_pred):
return sensitivity(y_true, y_pred)+specificity(y_true, y_pred)-1
# f1 = 2/((1/precision) + (1/recall))
def f1_score(y_true, y_pred):
prec = precision(y_true, y_pred)
sen = sensitivity(y_true, y_pred)
f1 = 2*((prec*sen)/(prec + sen + K.epsilon()))
return f1
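# Quick numeric sanity check of these formulas on plain 0/1 vectors:
# y_true = [1, 0, 1, 1], y_pred = [1, 0, 0, 1] gives tp = 2, possible positives = 3,
# predicted positives = 2, so sensitivity = 2/3, precision = 1, and f1 = 2*(1*2/3)/(1 + 2/3) = 0.8.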
# check NaN in y, X #, X_scaled
print('There are {} NaN in y.'.format(np.isnan(y).sum()))
print('There are {} NaN in X.'.format(np.isnan(X).sum()))
#print('There are {} NaN in X_scaled.'.format(np.isnan(X_scaled).sum()))
# one-hot encode y
from sklearn.preprocessing import OneHotEncoder
onehot_encoder = OneHotEncoder(sparse=False)
y = np.asarray(onehot_encoder.fit_transform(y), dtype=FLOAT_TYPE)
y_dim = np.shape(y)[1] # y=0 if no flare, y=1 if flare
# alpha, gamma, and beta_cb are to be set by CV
# parameters for focal loss
alpha = .5
gamma = 2.
# beta_cb is also to be set by CV.
# beta_cb gives alpha_cb, the per-class weight used by the class-balanced loss.
beta_cb = 0.99
unique_targets, unique_targets_cnts = np.unique(labels, return_counts=True)
#y_cnts is basically normalized_unique_targets_cnts
y_cnts = unique_targets_cnts/len(labels)
# class-balanced weighting: (1 - beta) / (1 - beta**n), as in the class-balanced loss paper
alpha_cb = (1-beta_cb)/(1-beta_cb**y_cnts)
# 1/sqrt(cnts)
#alpha_cb = 1/np.sqrt(unique_targets_cnts)
alpha_cb_norm_fac = len(unique_targets)/np.sum(alpha_cb)
alpha_cb *= alpha_cb_norm_fac
alpha_cb = np.array(alpha_cb, dtype=FLOAT_TYPE)
print('alpha_cb =', alpha_cb)
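# Illustrative numbers (assuming, for the sake of example, y_cnts ~ [0.9, 0.1] and beta_cb = 0.99):
# the raw weights are roughly [1.11, 9.96], i.e. about 9x more weight on the rare class;
# after normalizing to sum to len(unique_targets) = 2, alpha_cb ~ [0.20, 1.80].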
def wrapped_loss(alpha_cb, alpha=alpha, gamma=gamma):
def weighted_crossentropy(targets, inputs):
# we use a modulating factor which down-weights the loss assigned to well-classified examples to prevent numerous easy examples from overwhelming the classifier.
y_class = K.argmax(inputs, axis=1)
w = tf.gather(alpha_cb, y_class)
# crossentropy
cce = K.categorical_crossentropy(targets, inputs)
# weighted crossentropy
wcce = w * cce
###
cce_exp = K.exp(-cce)
###
# focal loss
fl = alpha * K.pow((1-cce_exp), gamma) * cce
# weighted focal loss
wfl = w * K.pow((1-cce_exp), gamma) * cce
return wfl
return weighted_crossentropy
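# The modulating factor (1 - exp(-cce))**gamma down-weights easy examples: with gamma = 2,
# a well-classified sample with cce ~ 0.1 is scaled by about (1 - e**-0.1)**2 ~ 0.009,
# while a hard sample with cce ~ 2 keeps about (1 - e**-2)**2 ~ 0.75 of its loss.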
def mish(x):
return x*K.tanh(K.softplus(x))
# Build LSTM networks using keras
num_epochs = 100
num_epochs_lr = 1
# Set some hyperparameters
n_sample = len(y)
time_steps = X.shape[1]#60
batch_size = 512
feature_num = len(selected_features) # 25 features per time step
hidden_size = feature_num
use_dropout = True
use_callback = False # to be added later
adam = Adam(lr=0.001)
radam = tfa.optimizers.RectifiedAdam(
lr=1e-1,
total_steps=10000,
warmup_proportion=0.1,
min_lr=1e-5)
ranger = tfa.optimizers.Lookahead(radam, sync_period=6, slow_step_size=0.5)
#from keras_self_attention import SeqSelfAttention
# doesn't work with tensorflow 2.0
def classifier(alpha_cb=alpha_cb, time_steps = time_steps, optimizer='adam', dropout=0.50, recurrent_dropout=0.25):
model = Sequential()
#model.add(LSTM(units=hidden_size*2, input_shape=(time_steps,feature_num), return_sequences=True, dropout=dropout, recurrent_dropout=recurrent_dropout))
model.add(Bidirectional(LSTM(units=hidden_size, input_shape=(time_steps,feature_num), return_sequences=True, dropout=dropout, recurrent_dropout=recurrent_dropout), merge_mode = 'concat'))
#model.add(LSTM(units=hidden_size, return_sequences=True, dropout=dropout, recurrent_dropout=recurrent_dropout))
model.add(Bidirectional(LSTM(units=hidden_size, return_sequences=True, dropout=dropout, recurrent_dropout=recurrent_dropout), merge_mode = 'concat'))
model.add(Bidirectional(LSTM(units=hidden_size, return_sequences=True, dropout=dropout, recurrent_dropout=recurrent_dropout), merge_mode = 'concat'))
#model.add(LSTM(units=hidden_size, return_sequences=True))
model.add(TimeDistributed(Dense(int(hidden_size), activation=mish)))
model.add(Flatten())
model.add(Dense(1024))
model.add(Dropout(rate=dropout))
model.add(Dense(1024))
model.add(Dropout(rate=dropout))
model.add(Dense(y_dim)) # Dense layer has y_dim=1 or 2 neuron.
model.add(Activation('softmax'))
model.compile(loss=wrapped_loss(alpha_cb), optimizer=optimizer, metrics=[f1_score])
return model
classifier().summary()
# Split X, y into training and validation sets
# define k-fold cross validation test harness
seed = 100
n_splits = 10
kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
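# StratifiedKFold stratifies on the original integer labels, so the pre-one-hot `labels` copy
# is passed to split(), while the one-hot `y` is what gets indexed for the train/validation targets.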
cvscores = []
for train, val in kfold.split(np.asarray(labels), np.asarray(labels)):
X_train = X[train]
X_val = X[val]
y_train = y[train]
y_val = y[val]
#"""
#pre-processing per sample
#X_train = preprocessor_per_sample.fit_transform(X_train)
#X_val = preprocessor_per_sample.fit_transform(X_val)
    # pre-processing per time step: fit on the training folds, then apply to both train and validation
    for i in range(X_train.shape[1]):
        preprocessor_per_timestep.fit(X_train[:, i])
        X_train[:, i] = preprocessor_per_timestep.transform(X_train[:, i])
        X_val[:, i] = preprocessor_per_timestep.transform(X_val[:, i])
print('There are {} NaN in y_train.'.format(np.isnan(y_train).sum()))
print('There are {} NaN in X_train.'.format(np.isnan(X_train).sum()))
time_steps = X_train.shape[1]#60
clf = KerasClassifier(classifier, alpha_cb=alpha_cb, time_steps=time_steps, optimizer=adam, epochs=num_epochs, batch_size=batch_size, verbose=2, validation_data=(X_val,y_val), callbacks=[TerminateOnNaN()])
history = clf.fit(X_train, y_train)
final_val_score = history.history['val_f1_score'][-1]
print('val_f1_score = %.2f' %final_val_score)
#lr_callback.plot_schedule(clip_beginning=10, clip_endding=5)
cvscores.append(final_val_score)