# developing

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import time
import numpy as np
import tensorflow as tf
from sklearn.model_selection import KFold
import random
import joblib
import gc

# variables

In [None]:
# Global hyperparameters used by the cells below.
output_size = 500  # not referenced anywhere else in the visible notebook
batch_size = 64  # used below to derive steps_per_epoch
N_Folds = 3  # number of KFold splits in the training loop

# confirm train_data 

In [None]:
# Load the pickled training tensor, then replace NaN with 0 and +/-inf with
# large finite values (the np.nan_to_num defaults).
X = joblib.load("/kaggle/input/hms-train-dataset-create-baseline/train_matrix.pkl")
X = np.nan_to_num(X)
X[1]  # notebook display: second sample before scaling

## standardize

from sklearn.preprocessing import StandardScaler

# One StandardScaler per index of the last axis. Each scaler is fitted on the
# 2-D slice X[:, :, k], so every axis-1 position is standardized independently
# across samples (presumably axis 1 is time and axis 2 is the feature channel
# - TODO confirm).
# NOTE(review): the scalers are fitted on the full dataset before the K-fold
# split, so validation samples contribute to the statistics - confirm this is
# acceptable.
scalers = {}
for feature_idx in range(X.shape[2]):
    scaler = StandardScaler()
    X[:, :, feature_idx] = scaler.fit_transform(X[:, :, feature_idx])
    scalers[feature_idx] = scaler

X[1]  # notebook display: same sample after scaling

# Optional debugging subset (disabled):
#X = X[:5000,:,:]
X.shape

In [None]:
# Load the pickled training targets from the same input dataset as X.
Y = joblib.load("/kaggle/input/hms-train-dataset-create-baseline/target.pkl")
# Bare expression: displays the targets when run as a notebook cell.
Y

In [None]:
# Optional debugging subset (disabled); mirrors the commented-out X subset.
#Y = Y[:5000,:]
# Bare expression: displays the target array shape in a notebook cell.
Y.shape

# model

In [None]:
def positional_encoding(length, depth):
  """Build a sinusoidal positional-encoding table.

  The first half of the columns holds sines and the second half cosines of
  `position / 10000**(k / (depth/2))`.

  Args:
    length: number of positions (rows of the table).
    depth: width of the encoding (columns of the table).

  Returns:
    A float32 tensor of shape `(length, depth)`.
  """
  width = int(depth)
  half_depth = depth / 2

  positions = np.arange(length)[:, np.newaxis]               # (length, 1)
  depths = np.arange(half_depth)[np.newaxis, :] / half_depth  # (1, ceil(depth/2))

  angle_rates = 1 / (10000**depths)
  angle_rads = positions * angle_rates                        # (length, ceil(depth/2))

  pos_encoding = np.concatenate(
      [np.sin(angle_rads), np.cos(angle_rads)],
      axis=-1)

  # For an odd depth, np.arange(depth/2) rounds up, so the concatenation has
  # depth + 1 columns. Trim to the requested width so the table can always be
  # added to a depth-wide input. This is a no-op for even depths.
  pos_encoding = pos_encoding[:, :width]

  return tf.cast(pos_encoding, dtype=tf.float32)
                 
class PositionalEmbedding(tf.keras.layers.Layer):
  """Adds a fixed sinusoidal positional encoding to its input.

  No token embedding is applied; the input is used as-is and must be
  broadcast-compatible with a `(1, seq_len, d_model)` table.
  """

  def __init__(self, d_model):
    super().__init__()
    self.d_model = d_model
    # Table is precomputed once for 2048 positions and sliced on every call.
    self.pos_encoding = positional_encoding(length=2048, depth=d_model)

  def call(self, x):
    seq_len = tf.shape(x)[1]
    # The optional sqrt(d_model) rescaling of the input is not applied here.
    encoding = self.pos_encoding[tf.newaxis, :seq_len, :]
    return x + encoding

In [None]:
class BaseAttention(tf.keras.layers.Layer):
  """Common sub-layers for attention blocks.

  Creates a MultiHeadAttention layer, a LayerNormalization layer and an Add
  layer. No `call` is defined here; subclasses provide it.
  """
  def __init__(self, **kwargs):
    # Every keyword argument is forwarded to MultiHeadAttention; none of them
    # is passed to the base Layer constructor.
    super().__init__()
    self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
    self.layernorm = tf.keras.layers.LayerNormalization()
    self.add = tf.keras.layers.Add()

In [None]:
class GlobalSelfAttention(BaseAttention):
  """Self-attention with a residual connection followed by layer norm."""

  def call(self, x):
    # Query, key and value are all the same tensor (self-attention).
    attended = self.mha(query=x, key=x, value=x)
    residual = self.add([x, attended])
    return self.layernorm(residual)

In [None]:
class FeedForward(tf.keras.layers.Layer):
  """Two dense layers plus dropout, with a residual connection and layer norm.

  Args:
    d_model: output width of the second dense layer.
    dff: width of the hidden (ReLU) dense layer.
    dropout_rate: rate of the dropout applied after the second dense layer.
  """

  def __init__(self, d_model, dff, dropout_rate=0.1):
    super().__init__()
    # Layers are created in the same order as before so Keras tracking and
    # weight ordering stay unchanged.
    expand = tf.keras.layers.Dense(dff, activation='relu')
    project = tf.keras.layers.Dense(d_model)
    drop = tf.keras.layers.Dropout(dropout_rate)
    self.seq = tf.keras.Sequential([expand, project, drop])
    self.add = tf.keras.layers.Add()
    self.layer_norm = tf.keras.layers.LayerNormalization()

  def call(self, x):
    transformed = self.seq(x)
    residual = self.add([x, transformed])
    return self.layer_norm(residual)

In [None]:
class EncoderLayer(tf.keras.layers.Layer):
  """One encoder block: self-attention followed by the feed-forward sub-layer.

  Args:
    d_model: used as the attention `key_dim` and as the feed-forward output
      width.
    num_heads: number of attention heads.
    dff: hidden width of the feed-forward sub-layer.
    dropout_rate: dropout rate used by both the attention and the
      feed-forward sub-layer.
  """
  def __init__(self,*, d_model, num_heads, dff, dropout_rate=0.1):
    super().__init__()

    self.self_attention = GlobalSelfAttention(
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate)

    # Pass dropout_rate explicitly so the feed-forward dropout follows the
    # configured rate instead of silently falling back to FeedForward's
    # default of 0.1.
    self.ffn = FeedForward(d_model, dff, dropout_rate=dropout_rate)

  def call(self, x):
    x = self.self_attention(x)
    x = self.ffn(x)
    return x

In [None]:
class RnnModel(tf.keras.Model):
  """Stack of encoder layers with a pooled, 6-way softmax head.

  Despite the name, no recurrent layer is used: the input gets a positional
  encoding, passes through `num_layers` EncoderLayer blocks, is averaged over
  the sequence axis and is mapped by two dense layers to 6 softmax outputs.

  Args:
    num_layers: number of EncoderLayer blocks.
    d_model: width of the positional encoding / encoder layers.
    num_heads: attention heads per encoder layer.
    dff: hidden width of each feed-forward sub-layer.
    dropout_rate: rate for the input dropout and the encoder layers.
  """
  def __init__(self, *, num_layers, d_model, num_heads,
               dff, dropout_rate=0.1):
    super().__init__()

    self.d_model = d_model
    self.num_layers = num_layers
    self.pos_embedding = PositionalEmbedding(d_model=d_model)

    self.enc_layers = [
        EncoderLayer(d_model=d_model,
                     num_heads=num_heads,
                     dff=dff,
                     dropout_rate=dropout_rate)
        for _ in range(num_layers)]
    self.dropout = tf.keras.layers.Dropout(dropout_rate)
    self.layer1_100 = tf.keras.layers.Dense(108, activation='relu')
    # NOTE(review): ReLU is applied to the 6 outputs before the softmax, which
    # clamps negative logits to 0 - confirm this is intended.
    self.layer1_6 = tf.keras.layers.Dense(6,activation='relu')
    self.add = tf.keras.layers.Add()  # not used in call(); kept as an attribute
    # Weight-free layers are built once here instead of being re-instantiated
    # on every call. They are created after the weighted layers so the
    # ordering of saved weights is unaffected.
    self.pool = tf.keras.layers.GlobalAveragePooling1D()
    self.softmax = tf.keras.layers.Softmax(-1)

  def call(self, x):
    # `x` is fed straight into the positional encoding (there is no embedding
    # layer), so it is expected to be (batch, seq_len, d_model) already.
    x = self.pos_embedding(x)
    x = self.dropout(x)
    for enc_layer in self.enc_layers:
      x = enc_layer(x)
    # Average over the sequence axis: (batch, seq_len, d) -> (batch, d).
    x = self.pool(x)
    x = self.layer1_100(x)
    x = self.layer1_6(x)
    return self.softmax(x)

# train model

In [None]:
# Maximum number of epochs per fold (EarlyStopping may end a fold sooner).
N_EPOCHS = 71
# NOTE(review): computed from the full dataset size, not from the size of a
# training fold, which is smaller once KFold has split X.
steps_per_epoch = X.shape[0]//batch_size
#val_steps_per_epoch = 64

In [None]:
def loss_fn(labels, targets):
    """Return the mean absolute difference between `labels` and `targets`.

    The mean is taken over every element, yielding a single scalar.
    """
    absolute_error = tf.math.abs(labels - targets)
    return tf.math.reduce_mean(absolute_error)
save_folder = '/kaggle/working'
def create_save_callback(fold):
    """Create a Keras callback that periodically saves the model weights.

    The directory `{save_folder}/{fold}_weights/` is created if it does not
    exist. Weights are written at the end of epoch 5 and of every epoch whose
    (0-based) index is a multiple of 10.

    Args:
        fold: fold identifier used in the output directory name.

    Returns:
        A `tf.keras.callbacks.Callback` instance bound to `fold`.
    """
    # exist_ok=True tolerates an already existing directory without hiding
    # real failures (e.g. permissions) the way a bare `except: pass` would.
    os.makedirs(f'{save_folder}/{fold}_weights/', exist_ok=True)

    class save_model_callback(tf.keras.callbacks.Callback):
        def __init__(self,fold):
            super().__init__()
            self.fold = fold

        def on_epoch_end(self, epoch: int, logs=None):
            if epoch == 5 or (epoch)%10 == 0:
                self.model.save_weights(f"{save_folder}/{self.fold}_weights/model_epoch_{epoch}.h5")

    return save_model_callback(fold)

In [None]:
# Select a distribution strategy: TPU when one can be reached, otherwise the
# default strategy.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect(tpu="local") # "local" for 1VM TPU
    print('Running on TPU ')#, tpu.cluster_spec().as_dict()['worker'])
except Exception:
    # Any connection failure means "no TPU". Unlike a bare `except:`, this
    # still lets KeyboardInterrupt / SystemExit propagate.
    tpu = None
if tpu:
    strategy = tf.distribute.TPUStrategy(tpu)
    print("on TPU")
    print("REPLICAS: ", strategy.num_replicas_in_sync)
else:
    print("on GPU")
    strategy = tf.distribute.get_strategy()

# Train one freshly initialised model per fold.
with strategy.scope():
    kf = KFold(n_splits=N_Folds, shuffle=True, random_state=100)
    for fold, (train_idx, valid_idx) in enumerate(kf.split(X, Y)):
        print(fold,"/",N_Folds , "###################")
        X_train, X_valid = X[train_idx,:], X[valid_idx,:]
        y_train, y_valid = Y[train_idx], Y[valid_idx]

        learning_rate = 1e-4
        epsilon = 1e-15
        loss = loss_fn #tf.keras.losses.MeanSquaredError()

        optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=epsilon)
        model = RnnModel(num_layers=3,d_model=120,num_heads=1,dff=200,dropout_rate=0.5)
        model.compile(optimizer=optimizer, loss=loss)
        # NOTE(review): early stopping watches the training loss although
        # validation data is supplied - confirm 'val_loss' is not what was meant.
        callback1 = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=5)
        # Pass the configured batch_size explicitly and let Keras derive the
        # number of steps from the fold. The global steps_per_epoch is computed
        # from the full dataset, which is larger than any training fold, and
        # batch_size was otherwise never handed to fit().
        model.fit(
            x=X_train, y=y_train,
            validation_data=(X_valid, y_valid),
            epochs=N_EPOCHS,
            batch_size=batch_size,
            #validation_steps=val_steps_per_epoch,
            verbose=2,
            callbacks=[callback1, create_save_callback(fold)],
        )
        # Drop the model before the next fold to release its memory.
        del model
        gc.collect()

# Persist the dict of fitted StandardScalers (presumably so the same scaling
# can be re-applied at inference time). Despite the file name, this pickle
# holds the scalers, not the model weights.
joblib.dump(scalers,"transformer.pkl")