#### Neural Network Baseline

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import random
from datetime import datetime
from pathlib import Path
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# Seed the Python, NumPy and TensorFlow random number generators with the same
# fixed value so that shuffling and weight initialisation start from a known state.
random.seed(0)
np.random.seed(0)
tf.random.set_seed(0)

In [None]:
class Tokenizer():
    """Encode categorical columns as integer ids in one shared vocabulary.

    Every column gets its own ``preprocessing.LabelEncoder``. The ids produced
    for column ``i`` are shifted by the number of classes seen in all previous
    columns, so ids from different columns never collide and a single
    embedding table of size ``vocab_size`` can serve every column.

    Attributes are populated by ``fit``:
        encoders   -- one fitted LabelEncoder per column
        offsets    -- first id assigned to each column
        vocab_size -- total number of distinct ids over all columns
    """

    def __init__(self):
        # Start in a well-defined "unfitted" state so transform() can report a
        # clear error instead of failing on a missing attribute.
        self.encoders = []
        self.offsets = []
        self.vocab_size = 0

    def fit(self, x):
        """Fit one label encoder per column of the 2-D array ``x``.

        Returns ``self`` so calls can be chained.
        """
        self.encoders = []
        self.offsets = []
        self.vocab_size = 0
        for i in range(x.shape[1]):
            encoder = preprocessing.LabelEncoder().fit(x[:, i])
            self.encoders.append(encoder)
            # This column's ids start where the previous columns' ids ended.
            self.offsets.append(self.vocab_size)
            self.vocab_size += len(encoder.classes_)

        return self

    def transform(self, x):
        """Return an int64 array shaped like ``x`` holding the offset token ids.

        Raises:
            ValueError: if ``x`` does not have the same number of columns the
                tokenizer was fitted on (including the unfitted case).
        """
        if x.shape[1] != len(self.encoders):
            raise ValueError(
                "Tokenizer was fitted on {} column(s) but transform() received {}; "
                "call fit() with matching data first.".format(len(self.encoders), x.shape[1])
            )

        # Token ids are indices into an embedding table, so keep them integral
        # rather than letting np.zeros default to float64.
        x_idx = np.zeros(x.shape, dtype=np.int64)
        for i, (encoder, offset) in enumerate(zip(self.encoders, self.offsets)):
            x_idx[:, i] = encoder.transform(x[:, i]) + offset

        return x_idx

class EmbeddingRegression():
    """Regression network over numeric features plus embedded categorical tokens.

    The categorical token ids are looked up in one shared embedding table,
    flattened, concatenated with the numeric features and passed through two
    batch-normalised dense hidden layers to a single linear output. Training
    uses a hand-written loop with mean squared error, Adam and early stopping
    on the validation loss.
    """

    def __init__(self, num_dim, cat_dim, emb_dim, vocab_size, hidden_units, batch_size, epochs, patience, log_dir):
        """Store the configuration; no TensorFlow objects are created here.

        Args:
            num_dim: number of numeric input features.
            cat_dim: number of categorical input columns.
            emb_dim: embedding size per categorical token.
            vocab_size: number of distinct token ids (embedding table rows).
            hidden_units: width of each hidden dense layer.
            batch_size: mini-batch size for training and validation.
            epochs: maximum number of epochs.
            patience: training stops once the validation loss has not improved
                for more than this many epochs.
            log_dir: stored on the instance; not used by any method of this class.
        """
        # hyper parameters
        self.num_dim = num_dim
        self.cat_dim = cat_dim
        self.emb_dim = emb_dim
        self.vocab_size = vocab_size
        self.hidden_units = hidden_units

        # optimizer parameters
        self.batch_size = batch_size
        self.epochs = epochs
        self.patience = patience

        # logging settings
        self.log_dir = log_dir

    def fit(self, x_num_train, x_cat_train, y_train, x_num_val, x_cat_val, y_val):
        """Build and train the model, then restore the best-validation weights.

        Returns ``self`` so the fitted object can be used directly.
        """
        # convert np.array to tf.data.Dataset
        ds_train = self.np2ds(x_num_train, x_cat_train, y_train)
        ds_val = self.np2ds(x_num_val, x_cat_val, y_val)

        # build model and compile loss, optimizer and metrics
        self.build()
        self.compile()

        # fit model with early stopping
        best_loss = float("inf")
        best_epoch = 0
        best_weights = None
        for epoch in range(self.epochs):
            for batch_x_num, batch_x_cat, batch_y in ds_train:
                self.train_step(batch_x_num, batch_x_cat, batch_y)
            for batch_x_num, batch_x_cat, batch_y in ds_val:
                self.val_step(batch_x_num, batch_x_cat, batch_y)

            self.print_metrics(epoch)

            val_loss = float(self.val_loss.result())
            if val_loss < best_loss:
                best_loss = val_loss
                best_epoch = epoch
                # Snapshot the weights so the model returned to the caller is
                # the best one seen, not the one left after `patience` further
                # epochs without improvement.
                best_weights = self.model.get_weights()
            if epoch - best_epoch > self.patience:
                break

            self.reset_metrics()

        if best_weights is not None:
            self.model.set_weights(best_weights)

        return self

    def predict(self, x_num, x_cat):
        """Return a 1-D NumPy array with one prediction per input row."""
        # training=False: batch normalisation uses its moving statistics.
        predictions = self.model([x_num, x_cat], training=False)
        predictions = predictions.numpy()[:, 0]

        return predictions

    def np2ds(self, x_num, x_cat, y):
        """Wrap the arrays in a shuffled, batched ``tf.data.Dataset``.

        The shuffle buffer equals the number of rows in ``x_num``.
        """
        ds = tf.data.Dataset.from_tensor_slices((x_num, x_cat, y))
        ds = ds.shuffle(len(x_num))
        ds = ds.batch(self.batch_size)

        return ds

    def build(self):
        """Create the Keras functional model and store it in ``self.model``."""
        x_num_input = tf.keras.Input(shape = (self.num_dim,), name = "x_num")
        x_cat_input = tf.keras.Input(shape = (self.cat_dim,), name = "x_cat")
        x_cat = tf.keras.layers.Embedding(self.vocab_size, self.emb_dim, name = "embedding")(x_cat_input)
        x_cat = tf.keras.layers.Flatten(name = "flatten")(x_cat)
        x = tf.keras.layers.concatenate([x_num_input, x_cat], name = "concat")
        x = tf.keras.layers.BatchNormalization(name = "hidden_norm1")(x)
        x = tf.keras.layers.Dense(self.hidden_units, activation = "relu", name = "hidden_dense1")(x)
        x = tf.keras.layers.BatchNormalization(name = "hidden_norm2")(x)
        x = tf.keras.layers.Dense(self.hidden_units, activation = "relu", name = "hidden_dense2")(x)
        x = tf.keras.layers.BatchNormalization(name = "output_norm")(x)
        x = tf.keras.layers.Dense(1, name = "output_dense")(x)
        self.model = tf.keras.Model(inputs = [x_num_input, x_cat_input], outputs = x)

    def compile(self):
        """Create the loss, the optimizer and the running train/validation metrics.

        The ``*_accuracy`` attributes hold root-mean-squared-error metrics.
        """
        self.loss_object = tf.keras.losses.MeanSquaredError()
        self.optimizer = tf.keras.optimizers.Adam()
        self.train_loss = tf.keras.metrics.Mean(name = "train_loss")
        self.train_accuracy = tf.keras.metrics.RootMeanSquaredError(name='train_rmse')
        self.val_loss = tf.keras.metrics.Mean(name = "val_loss")
        self.val_accuracy = tf.keras.metrics.RootMeanSquaredError(name='val_rmse')

    @tf.function
    def train_step(self, x_num, x_cat, y):
        """Run one optimisation step on a batch and update the training metrics."""
        with tf.GradientTape() as tape:
            # training=True so the BatchNormalization layers normalise with the
            # batch statistics and update their moving averages.
            predictions = self.model([x_num, x_cat], training=True)
            loss = self.loss_object(y, predictions)
        gradients = tape.gradient(loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.model.trainable_variables))
        self.train_loss(loss)
        self.train_accuracy(y, predictions)

    @tf.function
    def val_step(self, x_num, x_cat, y):
        """Evaluate one batch in inference mode and update the validation metrics."""
        predictions = self.model([x_num, x_cat], training=False)
        loss = self.loss_object(y, predictions)
        self.val_loss(loss)
        self.val_accuracy(y, predictions)

    def print_metrics(self, epoch):
        """Print the current train/validation RMSE for the (0-based) ``epoch``."""
        template = "Epoch {}, tr_RMSE: {}, va_RMSE: {}"
        # Convert the scalar tensors to floats so the log shows plain numbers
        # rather than tensor reprs.
        print(template.format(
            epoch + 1,
            float(self.train_accuracy.result()),
            float(self.val_accuracy.result())
        ))

    def reset_metrics(self):
        """Clear all running metrics so the next epoch starts from zero."""
        for metric in (self.train_loss, self.train_accuracy, self.val_loss, self.val_accuracy):
            # Newer Keras names this reset_state(); older releases only have
            # reset_states(). Use whichever the installed version provides.
            reset = getattr(metric, "reset_state", None) or metric.reset_states
            reset()


In [None]:

# Load the labelled training rows and the unlabelled test rows.
train = pd.read_csv("./data/hu_train.csv")
test = pd.read_csv("./data/test.csv")

# Stack both frames; the combined frame is only used to fit the tokenizer so
# that categories appearing in either file receive an id.
traintest = pd.concat([train, test], ignore_index=True, sort=False)

# Numeric feature columns: date parts, coordinates, then five summary
# statistics (cnt/min/mid/max/var) per measured quantity.
num_cols = [
    'year', 'month', 'day', 'lat', 'lon',
    'co_cnt', 'co_min', 'co_mid', 'co_max', 'co_var',
    'o3_cnt', 'o3_min', 'o3_mid', 'o3_max', 'o3_var',
    'so2_cnt', 'so2_min', 'so2_mid', 'so2_max', 'so2_var',
    'no2_cnt', 'no2_min', 'no2_mid', 'no2_max', 'no2_var',
    'temperature_cnt', 'temperature_min', 'temperature_mid', 'temperature_max', 'temperature_var',
    'humidity_cnt', 'humidity_min', 'humidity_mid', 'humidity_max', 'humidity_var',
    'pressure_cnt', 'pressure_min', 'pressure_mid', 'pressure_max', 'pressure_var',
    'ws_cnt', 'ws_min', 'ws_mid', 'ws_max', 'ws_var',
    'dew_cnt', 'dew_min', 'dew_mid', 'dew_max', 'dew_var',
]
# Categorical feature columns, fed to the embedding layer after tokenising.
cat_cols = ["Country", "City"]

# Pull the numeric block, the categorical block and the target out as arrays.
x_num_trainval, x_cat_trainval = train[num_cols].values, train[cat_cols].values
y_trainval = train["pm25_mid"].values
x_num_test, x_cat_test = test[num_cols].values, test[cat_cols].values

# Fit the tokenizer on train+test categories, then encode each split.
tokenizer = Tokenizer().fit(traintest[cat_cols].values)
x_cat_trainval = tokenizer.transform(x_cat_trainval)
x_cat_test = tokenizer.transform(x_cat_test)

# train
# Run identifier used in each model's log directory name. The original cell
# referenced `timestamp` without ever defining it, which raised NameError.
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")

n = 1  # number of models to train, each on a different random train/validation split
models = []
for i in range(n):
    # Hold out 20% of the labelled rows for validation; the split is seeded
    # with the model index so every model sees a different partition.
    x_num_train, x_num_val, x_cat_train, x_cat_val, y_train, y_val = train_test_split(
        x_num_trainval, x_cat_trainval, y_trainval, test_size = 0.2, random_state = i
    )

    model = EmbeddingRegression(
        num_dim = x_num_train.shape[1],
        cat_dim = x_cat_train.shape[1],
        emb_dim = 2,
        vocab_size = tokenizer.vocab_size,
        hidden_units = 128,
        batch_size = 128,
        epochs = 1000,
        patience = 50,
        log_dir = Path("logs", "{}-{:02d}".format(timestamp, i))
    )
    model = model.fit(x_num_train, x_cat_train, y_train, x_num_val, x_cat_val, y_val)
    models.append(model)

# predict
# Average the test-set predictions of all trained models.
y_test = np.zeros(len(test))
for model in models:
    y_test += model.predict(x_num_test, x_cat_test)
y_test /= len(models)

# export predictions
submit = pd.DataFrame({
    "id": test["id"],
    "y": y_test
})

# Floor negative predictions at zero. A vectorised clip replaces the per-row
# chained assignment (`submit["y"][i] = 0`), which triggers pandas'
# SettingWithCopyWarning and does not modify the frame under copy-on-write.
submit["y"] = submit["y"].clip(lower = 0)

# Written without a header row and without the index column.
submit.to_csv("NN_submit.csv", header = False, index = False)