# Initial Setup

In [None]:
# Mount Google Drive in the Colab runtime; the dataset, model and figure paths used later in this
# notebook all live under /content/drive. force_remount=True requests a fresh mount on every run.
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [None]:
import os
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import models, layers, mixed_precision, regularizers, initializers
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold
import seaborn as sns

In [None]:
# Seeds for reproducibility
random_seed = 50701

# NumPy and TensorFlow are seeded explicitly, then tf.keras.utils.set_random_seed is called with
# the same value and deterministic op execution is switched on.
np.random.seed(random_seed)
tf.random.set_seed(random_seed)

tf.keras.utils.set_random_seed(random_seed)
tf.config.experimental.enable_op_determinism()

# Shared scaler and imputer, fitted further down on the selected features. The mean imputation is
# only a temporary fill so that scaling can run on data that still contains NaNs.
min_max_scaler = MinMaxScaler()
temp_imputer = SimpleImputer(strategy="mean")

# Used when making predictions
le = LabelEncoder()

In [None]:
# Pick the best setup Colab offers: mixed precision is only switched on when a GPU is attached.
gpus = tf.config.experimental.list_physical_devices("GPU")
GPU = len(gpus) > 0

if GPU:
    # Global policy, so every Keras layer built after this point uses mixed_float16.
    policy = mixed_precision.Policy("mixed_float16")
    mixed_precision.set_global_policy(policy)

    print("TensorFlow is using the GPU.")
else:
    print("TensorFlow is not using the GPU.")

TensorFlow is not using the GPU.


# Decide on the Data
The settings below select a single dataset/model configuration, which the following
sections then work through step by step.

To train and evaluate every configuration instead, go to the "Train and Evaluate All
Models" section towards the end of the notebook.

In [None]:
# Numerical columns of the measurements file that will be imputed. The spellings are the column
# names of the CSV itself, so they must be kept exactly as written.
numerical_features = [
    "mean arterial pressure",
    "heart rate",
    "respiratory rate",
    "PCO2 (Arterial)",
    "PO2 (Arterial)",
    "FiO2",
    "arterial pH",
    "sodium",
    "postassium",
    "creatinine",
    "hematocrit",
    "white blood cell",
    "HCO3 (serum)",
]

# Which dataset to use: raw missingness, or artificial missingness with a chosen mechanism.
is_raw_missing = False  # Artificial otherwise
missing_type = "mcar"  # Artificial only - mcar, mnar_central, mnar_upper, mnar_lower

# The meaning of the missingness level depends on the dataset:
# - Artificial: fraction of values missing, i.e. 0.2, 0.5 and 0.7
# - Raw: number of values missing per row, i.e. 2, 5, 10
level_missing = 5 if is_raw_missing else 0.5

# Read and Prepare Data

In [None]:
def get_data_and_reference(is_raw_missing=False, level_missing=0.5, missing_type="mcar"):
    """
    Loads the measurements file selected by the given flags, shuffles it with a fixed seed and
    builds the reference string that identifies the dataset.

    The feature columns are selected with the module-level ``numerical_features`` list.

    :param is_raw_missing: Boolean flag; True loads the raw-missingness file, False the artificial
                           one.
    :param level_missing: A float or integer representing either the percentage of missingness or
                          the number of missing values per row. It is formatted into the file name.
    :param missing_type: A string representing the type of missingness used - only applicable to
                         artificially missing data.
    :return: A tuple of (shuffled full DataFrame, DataFrame restricted to the numerical features,
             reference string such as "raw_5" or "artificial_0.5_mcar").
    """
    if is_raw_missing:
        reference = "raw_{}".format(level_missing)
        csv_path = "/content/drive/MyDrive/Sheffield/6000 Dissertation/Imputing Health Care Data/Data/missing/raw/measurements_{}.csv".format(level_missing)
    else:
        reference = "artificial_{}_{}".format(level_missing, missing_type)
        csv_path = "/content/drive/MyDrive/Sheffield/6000 Dissertation/Imputing Health Care Data/Data/missing/artificial/measurements_{}_{}.csv".format(level_missing, missing_type)

    # Fixed random_state keeps the row order identical from run to run
    shuffled = pd.read_csv(csv_path).sample(frac=1, random_state=507).reset_index(drop=True)

    return shuffled, shuffled[numerical_features], reference

In [None]:
def fit_scaler_and_imputer(df_to_fit_to, imputer, scaler):
    """
    Fits the temporary imputer on the given data, then fits the scaler on that data after the
    imputer has filled its gaps.

    :param df_to_fit_to: The dataframe which the imputer and scaler are fitted to.
    :param imputer: The initialised imputer (needs ``fit`` and ``transform``).
    :param scaler: The initialised scaler (needs ``fit``).
    :return: The fitted imputer and scaler, in that order.
    """
    training_values = df_to_fit_to.values

    # The imputer learns its fill values from the data as given
    imputer.fit(training_values)

    # The scaler only ever sees the temporarily filled version of the data
    scaler.fit(imputer.transform(training_values))

    return imputer, scaler

In [None]:
def scale_data(features_df, imputer, scaler):
    """
    Scales features that contain missing values. The gaps are filled by the imputer so the scaler
    can run, and are turned back into NaN afterwards.

    :param features_df: The dataframe containing the missing features to be scaled.
    :param imputer: The fitted imputer.
    :param scaler: The fitted scaler.
    :return: A tuple of (the unscaled values as an array, the scaled features as a DataFrame that
             keeps the index and columns of ``features_df``).
    """
    raw_values = features_df.values
    nan_positions = features_df.isna().to_numpy()

    # Placeholder fill first, then scale the filled values
    scaled_values = scaler.transform(imputer.transform(raw_values))

    # The placeholders are not real observations, so they go back to NaN
    scaled_values[nan_positions] = np.nan

    scaled_df = pd.DataFrame(scaled_values, index=features_df.index, columns=features_df.columns)

    return raw_values, scaled_df

In [None]:
# Load the dataset chosen by the selection flags defined earlier
shuffled_data, shuffled_features, data_reference = get_data_and_reference(
    is_raw_missing=is_raw_missing,
    level_missing=level_missing,
    missing_type=missing_type,
)

In [None]:
# Scale the data, using placeholder imputations
imputer, scaler = fit_scaler_and_imputer(shuffled_features, temp_imputer, min_max_scaler)
# NOTE(review): scale_data returns features_df.values as its first result, so this line rebinds
# shuffled_features from a DataFrame to a NumPy array. Re-running this cell then fails inside
# fit_scaler_and_imputer (it reads .values) - confirm whether later cells rely on the rebinding.
shuffled_features, scaled_featured = scale_data(shuffled_features, imputer, scaler)

# Preview of the scaled features; missing entries are still NaN
scaled_featured.head(10)

Unnamed: 0,mean arterial pressure,heart rate,respiratory rate,PCO2 (Arterial),PO2 (Arterial),FiO2,arterial pH,sodium,postassium,creatinine,hematocrit,white blood cell,HCO3 (serum)
0,0.21372,0.287356,0.085271,,,1.0,0.525641,0.363636,,0.04186,0.453125,0.156395,
1,,0.183908,,0.188976,,1.0,,,,0.037209,0.6875,,
2,,,,0.173228,0.393324,,0.615385,,,,0.398438,0.069583,0.390244
3,0.258575,,0.093023,,0.571843,,,,,0.046512,0.367188,0.088138,0.463415
4,,0.551724,,0.362205,0.044993,1.0,,,0.734177,,,0.237906,
5,0.28496,,0.147287,,,1.0,,0.530303,0.468354,0.069767,0.390625,0.132538,
6,,,0.100775,,0.33672,,,,,,0.40625,,
7,0.224274,,,0.212598,,,,0.469697,0.253165,0.027907,,,0.390244
8,,0.396552,,0.141732,0.119013,,,,,0.027907,0.445312,,
9,0.271768,0.33908,,0.220472,,,,0.439394,,0.027907,,0.059642,0.463415


# Using the Model

## Build the Model

In [None]:
def build_generator(n_features, l2_reg=1e-5, layer_sizes=None, noise_level=10, add_noise=False):
    """
    Builds the generator of the WGAIN. The network receives the data with its gaps, the missing
    mask and a noise vector, and outputs one value per feature.

    :param n_features: The number of features to be imputed.
    :param l2_reg: The L2 regularisation rate applied to the hidden layers' kernels.
    :param layer_sizes: The hidden layer sizes as a tuple, i.e. (256, 128, 64, 32). Defaults to
                        (32, 64, 128) when None.
    :param noise_level: The width of the noise vector fed in alongside the data and the mask.
    :param add_noise: Flag to indicate if Gaussian noise (stddev 0.05) is inserted after every
                      hidden dense layer, before its activation.
    :return: The specified generator model.
    """
    # One Glorot-uniform initialiser instance shared by all hidden layers
    glorot = initializers.GlorotUniform()

    hidden_sizes = (32, 64, 128) if layer_sizes is None else layer_sizes

    # Three separate inputs: the data, its mask and the noise vector
    data_input = layers.Input(shape=(n_features,), name="missing_data_input")
    mask_input = layers.Input(shape=(n_features,), name="missing_mask_input")
    noise_input = layers.Input(shape=(noise_level,), name="noise_input")

    hidden = layers.Concatenate()([data_input, mask_input, noise_input])

    # Hidden stack: dense -> (optional noise) -> relu for each requested size
    for units in hidden_sizes:
        hidden = layers.Dense(units=units, kernel_initializer=glorot,
                              kernel_regularizer=regularizers.l2(l2_reg))(hidden)

        if add_noise:
            hidden = layers.GaussianNoise(0.05)(hidden)

        hidden = layers.Activation("relu")(hidden)

    hidden = layers.Flatten()(hidden)

    # One sigmoid output per feature so the imputations sit in 0-1, matching min-max scaling
    imputation = layers.Dense(n_features, activation="sigmoid", name="imputed_data")(hidden)

    return models.Model(inputs=[data_input, mask_input, noise_input], outputs=imputation,
                        name="generator")

In [None]:
def build_discriminator(n_features, l2_reg, layer_sizes=None, dropout=0.2):
    """
    Builds the discriminator (critic) of the WGAIN. The network receives a data matrix and a hint
    matrix and outputs a single unbounded score per row.

    :param n_features: The number of features to be imputed.
    :param l2_reg: The L2 regularisation rate applied to the hidden layers' kernels.
    :param layer_sizes: The hidden layer sizes as a tuple, i.e. (256, 128, 64, 32). Defaults to
                        (128, 64, 32) when None.
    :param dropout: The dropout rate (float) inserted after every hidden dense layer, before its
                    activation. A falsy value (0 or None) disables dropout.
    :return: The specified discriminator model.
    """
    # One Glorot-uniform initialiser instance shared by all hidden layers
    glorot = initializers.GlorotUniform()

    hidden_sizes = (128, 64, 32) if layer_sizes is None else layer_sizes

    # Two inputs: the (partly imputed) data and the hint matrix
    data_input = layers.Input(shape=(n_features,), name="imputed_input")
    hint_input = layers.Input(shape=(n_features,), name="hint_input")

    hidden = layers.Concatenate()([data_input, hint_input])

    # Hidden stack: dense -> (optional dropout) -> relu for each requested size
    for units in hidden_sizes:
        hidden = layers.Dense(units=units, kernel_initializer=glorot,
                              kernel_regularizer=regularizers.l2(l2_reg))(hidden)

        if dropout:
            hidden = layers.Dropout(dropout)(hidden)

        hidden = layers.Activation("relu")(hidden)

    # No activation on the output: the score is a raw, unbounded value
    predictions = layers.Dense(1, activation=None, name="predictions")(hidden)

    return models.Model(inputs=[data_input, hint_input], outputs=predictions,
                        name="discriminator")

In [None]:
def calc_gradient_penalty(discriminator, real_data, fake_data, hint_matrix):
    """
    Penalises discriminator gradients whose norm differs from 1, to stabilise learning and enforce
    the Lipschitz requirement. The gradients are taken at random per-row interpolations between
    the real and the fake data.

    :param discriminator: The discriminator to assess.
    :param real_data: Tensor of real data (observed values with temporary fills).
    :param fake_data: Tensor of the same shape holding the generator's imputations.
    :param hint_matrix: Hint matrix passed to the discriminator alongside the data.
    :return: Scalar tensor: the mean squared deviation of the per-row gradient norm from 1.
    """
    batch_size = tf.shape(real_data)[0]

    # One mixing weight per row decides how much of that row is real versus fake
    alpha = tf.random.uniform(shape=[batch_size, 1], minval=0, maxval=1)
    new_combination = alpha * real_data + (1 - alpha) * fake_data

    # Discriminator predictions on the interpolated data
    with tf.GradientTape() as gp_tape:
        gp_tape.watch(new_combination)
        pred = discriminator([new_combination, hint_matrix], training=True)

    # Gradients of the predictions w.r.t. the interpolated data
    disc_gradients = gp_tape.gradient(pred, new_combination)

    # The small constant keeps the square root differentiable when a row's gradient is exactly
    # zero; without it the derivative of sqrt at 0 is infinite and the update becomes NaN.
    squared_norm = tf.reduce_sum(tf.square(disc_gradients), axis=1)
    grad_norm = tf.sqrt(squared_norm + 1e-12)

    gradient_penalty = tf.reduce_mean((grad_norm - 1.0) ** 2)

    return gradient_penalty

## Train the Model

### Training Functions

In [None]:
def train_step(batch, generator, discriminator, generator_optimizer, discriminator_optimizer,
            update_generator=False, p_hint=0.2, noise_level=10, lambda_g_p=10, alpha=10, beta=10):
    """
    Completes one training step for the given batch and models. The discriminator is updated on
    every call; the generator is only updated when update_generator is True.

    :param batch: Batched data to train the models on; missing entries are NaN.
    :param generator: Initialised or partially trained generator.
    :param discriminator: Initialised or partially trained discriminator.
    :param generator_optimizer: The optimiser for the generator.
    :param discriminator_optimizer: The optimiser for the discriminator.
    :param update_generator: Boolean flag to specify if the generator should be updated this batch.
    :param p_hint: Probability with which each entry of the random matrix behind the hint matrix
                   is set to 1.
    :param noise_level: The width of the noise vector passed to the generator.
    :param lambda_g_p: Weight of the gradient penalty in the discriminator loss.
    :param alpha: Weight of the adversarial term in the generator loss.
    :param beta: Weight of the reconstruction term in the generator loss.
    :return: A tuple (gen_loss, disc_loss, adv_loss, recon_loss). The three generator values are
             constant zeros when the generator was not updated.
    """
    alpha = tf.cast(alpha, tf.float32)
    beta = tf.cast(beta, tf.float32)
    batch = tf.cast(batch, tf.float32)
    noise_level = tf.cast(noise_level, tf.int32)
    batch_size = tf.shape(batch)[0]

    is_missing = tf.math.is_nan(batch)

    # Missing mask: 1 if observed, 0 if missing
    missing_mask = tf.where(is_missing, tf.zeros_like(batch), tf.ones_like(batch))
    missing_mask = tf.cast(missing_mask, tf.float32)

    # NaNs set to 0 so that sums and the reconstruction loss can be computed
    batch_zero_filled = tf.where(is_missing, tf.zeros_like(batch), batch)

    # Per-feature mean of the observed values, used as a temporary fill. divide_no_nan gives 0
    # for a feature with no observed value in this batch; a plain division would give NaN there
    # and that NaN would spread into every loss.
    feature_mean = tf.math.divide_no_nan(tf.reduce_sum(batch_zero_filled, axis=0),
                                         tf.reduce_sum(missing_mask, axis=0))

    batch_mean_filled = tf.where(is_missing, tf.broadcast_to(feature_mean, tf.shape(batch)),
                                 batch)

    # Noise for missing values
    noise = tf.random.normal(tf.shape(batch), stddev=1)
    noisy_batch = batch_mean_filled * missing_mask + noise * (1 - missing_mask)
    noise_input = tf.random.normal(shape=(batch_size, noise_level))

    # Create hint matrix for discriminator
    B = tf.cast(tf.random.uniform(tf.shape(missing_mask)) < p_hint, tf.float32)
    hint_matrix = B * (1 - missing_mask)

    # Updating the discriminator every time
    with tf.GradientTape() as disc_tape:
        # Getting imputations from the generator
        gen_imputations = generator([noisy_batch, missing_mask, noise_input], training=True)
        gen_imputations = tf.cast(gen_imputations, tf.float32)

        # Observed positions keep their values, missing positions take the imputations
        combined_data = tf.where(missing_mask == 1, batch_mean_filled, gen_imputations)
        combined_data = tf.cast(combined_data, tf.float32)

        # Discriminator scores on the mean-filled data and on the imputed version of it
        disc_real = discriminator([batch_mean_filled, hint_matrix], training=True)
        disc_fake = discriminator([combined_data, hint_matrix], training=True)

        # Wasserstein critic loss
        combined_disc_loss = tf.reduce_mean(disc_fake) - tf.reduce_mean(disc_real)
        combined_disc_loss = tf.cast(combined_disc_loss, tf.float32)

        # Adding the gradient penalty to the loss
        gradient_penalty = calc_gradient_penalty(discriminator, batch_mean_filled, combined_data,
                                                 hint_matrix)
        disc_loss = combined_disc_loss + lambda_g_p * gradient_penalty

    # Finding and applying the gradients of the discriminator
    disc_grad = disc_tape.gradient(disc_loss, discriminator.trainable_variables)
    discriminator_optimizer.apply_gradients(zip(disc_grad, discriminator.trainable_variables))

    # Only updated once every n_critic batches (decided by the caller)
    if update_generator:
        with tf.GradientTape() as gen_tape:
            # Getting imputations from the generator
            gen_imputations = generator([noisy_batch, missing_mask, noise_input], training=True)
            gen_imputations = tf.cast(gen_imputations, tf.float32)

            # Combine generated with real data
            combined_data = tf.where(missing_mask == 1, batch_mean_filled, gen_imputations)
            combined_data = tf.cast(combined_data, tf.float32)

            # Discriminator prediction on just the fake data
            disc_fake = discriminator([combined_data, hint_matrix], training=True)

            # Adversarial loss (score of the imputations) and reconstruction loss (how well the
            # observed values are reproduced; masked positions contribute zero)
            adv_loss = -tf.reduce_mean(disc_fake)
            recon_loss = tf.reduce_mean(tf.square(missing_mask * batch_zero_filled -
                                                  missing_mask * gen_imputations))

            # Casting
            adv_loss = tf.cast(adv_loss, tf.float32)
            recon_loss = tf.cast(recon_loss, tf.float32)

            # Applying multipliers to get the final generator loss
            gen_loss = (alpha * adv_loss) + (beta * recon_loss)

        # Gradients are computed and applied after leaving the tape context so the tape does not
        # record the update itself
        gen_grad = gen_tape.gradient(gen_loss, generator.trainable_variables)
        generator_optimizer.apply_gradients(zip(gen_grad, generator.trainable_variables))
    else:
        gen_loss, adv_loss, recon_loss = tf.constant(0.0), tf.constant(0.0), tf.constant(0.0)

    return gen_loss, disc_loss, adv_loss, recon_loss

In [None]:
def train(data, n_epochs, generator, discriminator, generator_optimizer, discriminator_optimizer,
          n_critic=5, patience=100, batch_size=128, alpha=10, beta=10, noise_level=10,
          gen_layers=None, disc_layers=None, record_losses=True, progress_bar=True):
    """
    Trains the generator and discriminator on the batched data with early stopping.

    Every batch updates the discriminator; the generator is updated on batches whose index within
    the epoch is a multiple of n_critic. Epoch averages divide by the total number of batches, so
    the generator, adversarial and reconstruction averages include the zeros reported for batches
    where the generator was skipped. Training stops early after `patience` consecutive epochs in
    which neither the average adversarial nor the average reconstruction loss reached a new
    minimum; only in that case are the most recently saved "best" weights restored.

    :param data: The batched training dataset.
    :param n_epochs: The max number of epochs to train for.
    :param generator: The initialised generator.
    :param discriminator: The initialised discriminator.
    :param generator_optimizer: The generator optimiser.
    :param discriminator_optimizer: The discriminator optimiser.
    :param n_critic: The ratio of training for the discriminator to the generator per batch.
    :param patience: The limit for how many epochs can be trained without an improvement in either
                     the reconstruction loss or the adversarial loss.
    :param batch_size: Accepted for interface compatibility; not used inside this function.
    :param alpha: Weight of the adversarial term, passed on to the training step.
    :param beta: Weight of the reconstruction term, passed on to the training step.
    :param noise_level: The width of the generator's noise vector, passed on to the training step.
    :param gen_layers: Accepted for interface compatibility; not used inside this function.
    :param disc_layers: Accepted for interface compatibility; not used inside this function.
    :param record_losses: If True, per-epoch losses are printed and also returned.
    :param progress_bar: If True, a tqdm progress bar is shown for each epoch.
    :return: The trained generator and discriminator, followed by the per-epoch generator,
             discriminator, adversarial and reconstruction losses when record_losses is True.
    """
    # Tracking losses
    gen_losses, disc_losses = [], []
    adv_losses, recon_losses = [], []

    best_adv_loss = float("inf")
    best_recon_loss = float("inf")

    # Snapshot the starting weights so the early-stopping restore always has weights to fall back
    # on, even when no epoch registers an improvement (e.g. the losses are NaN from the start).
    best_gen_weights = generator.get_weights()
    best_disc_weights = discriminator.get_weights()

    # Initialising the count for the early stopping
    wait = 0

    # Using tf.function so it runs more efficiently
    step_function = tf.function(train_step)

    # The dataset is the same every epoch, so its size is looked up once
    batches_per_epoch = tf.data.experimental.cardinality(data).numpy()

    # Training for the specified number of epochs
    for epoch in range(n_epochs):
        epoch_gen_loss, epoch_disc_loss = 0.0, 0.0
        epoch_adv_loss, epoch_recon_loss = 0.0, 0.0

        # If set to true then a progress bar is used to show the training progress
        if progress_bar:
            pbar = tqdm(total=batches_per_epoch, desc=f"Epoch {epoch + 1}/{n_epochs}", unit="batch",
                        leave=False)
        else:
            pbar = None

        # Training on the batches
        for i, batch in enumerate(data):
            # Checking if generator should be updated
            update_generator = (i % n_critic == 0)

            # Train on the batch
            gen_loss, disc_loss, adv_loss, recon_loss = step_function(batch, generator,
                                                                      discriminator,
                                                                      generator_optimizer,
                                                                      discriminator_optimizer,
                                                                      update_generator=update_generator,
                                                                      alpha=alpha, beta=beta,
                                                                      noise_level=noise_level
                                                                      )
            # Updating the losses for this batch
            epoch_gen_loss += gen_loss
            epoch_disc_loss += disc_loss
            epoch_adv_loss += adv_loss
            epoch_recon_loss += recon_loss

            if progress_bar:
                # Update progress bar after each batch
                pbar.update(1)

        if progress_bar:
            pbar.close()

        # Getting averages of each loss and recording them
        avg_gen_loss = float(epoch_gen_loss / batches_per_epoch)
        avg_disc_loss = float(epoch_disc_loss / batches_per_epoch)
        avg_adv_loss = float(epoch_adv_loss / batches_per_epoch)
        avg_recon_loss = float(epoch_recon_loss / batches_per_epoch)

        gen_losses.append(avg_gen_loss)
        disc_losses.append(avg_disc_loss)
        adv_losses.append(avg_adv_loss)
        recon_losses.append(avg_recon_loss)

        if record_losses:
            tf.print("\nEpoch {}: Gen: {:.4f}, Disc: {:.4f}, Adv: {:.4f}, Recon: {:.4f}".format(
                epoch+1, avg_gen_loss, avg_disc_loss, avg_adv_loss, avg_recon_loss))

        # Flag for early stopping counter
        improved = False

        # Check if adversarial loss improved
        if avg_adv_loss < best_adv_loss:
            best_adv_loss = avg_adv_loss
            improved = True

        # Check if reconstruction loss improved
        if avg_recon_loss < best_recon_loss:
            best_recon_loss = avg_recon_loss
            improved = True

        # If an improvement was found then the counter is reset and the best weights updated
        if improved:
            best_gen_weights = generator.get_weights()
            best_disc_weights = discriminator.get_weights()
            wait = 0
        else:
            wait += 1

            if wait >= patience:
                print("Early stopping at epoch {}".format(epoch+1))
                generator.set_weights(best_gen_weights)
                discriminator.set_weights(best_disc_weights)
                break

    if record_losses:
        return generator, discriminator, gen_losses, disc_losses, adv_losses, recon_losses
    else:
        return generator, discriminator

In [None]:
def split_data(features_df, batch_size, drop_remainder=False):
    """
    Turns the given features into a shuffled, batched and prefetched tf.data dataset. The data is
    used as passed in; no scaling happens here.

    :param features_df: Dataframe containing the missing features to be imputed.
    :param batch_size: Size of the individual batches for the training data.
    :param drop_remainder: If True, a final batch smaller than batch_size is dropped. The default
                           (False) keeps it, which is what this function has always done.
    :return: The shuffled and batched dataset.
    """
    train_dataset = tf.data.Dataset.from_tensor_slices(features_df)

    # Using high buffer size as overshooting should not affect performance
    train_dataset = (train_dataset
                     .shuffle(buffer_size=30000, seed=507)
                     .batch(batch_size, drop_remainder=drop_remainder)
                     .prefetch(tf.data.AUTOTUNE))

    return train_dataset

### Training and Variable Setup

In [None]:
def initialise_models(n_features, gen_layers, disc_layers, dropout=0.1, l2_reg=1e-5, noise_level=10):
    """
    Builds a fresh generator and discriminator from the given architecture settings.

    :param n_features: The number of features to be trained on.
    :param gen_layers: The individual layer sizes for the generator.
    :param disc_layers: The individual layer sizes for the discriminator.
    :param dropout: The dropout rate for the discriminator.
    :param l2_reg: The regularisation rate applied to both models.
    :param noise_level: The width of the generator's noise input.
    :return: The initialised generator and discriminator, in that order.
    """
    new_generator = build_generator(
        n_features,
        l2_reg=l2_reg,
        layer_sizes=gen_layers,
        noise_level=noise_level,
    )
    new_discriminator = build_discriminator(
        n_features,
        l2_reg=l2_reg,
        layer_sizes=disc_layers,
        dropout=dropout,
    )

    return new_generator, new_discriminator

In [None]:
def initialise_optimisers(gen_learning_rate=1e-3, disc_learning_rate=1e-5):
    """
    Creates the Adam optimisers for the generator and the discriminator. Each one gets its own
    exponentially decaying learning-rate schedule and gradient clipping by norm.

    :param gen_learning_rate: The initial learning rate for the generator.
    :param disc_learning_rate: The initial learning rate for the discriminator.
    :return: The initialised optimisers for the generator and discriminator, in that order.
    """
    def _decaying_adam(initial_rate):
        # Learning rate decays by a factor of 0.98 per 5000 optimiser steps
        schedule = tf.keras.optimizers.schedules.ExponentialDecay(
            initial_learning_rate=initial_rate,
            decay_steps=5000,
            decay_rate=0.98)

        # clipnorm guards against extreme gradients
        return keras.optimizers.Adam(learning_rate=schedule, clipnorm=1.0, beta_1=0.5, beta_2=0.9)

    return _decaying_adam(gen_learning_rate), _decaying_adam(disc_learning_rate)

### Check Losses

In [None]:
def plot_losses(gen_losses, disc_losses, reference=None, show=True):
    """
    Plots the per-epoch discriminator and generator losses in two stacked panels and saves the
    figure under a file name built from the reference.

    :param gen_losses: The tracked losses for the generator.
    :param disc_losses: The tracked losses for the discriminator.
    :param reference: String representing the experiment reference, used in the saved file name.
    :param show: Boolean; True displays the figure, False closes it. It is saved either way.
    """
    # Discriminator entries that are exactly 0.0 are blanked so they leave a gap in the line
    disc_series = np.where(np.array(disc_losses) == 0.0, np.nan, disc_losses)

    fig, axs = plt.subplots(2, 1, figsize=(10, 18))

    # (axis, values, legend label, line colour, panel title) for each panel, top to bottom
    panels = (
        (axs[0], disc_series, "Discriminator Loss", "r", "Discriminator Loss per Epoch"),
        (axs[1], gen_losses, "Generator Loss", "g", "Generator Loss per Epoch"),
    )

    for ax, series, label, colour, title in panels:
        ax.plot(range(len(series)), series, label=label, color=colour,
                linestyle="-", marker="o")
        ax.set_title(title)
        ax.set_xlabel("Epochs")
        ax.set_ylabel("Loss")
        ax.grid(True)
        ax.legend()

    plt.tight_layout()
    plt.savefig("/content/drive/MyDrive/Sheffield/6000 Dissertation/Imputing Health Care Data/Visualisations/losses/{}_losses_separate.png".format(reference))

    if not show:
        plt.close(fig)
    else:
        plt.show()

### Saving the Models

In [None]:
def save_and_clear_models(generator, discriminator, reference):
    """
    Writes both trained models to Drive, named after the given reference, and then clears the
    Keras session so the next model starts from a clean state.

    :param generator: The trained generator.
    :param discriminator: The trained discriminator.
    :param reference: String representing the experiment reference, used to name the model files.
    """
    generator_path = "/content/drive/MyDrive/Sheffield/6000 Dissertation/Imputing Health Care Data/Models/{}_generator.keras".format(reference)
    discriminator_path = "/content/drive/MyDrive/Sheffield/6000 Dissertation/Imputing Health Care Data/Models/{}_discriminator.keras".format(reference)

    # Generator first, then discriminator
    generator.save(generator_path)
    discriminator.save(discriminator_path)

    # Reset Keras state before the next model is built
    keras.backend.clear_session()

# Evaluate the Imputation

## Functions to Test Imputation

In [None]:
def impute_from_model(missing_features, generator, scaler, imputer, noise_level=10):
    """
    Given the data containing missing data and the relevant generator, scaler and imputer this will
    return the final imputation from the trained generator. The scaler and imputer should be fitted
    to the passed missing data.

    Observed values are returned unchanged; only the NaN positions are replaced by the generator's
    (unscaled) output.

    The returned data is not complete and requires calling "combine_imputations_with_categorical"
    for demographic data to be added back.

    :param missing_features: The Dataframe containing the missing features to be imputed.
    :param generator: The trained generator to impute with.
    :param scaler: Scaler to scale the data before imputing.
    :param imputer: Imputer for temporary filling before scaling.
    :param noise_level: The width of the noise vector passed to the generator.
    :return: A DataFrame with the same index and columns as missing_features, with the gaps filled.
    """
    # Getting the feature values and the missing mask (1 if observed, 0 if missing)
    feature_values = missing_features.values.astype(np.float32)
    missing_mask = (~np.isnan(feature_values)).astype(np.float32)

    # Filling and scaling the features to work with the generator
    features_filled = imputer.transform(feature_values)
    features_scaled = scaler.transform(features_filled)

    # Noise replaces the placeholder fill at missing positions.
    # NOTE(review): the noise here uses each column's standard deviation, while train_step draws
    # it with stddev=1 - confirm whether this difference is intended.
    std = features_scaled.std(axis=0, keepdims=True)
    noise = np.random.normal(0, std, size=features_scaled.shape)
    X_noisy = features_scaled * missing_mask + noise * (1 - missing_mask)

    noise_input = tf.random.normal(shape=(features_scaled.shape[0], noise_level))

    # Using the generator to impute the missing values. With the mixed_float16 policy that is
    # enabled when a GPU is present the output can be float16, so it is cast to float32 (as
    # train_step does) before the inverse scaling is applied.
    feature_imputations_scaled = np.asarray(
        generator.predict([X_noisy, missing_mask, noise_input]), dtype=np.float32)

    # Unscale the imputations
    feature_predictions_unscaled = scaler.inverse_transform(feature_imputations_scaled)

    # Replace the missing values with the imputations
    final_imputed_features = np.where(np.isnan(feature_values), feature_predictions_unscaled,
                                      feature_values)

    df_pred = pd.DataFrame(final_imputed_features, columns=missing_features.columns,
                           index=missing_features.index)

    return df_pred

In [None]:
def combine_imputations_with_categorical(imputed_data, missing_data, missing_features):
    """
    Puts the non-feature (demographic) columns back next to the imputed features.

    :param imputed_data: The data which has already been imputed by the generator.
    :param missing_data: The original dataset with the missing values and desired columns.
    :param missing_features: A dataframe whose column names identify the imputed features; these
                             columns are removed from missing_data before the two are joined.

    :return: The non-feature columns of missing_data followed by the imputed columns.
    """
    feature_columns = missing_features.columns

    # Everything that was not imputed is kept as it is in the original data
    untouched_columns = missing_data.drop(columns=feature_columns)

    return pd.concat([untouched_columns, imputed_data], axis=1)

In [None]:
def plot_imputed_distributions(original_features, imputed_data, reference, show=True):
    """
    Plot, for every feature, the normalised histogram of the imputed values over the normalised
    histogram of the original values, and save the figure to Drive.

    One panel is drawn per column of original_features, four panels per row. Panels that are left
    over on the last row are hidden, whatever the number of features.

    TODO: the colour choice still needs work.

    :param original_features: DataFrame with the original features (missing values included) that
                              the imputed data comes from. Its columns decide what is plotted.
    :param imputed_data: The imputed dataset; must contain every column of original_features.
    :param reference: String representing the experiment reference, used in the title and in the
                      name of the saved file.
    :param show: If True the figure is displayed, otherwise it is closed. It is saved either way.
    """
    feature_names = list(original_features)
    n_features = len(feature_names)

    # Four panels per row; the number of rows follows the number of features, so the 13 features
    # used in this notebook still give the 4x4 grid at 18x16 inches.
    n_cols = 4
    n_rows = max(1, -(-n_features // n_cols))
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 4 * n_rows))
    axes = axes.flatten()

    for ax, col in zip(axes, feature_names):
        # density=True normalises both histograms so they can be compared directly
        ax.hist(imputed_data[col], alpha=0.5, label="Imputed", color="blue", edgecolor="black",
                density=True)
        ax.hist(original_features[col], alpha=0.5, label="Original", color="orange",
                edgecolor="black", density=True)

        ax.set_title(col)
        ax.legend()

    # Hide whichever panels did not receive a feature
    for unused_ax in axes[n_features:]:
        unused_ax.axis("off")

    plt.suptitle("Variable Distributions for {}".format(reference), fontsize=20)
    plt.tight_layout()
    # The file name keeps its historical spelling so previously saved figures are still replaced
    plt.savefig("/content/drive/MyDrive/Sheffield/6000 Dissertation/Imputing Health Care Data/Visualisations/imputed distributions/{}_variable_distribututions.png".format(reference))

    if show:
        plt.show()
    else:
        plt.close(fig)

## Evaluate Artificially Missing

In [None]:
def evaluate_normalised_mae(ground_truth, imputation, missing_data):
    """
    Score imputations against a known ground truth using the normalised mean absolute error
    (nMAE): the MAE over the values that were missing, divided by the interquartile range of the
    matching ground truth values.

    Only columns of imputation that also exist in missing_data are scored. A feature is given a
    score of NaN, and left out of the average, when it had no missing values (nothing was
    imputed) or when the interquartile range is zero (the error cannot be normalised).

    :param ground_truth: The ground truth data with no missing values.
    :param imputation: The imputed dataset from the artificially missing dataset.
    :param missing_data: The original dataset with the missing values.
    :return: A dictionary mapping each scored feature to its nMAE, plus the key
             "average_norm_mae" holding the mean over the features with a valid score.
    """
    norm_mae_results = {}

    # True wherever a value was missing before imputation
    missing_mask = missing_data.isna()

    for feature in imputation.columns:
        if feature not in missing_mask.columns:
            continue

        feature_mask = missing_mask[feature]

        # Nothing was imputed here, and the MAE of an empty selection is undefined
        if not feature_mask.any():
            norm_mae_results[feature] = np.nan
            continue

        # Comparing values that were missing only
        missing_ground_truth = ground_truth[feature][feature_mask]
        missing_imputed = imputation[feature][feature_mask]

        feature_mae = mean_absolute_error(missing_ground_truth, missing_imputed)

        # Normalising through the interquartile range of the true values
        iqr = missing_ground_truth.quantile(0.75) - missing_ground_truth.quantile(0.25)

        # A zero range would turn the score into inf/NaN and poison the average
        norm_mae_results[feature] = feature_mae / iqr if iqr > 0 else np.nan

    valid_scores = [score for score in norm_mae_results.values() if not np.isnan(score)]
    norm_mae_results["average_norm_mae"] = np.mean(valid_scores) if valid_scores else np.nan

    return norm_mae_results

In [None]:
def plot_normalised_mae(norm_mae_results, reference, show=True):
    """
    Draw a horizontal bar chart with one bar per entry of the normalised MAE results, label each
    bar with its value, and save the chart to Drive.

    :param norm_mae_results: Dictionary of normalised MAE results from the evaluation; keys become
                             the bar labels and values the bar lengths.
    :param reference: String representing the experiment reference, used in the title and in the
                      name of the saved file.
    :param show: If True the chart is displayed, otherwise it is closed. It is saved either way.
    """
    labels = list(norm_mae_results)
    scores = [norm_mae_results[label] for label in labels]

    fig = plt.figure(figsize=(10, 5))
    ax = fig.gca()

    rects = ax.barh(labels, scores, edgecolor="black")
    ax.set_xlabel("Normalised Mean Absolute Error")
    ax.set_title("Normalised Mean Absolute Error by Feature for {}".format(reference))
    # First dictionary entry at the top of the chart
    ax.invert_yaxis()

    # Write each value just past the end of its bar, vertically centred on the bar
    for rect in rects:
        bar_length = rect.get_width()
        bar_centre = rect.get_y() + rect.get_height() / 2
        ax.text(bar_length + 0.005, bar_centre, "{:.3f}".format(bar_length))

    fig.tight_layout()
    fig.savefig("/content/drive/MyDrive/Sheffield/6000 Dissertation/Imputing Health Care Data/Visualisations/nmae/{}_feature_norm_mae.png".format(reference))

    if not show:
        plt.close(fig)
    else:
        plt.show()

In [None]:
def evaluate_artifically_missing(imputed_data, reference, missing_data, show=True):
    """
    Wrapper that loads the ground truth data from Drive, scores the imputations with the
    normalised MAE, plots the score of each feature and returns the scores as a DataFrame.

    :param imputed_data: The imputed dataset to evaluate.
    :param reference: String representing the experiment reference, passed on to the plot.
    :param missing_data: The dataset with the missing values that the imputation was made from.
    :param show: Passed on to the plot to decide whether it is displayed or only saved.
    :return: A single-row DataFrame (index 0) with one column per key of the nMAE results.
    """
    # The complete dataset that the artificially missing data was derived from
    ground_truth = pd.read_csv("/content/drive/MyDrive/Sheffield/6000 Dissertation/Imputing Health Care Data/Data/missing/raw/measurements_0.csv")

    feature_scores = evaluate_normalised_mae(ground_truth, imputed_data, missing_data)

    # One bar per feature, plus the average
    plot_normalised_mae(feature_scores, reference, show=show)

    return pd.DataFrame(feature_scores, index=[0])

# Evaluate Real Missing Through Predictive Performance

In [None]:
%pip install scikit-optimize

Collecting scikit-optimize
  Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting pyaml>=16.9 (from scikit-optimize)
  Downloading pyaml-25.7.0-py3-none-any.whl.metadata (12 kB)
Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl (107 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.8/107.8 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyaml-25.7.0-py3-none-any.whl (26 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-25.7.0 scikit-optimize-0.10.2


In [None]:
# Importing optimiser package from scikit
from skopt import gp_minimize, BayesSearchCV
from skopt.space import Real, Integer, Categorical
from skopt.utils import use_named_args

# Used to train model
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold

# Tidy memory
import gc

# Used for processing
from types import SimpleNamespace
from datetime import datetime

In [None]:
# Hyperparameter space for the XGBoost classifier, handed to BayesSearchCV as `search_spaces`
# in xgb_grid_search_optimisation. Every dimension is a Categorical, so only the listed values
# are candidates.
xgboost_search_space = {
    "gamma": Categorical([0.01, 0.1]),
    "learning_rate": Categorical([0.001, 0.01, 0.1]),
    "max_depth": Categorical([3, 6, 9]),
    "n_estimators": Categorical([100, 200, 300])
}

In [None]:
def data_setup(score_data):
    """
    Separate the predictors from the target and divide both into training and test sets.

    The predictors are the columns listed in numerical_features and the target is the
    "outcome_encoded" column. 30% of the rows go to the test set and the split is stratified on
    the target so both sets keep the same class distribution.

    :param score_data: The data to be split
    :return: X_train, X_test, y_train, y_test
    """
    predictors = score_data[numerical_features].copy()
    target = score_data["outcome_encoded"].copy()

    # Fixed random_state keeps the split identical between runs
    return tuple(train_test_split(predictors, target, test_size=0.3, random_state=507,
                                  stratify=target))

In [None]:
def xgb_grid_search_optimisation(score_data, search_reference="no missing data", save_results=True):
    """
    Tune an XGBoost classifier with a Bayesian search over xgboost_search_space and return the
    cross-validated scores of the best configuration found.

    Candidates are scored with accuracy, precision, recall, F1 and ROC-AUC using 5-fold stratified
    cross validation on the training split; ROC-AUC decides which candidate is the best.

    :param score_data: The data as a dataframe, containing the numerical features and the
                       "outcome_encoded" column. It is split by data_setup.
    :param search_reference: Currently unused; kept so existing callers keep working.
    :param save_results: Currently unused; kept so existing callers keep working.
    :return: A single-row DataFrame (index 0) holding the cv_results_ entry of the best candidate,
             e.g. the "mean_test_roc_auc" and "std_test_roc_auc" columns.
    """
    # Setting up model for the search
    xgb_model = xgb.XGBClassifier(enable_categorical=True)
    stratified_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=507)
    # Only the training split is used; the test split is held back
    X_train, _, y_train, _ = data_setup(score_data)

    bayes_search = BayesSearchCV(estimator=xgb_model, search_spaces=xgboost_search_space,
                                 scoring=["accuracy", "precision", "recall", "f1", "roc_auc"],
                                 refit="roc_auc", n_iter=20, cv=stratified_cv, verbose=0)
    bayes_search.fit(X_train, y_train)

    cv_results = pd.DataFrame(bayes_search.cv_results_)

    # cv_results_ is listed in the order the candidates were evaluated, so the first row is just
    # the first candidate tried. best_index_ points at the candidate with the best ROC-AUC (the
    # refit metric). The index is reset so callers can keep reading the row through label 0.
    best_result = cv_results.iloc[bayes_search.best_index_].to_frame().T.reset_index(drop=True)

    return best_result

# Actually Using the WGAIN

In [None]:
import gc

In [None]:
def run_training(data, missing_features, missing_data, is_raw_missing, level_missing, gen_lr,
                 disc_lr, batch_size, alpha, beta, noise_level, gen_layers, disc_layers, reference,
                 dropout=0, l2_reg=1e-5, n_critic=5, save_models=True, record_losses=True,
                 show_plots=True, progress_bar=True, timestamp=None):
    """
    Train a generator/discriminator pair with the given hyperparameters, impute the missing
    features with the trained generator and score the imputations.

    Training is requested for 1000 epochs; any early stopping is decided inside `train`. The
    imputed features are also written to "imputed_data.csv" in the working directory.

    :param data: DataFrame of training features; its values are batched with batch_size.
    :param missing_features: DataFrame of the features that need imputing.
    :param missing_data: The full dataset the features come from. It is used as the record of
                         which values were missing (artificial data) or as the source of the
                         remaining columns, including "outcome" (raw data).
    :param is_raw_missing: Not used inside this function; the branch is chosen from reference.
    :param level_missing: Not used inside this function.
    :param gen_lr: Learning rate for the generator optimiser.
    :param disc_lr: Learning rate for the discriminator optimiser.
    :param batch_size: Batch size used to split the data and passed to training.
    :param alpha: Passed through to `train`.
    :param beta: Passed through to `train`.
    :param noise_level: Passed to model initialisation, training and imputation.
    :param gen_layers: List of layer sizes for the generator.
    :param disc_layers: List of layer sizes for the discriminator.
    :param reference: Experiment reference. If it contains "artificial" the imputations are scored
                      against the ground truth, otherwise through predictive performance.
    :param dropout: Passed to model initialisation.
    :param l2_reg: Passed to model initialisation.
    :param n_critic: Passed through to `train`.
    :param save_models: If True the trained models are saved under the reference (with the
                        timestamp appended when one is given).
    :param record_losses: If True `train` also returns the loss histories.
    :param show_plots: If True the loss and distribution plots are produced and shown.
    :param progress_bar: Passed through to `train`.
    :param timestamp: Optional string appended to the reference when saving the models.
    :return: The imputation scores: the nMAE results for artificial data, or the best
             cross-validated XGBoost scores for raw data.
    """
    print("Received gen_lr: {}, disc_lr: {}, dropout: {}, l2_reg: {}, n_critic: {}, "
          "batch_size: {}, alpha: {}, beta: {}, noise_level: {}, gen_layers: {} and "
          "disc_layers: {}".format(gen_lr, disc_lr, dropout, l2_reg, n_critic, batch_size, alpha,
                                   beta, noise_level, len(gen_layers), len(disc_layers)))

    # Number of variables to impute (feature length)
    n_features = len(numerical_features)

    # Preparing the models and their optimisers
    generator, discriminator = initialise_models(n_features, gen_layers, disc_layers, dropout,
                                                 l2_reg, noise_level=noise_level)
    generator_optimiser, discriminator_optimiser = initialise_optimisers(gen_learning_rate=gen_lr,
                                                                         disc_learning_rate=disc_lr)

    # Only keeping n_epochs as testing different batch sizes
    n_epochs = 1000

    training_data = split_data(data.values, batch_size)

    # `train` returns the two models, followed by four loss histories when record_losses is set
    training_output = train(training_data, n_epochs, generator, discriminator,
                            generator_optimiser, discriminator_optimiser, n_critic,
                            batch_size=batch_size, alpha=alpha, beta=beta,
                            noise_level=noise_level, gen_layers=gen_layers,
                            disc_layers=disc_layers, record_losses=record_losses,
                            progress_bar=progress_bar)

    if record_losses:
        generator, discriminator, gen_losses, disc_losses, adv_losses, recon_losses \
            = training_output

        if show_plots:
            # Visualising losses for this model
            plot_losses(gen_losses, disc_losses, reference=reference, show=show_plots)

        del gen_losses, disc_losses, adv_losses, recon_losses
    else:
        generator, discriminator = training_output

    del training_output

    # Using that trained model to impute the data
    # NOTE(review): scaler and imputer are not parameters of this function; they are resolved
    # from notebook-level variables, so the cell defining them must have been run first.
    complete_imputation = impute_from_model(missing_features, generator, scaler, imputer,
                                            noise_level=noise_level)

    complete_imputation.to_csv("imputed_data.csv")

    # Checking distributions for mean collapse
    if show_plots:
        plot_imputed_distributions(missing_features, complete_imputation, reference,
                                   show=show_plots)

    if "artificial" in reference:
        imputation_scores = evaluate_artifically_missing(complete_imputation, reference,
                                                         missing_data, show=show_plots)
    else:
        # Restoring the outcome column for prediction
        complete_imputation = combine_imputations_with_categorical(complete_imputation,
                                                                   missing_data, missing_features)
        # Encoding outcome for prediction
        complete_imputation["outcome_encoded"] = le.fit_transform(complete_imputation["outcome"])

        # Accuracy, Precision, Recall, F-1 and ROC-AUC with means and std.'s
        imputation_scores = xgb_grid_search_optimisation(complete_imputation,
                                                         search_reference="testing",
                                                         save_results=False)

    if save_models:
        if timestamp is not None:
            reference = reference + "_" + timestamp

        save_and_clear_models(generator, discriminator, reference)

    tf.keras.backend.clear_session()

    # Deleting after their use
    del generator, discriminator, generator_optimiser, discriminator_optimiser, training_data, \
        data, complete_imputation
    gc.collect()

    plt.close("all")

    return imputation_scores

# Grid Search

In [None]:
# Hyperparameter space explored by the bayesian search. The dimension names are the keyword
# arguments received by the objective in run_bayesian_optimisation, and the ORDER matters: it
# must match the column order used to rebuild previous points in get_previous_search_results.
wgain_search_space = [
    Categorical([0.5, 0.4, 0.3, 0.2, 0.1, 0.01, 0.001, 0.0001], name="gen_lr"),
    Categorical([0.01, 0.001, 0.0001, 0.00001], name="disc_lr"),
    Categorical([0.05, 0.1, 0.2, 0.3], name="dropout"),
    Categorical([1e-5, 1e-4, 1e-3, 1e-2], name="l2_reg"),
    Categorical([1, 2, 3, 4, 5], name="n_critic"),
    Categorical([8, 16, 32, 64, 128, 256, 512], name="batch_size"),
    Categorical([10, 20, 30, 40, 50, 60, 70, 80, 90, 100], name="alpha"),
    Categorical([10], name="beta"), # Left at a single value by mistake, so beta was not tuned in the final results
    Categorical([5, 10, 15, 20], name="noise_level"),
    # Layer counts are turned into lists of layer sizes by generate_layer_sizes
    Categorical([6, 5, 4, 3, 2, 1], name="gen_layers_count"),
    Categorical([6, 5, 4, 3, 2, 1], name="disc_layers_count")
    ]

In [None]:
# CSV file that receives one row per search iteration. The "{}" placeholder is filled with the
# experiment reference (via .format) in run_bayesian_optimisation, giving one file per experiment.
score_save_path = "/content/drive/MyDrive/Sheffield/6000 Dissertation/Imputing Health Care Data/Data/results/wgain_{}_individual_scores.csv"

In [None]:
def generate_layer_sizes(layer_count, model_type="gen", base_sizes=(256, 128, 64, 32, 16, 8)):
    """
    Given a number of layers this will return the matching layer sizes to be used, taken from the
    END of base_sizes. With the default sizes a layer count of 3 selects 32, 16 and 8, returned as
    [8, 16, 32] for a generator and [32, 16, 8] for a discriminator.

    :param layer_count: An integer representing the total number of layers to be returned. Values
                        above the number of available sizes are capped; 0 returns an empty list.
    :param model_type: String representing whether the model is a generator ("gen") or a
                       discriminator ("disc").
    :param base_sizes: The sizes of the layers to make the selection from, the default is (256,
                       128, 64, 32, 16, 8). It is never modified.
    :return: A new list of layer sizes. For a generator it is the selection reversed, for a
             discriminator it is in the same order as base_sizes.
    :raises ValueError: If model_type is not "gen" or "disc", or if layer_count is negative.
    """
    if model_type not in ("gen", "disc"):
        raise ValueError("model_type must be 'gen' or 'disc'")

    if layer_count < 0:
        raise ValueError("layer_count must not be negative")

    available_sizes = list(base_sizes)
    layer_count = min(layer_count, len(available_sizes))

    # Slicing from an explicit start index, because [-0:] would return every size
    layer_sizes = available_sizes[len(available_sizes) - layer_count:]

    if model_type == "gen":
        return layer_sizes[::-1]

    return layer_sizes

In [None]:
def check_previously_run(reference, results_dir):
    """
    Check whether the search for a specific dataset has already been completed, i.e. whether the
    results file already holds a row for its missing type and missing level.

    The missing type and level are parsed from the reference: "artificial_<level>_<type>" for
    artificial data, "<type>_<level>" otherwise.

    :param reference: The reference for the data that is being tested.
    :param results_dir: Path of the CSV file containing the results of the searches. It needs the
                        columns "missing_type" and "missing_level".
    :return: Boolean specifying whether the given test has already been completed. False is
             returned when the results file does not exist yet.
    """
    # No results file means that nothing has been completed so far
    if not os.path.exists(results_dir):
        return False

    previous_runs = pd.read_csv(results_dir)

    if "artificial" in reference:
        m_level, m_type = reference.replace("artificial_", "").split("_", 1)
    else:
        m_type, m_level = reference.split("_", 1)

    # The level is compared as a number, which is how it is read back from the CSV
    m_level = float(m_level)

    relevant_row = previous_runs[(previous_runs["missing_type"] == m_type) &
                                 (previous_runs["missing_level"] == m_level)]

    return not relevant_row.empty

In [None]:
def save_scores(scores, reference, timestamp, parameters, save_path):
    """
    Append the scores and hyperparameters of one search iteration to a CSV file, creating the file
    (with a header) if it does not exist yet.

    :param scores: The cross-validation scores, indexable by column name. The first value of each
                   mean_test_* and std_test_* column is saved for accuracy, precision, recall, f1
                   and roc_auc.
    :param reference: String representing the experiment reference.
    :param timestamp: String identifying when the iteration was run.
    :param parameters: Dictionary of the hyperparameters used; its items are added as columns
                       after the scores.
    :param save_path: Path of the CSV file to write to.
    """
    def first_value(column):
        values = scores[column]
        # Positional access, so the row is found whatever index label it carries
        return values.iloc[0] if hasattr(values, "iloc") else values[0]

    row = {"reference": reference, "timestamp": timestamp}

    # Column order matters: rows are appended without a header so it has to stay stable
    for metric in ("accuracy", "precision", "recall", "f1", "roc_auc"):
        for statistic in ("mean", "std"):
            column = "{}_test_{}".format(statistic, metric)
            row[column] = first_value(column)

    row.update(parameters)

    # The header is only written when the file is created
    file_exists = os.path.exists(save_path)
    pd.DataFrame([row]).to_csv(save_path, mode="a" if file_exists else "w",
                               header=not file_exists, index=False)


In [None]:
def get_previous_search_results(save_path, reference):
    """
    Load the points that an earlier, interrupted search already evaluated so that they can be
    handed back to the optimiser.

    :param save_path: Path of the CSV file holding the result of each search iteration.
    :param reference: String representing the experiment reference. When the file exists it must
                      contain "raw" (scores read from "mean_test_roc_auc") or "artificial" (scores
                      read from "average_nmae"), otherwise a ValueError is raised.
    :return: x0, y0 - the list of hyperparameter combinations and the list of their scores, or
             None, None when the file does not exist. ROC-AUC scores are returned negated because
             the search minimises its objective.
    """
    # Nothing saved yet, so there is nothing to resume from
    if not os.path.exists(save_path):
        return None, None

    if "raw" in reference:
        metric = "mean_test_roc_auc"
    elif "artificial" in reference:
        metric = "average_nmae"
    else:
        raise ValueError("Invalid reference provided. Must be either artificial or raw")

    # Same order as the dimensions of the search space
    hyperparameter_columns = ["gen_lr", "disc_lr", "dropout", "l2_reg", "n_critic", "batch_size",
                              "alpha", "beta", "noise_level", "gen_Layers", "disc_layers"]

    history = pd.read_csv(save_path)
    x0 = history[hyperparameter_columns].values.tolist()
    recorded_scores = history[metric].values.tolist()

    # roc-auc is saved as a positive value, but higher is better, so the sign is flipped
    negate = metric == "mean_test_roc_auc"
    y0 = [-score for score in recorded_scores] if negate else recorded_scores

    return x0, y0

In [None]:
def run_bayesian_optimisation(features, is_raw_missing, missing_features, missing_data, level_missing, reference, grid_search_dir):
    """
    Tunes the WGAIN hyperparameters for one dataset with a bayesian search over
    wgain_search_space. The budget is 40 evaluations of which 20 are random; evaluations saved by
    an earlier, interrupted search are loaded and deducted from both numbers. The search minimises
    the average nMAE when ground truth data is available, or the negated ROC-AUC when using real
    missing data.

    :param features: Dataframe containing the features to train on; any missing values are
                     filled with the column means before training.
    :param is_raw_missing: Boolean flag representing whether using raw missing data or data with
                            ground truth available.
    :param missing_features: The missing features that require imputation.
    :param missing_data: The complete dataset from which the features are selected from.
    :param level_missing: Either the percentage or number of values missing per feature/row.
    :param reference: String representing the reference for this test.
    :param grid_search_dir: Path of the CSV file holding the best result of each completed search,
                            used to skip datasets that were already tested.
    :return: None if the dataset was already tested. Otherwise an object whose `x` is the best
             parameter combination and whose `fun` is the best score: the gp_minimize result, or
             a SimpleNamespace when every evaluation had already been done by an earlier run.
    """
    if check_previously_run(reference, grid_search_dir):
        print("Already tested: {}, so skipping.".format(reference))
        return None

    save_path = score_save_path.format(reference)
    features_filled = features.fillna(features.mean())

    x0, y0 = get_previous_search_results(save_path, reference)
    # Total evaluation budget and how many of those are sampled at random
    n_calls = 40
    initial_points = 20

    if x0 is None:
        print("No previous results found for {}, so starting new bayesian search".format(reference))
    # There are previous runs available to include in search
    else:
        # Checking how many calls remain in total and at random
        n_calls -= len(x0)
        initial_points -= len(x0)

        # There are no remaining calls so returning the best found result
        if n_calls < 1:
            print("Completed all calls so returning best results.")

            best_run_id = y0.index(min(y0))
            best_params = x0[best_run_id]
            best_score = y0[best_run_id]

            # Early return, matching format of the minimise function
            return SimpleNamespace(x=best_params, fun=best_score)

        # Checking if any random calls remain
        elif initial_points < 1:
            print("Completed all random points so now refining.")
            initial_points = 0

        print("Using previous search results for {}, doing {} calls with {} random".format(reference, n_calls, initial_points))

    # Using scikit optimiser function - objective function is to test models using either
    # - nMAE for ground truth data
    # - ROC-AUC for raw data
    @use_named_args(wgain_search_space)
    def objective(gen_lr, disc_lr, dropout, l2_reg, n_critic, batch_size, alpha, beta, noise_level,
                  gen_layers_count, disc_layers_count):
        # Setting up the model architectures. The model type has to be given for the
        # discriminator, otherwise its layers come back in the generator's (small to large) order.
        gen_layers = generate_layer_sizes(gen_layers_count, model_type="gen")
        disc_layers = generate_layer_sizes(disc_layers_count, model_type="disc")

        # Keys match the columns read back by get_previous_search_results
        params = {
            "gen_lr": gen_lr,
            "disc_lr": disc_lr,
            "dropout": dropout,
            "l2_reg": l2_reg,
            "n_critic": n_critic,
            "batch_size": batch_size,
            "alpha": alpha,
            "beta": beta,
            "noise_level": noise_level,
            "gen_Layers": gen_layers_count,
            "disc_layers": disc_layers_count
            }

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        # Fully training a model and evaluating it given passed values
        scores = run_training(features_filled, missing_features, missing_data, is_raw_missing,
                              level_missing, gen_lr=gen_lr, disc_lr=disc_lr, dropout=dropout,
                              l2_reg=l2_reg, n_critic=n_critic, timestamp=timestamp,
                              batch_size=batch_size, alpha=alpha, beta=beta,
                              noise_level=noise_level, gen_layers=gen_layers,
                              disc_layers=disc_layers, save_models=True, record_losses=False,
                              reference=reference, show_plots=False, progress_bar=False)

        if is_raw_missing:
            # Saved per iteration so an interrupted search can be resumed
            save_scores(scores, reference, timestamp, params, save_path)
            # Higher ROC-AUC is better, so it is negated for the minimiser
            return -scores["mean_test_roc_auc"].iloc[0]
        else:
            # Returning the average nMAE value
            return scores["average_norm_mae"].iloc[0]

    # Minimising within the same search space that names the objective's arguments. x0 and y0
    # are the previously evaluated points (if any) for the optimiser to build on.
    return gp_minimize(objective, dimensions=wgain_search_space, n_calls=n_calls,
                       n_initial_points=initial_points, random_state=507, verbose=True, x0=x0,
                       y0=y0)

# Train and Evaluate Models on Artificially Missing Data

In [None]:
# Used to go through all the combinations of artificially missing data: every missing type is
# paired with every missing level in grid_search_for_artificial_data.
# NOTE(review): the levels are kept as strings - presumably the proportion of values removed and
# the form expected by get_data_and_reference; confirm.
missing_types = ["mcar", "mnar_central", "mnar_upper", "mnar_lower"]
missing_levels = ["0.2", "0.5", "0.7"]

In [None]:
def grid_search_for_artificial_data():
    """
    Runs a bayesian search of the generator and discriminator hyperparameters for every
    combination of missing type and missing level of the artificially missing data, minimising
    the nMAE.

    The best result of each combination is appended to a CSV file on Drive as soon as it is
    found, so the process can be resumed if the runtime expires. Combinations that are already in
    that file are skipped.
    """
    artificial_grid_search_dir = ("/content/drive/MyDrive/Sheffield/6000 Dissertation/Imputing Health Care Data/Data/artificial_wgain_gridsearch.csv")

    # Columns of the results file. Rows are appended without a header, so every row is written
    # in exactly this order.
    result_columns = ["missing_type", "missing_level", "best_nmae", "gen_lr", "disc_lr",
                      "dropout", "l2_reg", "n_critic", "batch_size", "alpha", "beta",
                      "noise_level", "gen_Layers", "disc_layers"]

    # Ground truth data
    reference_df = pd.read_csv("/content/drive/MyDrive/Sheffield/6000 Dissertation/Imputing Health Care Data/Data/missing/raw/measurements_0.csv")
    imputer, scaler = fit_scaler_and_imputer(reference_df[numerical_features], temp_imputer,
                                             min_max_scaler)

    # The header is written once, when the results file is created. Appending it again with the
    # first result would leave a second header line in the middle of the file.
    if not os.path.exists(artificial_grid_search_dir):
        pd.DataFrame(columns=result_columns).to_csv(artificial_grid_search_dir, index=False)

    # Go through every combination of artificially missing data
    for m_type in missing_types:
        for m_level in missing_levels:
            # Get the data with the specified missing type and level
            artificial_df, artificial_df_missing, artificial_reference = get_data_and_reference(
                is_raw_missing=False, level_missing=m_level, missing_type=m_type)
            # Scaling the data; only the scaled features are needed here
            _, artificial_scaled_features = scale_data(artificial_df_missing, imputer, scaler)

            print("Training and testing with {}".format(artificial_reference))

            # Running a bayesian search to optimise
            search_result = run_bayesian_optimisation(
                features=artificial_scaled_features,
                missing_features=artificial_df_missing,
                missing_data=artificial_df,
                is_raw_missing=False,
                level_missing=m_level,
                reference=artificial_reference,
                grid_search_dir=artificial_grid_search_dir
                )

            # None means this combination was already tested
            if search_result is None:
                continue

            # Getting the optimal parameters found
            gen_lr, disc_lr, dropout, l2_reg, n_critic, batch_size, alpha, beta, noise_level, \
                gen_layers, disc_layers = search_result.x

            # Confirming the results
            print("Best nMAE {}".format(search_result.fun))
            print("Best parameters {}".format(search_result.x))

            # Saving results to csv, appending so if runtime expires a recovery is possible
            result_dict = {"missing_type": m_type, "missing_level": m_level,
                           "best_nmae": search_result.fun, "gen_lr": gen_lr, "disc_lr": disc_lr,
                           "dropout": dropout, "l2_reg": l2_reg, "n_critic": n_critic,
                           "batch_size": batch_size, "alpha": alpha, "beta": beta,
                           "noise_level": noise_level, "gen_Layers": gen_layers,
                           "disc_layers": disc_layers
                           }
            result_df = pd.DataFrame([result_dict], columns=result_columns)
            result_df.to_csv(artificial_grid_search_dir, mode="a", header=False, index=False)

    print("Finished")

In [None]:
# grid_search_for_artificial_data()

#### Checking the results on artificial data and finalising the imputations

In [None]:
# Load the best result per missing type/level, as written by grid_search_for_artificial_data,
# and preview the first 12 rows.
artificial_scores = pd.read_csv("/content/drive/MyDrive/Sheffield/6000 Dissertation/Imputing Health Care Data/Data/artificial_wgain_gridsearch.csv")
artificial_scores.head(12)

Unnamed: 0,missing_type,missing_level,best_nmae,gen_lr,disc_lr,dropout,l2_reg,n_critic,batch_size,alpha,beta,noise_level,gen_Layers,disc_layers
0,mcar,0.2,0.726789,0.3,0.01,0.05,0.01,3,32,20,10,15,1,6
1,mcar,0.5,0.71669,0.2,0.0001,0.05,0.01,1,256,20,10,15,1,2
2,mcar,0.7,0.718369,0.2,1e-05,0.3,0.001,3,256,100,10,15,2,3
3,mnar_central,0.2,0.696785,0.2,0.0001,0.05,0.01,2,16,30,10,5,1,5
4,mnar_central,0.5,0.790741,0.1,0.001,0.2,0.01,1,32,20,10,10,4,1
5,mnar_central,0.7,0.990974,0.1,0.0001,0.2,0.0001,4,128,30,10,10,2,5
6,mnar_upper,0.2,0.697681,0.3,0.0001,0.2,1e-05,3,256,90,10,15,3,3
7,mnar_upper,0.5,0.690121,0.1,1e-05,0.1,1e-05,3,128,30,10,20,4,1
8,mnar_upper,0.7,0.703761,0.2,1e-05,0.3,0.01,5,512,70,10,10,1,6
9,mnar_lower,0.2,0.718231,0.2,1e-05,0.05,0.001,2,16,80,10,20,3,1


In [None]:
def generate_best_imputations(scores, test_type="artificial"):
    """
    Function to re-run the best found models with the complete MAE for all the features instead
    of purely the average.

    For every (missing_type, missing_level) group in the grid search results the data is
    prepared, the model is re-trained with the hyperparameters stored for that group and one row
    of scores is appended to a shared csv. Combinations that check_previously_run reports as
    already present in that csv are skipped, so an interrupted run can be resumed.

    :param scores: Dataframe containing the results of the grid search.
    :param test_type: String representing the main category of missing data (artificial or raw)
    :return: Dataframe read back from the shared results csv once every combination was handled.
    :raises ValueError: If test_type is neither "artificial" nor "raw".
    """
    if test_type == "artificial":
        is_raw_missing = False
        best_scores_dir = ("/content/drive/MyDrive/Sheffield/6000 Dissertation/Imputing Health Care Data/Data/results/artificial_wgain_scores.csv")
        # Per-feature scores followed by their average
        metric_columns = ["mean arterial pressure", "heart rate", "respiratory rate",
                          "PCO2 (Arterial)", "PO2 (Arterial)", "FiO2", "arterial pH", "sodium",
                          "postassium", "creatinine", "hematocrit", "white blood cell",
                          "HCO3 (serum)", "average_norm_mae"]
        df_columns = ["reference"] + metric_columns
    elif test_type == "raw":
        is_raw_missing = True
        best_scores_dir = ("/content/drive/MyDrive/Sheffield/6000 Dissertation/Imputing Health Care Data/Data/results/raw_wgain_scores.csv")
        metric_columns = ["mean_test_accuracy", "std_test_accuracy", "mean_test_precision",
                          "std_test_precision", "mean_test_recall", "std_test_recall",
                          "mean_test_f1", "std_test_f1", "mean_test_roc_auc", "std_test_roc_auc"]
        # NOTE(review): this layout has no "reference" column - confirm that
        # check_previously_run can still recognise raw runs that were already saved.
        df_columns = ["missing_type", "missing_level"] + metric_columns
    else:
        raise ValueError("Invalid score type")

    all_tests = scores.groupby(["missing_type", "missing_level"])

    # Create the results file with its header exactly once. Previously the blank header was
    # written and then the first appended row wrote a second, different header underneath it.
    if not os.path.exists(best_scores_dir) or os.path.getsize(best_scores_dir) == 0:
        pd.DataFrame(columns=df_columns).to_csv(best_scores_dir, index=False)

    # Every appended row is aligned to the header that is actually on disk, so the columns can
    # never drift apart from the values (also when the file was created by an earlier run).
    file_columns = list(pd.read_csv(best_scores_dir, nrows=0).columns)

    hyperparameter_columns = ["gen_lr", "disc_lr", "dropout", "l2_reg", "n_critic", "batch_size",
                              "alpha", "beta", "noise_level"]

    # Going through each of the grid search results
    for (missing_type, missing_level), group in all_tests:
        # Preparing data and reference for imputation
        shuffled_data, shuffled_features, data_reference = get_data_and_reference(
            is_raw_missing, level_missing=missing_level, missing_type=missing_type)

        if check_previously_run(data_reference, best_scores_dir):
            print("Already tested {}, so skipping.".format(data_reference))
            continue
        print("Testing, {}".format(data_reference))

        imputer, scaler = fit_scaler_and_imputer(shuffled_data[numerical_features], temp_imputer,
                                                 min_max_scaler)

        shuffled_feature_values, scaled_features = scale_data(shuffled_features, imputer, scaler)
        features_filled = scaled_features.fillna(scaled_features.mean())

        # Extracting the stored combination of variables. The first row of the group is used,
        # which assumes the grid search file holds one row per combination.
        best_parameters = {name: group[name].iloc[0] for name in hyperparameter_columns}

        # Getting the layer sizes for the models
        gen_layers = generate_layer_sizes(group["gen_Layers"].iloc[0])
        disc_layers = generate_layer_sizes(group["disc_layers"].iloc[0])

        # Training and evaluating the quality of the imputation for the provided model.
        # is_raw_missing now follows test_type (it used to be hard-coded to False), and the
        # result is kept in its own name so the `scores` argument is no longer overwritten.
        run_scores = run_training(data=features_filled, missing_features=shuffled_features,
                                  missing_data=shuffled_data, is_raw_missing=is_raw_missing,
                                  level_missing=missing_level, gen_layers=gen_layers,
                                  disc_layers=disc_layers, save_models=True, record_losses=False,
                                  reference=data_reference, show_plots=False, progress_bar=False,
                                  **best_parameters)

        # Preparing scores and saving to shared file. Identification fields are all offered and
        # the reindex keeps only those present in the file header, in the header's order.
        record = {"reference": data_reference, "missing_type": missing_type,
                  "missing_level": missing_level}
        for column in metric_columns:
            record[column] = run_scores[column][0]

        score_df = pd.DataFrame([record]).reindex(columns=file_columns)
        score_df.to_csv(best_scores_dir, mode='a', header=False, index=False)

    # Returned (instead of a discarded .head() call) so the calling cell can display the results
    complete_scores = pd.read_csv(best_scores_dir)
    return complete_scores

In [None]:
# Re-run the stored configuration for each artificial missingness combination found in
# artificial_scores; combinations already saved to the results csv are skipped.
generate_best_imputations(artificial_scores, test_type="artificial")

Already tested artificial_0.2_mcar, so skipping.
Already tested artificial_0.5_mcar, so skipping.
Already tested artificial_0.7_mcar, so skipping.
Already tested artificial_0.2_mnar_central, so skipping.
Already tested artificial_0.5_mnar_central, so skipping.
Already tested artificial_0.7_mnar_central, so skipping.
Already tested artificial_0.2_mnar_lower, so skipping.
Already tested artificial_0.5_mnar_lower, so skipping.
Already tested artificial_0.7_mnar_lower, so skipping.
Already tested artificial_0.2_mnar_upper, so skipping.
Already tested artificial_0.5_mnar_upper, so skipping.
Already tested artificial_0.7_mnar_upper, so skipping.


# Train Models on All Levels of Real Missing Data
- Evaluation takes place in the main code; this section only produces the imputed datasets to test.

In [None]:
# Levels of real missingness to search over; read as a global by grid_search_real_missing
missing_levels = [2, 5, 10]
# Flag marking the real (raw) missing data. NOTE(review): the functions defined in this section
# pass their own value instead of reading this global - confirm whether other cells rely on it.
is_raw_missing = True

In [None]:
def grid_search_real_missing():
    """
    Completes a hyperparameter search for the real (raw) missing data through a bayesian search,
    tuning the generator and discriminator hyperparameters.

    One search is run for every level in the module-level ``missing_levels`` list and the best
    result of each search is appended to a shared csv, so a recovery is possible if the runtime
    expires. Levels for which run_bayesian_optimisation returns None are skipped.
    """
    real_grid_search_dir = ("/content/drive/MyDrive/Sheffield/6000 Dissertation/Imputing Health Care Data/Data/real_wgain_gridsearch.csv")

    # Single definition of the csv layout. It matches the order in which the values are written
    # below; the header used to omit "noise_level", which shifted every column by one when the
    # file was read back. "gen_Layers" keeps the spelling generate_best_imputations reads.
    result_columns = ["missing_type", "missing_level", "best_roc_auc", "gen_lr", "disc_lr",
                      "dropout", "l2_reg", "n_critic", "batch_size", "alpha", "beta",
                      "noise_level", "gen_Layers", "disc_layers"]

    # The header is written exactly once, when the file is created. Rows are then always
    # appended without a header (previously the first row repeated a second header line).
    if not os.path.exists(real_grid_search_dir):
        pd.DataFrame(columns=result_columns).to_csv(real_grid_search_dir, index=False)

    for m_level in missing_levels:
        # Get the data with the specified missing level
        real_df, real_df_missing, real_reference = get_data_and_reference(
            is_raw_missing=True, level_missing=m_level, missing_type=None)

        print("Testing, {}".format(real_reference))

        # Scaling the data
        imputer, scaler = fit_scaler_and_imputer(real_df[numerical_features], temp_imputer,
                                                 min_max_scaler)
        real_feature_values, real_scaled_features = scale_data(real_df_missing, imputer, scaler)

        # Running a bayesian search to optimise
        search_result = run_bayesian_optimisation(features=real_scaled_features,
                                                  missing_features=real_df_missing,
                                                  missing_data=real_df,
                                                  is_raw_missing=True,
                                                  level_missing=m_level,
                                                  reference=real_reference,
                                                  grid_search_dir=real_grid_search_dir
                                                  )
        if search_result is None:
            continue

        # Getting the optimal parameters found (raises if the search space size ever changes)
        gen_lr, disc_lr, dropout, l2_reg, n_critic, batch_size, alpha, beta, noise_level, \
            gen_layers, disc_layers = search_result.x

        # Confirming the results.
        # NOTE(review): search_result.fun is reported and stored as returned; the recorded
        # outputs show negative values, presumably because the search minimises the negative
        # ROC-AUC - confirm before comparing against positive ROC-AUC scores.
        print("Best ROC-AUC {}".format(search_result.fun))
        print("Best parameters {}".format(search_result.x))

        # Saving results to csv, appending so if runtime expires a recovery is possible
        result_values = ["raw", m_level, search_result.fun, gen_lr, disc_lr, dropout, l2_reg,
                         n_critic, batch_size, alpha, beta, noise_level, gen_layers, disc_layers]
        result_df = pd.DataFrame([result_values], columns=result_columns)
        result_df.to_csv(real_grid_search_dir, mode='a', header=False, index=False)

    print("Finished")

In [None]:
# Running the bayesian hyperparameter search on the real data, once for each level of
# missingness listed in missing_levels; the best result per level is appended to a csv.
grid_search_real_missing()

Testing, raw_2
Completed all calls so returning best results.
Best ROC-AUC -0.8454809770470092
Best parameters [0.2, 0.01, 0.05, 0.001, 2.0, 32.0, 90.0, 10.0, 20.0, 4.0, 5.0]
Testing, raw_5
Completed all calls so returning best results.
Best ROC-AUC -0.8425605432761577
Best parameters [0.1, 0.0001, 0.1, 0.0001, 4.0, 32.0, 40.0, 10.0, 10.0, 2.0, 2.0]
Testing, raw_10
Completed all calls so returning best results.
Best ROC-AUC -0.8411343328340013
Best parameters [0.5, 0.001, 0.05, 1e-05, 3.0, 128.0, 100.0, 10.0, 10.0, 4.0, 1.0]
Finished


In [None]:
# Load the grid search results recorded for the real missing data and preview them
real_gridsearch_csv = "/content/drive/MyDrive/Sheffield/6000 Dissertation/Imputing Health Care Data/Data/real_wgain_gridsearch.csv"
real_scores = pd.read_csv(real_gridsearch_csv)
real_scores.head(12)

Unnamed: 0,missing_type,missing_level,best_roc_auc,gen_lr,disc_lr,dropout,l2_reg,n_critic,batch_size,alpha,beta,gen_Layers,disc_layers
raw,2,-0.832858,0.1,1e-05,0.1,0.001,4.0,256.0,70.0,10.0,5.0,1.0,4.0
raw,2,0.771727,0.0001,0.001,0.3,1e-05,3.0,128.0,20.0,10.0,20.0,4.0,1.0
raw,2,0.771727,0.0001,0.001,0.3,1e-05,3.0,128.0,20.0,10.0,20.0,4.0,1.0
raw,2,0.771727,0.0001,0.001,0.3,1e-05,3.0,128.0,20.0,10.0,20.0,4.0,1.0
raw,2,-0.845481,0.2,0.01,0.05,0.001,2.0,32.0,90.0,10.0,20.0,4.0,5.0
raw,2,-0.845481,0.2,0.01,0.05,0.001,2.0,32.0,90.0,10.0,20.0,4.0,5.0
raw,5,-0.842561,0.1,0.0001,0.1,0.0001,4.0,32.0,40.0,10.0,10.0,2.0,2.0
raw,2,-0.845481,0.2,0.01,0.05,0.001,2.0,32.0,90.0,10.0,20.0,4.0,5.0
raw,5,-0.842561,0.1,0.0001,0.1,0.0001,4.0,32.0,40.0,10.0,10.0,2.0,2.0
raw,2,-0.845481,0.2,0.01,0.05,0.001,2.0,32.0,90.0,10.0,20.0,4.0,5.0


In [None]:
def extract_best_scores(test_type="raw",
                        results_dir="/content/drive/MyDrive/Sheffield/6000 Dissertation/Imputing Health Care Data/Data/results"):
    """
    Used to extract the best scores from the complete grid search. A new csv is created
    containing just the best runs with their hyperparameters and either complete ground truth scores
    or the downstream scores.

    This is used instead of generate best imputations as it is less intensive and more telling.

    For "raw" the run with the highest mean_test_roc_auc is kept per file, for "artificial" the
    run with the lowest average_norm_mae. The result is written to
    ``best_<test_type>_wgain_scores.csv`` inside ``results_dir``.

    :param test_type: String representing the main category of missing data (artificial or raw).
    :param results_dir: Directory holding the ``wgain_<reference>_individual_scores.csv`` files;
                        the summary csv is written to the same directory.
    :return: Dataframe with one row (the best run) per individual scores file.
    :raises ValueError: If test_type is neither "raw" nor "artificial".
    :raises FileNotFoundError: If no individual scores file exists for one of the references.
    """
    if test_type == "raw":
        # Only 3 raw files and tested in downstream.
        reference_candidates = [["raw_2"], ["raw_5"], ["raw_10"]]
        metric = "mean_test_roc_auc"
        maximise = True
    elif test_type == "artificial":
        # 12 combinations of missing types and different missing levels
        missing_types = ["mcar", "mnar_central", "mnar_upper", "mnar_lower"]
        missing_levels = ["0.2", "0.5", "0.7"]
        # The "<type>_<level>" name is tried first to stay backward compatible; the
        # "artificial_<level>_<type>" form matches the references printed elsewhere in the
        # notebook (e.g. artificial_0.2_mcar) and is used when the first file does not exist.
        reference_candidates = [
            ["{}_{}".format(missing_type, missing_level),
             "artificial_{}_{}".format(missing_level, missing_type)]
            for missing_type in missing_types
            for missing_level in missing_levels
        ]
        # Tested in their ground truth reconstruction
        metric = "average_norm_mae"
        maximise = False
    else:
        raise ValueError("Invalid score type.")

    best_results = []

    # Go through each of the results files
    for candidates in reference_candidates:
        candidate_paths = [
            os.path.join(results_dir, "wgain_{}_individual_scores.csv".format(reference))
            for reference in candidates
        ]
        file_dir = next((path for path in candidate_paths if os.path.exists(path)), None)
        if file_dir is None:
            raise FileNotFoundError(
                "No individual scores file found, tried: {}".format(candidate_paths))

        complete_scores = pd.read_csv(file_dir)
        # The timestamp is not part of the summary; tolerate files saved without it
        complete_scores = complete_scores.drop(columns=["timestamp"], errors="ignore")

        # Extract the best result, maximising ROC-AUC and minimising nMAE. idxmax/idxmin return
        # an index label, so the row is looked up with .loc rather than positional .iloc.
        if maximise:
            best_label = complete_scores[metric].idxmax()
        else:
            best_label = complete_scores[metric].idxmin()
        best_results.append(complete_scores.loc[best_label])

    # Convert best results to a dataframe and save
    best_results_df = pd.DataFrame(best_results)
    best_results_df.to_csv(
        os.path.join(results_dir, "best_{}_wgain_scores.csv".format(test_type)), index=False)

    return best_results_df

In [None]:
# Collect the best run from each raw individual scores file (test_type defaults to "raw")
extract_best_scores()

Unnamed: 0,reference,mean_test_accuracy,std_test_accuracy,mean_test_precision,std_test_precision,mean_test_recall,std_test_recall,mean_test_f1,std_test_f1,mean_test_roc_auc,...,disc_lr,dropout,l2_reg,n_critic,batch_size,alpha,beta,noise_level,gen_Layers,disc_layers
31,raw_2,0.875148,0.001215,0.884116,0.001184,0.981573,0.002692,0.930295,0.000769,0.845481,...,0.01,0.05,0.001,2,32,90,10,20,4,5
24,raw_5,0.92088,0.001753,0.927293,0.001396,0.99076,0.001625,0.957975,0.000932,0.842561,...,0.0001,0.1,0.0001,4,32,40,10,10,2,2
36,raw_10,0.918327,0.001105,0.92538,0.000808,0.990144,0.001281,0.956667,0.000597,0.841134,...,0.001,0.05,1e-05,3,128,100,10,10,4,1
