## UPDATES 
* v5
    * Increased depth of the NN
    * fitted the model with entire train data after K-Fold
    * printed out a classification report
    * created a confusion matrix

In [None]:
# Imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import tensorflow as tf
import tensorflow.keras.backend as K
import optuna

In [None]:
# Load the training set, using the row_id column as the DataFrame index

train = pd.read_csv(
    '../input/tabular-playground-series-feb-2022/train.csv',
    index_col='row_id',
)

In [None]:
# Name of the label column; every other column is treated as a feature

target = 'target'
features = [col for col in train.columns if col != target]

# Target Values

In [None]:
# Bar chart of how many rows belong to each target class
train[target].value_counts().plot.bar()

### INSIGHTS
The rows are split roughly evenly between the categories. In other words, the classes (bacteria) are balanced.

In [None]:
# Map the target class labels to integer codes. The fitted encoder `le` is
# kept so that predicted codes can be mapped back to the original labels.

le = LabelEncoder().fit(train['target'])
train['target'] = le.transform(train['target'])

## Data Preprocessing

In [None]:
# Label column name and the list of feature columns (everything else)

target = 'target'
features = [col for col in train.columns if col != target]

In [None]:
# Standardise every feature column in place; `sc` is reused to transform the
# test set later.
# NOTE(review): the scaler is fitted on all training rows before the
# train/validation split, so validation rows contribute to the scaling
# statistics.

sc = StandardScaler().fit(train[features])
train[features] = sc.transform(train[features])

In [None]:
# Hold out 30% of the rows for validation, stratified on the target so both
# parts keep the same class proportions; fixed seed for reproducibility

X_train, X_valid, y_train, y_valid = train_test_split(
    train[features],
    train[target],
    test_size=0.3,
    stratify=train[target],
    random_state=0,
)

## Modeling

### Baseline Model

In [None]:
# Notebook-level settings read by nn_model

seed = 0                              # seed for the kernel initialisers
epochs = 50                           # maximum number of training epochs
regularizer = 0.001                   # L2 penalty factor on the Dense kernels
dims = len(features)                  # number of input features
classes = train['target'].nunique()   # number of distinct target classes

In [None]:
# Function that builds and trains a Neural Network model

def nn_model(X_train, y_train, X_valid, y_valid, verbose=1):
    """Build, compile and fit a fully connected softmax classifier.

    The network stacks five hidden Dense layers of 32 ReLU units (He-normal
    initialisation, L2 kernel regularisation) followed by a softmax output
    layer. It reads the notebook-level settings ``dims``, ``classes``,
    ``epochs``, ``regularizer`` and ``seed``.

    Parameters
    ----------
    X_train, y_train
        Training features and integer class labels (the loss is sparse
        categorical cross-entropy).
    X_valid, y_valid
        Validation features and labels; ``val_loss`` on this set drives both
        callbacks.
    verbose
        Verbosity level forwarded to ``model.fit``.

    Returns
    -------
    tuple
        ``(history, model)``: the object returned by ``model.fit`` and the
        trained ``tf.keras.Model``.
    """
    # Clearing the tensorflow backend
    K.clear_session()

    # Each sample is a flat vector of `dims` features. The batch axis is
    # implicit, so the per-sample shape is (dims,) rather than (None, dims).
    inputs = tf.keras.Input(shape=(dims,))

    # Five identical hidden layers
    layer = inputs
    for _ in range(5):
        layer = tf.keras.layers.Dense(
            32,
            activation='relu',
            kernel_initializer=tf.keras.initializers.HeNormal(seed),
            kernel_regularizer=tf.keras.regularizers.l2(regularizer),
        )(layer)

    outputs = tf.keras.layers.Dense(
        classes,
        activation='softmax',
    )(layer)

    model = tf.keras.Model(inputs=inputs, outputs=outputs)

    # Compiling the model.
    # Labels are integer class ids, so 'categorical_accuracy' (which expects
    # one-hot targets) would report a meaningless value. 'accuracy' lets Keras
    # choose the variant matching the sparse loss, and it produces the
    # 'accuracy' / 'val_accuracy' history keys that plot_results reads.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(),
        metrics=['accuracy'],
    )

    # Halve the learning rate when val_loss stops improving.
    # NOTE(review): patience equals EarlyStopping's patience below, so the
    # reduction will usually fire only around the epoch where training stops;
    # consider a smaller patience here.
    lr = tf.keras.callbacks.ReduceLROnPlateau(
        monitor="val_loss",
        factor=0.5,
        patience=5,
        verbose=2,
    )

    # Stop once val_loss has not improved for 5 epochs and roll back to the
    # best weights seen.
    es = tf.keras.callbacks.EarlyStopping(
        monitor="val_loss",
        patience=5,
        verbose=1,
        mode="min",
        restore_best_weights=True,
    )

    # Fitting the model
    history = model.fit(
        X_train,
        y_train,
        validation_data=(X_valid, y_valid),
        epochs=epochs,
        batch_size=128,
        verbose=verbose,
        callbacks=[lr, es],
    )

    return history, model

In [None]:
# def f1_score(y_true, y_pred):

#     # Count positive samples.
#     c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
#     c2 = K.sum(K.round(K.clip(y_pred, 0, 1)))
#     c3 = K.sum(K.round(K.clip(y_true, 0, 1)))

#     # If there are no true samples, fix the F1 score at 0.
#     if c3 == 0:
#         return 0

#     # How many selected items are relevant?
#     precision = c1 / c2

#     # How many relevant items are selected?
#     recall = c1 / c3

#     # Calculate f1_score
#     f1_score = 2 * (precision * recall) / (precision + recall)
#     return f1_score 

In [None]:
# Function plotting the accuracy and loss curves recorded during training

def plot_results(history):
    """Plot per-epoch accuracy and loss side by side.

    Parameters
    ----------
    history
        Object whose ``history`` attribute is a dict of per-epoch metric
        lists, as returned by ``model.fit``.

    Raises
    ------
    KeyError
        If the history holds no recognised accuracy metric or no ``loss``.
    """
    logs = history.history

    # Keras names history entries after the compiled metric, so accept the
    # common accuracy aliases instead of assuming the key is 'accuracy'.
    accuracy_keys = (
        'accuracy',
        'sparse_categorical_accuracy',
        'categorical_accuracy',
    )
    acc_key = next((key for key in accuracy_keys if key in logs), None)
    if acc_key is None:
        raise KeyError(
            f"no accuracy metric in history; available keys: {sorted(logs)}"
        )

    # Epochs are numbered from 1 on the x axis
    x_ticks = np.arange(len(logs[acc_key])) + 1

    plt.figure(figsize=(10, 5))

    # (history key, legend label, axis/title text) for the two panels
    panels = (
        (acc_key, 'accuracy', 'Accuracy'),
        ('loss', 'loss', 'Loss'),
    )
    for position, (key, label, title) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(x_ticks, logs[key], label=label)
        # Validation curves exist only when fit() was given validation data
        if f'val_{key}' in logs:
            plt.plot(x_ticks, logs[f'val_{key}'], label=f'val_{label}')
        plt.xticks(x_ticks)
        plt.xlabel('Epochs')
        plt.ylabel(title)
        plt.grid(True)
        plt.title(f'{title} Over Epochs')
        plt.legend()

    plt.tight_layout()

In [None]:
## Create and train NN model
# Fits the baseline network on the hold-out split created above and keeps
# both the fit history (for plotting) and the trained model.
history, model = nn_model(X_train, y_train, X_valid, y_valid)

In [None]:
# Accuracy and loss curves of the baseline run
plot_results(history)

## K-Fold Cross Validation

In [None]:
# 5-fold cross validation of nn_model over the full training set
X = train[features]
y = train['target']

k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=0)

accuracy_per_fold = []
loss_per_fold = []

for fold_num, (train_index, valid_index) in enumerate(kf.split(X), start=1):

    ## Splitting the data into train and validation parts.
    # KFold yields positional indices while X and y are indexed by row_id,
    # so rows must be selected by position (.iloc) for both.
    # Note: this rebinds the notebook-level X_train / X_valid / y_train /
    # y_valid names on every fold.
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

    ## Create and train NN model
    history, model = nn_model(X_train, y_train, X_valid, y_valid, verbose=0)

    ## Plotting accuracy and loss
    plot_results(history)

    ## Get and save the accuracy and loss (scores = [loss, metric])
    scores = model.evaluate(X_valid, y_valid, verbose=0)
    accuracy_per_fold.append(scores[1] * 100)
    loss_per_fold.append(scores[0])
    print(f'Score for fold {fold_num}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%')

In [None]:
# Cross-validation summary: mean +/- standard deviation over the folds
print(
    "KFold Results: ",
    "------------------",
    f"Accuracy mean: {np.mean(accuracy_per_fold):.3f} +/- {np.std(accuracy_per_fold):.3f}",
    f"Loss mean: {np.mean(loss_per_fold):.3f} +/- {np.std(loss_per_fold):.3f}",
    sep="\n",
)

### INSIGHTS
* Based on the standard deviation, the model has low variance
    * the accuracy and loss of the training and validation sets are fairly consistent
* Based on the mean, the bias is low
    * the mean accuracy is 95%, so the error is already low, but we may be able to reduce it further

In [None]:
# Creating the final model that is fitted on the entire training set.
# The architecture mirrors nn_model (five regularised hidden layers) so that
# the K-Fold scores describe the network that is actually submitted.

tf.keras.backend.clear_session()

# Each sample is a flat vector of `dims` features. The batch axis is
# implicit, so the per-sample shape is (dims,) rather than (None, dims).
inputs = tf.keras.Input(shape=(dims,))

layer = inputs
for _layer_idx in range(5):
    layer = tf.keras.layers.Dense(
        32,
        activation='relu',
        kernel_initializer=tf.keras.initializers.HeNormal(seed=seed),
        kernel_regularizer=tf.keras.regularizers.l2(regularizer),
    )(layer)

outputs = tf.keras.layers.Dense(
    classes,
    activation='softmax',
)(layer)

full_model = tf.keras.Model(inputs=inputs, outputs=outputs)



In [None]:
# Compile the final model for integer class labels.
# The 'AUC' metric was dropped: tf.keras.metrics.AUC needs y_true and y_pred
# of the same shape, which sparse integer labels against a multi-class
# softmax output are not, so it fails when the metric is updated.
full_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy'],
)

In [None]:
# Stop early when the training loss has not improved for 3 epochs
# (no validation data is used here, so the training loss is monitored).
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)

# Fit on the entire training set; the callback must be passed to fit()
# to have any effect.
history = full_model.fit(
    X,
    y,
    epochs=15,
    callbacks=[callback],
)

## Error Analysis

In [None]:
# Class probabilities for the validation rows, then the most likely class id.
# NOTE(review): X_valid is a subset of X, which full_model was fitted on, so
# these predictions are not made on held-out data.
y_preds = full_model.predict(X_valid)
y_preds_final = y_preds.argmax(axis=1)

In [None]:
# confusion_matrix expects (y_true, y_pred): rows are the true classes and
# columns the predicted ones, matching the classification_report call.
cf_mat = confusion_matrix(y_valid, y_preds_final)
print(cf_mat)

In [None]:
# Per-class precision / recall / F1 for the validation predictions
print(classification_report(y_true=y_valid, y_pred=y_preds_final))

## Hyperparameter Tuning with Optuna

In [None]:
# def create_model(trial):
#     # We optimize the numbers of layers, their units and weight decay parameter.
#     n_layers = trial.suggest_int("n_layers", 1, 3)
#     weight_decay = trial.suggest_float("weight_decay", 1e-10, 1e-3, log=True)
#     model = tf.keras.Sequential()
#     model.add(tf.keras.layers.Flatten())
#     for i in range(n_layers):
#         num_hidden = trial.suggest_int("n_units_l{}".format(i), 4, 128, log=True)
#         model.add(
#             tf.keras.layers.Dense(
#                 num_hidden,
#                 activation="relu",
#                 kernel_regularizer=tf.keras.regularizers.l2(weight_decay),
#             )
#         )
#     model.add(
#         tf.keras.layers.Dense(classes, kernel_regularizer=tf.keras.regularizers.l2(weight_decay))
#     )
#     return model

In [None]:
# def create_optimizer(trial):
#     # We optimize the choice of optimizers as well as their parameters.
#     kwargs = {}
#     optimizer_options = ["RMSprop", "Adam", "SGD"]
#     optimizer_selected = trial.suggest_categorical("optimizer", optimizer_options)
#     if optimizer_selected == "RMSprop":
#         kwargs["learning_rate"] = trial.suggest_float(
#             "rmsprop_learning_rate", 1e-5, 1e-1, log=True
#         )
#         kwargs["decay"] = trial.suggest_float("rmsprop_decay", 0.85, 0.99)
#         kwargs["momentum"] = trial.suggest_float("rmsprop_momentum", 1e-5, 1e-1, log=True)
#     elif optimizer_selected == "Adam":
#         kwargs["learning_rate"] = trial.suggest_float("adam_learning_rate", 1e-5, 1e-1, log=True)
#     elif optimizer_selected == "SGD":
#         kwargs["learning_rate"] = trial.suggest_float(
#             "sgd_opt_learning_rate", 1e-5, 1e-1, log=True
#         )
#         kwargs["momentum"] = trial.suggest_float("sgd_opt_momentum", 1e-5, 1e-1, log=True)

#     optimizer = getattr(tf.optimizers, optimizer_selected)(**kwargs)
#     return optimizer

In [None]:
# def learn(model, optimizer, dataset, mode="eval"):
    
#     (X_train, y_train), (X_valid, y_valid)
    
#     model.compile(
#         loss="sparse_categorical_crossentropy", 
#         optimizer=optimizer, 
#         metrics=["accuracy"]
#     )
    
#     model.fit(
#         X_train,
#         y_train,
#         validation_data=(X_valid, y_valid),
#         shuffle=True,
# #         batch_size=BATCHSIZE,
#         epochs=EPOCHS,
#         verbose=False,
#     )

#     # Evaluate the model accuracy on the validation set.
#     score = model.evaluate(X_valid, y_valid, verbose=0)
#     return score

In [None]:
# def objective(trial):
#     # Get MNIST data.
#     train_ds, valid_ds = (X_train, y_train), (X_valid, y_valid)

#     # Build model and optimizer.
#     model = create_model(trial)
#     optimizer = create_optimizer(trial)

#     # Training and validating cycle.
#     with tf.device("/cpu:0"):
#         for _ in range(EPOCHS):
#             learn(model, optimizer, train_ds, "train")

#         accuracy = learn(model, optimizer, valid_ds, "eval")

#     # Return last validation accuracy.
#     return accuracy.result()

In [None]:
# study = optuna.create_study(direction="maximize")
# study.optimize(objective, n_trials=1)

# print("Number of finished trials: ", len(study.trials))

# print("Best trial:")
# trial = study.best_trial

# print("  Value: ", trial.value)

# print("  Params: ")
# for key, value in trial.params.items():
#     print("    {}: {}".format(key, value))

## Test

In [None]:
# Load the test set and discard the row_id column so only features remain
test = pd.read_csv('../input/tabular-playground-series-feb-2022/test.csv')
test = test.drop(columns=['row_id'])

### Scaling the test data

In [None]:
# All remaining test columns are features (row_id was dropped above)
test_features = test.columns
# Display the feature names
test_features

In [None]:
# Apply the scaler fitted on the training features (transform only, no re-fit)
test[test_features] = sc.transform(test[test_features])

### Predicting on the Test Data

In [None]:
# Class probabilities for every test row
preds = full_model.predict(test)
# Most likely class id per row (vectorised, as in the error analysis)
temp = preds.argmax(axis=1)
# Map the integer ids back to the original class labels
final = le.inverse_transform(temp)

## Creating the Submission File

In [None]:
# Fill the sample submission's target column with the predicted labels and
# write it out without the DataFrame index
submission = pd.read_csv(
    '../input/tabular-playground-series-feb-2022/sample_submission.csv'
).assign(target=final)
submission.to_csv("submission.csv", index=False)

In [None]:
# Preview the first rows of the submission
submission.head()