**Body Performance Multiclass classification using ANNs**
===========================================================

**Tom Wright-Anderson**

Dataset: https://www.kaggle.com/datasets/kukuroo3/body-performance-data

In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

In [None]:
# NOTE(review): absolute, machine-specific path - the notebook only runs where this file exists.
DATA_PATH = 'C:/Users/tomwr/Datascience/Datasets/Tabular/body_performance_multiclass_classification.csv'

data = pd.read_csv(filepath_or_buffer = DATA_PATH)

In [None]:
# Summarise the freshly loaded frame (columns, dtypes, non-null counts).
data.info()

In [None]:
#Changing datatypes - age and the two blood-pressure columns become int64 in one astype call
data = data.astype({'age': 'int64',
                    'diastolic': 'int64',
                    'systolic': 'int64'})

In [None]:
#Feature engineering - extra features
#Columns are appended in this order: BMI, relative_jump, dummy.

data = data.assign(**{
    # Weight in kg divided by the square of height in metres.
    'BMI': data['weight_kg'] / (data['height_cm'] / 100) ** 2,
    # Broad-jump length expressed as a fraction of body height.
    'relative_jump': data['broad jump_cm'] / data['height_cm'],
    # Dummy column for violinplots (constant x value shared by every row).
    'dummy': 1,
})

In [None]:
# Re-inspect the frame after the dtype casts and the added feature columns.
data.info()

In [None]:
# Count the rows in each value of the target column 'class'.
data['class'].value_counts() # Pretty much balanced classes so no class weighting needed.

**Violin plots of all variables, split coloured by gender**

In [None]:
# One entry per panel, in row-major grid order:
# (column to plot, panel title, y-axis label, extra keyword arguments for sns.violinplot).
violin_panels = [
    ('height_cm', 'Height', 'Height (cm)', {}),
    ('weight_kg', 'Weight', 'Weight (kg)', {}),
    ('body fat_%', 'Body Fat', 'Body fat (%)', {}),
    ('diastolic', 'Diastolic pressure minimum', 'Diastolic', {}),
    ('systolic', 'Systolic pressure minimum', 'Systolic', {}),
    ('gripForce', 'Grip strength', 'Grip strength (kg)', {}),
    ('sit and bend forward_cm', 'Sit and bend forward reach', 'Reach (cm)', {'cut': 0}),
    ('sit-ups counts', 'Sit-up count', 'Number of sit-ups', {}),
    ('broad jump_cm', 'Broad jump length', 'Jump length (cm)', {}),
    ('BMI', 'BMI', 'BMI Value', {}),
    ('relative_jump', 'Relative Jump', 'Jump relative to height ratio', {}),
]

fig, axes = plt.subplots(nrows = 4,
                         ncols = 3,
                         sharex = True,
                         sharey = False
                        )
fig.set_size_inches(16, 16)

fig.suptitle('Body Performance Metrics by Gender',
             fontweight = 'bold'
            )

# Flatten once and keep the array under its own name; the loop variable below
# must not overwrite it (the original `for ax in ax.flatten()` did).
panel_axes = axes.flatten()

for panel_ax, (column, title, ylabel, extra_kwargs) in zip(panel_axes, violin_panels):
    sns.violinplot(data = data, x = 'dummy', y = column, hue = 'gender', split = True,
                   ax = panel_ax, **extra_kwargs)
    panel_ax.set_title(title)
    panel_ax.set_ylabel(ylabel)

    # The x value is only the constant 'dummy' placeholder, so blank out its tick and label.
    panel_ax.tick_params(axis = 'x',
                         color = 'white',
                         labelcolor = 'white')
    panel_ax.set_xlabel('')

# The 4x3 grid has more cells than there are variables; hide the unused ones
# instead of leaving empty frames in the figure.
for unused_ax in panel_axes[len(violin_panels):]:
    unused_ax.set_visible(False)

In [None]:
#Turn categorical variables into numerical

# Gender: M -> 0, F -> 1. The result is assigned back to the column; calling
# .replace(..., inplace = True) on data['gender'] acts on an intermediate object,
# which pandas warns about and which leaves `data` untouched under copy-on-write.
gender_codes = {'M': 0, 'F': 1}

unexpected_genders = set(data['gender'].unique()) - set(gender_codes)
if unexpected_genders:
    # Fail loudly rather than letting unmapped values turn into NaN.
    raise ValueError(f'Unexpected gender values: {sorted(map(str, unexpected_genders))}')

data['gender'] = data['gender'].map(gender_codes)

# Target: one-hot encode 'class' into class_A ... class_D, appended after the other columns.
# dtype is pinned because newer pandas versions default to boolean indicator columns.
data = pd.get_dummies(data = data, columns = ['class'], dtype = 'uint8')

In [None]:
#Removing dummy - it was only needed as the x value for the violin plots
data = data.drop(columns = ['dummy'])

In [None]:
# Inspect the frame after encoding 'gender' and 'class' and dropping 'dummy'.
data.info()

In [None]:
#Splitting dataset
SPLIT_SEED = 42  # fixed so the train / validation / test partition is the same on every run

class_columns = ['class_A', 'class_B', 'class_C', 'class_D']

# Targets: the four one-hot class columns as an array.
y = data[class_columns].values
# Features: every remaining column, selected by name rather than by a positional
# slice so the selection stays correct if columns are added or reordered.
X = data.drop(columns = class_columns)

print(X.shape)
print(y.shape)

# 85% train; the remaining 15% is then halved into validation and test (7.5% each).
# Both splits are stratified on the class index so each part keeps the class proportions.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size = 0.15,
                                                    random_state = SPLIT_SEED,
                                                    stratify = np.argmax(y, axis = 1))
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test,
                                                test_size = 0.5,
                                                random_state = SPLIT_SEED,
                                                stratify = np.argmax(y_test, axis = 1))

In [None]:
#Scaling dataset using RobustScaler as some outliers 
from sklearn.preprocessing import RobustScaler

# Fit the scaler on the training split only, then apply that same fitted
# transform to the training, validation and test splits.
robustscaler = RobustScaler(quantile_range = (15, 85)).fit(X_train)

X_train, X_val, X_test = [robustscaler.transform(split)
                          for split in (X_train, X_val, X_test)]


**Model creation - Tensorflow / Keras**

In [None]:
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Input, Dropout, BatchNormalization, Activation
from keras.optimizers import Adam, Nadam, Adadelta
from keras.metrics import SparseCategoricalCrossentropy, KLDivergence, CategoricalCrossentropy
from keras.losses import SparseCategoricalCrossentropy, CategoricalCrossentropy
from keras.layers.activation import *
from keras.callbacks import ReduceLROnPlateau

In [None]:
#Function for displaying model training metrics and validation performance
from sklearn.metrics import accuracy_score

def display_model_metrics(name, history):
    """Plot the per-epoch training and validation metrics of one fitted model.

    Two side-by-side panels are drawn against the epoch number (starting at 1):
    the categorical cross-entropy metric and the KL-divergence metric, each as a
    training curve and a validation (``val_`` prefixed) curve.

    Parameters
    ----------
    name : str
        Model label used in the figure's super-title.
    history : object
        Object whose ``history`` attribute maps metric names to per-epoch values.
        It must provide ``categorical_crossentropy``,
        ``kullback_leibler_divergence`` and their ``val_`` counterparts.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    metrics_per_epoch = pd.DataFrame(history.history)
    epochs = np.arange(1, len(metrics_per_epoch) + 1, 1)

    fig, axes = plt.subplots(nrows = 1, ncols = 2, sharex = True, sharey = False)
    fig.set_size_inches(10, 10)

    # (panel title, y-axis label, [(history key, legend label), ...])
    panels = [
        ('Loss', 'Categorical crossentropy',
         [('categorical_crossentropy', 'categorical crossentropy'),
          ('val_categorical_crossentropy', 'validation categorical crossentropy')]),
        ('KLDivergence', 'KLDivergence',
         [('kullback_leibler_divergence', 'KLDivergence'),
          ('val_kullback_leibler_divergence', 'Validation KLDivergence')]),
    ]

    for panel, (title, ylabel, curves) in zip(axes, panels):
        for key, label in curves:
            panel.plot(epochs, metrics_per_epoch[key], label = label)
        panel.set_title(title)
        panel.set_ylabel(ylabel)
        panel.set_xlabel('Epochs')
        panel.legend()

    fig.suptitle(f"{name}'s performance metrics", fontweight = 'bold')
    plt.show()

In [None]:
#Function for creating and visualizing a confusion matrix
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

def display_confusion_matrix(X_val, X_test, y_val, y_test, model):
    """Plot confusion matrices of ``model`` on the validation and the test split.

    True and predicted labels are the arg-max over the class axis of the one-hot
    targets and of ``model.predict`` respectively. Each matrix is normalised over
    all of its entries (``normalize = 'all'``), so a cell is the fraction of the
    split's samples with that (true, predicted) pair.

    Parameters
    ----------
    X_val, X_test : array-like
        Feature matrices passed to ``model.predict``.
    y_val, y_test : array-like of shape (n_samples, 4)
        One-hot encoded targets, columns ordered Class A .. Class D.
    model : object
        Fitted model exposing ``predict`` that returns one score per class.

    Returns
    -------
    None
        Two figures are drawn, titled 'Validation set' and 'Test set'.

    Notes
    -----
    NOTE(review): this shows test-set results for every candidate model; using
    them to choose between models makes the final test score optimistic.
    """
    class_names = ['Class A', 'Class B', 'Class C', 'Class D']
    # Fix the label set so the matrix is always 4x4, even if a class happens to
    # be absent from both the targets and the predictions of a split.
    class_indices = list(range(len(class_names)))

    splits = [('Validation set', X_val, y_val),
              ('Test set', X_test, y_test)]

    # Predict for both splits first, then draw, so the figures appear together.
    matrices = []
    for title, features, one_hot_targets in splits:
        true_labels = np.argmax(one_hot_targets, axis = 1)
        # The arg-max of the predicted scores is the predicted class index;
        # no intermediate one-hot matrix is needed.
        predicted_labels = np.argmax(model.predict(features), axis = 1)
        matrices.append((title,
                         confusion_matrix(true_labels, predicted_labels,
                                          labels = class_indices,
                                          normalize = 'all')))

    for title, matrix in matrices:
        cm_display = ConfusionMatrixDisplay(matrix, display_labels = class_names)
        cm_display.plot()
        # Label each figure so the validation and test matrices can be told apart.
        cm_display.ax_.set_title(title)
    

    
    
    
    
    

In [None]:
#Create some callbacks
from keras.callbacks import *

# Learning-rate reducers: both watch val_loss with a patience of 20 epochs and
# differ only in the factor applied to the learning rate.
reduce_lr_on_plateau_half = ReduceLROnPlateau(factor = 0.5, patience = 20, monitor = 'val_loss') #Halved
reduce_lr_on_plateau_f10 = ReduceLROnPlateau(factor = 0.1, patience = 20, monitor = 'val_loss')

# Early stopping on val_loss with a patience of 50 epochs.
early_stopping_50 = EarlyStopping(patience = 50, monitor = 'val_loss')






**Methodology**

Making incremental changes in model structure and hyperparameters to narrow down to an optimal model

In [None]:
# m1 - baseline: three hidden layers of 16 ReLU units, dropout after the first two.
m1_name = 'm1_TF_3x16'
m1_epochs = 5

m1 = Sequential([
    BatchNormalization(),

    Dense(units = 16, kernel_initializer = 'he_normal'),
    Activation('relu'),
    Dropout(0.1),

    Dense(units = 16, kernel_initializer = 'he_normal'),
    Activation('relu'),
    Dropout(0.1),

    Dense(units = 16, kernel_initializer = 'he_normal'),
    Activation('relu'),
    BatchNormalization(),

    # Activation given by name rather than as a Softmax() layer instance: the
    # `activation` argument expects a function, and this matches m2's output layer.
    Dense(units = 4, activation = 'softmax')
])

# The two metrics are what display_model_metrics plots from the training history.
m1.compile(loss = CategoricalCrossentropy(),
           optimizer = Adam(),
           metrics = [CategoricalCrossentropy(), KLDivergence()]
          )
m1_history = m1.fit(X_train, y_train,
                    validation_data = (X_val, y_val),
                    epochs = m1_epochs
                   )
           

In [None]:
# Plot m1's per-epoch training and validation metric curves.
display_model_metrics(m1_name, m1_history)

In [None]:
# Show m1's confusion matrices for the validation and test splits.
display_confusion_matrix(X_val, X_test, y_val, y_test, m1)

In [None]:
# m2 - narrower and deeper: four hidden layers of 8 ReLU units, no dropout, Adadelta optimizer.
m2_name = 'm2_TF_4x8'
m2_epochs = 25

m2 = Sequential()
m2.add(BatchNormalization())
for hidden_layer in range(4):
    # Each hidden block is a linear Dense layer followed by a separate ReLU activation.
    m2.add(Dense(8, kernel_initializer = 'he_normal'))
    m2.add(Activation('relu'))
m2.add(BatchNormalization())

m2.add(Dense(4, activation = 'softmax'))

m2.compile(optimizer = Adadelta(),
           loss = 'categorical_crossentropy',
           metrics = [CategoricalCrossentropy(), KLDivergence()])

# NOTE(review): patience (25) equals the epoch budget (25), so this reducer
# presumably never gets the chance to fire during the run - confirm intent.
m2_lr_on_plateau = ReduceLROnPlateau(factor = 0.9, patience = 25, monitor = 'val_loss')

m2_history = m2.fit(X_train, y_train,
                    epochs = m2_epochs,
                    validation_data = (X_val, y_val),
                    callbacks = [m2_lr_on_plateau])
       

In [None]:
# Plot m2's training/validation curves, then its validation and test confusion matrices.
display_model_metrics(m2_name, m2_history)

display_confusion_matrix(X_val, X_test, y_val, y_test, m2)

In [None]:
# m3 - the m1 layout (3 x 16 ReLU) with L2 kernel regularization instead of dropout,
# trained for up to 250 epochs with LR halving on plateau and early stopping.
m3_name = 'm3_TF_3x16_reg'
m3_epochs = 250

m3 = Sequential([
    BatchNormalization(),

    Dense(units = 16, kernel_initializer = 'he_normal', kernel_regularizer = 'l2'),
    Activation('relu'),

    Dense(units = 16, kernel_initializer = 'he_normal', kernel_regularizer = 'l2'),
    Activation('relu'),

    Dense(units = 16, kernel_initializer = 'he_normal', kernel_regularizer = 'l2'),
    Activation('relu'),
    BatchNormalization(),

    # Activation given by name rather than as a Softmax() layer instance: the
    # `activation` argument expects a function, and this matches m2's output layer.
    Dense(units = 4, activation = 'softmax')
])

# The two metrics are what display_model_metrics plots from the training history.
m3.compile(loss = CategoricalCrossentropy(),
           optimizer = Adam(),
           metrics = [CategoricalCrossentropy(), KLDivergence()]
          )
m3_history = m3.fit(X_train, y_train,
                    validation_data = (X_val, y_val),
                    epochs = m3_epochs,
                    callbacks = [reduce_lr_on_plateau_half, early_stopping_50]
                   )

In [None]:
# Plot m3's training/validation curves, then its validation and test confusion matrices.
display_model_metrics(m3_name, m3_history)

display_confusion_matrix(X_val, X_test, y_val, y_test, m3)

In [None]:
#Same as Model 3 but using ELU for activation rather than ReLU - supposedly more effective.
# NOTE(review): m4_name carries the same '3x16_reg' suffix as m3 although the activation differs.

m4_name = 'm4_TF_3x16_reg'
m4_epochs = 250

m4 = Sequential([
    BatchNormalization(),

    Dense(units = 16, kernel_initializer = 'he_normal', kernel_regularizer = 'l2'),
    Activation('elu'),

    Dense(units = 16, kernel_initializer = 'he_normal', kernel_regularizer = 'l2'),
    Activation('elu'),

    Dense(units = 16, kernel_initializer = 'he_normal', kernel_regularizer = 'l2'),
    Activation('elu'),
    BatchNormalization(),

    # Activation given by name rather than as a Softmax() layer instance: the
    # `activation` argument expects a function, and this matches m2's output layer.
    Dense(units = 4, activation = 'softmax')
])

m4.compile(loss = CategoricalCrossentropy(),
           optimizer = Adam(),
           metrics = [CategoricalCrossentropy(), KLDivergence()]
          )
m4_history = m4.fit(X_train, y_train,
                    validation_data = (X_val, y_val),
                    epochs = m4_epochs,
                    callbacks = [reduce_lr_on_plateau_half, early_stopping_50]
                   )

In [None]:
#Same as Model 4 but using SELU (with lecun_normal initialization) rather than ELU -
#ELU did not seem very effective - high validation loss.
#Also removed the L2 regularization and the final BatchNormalization
#(the BatchNormalization on the inputs is kept).


m5_name = 'm5_TF_3x16_selu'
m5_epochs = 250

m5 = Sequential([
    BatchNormalization(),

    Dense(units = 16, kernel_initializer = 'lecun_normal'),
    Activation('selu'),

    Dense(units = 16, kernel_initializer = 'lecun_normal'),
    Activation('selu'),

    Dense(units = 16, kernel_initializer = 'lecun_normal'),
    Activation('selu'),

    # Activation given by name rather than as a Softmax() layer instance: the
    # `activation` argument expects a function, and this matches m2's output layer.
    Dense(units = 4, activation = 'softmax')
])

m5.compile(loss = CategoricalCrossentropy(),
           optimizer = Adam(),
           metrics = [CategoricalCrossentropy(), KLDivergence()]
          )
m5_history = m5.fit(X_train, y_train,
                    validation_data = (X_val, y_val),
                    epochs = m5_epochs,
                    callbacks = [reduce_lr_on_plateau_half, early_stopping_50]
                   )

In [None]:
# Plot m5's training/validation curves, then its validation and test confusion matrices.
display_model_metrics(m5_name, m5_history)

display_confusion_matrix(X_val, X_test, y_val, y_test, m5)

**SELU improved performance, and both validation and training loss were still decreasing when the epoch limit was reached, so the limit is increased to 500 for model 6.**

In [None]:
#Same as M5, but now 500 epoch maximum, slightly increased neurons to 18 per layer


m6_name = 'm6_TF_3x18_selu'
m6_epochs = 500

m6 = Sequential([
    BatchNormalization(),

    Dense(units = 18, kernel_initializer = 'lecun_normal'),
    Activation('selu'),

    Dense(units = 18, kernel_initializer = 'lecun_normal'),
    Activation('selu'),

    Dense(units = 18, kernel_initializer = 'lecun_normal'),
    Activation('selu'),

    # Activation given by name rather than as a Softmax() layer instance: the
    # `activation` argument expects a function, and this matches m2's output layer.
    Dense(units = 4, activation = 'softmax')
])

m6.compile(loss = CategoricalCrossentropy(),
           optimizer = Adam(),
           metrics = [CategoricalCrossentropy(), KLDivergence()]
          )
m6_history = m6.fit(X_train, y_train,
                    validation_data = (X_val, y_val),
                    epochs = m6_epochs,
                    callbacks = [reduce_lr_on_plateau_half, early_stopping_50]
                   )

In [None]:
# Plot m6's training/validation curves, then its validation and test confusion matrices.
display_model_metrics(m6_name, m6_history)

display_confusion_matrix(X_val, X_test, y_val, y_test, m6)

In [None]:
#m7 - same layers as m5 (3 x 16 SELU) but with the 500 epoch maximum used for m6.
m7_name = 'm7_TF_3x16_selu'
m7_epochs = 500

m7 = Sequential([
    BatchNormalization(),

    Dense(units = 16, kernel_initializer = 'lecun_normal'),
    Activation('selu'),

    Dense(units = 16, kernel_initializer = 'lecun_normal'),
    Activation('selu'),

    Dense(units = 16, kernel_initializer = 'lecun_normal'),
    Activation('selu'),

    # Activation given by name rather than as a Softmax() layer instance: the
    # `activation` argument expects a function, and this matches m2's output layer.
    Dense(units = 4, activation = 'softmax')
])

m7.compile(loss = CategoricalCrossentropy(),
           optimizer = Adam(),
           metrics = [CategoricalCrossentropy(), KLDivergence()]
          )
m7_history = m7.fit(X_train, y_train,
                    validation_data = (X_val, y_val),
                    epochs = m7_epochs,
                    callbacks = [reduce_lr_on_plateau_half, early_stopping_50]
                   )

In [None]:
#m8 - same layers as m7 (3 x 16 SELU) but with a new ReduceLROnPlateau:
#factor 0.99 (a 1% cut) with a patience of 3 epochs on val_loss.
#This seemed to improve performance!!

reduce_lr_on_plateau_3 = ReduceLROnPlateau(monitor = 'val_loss',
                                           patience = 3,
                                           factor = 0.99)

m8_name = 'm8_TF_3x16_selu'
m8_epochs = 500

m8 = Sequential([
    BatchNormalization(),

    Dense(units = 16, kernel_initializer = 'lecun_normal'),
    Activation('selu'),

    Dense(units = 16, kernel_initializer = 'lecun_normal'),
    Activation('selu'),

    Dense(units = 16, kernel_initializer = 'lecun_normal'),
    Activation('selu'),

    # Activation given by name rather than as a Softmax() layer instance: the
    # `activation` argument expects a function, and this matches m2's output layer.
    Dense(units = 4, activation = 'softmax')
])

m8.compile(loss = CategoricalCrossentropy(),
           optimizer = Adam(),
           metrics = [CategoricalCrossentropy(), KLDivergence()]
          )
m8_history = m8.fit(X_train, y_train,
                    validation_data = (X_val, y_val),
                    epochs = m8_epochs,
                    callbacks = [reduce_lr_on_plateau_3, early_stopping_50]
                   )

In [None]:
# Plot m8's training/validation curves, then its validation and test confusion matrices.
display_model_metrics(m8_name, m8_history)

display_confusion_matrix(X_val, X_test, y_val, y_test, m8)

In [None]:
#m9 - same as m8, but slightly more neurons (18 per layer), and a slightly stronger
#ReduceLROnPlateau callback (factor 0.98, i.e. -2%, with a patience of 3 epochs)

reduce_lr_on_plateau_3_2 = ReduceLROnPlateau(monitor = 'val_loss',
                                             patience = 3,
                                             factor = 0.98)
m9_name = 'm9_TF_3x18_selu'
m9_epochs = 500

m9 = Sequential([
    BatchNormalization(),

    Dense(units = 18, kernel_initializer = 'lecun_normal'),
    Activation('selu'),

    Dense(units = 18, kernel_initializer = 'lecun_normal'),
    Activation('selu'),

    Dense(units = 18, kernel_initializer = 'lecun_normal'),
    Activation('selu'),

    # Activation given by name rather than as a Softmax() layer instance: the
    # `activation` argument expects a function, and this matches m2's output layer.
    Dense(units = 4, activation = 'softmax')
])

m9.compile(loss = CategoricalCrossentropy(),
           optimizer = Adam(),
           metrics = [CategoricalCrossentropy(), KLDivergence()]
          )
m9_history = m9.fit(X_train, y_train,
                    validation_data = (X_val, y_val),
                    epochs = m9_epochs,
                    callbacks = [reduce_lr_on_plateau_3_2, early_stopping_50]
                   )

In [None]:
# Plot m9's training/validation curves, then its validation and test confusion matrices.
display_model_metrics(m9_name, m9_history)

display_confusion_matrix(X_val, X_test, y_val, y_test, m9)

In [None]:
#m10 - Same as m9 but 24 units per layer, and a 4% reduction in lr (factor 0.96),
#as this kind of change previously slightly improved performance.
#This did not seem to work, model did not converge effectively.

reduce_lr_on_plateau_3_4 = ReduceLROnPlateau(monitor = 'val_loss',
                                             patience = 3,
                                             factor = 0.96)
m10_name = 'm10_TF_3x24_selu'
m10_epochs = 500

m10 = Sequential([
    BatchNormalization(),

    Dense(units = 24, kernel_initializer = 'lecun_normal'),
    Activation('selu'),

    Dense(units = 24, kernel_initializer = 'lecun_normal'),
    Activation('selu'),

    Dense(units = 24, kernel_initializer = 'lecun_normal'),
    Activation('selu'),

    # Activation given by name rather than as a Softmax() layer instance: the
    # `activation` argument expects a function, and this matches m2's output layer.
    Dense(units = 4, activation = 'softmax')
])

m10.compile(loss = CategoricalCrossentropy(),
            optimizer = Adam(),
            metrics = [CategoricalCrossentropy(), KLDivergence()]
           )
m10_history = m10.fit(X_train, y_train,
                      validation_data = (X_val, y_val),
                      epochs = m10_epochs,
                      callbacks = [reduce_lr_on_plateau_3_4, early_stopping_50]
                     )

In [None]:
# Plot m10's training/validation curves, then its validation and test confusion matrices.
display_model_metrics(m10_name, m10_history)

display_confusion_matrix(X_val, X_test, y_val, y_test, m10)

**Model 9 seemed to work best, so its performance is evaluated on the test set.** It predicted the class correctly ~74% of the time.

In [None]:
# Final check of the selected model on the test split.
# evaluate() only reports the loss and the compiled metrics (cross-entropy and
# KL divergence), so the classification accuracy is computed explicitly from
# the arg-max of the predicted class scores.
m9_test_metrics = m9.evaluate(X_test, y_test, return_dict = True)
m9_test_metrics['accuracy'] = accuracy_score(np.argmax(y_test, axis = 1),
                                             np.argmax(m9.predict(X_test), axis = 1))
m9_test_metrics

