<a href="https://colab.research.google.com/github/tsholofelo-mokheleli/ACIS-2023-New-Zealand/blob/main/Autoencoders.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Import Libraries**

In [69]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

import tensorflow as tf
from tensorflow import keras

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Conv1D, LSTM

from tensorflow.keras.models import Model

import matplotlib.pyplot as plt

### **Load and Preprocess Data**

**Load the dataset**

In [70]:
data = pd.read_csv("Diabetes Multi-Class.csv", sep=';', on_bad_lines='skip')

In [71]:
data = data.drop(["Patient"], axis=1)
data = data.dropna()

# Convert all columns to int data type
for column in data.columns:
    data[column] = data[column].astype(int)

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype
---  ------                --------------  -----
 0   Diabetes_012          10000 non-null  int64
 1   HighBP                10000 non-null  int64
 2   HighChol              10000 non-null  int64
 3   CholCheck             10000 non-null  int64
 4   BMI                   10000 non-null  int64
 5   Smoker                10000 non-null  int64
 6   Stroke                10000 non-null  int64
 7   HeartDiseaseorAttack  10000 non-null  int64
 8   PhysActivity          10000 non-null  int64
 9   Fruits                10000 non-null  int64
 10  Veggies               10000 non-null  int64
 11  HvyAlcoholConsump     10000 non-null  int64
 12  AnyHealthcare         10000 non-null  int64
 13  NoDocbcCost           10000 non-null  int64
 14  GenHlth               10000 non-null  int64
 15  MentHlth              10000 non-null  int64
 16  PhysH

**Separate features (X) and labels (y)**

In [72]:
X = data.drop(columns=["Diabetes_012"])
y = data["Diabetes_012"]

In [73]:
class_counts = data['Diabetes_012'].value_counts()
print(class_counts)

0    8216
2    1606
1     178
Name: Diabetes_012, dtype: int64


**Split the data into training and validation sets**

In [74]:
x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

**Normalize the features**

In [75]:
scaler = MinMaxScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_val_scaled = scaler.transform(x_val)

### **Build Baseline Model**

In [76]:
# DNN
dnn_model = Sequential([
    Dense(128, activation='relu', input_shape=(x_train_scaled.shape[1],)),
    Dense(64, activation='relu'),
    Dense(3, activation='softmax')
])

# CNN
cnn_model = Sequential([
    Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(x_train_scaled.shape[1], 1)),
    Flatten(),
    Dense(3, activation='softmax')
])

# RNN
rnn_model = Sequential([
    LSTM(64, activation='relu', input_shape=(x_train_scaled.shape[1], 1)),
    Dense(3, activation='softmax')
])

# Compile models
dnn_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
cnn_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
rnn_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train models
dnn_model.fit(x_train_scaled, y_train, epochs=10, validation_data=(x_val_scaled, y_val))
cnn_model.fit(x_train_scaled.reshape((-1, x_train_scaled.shape[1], 1)), y_train, epochs=10, validation_data=(x_val_scaled.reshape((-1, x_val_scaled.shape[1], 1)), y_val))
rnn_model.fit(x_train_scaled.reshape((-1, x_train_scaled.shape[1], 1)), y_train, epochs=10, validation_data=(x_val_scaled.reshape((-1, x_val_scaled.shape[1], 1)), y_val))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f9c76df5090>

In [77]:
# DNN Model Results
dnn_scores = dnn_model.evaluate(x_val_scaled, y_val)
print("DNN Model - Loss:", dnn_scores[0])
print("DNN Model - Accuracy:", dnn_scores[1])

# CNN Model Results
cnn_scores = cnn_model.evaluate(x_val_scaled.reshape((-1, x_val_scaled.shape[1], 1)), y_val)
print("CNN Model - Loss:", cnn_scores[0])
print("CNN Model - Accuracy:", cnn_scores[1])

# RNN Model Results
rnn_scores = rnn_model.evaluate(x_val_scaled.reshape((-1, x_val_scaled.shape[1], 1)), y_val)
print("RNN Model - Loss:", rnn_scores[0])
print("RNN Model - Accuracy:", rnn_scores[1])

DNN Model - Loss: 0.4562285542488098
DNN Model - Accuracy: 0.8234999775886536
CNN Model - Loss: 0.44496509432792664
CNN Model - Accuracy: 0.8234999775886536
RNN Model - Loss: 0.472566694021225
RNN Model - Accuracy: 0.8245000243186951


### **Apply Autoencoder**

 **Apply the autoencoder to perform dimensionality reduction.**

In [78]:
# Define autoencoder architecture
encoding_dim = 8  # Adjust as needed
input_layer = keras.layers.Input(shape=(x_train_scaled.shape[1],))
encoded = keras.layers.Dense(encoding_dim, activation='relu')(input_layer)
decoded = keras.layers.Dense(x_train_scaled.shape[1], activation='sigmoid')(encoded)
autoencoder = Model(input_layer, decoded)

# Compile autoencoder
autoencoder.compile(optimizer='adam', loss='mean_squared_error')

# Train autoencoder
autoencoder.fit(x_train_scaled, x_train_scaled, epochs=50, batch_size=64, validation_data=(x_val_scaled, x_val_scaled))

# Apply autoencoder to data
encoded_x_train = autoencoder.predict(x_train_scaled)
encoded_x_val = autoencoder.predict(x_val_scaled)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


### **Build Models with Encoded Data**

**Build the same deep learning classifiers using the encoded data and compare the results**

In [79]:
# Reshape encoded data for DNN
encoded_x_train_dnn = encoded_x_train  # Already in the correct shape (None, encoding_dim)
encoded_x_val_dnn = encoded_x_val      # Already in the correct shape (None, encoding_dim)


# Select only the required number of samples from encoded_x_train for y_train
# Select only the required number of samples from encoded_x_val for y_val
encoded_x_train_rnn_subset = encoded_x_train[:len(y_train)]
encoded_x_val_rnn_subset = encoded_x_val[:len(y_val)]

# Build models with encoded data
dnn_model_encoded = Sequential([
    Dense(128, activation='relu', input_shape=(21,)),
    Dense(64, activation='relu'),
    Dense(3, activation='softmax')
])

cnn_model_encoded = Sequential([
    Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(encoding_dim, 1)),
    Flatten(),
    Dense(3, activation='softmax')
])

rnn_model_encoded = Sequential([
    LSTM(64, activation='relu', input_shape=(encoding_dim, 1)),
    Dense(3, activation='softmax')
])

# Compile and train models with encoded data
dnn_model_encoded.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
cnn_model_encoded.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
rnn_model_encoded.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Print shape of encoded data to debug
print("Shape of encoded_x_train_dnn:", encoded_x_train_dnn.shape)
print("Shape of encoded_x_val_dnn:", encoded_x_val_dnn.shape)

print('\n')

# Reshape encoded data for CNN
encoded_x_train_cnn = encoded_x_train.reshape((-1, encoding_dim, 1))
encoded_x_val_cnn = encoded_x_val.reshape((-1, encoding_dim, 1))


# Print shapes for debugging
print("Shape of encoded_x_train_cnn:", encoded_x_train_cnn.shape)
print("Shape of encoded_x_val_cnn:", encoded_x_val_cnn.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_val:", y_val.shape)

print('\n')

# Select only the required number of samples from encoded_x_train_cnn for y_train
# Select only the required number of samples from encoded_x_val_cnn for y_val
encoded_x_train_cnn_subset = encoded_x_train_cnn[:len(y_train)]
encoded_x_val_cnn_subset = encoded_x_val_cnn[:len(y_val)]

# Select only the required number of samples from encoded_x_train for y_train
# Select only the required number of samples from encoded_x_val for y_val
encoded_x_train_subset = encoded_x_train[:len(y_train)]
encoded_x_val_subset = encoded_x_val[:len(y_val)]

dnn_model_encoded.fit(encoded_x_train_dnn, y_train, epochs=10, validation_data=(encoded_x_val_dnn, y_val))
cnn_model_encoded.fit(encoded_x_train_cnn_subset, y_train, epochs=10, validation_data=(encoded_x_val_cnn_subset, y_val))
# rnn_model_encoded.fit(encoded_x_train_rnn_subset.reshape((-1, encoding_dim, 1)), y_train, epochs=10, validation_data=(encoded_x_val_rnn_subset.reshape((-1, encoding_dim, 1)), y_val))

Shape of encoded_x_train_dnn: (8000, 21)
Shape of encoded_x_val_dnn: (2000, 21)


Shape of encoded_x_train_cnn: (21000, 8, 1)
Shape of encoded_x_val_cnn: (5250, 8, 1)
Shape of y_train: (8000,)
Shape of y_val: (2000,)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f9c76b37550>

### **Comparison and Visualization**

**Evaluate Models**

In [80]:
# Evaluate baseline models
dnn_scores = dnn_model.evaluate(x_val_scaled, y_val)
# cnn_scores = cnn_model.evaluate(x_val_scaled.reshape((-1, x_val_scaled.shape[1], 1)), y_val)
# rnn_scores = rnn_model.evaluate(x_val_scaled.reshape((-1, x_val_scaled.shape[1], 1)), y_val)

# Evaluate models with encoded data
dnn_scores_encoded = dnn_model_encoded.evaluate(encoded_x_val, y_val)
# cnn_scores_encoded = cnn_model_encoded.evaluate(encoded_x_val.reshape((-1, encoding_dim, 1)), y_val)
# rnn_scores_encoded = rnn_model_encoded.evaluate(encoded_x_val.reshape((-1, encoding_dim, 1)), y_val)

print("Baseline DNN - Accuracy:", dnn_scores[1])
# print("Baseline CNN - Accuracy:", cnn_scores[1])
# print("Baseline RNN - Accuracy:", rnn_scores[1])

print("DNN with Encoded Data - Accuracy:", dnn_scores_encoded[1])
# print("CNN with Encoded Data - Accuracy:", cnn_scores_encoded[1])
# print("RNN with Encoded Data - Accuracy:", rnn_scores_encoded[1])

Baseline DNN - Accuracy: 0.8234999775886536
DNN with Encoded Data - Accuracy: 0.8205000162124634


**Visualization**