In [None]:
# Import tools

import numpy as np
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score
from tensorflow import keras


## Read the data

In [None]:

# Read the raw comma-separated file. It is loaded with header=None, so the
# column names are supplied explicitly here.

file_path = '../magic04.data'
column_names = [
    "fLength", "fWidth", "fSize", "fConc", "fConc1",
    "fAsym", "fM3Long", "fM3Trans", "fAlpha", "fDist",
    "class",
]
data = pd.read_csv(file_path, names=column_names, header=None)


In [None]:
# Preview only the first rows instead of rendering the whole frame.
data.head()

## Data cleaning and preprocessing

In [None]:
# Per-column flag: True if the column contains at least one missing value.
data.isna().any()

### Label encoding for class column

In [None]:
# Replace the class labels with integer codes. The fitted encoder is kept so
# the codes can be mapped back to the original labels later.
label_encoder = LabelEncoder()
label_encoder.fit(data['class'])
data['class'] = label_encoder.transform(data['class'])

In [None]:
# Distinct encoded class values, in order of first appearance.
pd.unique(data['class'])

In [None]:
# Pairwise correlation between every pair of columns.
corr_matrix = data.corr()

# Correlation of each column with the target column.
target_corr = corr_matrix.loc[:, 'class']

# Draw the matrix as a heatmap on a fixed [-1, 1] colour scale.
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Histogram of every column in a single figure.
# Note: the figure size is set through rcParams, so it also applies to
# figures created later in the notebook.
plt.rcParams.update({"figure.figsize": (10, 10)})
data.hist()
plt.grid()

### Split data into training and testing

In [None]:
# Separate the target column from the feature columns.
y = data['class']
X = data.drop(columns='class')

# First hold out 33% of the rows as the test set, then carve 20% of the
# remainder off as the validation set. Both splits use a fixed seed.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.2, random_state=42
)

Based on the distribution of the values, we can see that the `class` attribute is imbalanced. Hence, I will oversample the minority class with SMOTE and then standardize the features.

### SMOTE

In [None]:
# Apply SMOTE to the training split only.
# The validation and test splits are deliberately left untouched: synthetic
# samples in held-out data would make the evaluation metrics unrepresentative
# of the real class distribution.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [None]:
# Standardize the data.
# The scaler statistics are learned from the training features only; the test
# features are transformed with those same statistics.

scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Convert to data type for tensorflow

In [None]:
# Build the NumPy arrays that are fed to the network.
# The features must go through the fitted `scaler`; converting the raw frames
# with np.array() would silently discard the standardization. The resampled
# training rows are what the model trains on, so those are the rows scaled
# here, which keeps them aligned with y_train_resampled.
X_train_scaled = np.asarray(scaler.transform(X_train_resampled))
X_test_scaled = np.asarray(scaler.transform(X_test))
y_train_resampled = np.array(y_train_resampled)
y_test = np.array(y_test)

## Build the ANN Model

In [None]:
import keras
from keras import regularizers
from keras.layers import Dense
from keras import metrics

# Build the ANN model

def build_model(n_hidden=3, n_neurons=32, a=0.001, b=0.002):
    """Create and compile a fully connected network with one output unit.

    Args:
        n_hidden: Number of hidden Dense layers.
        n_neurons: Number of units in each hidden layer.
        a: L1 regularization factor applied to every hidden layer's kernel.
        b: L2 regularization factor applied to every hidden layer's kernel.

    Returns:
        A compiled ``keras.Sequential`` model with ReLU hidden layers and a
        single linear output unit, using mean squared error as both the loss
        and the reported metric. No input shape is declared, so the weights
        are created on the first call to fit/predict.
    """
    model = keras.Sequential()
    for _ in range(n_hidden):
        model.add(Dense(n_neurons,
                        activation='relu',
                        kernel_regularizer=regularizers.L1L2(l1=a, l2=b)))
    model.add(Dense(1))
    # A fresh optimizer per model: an optimizer instance keeps state tied to
    # the variables of the first model it trains, so sharing one instance
    # across the many models built during hyper-parameter search is unsafe.
    model.compile(optimizer=keras.optimizers.Adam(),
                  loss=keras.losses.MeanSquaredError(),
                  metrics=[metrics.MeanSquaredError()])
    return model

In [None]:
# Train the model

model = build_model()

# SMOTE places its synthetic samples after the original rows, and Keras takes
# the `validation_split` fraction from the END of the arrays (before any
# shuffling). Without reordering, the held-out 10% would be a block of
# synthetic minority-class samples. Shuffle once with a fixed seed so the
# validation slice contains a mix of both classes.
shuffle_idx = np.random.default_rng(42).permutation(len(X_train_scaled))

history = model.fit(X_train_scaled[shuffle_idx],
                    y_train_resampled[shuffle_idx],
                    epochs=10,
                    batch_size=32,
                    validation_split=0.1)

### Visualize the training and validation loss per epoch

In [None]:
# Plot the per-epoch losses recorded by `model.fit`. Because the model was
# fitted with a validation split, the History object carries both "loss" and
# "val_loss".
history_dict = history.history

train_loss_values = history_dict["loss"]
val_loss_values = history_dict["val_loss"]

epochs = range(1, len(train_loss_values) + 1)

plt.plot(epochs, train_loss_values, "bo", label="Training loss")
plt.plot(epochs, val_loss_values, "b", label="Validation loss")

plt.title("Training and validation loss")
plt.xlabel("Epochs")
plt.ylabel("Loss")

plt.legend()
plt.show()

In [None]:
# Make predictions on the test set
y_pred = model.predict(X_test_scaled)

# The output layer is linear, so raw predictions are not confined to [0, 1]
# and np.round could produce labels such as -1 or 2. Threshold at 0.5 to get
# valid 0/1 class labels, and flatten the (n, 1) output to 1-D.
y_pred_class = (y_pred > 0.5).astype(int).ravel()

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred_class)

print("Test Accuracy:", accuracy)


## Fine tune the model

In [None]:
# Candidate values for each build_model argument explored by the search.
param_distribs = dict(
    n_hidden=[2, 4, 6],
    n_neurons=[32, 64, 128],
    a=[0.0002, 0.001, 0.005],
    b=[0.0002, 0.001, 0.005],
)

In [None]:
from sklearn.model_selection import GridSearchCV

# Wrap build_model in a scikit-learn compatible regressor.
# NOTE(review): tf.keras.wrappers.scikit_learn is not available in recent
# TensorFlow releases - confirm the pinned TensorFlow version provides it.
keras_reg = tf.keras.wrappers.scikit_learn.KerasRegressor(build_fn=build_model)

# Exhaustive search over param_distribs with 3-fold cross-validation, ranked
# by negated mean squared error.
search_cv = GridSearchCV(
    estimator=keras_reg,
    param_grid=param_distribs,
    cv=3,
    scoring='neg_mean_squared_error',
)

# Stop a candidate's training after 30 epochs without improvement.
early_stopping = tf.keras.callbacks.EarlyStopping(patience=30)

# Extra keyword arguments are passed through to the underlying Keras fit call.
# NOTE(review): X_val is passed as produced by the train/validation split;
# confirm it has had the same preprocessing as X_train_scaled.
search_cv.fit(
    X_train_scaled,
    y_train_resampled,
    epochs=60,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping],
)

# Best hyper-parameter combination and its cross-validated score.
best_params = search_cv.best_params_
best_score = search_cv.best_score_

# Underlying Keras model of the refitted best estimator.
best_model = search_cv.best_estimator_.model

In [None]:
# Make predictions on the test set with the tuned model
y_pred = best_model.predict(X_test_scaled)

# Raw regression outputs are not guaranteed to lie in [0, 1], so np.round
# could produce labels outside {0, 1}. Threshold at 0.5 to get valid class
# labels, and flatten the (n, 1) output to 1-D.
y_pred_class = (y_pred > 0.5).astype(int).ravel()

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred_class)

print("Test Accuracy:", accuracy)