This notebook automates training and evaluating an MLP model across 15 datasets using TensorFlow and Keras, featuring preprocessing, dropout, batch normalization, and early stopping to enhance model generalization and prevent overfitting. Performance summaries detail accuracy and loss, offering insights into the model's effectiveness across varied data scenarios.

In [138]:
from google.colab import drive
drive.mount(('/content/drive'))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [184]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l1_l2
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings('ignore')

import os

In [185]:
def load_dataset(path):
    return pd.read_csv(path)

In [186]:
def data_quality_check(df):

    print("Data information:")
    print(df.info())
    print(100*"-")

    #Missing value
    missing_values = df.isnull().sum()
    data_types = df.dtypes

    columns_with_missing_values = missing_values[missing_values > 0]

    missing_values_summary = pd.DataFrame({
    'Missing Values': columns_with_missing_values,
    'Data Type': data_types[columns_with_missing_values.index]
    })

    print(f"Number of columns with missing values: {len(columns_with_missing_values)}")

    if len(columns_with_missing_values) != 0:
      print(missing_values_summary)

    print(100*"-")

    #Null values and infinity values
    nan_mask = df.isna().any(axis=1)
    inf_mask = df.replace([np.inf, -np.inf], np.nan).isna().any(axis=1)

    combined_mask = nan_mask | inf_mask
    num_rows_with_nan_or_inf = combined_mask.sum()

    print(f"Number of rows with NaN or Inf values: {num_rows_with_nan_or_inf}")

    print(100*"-")

    print(f"Shape of the dataset before dropping rows with NaN or Inf values: {df.shape}")

    df_cleaned = df.replace([np.inf, -np.inf], np.nan).dropna()
    print(f"Shape of the dataset before dropping rows with NaN or Inf values: {df_cleaned.shape}")

    print(100*"-")

    # data tranformation
    df_cleaned['marker_encoded'] = df_cleaned['marker'].map({'Attack': 1, 'Natural': 0})
    final_data = df_cleaned.copy(deep=True)
    final_data = final_data.drop('marker',axis=1)

    # Value counts
    print("Category distribution:")
    print(final_data['marker_encoded'].value_counts())

    shuffled_data = final_data.sample(frac=1, random_state=42)

    return shuffled_data

In [187]:
def preprocess_data(df, target='target'):
    X = df.drop(target, axis=1)
    y = df[target]
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    return X_scaled, y

In [188]:
def custom_mlp(X, y, num_classes):
    model = Sequential([
        Dense(256, activation='relu', input_shape=(X.shape[1],), kernel_regularizer=l1_l2(l1=1e-5, l2=1e-4)),
        BatchNormalization(),
        Dropout(0.3),
        Dense(256, activation='relu', kernel_regularizer=l1_l2(l1=1e-5, l2=1e-4)),
        BatchNormalization(),
        Dropout(0.3),
        Dense(128, activation='relu', kernel_regularizer=l1_l2(l1=1e-5, l2=1e-4)),
        Dropout(0.3),
        Dense(num_classes, activation='softmax')
    ])

    optimizer = Adam(learning_rate=0.001)
    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

    y_encoded = to_categorical(y, num_classes)
    X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

    early_stopping = EarlyStopping(monitor='val_loss', patience=20, verbose=1, restore_best_weights=True)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.001)

    model.fit(X_train, y_train, epochs=200, batch_size=64, validation_split=0.2, callbacks=[early_stopping, reduce_lr])

    loss, accuracy = model.evaluate(X_test, y_test)
    print(f'Test accuracy: {accuracy}')

    return loss, accuracy

In [189]:
def get_dataset_performance(folder_path):
    results = []

    for i in range(1, 16):  # For datasets 1 through 15
        dataset_path = f'{folder_path}/data{i}.csv'
        df = load_dataset(dataset_path)
        shuffled_data = data_quality_check(df)
        X, y = preprocess_data(shuffled_data, target='marker_encoded')
        num_classes = len(y.unique())

        print(f"Training TensorFlow neural network on dataset {i}")
        loss, accuracy = custom_mlp(X, y, num_classes)

        results.append({
            'Dataset': f'Dataset {i}',
            'Loss': loss,
            'Accuracy': accuracy
        })

    return pd.DataFrame(results)

In [192]:
folder_path = '/content/drive/MyDrive/GRA/Project/dataset'
performance_summary = get_dataset_performance(folder_path)

Data information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4966 entries, 0 to 4965
Columns: 129 entries, R1-PA1:VH to marker
dtypes: float64(112), int64(16), object(1)
memory usage: 4.9+ MB
None
----------------------------------------------------------------------------------------------------
Number of columns with missing values: 0
----------------------------------------------------------------------------------------------------
Number of rows with NaN or Inf values: 348
----------------------------------------------------------------------------------------------------
Shape of the dataset before dropping rows with NaN or Inf values: (4966, 129)
Shape of the dataset before dropping rows with NaN or Inf values: (4618, 129)
----------------------------------------------------------------------------------------------------
Category distribution:
1    3610
0    1008
Name: marker_encoded, dtype: int64
Training TensorFlow neural network on dataset 1
Epoch 1/200
Epoch 2/200
Ep

In [193]:
print(performance_summary)

       Dataset      Loss  Accuracy
0    Dataset 1  0.304196  0.946970
1    Dataset 2  0.372974  0.891258
2    Dataset 3  0.351031  0.904523
3    Dataset 4  0.317363  0.925654
4    Dataset 5  0.311155  0.932489
5    Dataset 6  0.364052  0.896175
6    Dataset 7  0.628339  0.943983
7    Dataset 8  0.287664  0.924335
8    Dataset 9  0.589250  0.809959
9   Dataset 10  0.246721  0.957719
10  Dataset 11  0.415933  0.943983
11  Dataset 12  0.403300  0.880628
12  Dataset 13  0.252985  0.958932
13  Dataset 14  0.381258  0.965921
14  Dataset 15  0.379126  0.916581
