In [None]:
import pandas as pd
import os
import numpy as np
import h5py

#### Global variables

In [None]:
# Random seed handed to torch.manual_seed before training.
SEED = 0

# Font sizes shared by every figure in the notebook.
header_font_size = 20  # figure titles
font_size = 16         # axis labels, tick labels and legends

# Data Preprocessing

In [None]:
# Where the raw inputs live: the .mat file with the feature matrix and the
# .npy files holding the per-fold row ids and labels.
data_path = "original_dataset/"
mat_file = "original_dataset/ReadBrownDwarf.mat"

# id* are the row ids of each fold, label* the matching targets
# (TR is used for the training split, TE for the test split further down).
idTE, idTR, labelTE, labelTR = (
    np.load(data_path + f"{stem}.npy")
    for stem in ("idTE", "idTR", "labelTE", "labelTR")
)

In [None]:
# Read the "data" entry of the .mat file through h5py and transpose it into
# the DataFrame the rest of the notebook works on.
with h5py.File(mat_file, 'r') as mat_handle:
    data = pd.DataFrame(mat_handle["data"]).T
data

Data imputation

Scores equal to $0$ are treated as missing and are replaced with the mean value of their column.

In [None]:
# Zeros are treated as missing scores. The column mean used for imputation is
# computed from the observed (non-zero) entries only: leaving the zero
# placeholders in the mean would pull every imputed value towards 0.
observed = data.mask(data == 0)           # zeros -> NaN so they are ignored by mean()
column_means = observed.mean().fillna(0)  # a column with no observed value stays at 0
data = observed.fillna(column_means)
# NOTE(review): the means are taken over the full dataset, i.e. before the
# train/test split below, so test rows contribute to the imputed values.

In [None]:
# Collect the rows and labels belonging to every fold. The number of folds is
# taken from idTR, and ids are shifted by one because idTR / idTE are 1-indexed.
fold_range = range(len(idTR))
X_train_list = [data.iloc[idTR[fold] - 1] for fold in fold_range]
X_test_list = [data.iloc[idTE[fold] - 1] for fold in fold_range]
y_train_list = [labelTR[fold] for fold in fold_range]
y_test_list = [labelTE[fold] for fold in fold_range]

# Convert the per-fold lists to numpy arrays (one entry per fold).
X_train_arr, X_test_arr, y_train_arr, y_test_arr = map(
    np.array, (X_train_list, X_test_list, y_train_list, y_test_list)
)

In [None]:
from sklearn.decomposition import PCA

n_components = 13  # number of principal components kept = input width of the network
pca = PCA(n_components=n_components)

# Read the layout from the arrays instead of hard-coding fold count,
# samples per fold and feature count.
n_folds, n_train, n_features = X_train_arr.shape
n_test = X_test_arr.shape[1]

# This cell overwrites X_train_arr / X_test_arr, so running it twice would
# apply PCA to already-reduced data. Fail loudly instead.
if n_features <= n_components:
    raise RuntimeError(
        f"X_train_arr has only {n_features} features; it looks already PCA-reduced. "
        "Re-run the fold-split cell before running this cell again."
    )

# PCA expects a 2-D (samples, features) matrix, so the fold axis is flattened.
X_train_arr_reshaped = X_train_arr.reshape(-1, n_features)
X_test_arr_reshaped = X_test_arr.reshape(-1, n_features)
print(X_train_arr_reshaped.shape)

# NOTE(review): the projection is fitted on the training rows of all folds
# pooled together, so each fold's projection has seen rows that may belong to
# that fold's test split - confirm this is acceptable for the evaluation.
pca.fit(X_train_arr_reshaped)

# Project both splits and restore the (fold, sample, component) layout.
X_train_arr = pca.transform(X_train_arr_reshaped).reshape(n_folds, n_train, n_components)
X_test_arr = pca.transform(X_test_arr_reshaped).reshape(n_folds, n_test, n_components)



In [None]:
X_train_arr.shape, X_test_arr.shape, y_train_arr.shape, y_test_arr.shape

# NN

Network structure: $13 \times 10 \times 5 \times 1$ (13 input features, two hidden layers with 10 and 5 units, and a single sigmoid output).

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader



# Convert the per-fold numpy arrays to PyTorch tensors.
# Features are stored as float32; labels are stored as torch.long and are cast
# to float right before the loss is computed in the training loop.
X_train_tens = torch.tensor(X_train_arr, dtype=torch.float32)
y_train_tens = torch.tensor(y_train_arr, dtype=torch.long)

# Small fully connected network for binary classification
class NeuralNet(nn.Module):
    """Feed-forward binary classifier with layout ``input_size -> 10 -> 5 -> 1``.

    Args:
        input_size: Number of features per sample.
        activation: Module applied after each of the two hidden layers.
            ``None`` (the default) creates a fresh ``nn.Tanh()`` for this
            instance, so separate models never share one activation module
            (a module instance as default argument would be shared by all of
            them, including any hooks or device moves applied to it).
    """

    def __init__(self, input_size, activation=None):
        super().__init__()
        # Registration order (fc1, activation, fc2, fc3) is kept unchanged so
        # weight initialisation consumes the RNG in the same order as before.
        self.fc1 = nn.Linear(input_size, 10)
        self.activation = nn.Tanh() if activation is None else activation
        self.fc2 = nn.Linear(10, 5)
        self.fc3 = nn.Linear(5, 1)  # single output neuron (binary classification)

    def forward(self, x):
        """Return sigmoid outputs in (0, 1) with one column per sample.

        Args:
            x: Tensor whose last dimension equals ``input_size``.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of size 1.
        """
        hidden = self.activation(self.fc1(x))
        hidden = self.activation(self.fc2(hidden))
        # Sigmoid squashes the single logit into a probability-like score.
        return torch.sigmoid(self.fc3(hidden))


## Training Loop

In [None]:
torch.manual_seed(SEED)  # fixes weight initialisation and batch shuffling for reproducibility

# Hyper-parameters and the output directory do not change between folds,
# so they are set once instead of on every iteration.
batch_size = 64
num_epochs = 15
model_dir = './models/'
os.makedirs(model_dir, exist_ok=True)  # create the directory if it doesn't exist

losses = []  # losses[fold][epoch] = mean batch loss of that epoch
for i in range(len(X_train_tens)):
    # A fresh model, loss and optimizer per fold.
    # `input_size` stays defined after the loop; the evaluation cells reuse it.
    input_size = X_train_tens[i].shape[1]
    model = NeuralNet(input_size)
    criterion = nn.BCELoss()  # Binary Cross-Entropy Loss on the sigmoid outputs
    optimizer = optim.Adam(model.parameters())

    # Mini-batches over this fold's training split, reshuffled every epoch
    train_dataset = TensorDataset(X_train_tens[i], y_train_tens[i])
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    model.train()
    temp_loss = []
    for epoch in range(num_epochs):
        running_loss = 0.0
        for inputs, labels in train_loader:
            optimizer.zero_grad()

            # Forward pass: drop the trailing singleton dimension so the
            # outputs line up with the 1-D label vector.
            outputs = model(inputs).squeeze(dim=1)
            loss = criterion(outputs, labels.float())  # BCELoss needs float targets

            # Backward pass and parameter update
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        # Average loss per batch for this epoch
        epoch_loss = running_loss / len(train_loader)
        print(f"Epoch {epoch+1}, Loss: {epoch_loss}")
        temp_loss.append(epoch_loss)
    losses.append(temp_loss)

    # Save the trained weights; the evaluation cells load them back per fold
    model_path = os.path.join(model_dir, f'fold{i}_binary_classification_model.pth')
    torch.save(model.state_dict(), model_path)
    

In [None]:
import matplotlib.pyplot as plt

# One colour per fold; the same list is reused by the metrics bar chart.
fold_colors = ['r', 'g', 'b', 'c', 'm']

fig, ax = plt.subplots()
for i, loss in enumerate(losses):
    # Epochs are numbered from 1 to match the training log output.
    epochs = range(1, len(loss) + 1)
    # Cycle through the palette so more than five folds do not raise IndexError.
    ax.plot(epochs, loss, "-o", label=f"fold_{i}", color=fold_colors[i % len(fold_colors)])
ax.legend(fontsize=font_size - 2)
ax.set_xlabel("Epoch", size=font_size)
ax.set_ylabel("Loss", size=font_size)
# The curves are per fold (a single activation is used), so the title says so.
ax.set_title("Training Loss vs. Epoch for Different Folds", size=header_font_size - 5)
plt.show()

In [None]:
from sklearn.metrics import f1_score
from sklearn.metrics import matthews_corrcoef, accuracy_score, precision_score, recall_score, confusion_matrix

# Evaluate every fold's saved model on its own training split. Comparing these
# scores with the test-set scores gives a quick overfitting check.
mcc_scores = []
for i in range(len(X_train_tens)):
    print(f"Evaluating model on fold: {i}")

    X_train = X_train_tens[i].float()
    y_train = y_train_tens[i].float()

    # Rebuild the architecture from the data itself rather than relying on a
    # variable left over from the training cell, then load the saved weights.
    input_size = X_train.shape[1]
    model = NeuralNet(input_size)
    model.load_state_dict(torch.load(f'models/fold{i}_binary_classification_model.pth'))
    model.eval()  # Switch to evaluation mode

    # Batches over the training split; order is irrelevant for scoring
    batch_size = 64
    train_dataset = TensorDataset(X_train, y_train)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)

    y_true = []
    y_pred = []
    with torch.no_grad():
        for inputs, labels in train_loader:
            # Squeeze to 1-D so predictions have the same shape as the labels
            outputs = model(inputs).squeeze(dim=1)
            predicted = (outputs > 0.5).float()  # Convert probabilities to binary predictions (0 or 1)
            y_true.extend(labels.cpu().numpy())
            y_pred.extend(predicted.cpu().numpy())

    # Calculate Matthews Correlation Coefficient (MCC)
    mcc_tr = matthews_corrcoef(y_true, y_pred)
    mcc_scores.append(mcc_tr)

    print(f"Training: MCC-score: {mcc_tr}, check against the test set for overfitting")

print(f"Mean MCC score on training set: {np.mean(mcc_scores)}")

### Testing the model on the test set

In [None]:
from sklearn.metrics import f1_score
from sklearn.metrics import matthews_corrcoef, accuracy_score, precision_score, recall_score, confusion_matrix

# Evaluate every fold's saved model on that fold's held-out test split and
# collect the metrics / confusion matrices used by the plots below.
mcc_scores = []
all_metrics = {}             # "fold_<i>" -> {metric name: value}
all_confusion_matrices = {}  # "fold_<i>" -> 2x2 confusion matrix
for i in range(len(X_train_tens)):
    print(f"Evaluating model on fold: {i}")

    X_test = torch.tensor(X_test_arr[i], dtype=torch.float32)
    y_test = torch.tensor(y_test_arr[i], dtype=torch.float32)

    # Rebuild the architecture from the data itself rather than relying on a
    # variable left over from the training cell, then load the saved weights.
    input_size = X_test.shape[1]
    model = NeuralNet(input_size)
    model.load_state_dict(torch.load(f'models/fold{i}_binary_classification_model.pth'))
    model.eval()  # Switch to evaluation mode

    # Prepare test dataset and dataloader
    batch_size = 64
    test_dataset = TensorDataset(X_test, y_test)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)  # No need to shuffle for testing

    y_true = []
    y_pred = []
    with torch.no_grad():
        for inputs, labels in test_loader:
            # Squeeze to 1-D so predictions have the same shape as the labels
            outputs = model(inputs).squeeze(dim=1)
            predicted = (outputs > 0.5).float()  # Convert probabilities to binary predictions (0 or 1)
            y_true.extend(labels.cpu().numpy())
            y_pred.extend(predicted.cpu().numpy())

    # Calculating different scoring methods
    f1 = f1_score(y_true, y_pred)
    mcc = matthews_corrcoef(y_true, y_pred)  # Matthews Correlation Coefficient (MCC)
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='binary')
    recall = recall_score(y_true, y_pred, average='binary')
    # labels=[1, 0] puts class 1 in the first row/column of the matrix
    conf_matrix = confusion_matrix(y_true, y_pred, labels=[1,0])

    # Store the metrics in a dictionary for easy plotting
    metrics = {'F1 Score': f1, 'MCC': mcc, 'Accuracy': accuracy, 'Precision': precision, 'Recall': recall}

    all_metrics[f"fold_{i}"] = metrics
    all_confusion_matrices[f"fold_{i}"] = conf_matrix

    print(f"Testing MCC score: {metrics['MCC']}")
    mcc_scores.append(metrics['MCC'])
    # The checkpoint is not written back here: the model was only loaded and
    # evaluated, so the file on disk already holds exactly these weights.

print(f"Mean MCC on testset: {np.mean(mcc_scores)}")

In [None]:
# Rows = metrics (sorted by name), columns = folds. A dedicated name is used so
# the raw dataset held in `data` is not overwritten by this plotting cell.
metrics_df = (
    pd.DataFrame(all_metrics)
    .sort_index()
    .rename_axis(index="Metrics", columns="Fold")
)

# Grouped bar chart: one group per metric, one bar per fold.
fig, ax = plt.subplots()
metrics_df.plot(kind='bar', stacked=False, color=fold_colors, fontsize=font_size - 3, ax=ax)
ax.legend(fontsize=font_size - 2)
ax.set_title("Metrics comparison over different folds", size=header_font_size)
# Zoom in on the top of the scale; any bar below 0.8 is clipped at the axis floor.
ax.set_ylim(0.8, 1)
plt.show()

## Confusion matrix comparison

In [None]:
import seaborn as sns

n_folds = len(X_train_tens)
# The matrices were computed with labels=[1, 0], so row/column 0 is class 1.
# The tick labels must say so; the default 0/1 ticks would mislabel the classes.
class_order = [1, 0]

# One panel per fold; squeeze=False keeps `axes` 2-D even for a single fold.
fig, axes = plt.subplots(1, n_folds, figsize=(4 * n_folds, 5), squeeze=False)

for i, ax in enumerate(axes[0]):
    conf_matrix = all_confusion_matrices[f"fold_{i}"]

    # Heatmap of the confusion matrix for the i-th fold
    sns.heatmap(
        conf_matrix,
        annot=True,
        fmt='d',
        cmap='Blues',
        ax=ax,
        cbar=False,
        annot_kws={"size": font_size},
        xticklabels=class_order,
        yticklabels=class_order,
    )

    ax.set_title(f'Confusion Matrix (fold_{i})', size=header_font_size)
    ax.set_xlabel('Predicted', size=font_size)
    ax.set_ylabel('True', size=font_size)
    ax.tick_params(axis='both', which='major', labelsize=font_size)

plt.tight_layout()
plt.show()