In [None]:
import pandas as pd
import os
import numpy as np
import h5py
import torch

Using the actual dataset from the provided MATLAB (`.mat`) file instead.

In [None]:
# Locations of the raw .mat export and of the directory holding the fold files.
data_path = "original_dataset/"
mat_file = "original_dataset/ReadBrownDwarf.mat"

# Index arrays (idTE / idTR) and label arrays (labelTE / labelTR) for the
# test and training splits, each stored as its own .npy file.
idTE, idTR, labelTE, labelTR = (
    np.load(data_path + file_name)
    for file_name in ("idTE.npy", "idTR.npy", "labelTE.npy", "labelTR.npy")
)

In [None]:
# Pull the "data" dataset out of the HDF5-backed .mat file and transpose it
# into a DataFrame while the file handle is still open.
with h5py.File(mat_file, 'r') as mat_handle:
    data = pd.DataFrame(mat_handle["data"]).T
data

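Imputing zero-valued entries with linear regression: for each column that contains zeros, a model is fitted on the rows where that column is non-zero and used to predict replacements for the zero rows.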

In [None]:
from sklearn.linear_model import LinearRegression

# Every column that holds at least one exact zero is an imputation target.
columns_with_zero = data.columns[(data == 0).any()].tolist()

print(columns_with_zero)
target_columns = columns_with_zero

for target_column in target_columns:
    # Evaluate the zero mask once and reuse it for the split and the write-back.
    zero_mask = data[target_column] == 0
    df_zeros = data[zero_mask]
    df_no_zeros = data[~zero_mask]

    # Fit on rows whose target is known (non-zero), using all other columns as predictors.
    model = LinearRegression()
    model.fit(df_no_zeros.drop(columns=target_column), df_no_zeros[target_column])

    # Predict the target for the rows where it was zero ...
    y_test = model.predict(df_zeros.drop(columns=target_column))

    # ... and overwrite those zeros in the original frame.
    data.loc[zero_mask, target_column] = y_test

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise all columns with a single scaler fitted on the full frame,
# then rebuild the DataFrame so the original column labels are kept.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(data)
data = pd.DataFrame(scaled_values, columns=data.columns)
data

In [None]:
# One entry per fold. idTR / idTE hold 1-indexed row ids, so subtract 1
# before using them as positional indices into `data`.
fold_range = range(len(idTR))

X_train_list = [data.iloc[idTR[fold] - 1] for fold in fold_range]
X_test_list = [data.iloc[idTE[fold] - 1] for fold in fold_range]
y_train_list = [labelTR[fold] for fold in fold_range]
y_test_list = [labelTE[fold] for fold in fold_range]

# Stack the per-fold pieces into arrays whose first axis is the fold.
X_train_arr, X_test_arr, y_train_arr, y_test_arr = (
    np.array(per_fold)
    for per_fold in (X_train_list, X_test_list, y_train_list, y_test_list)
)

In [None]:
# Sanity check: count how many sample ids the 4th fold's test split shares with
# the training split of every *other* fold. The number of folds is taken from
# idTR instead of being hard-coded, so no fold is silently left out.
reference_fold = 3
fold_4_test = idTE[reference_fold]
fold_4_test_ids = set(fold_4_test)
for i in range(len(idTR)):
    if i == reference_fold:
        continue
    common_elements = fold_4_test_ids & set(idTR[i])
    num_common_elements = len(common_elements)
    print(f"The two lists have {num_common_elements} common elements: {common_elements}")

In [None]:
# Shapes of the stacked train/test features and labels, in that order.
tuple(arr.shape for arr in (X_train_arr, X_test_arr, y_train_arr, y_test_arr))

# Neural network (NN)

In [None]:

import os

import optuna
from optuna.trial import TrialState
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data
from torchvision import datasets
from torchvision import transforms
from torch.utils.data import TensorDataset


In [None]:
# Run configuration shared by the hyperparameter search and the training cells.
BATCHSIZE = 32
EPOCHS = 15
OUTPUT_NODES = 1

# Per-epoch caps on consumed samples, expressed as a whole number of batches.
N_TRAIN_EXAMPLES, N_VALID_EXAMPLES = 30 * BATCHSIZE, 10 * BATCHSIZE

DIR = os.getcwd()
DEVICE = torch.device("cpu")

# Seed PyTorch's global RNG; kept last so the cell still displays the generator.
torch.manual_seed(0)

In [None]:
def define_model(trial, in_features=26):
    """Build the feed-forward binary classifier whose shape Optuna is tuning.

    The number of hidden layers (1-2) and the width of each hidden layer (8-16)
    are suggested by ``trial``. Every hidden layer is ``Linear -> ReLU``; the
    head is ``Linear(..., OUTPUT_NODES) -> Sigmoid`` so the output can be fed
    straight into ``nn.BCELoss``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Object providing ``suggest_int`` for the searched hyperparameters.
    in_features : int, default 26
        Number of input features. The default keeps the previous hard-coded
        value; pass the real column count if the feature set changes.

    Returns
    -------
    torch.nn.Sequential
        The assembled, untrained network.
    """
    n_layers = trial.suggest_int("n_layers", 1, 2)
    layers = []

    for i in range(n_layers):
        out_features = trial.suggest_int("n_units_l{}".format(i), 8, 16)
        layers.append(nn.Linear(in_features, out_features))
        layers.append(nn.ReLU())
        # The next layer consumes what this one produced.
        in_features = out_features

    layers.append(nn.Linear(in_features, OUTPUT_NODES))
    layers.append(nn.Sigmoid())

    return nn.Sequential(*layers)

In [None]:
from sklearn.model_selection import train_test_split

def get_train_valid(X, y, test_size=0.2, BATCHSIZE=32, random_state=None):
    """Split ``X``/``y`` into training and validation ``DataLoader`` objects.

    Parameters
    ----------
    X : array-like
        Feature array; split along its first axis and converted to ``float32``.
    y : array-like
        Label array aligned with ``X``; converted to ``long``.
    test_size : float, default 0.2
        Fraction of samples held out for validation.
    BATCHSIZE : int, default 32
        Batch size used by both loaders.
    random_state : int or None, default None
        Forwarded to ``train_test_split``. Pass an integer to make the split
        reproducible; ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    (DataLoader, DataLoader)
        ``(train_loader, valid_loader)``. Training batches are shuffled every
        epoch; validation batches are served in a fixed order.
    """
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    train_dataset = torch.utils.data.TensorDataset(
        torch.tensor(X_train, dtype=torch.float32),
        torch.tensor(y_train, dtype=torch.long),
    )
    valid_dataset = torch.utils.data.TensorDataset(
        torch.tensor(X_valid, dtype=torch.float32),
        torch.tensor(y_valid, dtype=torch.long),
    )

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=BATCHSIZE,
        shuffle=True,
    )
    # Shuffling adds nothing for evaluation and makes per-batch results unstable.
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=BATCHSIZE,
        shuffle=False,
    )

    return train_loader, valid_loader

In [None]:
from sklearn.metrics import matthews_corrcoef
import warnings

warnings.filterwarnings('ignore')
def objective(trial):
    """Optuna objective: mean validation MCC of a freshly trained model per fold.

    For every fold in ``X_train_arr`` / ``y_train_arr`` a new network is built
    from the trial's hyperparameters, trained for ``EPOCHS`` epochs on a
    train/validation split of that fold, and scored with the Matthews
    correlation coefficient on the validation part.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Supplies the architecture (via ``define_model``), optimizer name and
        learning rate, and receives intermediate scores for pruning.

    Returns
    -------
    float
        Average over folds of the last-epoch validation MCC.

    Raises
    ------
    optuna.exceptions.TrialPruned
        When the pruner decides, based on reported scores, to stop the trial.
    """
    optimizer_name = trial.suggest_categorical("optimizer", ["Adam", "RMSprop", "Adagrad"])
    lr = trial.suggest_float("lr", 1e-5, 1e-1, log=True)
    loss_fn = nn.BCELoss()

    fold_scores = []
    for fold in range(len(X_train_arr)):
        # A fresh model *and* optimizer per fold, so folds do not share weights.
        model = define_model(trial).to(DEVICE)
        # The sampled learning rate must actually reach the optimizer.
        optimizer = getattr(optim, optimizer_name)(model.parameters(), lr=lr)

        # Split the samples of this fold only (not the stack of folds).
        train_loader, valid_loader = get_train_valid(X_train_arr[fold], y_train_arr[fold])

        score = 0.0
        for epoch in range(EPOCHS):
            # Training of the model.
            model.train()
            for batch_idx, (data, target) in enumerate(train_loader):
                # Limiting training data for faster epochs.
                if batch_idx * BATCHSIZE >= N_TRAIN_EXAMPLES:
                    break

                data, target = data.to(DEVICE), target.to(DEVICE)

                optimizer.zero_grad()
                # Drop only the trailing output dimension so a batch of one stays 1-D.
                output = model(data).squeeze(dim=-1)
                loss = loss_fn(output, target.float())
                loss.backward()
                optimizer.step()

            # Validation of the model: collect predictions over all batches.
            model.eval()
            y_true, y_pred = [], []
            with torch.no_grad():
                for batch_idx, (data, target) in enumerate(valid_loader):
                    # Limiting validation data.
                    if batch_idx * BATCHSIZE >= N_VALID_EXAMPLES:
                        break

                    output = model(data.to(DEVICE)).squeeze(dim=-1)
                    # Sigmoid output above 0.5 means class 1.
                    y_pred.extend((output > 0.5).int().cpu().tolist())
                    y_true.extend(target.int().tolist())

            score = matthews_corrcoef(y_true, y_pred)

            # Steps must be unique within a trial, so offset them by fold.
            trial.report(score, fold * EPOCHS + epoch)

            # Handle pruning based on the intermediate value.
            if trial.should_prune():
                raise optuna.exceptions.TrialPruned()

        fold_scores.append(score)

    return sum(fold_scores) / len(fold_scores)

if __name__ == "__main__":
    # Maximise the objective for at most 100 trials or 600 seconds.
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=100, timeout=600)

    pruned_trials, complete_trials = (
        study.get_trials(deepcopy=False, states=[state])
        for state in (TrialState.PRUNED, TrialState.COMPLETE)
    )

    # Summary of how the trials ended.
    print("Study statistics: ")
    for caption, trials in (
        ("  Number of finished trials: ", study.trials),
        ("  Number of pruned trials: ", pruned_trials),
        ("  Number of complete trials: ", complete_trials),
    ):
        print(caption, len(trials))

    # Best trial: objective value followed by each hyperparameter.
    print("Best trial:")
    trial = study.best_trial

    print("  Value: ", trial.value)

    print("  Params: ")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")


In [None]:
# Reuse the search-time batch size and epoch count for the per-fold training below,
# then display the best hyperparameters found by the study.
batch_size_global = BATCHSIZE
num_epochs_global = EPOCHS
study.best_params


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader



# Convert the stacked training arrays to PyTorch tensors: features as float32,
# labels as long (int64). Later cells index these by fold along the first axis
# and cast the labels to float before passing them to BCELoss.
X_train_tens = torch.tensor(X_train_arr, dtype=torch.float32)
y_train_tens = torch.tensor(y_train_arr, dtype=torch.long)

# Simple fully connected network for binary classification
class NeuralNet(nn.Module):
    """Two-hidden-layer perceptron with a single sigmoid output.

    Parameters
    ----------
    input_size : int
        Number of input features.
    activation : nn.Module or None, default None
        Non-linearity applied after each hidden layer. ``None`` creates a new
        ``nn.ReLU()`` for this instance, so models never share a module object
        through a default argument.
    hidden_size : int, default 11
        Width of both hidden layers.
    """

    def __init__(self, input_size, activation=None, hidden_size=11):
        super().__init__()
        self.activation = nn.ReLU() if activation is None else activation
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Return the sigmoid output for ``x``, with a trailing dimension of size 1."""
        x = self.activation(self.fc1(x))
        x = self.activation(self.fc2(x))
        x = torch.sigmoid(self.fc3(x))
        return x

In [None]:
# Display the best hyperparameters found by the Optuna study again, for reference
# next to the training loop that follows.
study.best_params

## Training Loop

In [None]:
# Train one independent NeuralNet per fold, record its mean loss per epoch and
# save the trained weights to ./models_experimental/.

# NOTE(review): presumably the learning rate of the best Optuna trial - confirm
# against study.best_params before re-running.
LEARNING_RATE = 0.0017748712435351215

# Loop-invariant setup: output directory and loss function.
model_dir = './models_experimental/'
os.makedirs(model_dir, exist_ok=True)  # Create directory if it doesn't exist
criterion = nn.BCELoss()  # Binary Cross-Entropy Loss

losses = []  # losses[fold][epoch] -> mean batch loss of that epoch
for i in range(len(X_train_tens)):
    input_size = X_train_tens[i].shape[1]
    model = NeuralNet(input_size)
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

    # Prepare data for training using DataLoader
    batch_size = batch_size_global
    train_dataset = TensorDataset(X_train_tens[i], y_train_tens[i])
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Training loop
    num_epochs = num_epochs_global
    temp_loss = []
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        for inputs, labels in train_loader:
            optimizer.zero_grad()

            # Forward pass; drop the trailing size-1 dimension to match the labels
            outputs = model(inputs).squeeze(dim=1)
            loss = criterion(outputs, labels.float())

            # Backward pass and optimize
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        temp_loss.append(running_loss / len(train_loader))
    losses.append(temp_loss)

    # Saving model for later use
    model_path = os.path.join(model_dir, f'fold{i}_binary_classification_model.pth')
    torch.save(model.state_dict(), model_path)
    

In [None]:
import matplotlib.pyplot as plt
# One curve per fold: mean training loss at each epoch.
for i, loss in enumerate(losses):
    plt.plot(loss, "-o", label=f"fold_{i}")
plt.legend()
plt.xlabel("Epoch")
plt.ylabel("Loss")
# The curves are folds, not activation functions, so the title says so.
plt.title("Training Loss vs. Epoch for Each Fold")
plt.show()

In [None]:
from sklearn.metrics import f1_score
from sklearn.metrics import matthews_corrcoef, accuracy_score, precision_score, recall_score, confusion_matrix

# Score each saved fold model on the data it was trained on, as a reference
# point for spotting overfitting when compared with the test-set scores.
mcc_scores = []
for i in range(len(X_train_tens)):
    print(f"Evaluating model on fold: {i}")

    X_train = X_train_tens[i].float()
    y_train = y_train_tens[i].float()

    # Take the input width from the data instead of relying on a variable
    # left behind by the training cell.
    input_size = X_train.shape[1]
    model = NeuralNet(input_size)
    model.load_state_dict(torch.load(f'models_experimental/fold{i}_binary_classification_model.pth'))  # Load the trained model state

    batch_size = batch_size_global
    train_dataset = TensorDataset(X_train, y_train)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)  # Fixed order for evaluation

    model.eval()  # Switch to evaluation mode
    y_true = []
    y_pred = []
    with torch.no_grad():
        for inputs, labels in train_loader:
            # Drop the trailing size-1 dimension so predictions are 1-D like the labels
            outputs = model(inputs).squeeze(dim=1)
            predicted = (outputs > 0.5).float()  # Convert probabilities to binary predictions (0 or 1)
            y_true.extend(labels.cpu().numpy())
            y_pred.extend(predicted.cpu().numpy())

    # Calculate Matthews Correlation Coefficient (MCC)
    mcc_tr = matthews_corrcoef(y_true, y_pred)
    mcc_scores.append(mcc_tr)

    print(f"Training: MCC-score: {mcc_tr}, check against the test set for overfitting")

print(f"Mean MCC score on training set: {np.mean(mcc_scores)}")

### Testing the model on the test set

In [None]:
from sklearn.metrics import f1_score
from sklearn.metrics import matthews_corrcoef, accuracy_score, precision_score, recall_score, confusion_matrix

# Evaluate each saved fold model on its held-out test split, collect the metrics
# and confusion matrix per fold, and store a copy of the weights under ./models/.
mcc_scores = []
all_metrics = {}
all_confusion_matrices = {}

# Loop-invariant: create the output directory once.
dir_path = './models/'
os.makedirs(dir_path, exist_ok=True)

for i in range(len(X_test_arr)):
    print(f"Evaluating model on fold: {i}")

    X_test = torch.tensor(X_test_arr[i], dtype=torch.float32)
    y_test = torch.tensor(y_test_arr[i], dtype=torch.float32)

    # Take the input width from the data instead of relying on a variable
    # left behind by an earlier cell.
    input_size = X_test.shape[1]
    model = NeuralNet(input_size)
    model.load_state_dict(torch.load(f'models_experimental/fold{i}_binary_classification_model.pth'))  # Load the trained model state

    # Prepare test dataset and dataloader
    batch_size = batch_size_global
    test_dataset = TensorDataset(X_test, y_test)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)  # Fixed order for evaluation

    model.eval()
    y_true = []
    y_pred = []
    with torch.no_grad():
        for inputs, labels in test_loader:
            # Drop the trailing size-1 dimension so predictions are 1-D like the labels
            outputs = model(inputs).squeeze(dim=1)
            predicted = (outputs > 0.5).float()  # Convert probabilities to binary predictions (0 or 1)
            y_true.extend(labels.cpu().numpy())
            y_pred.extend(predicted.cpu().numpy())

    # Metrics; class 1 is the positive class for the binary-averaged scores.
    f1 = f1_score(y_true, y_pred)
    mcc = matthews_corrcoef(y_true, y_pred)
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='binary')
    recall = recall_score(y_true, y_pred, average='binary')

    # Confusion matrix with rows/columns ordered as class 1, then class 0
    conf_matrix = confusion_matrix(y_true, y_pred, labels=[1,0])

    # Store the metrics in a dictionary for easy plotting
    metrics = {'F1 Score': f1, 'MCC': mcc, 'Accuracy': accuracy, 'Precision': precision, 'Recall': recall}

    all_metrics[f"fold_{i}"] = metrics
    all_confusion_matrices[f"fold_{i}"] = conf_matrix

    print(f"Testing: {metrics['MCC']}")
    mcc_scores.append(metrics['MCC'])

    # Save the model
    model_path = os.path.join(dir_path, f'fold{i}_binary_classification_model.pth')
    torch.save(model.state_dict(), model_path)

print(f"Mean MCC on testset: {np.mean(mcc_scores)}")

In [None]:
# Convert the all_metrics dictionary to a DataFrame
data = pd.DataFrame(all_metrics)
data = data.reset_index().rename(columns={'index': 'Metrics'})


data = pd.melt(data, id_vars='Metrics', var_name='Activation Function', value_name='Value')

data = data.set_index(['Metrics', 'Activation Function']).Value
colors = ["orange", "red", "blue", "green"]
data.unstack().plot(kind='bar', stacked=False, color = colors)
plt.ylim(0.6,1)

In [None]:
# Number of class-0 and class-1 samples in `y_test`, i.e. the test labels of the
# last fold processed by the evaluation loop.
for class_value in (0, 1):
    print(len(y_test[y_test == class_value]))

## Confusion matrix comparison

In [None]:
import seaborn as sns

# One heatmap per fold, side by side. The panel count follows the number of
# stored matrices instead of being fixed at 5.
n_matrices = len(all_confusion_matrices)
fig, axes = plt.subplots(1, n_matrices, figsize=(4 * n_matrices, 5), squeeze=False)

# The matrices were built with labels=[1,0], so row/column 0 is class 1.
# Set the tick labels explicitly; the default 0,1 ticks would mislabel the axes.
class_order = [1, 0]

for i, ax in enumerate(axes[0]):
    conf_matrix = all_confusion_matrices[f"fold_{i}"]

    sns.heatmap(
        conf_matrix,
        annot=True,
        fmt='d',
        cmap='Blues',
        ax=ax,
        xticklabels=class_order,
        yticklabels=class_order,
    )

    ax.set_title(f'Confusion Matrix (fold_{i})')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')

plt.tight_layout()
plt.show()