In [1]:
import os
import numpy as np
import pandas as pd

from sklearn import metrics # Metrics for classification
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import KFold
from sklearn import metrics

from statsmodels.stats import contingency_tables # Contringency tables
import torch # PyTorch
from torch import nn # Modules and layers
from torch.optim import SGD
from torch.utils.data import Dataset # PyTorch dataset
from torch.utils.data import DataLoader # PyTorch Dataloader

In [2]:
from sklearn.neural_network import  MLPClassifier

## Data Loading

In [3]:
# Read csv dataset
df = pd.read_csv('data_assignment_1.csv')
df.describe()

# input
feature_cols = ['Frontal_Sup', 'Frontal_Inf', 'Cingulum_Ant', 'Cingulum_Post', 'Parietal_Sup', 'Parietal_Inf', 'Occipital_Sup', 'Occipital_Inf', 'Temporal_Sup', 'Temporal_Inf']
# output (label)
target_col = ['diagnosis']

## Datasets and Dataloaders


In [4]:
class BrainDataset(Dataset):
    def __init__(self, df, feature_cols, target_col):
        # data loading
        self.X = torch.tensor(df[feature_cols].values, dtype=torch.float32) # inputs: brain regional biomarkers
        self.y = torch.tensor(df[target_col].values, dtype=torch.float32) # output: diagnosis (0 or 1)
        # self.n_samples = len(self.y)

    def __getitem__(self, index):
        # dataset[0]
        return self.X[index], self.y[index]

    def __len__(self):
        # len(data)
        return len(self.y)


brain_data = BrainDataset(df, feature_cols, target_col)
# print(brain_data.y)

Cross-Validation

In [5]:
kf = KFold(n_splits=5, shuffle=True, random_state=1)

batch_size = 25
dataloaders = []

for train_idx, test_idx in kf.split(df):
    train_df_kfold = df.iloc[train_idx]
    test_df_kfold = df.iloc[test_idx]

    train_ds_kfold = BrainDataset(train_df_kfold, feature_cols=feature_cols, target_col=target_col)
    test_ds_kfold = BrainDataset(test_df_kfold, feature_cols=feature_cols, target_col=target_col)

    train_dl_kfold = DataLoader(train_ds_kfold, batch_size=batch_size, shuffle=True)
    test_dl_kfold = DataLoader(test_ds_kfold, batch_size=batch_size, shuffle=False)

    dataloaders.append((train_dl_kfold, test_dl_kfold))

# # --------------------------- TESTPRINT
# # Retrieve the dataloader for one of the folds (e.g., the first fold)
# train_dataloader, _ = dataloaders[3]  # getting the train dataloader for the first fold

# # Retrieve one batch of data
# batch = next(iter(train_dataloader))
# inputs, outputs = batch

# # Print or inspect the inputs and outputs
# print("Input features for one batch:", inputs)
# print("Outputs for one batch:", outputs)


In [6]:

# # divide the 'df' into training and testing
# train_df = df.sample(frac = 0.75)
# test_df = df.drop(train_df.index)
# # prepare the datasets in the appropiate format 'ds'
# train_ds = BrainDataset(train_df, feature_cols=feature_cols, target_col=target_col)
# test_ds = BrainDataset(test_df, feature_cols=feature_cols, target_col=target_col)

Dataloader


In [7]:
# batch_size = 5 # Number of samples on each iteration

# # Create dataloaders
# train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True) # Shuffle the train dataset!
# test_dl = DataLoader(test_ds, batch_size=batch_size, shuffle=False)

In [8]:
# def normalize_data(data):
#     # Rescale data to [0,1]
#     data_min = data.min()
#     data_max = data.max()
#     data = (data - data_min) / (data_max - data_min)
    
#     # Normalize
#     mean = data.mean()
#     std = data.std()
#     data = (data - mean) / std
    
#     return data

## Classifier

In [9]:
input_size = 10 # Brain regions biomarkers value
hidden_size = 6 # any number
output_size = 1 # diagnosis (yer or no)

class AdClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(AdClassifier, self).__init__()
        self.linear1 = nn.Linear(input_size, hidden_size)  # First (input) layer
        self.relu = nn.LeakyReLU()                         # nonlinear activation function
        self.linear2 = nn.Linear(hidden_size, output_size) # Second (hidden) layer
        self.sigmoid = nn.Sigmoid()                        # Sigmoid for binary classification
        self.flatten = nn.Flatten()
    
    def forward(self, x):
        out = self.flatten(x)
        out = self.linear1(out)
        out = self.relu(out)
        out = self.linear2(out)
        output = self.sigmoid(out)
        return output
model = AdClassifier(input_size,hidden_size,output_size)
# for param in model.parameters():
#     param.requires_grad = True


In [30]:
# loss and optimizer
learning_rate = 0.0001
loss_func = nn.BCELoss()
# optimizer = torch.optim.LBFGS(model.parameters(), lr=learning_rate)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

## Training Loop (LBFGS optimizer)


In [11]:
# def train_model(model, train_dl, optimizer, loss_func, num_epochs, print_loss_every=10):

#     def closure():
#         optimizer.zero_grad()
#         y_predicted = model(X)
#         loss = loss_func(y_predicted, y)
#         loss.backward()
#         return loss

#     for epoch in range(num_epochs):
#         total_loss = 0.0
#         num_batches = len(train_dl)

#         for X, y in train_dl:
#             optimizer.step(closure)
            
#             # accumulate loss for entire dataloader
#             with torch.no_grad():  # Ensure you're not computing gradients here
#                 y_predicted = model(X)
#                 loss = loss_func(y_predicted, y)
#                 total_loss += loss.item()

#         avg_loss = total_loss / num_batches

#         if (epoch+1) % print_loss_every == 0:
#             print(f'epoch: {epoch+1}, loss ={avg_loss:.4f}')


## Training Loop(Other Optimizer)

In [27]:
def train_model(model, train_dl, optimizer, loss_func, num_epochs, print_loss_every=100, normalize=False):
    for epoch in range(num_epochs):

        total_loss = 0.0
        num_batches = len(train_dl)
        optimizer.zero_grad()
        for X, y in train_dl:
            for param in model.parameters():
                param.requires_grad = True
            # X.requires_grad = True
            # print(X.requires_grad)
            # Normalize data if flag is set
            if normalize:
                X = normalize_data(X)
            # forward and loss
            y_predicted = model(X)
            
            # print(y_predicted.dtype)

            loss = loss_func(y_predicted, y)
            # backward
            loss.backward()
            #update
            optimizer.step()
            optimizer.zero_grad()
            # accumulate loss for entire dataloader
            total_loss += loss.item()
        
        avg_loss = total_loss / num_batches
        if (epoch+1) % print_loss_every == 0:
            print(f'epoch: {epoch+1}, loss: {avg_loss:.4f}')
        for param in model.parameters():
            if param.requires_grad:
                if param.grad is not None:
                    print(param.grad.data.norm())
                else:
                    print("Gradient is None for this parameter.")


In [20]:
# num_epochs = 500
# eval_every = 20
# train_model(model, train_dl, optimizer, loss_func, num_epochs, eval_every)

In [28]:
# Evaluation loop
def eval_model(model, test_dl):
    y_true = []
    y_pred = []
    for X, y in test_dl:
        pred = model(X)
        y_true += list(y.detach().cpu().numpy())
        y_pred += list(pred.argmax(1).detach().cpu().numpy())

    # accuracy = metrics.accuracy_score(targets, predictions)
    # balanced_accuracy = metrics.balanced_accuracy_score(targets, predictions)

    # Calculate metrics
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_pred)

    print(f"y_pred:, {y_pred},y_true:, {y_true} Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, AUC: {auc}" )

Evalutaion

In [None]:
# # Evaluate model
# print("Performance on the train set")
# eval_model(model, train_dl)

# print("Performance on the test set")
# eval_model(model, test_dl)

In [29]:
# Assuming you have already defined your model, optimizer, loss function etc.
# Store metrics over the 5 folds
accuracies = []
precisions = []
recalls = []
aurocs = []

# Train model on each k-fold
for train_dl_kfold, test_dl_kfold in dataloaders:
    # Assuming you have a function 'train_model' for training your model
    train_model(model, train_dl_kfold, optimizer, loss_func, num_epochs)
    
    # Evaluate model
    print("Performance on the train set")
    eval_model(model, train_dl_kfold)

    print("Performance on the test set")
    eval_model(model, test_dl_kfold)

Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
G

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
G

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
G

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


y_pred:, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],y_true:, [array([0.], dtype=float32), array([1.], dtype=float32), array([0.], dtype=float32), array([0.], dtype=float32), array([0.], dtype=float32), array([1.], dtype=float32), array([1.], dtype=float32), array([1.], dtype=float32), array([1.], dtype=float32), array([0.], dtype=float32), array([1.], dtype=float32), array([1.], dtype=float32), array([1.], dtype=float32), array([1.], dtype=float32), array([1.], dtype=float32), array([1.], dtype=float32), array([1.], dtype=float32), array([1.], dtype=float32), array([0.], dtype=float32), array([0.], dtype=float32), array([1.], dtype=float32), array([0.], dtype=float32), array([0.], dtype=float32), array([0.], dtype=float32), array([1.], dtype=float32), array([1.], dtype=float32), array([0.], 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
Gradient is None for this parameter.
G

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
