In [20]:
import os, glob
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
#from torchvision.transforms import ToTensor
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
import multiprocessing as mp
#import tensorflow

In [21]:
num_roi = 128
num_tasks = 4

# Load Data

In [22]:
def get_tensor(sub_data):
    """
    this function take each subject's connectivity dataframe, and convert them to input and label
    returns: X = 1d-array connectivity input; y = 1d-array task label
    """
    #remove self correlations
    sub_data = sub_data.loc[sub_data.level_0 != sub_data.level_1]
    #get only lower triangle
    sub_data = sub_data.sort_values(by='correlation').iloc[::2, :]
    #sanity check
    assert(len(sub_data['correlation'])) == num_roi*(num_roi-1)/2*num_tasks
        
    #separate into input and label
    mydict = sub_data.sort_values(by=['task', 'level_0', 'level_1']).groupby('task')['correlation'].apply(list)
    labels = [i for i,v in enumerate(mydict.keys())]
    data = [i for i in mydict]
    #turn to tensor: input and label
    X = torch.from_numpy(np.hstack(data).reshape(4, -1))
    #y = torch.nn.functional.one_hot(torch.tensor(labels))
    y = torch.tensor(labels)
    return X, y

In [23]:
#load data
def load_data():
    """
    this function reads all subject connectivity data and returns all inputs and labels
    look at get tensor for specifics
    """
    all_inputs = torch.tensor([])
    all_labels = torch.tensor([])
    for i in os.listdir('processed_data'):
        try:
            sub_data = pd.read_csv('processed_data/' + i, index_col=[0])
            X, y = get_tensor(sub_data)
            all_inputs = torch.cat([all_inputs, X])
            all_labels = torch.cat([all_labels, y])    
        except:
            pass
        
    #make sure they are of the same size
    assert(torch.Tensor.size(all_inputs)[0] == torch.Tensor.size(all_labels)[0])
    #get input and label type right
    all_inputs = all_inputs.to(torch.float32)
    all_labels = all_labels.type(torch.LongTensor)
    
    return all_inputs, all_labels

In [24]:
all_inputs, all_labels = load_data()
len(all_inputs)

652

In [6]:
#load subdata

# sub = 'sub-10249'
# sub_data = pd.read_csv('processed_data/' + sub + '.csv', index_col=[0])
# #remove self correlations
# sub_data = sub_data.loc[sub_data.level_0 != sub_data.level_1]
# #get only lower triangle
# sub_data = sub_data.sort_values(by='correlation').iloc[::2, :]
# sub_data.head()

# Define Network

In [41]:
# Define model
class NeuralNetwork(torch.nn.Module):
    def __init__(self):
        ### model
        super(NeuralNetwork, self).__init__()
        self.flatten = torch.nn.Flatten()
        self.linear_relu_stack = torch.nn.Sequential(
            torch.nn.Linear(int(num_roi*(num_roi-1)/2), 4096),
            torch.nn.LeakyReLU(0.1),
            torch.nn.Linear(4096, 1024),
            torch.nn.LeakyReLU(0.1),
            torch.nn.Linear(1024, 512),
            torch.nn.LeakyReLU(0.1),
            torch.nn.Linear(512, 32),
            torch.nn.LeakyReLU(0.1),
            torch.nn.Linear(32, num_tasks),
            torch.nn.Softmax(dim=1)
        )
        
        ### hyperparameters
        #batch size
        self.batch_size = 80
        #train epochs
        self.num_epochs = 120
        #learning rate
        self.lr = 0.075
        #l2 regularization
        self.l2 = 0
        #kfold stratified crossvalidation
        self.num_folds = 5
        
        #optimizer
        self.optimizer = torch.optim.SGD(self.parameters(), lr=self.lr, weight_decay=self.l2) 
    
    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits
    
    def accuracy(self, pred, labels):
        return (torch.Tensor.argmax(pred, axis=1)==labels).sum().item() / len(labels)
    
#print(model)

In [26]:
#size = len(train_dataloader.dataset)
def train(model,
          all_inputs, 
          all_labels, 
          ):
    
    batch_size = model.batch_size
    batch_num = int(np.ceil(len(all_inputs)/batch_size))
    loss_fn = torch.nn.CrossEntropyLoss()
    all_loss = [] #record loss
    
    for epoch in range(model.num_epochs):
        #shuffle
        #all_inputs, all_labels = shuffle(all_inputs, all_labels, random_state = epoch)
        accuracy = 0

        # Walk through each training batch:
        for i in range(batch_num):
            #get batch X and y
            batch_X = all_inputs[i*batch_size: min((i+1)*batch_size, len(all_inputs))]
            batch_y = all_labels[i*batch_size: min((i+1)*batch_size, len(all_labels))]

            # Compute prediction and loss:
            pred = model(batch_X)
            loss = loss_fn(pred, batch_y)
            accuracy += model.accuracy(pred, batch_y)

            # Backpropagation
            model.optimizer.zero_grad()
            loss.backward()
            model.optimizer.step()

        # Print out a status message every so often.
        if (1+epoch)%10==0:
            #get loss and accuracy
            (loss, current) = (loss.item(), epoch)
            all_loss.append(loss)
            accuracy = accuracy / batch_num

            print(f"loss: {loss:>7f}  [{current:>5d}/{model.num_epochs}]")
            print(f"train accuracy: {accuracy:>5f}")
            
    return model, all_loss, accuracy

In [39]:
def test(model, 
        all_inputs,
        all_labels):
    
    #correct = 0
    #total = 0
    
    with torch.no_grad():
        outputs = model(all_inputs)
        accuracy = model.accuracy(outputs, all_labels)
        # pred = torch.Tensor.argmax(outputs, axis=1)
        # total += len(all_labels)
        # correct += (pred == all_labels).sum().item()
    #accuracy = correct / total
    print(f"Validation Accuracy: {accuracy}")
    
    return accuracy, outputs

In [40]:
#cross validation
def cv(model,
       all_inputs,
       all_labels):
    
    #set up multiprocessing
    # p = mp.Pool(processes = num_processes) 
    # p.map(cv_group, (train_indices, test_indices) in enumerate(kfold.split(all_inputs, all_labels)))
    kfold = StratifiedKFold(n_splits=model.num_folds, shuffle=True)
    for fold, (train_indices, test_indices) in enumerate(kfold.split(all_inputs, all_labels)):
        print(f"fold {fold} / {model.num_folds}")
        #get data
        train_X = all_inputs[train_indices]
        train_y = all_labels[train_indices]
        test_X = all_inputs[test_indices]
        test_y = all_labels[test_indices]

        #Initialize the neural network
        model = NeuralNetwork()
        loss_fn = torch.nn.CrossEntropyLoss()
        
        #train
        model, pred, all_loss = train(model, train_X, train_y)
        #test
        test_acc, outputs = test(model, test_X, test_y)
    
    return model, outputs, test_acc

In [18]:
# test on one train and test
model = NeuralNetwork()
train_X = all_inputs[0:500,:]
train_y = all_labels[0:500]
model, all_loss, accuracy = train(model, train_X, train_y)

test_X = all_inputs[500:, :]
test_y = all_labels[500:]
model, pred, test_acc = test(model, test_X, test_y)

loss: 1.385216  [    9/100]
train accuracy: 0.250000
loss: 1.383489  [   19/100]
train accuracy: 0.250000
loss: 1.381379  [   29/100]
train accuracy: 0.250000
loss: 1.378439  [   39/100]
train accuracy: 0.292857
loss: 1.373510  [   49/100]
train accuracy: 0.458929
loss: 1.362621  [   59/100]
train accuracy: 0.646429
loss: 1.320657  [   69/100]
train accuracy: 0.266071
loss: 1.221604  [   79/100]
train accuracy: 0.446429
loss: 1.190167  [   89/100]
train accuracy: 0.766071
loss: 1.169870  [   99/100]
train accuracy: 0.910714
Validation Accuracy: 0.41935483870967744


In [42]:
#test on cv
model = NeuralNetwork()
cv(model, all_inputs, all_labels)

fold 0 / 5
loss: 1.385378  [    9/120]
train accuracy: 0.250828
loss: 1.384856  [   19/120]
train accuracy: 0.250828
loss: 1.384269  [   29/120]
train accuracy: 0.250828
loss: 1.383527  [   39/120]
train accuracy: 0.250828
loss: 1.382506  [   49/120]
train accuracy: 0.250828
loss: 1.381053  [   59/120]
train accuracy: 0.250828
loss: 1.378875  [   69/120]
train accuracy: 0.250828
loss: 1.375256  [   79/120]
train accuracy: 0.259582
loss: 1.367819  [   89/120]
train accuracy: 0.321559
loss: 1.341822  [   99/120]
train accuracy: 0.349956
loss: 1.221852  [  109/120]
train accuracy: 0.487456
loss: 1.133414  [  119/120]
train accuracy: 0.649260
Validation Accuracy: 0.48854961832061067
fold 1 / 5
loss: 1.383261  [    9/120]
train accuracy: 0.310932
loss: 1.382432  [   19/120]
train accuracy: 0.337631
loss: 1.381486  [   29/120]
train accuracy: 0.364329
loss: 1.380283  [   39/120]
train accuracy: 0.400044
loss: 1.378584  [   49/120]
train accuracy: 0.428441
loss: 1.375966  [   59/120]
train ac

(NeuralNetwork(
   (flatten): Flatten(start_dim=1, end_dim=-1)
   (linear_relu_stack): Sequential(
     (0): Linear(in_features=8128, out_features=4096, bias=True)
     (1): LeakyReLU(negative_slope=0.1)
     (2): Linear(in_features=4096, out_features=1024, bias=True)
     (3): LeakyReLU(negative_slope=0.1)
     (4): Linear(in_features=1024, out_features=512, bias=True)
     (5): LeakyReLU(negative_slope=0.1)
     (6): Linear(in_features=512, out_features=32, bias=True)
     (7): LeakyReLU(negative_slope=0.1)
     (8): Linear(in_features=32, out_features=4, bias=True)
     (9): Softmax(dim=1)
   )
 ),
 tensor([[3.6233e-01, 4.1442e-01, 1.3699e-01, 8.6252e-02],
         [3.7354e-01, 6.9289e-02, 3.4270e-01, 2.1447e-01],
         [4.6834e-01, 1.1806e-01, 2.3133e-01, 1.8227e-01],
         [3.9202e-01, 8.0767e-02, 1.7623e-01, 3.5098e-01],
         [4.3203e-01, 1.4018e-01, 2.1141e-01, 2.1638e-01],
         [5.0410e-01, 1.1286e-01, 1.8746e-01, 1.9558e-01],
         [3.4323e-01, 5.5678e-02, 3.0

In [11]:
# for fold, (train_indices, val_indices) in enumerate(kfold.split(all_inputs, all_labels)):
#     print(f"Fold {fold + 1}/{num_folds}")
    
#     # Create data loaders for training and validation sets
#     train_sampler = torch.utils.data.SubsetRandomSampler(train_indices)
#     train_loader = DataLoader(dataset, batch_size=batch_size, sampler=train_sampler)
    
#     val_sampler = torch.utils.data.SubsetRandomSampler(val_indices)
#     val_loader = DataLoader(dataset, batch_size=batch_size, sampler=val_sampler)
    
#     # Initialize the neural network
#     model = NeuralNetwork()
    
#     # Define loss function and optimizer
#     loss_fn = torch.nn.CrossEntropyLoss()
#     optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    
#     for epoch in range(num_epochs):
#         model.train()
#         total_loss = 0
        
#         for inputs, labels in train_loader:
#             # Initialize optimizer
#             optimizer.zero_grad()
#             # Compute prediction and loss:
#             outputs = model(inputs)
#             loss = loss_fn(outputs, labels)
#             # Backpropagation
#             loss.backward()
#             optimizer.step()
#             total_loss += loss.item()
            
#         print(f"Epoch [{epoch+1}/{num_epochs}] - Loss: {total_loss}")
        
#         # Validation loop
#         model.eval()
#         correct = 0
#         total = 0

#         with torch.no_grad():
#             for inputs, labels in val_loader:
#                 outputs = model(inputs)
#                 _, predicted = torch.max(outputs.data, 1)
#                 total += labels.size(0)
#                 correct += (predicted == labels).sum().item()

#         accuracy = correct / total
#         print(f"Validation Accuracy: {accuracy}")
#         print("-" * 40)