# clasification based on function

In [1]:
# Classification of ASD vs Controls based on different atlases.

%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from torch.utils.data import DataLoader
from torch import nn
import torch.optim as optim
from torch.autograd import Variable
import torch
import torch.nn.functional as F
from pprint import pprint
from sklearn.utils import shuffle
from scipy.stats import mode
from sklearn.metrics import accuracy_score
import os.path as osp
import os
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold, StratifiedKFold
import torch.utils.data as data_utils
from sklearn.metrics import confusion_matrix

### Lunch wandb ai

In [2]:
import wandb
!wandb login 390734ff44d817dbba59927d4eb542e564627b3b

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /data/zmohaghegh/.netrc


### Model Architecture

In [3]:
class Abide1DConvNet(nn.Module):
    def __init__(self, nROIS):
        super(Abide1DConvNet, self).__init__()
        
        self.conv1 = nn.Conv1d(in_channels= nROIS, out_channels=64, kernel_size=3)
        self.avg = nn.AdaptiveAvgPool1d((1))
        self.linear1 = nn.Linear(in_features=64, out_features=1, bias=True)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = self.avg(x).view(-1, 64)
        x = self.linear1(x)
        #x = F.softmax(x,dim=1)
        
        #print(x.shape, x.min(), x.max())
        
        return x

### validation

In [4]:
def validate_model(net, val_data_loader, fold):

    criterion = nn.BCEWithLogitsLoss()
    net.eval()
    
    total=0
    correct=0
    total_valid_loss=0
    
    for i, data in enumerate(val_data_loader,0):

        inputs, labels= data
        labels = labels.double()
   
        # forward pass
        output = net(inputs)
        outputs=output.squeeze(1)

        # calculate loss
        loss = criterion(outputs, labels)
        
        predict = outputs.data > 0.0
        
        total_valid_loss += loss.item()
        total += labels.size(0)
        correct += (predict == labels).sum().item()
        
    wandb.log({f"valid_Loss_fold_{fold}":total_valid_loss/total, f"valid_acc_fold_{fold}": 100 * correct / total })

    
    # Calculate acc
    valid_accuracy= 100 * correct /total
    valid_loss = total_valid_loss/total

    return valid_loss, valid_accuracy

### train

In [5]:
def train_model(train_data, val_data, nepochs, batch_size, learning_rate, fold, atlas_name, nr_RIO):
    train_data_loader = DataLoader(train_data, batch_size=batch_size,shuffle=True)
    val_data_loader = DataLoader(val_data, batch_size=batch_size, shuffle=True)
    
    net = Abide1DConvNet(nROIS=nr_RIO)
    net = net.double()
    
    criterion = nn.BCEWithLogitsLoss() # weight=class_weigths
    optimizer = optim.SGD(net.parameters(), momentum=0.9, lr = learning_rate, weight_decay=0.02)
    
    net.train()
    train_loss = []
    val_loss = []
    
    best_val_acc = None
    #best_net_valid = None

    for i_epoch in range(nepochs):
        
        total=0
        correct=0
        train_loss = 0
        
        for i, data in enumerate(train_data_loader, 0):
            
            inputs, labels = data
            
            optimizer.zero_grad()

            # forward pass
            output = net(inputs)
            outputs=output.squeeze(1)
            
            labels=labels.double()
            
            # prediction 
            predicted = outputs.data > 0  # the loss function contain also a sigmoid layer <0 = false >0 =true
            
            # print(outputs,predicted)
            
            # Calculate loss
            #print(outputs.size() , labels.size())
            
            loss = criterion(outputs, labels)# [32,1] ---> [32]
            loss.backward()
            
            optimizer.step()

            correct += (predicted == labels).sum().item()
            train_loss += loss.item()
            total += labels.size(0)
            
            #print(correct,total,loss)
        
        wandb.log({"epoch": i_epoch,"train_Loss":train_loss/total, "train_acc": 100 * correct / total })

        # Validation
        epoch_val_loss, epoch_val_acc = validate_model(net, val_data_loader, fold)
        
        #Save model with best validation loss
        if not best_val_acc or best_val_acc < epoch_val_acc:
            best_net_valid = net
            best_val_acc = epoch_val_acc
            
            #path_best_loss_atlas = f'/data/zmohaghegh/TempStats_3D-CNN/best_model_atlas/best_model_atlast.pth'
            #torch.save(net.state_dict(), path_best_loss_atlas)
                                    
    return best_net_valid

### test model

In [6]:
def test_model(net, test_data, batch_size):
    
    test_data_loader = DataLoader(test_data, batch_size=batch_size,shuffle=True)
    
    net.eval()
    
    total = 0
    correct=0
    
    criterion = nn.BCEWithLogitsLoss() 
    
    for i, data in enumerate(test_data_loader,0):
        inputs, labels = data
            
        # forward pass
        output = net(inputs)
        outputs=output.squeeze(1)
        labels=labels.double()
        
        # calculate loss
        loss = criterion(outputs, labels)
        predict = outputs.data > 0.0
            
        total += labels.size(0)
        correct += (predict == labels).sum().item()
    
    # Calculate acc
    test_acc= 100 * correct /total
    wandb.log({"test_acc": test_acc })
    
    return test_acc

### K-fold cross validation

In [7]:
def run_kfold(atlas_name,nTime_min, zscore,folds,nepochs,batch_size,learning_rate,
              root_dir='/dbstore/zmohaghegh/UKBiobank_subset/Time_course_New/', 
              data_info_file='data_info.csv',
              exp_dir='/data/zmohaghegh/TempStats_3D-CNN/atlas_model/'):    
    
        print("preparing dataset ....")
        
        # Read the parent CSV file
        data_info = pd.read_csv(os.path.join(root_dir, data_info_file))
        data_info = shuffle(data_info)
        
        # Determine the nchannels (=nrois) from the data by using the first sample
        sample_file = data_info['tc_file'].iloc[0].replace('ATLAS', atlas_name)
        nrois = pd.read_csv(sample_file).values.shape[1] # number of channel = number of brain region in Atlas
        
        total_subjects = len(data_info)
        
        # Initialize an np array to store all timecourses and labels
        tc_data = np.zeros((total_subjects, nrois, nTime_min))
        labels = np.zeros(total_subjects, dtype=int)
        
        # Load data       
        for i, sub_i in enumerate(data_info.index):
            tc_file = data_info['tc_file'].loc[sub_i].replace('ATLAS', atlas_name)
            tc_vals = pd.read_csv(tc_file).values.transpose()[:, :nTime_min]

            if zscore:       
                tc_vals =  np.array([(tc_vals[:,i] - np.mean(tc_vals[:,i]))/np.std(tc_vals[:,i]) for i in range (tc_vals.shape[1])])
                tc_data[i] = tc_vals.transpose()
            else:
                tc_data[i] = tc_vals ### might need "transponse"

            labels[i] = data_info['DX_GROUP'].loc[sub_i]
            
        #labels = np.eye(2)[labels] 
        
        kfold = KFold(n_splits=folds, shuffle=True)
        #kfold = StratifiedKFold(n_splits=folds, shuffle=True)

        total_accuracy=0
        total_sensitivity=0
        total_specificity=0
        
        # loop  cross validation over folds
        
        for fold, (train_index, test_index) in enumerate(kfold.split(tc_data)):
        #for fold, (train_index, test_index) in enumerate(kfold.split(tc_data, labels)):
            print(f'Fold_{fold}_Atlas_{atlas_name}------------')
            
            #spltitting training fold into 90% training and 10% validation    
            train_split = int(0.9 * len(train_index))
            train_i = train_index[0:train_split]
            val_i = train_index[train_split:]
            
            # nested Stratified kfold
            # train_val_folds = StratifiedKFold(n_splits=2, shuffle=True)
            # train_i, val_i = list(train_val_folds.split(tc_data[train_index], labels[train_index]))[0]
            
            # nested KFOLD
            #train_val_folds = KFold(n_splits=2, shuffle=True)
            #train_i, val_i = list(train_val_folds.split(tc_data[train_index], labels[train_index]))[0]
            
            # Create training,testing and validation datasets
            train_data = torch.from_numpy(tc_data[train_i])
            train_labels= torch.from_numpy(labels[train_i])
            
            val_data = torch.from_numpy(tc_data[val_i])
            val_labels = torch.from_numpy(labels[val_i])
            
            test_data = torch.from_numpy(tc_data[test_index])
            test_labels = torch.from_numpy(labels[test_index])
                   
            train = data_utils.TensorDataset(train_data, train_labels)
            val = data_utils.TensorDataset(val_data, val_labels)
            test = data_utils.TensorDataset(test_data, test_labels)
            
            #train network
            print('Start Training ...')

            validated_network = train_model(train,val,nepochs,batch_size,learning_rate,fold,atlas_name,nrois)
            print('Validation finished...')
            
            #test network
            print('Start Testing...')

            test_accuracy= test_model(validated_network,test,batch_size)
            
            total_accuracy += test_accuracy
            #total_sensitivity += test_sens
            #total_specificity += test_spec
                        
            print("----Test results of of fold {} are : {} acc ----".format(fold, test_accuracy))
                
        acc_test_average = total_accuracy/folds
        #sens_test_average = total_sensitivity/folds
        #spec_test_average = total_specificity/folds
                
        return acc_test_average

### run time

In [10]:
ntimes  = [200]  #100,200,300,500
atlases = ['AAL'] #'HO_cort_maxprob_thr25-2mm'

# hyperparameter
    
batch_size = 1
learning_rate =0.0001
nepochs = 150
zscore=True
nr_folds = 5

for atlas in atlases:
    wandb.init(project=f'1d-cnn-UKBB-timecourse-atlas-{atlas}')
    for ntime in ntimes:
        accuracy_test_total = run_kfold(zscore=zscore,folds =nr_folds,atlas_name=atlas,nTime_min=ntime, 
                                        nepochs =nepochs, batch_size = batch_size ,learning_rate =learning_rate)
        print(f'******Accuracy_test_Avg********= {accuracy_test_total}')
        print('-------------------------------------------')

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

preparing dataset ....
Fold_0_Atlas_AAL------------
Start Training ...


  allow_unreachable=True, accumulate_grad=True)  # allow_unreachable flag


Validation finished...
Start Testing...
----Test results of of fold 0 are : 57.926829268292686 acc ----
Fold_1_Atlas_AAL------------
Start Training ...
Validation finished...
Start Testing...
----Test results of of fold 1 are : 56.707317073170735 acc ----
Fold_2_Atlas_AAL------------
Start Training ...
Validation finished...
Start Testing...
----Test results of of fold 2 are : 59.146341463414636 acc ----
Fold_3_Atlas_AAL------------
Start Training ...
Validation finished...
Start Testing...
----Test results of of fold 3 are : 53.987730061349694 acc ----
Fold_4_Atlas_AAL------------
Start Training ...
Validation finished...
Start Testing...
----Test results of of fold 4 are : 55.214723926380366 acc ----
******Accuracy_test_Avg********= 56.59658835852163
-------------------------------------------


In [None]:
ntimes  = [200]
atlases = ['HO_cort_maxprob_thr25-2mm']

# hyperparameter
    
batch_size = 1
learning_rate =0.0005
nepochs = 250
zscore=True
nr_folds = 5

for atlas in atlases:
    wandb.init(project=f'1d-cnn-UKBB-timecourse-atlas-{atlas}')
    for ntime in ntimes:
        accuracy_test_total = run_kfold(zscore=zscore,folds =nr_folds,atlas_name=atlas,nTime_min=ntime, nepochs =nepochs, batch_size = batch_size ,learning_rate =learning_rate)
        print(f'******Accuracy_test_Avg********= {accuracy_test_total}')
        print('-------------------------------------------')

## Debugging

In [None]:
df_data_info == !!!

In [None]:
df_data_info = pd.read_csv('/dbstore/zmohaghegh/Japanese_subset/Time_course/data_info.csv')

In [None]:
df_data_info.to_csv('/dbstore/zmohaghegh/Japanese_subset/Time_course/data_info_1.csv', index=False)

In [None]:
exp_dir='/dbstore/zmohaghegh/Japanese_subset/atlas_model/',
atlas_name='AAL'
root_dir='/dbstore/zmohaghegh/Japanese_subset/Time_course/', 
data_info_file='data_info.csv'

nTime_min=84 
zscore=False
folds = 5    
kfold = KFold(n_splits=folds, shuffle=True)

#### hyper_parameter


In [None]:
data_info = pd.read_csv('/dbstore/zmohaghegh/Japanese_subset/Time_course/data_info.csv')
data_info

In [None]:
print("preparing dataset ")
        
# Read the parent CSV file
data_info = pd.read_csv('/dbstore/zmohaghegh/Japanese_subset/Time_course/data_info.csv')
data_info = shuffle(data_info,random_state = 1)

In [None]:
nTime_min = 84

# Determine the nchannels (atlas region) (=nrois) from the data by using the first sample
#sample_file = pd.read_csv('/dbstore/zmohaghegh/Japanese_subset/Time_course/sub-0487/tc/AALtimecourse.csv')
sample_file = pd.read_csv(data_info['tc_file'].iloc[0].replace('ATLAS', atlas_name))

In [None]:
nrois = sample_file.values.shape[1]
total_subjects = len(data_info)
nrois

In [None]:
total_subjects 

In [None]:
# Initialize an np array to store all timecourses and labels
tc_data = np.zeros((total_subjects, nrois, nTime_min))
labels = np.zeros(total_subjects, dtype=int)
ids = np.zeros(total_subjects, dtype=int)

In [None]:
len(labels)

In [None]:
# Load data       
for i, sub_i in enumerate(data_info.index):
    tc_file = data_info['tc_file'].loc[sub_i].replace('ATLAS', atlas_name)
    tc_vals = pd.read_csv(tc_file).values.transpose()[:, :nTime_min]

    if (zscore):       
        tc_vals =  np.array([(tc_vals[:,i] - np.mean(tc_vals[:,i]))/np.std(tc_vals[:,i]) for i in range (tc_vals.shape[1])])
        tc_data[i] = tc_vals.transpose()
    else:
        tc_data[i] = tc_vals

    labels[i] = data_info['DX_GROUP'].loc[sub_i]

In [None]:
tc_vals.shape

In [None]:
labels

In [None]:
data_info

In [None]:
len(labels)

In [None]:
labels

In [None]:
len(data_info['tc_file'])

In [None]:
tc_file

In [None]:
tc_vals.shape

In [None]:
tc_data.shape

In [None]:
#labels = np.eye(2)[labels] 
kfold = KFold(folds, True, 1)

j = 0
total_accuracy=0
accuracies=[]

In [None]:
import wandb
!wandb login 390734ff44d817dbba59927d4eb542e564627b3b

In [None]:
# k fold cross validation 

for fold, (train_index, test_index) in enumerate(kfold.split(tc_data)):
                
    # Spltitting training fold into 90% training and 10% validation    
    train_split = int(0.8 * len(train_index))
    train_i = train_index[0:train_split]
    val_i = train_index[train_split:]

            
    # Create training,testing and validation datasets
    train_data = torch.from_numpy(tc_data[train_i])
    train_labels= torch.from_numpy(labels[train_i])
    
            
    val_data = torch.from_numpy(tc_data[val_i])
    val_labels = torch.from_numpy(labels[val_i])
    #print(val_i,tc_data[val_i])
    
            
    test_data = torch.from_numpy(tc_data[test_index])
    test_labels = torch.from_numpy(labels[test_index])
            
    train = data_utils.TensorDataset(train_data, train_labels)
    val = data_utils.TensorDataset(val_data, val_labels)
    test = data_utils.TensorDataset(test_data, test_labels)
            

In [None]:
train_data.shape

In [None]:
tc_data[test_index][100]

In [None]:
test_index

In [None]:
labels[test_index]

In [None]:
nrois

In [None]:
fold

In [None]:
atlas_name

In [None]:
train_data[0].shape

In [None]:
#def train_model():
#wandb.init(project='1D-CNN-atlast-timecourse')

from torch.utils.data import DataLoader

atlas_name='AAL'
fold=4
nr_RIO=nrois

batch_size = 1
learning_rate =.0001
nepochs = 20

train_data_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
val_data_loader = torch.utils.data.DataLoader(val, batch_size=batch_size, shuffle=True)
    
net = Abide1DConvNet(nROIS=nr_RIO)
net= net.double()
    
criterion = nn.BCEWithLogitsLoss() # weight=class_weigths
optimizer =  optim.Adam(net.parameters(), lr=learning_rate , weight_decay=0.02)
#optimizer = optim.SGD(net.parameters(), momentum=0.9, lr = learning_rate, weight_decay=0.02)
    
net.train()
train_loss = []
val_loss = []
    
best_val_acc = None
#best_net_valid = None

for i_epoch in range(nepochs):
        
    total=0
    correct=0
    epoch_loss = 0

    for i, (inputs, labels) in enumerate(train_data_loader):

        #inputs, labels = data
        optimizer.zero_grad()

        # forward pass
        output = net(inputs)
        outputs=output.squeeze(1)

        labels=labels.double()
        #print(i, labels,outputs)

        # prediction 
        predicted = outputs.data >0  # the loss function contain also a sigmoid layer <0 = false >0 =true

        # print(outputs,predicted)

        # Calculate loss
        # print(outputs.size() , labels.size())

        loss = criterion(outputs, labels)# [32,1] ---> [32]
        
        loss.backward()
        optimizer.step()

        correct += (predicted == labels).sum().item()
        epoch_loss += loss.item()
        total += labels.size(0)
        
        #print(total,correct)
        
    acc= 100 * correct / total
    print(f'acc_{acc}')    
    #wandb.log({"epoch": i_epoch , f"train_Loss_fold_{fold}_Atlas_{atlas_name}":epoch_loss/total, f"train_acc_fold_{fold}_Atlas_{atlas_name}": 100 * correct / total })

    # Validation
    epoch_val_loss, epoch_val_acc = validate_model(net, val_data_loader)
    #print(f' vali acc: {epoch_val_acc}')
    
    #Save model with best validation loss
    if not best_val_acc or best_val_acc < epoch_val_acc:
        best_net_valid = net
        best_val_acc = epoch_val_acc

In [None]:

        wandb.log({"epoch": i_epoch,f"train_Loss_fold_{fold}_Atlas_{atlas_name}":epoch_loss/total, f"train_acc_fold_{fold}_Atlas_{atlas_name}": 100 * correct / total })

        # Validation
        epoch_val_loss, epoch_val_acc = validate_model(net, val_data_loader)
        print(epoch_val_acc, best_val_acc)
        
        #Save model with best validation loss
        if not best_val_acc or best_val_acc < epoch_val_acc:
            best_net_valid = net
            best_val_acc = epoch_val_acc
            
            #path_best_loss_atlas = f'/data/zmohaghegh/TempStats_3D-CNN/best_model_atlas/best_model_atlast.pth'
            #torch.save(net.state_dict(), path_best_loss_atlas)
                                    
    return best_net_valid

In [None]:
acc = total_accuracy/folds
#sens = total_sensitivity/folds
#spec = total_specificity/folds
        
print("{} in {} nTime results are: {} acc. ".format(atlas_name, nTime_min, acc))
print(acc,total_subjects)

In [None]:
subject_list_info = pd.read_csv('UKBB_merged_subjects_info.csv')

In [None]:
subject_list_info