In [1]:
import numpy as np
from nilearn import plotting
import nilearn
import pandas as pd
import os
import scipy.io
import csv

Numpy File to Matlab File Conversion

In [None]:
#Folder containing the npy connectivity matrices to convert
#(ex. feature_directory = '/Users/emafikre/Desktop/vincent/Features_1000')
feature_directory = ""
#Folder that should receive the converted matlab files
mat_feature_directory = ""

In [2]:
#Lists the contents of the feature directory; the sorted file names define the subject numbering.
file_list = os.listdir(feature_directory)

#Selects only numpy files in case your directory has other file types (e.g. .DS_Store, notes, sidecar files).
numpy_files = sorted(f for f in file_list if f.lower().endswith('.npy'))

#Makes sure the output directory exists before anything is written into it.
#An empty string means "current working directory", which needs no creation.
if mat_feature_directory:
    os.makedirs(mat_feature_directory, exist_ok=True)

#This for loop converts each numpy connectivity array into a matlab file named subject_NNN.mat
#(NNN = 1-based position in the sorted file list), while also making sure that the data is 2 dimensional.
for i, numpy_file in enumerate(numpy_files):
    data = np.load(os.path.join(feature_directory, numpy_file))
    if data.ndim == 3:
        #Keeps only the first matrix along the leading axis of a 3D array.
        data = data[0, :, :]

    number = str(i+1).zfill(3)

    mat_file = 'subject_{}.mat'.format(number)

    #Writes straight into the target directory instead of saving to the working directory and moving the
    #file afterwards: os.rename fails across filesystems and would leave stray files behind on error.
    scipy.io.savemat(os.path.join(mat_feature_directory, mat_file), {'data': data}, do_compression=True)

Brain Regions File Creation (2018 Schaefer)

In [3]:
#Fetches the Schaefer 2018 atlas with 1000 regions, then computes one cut coordinate per parcel
#together with the matching label for each coordinate.
#NOTE(review): depending on the nilearn version, the returned labels may be the integer label values
#found in the atlas image rather than region names (those are in image.labels) - confirm.
image = nilearn.datasets.fetch_atlas_schaefer_2018(n_rois=1000)
coordinates, label_names = plotting.find_parcellation_cut_coords(
    labels_img=image.maps,
    return_label_names=True,
)

In [10]:
#Name of the csv file to write. This convention follows the NBS-Predict structure.
output_file = 'BrainRegions.csv'

#Writes a header row followed by one row per region: the three coordinates and then the label.
with open(output_file, 'w', newline='') as csvfile:
    region_writer = csv.writer(csvfile)
    region_writer.writerow(['x', 'y', 'z', 'label'])
    for idx, (x, y, z) in enumerate(coordinates):
        region_writer.writerow([x, y, z, label_names[idx]])

#Signals that the data has been saved.
print(f"Data has been saved to {output_file}.")

Data has been saved to BrainRegions.csv.


Design Matrix Creation

In [None]:
#List of tsv files that include participant_id, diagnosis, age, and sex
#(ex: dataset_paths = ['/Users/emafikre/Desktop/vincent/KTT_participants.tsv', '/Users/emafikre/Desktop/vincent/participants_LA5c.tsv',
#'/Users/emafikre/Desktop/vincent/COBRE_control_participants.tsv', '/Users/emafikre/Desktop/vincent/COBRE_scz_participants.tsv'])
dataset_paths = [""]

#Directory whose file names are the participant IDs (ex. '/Users/emafikre/Desktop/vincent/Features')
filename_directory = ""

In [25]:
#Alternative column names found in the source datasets, mapped to the common name used below.
COLUMN_ALIASES = (('diag', 'diagnosis'), ('dx', 'diagnosis'), ('gender', 'sex'))

#Recoding of the raw diagnosis values: '0' = control, '1' = schizophrenia.
#Any value that does not end up as '0' or '1' is dropped from the design matrix.
DIAGNOSIS_CODES = {
    4: '1',
    0: '0',
    'CONTROL': '0',
    'SCHZ': '1',
    'No_Known_Disorder': '0',
    'Schizophrenia_Strict': '1',
}

#Recoding of the raw sex values; values that are not listed are left unchanged.
SEX_CODES = {'M': '1', 'F': '2', 'male': '1', 'female': '2'}


def _standardise_participants(dataset):
    """Return a copy of one participants table in the common four-column format.

    The result has the columns participant_id, diagnosis, age and sex, keeps only rows whose
    diagnosis recodes to '0' or '1', and has every participant_id prefixed with 'sub-'.

    Raises KeyError if one of the four columns is missing after the aliases are applied.
    """
    #Each alias is checked independently, so a table that uses both 'diag' and 'gender' gets both
    #renamed. A rename is skipped when the target column already exists, to avoid duplicate columns.
    renames = {}
    for alias, target in COLUMN_ALIASES:
        if alias in dataset.columns and target not in dataset.columns and target not in renames.values():
            renames[alias] = target
    dataset = dataset.rename(columns=renames)

    dataset = dataset[['participant_id', 'diagnosis', 'age', 'sex']].copy()

    #The recoding is applied per column so that, for example, an age of 0 or 4 is never mistaken
    #for a diagnosis code.
    dataset['diagnosis'] = dataset['diagnosis'].map(lambda value: DIAGNOSIS_CODES.get(value, value))
    dataset = dataset[dataset['diagnosis'].isin(['0', '1'])].copy()
    dataset['sex'] = dataset['sex'].map(lambda value: SEX_CODES.get(value, value))

    #IDs are converted to text first so that purely numeric IDs can be prefixed as well.
    ids = dataset['participant_id'].astype(str)
    dataset['participant_id'] = ids.where(ids.str.startswith('sub-'), 'sub-' + ids)

    return dataset


#Reads every tab-separated participants file and brings it into the common format
#so that the concat function can work without hiccup.
datasets = [_standardise_participants(pd.read_csv(path, sep='\t')) for path in dataset_paths]

#Merges all datasets into a single dataset.
y = pd.concat(datasets, ignore_index=True)

#Keeps only the participants that have a matching entry in filename_directory.
#An entry matches on its full name or on its name without the extension (sub-01.npy matches sub-01).
filenames = os.listdir(filename_directory)
known_ids = set(filenames) | {os.path.splitext(name)[0] for name in filenames}
y = y[y['participant_id'].isin(known_ids)].reset_index(drop=True)

#NOTE(review): rows keep the order of dataset_paths, whereas the subject_NNN.mat files are numbered
#by sorted file name - confirm that row n of this table really describes subject n.

#Formats the dataset according to the necessary convention of NBS-Predict:
#diagnosis, inverted diagnosis, age, sex. Both diagnosis columns share the name 'diagnosis'.
diagnosis = y['diagnosis']
inverted_diagnosis = diagnosis.map({'0': '1', '1': '0'})
y = pd.concat([diagnosis, inverted_diagnosis, y['age'], y['sex']], axis=1)
y

Unnamed: 0,diagnosis,diagnosis.1,age,sex
0,1,0,37,1
1,1,0,29,1
2,1,0,27,2
3,1,0,35,1
4,1,0,44,2
...,...,...,...,...
437,1,0,55,1
438,1,0,40,2
439,1,0,21,1
440,1,0,52,1


In [None]:
#Path of the csv file in which the pre-formatted design matrix should be saved
#(ex. '/Users/emafikre/Desktop/MatlabProject/predesign.csv')
preformat_design = ""

In [26]:
#Saves the preformatted dataset to a csv file
y.to_csv(preformat_design)

#Reads the csv file back with its first row as header and its first column as index, so that only
#the data cells remain and they are parsed as numbers. Reading with header=None would treat the
#header row as data and turn every column into text.
#The result is deliberately not named `csv`: that would shadow the csv module imported at the top
#of the notebook and break the BrainRegions cell on a re-run.
y_fixed = pd.read_csv(preformat_design, index_col=0)

#Converts the dataset into a matlab file with the proper file name as requested by NBS-Predict.
#The values are stored as floating point numbers, without header, index, or column titles.
mat = y_fixed.to_numpy(dtype=float)
scipy.io.savemat('design.mat', {'design': mat})