### Libraries

In [36]:
import numpy as np
import torch
import torch.nn.functional as F
import pandas as pd
import os

### Load and transpose the time-series file

In [2]:
# Keep a fixed-length window of the saved array: `seq_len` rows starting at
# row 20, then transpose. The printed shape below is (316, 100), so the
# transposed array is presumably ROI x time -- TODO confirm the axis meaning.
seq_len = 100
window_start = 20
window_stop = window_start + seq_len
full_series = np.load("/pscratch/sd/p/pakmasha/MBBN_data/Amsterdam-VUmc_sub-916002/Amsterdam-VUmc_sub-916002_filtered_0.01_0.1.npy")
file = full_series[window_start:window_stop].T
print(file.shape)

(316, 100)


### Create a mapping of indices where padding should be inserted

##### Load the full list of Schaefer labels

In [23]:
# Read the atlas look-up table (comma-separated) and preview its first rows
lut_path = "/global/homes/p/pakmasha/ENIGMA-OCD results/QC/all_regions_Schaefer2018_400Parcels_17Networks_LUT.csv"
roi_lut = pd.read_csv(lut_path, sep=",")
lut_preview = roi_lut.head()
print(lut_preview)

# Keep only the label column; this is the reference list of ROI names
label_column = roi_lut['Schaeffer_Yeon_labels']
roi_names = label_column.values

   roi_ID        R         G         B          Schaeffer_Yeon_labels  \
0       1  0.47059  0.070588  0.535294  17Networks_LH_VisCent_ExStr_1   
1       2  0.47059  0.070588  0.535294  17Networks_LH_VisCent_ExStr_2   
2       3  0.47059  0.070588  0.535294  17Networks_LH_VisCent_ExStr_3   
3       4  0.47059  0.070588  0.535294  17Networks_LH_VisCent_ExStr_4   
4       5  0.47059  0.070588  0.535294  17Networks_LH_VisCent_ExStr_5   

                 halfpipe_labels Schaeffer_Yeon_17_networks  \
0  17Networks_LH_VisCent_ExStr_1                    VisCent   
1  17Networks_LH_VisCent_ExStr_2                    VisCent   
2  17Networks_LH_VisCent_ExStr_3                    VisCent   
3  17Networks_LH_VisCent_ExStr_4                    VisCent   
4  17Networks_LH_VisCent_ExStr_5                    VisCent   

  halfpipe_17_networks  
0              VisCent  
1              VisCent  
2              VisCent  
3              VisCent  
4              VisCent  


##### Remove subcortical and cerebellar ROIs

In [26]:
# Size of the complete label list before trimming
total_roi_count = len(roi_names)
print(f"{total_roi_count} Schaefer, subcortical, and cerebellar ROIs")

# Keep the first 400 labels. This relies on the look-up table listing the
# Schaefer parcels first; the tail printed below is a quick visual check.
roi_names = roi_names[:400]
schaefer_roi_count = len(roi_names)
print(f"{schaefer_roi_count} Schaefer ROIs")
tail_rois = roi_names[-5:]
print(f"Last ROIs: {tail_rois}")

438 Schaefer, subcortical, and cerebellar ROIs
400 Schaefer ROIs
Last ROIs: ['17Networks_RH_TempPar_6' '17Networks_RH_TempPar_7'
 '17Networks_RH_TempPar_8' '17Networks_RH_TempPar_9'
 '17Networks_RH_TempPar_10']


##### Load the ROI list of the ENIGMA-OCD dataset

In [32]:
# Example subject whose TSV header supplies the dataset's ROI column names
file_path = "/pscratch/sd/p/pakmasha/MBBN_data/Amsterdam-VUmc_sub-916002/Amsterdam-VUmc_sub-916002.tsv"

# Tab-separated table; the header row holds one name per ROI column
data = pd.read_csv(file_path, sep="\t")

# Header as a plain Python list, in file order
column_names = list(data.columns)
n_columns = len(column_names)

# Report what was found
print("Column (ROI) names:", column_names)
print(f"Total of {n_columns} ROIs")

Column (ROI) names: ['17Networks_LH_VisCent_ExStr_1', '17Networks_LH_VisCent_ExStr_2', '17Networks_LH_VisCent_ExStr_4', '17Networks_LH_VisCent_ExStr_6', '17Networks_LH_VisCent_ExStr_7', '17Networks_LH_VisCent_ExStr_9', '17Networks_LH_VisCent_ExStr_11', '17Networks_LH_VisPeri_ExStrInf_1', '17Networks_LH_VisPeri_ExStrInf_2', '17Networks_LH_VisPeri_ExStrInf_3', '17Networks_LH_VisPeri_ExStrInf_4', '17Networks_LH_VisPeri_ExStrInf_5', '17Networks_LH_VisPeri_StriCal_1', '17Networks_LH_VisPeri_StriCal_2', '17Networks_LH_VisPeri_ExStrSup_1', '17Networks_LH_VisPeri_ExStrSup_2', '17Networks_LH_VisPeri_ExStrSup_3', '17Networks_LH_VisPeri_ExStrSup_5', '17Networks_LH_SomMotA_1', '17Networks_LH_SomMotA_2', '17Networks_LH_SomMotA_4', '17Networks_LH_SomMotA_5', '17Networks_LH_SomMotA_6', '17Networks_LH_SomMotA_7', '17Networks_LH_SomMotA_8', '17Networks_LH_SomMotA_9', '17Networks_LH_SomMotA_10', '17Networks_LH_SomMotA_11', '17Networks_LH_SomMotA_12', '17Networks_LH_SomMotA_13', '17Networks_LH_SomMotA_15

##### Identify indices of columns that should be removed (subcortical and cerebellar ROIs)

In [33]:
# Scan the dataset columns once, recording (position, name) for every column
# whose name is absent from the reference label list
unmatched_pairs = [
    (position, name)
    for position, name in enumerate(column_names)
    if name not in roi_names
]

# Split the pairs into the two lists used by the following cells
unmatched_rois = [name for _, name in unmatched_pairs]
unmatched_indices = [position for position, _ in unmatched_pairs]

# Report
print("Unmatched ROIs (subcortical and cerebellar):", unmatched_rois)
print("Indices of unmatched ROIs:", unmatched_indices)
unmatched_count = len(unmatched_rois)
print(f"Total of {unmatched_count} unmatched ROIs")

Unmatched ROIs (subcortical and cerebellar): ['FreeSurfer_Left-Thalamus', 'FreeSurfer_Left-Caudate', 'FreeSurfer_Left-Putamen', 'FreeSurfer_Left-Pallidum', 'FreeSurfer_Left-Hippocampus', 'FreeSurfer_Right-Thalamus', 'FreeSurfer_Right-Caudate', 'FreeSurfer_Right-Putamen', 'FreeSurfer_Right-Pallidum', 'FreeSurfer_Right-Hippocampus', 'Buckner2011_17Networks_4']
Indices of unmatched ROIs: [307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317]
Total of 11 unmatched ROIs


##### Remove unmatched entries

In [34]:
# Keep every column name whose position was not flagged as unmatched
positions_to_drop = set(unmatched_indices)
filtered_column_names = [
    column_names[position]
    for position in range(len(column_names))
    if position not in positions_to_drop
]

# Report
print("Filtered column names:", filtered_column_names)
kept_count = len(filtered_column_names)
print(f"Total of {kept_count} ROIs")

Filtered column names: ['17Networks_LH_VisCent_ExStr_1', '17Networks_LH_VisCent_ExStr_2', '17Networks_LH_VisCent_ExStr_4', '17Networks_LH_VisCent_ExStr_6', '17Networks_LH_VisCent_ExStr_7', '17Networks_LH_VisCent_ExStr_9', '17Networks_LH_VisCent_ExStr_11', '17Networks_LH_VisPeri_ExStrInf_1', '17Networks_LH_VisPeri_ExStrInf_2', '17Networks_LH_VisPeri_ExStrInf_3', '17Networks_LH_VisPeri_ExStrInf_4', '17Networks_LH_VisPeri_ExStrInf_5', '17Networks_LH_VisPeri_StriCal_1', '17Networks_LH_VisPeri_StriCal_2', '17Networks_LH_VisPeri_ExStrSup_1', '17Networks_LH_VisPeri_ExStrSup_2', '17Networks_LH_VisPeri_ExStrSup_3', '17Networks_LH_VisPeri_ExStrSup_5', '17Networks_LH_SomMotA_1', '17Networks_LH_SomMotA_2', '17Networks_LH_SomMotA_4', '17Networks_LH_SomMotA_5', '17Networks_LH_SomMotA_6', '17Networks_LH_SomMotA_7', '17Networks_LH_SomMotA_8', '17Networks_LH_SomMotA_9', '17Networks_LH_SomMotA_10', '17Networks_LH_SomMotA_11', '17Networks_LH_SomMotA_12', '17Networks_LH_SomMotA_13', '17Networks_LH_SomMotA

##### Identify indices of columns where zero-padding should be applied

In [35]:
# Positions in the reference label list that have no column in the dataset;
# these are the slots that must be filled with zeros
available_labels = set(filtered_column_names)
padding_indices = [
    position
    for position, label in enumerate(roi_names)
    if label not in available_labels
]

# Report
print("Indices for zero padding:", padding_indices)
padding_count = len(padding_indices)
print(f"Total of {padding_count} zero-padded ROIs")

Indices for zero padding: [2, 4, 6, 8, 10, 22, 26, 37, 39, 41, 42, 59, 61, 70, 71, 79, 80, 99, 106, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 122, 133, 134, 139, 149, 160, 161, 164, 166, 167, 168, 177, 178, 180, 183, 184, 191, 204, 205, 208, 210, 222, 238, 239, 240, 241, 242, 258, 259, 266, 271, 279, 280, 302, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 335, 336, 338, 346, 347, 348, 357, 367, 368, 371, 373, 375, 379, 380, 385, 388, 390]
Total of 93 zero-padded ROIs


### Apply zero-padding and remove subcortical and cerebellar ROIs from MBBN_data files

In [37]:
# Width of the padded output: number of ROI columns after zero-padding
target_num_rois = 400
# Root directory that is searched for per-subject .tsv files
base_path = "/pscratch/sd/p/pakmasha/MBBN_data"

# Function to process and save the modified data
def process_data_file(tsv_file_path, output_dir, reference_rois=None):
    """Align one subject's ROI time series to the reference ROI list and save it.

    The TSV is read with its header row as ROI names. Each column whose name
    appears in ``reference_rois`` is copied into the column at that name's
    position in the reference list; columns not in the reference list are
    dropped, and reference ROIs missing from the file stay zero.

    The mapping is computed per file from the column names. Index lists
    derived from a single example subject are only valid for files with the
    exact same columns in the exact same order, so they are not used here.

    Parameters
    ----------
    tsv_file_path : str
        Path to a tab-separated file with one column per ROI.
    output_dir : str
        Directory in which the ``.npy`` file is written. The file name is the
        TSV base name with its extension replaced by ``.npy``.
    reference_rois : sequence of str, optional
        Ordered reference ROI names. Defaults to the notebook-level
        ``roi_names``. Names are expected to be unique; if one repeats, its
        last position is used.

    Returns
    -------
    None
        The padded array of shape (rows in file, len(reference_rois)) is
        written to disk and the output path is printed.
    """
    if reference_rois is None:
        # Fall back to the label list prepared earlier in the notebook
        reference_rois = roi_names
    reference_rois = list(reference_rois)

    # Read the .tsv file into a DataFrame (header row = ROI names)
    data = pd.read_csv(tsv_file_path, sep="\t")

    # Reference name -> column position in the padded output
    position_of = {name: position for position, name in enumerate(reference_rois)}

    # Step 1: keep only the columns that belong to the reference list
    kept_columns = [name for name in data.columns if name in position_of]
    target_positions = [position_of[name] for name in kept_columns]

    # Step 2: scatter the kept columns into a zero-initialised array so that
    # every ROI lands at its reference position and missing ROIs remain zero
    padded_data = np.zeros((data.shape[0], len(reference_rois)))
    padded_data[:, target_positions] = data[kept_columns].to_numpy()

    # Step 3: save as <basename>.npy (splitext swaps only the extension,
    # unlike a substring replace on the whole file name)
    stem = os.path.splitext(os.path.basename(tsv_file_path))[0]
    output_file_path = os.path.join(output_dir, stem + ".npy")
    np.save(output_file_path, padded_data)
    print(f"Processed and saved: {output_file_path}")

# Walk through the subject folders and process .tsv files.
# The loop variable is deliberately not called `file`: that name holds the
# time-series array loaded near the top of the notebook and is read again by
# the padding cells further down, so reusing it here would replace the array
# with a filename string.
for root, _dirs, filenames in os.walk(base_path):
    for tsv_name in filenames:
        if tsv_name.endswith(".tsv"):
            tsv_file_path = os.path.join(root, tsv_name)
            # Write the .npy next to the .tsv it was derived from
            process_data_file(tsv_file_path, root)

Processed and saved: /pscratch/sd/p/pakmasha/MBBN_data/Barcelona-HCPB_sub-008/Barcelona-HCPB_sub-008.npy
Successfully processed Barcelona-HCPB_sub-008.tsv
Processed and saved: /pscratch/sd/p/pakmasha/MBBN_data/Brazil_sub-C002061/Brazil_sub-C002061.npy
Successfully processed Brazil_sub-C002061.tsv
Processed and saved: /pscratch/sd/p/pakmasha/MBBN_data/Yale-Pittinger-HCP-Prisma_sub-YaleHCPPrismapb3225/Yale-Pittinger-HCP-Prisma_sub-YaleHCPPrismapb3225.npy
Successfully processed Yale-Pittinger-HCP-Prisma_sub-YaleHCPPrismapb3225.tsv
Processed and saved: /pscratch/sd/p/pakmasha/MBBN_data/Seoul-SNU_sub-NOR117CSJ/Seoul-SNU_sub-NOR117CSJ.npy
Successfully processed Seoul-SNU_sub-NOR117CSJ.tsv
Processed and saved: /pscratch/sd/p/pakmasha/MBBN_data/Dresden_sub-GEROME3073/Dresden_sub-GEROME3073.npy
Successfully processed Dresden_sub-GEROME3073.tsv
Processed and saved: /pscratch/sd/p/pakmasha/MBBN_data/Bergen_sub-00059/Bergen_sub-00059.npy
Successfully processed Bergen_sub-00059.tsv
Processed and sa

In [4]:
# Number of zero rows needed to reach 400 along the first axis. Derived from
# the array itself rather than a hard-coded 316 so the cell stays correct for
# inputs with a different row count.
pad = 400 - file.shape[0]
if pad < 0:
    # A negative amount would make F.pad crop rows instead of padding
    raise ValueError(f"Input has {file.shape[0]} rows, more than the 400-row target")

# Split the padding between both ends; the trailing side takes the extra row
# when `pad` is odd so the result always has exactly 400 rows
pad_before = pad // 2
pad_after = pad - pad_before

# Convert to PyTorch tensor
data_tensor = torch.from_numpy(file)

# F.pad lists (before, after) pairs starting from the LAST dimension:
# (0, 0) leaves the last axis untouched, (pad_before, pad_after) pads the first.
# NOTE(review): this pads only at the two ends of the first axis, so existing
# rows are shifted as a block rather than placed at specific positions.
padded_data = F.pad(data_tensor, (0, 0, pad_before, pad_after), "constant", 0)  # (time, time, ROI, ROI)

# Convert back to NumPy (optional)
padded_data_numpy = padded_data.numpy()

In [10]:
# Compare the first column (all rows) of the array before and after padding
first_column_before = file[:, :1]
first_column_after = padded_data_numpy[:, :1]
print(f"Before padding: {first_column_before}")
print(f"\nAfter padding: {first_column_after}")

Before padding: [[ 1.53256367e+01]
 [-6.17037989e+00]
 [ 2.42531307e+00]
 [ 2.25608207e+00]
 [ 7.28538826e+00]
 [ 5.72790600e-01]
 [-1.46230566e+01]
 [-1.26545591e+01]
 [-2.74412824e+00]
 [-1.97037792e+01]
 [-1.17004657e+01]
 [ 8.00839931e+00]
 [ 6.95879902e+00]
 [ 7.49275508e+00]
 [ 3.25638599e+00]
 [ 5.79628680e+00]
 [-2.26465024e+00]
 [ 1.14184302e+01]
 [ 4.57214133e+00]
 [-3.99674915e+00]
 [-8.30907861e+00]
 [ 2.64638379e+00]
 [-3.63461737e+00]
 [ 2.19149795e+01]
 [-6.56320167e-01]
 [-7.86497405e+00]
 [-8.88970519e+00]
 [ 1.10641103e+01]
 [-1.21983270e+01]
 [-1.20452727e+01]
 [-5.17294042e+00]
 [ 1.41164908e+00]
 [ 1.16259915e+01]
 [ 1.70307401e+01]
 [ 3.89560206e+01]
 [ 3.86261995e+00]
 [ 2.31193925e+01]
 [ 1.28329634e+01]
 [ 4.54117720e+01]
 [-2.43034122e+00]
 [-1.19849922e+01]
 [-1.15398425e+01]
 [-3.88623797e+00]
 [-2.23987900e+01]
 [-7.81042165e+00]
 [-1.60202959e+00]
 [ 3.10054244e+00]
 [ 6.02443375e+00]
 [-5.82007802e+00]
 [ 9.48546146e+00]
 [ 6.60391618e-01]
 [-3.52488825e+

In [11]:
# Confirm that padding changed only the size of the first axis
shape_before = file.shape
shape_after = padded_data_numpy.shape
print(f"Before padding: {shape_before}")
print(f"\nAfter padding: {shape_after}")

Before padding: (316, 100)

After padding: (400, 100)
