# QC summary (covariate data, MRI QC, motion criteria, EPI coverage)

### Create a full list of subjects with time-series data

In [4]:
import os
import pandas as pd

# Define the base directory
base_dir = "/pscratch/sd/p/pakmasha/ENIGMA_unzip"

# Initialize a list to store subjects and samples
subjects_data = []

# Traverse through sample folders and subsample folders
for sample_folder in os.listdir(base_dir):
    sample_path = os.path.join(base_dir, sample_folder)
    if os.path.isdir(sample_path):
        # Check if this is a direct sample folder with a "halfpipe" folder
        halfpipe_path = os.path.join(sample_path, "halfpipe")
        if os.path.exists(halfpipe_path):
            # Direct sample: Traverse subjects in the "halfpipe" folder
            for subject_folder in os.listdir(halfpipe_path):
                subjects_data.append({"Subject": subject_folder, "Sample": sample_folder})
        else:
            # Subsamples exist: Traverse each subsample folder
            for subsample_folder in os.listdir(sample_path):
                subsample_path = os.path.join(sample_path, subsample_folder)
                halfpipe_path = os.path.join(subsample_path, "halfpipe")
                if os.path.exists(halfpipe_path):
                    # Traverse subjects in the subsample "halfpipe" folder
                    for subject_folder in os.listdir(halfpipe_path):
                        subjects_data.append(
                            {"Subject": subject_folder, "Sample": f"{sample_folder}/{subsample_folder}"}
                        )

# Convert the list to a DataFrame
subjects_with_ts = pd.DataFrame(subjects_data)

# Save the DataFrame to a CSV file
subjects_with_ts.to_csv("/global/homes/p/pakmasha/ENIGMA-OCD results/QC/subjects_with_ts_data.csv", index=False)

# Display the first few rows to verify the data
print("Full list of subjects")
print(subjects_with_ts.head())
print(subjects_with_ts.shape)

Full list of subjects
   Subject           Sample
0  sub-054  Vancouver_BCCHR
1  sub-052  Vancouver_BCCHR
2  sub-014  Vancouver_BCCHR
3  sub-062  Vancouver_BCCHR
4  sub-071  Vancouver_BCCHR
(2969, 2)


### Exclude the subjects without time-series data from meta-data file

##### Read the meta-data file

In [6]:
import pandas as pd

# Define the file path
file_path = "/pscratch/sd/p/pakmasha/ENIGMA_OCD_MBBN_git/ENIGMA_OCD_MBBN/ENIGMA-OCD results/QC/Formatted meta-data.xlsx"

# Load the Excel file into a pandas DataFrame
meta_data = pd.read_excel(file_path)

# Display the first few rows to verify the data
print(meta_data.head())
print(meta_data.shape)

          Sample Subject ID Formatted ID  Unique ID  OCD  Age of onset  \
0  Amsterdam_AMC        101      sub-101        NaN  2.0           NaN   
1  Amsterdam_AMC        103      sub-103        NaN  2.0           NaN   
2  Amsterdam_AMC        104      sub-104        NaN  2.0           NaN   
3  Amsterdam_AMC        105      sub-105        NaN  2.0           NaN   
4  Amsterdam_AMC        106      sub-106        NaN  2.0           NaN   

   Medication  Y-BOCS   Age  Age range  ...  Education  Depression current  \
0         1.0     0.0  49.0        3.0  ...       15.0                 0.0   
1         1.0     2.0  52.0        3.0  ...       15.0                 0.0   
2         1.0     0.0  31.0        3.0  ...       18.0                 0.0   
3         1.0     0.0  24.0        3.0  ...       17.0                 0.0   
4         1.0     0.0  25.0        3.0  ...       18.0                 0.0   

   Depression lifetime  Anxiety current  Anxiety lifetime Agr_Check  Clean  \
0       

##### Save subjects that have both meta- and time-series data to the filtered_meta_data

In [7]:
# Initialize counters for matches and non-matches
match_count = 0
no_match_count = 0

# Initialize an empty DataFrame for filtered_meta_data
filtered_meta_data = pd.DataFrame(columns=meta_data.columns)

# Iterate through each row in subjects_with_ts
for index, row in subjects_with_ts.iterrows():
    subject = row['Subject']
    sample = row['Sample']
    
    # Find matching rows in meta_data
    matching_rows = meta_data[
        (meta_data['Formatted ID'] == subject) & (meta_data['Sample'] == sample)
    ]
    
    # Add the matching rows to the filtered_meta_data DataFrame
    filtered_meta_data = pd.concat([filtered_meta_data, matching_rows], ignore_index=True)
    
    if not matching_rows.empty:
        # If a match is found, print the details
        match_count += 1
        # print(f"{subject} from {sample} matches with row {matching_rows.index.tolist()} in meta_data.")
    else:
        # If no match is found, print the message
        no_match_count += 1
        print(f"{subject} from {sample} does not have a match.")

# Print the summary of matches and non-matches
print(f"\nNumber of matches: {match_count}")
print(f"Number of non-matches: {no_match_count}")
print(f"Dimensions of the filtered meta-data: {filtered_meta_data.shape}")
print(filtered_meta_data.head())

# Save the DataFrame to a CSV file
filtered_meta_data.to_csv("/global/homes/p/pakmasha/ENIGMA-OCD results/QC/Meta-data_subject_with_ts.csv", index=False)

  filtered_meta_data = pd.concat([filtered_meta_data, matching_rows], ignore_index=True)


sub-subIDIBELL15P27 from Barcelone_Bellvitge/ANTIGA_1.5T does not have a match.
sub-C002030 from Brazil does not have a match.
sub-P00334220170726 from Brazil does not have a match.
sub-P00327820170517 from Brazil does not have a match.
sub-P00342220171004 from Brazil does not have a match.
sub-P00271920160404 from Brazil does not have a match.
sub-P00208520150303 from Brazil does not have a match.
sub-P00257820151215 from Brazil does not have a match.
sub-102 from Barcelona_HCPB does not have a match.
sub-C0168 from Bangalore_NIMHANS does not have a match.
sub-ODP130 from Bangalore_NIMHANS does not have a match.

Number of matches: 2958
Number of non-matches: 11
Dimensions of the filtered meta-data: (2958, 21)
            Sample Subject ID Formatted ID  Unique ID  OCD  Age of onset  \
0  Vancouver_BCCHR        054      sub-054        NaN  2.0           NaN   
1  Vancouver_BCCHR        052      sub-052        NaN  1.0           1.0   
2  Vancouver_BCCHR        014      sub-014        N

### Exclude subjects based on covariate data, MRI QC, and motion criteria

##### Remove duplicates from the QC.json

In [8]:
import pandas as pd
import json

# Load the QC JSON file
qc_file_path = "/pscratch/sd/p/pakmasha/ENIGMA_OCD_MBBN_git/ENIGMA_OCD_MBBN/ENIGMA-OCD results/QC/QC.json"
with open(qc_file_path, "r") as f:
    qc_data = json.load(f)

# Define the desired order of criteria
criteria_order = ["total", "covariates", "medicated_hc", "MRI", "motion", "ybocs_hc", "ybocs_hc_weak"]

# Filter the QC data to match the desired order
qc_data_sorted = [entry for criterion in criteria_order for entry in qc_data if entry["type"] == criterion]

# Traverse the sorted QC criteria and save failed_subjects
site_list = []
criterion_list = []
failed_subjects_list = []

for qc_entry in qc_data_sorted:
    site = qc_entry["site"]
    criterion = qc_entry["type"]
    failed_subjects = qc_entry["failed_subjects"]
    
    site_list.extend([site] * len(failed_subjects))  # save the site as many times as many failed_subjects there are
    criterion_list.extend([criterion] * len(failed_subjects))    # save the criterion as many times as many failed_subjects there are
    failed_subjects_list.extend(failed_subjects)

# Check if lists have the same length
print(f"Total number of sites: {len(site_list)}")
print(f"Total number of criteria: {len(criterion_list)}")
print(f"Total number of failed_subjects: {len(failed_subjects_list)}\n")

# Loop through the failed_subjects_list and delete dupicates
filtered_site_list = []
filtered_criterion_list = []
filtered_failed_subjects_list = []
duplicate_count = 0

for index in range(len(failed_subjects_list)):
    if failed_subjects_list[index] not in filtered_failed_subjects_list:
        filtered_site_list.append(site_list[index])
        filtered_criterion_list.append(criterion_list[index])
        filtered_failed_subjects_list.append(failed_subjects_list[index])
    else:
        duplicate_count += 1
        print(f"Subject {failed_subjects_list[index]} from {site_list[index]} is already excluded, total of {duplicate_count} duplicates")
            
print(f"\nTotal number of filtered sites: {len(filtered_site_list)}")
print(f"Total number of filtered criteria: {len(filtered_criterion_list)}")
print(f"Total number of filtered failed_subjects: {len(filtered_failed_subjects_list)}")

# Check if the number of duplicates equals the difference between the number of failed_subjects and filtered_failed_subjects
print(f"Duplicate count = {duplicate_count}, difference between the number of failed_subjects and filtered_failed_subjects = {len(failed_subjects_list) - len(filtered_failed_subjects_list)}")

# Create and save a dataframe
QC_csv = pd.DataFrame({
    'Sample': filtered_site_list,
    'Criterion': filtered_criterion_list,
    'Formatted ID': filtered_failed_subjects_list
})

Total number of sites: 537
Total number of criteria: 537
Total number of failed_subjects: 537

Subject sub-subSEQ1NKISENR45 from New_York is already excluded, total of 1 duplicates
Subject sub-subSEQ1NKISENR68 from New_York is already excluded, total of 2 duplicates
Subject sub-subSEQ1NKISENR107 from New_York is already excluded, total of 3 duplicates
Subject sub-subSEQ1NKISENR116 from New_York is already excluded, total of 4 duplicates
Subject sub-subSEQ1NKISENR118 from New_York is already excluded, total of 5 duplicates
Subject sub-subSEQ1NKISENR145 from New_York is already excluded, total of 6 duplicates
Subject sub-subSEQ1NKISENR149 from New_York is already excluded, total of 7 duplicates
Subject sub-subSEQ1NKISENR151 from New_York is already excluded, total of 8 duplicates
Subject sub-subSEQ1NKISENR176 from New_York is already excluded, total of 9 duplicates
Subject sub-subSEQ1NKISENR116 from New_York is already excluded, total of 10 duplicates
Subject sub-subSEQ1NKISENR118 from N

##### Remove subject IDs from QC_csv that are not in the Meta-data_subject_with_ts.csv

In [9]:
import pandas as pd

# Load the metadata file stored in the specified repository
metadata_csv_path = "/pscratch/sd/p/pakmasha/ENIGMA_OCD_MBBN_git/ENIGMA_OCD_MBBN/ENIGMA-OCD results/QC/Meta-data_subject_with_ts.csv"
metadata_df = pd.read_csv(metadata_csv_path)

# Print the number of rows in the QC dataframe before exclusion
rows_before = len(QC_csv)
print(f"Number of rows in QC dataframe before exclusion: {rows_before}")

# Perform exclusion: Keep only rows in QC where "Formatted ID" is also in Metadata
QC_csv_filtered = QC_csv[QC_csv['Formatted ID'].isin(metadata_df['Formatted ID'])]

# Print the number of rows in the QC dataframe after exclusion
rows_after = len(QC_csv_filtered)
print(f"Number of rows in QC dataframe after exclusion: {rows_after}")


Number of rows in QC dataframe before exclusion: 453
Number of rows in QC dataframe after exclusion: 433


In [10]:
print(QC_csv_filtered)

                        Sample      Criterion             Formatted ID
0               Amsterdam_VUmc          total               sub-916016
1               Amsterdam_VUmc          total               sub-916044
2               Amsterdam_VUmc          total               sub-916046
3               Amsterdam_VUmc          total               sub-916074
4      Yale_Pittinger/HCP_Trio          total    sub-YaleHCPTriota6521
..                         ...            ...                      ...
448  Yale_Pittinger/HCP_Prisma  ybocs_hc_weak  sub-YaleHCPPrismapb3883
449  Yale_Pittinger/HCP_Prisma  ybocs_hc_weak  sub-YaleHCPPrismapb3868
450              Amsterdam_AMC  ybocs_hc_weak                  sub-103
451              Amsterdam_AMC  ybocs_hc_weak                  sub-123
452              Amsterdam_AMC  ybocs_hc_weak                  sub-127

[433 rows x 3 columns]


##### Count the number of excluded subjects per criterion

In [11]:
# Count the number of rows per each value of the "Criterion" column
exclusion_counts = QC_csv_filtered['Criterion'].value_counts()

# Define the desired order of criteria
criteria_order = ["total", "covariates", "medicated_hc", "MRI", "motion", "ybocs_hc", "ybocs_hc_weak"]

# Reindex the exclusion counts to match the desired order
ordered_exclusion_counts = exclusion_counts.reindex(criteria_order)

# Display the summary in the desired order
print("Number of excluded subjects per exclusion criterion (ordered):")
print(ordered_exclusion_counts)

Number of excluded subjects per exclusion criterion (ordered):
Criterion
total             14
covariates        22
medicated_hc       2
MRI              268
motion           115
ybocs_hc           7
ybocs_hc_weak      5
Name: count, dtype: int64


##### Save the QC_without_duplicates.csv

In [12]:
QC_csv_filtered.to_csv("/pscratch/sd/p/pakmasha/ENIGMA_OCD_MBBN_git/ENIGMA_OCD_MBBN/ENIGMA-OCD results/QC/QC_without_duplicates.csv", index=False)

##### Did by hand (Output: Meta-data_after_motion_QC.csv)

In [128]:
# # For each failed_subject, check if it is in the filtered_meta_data
# exclusion_count = 0
# subjects_not_in_metadata = 0

# keys = ["total", "covariates", "medicated_hc", "MRI", "motion", "ybocs_hc", "ybocs_hc_weak"]
# exclusion_counts_by_criterion = dict.fromkeys(keys, 0)
# excluded_subjects_by_criterion = dict.fromkeys(keys, [])

# for index in range(len(filtered_failed_subjects_list)):
#     if filtered_failed_subjects_list[index] in filtered_meta_data['Formatted ID'].values:
#         row_index = filtered_meta_data.loc[filtered_meta_data['Formatted ID'] == filtered_failed_subjects_list[index]].index.tolist()
#         print(f"Subject {filtered_failed_subjects_list[index]} matches the row {row_index} in the meta-data")
        
#         # Traverse through rows in case one subject name is in several sites
#         for row in row_index:
            
#             # Check if the Sample column matches filtered_site_list[index]
#             if filtered_meta_data.loc[row, 'Sample'] == filtered_site_list[index]:
#                 # Remove the row from filtered_meta_data
#                 filtered_meta_data = filtered_meta_data.drop(index=row)
#                 exclusion_count += 1
#                 exclusion_counts_by_criterion[filtered_criterion_list[index]] += 1
#                 excluded_subjects_by_criterion[filtered_criterion_list[index]].append(filtered_failed_subjects_list[index])
                

#                 # Optionally reset the index for consistency
#                 filtered_meta_data.reset_index(drop=True, inplace=True)
                
#                 # Print out the report
#                 print(f"Subject {filtered_failed_subjects_list[index]} from {filtered_site_list[index]} has been excluded for {filtered_criterion_list[index]}, total of {exclusion_count} excluded subjects")
            
#             else:
#                 print(f"The {filtered_site_list[index]} from QC.json does not match {filtered_meta_data.loc[row, 'Sample']} from the meta_data")
            
#     else:
#         subjects_not_in_metadata += 1
#         print(f"Subject {filtered_failed_subjects_list[index]} from {filtered_site_list[index]} is not in the meta-data, total of {subjects_not_in_metadata} non-present subjects")
        
        
# # Check the exclusion results
# print(f"\nTotal number of excluded subjects: {exclusion_count}")
# print(f"Total total of non-present subjects: {subjects_not_in_metadata}")
# print(f"Number of excluded subjects by criterion: {exclusion_counts_by_criterion}")
# print(f"Number of rows in the filtered meta-data: {filtered_meta_data.shape[0]}")
# print(f"Excluded subjects by criterion: {excluded_subjects_by_criterion}")

### Exclude the samples with less than 10 subjects per class

##### Read the Meta-data_after_motion_QC.csv file

In [13]:
metadata_after_motion_QC = pd.read_csv("/pscratch/sd/p/pakmasha/ENIGMA_OCD_MBBN_git/ENIGMA_OCD_MBBN/ENIGMA-OCD results/QC/Meta-data_after_motion_QC.csv")
metadata_after_motion_QC.head()

Unnamed: 0,Sample,Subject ID,Formatted ID,Unique ID,OCD,Age of onset,Medication,Y-BOCS,Age,Age range,...,Education,Depression current,Depression lifetime,Anxiety current,Anxiety lifetime,Agr_Check,Clean,Sex_Rel,Hoard,Ord
0,Amsterdam_AMC,101,sub-101,,2,,1.0,0.0,49.0,3,...,15.0,0.0,0.0,0.0,0,0.0,0.0,0.0,,0.0
1,Amsterdam_AMC,104,sub-104,,2,,1.0,0.0,31.0,3,...,18.0,0.0,0.0,0.0,0,0.0,0.0,0.0,,0.0
2,Amsterdam_AMC,105,sub-105,,2,,1.0,0.0,24.0,3,...,17.0,0.0,0.0,0.0,0,0.0,0.0,0.0,,0.0
3,Amsterdam_AMC,107,sub-107,,2,,1.0,0.0,29.0,3,...,18.0,0.0,0.0,0.0,0,0.0,0.0,0.0,,1.0
4,Amsterdam_AMC,108,sub-108,,2,,1.0,0.0,64.0,3,...,17.0,0.0,0.0,0.0,0,1.0,0.0,1.0,,1.0


##### Remove rows with all NaNs

In [14]:
# Remove rows where all entries are NaN
metadata_after_motion_QC = metadata_after_motion_QC.dropna(how='all')

# Display the cleaned DataFrame
metadata_after_motion_QC.tail()

Unnamed: 0,Sample,Subject ID,Formatted ID,Unique ID,OCD,Age of onset,Medication,Y-BOCS,Age,Age range,...,Education,Depression current,Depression lifetime,Anxiety current,Anxiety lifetime,Agr_Check,Clean,Sex_Rel,Hoard,Ord
2520,Rome_SLF,sub-AOCD006,sub-subAOCD006,,1,1.0,3.0,16.0,20.0,3,...,13.0,1.0,,1.0,,1.0,1.0,0.0,0.0,1.0
2521,Rome_SLF,sub-AOCD007,sub-subAOCD007,,1,1.0,3.0,36.0,47.0,3,...,8.0,1.0,,1.0,,1.0,1.0,1.0,0.0,1.0
2522,Rome_SLF,sub-AOCD015,sub-subAOCD015,,1,1.0,3.0,24.0,41.0,3,...,14.0,1.0,,1.0,,1.0,1.0,0.0,0.0,1.0
2523,Rome_SLF,sub-AOCD016,sub-subAOCD016,,1,2.0,3.0,28.0,23.0,3,...,13.0,1.0,,1.0,,1.0,1.0,0.0,0.0,1.0
2524,Rome_SLF,sub-AOCD017,sub-subAOCD017,,1,1.0,3.0,8.0,43.0,3,...,23.0,,,,,1.0,0.0,0.0,0.0,1.0


##### Count the number of subjects per Sample and Class

In [15]:
# Group by Sample and count participants with diagnosis values 1 and 2
class_count_df = (
    metadata_after_motion_QC.groupby("Sample")["OCD"]
    .value_counts()
    .unstack(fill_value=0)  # Fill missing combinations with 0
    .rename(columns={1: "Number of OCD", 2: "Number of HC"})
    .reset_index()
)

# Ensure columns for diagnosis 1 and 2 exist even if some samples have none
if "Number of OCD" not in class_count_df.columns:
    class_count_df["Number of OCD"] = 0
if "Number of HC" not in class_count_df.columns:
    class_count_df["Number of HC"] = 0

# Display the first few rows of the resulting DataFrame
print(class_count_df)
print(f"\nTotal number of OCD: {sum(class_count_df['Number of OCD'].values)}")
print(f"Total number of HC: {sum(class_count_df['Number of HC'].values)}")
print(f"Total number of subjects: {sum(class_count_df['Number of OCD'].values) + sum(class_count_df['Number of HC'].values)}")

OCD                           Sample  Number of OCD  Number of HC
0                      Amsterdam_AMC             23            16
1                     Amsterdam_VUmc             39            34
2                  Bangalore_NIMHANS            187           204
3                     Barcelona_HCPB             37            28
4    Barcelone_Bellvitge/ANTIGA_1.5T             67            94
5    Barcelone_Bellvitge/COMPULSE_3T             26             0
6      Barcelone_Bellvitge/PROV_1.5T             56            26
7    Barcelone_Bellvitge/RESP_CBT_3T             10            49
8                             Bergen             37            26
9            Braga_UMinho/Braga_1.5T             28            18
10       Braga_UMinho/Braga_1.5T_act             49            61
11             Braga_UMinho/Braga_3T             32            30
12                            Brazil             60            35
13             Cape_Town_UCT/Allegra              3             6
14        

##### Remove the samples with less than 10 subjects per class

In [16]:
# Filter the rows where either "Number of OCD" or "Number of HC" is less than 10
small_samples = class_count_df[(class_count_df['Number of OCD'] < 10) | (class_count_df['Number of HC'] < 10)]

# Extract the "Sample" names that meet the condition
small_sample_names = small_samples['Sample']

# Print the sample names
print("Samples with less than 10 subjects in either OCD or HC group:")
print(small_sample_names.tolist())

metadata_after_site_removal = metadata_after_motion_QC
number_before = metadata_after_site_removal.shape[0]
print(f"Number of subjects before removal: {number_before}")

values_to_exclude = small_sample_names.tolist()
metadata_after_site_removal = metadata_after_site_removal[~metadata_after_site_removal['Sample'].isin(values_to_exclude)]  # ~ is a bitwise NOT
number_after = metadata_after_site_removal.shape[0]
print(f"Number of subjects after removal: {number_after}")
print(f"{number_before - number_after} subjects removed from {len(small_sample_names.tolist())} samples")

Samples with less than 10 subjects in either OCD or HC group:
['Barcelone_Bellvitge/COMPULSE_3T', 'Cape_Town_UCT/Allegra', 'Kyushu', 'Rome_SLF']
Number of subjects before removal: 2525
Number of subjects after removal: 2446
79 subjects removed from 4 samples


In [17]:
metadata_after_motion_QC.tail()

Unnamed: 0,Sample,Subject ID,Formatted ID,Unique ID,OCD,Age of onset,Medication,Y-BOCS,Age,Age range,...,Education,Depression current,Depression lifetime,Anxiety current,Anxiety lifetime,Agr_Check,Clean,Sex_Rel,Hoard,Ord
2520,Rome_SLF,sub-AOCD006,sub-subAOCD006,,1,1.0,3.0,16.0,20.0,3,...,13.0,1.0,,1.0,,1.0,1.0,0.0,0.0,1.0
2521,Rome_SLF,sub-AOCD007,sub-subAOCD007,,1,1.0,3.0,36.0,47.0,3,...,8.0,1.0,,1.0,,1.0,1.0,1.0,0.0,1.0
2522,Rome_SLF,sub-AOCD015,sub-subAOCD015,,1,1.0,3.0,24.0,41.0,3,...,14.0,1.0,,1.0,,1.0,1.0,0.0,0.0,1.0
2523,Rome_SLF,sub-AOCD016,sub-subAOCD016,,1,2.0,3.0,28.0,23.0,3,...,13.0,1.0,,1.0,,1.0,1.0,0.0,0.0,1.0
2524,Rome_SLF,sub-AOCD017,sub-subAOCD017,,1,1.0,3.0,8.0,43.0,3,...,23.0,,,,,1.0,0.0,0.0,0.0,1.0


# QC based on coverage data from timeseries JSON files (Steps 1-4)

### Extract coverage data from JSON files and keep the participants that passed the QC above

In [18]:
import os
import json
import pandas as pd

# Initialize an empty list to store coverage data along with subject and sample information
all_coverage_data = []
number_of_samples = 0

# Define the base directory
base_dir = "/pscratch/sd/p/pakmasha/ENIGMA_unzip"

# Function to process a halfpipe folder
def process_halfpipe(halfpipe_path, sample_name, subject_prefix=""):    
    
    for subject_folder in os.listdir(halfpipe_path):
        subject_path = os.path.join(halfpipe_path, subject_folder)
        
        # Navigate to the "func" folder inside the subject folder
        func_path = os.path.join(subject_path, "func")
        
        if os.path.exists(func_path):
            
            # Search for the desired JSON file
            for file_name in os.listdir(func_path):
                
                if "corrMatrix2_atlas-schaefer2011" in file_name and file_name.endswith(".json"):
                    file_path = os.path.join(func_path, file_name)
                    
                    # Read the JSON file
                    with open(file_path, 'r') as f:
                        data = json.load(f)
                        coverage = data.get("Coverage", [])
                        
                        # Append the coverage data with subject and sample folder names
                        if coverage:
                            all_coverage_data.append(
                                coverage + [subject_folder, sample_name]
                            )

# Traverse through sample folders
for sample_folder in os.listdir(base_dir):
    print(sample_folder)    
    sample_path = os.path.join(base_dir, sample_folder)
    
    if os.path.isdir(sample_path):
        # Check if this is a direct sample folder with a "halfpipe" folder
        halfpipe_path = os.path.join(sample_path, "halfpipe")
        
        if os.path.exists(halfpipe_path):
            # Process this sample folder
            number_of_samples += 1
            process_halfpipe(halfpipe_path, sample_folder)
            
        else:
            
            # If no "halfpipe", assume subsample folders exist
            for subsample_folder in os.listdir(sample_path):
                print(subsample_folder)
                subsample_path = os.path.join(sample_path, subsample_folder)
                
                if os.path.isdir(subsample_path):
                    
                    # Check if the subsample contains a "halfpipe" folder
                    halfpipe_path = os.path.join(subsample_path, "halfpipe")
                    
                    if os.path.exists(halfpipe_path):
                        # Process this subsample folder
                        number_of_samples += 1
                        process_halfpipe(halfpipe_path, f"{sample_folder}/{subsample_folder}")

                        
# Convert to a DataFrame
coverage_df = pd.DataFrame(all_coverage_data)

# Check the number of samples and subjects
print(f"Total of {coverage_df.shape[0]} subjects from {number_of_samples} samples")

# Rename the last two columns for clarity
coverage_df.rename(columns={coverage_df.columns[-2]: "Formatted ID", coverage_df.columns[-1]: "Sample"}, inplace=True)

# Display the first few rows to verify the data
print(coverage_df.head())
print(coverage_df.shape)

# Filter coverage_df to only include subjects present in metadata_after_site_removal
coverage_df_filtered = coverage_df.merge(
    metadata_after_site_removal[["Formatted ID", "Sample"]],
    on=["Formatted ID", "Sample"],
    how="inner"  # Keep only rows that match
)

# Check the number of subjects after filtering
print(f"Total of {coverage_df_filtered.shape[0]} subjects")

Vancouver_BCCHR
UCLA
Barcelone_Bellvitge
COMPULSE_3T
RESP_CBT_3T
ANTIGA_1.5T
PROV_1.5T
Cape_Town_UCT
Skyra
Allegra
ENIGMA_OCD_rsfMRI_params_template_lochner_stein.xlsx
NYSPI_Columbia
Pediatric
Adults
Yale_Gruner
Dresden
Seoul_SNU
Bergen
Brazil
Amsterdam_AMC
Chiba
CHBC
CHBSRPB
CHB
Amsterdam_VUmc
Milan_HSR
Yale_Pittinger
HCP_Prisma
Yale_2014
HCP_Trio
Zurich_UCH
Barcelona_HCPB
New_York
Bangalore_NIMHANS
Shanghai_SMCH
Kyushu
Rome_SLF
Braga_UMinho
Braga_3T
Braga_1.5T_act
Braga_1.5T
Kyoto_KPU
Kyoto1.5T
Kyoto3T
Total of 2953 subjects from 36 samples
     0    1    2    3         4    5         6    7    8    9  ...  426  \
0  1.0  1.0  1.0  1.0  1.000000  1.0  0.998028  1.0  1.0  1.0  ...  1.0   
1  1.0  1.0  1.0  1.0  0.945693  1.0  0.938856  1.0  1.0  1.0  ...  1.0   
2  1.0  1.0  1.0  1.0  1.000000  1.0  1.000000  1.0  1.0  1.0  ...  1.0   
3  1.0  1.0  1.0  1.0  0.992509  1.0  1.000000  1.0  1.0  1.0  ...  1.0   
4  1.0  1.0  1.0  1.0  1.000000  1.0  1.000000  1.0  1.0  1.0  ...  1.0   



### Check subjects with no corrMatrix2 file (i.e., no coverage data)

In [19]:
import pandas as pd

subjects_without_corrmatrix2 = metadata_after_site_removal[~metadata_after_site_removal['Formatted ID'].isin(coverage_df_filtered["Formatted ID"].values)]
print(subjects_without_corrmatrix2[['Formatted ID', 'Sample']])
print(f"\nTotal of {subjects_without_corrmatrix2.shape[0]} subjects without corrMatrix2 file")

        Formatted ID             Sample
113        sub-C0002  Bangalore_NIMHANS
1533  sub-GEROME3023            Dresden
1561  sub-GEROME4035            Dresden
1562  sub-GEROME4036            Dresden
1564  sub-GEROME4039            Dresden
1566  sub-GEROME4043            Dresden
1750          sub-43          Milan_HSR
1751          sub-44          Milan_HSR
1752          sub-45          Milan_HSR
1771          sub-65          Milan_HSR
1773          sub-67          Milan_HSR
1774          sub-68          Milan_HSR

Total of 12 subjects without corrMatrix2 file


### Step 1: Exclude ROIs with no coverage across all participants

In [20]:
print(f"Total number of ROIs: {coverage_df_filtered.shape[1]-2}")

# Calculate the sum of each column
column_sums = coverage_df_filtered.sum()

# Identify columns where the sum is zero
zero_sum_columns = column_sums[column_sums == 0]
columns_to_keep = column_sums[column_sums != 0].index

# Display the results
print("Columns with a sum of zero:")
print(zero_sum_columns)

# Create a new DataFrame with only the desired columns
coverage_df_filtered = coverage_df_filtered[columns_to_keep]

# Display the updated DataFrame shape
print("\nNumber of remaining ROIs:", coverage_df_filtered.shape[1]-2)
print(f"\nNames of the remaining columns: {coverage_df_filtered.columns}")

Total number of ROIs: 434
Columns with a sum of zero:
417    0.0
430    0.0
dtype: object

Number of remaining ROIs: 432

Names of the remaining columns: Index([             0,              1,              2,              3,
                    4,              5,              6,              7,
                    8,              9,
       ...
                  425,            426,            427,            428,
                  429,            431,            432,            433,
       'Formatted ID',       'Sample'],
      dtype='object', length=434)


### Step 2: Exclude participants with insufficient ROI coverage

In [21]:
print(f"Number of subjects before: {coverage_df_filtered.shape[0]}")

# Calculate the number of ROI columns (excluding the last two columns: "Formatted ID" and "Sample")
roi_columns = coverage_df_filtered.columns[:-2]

# Calculate the percentage of ROIs with at least 50% coverage for each participant
roi_coverage_percentage = (coverage_df_filtered[roi_columns] >= 0.5).sum(axis=1) / len(roi_columns)

# Identify participants with insufficient coverage (less than 90%)
insufficient_coverage_mask = roi_coverage_percentage < 0.9

# Get the excluded participants
excluded_participants = coverage_df_filtered[insufficient_coverage_mask]

# Display the excluded subjects, their samples, and coverage percentage
excluded_participants_list = excluded_participants.copy()
excluded_participants_list["Coverage Percentage"] = roi_coverage_percentage[insufficient_coverage_mask] * 100
print("Excluded participants (subject, sample, and coverage percentage):")
print(excluded_participants_list[["Formatted ID", "Sample", "Coverage Percentage"]])

# Filter the DataFrame to retain only participants with sufficient coverage
coverage_df_filtered = coverage_df_filtered[~insufficient_coverage_mask]

# Display the updated DataFrame shape
print(f"\nNumber of remaining subjects: {coverage_df_filtered.shape[0]}")
print(f"Number of remaining ROIs: {coverage_df_filtered.shape[1]-2}")

Number of subjects before: 2434
Excluded participants (subject, sample, and coverage percentage):
              Formatted ID                         Sample  Coverage Percentage
355   sub-subIDIBELL15C284  Barcelone_Bellvitge/PROV_1.5T            88.657407
1708            sub-ODP069              Bangalore_NIMHANS            82.638889

Number of remaining subjects: 2432
Number of remaining ROIs: 432


### Step 3: Exclude ROIs with poor coverage across participants

In [22]:
print(f"Number of ROIs before: {coverage_df_filtered.shape[1]-2}")

# Calculate the number of participants
num_participants = len(coverage_df_filtered)

# Calculate the percentage of participants with at least 50% coverage for each ROI
roi_coverage_across_participants = (coverage_df_filtered.iloc[:, :-2] >= 0.5).sum(axis=0) / num_participants

# Identify ROIs to keep and exclude
rois_to_keep = roi_coverage_across_participants[roi_coverage_across_participants >= 0.9].index
excluded_rois = roi_coverage_across_participants[roi_coverage_across_participants < 0.9]

# Filter the DataFrame to keep only the desired ROIs and the last two columns ("Formatted ID" and "Sample")
coverage_df_filtered = coverage_df_filtered[rois_to_keep.to_list() + ["Formatted ID", "Sample"]]

# Display the excluded ROIs
print(f"Total of {len(excluded_rois)} ROIs excluded")
print(excluded_rois)

# Display the updated DataFrame shape
print("\nNumber of remaining ROIs:", coverage_df_filtered.shape[1]-2)
print(f"\nNames of the remaining columns: {coverage_df_filtered.columns}")

Number of ROIs before: 432
Total of 29 ROIs excluded
108    0.553043
110    0.673109
111    0.580592
113    0.693668
114    0.473684
116    0.431743
133    0.798931
168    0.890625
312    0.556332
314    0.555510
315    0.893914
316    0.473273
317    0.897615
318    0.593750
319    0.731086
320    0.787829
322    0.852796
323    0.641447
335    0.759457
421    0.859375
422    0.732319
423    0.888158
426    0.610609
427    0.723273
428    0.826480
429    0.898849
431    0.835115
432    0.818668
433    0.888980
dtype: float64

Number of remaining ROIs: 403

Names of the remaining columns: Index([             0,              1,              2,              3,
                    4,              5,              6,              7,
                    8,              9,
       ...
                  414,            415,            416,            418,
                  419,            420,            424,            425,
       'Formatted ID',       'Sample'],
      dtype='object', length=4

### Step 4: Exclude participants with zero coverage

In [23]:
print(f"Number of subjects before: {coverage_df_filtered.shape[0]}")

# Calculate the sum of ROI values for each participant (row-wise)
participant_coverage_sums = coverage_df_filtered.iloc[:, :-2].sum(axis=1)

# Identify participants with zero coverage
zero_coverage_participants = coverage_df_filtered[participant_coverage_sums == 0]

# Display the subject and sample names of participants with zero coverage
if not zero_coverage_participants.empty:
    print("Participants with zero coverage:")
    print(zero_coverage_participants[["Formatted ID", "Sample"]])

    # Exclude participants with zero coverage
    coverage_df_filtered = coverage_df_filtered[participant_coverage_sums > 0]
    print("\nUpdated DataFrame shape (after excluding zero-coverage participants):", coverage_df_filtered.shape)
else:
    print("No participants with zero coverage.")


Number of subjects before: 2432
No participants with zero coverage.


### Match the meta-data with the filtered coverage data

In [24]:
# Filter the meta-data to keep only rows with matching Subject values in coverage_df_filtered
metadata_after_coverage_qc = metadata_after_site_removal.merge(
    coverage_df_filtered[["Formatted ID", "Sample"]],
    on=["Formatted ID", "Sample"],
    how="inner"  # Keep only rows that match
)

# Display the resulting DataFrame
print(metadata_after_coverage_qc.head())
print(f"Total of {metadata_after_coverage_qc.shape[0]} subjects")

          Sample Subject ID Formatted ID  Unique ID  OCD  Age of onset  \
0  Amsterdam_AMC        101      sub-101        NaN    2           NaN   
1  Amsterdam_AMC        104      sub-104        NaN    2           NaN   
2  Amsterdam_AMC        105      sub-105        NaN    2           NaN   
3  Amsterdam_AMC        107      sub-107        NaN    2           NaN   
4  Amsterdam_AMC        108      sub-108        NaN    2           NaN   

   Medication  Y-BOCS   Age  Age range  ...  Education  Depression current  \
0         1.0     0.0  49.0          3  ...       15.0                 0.0   
1         1.0     0.0  31.0          3  ...       18.0                 0.0   
2         1.0     0.0  24.0          3  ...       17.0                 0.0   
3         1.0     0.0  29.0          3  ...       18.0                 0.0   
4         1.0     0.0  64.0          3  ...       17.0                 0.0   

   Depression lifetime  Anxiety current  Anxiety lifetime Agr_Check  Clean  \
0       

# QC based on missing ROIs in the timeseries files (Steps 5-6)

### Extract the missing ROI data for each subject

In [25]:
import os
import pandas as pd

# Initialize an empty list to store NaN counts along with subject and sample information
all_nan_data = []
number_of_samples = 0

# Define the base directory
base_dir = "/pscratch/sd/p/pakmasha/ENIGMA_unzip"

# Function to process a halfpipe folder
def process_halfpipe(halfpipe_path, sample_name, subject_prefix=""):
    
    for subject_folder in os.listdir(halfpipe_path):
        subject_path = os.path.join(halfpipe_path, subject_folder)
        
        # Navigate to the "func" folder inside the subject folder
        func_path = os.path.join(subject_path, "func")
        
        if os.path.exists(func_path):
            
            # Search for the desired TSV file
            for file_name in os.listdir(func_path):
                
                if "corrMatrix2_atlas-schaefer2011" in file_name and file_name.endswith("timeseries.tsv"):
                    file_path = os.path.join(func_path, file_name)
                    
                    # Read the TSV file
                    timeseries_data = pd.read_csv(file_path, sep="\t", header=None)
                    
                    # Calculate the number of NaNs in each column (ROI)
                    nan_counts = timeseries_data.isna().sum()
                    
                    # Append the NaN counts with subject and sample folder names
                    nan_counts = nan_counts.to_list() + [subject_folder, sample_name]
                    all_nan_data.append(nan_counts)

# Traverse through sample folders
for sample_folder in os.listdir(base_dir):
    print(sample_folder)
    sample_path = os.path.join(base_dir, sample_folder)
    
    if os.path.isdir(sample_path):
        
        # Check if this is a direct sample folder with a "halfpipe" folder
        halfpipe_path = os.path.join(sample_path, "halfpipe")
        
        if os.path.exists(halfpipe_path):
            
            # Process this sample folder
            number_of_samples += 1
            process_halfpipe(halfpipe_path, sample_folder)
            
        else:
            
            # If no "halfpipe", assume subsample folders exist
            for subsample_folder in os.listdir(sample_path):
                print(subsample_folder)
                subsample_path = os.path.join(sample_path, subsample_folder)
                
                if os.path.isdir(subsample_path):
                    
                    # Check if the subsample contains a "halfpipe" folder
                    halfpipe_path = os.path.join(subsample_path, "halfpipe")
                    
                    if os.path.exists(halfpipe_path):
                        
                        # Process this subsample folder
                        number_of_samples += 1
                        process_halfpipe(halfpipe_path, f"{sample_folder}/{subsample_folder}")

# Convert to a DataFrame
nan_df = pd.DataFrame(all_nan_data)

# Rename the last two columns for clarity
nan_df.rename(columns={nan_df.columns[-2]: "Formatted ID", nan_df.columns[-1]: "Sample"}, inplace=True)

# Filter nan_df to keep only rows with matching subjects in the meta-data
nan_df_filtered = nan_df.merge(
    metadata_after_coverage_qc[["Formatted ID", "Sample"]],
    on=["Formatted ID", "Sample"],
    how="inner"  # Keep only rows that match
)
print(f"Total of {nan_df_filtered.shape[0]} subjects from {number_of_samples} samples")
print(nan_df_filtered.head())

Vancouver_BCCHR
UCLA
Barcelone_Bellvitge
COMPULSE_3T
RESP_CBT_3T
ANTIGA_1.5T
PROV_1.5T
Cape_Town_UCT
Skyra
Allegra
ENIGMA_OCD_rsfMRI_params_template_lochner_stein.xlsx
NYSPI_Columbia
Pediatric
Adults
Yale_Gruner
Dresden
Seoul_SNU
Bergen
Brazil
Amsterdam_AMC
Chiba
CHBC
CHBSRPB
CHB
Amsterdam_VUmc
Milan_HSR
Yale_Pittinger
HCP_Prisma
Yale_2014
HCP_Trio
Zurich_UCH
Barcelona_HCPB
New_York
Bangalore_NIMHANS
Shanghai_SMCH
Kyushu
Rome_SLF
Braga_UMinho
Braga_3T
Braga_1.5T_act
Braga_1.5T
Kyoto_KPU
Kyoto1.5T
Kyoto3T
Total of 2432 subjects from 36 samples
   0  1  2  3  4  5  6  7  8  9  ...  426  427  428  429  430  431  432  433  \
0  0  0  0  0  0  0  0  0  0  0  ...    0    0    0    0  150    0    0    0   
1  0  0  0  0  0  0  0  0  0  0  ...    0    0    0    0  150    0    0    0   
2  0  0  0  0  0  0  0  0  0  0  ...    0  150    0    0  150    0    0    0   
3  0  0  0  0  0  0  0  0  0  0  ...    0    0    0    0  150    0    0    0   
4  0  0  0  0  0  0  0  0  0  0  ...    0    0    0

### Keep only the ROIs that passed the coverage QC (Steps 1-4)

In [26]:
# Identify ROI columns to keep from coverage_df_filtered (excluding meta-data columns)
roi_columns_to_keep = [col for col in coverage_df_filtered.columns if col not in ["Formatted ID", "Sample"]]

# Retain only the matching ROI columns and the meta-data columns in nan_df_filtered
nan_df_filtered = nan_df_filtered[roi_columns_to_keep + ["Formatted ID", "Sample"]]

# Display the first few rows of the updated nan_df_filtered
print(f"Total {nan_df_filtered.shape[0]} subjects and {nan_df_filtered.shape[1]-2} ROIs")
print(nan_df_filtered.head())

Total 2432 subjects and 403 ROIs
   0  1  2  3  4  5  6  7  8  9  ...  414  415  416  418  419  420  424  425  \
0  0  0  0  0  0  0  0  0  0  0  ...    0    0    0    0    0    0    0    0   
1  0  0  0  0  0  0  0  0  0  0  ...    0    0    0    0    0    0    0    0   
2  0  0  0  0  0  0  0  0  0  0  ...    0    0    0    0    0    0  150    0   
3  0  0  0  0  0  0  0  0  0  0  ...    0  150    0    0    0    0    0    0   
4  0  0  0  0  0  0  0  0  0  0  ...    0    0    0    0    0    0    0    0   

   Formatted ID           Sample  
0       sub-054  Vancouver_BCCHR  
1       sub-052  Vancouver_BCCHR  
2       sub-014  Vancouver_BCCHR  
3       sub-062  Vancouver_BCCHR  
4       sub-071  Vancouver_BCCHR  

[5 rows x 405 columns]


### Step 5: exclude ROIs with more than 1% of NAs

In [27]:
print(f"Number of ROIs before: {nan_df_filtered.shape[1]-2}")

# Calculate the percentage of participants with non-zero NaN counts for each ROI
num_participants = len(nan_df_filtered)
roi_nonzero_percentage = (nan_df_filtered.iloc[:, :-2] > 0).sum(axis=0) / num_participants * 100

# Identify ROIs to exclude (more than 1% of participants with non-zero NaN counts)
excluded_rois = roi_nonzero_percentage[roi_nonzero_percentage > 1]
print("ROIs with more than 1% of participants having NaNs:")
print(excluded_rois)

# Filter ROIs to keep (1% or fewer participants with NaNs)
rois_to_keep = roi_nonzero_percentage[roi_nonzero_percentage <= 1].index
nan_df_filtered = nan_df_filtered[rois_to_keep.to_list() + ["Formatted ID", "Sample"]]

# Display the updated DataFrame shape
print(f"\nNumber of ROIs after: {nan_df_filtered.shape[1]-2}")

Number of ROIs before: 403
ROIs with more than 1% of participants having NaNs:
2       1.069079
4      10.690789
6       9.087171
8       1.027961
10      2.590461
         ...    
416     1.644737
418     3.289474
419    24.177632
424    44.284539
425    12.787829
Length: 85, dtype: float64

Number of ROIs after: 318


In [28]:
excluded_rois[:50]

2       1.069079
4      10.690789
6       9.087171
8       1.027961
10      2.590461
22      1.233553
26      1.151316
37      2.138158
39      2.878289
41      2.014803
42      2.384868
59     23.889803
61      3.083882
70      1.809211
71      1.603618
79      1.726974
80      3.001645
99      1.850329
106    10.978618
109    34.046053
112    32.113487
115    31.414474
117    20.888158
118    37.787829
122     1.110197
134     6.537829
139     3.536184
149     1.110197
160     4.029605
161     7.606908
164     4.440789
166    18.050987
167    11.019737
177     1.439145
178     1.274671
180     1.027961
183     1.356908
184     1.768092
191     6.455592
204     6.825658
205    14.555921
208    10.238487
210     1.644737
222     1.891447
238     1.274671
239     1.315789
240     1.439145
241     2.384868
242     4.194079
258     5.098684
dtype: float64

In [29]:
nan_df_filtered.columns

Index([             0,              1,              3,              5,
                    7,              9,             11,             12,
                   13,             14,
       ...
                  403,            405,            409,            410,
                  411,            412,            413,            420,
       'Formatted ID',       'Sample'],
      dtype='object', length=320)

### Step 6: exclude participants that have any missing values

In [30]:
print(f"Number of subjects before: {nan_df_filtered.shape[0]}")

# Identify participants with any NaNs (rows with any non-zero values)
rows_with_nans_mask = (nan_df_filtered.iloc[:, :-2] > 0).any(axis=1)

# Extract participants with NaNs
participants_with_nans = nan_df_filtered[rows_with_nans_mask]

# Calculate the number of missing ROIs (non-zero values) for each excluded participant
participants_with_nans["Missing ROIs"] = nan_df_filtered.iloc[:, :-2][rows_with_nans_mask].gt(0).sum(axis=1)

# Display the subject, sample, and missing ROI counts for excluded participants
print("Excluded participants with missing ROIs:")
print(participants_with_nans[["Formatted ID", "Sample", "Missing ROIs"]])

# Exclude these participants from the DataFrame
nan_df_filtered = nan_df_filtered[~rows_with_nans_mask]

# Display the updated DataFrame shape
print(f"Number of subjects after: {nan_df_filtered.shape[0]}")

Number of subjects before: 2432
Excluded participants with missing ROIs:
                Formatted ID                           Sample  Missing ROIs
44                   sub-051                  Vancouver_BCCHR             1
105      sub-RESPCBT10CTRPRE  Barcelone_Bellvitge/RESP_CBT_3T             7
120      sub-RESPCBT03CTRPRE  Barcelone_Bellvitge/RESP_CBT_3T             5
164      sub-subIDIBELL15P36  Barcelone_Bellvitge/ANTIGA_1.5T             1
171      sub-subIDIBELL15P34  Barcelone_Bellvitge/ANTIGA_1.5T             1
...                      ...                              ...           ...
2402  sub-subKyoto3Tsubj1005                Kyoto_KPU/Kyoto3T             1
2410  sub-subKyoto3Tsubj0030                Kyoto_KPU/Kyoto3T             1
2422  sub-subKyoto3Tsubj1021                Kyoto_KPU/Kyoto3T             1
2424  sub-subKyoto3Tsubj0014                Kyoto_KPU/Kyoto3T             1
2429  sub-subKyoto3Tsubj1004                Kyoto_KPU/Kyoto3T             1

[242 rows x 3 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  participants_with_nans["Missing ROIs"] = nan_df_filtered.iloc[:, :-2][rows_with_nans_mask].gt(0).sum(axis=1)


In [31]:
participants_with_nans[["Formatted ID", "Sample", "Missing ROIs"]][200:]

Unnamed: 0,Formatted ID,Sample,Missing ROIs
1993,sub-ODP024,Bangalore_NIMHANS,1
2002,sub-C0146,Bangalore_NIMHANS,9
2004,sub-ODP059,Bangalore_NIMHANS,4
2005,sub-C0086,Bangalore_NIMHANS,8
2010,sub-C0199,Bangalore_NIMHANS,5
2016,sub-054,Shanghai_SMCH,31
2023,sub-047,Shanghai_SMCH,27
2031,sub-013,Shanghai_SMCH,22
2054,sub-004,Shanghai_SMCH,3
2078,sub-055,Shanghai_SMCH,48


### Match the meta-data with the filtered NaN data

In [32]:
# Filter meta-data to keep only rows with matching subjects in nan_df_filtered
metadata_after_nan_qc = metadata_after_coverage_qc.merge(
    nan_df_filtered[["Formatted ID", "Sample"]],
    on=["Formatted ID", "Sample"],
    how="inner"  # Keep only rows that match
)

# Display the resulting DataFrame
print(metadata_after_nan_qc.head())
print(f"Total of {metadata_after_nan_qc.shape[0]} subjects")

          Sample Subject ID Formatted ID  Unique ID  OCD  Age of onset  \
0  Amsterdam_AMC        104      sub-104        NaN    2           NaN   
1  Amsterdam_AMC        105      sub-105        NaN    2           NaN   
2  Amsterdam_AMC        108      sub-108        NaN    2           NaN   
3  Amsterdam_AMC        109      sub-109        NaN    2           NaN   
4  Amsterdam_AMC        121      sub-121        NaN    2           NaN   

   Medication  Y-BOCS   Age  Age range  ...  Education  Depression current  \
0         1.0     0.0  31.0          3  ...       18.0                 0.0   
1         1.0     0.0  24.0          3  ...       17.0                 0.0   
2         1.0     0.0  64.0          3  ...       17.0                 0.0   
3         1.0     0.0  21.0          3  ...       13.0                 0.0   
4         1.0     0.0  27.0          3  ...       14.0                 0.0   

   Depression lifetime  Anxiety current  Anxiety lifetime Agr_Check  Clean  \
0       

### Exclude the samples with less than 10 subjects per class

In [33]:
# Group by Sample and count participants with diagnosis values 1 and 2
class_count_df = (
    metadata_after_nan_qc.groupby("Sample")["OCD"]
    .value_counts()
    .unstack(fill_value=0)  # Fill missing combinations with 0
    .rename(columns={1: "Number of OCD", 2: "Number of HC"})
    .reset_index()
)

# Ensure columns for diagnosis 1 and 2 exist even if some samples have none
if "Number of OCD" not in class_count_df.columns:
    class_count_df["Number of OCD"] = 0
if "Number of HC" not in class_count_df.columns:
    class_count_df["Number of HC"] = 0

# Display the first few rows of the resulting DataFrame
print(class_count_df)
print(f"\nTotal number of OCD: {sum(class_count_df['Number of OCD'].values)}")
print(f"Total number of HC: {sum(class_count_df['Number of HC'].values)}")
print(f"Total number of subjects: {sum(class_count_df['Number of OCD'].values) + sum(class_count_df['Number of HC'].values)}")

OCD                           Sample  Number of OCD  Number of HC
0                      Amsterdam_AMC              8             6
1                     Amsterdam_VUmc             29            28
2                  Bangalore_NIMHANS            169           189
3                     Barcelona_HCPB             36            28
4    Barcelone_Bellvitge/ANTIGA_1.5T             51            86
5      Barcelone_Bellvitge/PROV_1.5T             43            21
6    Barcelone_Bellvitge/RESP_CBT_3T             10            47
7                             Bergen             29            24
8            Braga_UMinho/Braga_1.5T             28            18
9        Braga_UMinho/Braga_1.5T_act             44            52
10             Braga_UMinho/Braga_3T             32            27
11                            Brazil             59            34
12               Cape_Town_UCT/Skyra             23            16
13                         Chiba/CHB             16            26
14        

In [34]:
# Filter the rows where either "Number of OCD" or "Number of HC" is less than 10
small_samples = class_count_df[(class_count_df['Number of OCD'] < 10) | (class_count_df['Number of HC'] < 10)]

# Extract the "Sample" names that meet the condition
small_sample_names = small_samples['Sample']

# Print the sample names
print("Samples with less than 10 subjects in either OCD or HC group:")
print(small_sample_names.tolist())

metadata_after_site_removal2 = metadata_after_nan_qc
number_before = metadata_after_site_removal2.shape[0]
print(f"Number of subjects before removal: {number_before}")

values_to_exclude = small_sample_names.tolist()
metadata_after_site_removal2 = metadata_after_site_removal2[~metadata_after_site_removal2['Sample'].isin(values_to_exclude)]  # ~ is a bitwise NOT
number_after = metadata_after_site_removal2.shape[0]
print(f"Number of subjects after removal: {number_after}")
print(f"{number_before - number_after} subjects removed from {len(small_sample_names.tolist())} samples")

Samples with less than 10 subjects in either OCD or HC group:
['Amsterdam_AMC', 'Milan_HSR', 'Zurich_UCH']
Number of subjects before removal: 2190
Number of subjects after removal: 2095
95 subjects removed from 3 samples


In [35]:
metadata_after_site_removal2.head()

Unnamed: 0,Sample,Subject ID,Formatted ID,Unique ID,OCD,Age of onset,Medication,Y-BOCS,Age,Age range,...,Education,Depression current,Depression lifetime,Anxiety current,Anxiety lifetime,Agr_Check,Clean,Sex_Rel,Hoard,Ord
14,Amsterdam_VUmc,sub-916002,sub-916002,,1,1.0,2.0,24.0,22.0,3,...,11.0,2.0,2.0,1.0,1,1.0,1.0,0.0,0.0,1.0
15,Amsterdam_VUmc,sub-916005,sub-916005,,2,,1.0,,27.0,3,...,9.0,0.0,0.0,0.0,0,,,,,
16,Amsterdam_VUmc,sub-916006,sub-916006,,1,2.0,2.0,15.0,21.0,3,...,11.0,1.0,1.0,1.0,1,,,,,
17,Amsterdam_VUmc,sub-916007,sub-916007,,2,,2.0,,28.0,3,...,11.0,0.0,0.0,0.0,0,,,,,
18,Amsterdam_VUmc,sub-916008,sub-916008,,1,1.0,2.0,29.0,35.0,3,...,11.0,2.0,1.0,1.0,1,1.0,0.0,0.0,1.0,1.0


##### Count the number of OCD and HC subjects in the final sample

In [36]:
# Count the number of rows per value in the "OCD" column
ocd_counts = metadata_after_site_removal2['OCD'].value_counts()

# Print the counts
print("Number of subjects per OCD value:")
print(ocd_counts)

Number of subjects per OCD value:
OCD
2    1055
1    1040
Name: count, dtype: int64


# Save the data for MBBN

### Create a final meta-data with Unique IDs

In [None]:
import pandas as pd

print("Original format:")
print(metadata_after_site_removal2[['Sample', 'Formatted ID', 'Unique ID']].head())

# Rename the Unique ID column (need an underbar _)
metadata_after_site_removal2 = metadata_after_site_removal2.rename(columns={'Unique ID': 'Unique_ID'})

# Replace "/" amd "_" with "-" in the Sample column
metadata_after_site_removal2['Sample'] = metadata_after_site_removal2['Sample'].str.replace('/', '-')
metadata_after_site_removal2['Sample'] = metadata_after_site_removal2['Sample'].str.replace('_', '-')

# Create the Unique_ID column by combining Sample and Subject
metadata_after_site_removal2['Unique_ID'] = metadata_after_site_removal2['Sample'] + '_' + metadata_after_site_removal2['Formatted ID']

print("\nNew format:")
print(metadata_after_site_removal2[['Sample', 'Formatted ID', 'Unique_ID']].head())

# Save the updated DataFrame
metadata_after_site_removal2.to_csv("/pscratch/sd/p/pakmasha/ENIGMA_OCD_MBBN_git/ENIGMA_OCD_MBBN/MBBN-main/data/metadata/ENIGMA_QC_final_subject_list.csv", index=False)
print("\nMeta-data has been uploaded to the MBBN-main/data/metadata folder")

Original format:
            Sample Formatted ID  Unique ID
14  Amsterdam_VUmc   sub-916002        NaN
15  Amsterdam_VUmc   sub-916005        NaN
16  Amsterdam_VUmc   sub-916006        NaN
17  Amsterdam_VUmc   sub-916007        NaN
18  Amsterdam_VUmc   sub-916008        NaN

New format:
            Sample Formatted ID                  Unique_ID
14  Amsterdam-VUmc   sub-916002  Amsterdam-VUmc_sub-916002
15  Amsterdam-VUmc   sub-916005  Amsterdam-VUmc_sub-916005
16  Amsterdam-VUmc   sub-916006  Amsterdam-VUmc_sub-916006
17  Amsterdam-VUmc   sub-916007  Amsterdam-VUmc_sub-916007
18  Amsterdam-VUmc   sub-916008  Amsterdam-VUmc_sub-916008

Meta-data has been uploaded to the MBBN-main/data/metadata folder


In [38]:
print(f"Total number of subjects: {metadata_after_site_removal2.shape[0]}")

Total number of subjects: 2095


### Copy the time-series data files to the MBBN_data directory

In [39]:
# Save the ROIs that passed QC (remove the last two columns: Formatted ID and Sample)
columns_to_remove = list(set(coverage_df.columns) - set(nan_df_filtered.columns))

In [41]:
# Save this list to use in other notebooks
import pickle

# Save the list to a pickle file
with open('columns_to_remove.pkl', 'wb') as f:
    pickle.dump(columns_to_remove, f)

print("List saved successfully!")

List saved successfully!


In [52]:
# Load the look-up table
roi_lut = pd.read_csv("/global/homes/p/pakmasha/ENIGMA-OCD results/QC/all_regions_Schaefer2018_400Parcels_17Networks_LUT.csv", sep=",")
print(roi_lut.head())

# Save the ROI names
roi_names = roi_lut['Schaeffer_Yeon_labels'].values
print("\nThe first 5 ROIs:")
print(roi_names[:5])

print("\nThe last 5 ROIs:")
print(roi_names[-5:])

# Remove the last 4 ROIs (additionally computed in the Bruin et. al (2023) paper)
roi_names = roi_names[:-4]
print("\nThe last 5 ROIs after editing:")
print(roi_names[-5:])

   roi_ID        R         G         B          Schaeffer_Yeon_labels  \
0       1  0.47059  0.070588  0.535294  17Networks_LH_VisCent_ExStr_1   
1       2  0.47059  0.070588  0.535294  17Networks_LH_VisCent_ExStr_2   
2       3  0.47059  0.070588  0.535294  17Networks_LH_VisCent_ExStr_3   
3       4  0.47059  0.070588  0.535294  17Networks_LH_VisCent_ExStr_4   
4       5  0.47059  0.070588  0.535294  17Networks_LH_VisCent_ExStr_5   

                 halfpipe_labels Schaeffer_Yeon_17_networks  \
0  17Networks_LH_VisCent_ExStr_1                    VisCent   
1  17Networks_LH_VisCent_ExStr_2                    VisCent   
2  17Networks_LH_VisCent_ExStr_3                    VisCent   
3  17Networks_LH_VisCent_ExStr_4                    VisCent   
4  17Networks_LH_VisCent_ExStr_5                    VisCent   

  halfpipe_17_networks  
0              VisCent  
1              VisCent  
2              VisCent  
3              VisCent  
4              VisCent  

The first 5 ROIs:
['17Networks_

In [53]:
import numpy as np

# Delete columns_to_remove from the roi_names
roi_names_filtered = np.delete(roi_names, columns_to_remove)
removed_entries = roi_names[columns_to_remove]

print("Original number of ROIs:", len(roi_names))
print("Filtered number of ROIs:", len(roi_names_filtered))
print("\nRemoved entries:", removed_entries)

Original number of ROIs: 434
Filtered number of ROIs: 318

Removed entries: ['17Networks_LH_VisCent_ExStr_3' '17Networks_LH_VisCent_ExStr_5'
 '17Networks_LH_VisCent_Striate_1' '17Networks_LH_VisCent_ExStr_8'
 '17Networks_LH_VisCent_ExStr_10' '17Networks_LH_VisPeri_ExStrSup_4'
 '17Networks_LH_SomMotA_3' '17Networks_LH_SomMotA_14'
 '17Networks_LH_SomMotA_16' '17Networks_LH_SomMotA_18'
 '17Networks_LH_SomMotA_19' '17Networks_LH_DorsAttnA_TempOcc_1'
 '17Networks_LH_DorsAttnA_TempOcc_3' '17Networks_LH_DorsAttnA_SPL_6'
 '17Networks_LH_DorsAttnA_SPL_7' '17Networks_LH_DorsAttnB_PostC_8'
 '17Networks_LH_DorsAttnB_PostC_9' '17Networks_LH_SalVentAttnA_FrMed_3'
 '17Networks_LH_SalVentAttnB_OFC_1' '17Networks_LH_LimbicB_OFC_1'
 '17Networks_LH_LimbicB_OFC_2' '17Networks_LH_LimbicB_OFC_3'
 '17Networks_LH_LimbicB_OFC_4' '17Networks_LH_LimbicB_OFC_5'
 '17Networks_LH_LimbicA_TempPole_1' '17Networks_LH_LimbicA_TempPole_2'
 '17Networks_LH_LimbicA_TempPole_3' '17Networks_LH_LimbicA_TempPole_4'
 '17Networks

In [54]:
print("Final ROIs:", roi_names_filtered)

Final ROIs: ['17Networks_LH_VisCent_ExStr_1' '17Networks_LH_VisCent_ExStr_2'
 '17Networks_LH_VisCent_ExStr_4' '17Networks_LH_VisCent_ExStr_6'
 '17Networks_LH_VisCent_ExStr_7' '17Networks_LH_VisCent_ExStr_9'
 '17Networks_LH_VisCent_ExStr_11' '17Networks_LH_VisPeri_ExStrInf_1'
 '17Networks_LH_VisPeri_ExStrInf_2' '17Networks_LH_VisPeri_ExStrInf_3'
 '17Networks_LH_VisPeri_ExStrInf_4' '17Networks_LH_VisPeri_ExStrInf_5'
 '17Networks_LH_VisPeri_StriCal_1' '17Networks_LH_VisPeri_StriCal_2'
 '17Networks_LH_VisPeri_ExStrSup_1' '17Networks_LH_VisPeri_ExStrSup_2'
 '17Networks_LH_VisPeri_ExStrSup_3' '17Networks_LH_VisPeri_ExStrSup_5'
 '17Networks_LH_SomMotA_1' '17Networks_LH_SomMotA_2'
 '17Networks_LH_SomMotA_4' '17Networks_LH_SomMotA_5'
 '17Networks_LH_SomMotA_6' '17Networks_LH_SomMotA_7'
 '17Networks_LH_SomMotA_8' '17Networks_LH_SomMotA_9'
 '17Networks_LH_SomMotA_10' '17Networks_LH_SomMotA_11'
 '17Networks_LH_SomMotA_12' '17Networks_LH_SomMotA_13'
 '17Networks_LH_SomMotA_15' '17Networks_LH_SomMot

In [55]:
import os
import shutil
import pandas as pd

# Define the base directories
base_dir = "/pscratch/sd/p/pakmasha/ENIGMA_unzip"
output_dir = "/pscratch/sd/p/pakmasha/MBBN_data"

# Function to process a halfpipe folder
def process_halfpipe(halfpipe_path, sample_name, subject_prefix=""):
    
    for subject_folder in os.listdir(halfpipe_path):
        subject_path = os.path.join(halfpipe_path, subject_folder)
        
        # Navigate to the "func" folder inside the subject folder
        func_path = os.path.join(subject_path, "func")
        
        if os.path.exists(func_path):
            
            # Search for the desired TSV file
            for file_name in os.listdir(func_path):
                
                if "corrMatrix2_atlas-schaefer2011" in file_name and file_name.endswith("timeseries.tsv"):
                    file_path = os.path.join(func_path, file_name)

                    # Replace "_" with "-" in sample and subject names
                    sanitized_sample_name = sample_name.replace("_", "-")
                    sanitized_sample_name = sanitized_sample_name.replace("/", "-")
                    sanitized_subject_name = subject_folder.replace("_", "-")

                    # Combine sample, subsample (if exists), and subject for folder name
                    subject_folder_name = f"{sanitized_sample_name}_{sanitized_subject_name}"
                    subject_subdir = os.path.join(output_dir, subject_folder_name)
                    
                    # Check if the subject is present in the final meta-data
                    if subject_folder_name in metadata_after_site_removal2['Unique_ID'].values:
                        
                        os.makedirs(subject_subdir, exist_ok=True)
                    
                        # Load the file and remove specified columns
                        try:
                            df = pd.read_csv(file_path, sep="\t", header=None)
                            df_filtered = df.drop(columns=columns_to_remove, errors='ignore')
                            
                            # Change the column names to ROI names
                            df_filtered.columns = roi_names_filtered

                            # Save the modified file to the output directory
                            output_file_name = f"{subject_folder_name}.tsv"
                            output_file_path = os.path.join(subject_subdir, output_file_name)
                            df_filtered.to_csv(output_file_path, sep="\t", index=False, header=True)

                            #print(f"Processed and copied: {file_path} to {output_file_path}")
                        except Exception as e:
                            print(f"Error processing file {file_path}: {e}")
                            
                    else:
                        print(f"Subject {subject_folder_name} is not in the final meta-data")

                        
# Traverse through sample folders
for sample_folder in os.listdir(base_dir):
    print(sample_folder)
    sample_path = os.path.join(base_dir, sample_folder)
    
    if os.path.isdir(sample_path):
        # Check if this is a direct sample folder with a "halfpipe" folder
        halfpipe_path = os.path.join(sample_path, "halfpipe")
        
        if os.path.exists(halfpipe_path):
            # Process this sample folder
            process_halfpipe(halfpipe_path, sample_folder)
            
        else:
            
            # If no "halfpipe", assume subsample folders exist
            for subsample_folder in os.listdir(sample_path):
                subsample_path = os.path.join(sample_path, subsample_folder)
                
                if os.path.isdir(subsample_path):
                    # Check if the subsample contains a "halfpipe" folder
                    halfpipe_path = os.path.join(subsample_path, "halfpipe")
                    
                    if os.path.exists(halfpipe_path):
                        # Combine sample and subsample names
                        combined_name = f"{sample_folder}/{subsample_folder}"
                        process_halfpipe(halfpipe_path, combined_name)


Vancouver_BCCHR
Subject Vancouver-BCCHR_sub-041 is not in the final meta-data
Subject Vancouver-BCCHR_sub-017 is not in the final meta-data
Subject Vancouver-BCCHR_sub-020 is not in the final meta-data
Subject Vancouver-BCCHR_sub-073 is not in the final meta-data
Subject Vancouver-BCCHR_sub-076 is not in the final meta-data
Subject Vancouver-BCCHR_sub-069 is not in the final meta-data
Subject Vancouver-BCCHR_sub-074 is not in the final meta-data
Subject Vancouver-BCCHR_sub-019 is not in the final meta-data
Subject Vancouver-BCCHR_sub-051 is not in the final meta-data
UCLA
Subject UCLA_sub-AOCD003 is not in the final meta-data
Subject UCLA_sub-AOCD014 is not in the final meta-data
Subject UCLA_sub-AOCD021 is not in the final meta-data
Subject UCLA_sub-AOCD030 is not in the final meta-data
Subject UCLA_sub-AOCD049 is not in the final meta-data
Subject UCLA_sub-AOCD023 is not in the final meta-data
Subject UCLA_sub-AOCD011 is not in the final meta-data
Subject UCLA_sub-AOCD022 is not in t

### Check if the data files were copied successfully

In [56]:
import pandas as pd

file_path = '/pscratch/sd/p/pakmasha/ENIGMA_unzip/Amsterdam_VUmc/halfpipe/sub-916002/func/sub-916002_task-rest_feature-corrMatrix2_atlas-schaefer2011CombinedDseg_timeseries.tsv'
orig_data = pd.read_csv(file_path, sep='\t', header=None)

file_path = '/pscratch/sd/p/pakmasha/MBBN_data/Amsterdam-VUmc_sub-916002/Amsterdam-VUmc_sub-916002.tsv'
copied_data = pd.read_csv(file_path, sep='\t')

print("Original time-series data:")
print(orig_data.head())
print("\nCopied time-series data:")
print(copied_data.head())

Original time-series data:
           0             1            2             3            4    \
0  9145.176384  11876.161800  8794.611336  10539.996906  9509.229325   
1  9105.992200  11892.097954  8786.183790  10480.838742  9489.655678   
2  9173.305319  11909.495056  8812.077201  10526.539572  9500.169096   
3  9114.023436  11913.536802  8819.525485  10533.051217  9505.947588   
4  9107.744233  11887.304097  8813.337332  10502.351805  9515.064338   

           5    6             7             8             9    ...  424  425  \
0  9909.672740  NaN  12182.052324  11245.322015  12286.508742  ...  NaN  NaN   
1  9894.491422  NaN  12208.509649  11244.797909  12298.751867  ...  NaN  NaN   
2  9916.177132  NaN  12204.806586  11261.847188  12303.074369  ...  NaN  NaN   
3  9921.154006  NaN  12174.430049  11252.658006  12255.664426  ...  NaN  NaN   
4  9909.026123  NaN  12180.535926  11253.464035  12252.981479  ...  NaN  NaN   

   426  427  428  429  430  431  432  433  
0  NaN  NaN  Na

### Check if all subjects from the meta-data file have the corresponding time-series directory

In [57]:
import os
import pandas as pd

# Path to the base directory where subject subfolders should be located
base_dir = '/pscratch/sd/p/pakmasha/MBBN_data'

# Extract the unique subject IDs from the 'Unique_ID' column
subject_ids = metadata_after_site_removal2['Unique_ID'].tolist()

# Check if each subject has a corresponding subfolder
for subject_id in subject_ids:
    subject_folder = os.path.join(base_dir, subject_id)
    if os.path.isdir(subject_folder):
        print(f"Subfolder exists for subject {subject_id}")
    else:
        print(f"Subfolder missing for subject {subject_id}")


Subfolder exists for subject Amsterdam-VUmc_sub-916002
Subfolder exists for subject Amsterdam-VUmc_sub-916005
Subfolder exists for subject Amsterdam-VUmc_sub-916006
Subfolder exists for subject Amsterdam-VUmc_sub-916007
Subfolder exists for subject Amsterdam-VUmc_sub-916008
Subfolder exists for subject Amsterdam-VUmc_sub-916010
Subfolder exists for subject Amsterdam-VUmc_sub-916011
Subfolder exists for subject Amsterdam-VUmc_sub-916013
Subfolder exists for subject Amsterdam-VUmc_sub-916014
Subfolder exists for subject Amsterdam-VUmc_sub-916015
Subfolder exists for subject Amsterdam-VUmc_sub-916017
Subfolder exists for subject Amsterdam-VUmc_sub-916019
Subfolder exists for subject Amsterdam-VUmc_sub-916020
Subfolder exists for subject Amsterdam-VUmc_sub-916022
Subfolder exists for subject Amsterdam-VUmc_sub-916024
Subfolder exists for subject Amsterdam-VUmc_sub-916025
Subfolder exists for subject Amsterdam-VUmc_sub-916026
Subfolder exists for subject Amsterdam-VUmc_sub-916027
Subfolder 

### Check the number of subfolders in MBBN_data

In [58]:
import os

# Path to the base directory
base_dir = '/pscratch/sd/p/pakmasha/MBBN_data'

# List all items in the directory and filter to keep only subfolders
subfolders = [f for f in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, f))]

# Count the number of subfolders
num_subfolders = len(subfolders)

# Print the result
print(f"Number of subfolders in {base_dir}: {num_subfolders}")
print(f"Numer of subjects in the final meta-data: {metadata_after_site_removal2.shape[0]}")

Number of subfolders in /pscratch/sd/p/pakmasha/MBBN_data: 2095
Numer of subjects in the final meta-data: 2095


### Convert .tsv files to .npy

In [59]:
import os
import numpy as np
import pandas as pd

# Define the base path where the .tsv files are stored
base_path = '/pscratch/sd/p/pakmasha/MBBN_data'

# Walk through all subdirectories and process .tsv files
for root, dirs, files in os.walk(base_path):
    for file in files:
        if file.endswith('.tsv'):
            # Construct full path to the .tsv file
            tsv_file_path = os.path.join(root, file)

            # Load the .tsv file into a pandas DataFrame
            data = pd.read_csv(tsv_file_path, sep='\t')

            # Convert the DataFrame to a numpy array
            npy_array = data.values

            # Construct the output .npy file path
            npy_file_path = os.path.join(root, file.replace('.tsv', '.npy'))

            # Save the numpy array to a .npy file
            np.save(npy_file_path, npy_array)

            print(f"Processed: {tsv_file_path} -> {npy_file_path}")

print("All files have been processed.")

Processed: /pscratch/sd/p/pakmasha/MBBN_data/Barcelona-HCPB_sub-008/Barcelona-HCPB_sub-008.tsv -> /pscratch/sd/p/pakmasha/MBBN_data/Barcelona-HCPB_sub-008/Barcelona-HCPB_sub-008.npy
Processed: /pscratch/sd/p/pakmasha/MBBN_data/Brazil_sub-C002061/Brazil_sub-C002061.tsv -> /pscratch/sd/p/pakmasha/MBBN_data/Brazil_sub-C002061/Brazil_sub-C002061.npy
Processed: /pscratch/sd/p/pakmasha/MBBN_data/Yale-Pittinger-HCP-Prisma_sub-YaleHCPPrismapb3225/Yale-Pittinger-HCP-Prisma_sub-YaleHCPPrismapb3225.tsv -> /pscratch/sd/p/pakmasha/MBBN_data/Yale-Pittinger-HCP-Prisma_sub-YaleHCPPrismapb3225/Yale-Pittinger-HCP-Prisma_sub-YaleHCPPrismapb3225.npy
Processed: /pscratch/sd/p/pakmasha/MBBN_data/Seoul-SNU_sub-NOR117CSJ/Seoul-SNU_sub-NOR117CSJ.tsv -> /pscratch/sd/p/pakmasha/MBBN_data/Seoul-SNU_sub-NOR117CSJ/Seoul-SNU_sub-NOR117CSJ.npy
Processed: /pscratch/sd/p/pakmasha/MBBN_data/Dresden_sub-GEROME3073/Dresden_sub-GEROME3073.tsv -> /pscratch/sd/p/pakmasha/MBBN_data/Dresden_sub-GEROME3073/Dresden_sub-GEROME307

### Check if the data has been converted successfully

In [60]:
import os
import numpy as np
import pandas as pd

tsv_file_path = '/pscratch/sd/p/pakmasha/MBBN_data/Brazil_sub-C002061/Brazil_sub-C002061.tsv'
npy_file_path = '/pscratch/sd/p/pakmasha/MBBN_data/Brazil_sub-C002061/Brazil_sub-C002061.npy'

# Load the original TSV data
data = pd.read_csv(tsv_file_path, sep='\t')

# Load the corresponding NPY data
npy_array = np.load(npy_file_path)

# Print comparison
print(f"TSV File: {os.path.basename(tsv_file_path)}")
print("TSV Head:")
print(data.head())
print("TSV Shape:", data.shape)

print(f"NPY File: {os.path.basename(npy_file_path)}")
print("NPY Head:")
print(npy_array[:5, :5])  # Display the first 5 rows and columns
print("NPY Shape:", npy_array.shape)
print("\n")

TSV File: Brazil_sub-C002061.tsv
TSV Head:
   17Networks_LH_VisCent_ExStr_1  17Networks_LH_VisCent_ExStr_2  \
0                    9882.932166                   10300.414399   
1                    9867.733327                   10246.597112   
2                    9886.378702                   10288.226085   
3                    9901.066246                   10297.784162   
4                    9881.410635                   10288.765738   

   17Networks_LH_VisCent_ExStr_4  17Networks_LH_VisCent_ExStr_6  \
0                    8943.649037                    7159.924905   
1                    8910.039349                    7169.504028   
2                    8972.189467                    7171.974852   
3                    8954.843609                    7154.729649   
4                    8937.796606                    7143.793038   

   17Networks_LH_VisCent_ExStr_7  17Networks_LH_VisCent_ExStr_9  \
0                    9088.905769                    8607.286626   
1                

### Remove columns to match the number of ROIs to the number of attention heads

##### Identify ROIs with the lowest variance

In [61]:
import os
import numpy as np
import pandas as pd

# Define the base path where the .npy files are stored
base_path = '/pscratch/sd/p/pakmasha/MBBN_data'

# Initialize a list to hold variance data for all subjects
all_variances = []

# Walk through all subdirectories and process .npy files
for root, dirs, files in os.walk(base_path):
    for file in files:
        if file.endswith('.npy') and file.split("_")[-2] != "0.01":
            # Construct full path to the .npy file
            npy_file_path = os.path.join(root, file)
            
            # Load the .npy file
            data = np.load(npy_file_path)
            
            # Compute variance across time points for each ROI
            variances = np.var(data, axis=0)
            
            # Append to the list of variances
            all_variances.append(variances)

# Convert the list of variances to a numpy array for easy computation
all_variances = np.array(all_variances)
print(f"Dimension of the all_variances array: {all_variances.shape}")

# Compute mean variance for each ROI across all subjects
mean_variances = np.mean(all_variances, axis=0)

# Define the number of ROIs to identify
num_rois_to_remove = 2

# Identify indices of ROIs with the lowest mean variance
lowest_variance_indices = np.argsort(mean_variances)[:num_rois_to_remove]

# Output the indices of ROIs to remove and their corresponding mean variances
rois_to_remove = {
    "indices": lowest_variance_indices,
    "mean_variances": mean_variances[lowest_variance_indices]
}

print("ROIs with the lowest mean variances:", rois_to_remove)


Dimension of the all_variances array: (2095, 318)
ROIs with the lowest mean variances: {'indices': array([307, 317]), 'mean_variances': array([385.95870799, 402.5133708 ])}


##### Check the names of chosen ROIs

In [62]:
import os
import numpy as np
import pandas as pd

# File paths for the .tsv files
tsv_files = [
    "/pscratch/sd/p/pakmasha/MBBN_data/Amsterdam-VUmc_sub-916002/Amsterdam-VUmc_sub-916002.tsv",
    "/pscratch/sd/p/pakmasha/MBBN_data/Bangalore-NIMHANS_sub-C0069/Bangalore-NIMHANS_sub-C0069.tsv"
]

# Iterate over the files and check column names for the specified indices
for tsv_file in tsv_files:
    # Load the .tsv file
    data = pd.read_csv(tsv_file, sep='\t')

    # Get column names for the specified indices
    column_names = data.columns[rois_to_remove["indices"]]

    # Print the results
    print(f"Column names in file {tsv_file} for ROIs to remove:")
    print(column_names.tolist())
    print("-")


Column names in file /pscratch/sd/p/pakmasha/MBBN_data/Amsterdam-VUmc_sub-916002/Amsterdam-VUmc_sub-916002.tsv for ROIs to remove:
['FreeSurfer_Left-Thalamus', 'Buckner2011_17Networks_4']
-
Column names in file /pscratch/sd/p/pakmasha/MBBN_data/Bangalore-NIMHANS_sub-C0069/Bangalore-NIMHANS_sub-C0069.tsv for ROIs to remove:
['FreeSurfer_Left-Thalamus', 'Buckner2011_17Networks_4']
-


##### Remove the chosen ROIs from numpy files

In [63]:
import os
import numpy as np
import pandas as pd

# File paths for the .tsv files and .npy files
base_path = "/pscratch/sd/p/pakmasha/MBBN_data"

# Walk through the directory structure to find .npy files
for root, dirs, files in os.walk(base_path):
    for file in files:
        if file.endswith('.npy') and file.split("_")[-2] != "0.01":
            # Construct the full path to the .npy file
            npy_file_path = os.path.join(root, file)

            # Load the .npy file
            data = np.load(npy_file_path)

            # Exclude the specified ROIs by removing the corresponding columns
            data_reduced = np.delete(data, rois_to_remove["indices"], axis=1)

            # Save the modified data back to the same file
            np.save(npy_file_path, data_reduced)

            print(f"Updated file: {npy_file_path} by removing ROIs with indices {rois_to_remove['indices']}")


Updated file: /pscratch/sd/p/pakmasha/MBBN_data/Barcelona-HCPB_sub-008/Barcelona-HCPB_sub-008.npy by removing ROIs with indices [307 317]
Updated file: /pscratch/sd/p/pakmasha/MBBN_data/Brazil_sub-C002061/Brazil_sub-C002061.npy by removing ROIs with indices [307 317]
Updated file: /pscratch/sd/p/pakmasha/MBBN_data/Yale-Pittinger-HCP-Prisma_sub-YaleHCPPrismapb3225/Yale-Pittinger-HCP-Prisma_sub-YaleHCPPrismapb3225.npy by removing ROIs with indices [307 317]
Updated file: /pscratch/sd/p/pakmasha/MBBN_data/Seoul-SNU_sub-NOR117CSJ/Seoul-SNU_sub-NOR117CSJ.npy by removing ROIs with indices [307 317]
Updated file: /pscratch/sd/p/pakmasha/MBBN_data/Dresden_sub-GEROME3073/Dresden_sub-GEROME3073.npy by removing ROIs with indices [307 317]
Updated file: /pscratch/sd/p/pakmasha/MBBN_data/Bergen_sub-00059/Bergen_sub-00059.npy by removing ROIs with indices [307 317]
Updated file: /pscratch/sd/p/pakmasha/MBBN_data/Bangalore-NIMHANS_sub-C0181/Bangalore-NIMHANS_sub-C0181.npy by removing ROIs with indice

##### Check the resulting .npy time series

In [64]:
filepath = "/pscratch/sd/p/pakmasha/MBBN_data/Amsterdam-VUmc_sub-916002/Amsterdam-VUmc_sub-916002.npy"
data = np.load(filepath)
print(data[:5, :5])
print(data.shape)

[[ 9145.17638384 11876.1617999  10539.996906    9909.67273961
  12182.05232445]
 [ 9105.99220016 11892.09795426 10480.8387419   9894.4914222
  12208.50964906]
 [ 9173.30531867 11909.49505644 10526.53957154  9916.17713178
  12204.80658558]
 [ 9114.02343587 11913.53680219 10533.05121727  9921.15400552
  12174.43004885]
 [ 9107.7442333  11887.30409721 10502.35180521  9909.02612338
  12180.53592623]]
(197, 316)


### Check the padding

In [65]:
npy_file_path = '/pscratch/sd/p/pakmasha/MBBN_data/Brazil_sub-C002061/Brazil_sub-C002061.npy'
npy_array = np.load(npy_file_path)

print("NPY Head:")
print(npy_array[-5:, -5:])  # Display the last 5 rows and columns
print("NPY Shape:", npy_array.shape)
print("\n")

NPY Head:
[[11839.71695139 12283.98534889 10881.35059289  8006.11719252
  11800.52840287]
 [11784.89720195 12286.79847325 10839.91277137  7948.83190584
  11832.30037806]
 [11842.9440764  12268.80825222 10855.42052007  7930.41527317
  11756.43465693]
 [11849.03353231 12279.04862801 10891.83992839  8011.15036387
  11761.88429507]
 [11853.5938301  12294.95969945 10894.4448558   7999.24655528
  11804.12010756]]
NPY Shape: (130, 316)




### Apply band-pass filtering to .npy files (SKIP FOR NOW)

In [103]:
from scipy.signal import butter, filtfilt

def bandpass_filter(data, lowcut, highcut, fs, order=4):
    """
    Apply a band-pass filter to the fMRI time-series data.

    Parameters:
    - data (numpy array): Input time-series data (1D).
    - lowcut (float): Low cutoff frequency in Hz.
    - highcut (float): High cutoff frequency in Hz.
    - fs (float): Sampling frequency (1/TR in Hz).
    - order (int): Order of the Butterworth filter.

    Returns:
    - filtered_data (numpy array): Band-pass filtered data.
    """
    nyquist = 0.5 * fs  # Nyquist frequency
    low = lowcut / nyquist
    high = highcut / nyquist
    b, a = butter(order, [low, high], btype='band')  # Band-pass filter coefficients
    filtered_data = filtfilt(b, a, data)  # Apply the filter
    return filtered_data


In [106]:
def lorentzian_function(x, s0, corner):
    return (s0*corner**2) / (x**2 + corner**2)

In [107]:
def multi_fractal_function(x, beta_low, beta_high, A, B, corner):
    return np.where(x < corner, A * x**beta_low, B * x**beta_high)

In [108]:
# from nitime.timeseries import TimeSeries
# from nitime.analysis import SpectralAnalyzer, FilterAnalyzer, NormalizationAnalyzer
# from scipy.optimize import curve_fit

# # Define the base path where the .tsv files are stored
# base_path = '/pscratch/sd/p/pakmasha/MBBN_data'

# # Create a dictionary to store knee values for each site
# f1_dict = {}
# f2_dict = {}
# seq_len_dict = {}
# error_count = 0

# # Walk through all subdirectories and process .npy files
# for root, dirs, files in os.walk(base_path):
#     for file in files:
#         if file.endswith('.npy') and file.split('_')[-2] != "smoothed" and file.split('_')[-2] != "0.01":
#             # Construct full path to the .tsv file
#             npy_file_path = os.path.join(root, file)
#             # print(f"npy_file_path: {npy_file_path}")

#             # Calculate the repetition time (TR) depending on the site
#             site = file.split('_')[-2]
#             if 'Amsterdam-AMC' in site:
#                 TR = 2.375
#             elif 'Amsterdam-VUmc' in site:
#                 TR = 1.8
#             elif 'Barcelona-HCPB' in site:
#                 TR = 2
#             elif 'Bergen' in site:
#                 TR = 1.8
#             elif 'Braga-UMinho-Braga-1.5T' in site:
#                 TR = 2
#             elif 'Braga-UMinho-Braga-1.5T-act' in site:
#                 TR = 2
#             elif 'Braga-UMinho-Braga-3T' in site:
#                 TR = 1
#             elif 'Brazil' in site:
#                 TR = 2
#             elif 'Cape-Town-UCT-Allegra' in site:
#                 TR = 1.6
#             elif 'Cape-Town-UCT-Skyra' in site:
#                 TR = 1.73
#             elif 'Chiba-CHB' in site:
#                 TR = 2.3
#             elif 'Chiba-CHBC' in site:
#                 TR = 2.3 
#             elif 'Chiba-CHBSRPB' in site:
#                 TR = 2.5 
#             elif 'Dresden' in site:
#                 TR = 0.8 
#             elif 'Kyoto-KPU-Kyoto1.5T' in site:
#                 TR = 2.411 
#             elif 'Kyoto-KPU-Kyoto3T' in site:
#                 TR = 2
#             elif 'Kyushu' in site:
#                 TR = 2.5
#             elif 'Milan-HSR' in site:
#                 TR = 2
#             elif 'New-York' in site:
#                 TR = 1
#             elif 'NYSPI-Columbia-Adults' in site:
#                 TR = 0.85
#             elif 'NYSPI-Columbia-Pediatric' in site:
#                 TR = 0.85
#             elif 'Yale-Pittinger-HCP-Prisma' in site:
#                 TR = 0.8
#             elif 'Yale-Pittinger-HCP-Trio' in site:
#                 TR = 0.7
#             elif 'Yale-Pittinger-Yale-2014' in site:
#                 TR = 2
#             elif 'Bangalore-NIMHANS' in site:
#                 TR = 2 
#             elif 'Barcelone-Bellvitge-ANTIGA-1.5T' in site:
#                 TR = 2
#             elif 'Barcelone-Bellvitge-COMPULSE-3T' in site:
#                 TR = 2
#             elif 'Barcelone-Bellvitge-PROV-1.5T' in site:
#                 TR = 2
#             elif 'Barcelone-Bellvitge-RESP-CBT-3T' in site:
#                 TR = 2
#             elif 'Seoul-SNU' in site:
#                 TR = 3.5
#             elif 'Shanghai-SMCH' in site:
#                 TR = 3
#             elif 'UCLA' in site:
#                 TR = 2
#             elif 'Vancouver-BCCHR' in site:
#                 TR = 2
#             elif 'Yale-Gruner' in site:
#                 TR = 2
#             else:
#                 raise ValueError(f"Site '{site}' does not have a defined TR value in TR_mappings. Please add it.")

#             # Load the .npy file and calculate knee frequencies
#             y = np.load(npy_file_path).T
#             sequence_length = y.shape[1]   # use the original number of points
#             # print(f"y: {y}")
#             # print(f"y.shape: {y.shape}")

#             try: 
#                 fs = 1 / TR  # Sampling frequency in Hz
#                 lowcut = 0.01  # Low cutoff frequency in Hz
#                 highcut = 0.1  # High cutoff frequency in Hz

#                 # Apply the filter to each row (ROI) in the 2D array `y`
#                 y = np.array([bandpass_filter(roi, lowcut, highcut, fs) for roi in y])

#                 # Save the filtered time series
#                 # Save the filtered time series
#                 filtered_file_name = f"{os.path.splitext(file)[0]}_filtered_{lowcut}_{highcut}.npy"
#                 filtered_file_path = os.path.join(root, filtered_file_name)
#                 np.save(filtered_file_path, y.T)
#                 print(f"Successfully save {filtered_file_name} file")

#                 sample_whole = np.zeros(sequence_length,) # originally sequence_length   ## aggregates time-series data across ROIs   # sample_whole.shape = # of timepoints,

#                 ##### DEBUG STATEMENT #####
#                 # sample_whole = np.zeros(sequence_length - 20,)
#                 # print(f"sample_whole.shape: {sample_whole.shape}")
#                 ###########################

#                 intermediate_vec = y.shape[0]

#                 for i in range(intermediate_vec):
#                     # print(f"y[i] shape: {y[i].shape}")
#                     sample_whole+=y[i]

#                 sample_whole /= intermediate_vec    # averages the time-series signals (y) across a set number of ROIs

#                 # Smooth the averaged time series
#                 # fwhm = 2
#                 # smoothed_sample_whole = gaussian_smoothing_with_fwhm(sample_whole, fwhm)

#                 T = TimeSeries(sample_whole, sampling_interval=TR)  # computes power spectral density (PSD) of the averaged time-series signal
#                 S_original = SpectralAnalyzer(T)

#                 # Lorentzian function fitting (dividing ultralow ~ low)  ## extracts the PSD data
#                 xdata = np.array(S_original.spectrum_fourier[0][1:])  # xdata = frequency values  
#                 ydata = np.abs(S_original.spectrum_fourier[1][1:])    # ydata = corresponding power values
#                 # print(f"xdata.shape: {xdata.shape}")
#                 # print(f"ydata.shape: {ydata.shape}")

#                 # initial parameter setting
#                 p0 = [0, 0.006]   
#                 param_bounds = ([-np.inf, 0], [np.inf, 1])

#                 # fitting Lorentzian function
#                 popt, pcov = curve_fit(lorentzian_function, xdata, ydata, p0=p0, maxfev = 5000, bounds=param_bounds)   # popt = optimal parameters

#                 f1 = popt[1]

#                 knee = round(popt[1]/(1/(sample_whole.shape[0]*TR)))   # calculates knee frequency 
#                 # print(f"knee: {knee}")

#                 if knee <= 0:
#                     knee = 1

#                 if knee > ydata.shape[0]:
#                     print(f"knee value: {knee}")
#                     print(f"ydata.shape: {ydata.shape}")

#                 # divide low ~ high
#                 # initial parameter setting
#                 p1 = [2, 1, 23, 25, 0.16]
            
#                 # fitting multifractal function
#                 popt_mo, pcov = curve_fit(multi_fractal_function, xdata[knee:], ydata[knee:], p0=p1, maxfev = 50000)   # fits a multi-fractal model to the high-frequency range (above the knee)
#                 pink = round(popt_mo[-1]/(1/(sample_whole.shape[0]*TR)))   # pink = an additional boundary
#                 f2 = popt_mo[-1]

#                 # if file == "Brazil_sub-C001419_smoothed_2mm.npy":
#                 #     print("file == 'Brazil_sub-C001419_smoothed_2mm.npy'")
#                 #     print(f"y: {y[:2,]}")
#                 #     print(f"sample_whole: {sample_whole[:2,]}")
#                 #     print(f"knee: {knee}")
#                 #     print(f"f1: {f1}, f2: {f2}")

#                 # Save values to the dictionaries
#                 # Check if the key exists in the dictionary
#                 if site in f1_dict:
#                     # Append the value to the existing list
#                     f1_dict[site].append(f1)
#                     f2_dict[site].append(f2)
#                     seq_len_dict[site].append(sequence_length)
#                 else:
#                     # Create the key and initialize it with a list containing the value
#                     f1_dict[site] = [f1]
#                     f2_dict[site] = [f2]   
#                     seq_len_dict[site] = [sequence_length] 
#                 print(f"Successfully processed {file}")  
#             except Exception as e:
#                 print(f"Error processing: {file}")
#                 print(e)
#                 error_count += 1
#                 continue  # Skip the subject if an error occurs
            
# print(f"Knee frequencies f1: {f1_dict}")
# print(f"\nKnee frequencies f2: {f2_dict}")
# print(f"Sequence lengths: {seq_len_dict}")
# print(f"Error processing {error_count} files")


Successfully save Barcelona-HCPB_sub-008_filtered_0.01_0.1.npy file
Successfully processed Barcelona-HCPB_sub-008.npy
Successfully save Brazil_sub-C002061_filtered_0.01_0.1.npy file
Successfully processed Brazil_sub-C002061.npy


  return np.where(x < corner, A * x**beta_low, B * x**beta_high)
  popt_mo, pcov = curve_fit(multi_fractal_function, xdata[knee:], ydata[knee:], p0=p1, maxfev = 50000)   # fits a multi-fractal model to the high-frequency range (above the knee)


Successfully save Yale-Pittinger-HCP-Prisma_sub-YaleHCPPrismapb3225_filtered_0.01_0.1.npy file
Successfully processed Yale-Pittinger-HCP-Prisma_sub-YaleHCPPrismapb3225.npy
Successfully save Seoul-SNU_sub-NOR117CSJ_filtered_0.01_0.1.npy file
Successfully processed Seoul-SNU_sub-NOR117CSJ.npy


  return np.where(x < corner, A * x**beta_low, B * x**beta_high)


Successfully save Dresden_sub-GEROME3073_filtered_0.01_0.1.npy file
Successfully processed Dresden_sub-GEROME3073.npy
Successfully save Bergen_sub-00059_filtered_0.01_0.1.npy file
Successfully processed Bergen_sub-00059.npy
Successfully save Bangalore-NIMHANS_sub-C0181_filtered_0.01_0.1.npy file
Successfully processed Bangalore-NIMHANS_sub-C0181.npy
Successfully save Bangalore-NIMHANS_sub-C0128_filtered_0.01_0.1.npy file
Successfully processed Bangalore-NIMHANS_sub-C0128.npy
Successfully save Seoul-SNU_sub-DNO23LSM_filtered_0.01_0.1.npy file
Successfully processed Seoul-SNU_sub-DNO23LSM.npy
Successfully save Bangalore-NIMHANS_sub-ODP004_filtered_0.01_0.1.npy file
Successfully processed Bangalore-NIMHANS_sub-ODP004.npy
Successfully save Bangalore-NIMHANS_sub-ODP203_filtered_0.01_0.1.npy file
Successfully processed Bangalore-NIMHANS_sub-ODP203.npy
Successfully save Barcelone-Bellvitge-PROV-1.5T_sub-subIDIBELL15224_filtered_0.01_0.1.npy file
Successfully processed Barcelone-Bellvitge-PROV

### Remove subfolders that return error

In [2]:
import numpy as np

error_file = np.load("/pscratch/sd/p/pakmasha/MBBN_data/Vancouver-BCCHR_sub-032/Vancouver-BCCHR_sub-032.npy")
error_file.shape

(17, 316)

In [3]:
import shutil
import os

# Specify the folder path
folder_path = "/pscratch/sd/p/pakmasha/MBBN_data/Vancouver-BCCHR_sub-032"

# Check if the folder exists
if os.path.exists(folder_path):
    # Remove the folder
    shutil.rmtree(folder_path)
    print(f"Folder '{folder_path}' has been removed.")
else:
    print(f"Folder '{folder_path}' does not exist.")

Folder '/pscratch/sd/p/pakmasha/MBBN_data/Vancouver-BCCHR_sub-032' has been removed.


# Number of subjects per site in the final meta-data

In [66]:
from collections import Counter
import pandas as pd

In [67]:
metadata = pd.read_csv("/global/homes/p/pakmasha/model/MBBN-main/data/metadata/ENIGMA_QC_final_subject_list.csv")
metadata.head()

Unnamed: 0,Sample,Subject ID,Formatted ID,Unique_ID,OCD,Age of onset,Medication,Y-BOCS,Age,Age range,...,Education,Depression current,Depression lifetime,Anxiety current,Anxiety lifetime,Agr_Check,Clean,Sex_Rel,Hoard,Ord
0,Amsterdam-VUmc,sub-916002,sub-916002,Amsterdam-VUmc_sub-916002,1,1.0,2.0,24.0,22.0,3,...,11.0,2.0,2.0,1.0,1,1.0,1.0,0.0,0.0,1.0
1,Amsterdam-VUmc,sub-916005,sub-916005,Amsterdam-VUmc_sub-916005,2,,1.0,,27.0,3,...,9.0,0.0,0.0,0.0,0,,,,,
2,Amsterdam-VUmc,sub-916006,sub-916006,Amsterdam-VUmc_sub-916006,1,2.0,2.0,15.0,21.0,3,...,11.0,1.0,1.0,1.0,1,,,,,
3,Amsterdam-VUmc,sub-916007,sub-916007,Amsterdam-VUmc_sub-916007,2,,2.0,,28.0,3,...,11.0,0.0,0.0,0.0,0,,,,,
4,Amsterdam-VUmc,sub-916008,sub-916008,Amsterdam-VUmc_sub-916008,1,1.0,2.0,29.0,35.0,3,...,11.0,2.0,1.0,1.0,1,1.0,0.0,0.0,1.0,1.0


In [68]:
# Check the number of subjects per site
value_counts = Counter(metadata["Sample"].values)
total_count = 0
site_list = []
count_list = []

for value, count in value_counts.items():
    total_count += count
    site_list.append(value)
    count_list.append(count)

count_by_site = pd.DataFrame({
    "Sample": site_list,
    "Count": count_list
})

print(count_by_site)
print(f"\nTotal number of subjects: {total_count}")

count_by_site.to_csv("/global/homes/p/pakmasha/ENIGMA-OCD results/QC/count_by_site_final_metadata.csv", index=False)


                             Sample  Count
0                    Amsterdam-VUmc     57
1                 Bangalore-NIMHANS    358
2                    Barcelona-HCPB     64
3   Barcelone-Bellvitge-ANTIGA-1.5T    137
4     Barcelone-Bellvitge-PROV-1.5T     64
5   Barcelone-Bellvitge-RESP-CBT-3T     57
6                            Bergen     53
7           Braga-UMinho-Braga-1.5T     46
8       Braga-UMinho-Braga-1.5T-act     96
9             Braga-UMinho-Braga-3T     59
10                           Brazil     93
11              Cape-Town-UCT-Skyra     39
12                        Chiba-CHB     42
13                       Chiba-CHBC     53
14                    Chiba-CHBSRPB     89
15                          Dresden     35
16              Kyoto-KPU-Kyoto1.5T     25
17                Kyoto-KPU-Kyoto3T     70
18                         New-York     60
19            NYSPI-Columbia-Adults     70
20         NYSPI-Columbia-Pediatric     34
21                        Seoul-SNU     95
22         

In [69]:
import numpy as np
file = np.load("/pscratch/sd/p/pakmasha/MBBN_data/Amsterdam-VUmc_sub-916002/Amsterdam-VUmc_sub-916002.npy")
file[:5,:5]

Unnamed: 0,17Networks_LH_VisCent_ExStr_1,17Networks_LH_VisCent_ExStr_2,17Networks_LH_VisCent_ExStr_4,17Networks_LH_VisCent_ExStr_6,17Networks_LH_VisCent_ExStr_7,17Networks_LH_VisCent_ExStr_9,17Networks_LH_VisCent_ExStr_11,17Networks_LH_VisPeri_ExStrInf_1,17Networks_LH_VisPeri_ExStrInf_2,17Networks_LH_VisPeri_ExStrInf_3,...,FreeSurfer_Left-Caudate,FreeSurfer_Left-Putamen,FreeSurfer_Left-Pallidum,FreeSurfer_Left-Hippocampus,FreeSurfer_Right-Thalamus,FreeSurfer_Right-Caudate,FreeSurfer_Right-Putamen,FreeSurfer_Right-Pallidum,FreeSurfer_Right-Hippocampus,Buckner2011_17Networks_4
0,9145.176384,11876.1618,10539.996906,9909.67274,12182.052324,12286.508742,13441.939207,11276.891624,11320.904209,11326.232646,...,9029.339474,9289.592661,6477.132319,10495.065684,10515.193261,9279.207549,9544.702826,6668.395977,10750.522046,11711.181555
1,9105.9922,11892.097954,10480.838742,9894.491422,12208.509649,12298.751867,13419.403041,11269.932218,11257.155561,11304.97167,...,9032.282088,9255.93939,6478.689935,10504.033438,10545.314457,9343.314049,9506.1604,6658.330508,10768.518774,11666.15302
2,9173.305319,11909.495056,10526.539572,9916.177132,12204.806586,12303.074369,13495.095016,11314.119712,11338.619041,11363.665153,...,9057.712913,9293.725019,6477.772751,10487.972583,10560.478543,9349.887929,9564.475793,6668.401333,10745.942752,11696.754098
3,9114.023436,11913.536802,10533.051217,9921.154006,12174.430049,12255.664426,13448.391866,11272.411651,11309.798849,11373.996202,...,9062.613141,9302.189004,6494.34806,10510.149784,10548.838791,9333.932134,9559.301272,6647.370279,10788.813854,11675.98362
4,9107.744233,11887.304097,10502.351805,9909.026123,12180.535926,12252.981479,13438.709384,11247.493971,11300.423973,11353.674448,...,9104.704954,9259.560647,6474.510185,10499.573716,10553.755792,9349.947284,9537.117157,6661.175796,10755.54683,11662.329826
