### Create a DataFrame of subjects with available timeseries data & Check subjects from the QC.json that don't have the data

In [15]:
import os
import json
import pandas as pd

# Load the QC JSON file
qc_file_path = "/global/homes/p/pakmasha/ENIGMA-OCD results/QC/QC.json"
with open(qc_file_path, "r") as f:
    qc_data = json.load(f)

# Define the base directory
base_dir = "/pscratch/sd/p/pakmasha/ENIGMA_unzip"

# Initialize a list to store missing subjects
# subjects_with_timeseries_data = []
missing_subjects = []

# Traverse through sample folders and subsample folders
for qc_entry in qc_data:
    site = qc_entry["site"]
    failed_subjects = qc_entry["failed_subjects"]
    
    # Determine if the site has subsamples
    site_path = os.path.join(base_dir, site)
    if os.path.exists(site_path):
        halfpipe_path = os.path.join(site_path, "halfpipe")
        
        if os.path.exists(halfpipe_path):
            # Direct sample with a "halfpipe" folder
            available_subjects = os.listdir(halfpipe_path)
            
            # Store formatted IDs and sample info
            # for subject in available_subjects:
            #     subjects_with_timeseries_data.append({"Sample": site, "Formatted ID": subject})
            
            for subject in failed_subjects:
                if subject not in available_subjects:
                    missing_subjects.append({"Subject": subject, "Sample": site})
        else:
            # Subsamples exist
            for subsample_folder in os.listdir(site_path):
                subsample_path = os.path.join(site_path, subsample_folder)
                halfpipe_path = os.path.join(subsample_path, "halfpipe")
                
                if os.path.exists(halfpipe_path):
                    available_subjects = os.listdir(halfpipe_path)
                    
                    # Store formatted IDs and sample/subsample info
                    # for subject in available_subjects:
                    #     subjects_with_timeseries_data.append(
                    #         {"Sample": f"{site}/{subsample_folder}", "Formatted ID": subject}
                    #     )
                    
                    for subject in failed_subjects:
                        if subject not in available_subjects:
                            missing_subjects.append(
                                {"Subject": subject, "Sample": f"{site}/{subsample_folder}"}
                            )
                else:
                    # If there's no halfpipe, log the issue
                    print(f"No halfpipe folder found for {site}/{subsample_folder}")
    else:
        print(f"Site folder not found: {site}")

# # Create a DataFrame from subjects_with_timeseries_data
# Subjects_with_timeseries_data = pd.DataFrame(subjects_with_timeseries_data)        
        
# Print results
print(f"\nNumber of missing subjects: {len(missing_subjects)}")
for entry in missing_subjects:
    print(f"Missing Subject: {entry['Subject']}, Sample: {entry['Sample']}")
    
# # Display the Subjects_with_timeseries_data DataFrame
# print("\nSubjects with timeseries data:")
# print(Subjects_with_timeseries_data.head())    

# # Save the DataFrame to a CSV file
# Subjects_with_timeseries_data.to_csv("/global/homes/p/pakmasha/ENIGMA-OCD results/QC/Subjects_with_timeseries_data.csv", index=False)


Number of missing subjects: 27
Missing Subject: sub-subSEQ1NKISENR45, Sample: New_York
Missing Subject: sub-subSEQ1NKISENR68, Sample: New_York
Missing Subject: sub-subSEQ1NKISENR107, Sample: New_York
Missing Subject: sub-subSEQ1NKISENR116, Sample: New_York
Missing Subject: sub-subSEQ1NKISENR118, Sample: New_York
Missing Subject: sub-subSEQ1NKISENR145, Sample: New_York
Missing Subject: sub-subSEQ1NKISENR149, Sample: New_York
Missing Subject: sub-subSEQ1NKISENR151, Sample: New_York
Missing Subject: sub-subSEQ1NKISENR176, Sample: New_York
Missing Subject: sub-subSEQ1NKISENR45, Sample: New_York
Missing Subject: sub-subSEQ1NKISENR68, Sample: New_York
Missing Subject: sub-subSEQ1NKISENR107, Sample: New_York
Missing Subject: sub-subSEQ1NKISENR116, Sample: New_York
Missing Subject: sub-subSEQ1NKISENR118, Sample: New_York
Missing Subject: sub-subSEQ1NKISENR145, Sample: New_York
Missing Subject: sub-subSEQ1NKISENR149, Sample: New_York
Missing Subject: sub-subSEQ1NKISENR151, Sample: New_York
Mis

### Check the number of subjects with available data in each sample

In [16]:
# Count the number of subjects for each sample
subject_counts = Subjects_with_timeseries_data.groupby("Sample").nunique()

# Display the result
print(subject_counts)


                                 Formatted ID
Sample                                       
Amsterdam_AMC                              50
Amsterdam_VUmc                             83
Bangalore_NIMHANS                         470
Barcelona_HCPB                            103
Barcelone_Bellvitge/ANTIGA_1.5T           196
Barcelone_Bellvitge/COMPULSE_3T            37
Barcelone_Bellvitge/PROV_1.5T             107
Barcelone_Bellvitge/RESP_CBT_3T            75
Bergen                                     70
Braga_UMinho/Braga_1.5T                    49
Braga_UMinho/Braga_1.5T_act               113
Braga_UMinho/Braga_3T                      63
Brazil                                    115
Cape_Town_UCT/Allegra                      11
Cape_Town_UCT/Skyra                        46
Chiba/CHB                                  46
Chiba/CHBC                                 60
Chiba/CHBSRPB                              99
Dresden                                    56
Kyoto_KPU/Kyoto1.5T               

### Double-check specific folders if needed

In [14]:
import os

# Specify the folder path
folder_path = "/pscratch/sd/p/pakmasha/ENIGMA_unzip/Dresden/halfpipe"

# Count the number of files in the folder
folder_count = len([f for f in os.listdir(folder_path) if os.path.isdir(os.path.join(folder_path, f))])

# Print the result
print(f"Number of files in the folder '{folder_path}': {folder_count}")


Number of files in the folder '/pscratch/sd/p/pakmasha/ENIGMA_unzip/Dresden/halfpipe': 56
