In [1]:
import pandas as pd
import os
import subprocess

# download the suggested exclusions file from ds004636 openneuro dataset and add to the directory. 
# label the first column's name as "scan_id"
# Load the CSV
csv_path = "suggested_exclusions.csv"
df = pd.read_csv(csv_path, header=None)  # No header in CSV
df.columns = ["scan_id", "MRIQC_fail"]

# Directories
mriqc_failed_dir = "mriqc_failed"
mriqc_passed_dir = "mriqc_passed"

os.makedirs(mriqc_failed_dir, exist_ok=True)
os.makedirs(mriqc_passed_dir, exist_ok=True)

In [3]:
# Suggested exclusions csv file only provides subject and task info. 
# We need to construct the name of the html file fully to 
# pull it from datalad. 
# scan_id comes from suggested_exclusions.csv

def get_html_filename(scan_id):
    """Convert scan_id to dataset's HTML filename format."""
    # split the scan_id by _ between subject and task info 
    parts = scan_id.split("_")
    subject = parts[0][1:]  # Remove leading 's' from subject number
    task = parts[1]
    
    # Construct filename
    # this assumes all html file names include "ses-1 (session 1)"
    # we correct for "ses-2" later in code
    return f"sub-s{subject}_ses-1_task-{task}_run-1_bold.html"

In [3]:
# **Step 1: Download failed scans (MRIQC_fail == 1.0)**
failed_files = []
# get all the scan_id's of files with mriqc_fail == 1.0
failed_scans = df[df["MRIQC_fail"] == "1.0"]["scan_id"]

# Save these files to mriqc_failed.txt 
with open(os.path.join(mriqc_failed_dir, "mriqc_failed.txt"), "w") as f:
    for file in failed_scans:
        f.write(file + "\n")

IN ORDER TO EXECUTE CODE BELOW, WE NEED TO INSTALL DATALAD:

"brew install datalad"

AND WE NEED TO USE DATALAD GET WITH DATASET URL

"datalad clone https://openneuro.org/datasets/ds004636/versions/1.0.4"

THIS WILL INSTALL ALL THE METADATA THAT WILL ALLOW US TO GET DESIRED HTMLS 
IN DS004636/DERIVATIVES/MRIQC DIRECTORY

In [None]:
# THIS CODE GETS ALL THE FILES IN FAILED_SCANS USING DATALAD
# WE DON'T NEED TO RUN IT AGAIN IF YOU HAVE FILES
# IT WON'T WORK IF YOU HAVEN'T CLONED THE OPENNEURO DATASET (STATED ABOVE)

for scan_id in failed_scans:
    filename = get_html_filename(scan_id)
    dataset_path = f"ds004636/derivatives/mriqc/{filename}"

    # Download file using Datalad
    # save all the succesfully downloaded file names to failed_files
    try:
        subprocess.run(["/opt/homebrew/bin/datalad", "get", "-d", "ds004636", dataset_path], check=True)
        failed_files.append(filename)

    except subprocess.CalledProcessError:

        # if datalad get failed, replace "ses-1" to "ses2"
        if "ses-1" in filename:
            filename = filename.replace("ses-1", "ses-2")
            dataset_path = f"ds004636/derivatives/mriqc/{filename}"
            try:
                subprocess.run(["/opt/homebrew/bin/datalad", "get", "-d", "ds004636", dataset_path], check=True)
                failed_files.append(filename)

            except subprocess.CalledProcessError:
                print(f"Failed to download: {filename}")

In [None]:
# **Step 2: Download 10 passing examples for each task**
import random
import subprocess

# the array mriqc_passed keeps track of the files that are downloaded
mriqc_passed = []
tasks = {"ANT", "CCTHot", "WATT3", "stopSignal", "twoByTwo", "DPX", "discountFix", "motorSelectiveStop", "stroop", "surveyMedley"}

# download file with the  name
def download_file(filename):
    dataset_path = f"ds004636/derivatives/mriqc/{filename}"
    
    try:
        subprocess.run(["datalad", "get", "-d", "ds004636", dataset_path], check=True)
        # if downloads file, add to included files
        mriqc_passed.append(filename)
        return True
    
    except subprocess.CalledProcessError:

        # if can't find name with "ses-1", try "ses-2"
        if "ses-1" in filename:
                filename = filename.replace("ses-1", "ses-2")
                dataset_path = f"ds004636/derivatives/mriqc/{filename}"
                try:
                    subprocess.run(["datalad", "get", "-d", "ds004636", dataset_path], check=True)
                    mriqc_passed.append(filename)
                except subprocess.CalledProcessError:
                    # try "ses-3"
                    if "ses-2" in filename:
                        filename = filename.replace("ses-2", "ses-3")
                        dataset_path = f"ds004636/derivatives/mriqc/{filename}"
                        try:
                            subprocess.run(["datalad", "get", "-d", "ds004636", dataset_path], check=True)
                            mriqc_passed.append(filename)
                        except subprocess.CalledProcessError:
                            pass
        else:
            pass


# code to download select 10 of mriqc passed scans per task
for task in tasks:
    # get all files from suggested exclusions that haven't failed mriqc and that include the "task" name in its title
    passing_scans = df[(df["MRIQC_fail"] == "0.0") & (df["scan_id"].str.contains(task))]
    
    # set them as available to download
    available_scans = set(passing_scans["scan_id"])

    selected_samples = set()
    
    # continue as long as selected samples have less than 10 items 
    # and there exists an item in available scans
    while len(selected_samples) < 10 and available_scans:

        # remaining options include all left in available_scans that aren't in selected_samples
        remaining_options = list(available_scans - selected_samples)
        
        # end the for loop (move on to the next task)
        # if nothing no remaining option left
        # (sometimes there are not 10 distinct files for each task on the dataset)
        if not remaining_options:
            break 

        # chose randomly one of the remaining options and add to selected_scans
        scan_id = random.choice(remaining_options)
        selected_samples.add(scan_id)
        
        # get the full filename and download file
        filename = get_html_filename(scan_id)
        if download_file(filename):
            continue
        else:
            # If no file exists, pick a new sample
            selected_samples.remove(scan_id)
            available_scans.discard(scan_id)

In [None]:
all_passing_scans = df[df["mriqc_failed"] == "0.0"]["scan_id"]

# Save list of included files TO mriqc
with open(os.path.join(mriqc_passed_dir, "mriqc_passed.txt"), "w") as f:
    for name in all_passing_scans:

        # if an item ni all_passing_scans is in mriqc_passed (the list which displays all downloaded mriqc_passed files)
        # then write the name to mriqc_passed.txt
        if get_html_filename(name) in mriqc_passed:
            f.write(name + "\n")

In [None]:
# mriqc_passed.txt contained the full file names. 
# this code converts all file names back to shortened versions included in suggested exclusions csv

temp_arr = []
with open(os.path.join(mriqc_passed_dir, "mriqc_passed.txt"), "r") as f:
    for line in f:
        alist = line.split("_")
        name = alist[0][-4:] + "_" + alist[2].split("-")[1]
        temp_arr.append(name)

# Save list of included files TO mriqc
with open(os.path.join(mriqc_passed_dir, "mriqc_passed.txt"), "w") as f:
    for name in temp_arr:
        f.write(name + "\n")

In [16]:
# create 2 other txt files with the full names of the documents
real_names_failed_list = []
with open(os.path.join(mriqc_failed_dir, "mriqc_failed.txt"), "r") as f:
    for line in f:
        real_names_failed_list.append(line.strip("\n"))


with open(os.path.join(mriqc_failed_dir, "mriqc_failed_fullname.txt"), "w") as k:
    for elem in real_names_failed_list:
        full_name = get_html_filename(elem)
        if os.path.exists(os.path.join("ds004636/derivatives/mriqc_failed/", full_name)):
            k.write(full_name + "\n")
        else:
            full_name = full_name.replace("ses-1", "ses-2")
            if os.path.exists(os.path.join("ds004636/derivatives/mriqc_failed/", full_name)):
                k.write(full_name + "\n")
            else:
                print(f"couldn't write full_name of {full_name}")

couldn't write full_name of sub-s445_ses-2_task-DPX_run-1_bold.html
couldn't write full_name of sub-s445_ses-2_task-discountFix_run-1_bold.html
couldn't write full_name of sub-s445_ses-2_task-motorSelectiveStop_run-1_bold.html
