In [20]:
import pandas as pd
import os
import subprocess
import random

# Load the CSV
csv_path = "suggested_exclusions.csv"
df = pd.read_csv(csv_path, header=None)  # No header in CSV
df.columns = ["scan_id", "MRIQC_fail"]

# Directories
mriqc_failed_dir = "mriqc_failed"
all_included_dir = "all_included"

os.makedirs(mriqc_failed_dir, exist_ok=True)
os.makedirs(all_included_dir, exist_ok=True)

In [21]:
# Function to construct filenames
def get_html_filename(scan_id):
    """Convert scan_id to dataset's HTML filename format."""
    parts = scan_id.split("_")
    subject = parts[0][1:]  # Remove leading 's' from subject number
    task = parts[1]
    
    # Construct filename
    return f"sub-s{subject}_ses-1_task-{task}_run-1_bold.html"

In [13]:
# **Step 1: Download failed scans (MRIQC_fail == 1.0)**
failed_files = []
failed_scans = df[df["MRIQC_fail"] == "1.0"]["scan_id"]

for scan_id in failed_scans:
    filename = get_html_filename(scan_id)
    dataset_path = f"ds004636/derivatives/mriqc/{filename}"

    # Download file using Datalad
    try:
        subprocess.run(["datalad", "get", "-d", "ds004636", dataset_path], check=True)
        failed_files.append(filename)

    except subprocess.CalledProcessError:
        if "ses-1" in filename:
            filename = filename.replace("ses-1", "ses-2")
            dataset_path = f"ds004636/derivatives/mriqc/{filename}"
            try:
                subprocess.run(["datalad", "get", "-d", "ds004636", dataset_path], check=True)
                failed_files.append(filename)

            except subprocess.CalledProcessError:
                print(f"Failed to download: {filename}")

get(impossible): derivatives/mriqc/sub-s607_ses-1_task-CCTHot_run-1_bold.html [path does not exist]
get(impossible): derivatives/mriqc/sub-s607_ses-1_task-WATT3_run-1_bold.html [path does not exist]
get(ok): derivatives/mriqc/sub-s607_ses-2_task-WATT3_run-1_bold.html (file) [from s3-PUBLIC...]
get(impossible): derivatives/mriqc/sub-s607_ses-1_task-stopSignal_run-1_bold.html [path does not exist]
get(impossible): derivatives/mriqc/sub-s607_ses-1_task-twoByTwo_run-1_bold.html [path does not exist]
get(ok): derivatives/mriqc/sub-s499_ses-1_task-WATT3_run-1_bold.html (file) [from s3-PUBLIC...]
get(ok): derivatives/mriqc/sub-s373_ses-1_task-WATT3_run-1_bold.html (file) [from s3-PUBLIC...]
get(impossible): derivatives/mriqc/sub-s445_ses-1_task-ANT_run-1_bold.html [path does not exist]
get(impossible): derivatives/mriqc/sub-s445_ses-1_task-CCTHot_run-1_bold.html [path does not exist]
get(impossible): derivatives/mriqc/sub-s445_ses-1_task-WATT3_run-1_bold.html [path does not exist]
get(ok): de

In [16]:
# Save list of failed files
with open(os.path.join(mriqc_failed_dir, "mriqc_failed.txt"), "w") as f:
    for file in failed_files:
        f.write(file + "\n")

In [None]:
"""not downloaded
s445_DPX,1.0
s445_discountFix,1.0
s445_motorSelectiveStop,1.0
"""

In [24]:
# **Step 2: Download 5 passing examples for each task**
included_files = []
tasks = {"ANT", "CCTHot", "WATT3", "stopSignal", "twoByTwo","DPX", "discountFix", "motorSelectiveStop", "stroop", "surveyMedley"}

for task in tasks:
    passing_scans = df[(df["MRIQC_fail"] == "0.0") & (df["scan_id"].str.contains(task))]

    if len(passing_scans) >= 5:
        chosen_samples = passing_scans.sample(5)["scan_id"]
    else:
        chosen_samples = passing_scans["scan_id"]

    for scan_id in chosen_samples:
        filename = get_html_filename(scan_id)
        dataset_path = f"ds004636/derivatives/mriqc/{filename}"
        
        # Download file using Datalad
        try:
            subprocess.run(["datalad", "get", "-d", "ds004636", dataset_path], check=True)
            included_files.append(filename)
        except subprocess.CalledProcessError:
            if "ses-1" in filename:
                filename = filename.replace("ses-1", "ses-2")
                dataset_path = f"ds004636/derivatives/mriqc/{filename}"
                try:
                    subprocess.run(["datalad", "get", "-d", "ds004636", dataset_path], check=True)
                    included_files.append(filename)
                except subprocess.CalledProcessError:
                    print(f"Failed to download: {filename}")

get(ok): derivatives/mriqc/sub-s646_ses-1_task-twoByTwo_run-1_bold.html (file) [from s3-PUBLIC...]
get(ok): derivatives/mriqc/sub-s639_ses-1_task-twoByTwo_run-1_bold.html (file) [from s3-PUBLIC...]
get(ok): derivatives/mriqc/sub-s533_ses-1_task-twoByTwo_run-1_bold.html (file) [from s3-PUBLIC...]
get(impossible): derivatives/mriqc/sub-s637_ses-1_task-twoByTwo_run-1_bold.html [path does not exist]
get(impossible): derivatives/mriqc/sub-s637_ses-2_task-twoByTwo_run-1_bold.html [path does not exist]
Failed to download: sub-s637_ses-2_task-twoByTwo_run-1_bold.html
get(impossible): derivatives/mriqc/sub-s650_ses-1_task-twoByTwo_run-1_bold.html [path does not exist]
get(ok): derivatives/mriqc/sub-s650_ses-2_task-twoByTwo_run-1_bold.html (file) [from s3-PUBLIC...]
get(impossible): derivatives/mriqc/sub-s600_ses-1_task-surveyMedley_run-1_bold.html [path does not exist]
get(impossible): derivatives/mriqc/sub-s600_ses-2_task-surveyMedley_run-1_bold.html [path does not exist]
Failed to download: s

In [32]:
# Save list of included files TO ALL_INCLUDED.TXT
with open(os.path.join(all_included_dir, "all_included.txt"), "w") as f:
    for file in included_files:
        f.write(file + "\n")

In [None]:
# MANUALLY PICKED THE TAKS, DOWNLOADED ALL ASSOCIATED FILES
# passing_scans = df[(df["MRIQC_fail"] == "0.0") & (df["scan_id"].str.contains("stroop"))]
chosen_samples=passing_scans["scan_id"]
for scan_id in chosen_samples:
        filename = get_html_filename(scan_id)
        print(filename)
        dataset_path = f"ds004636/derivatives/mriqc/{filename}"
        
        # Download file using Datalad
        try:
            subprocess.run(["datalad", "get", "-d", "ds004636", dataset_path], check=True)
            included_files.append(filename)
        except subprocess.CalledProcessError:
            if "ses-1" in filename:
                filename = filename.replace("ses-1", "ses-2")
                dataset_path = f"ds004636/derivatives/mriqc/{filename}"
                try:
                    subprocess.run(["datalad", "get", "-d", "ds004636", dataset_path], check=True)
                    included_files.append(filename)
                except subprocess.CalledProcessError:
                    print(f"Failed to download: {filename}")

In [None]:
# all downloaded files' names are in mriqc_failed.txt and all_included.txt
# develop website that allows you to select and open the visuals. 

# Files
mriqc_failed = "mriqc_failed/mriqc_failed.txt"
all_included = "all_included/all_included.txt"

# acquire filename by parsing above documents
filename = all_included...

# display the document at dataset_path in html form
dataset_path = f"ds004636/derivatives/mriqc/{filename}"
