In [1]:
import os
import shutil
import subprocess

import pandas as pd

# Prerequisite: download the suggested-exclusions file from the ds004636
# OpenNeuro dataset into this directory and label its first column "scan_id".
csv_path = "suggested_exclusions.csv"

# header=None makes pandas treat every row of the file as data; the two
# columns are then given their names explicitly.
df = pd.read_csv(csv_path, header=None).set_axis(["scan_id", "MRIQC_fail"], axis=1)

# Output directories (created only when missing)
mriqc_failed_dir, all_included_dir = "mriqc_failed", "all_included"
os.makedirs(mriqc_failed_dir, exist_ok=True)
os.makedirs(all_included_dir, exist_ok=True)

In [2]:
# Suggested exclusions csv file only provides subject and task info. 
# We need to construct the name of the html file fully to 
# pull it from datalad. 
# scan_id comes from suggested_exclusions.csv

def get_html_filename(scan_id, session=1):
    """Convert a scan_id into the dataset's MRIQC HTML report filename.

    Parameters
    ----------
    scan_id : str
        Identifier of the form ``<subject>_<task>`` whose subject part starts
        with a one-character prefix, e.g. ``"s123_ANT"``. Surrounding
        whitespace (such as the trailing newline of a line read from a text
        file) is ignored.
    session : int or str, optional
        Value placed in the ``ses-`` field of the filename. Defaults to 1,
        which reproduces the original hard-coded ``ses-1`` name.

    Returns
    -------
    str
        ``sub-s<subject>_ses-<session>_task-<task>_run-1_bold.html``

    Raises
    ------
    IndexError
        If ``scan_id`` contains no ``_`` separating subject and task.
    """
    # Split the scan_id on "_" into its subject and task components
    parts = scan_id.strip().split("_")
    subject = parts[0][1:]  # Drop the leading prefix character; "s" is re-added below
    task = parts[1]

    # Every report name is built with run-1; only the session field varies
    return f"sub-s{subject}_ses-{session}_task-{task}_run-1_bold.html"

In [3]:
# **Step 1: Download failed scans (MRIQC_fail == 1.0)**
# Names of the report files that are fetched successfully are appended here
failed_files = []

# Get the scan_id of every row flagged as an MRIQC failure.
# The flag is compared numerically so the filter works whether pandas parsed
# the column as text ("1.0") or as floats (1.0); cells that are not numbers
# become NaN and never match.
mriqc_fail_flag = pd.to_numeric(df["MRIQC_fail"], errors="coerce")
failed_scans = df.loc[mriqc_fail_flag == 1, "scan_id"]

# Save these scan ids to mriqc_failed.txt, one per line
with open(os.path.join(mriqc_failed_dir, "mriqc_failed.txt"), "w") as f:
    for scan_id in failed_scans:
        f.write(f"{scan_id}\n")

IN ORDER TO EXECUTE CODE BELOW, WE NEED TO INSTALL DATALAD:

"brew install datalad"

AND WE NEED TO CLONE THE DATASET WITH DATALAD, USING THE DATASET URL

"datalad clone https://openneuro.org/datasets/ds004636/versions/1.0.4"

THIS WILL INSTALL ALL THE METADATA THAT WILL ALLOW US TO GET DESIRED HTMLS 
IN DS004636/DERIVATIVES/MRIQC DIRECTORY

In [None]:
# THIS CODE GETS ALL THE FILES IN FAILED_SCANS USING DATALAD
# WE DON'T NEED TO RUN IT AGAIN IF YOU HAVE FILES
# IT WON'T WORK IF YOU HAVEN'T CLONED THE OPENNEURO DATASET (STATED ABOVE)

# Prefer the datalad executable found on PATH; fall back to the Homebrew
# location so setups that relied on the absolute path keep working.
datalad_bin = shutil.which("datalad") or "/opt/homebrew/bin/datalad"

for scan_id in failed_scans:
    ses1_filename = get_html_filename(scan_id)

    # The report may be stored under session 1 or session 2: try them in
    # order and stop at the first one DataLad manages to fetch.
    for session in ("ses-1", "ses-2"):
        filename = ses1_filename.replace("ses-1", session)
        dataset_path = f"ds004636/derivatives/mriqc/{filename}"
        try:
            subprocess.run([datalad_bin, "get", "-d", "ds004636", dataset_path], check=True)
        except subprocess.CalledProcessError:
            # datalad exited with a non-zero status: try the next session
            continue
        # Save the name of the successfully downloaded file to failed_files
        failed_files.append(filename)
        break
    else:
        # No session could be fetched; report the last name that was tried
        print(f"Failed to download: {filename}")

In [None]:
# **Step 2: Download 10 passing examples for each task**
import random
import subprocess

# Names of the passing-scan reports that were downloaded successfully
included_files = []
tasks = {"ANT", "CCTHot", "WATT3", "stopSignal", "twoByTwo", "DPX", "discountFix", "motorSelectiveStop", "stroop", "surveyMedley"}


def download_file(filename):
    """Fetch one MRIQC HTML report with DataLad, falling back to later sessions.

    The given filename is tried first. If that fails and the name contains
    ``ses-1``, the same name is retried with ``ses-2`` and then ``ses-3``.
    The name that was actually fetched is appended to ``included_files``.

    Parameters
    ----------
    filename : str
        Report filename inside ``ds004636/derivatives/mriqc/``.

    Returns
    -------
    bool
        True if the report was downloaded under any of the tried names,
        False otherwise. (Previously a successful ses-2/ses-3 fallback
        returned None, so callers treated it as a failure.)
    """
    # Candidate names in the order they are attempted
    candidates = [filename]
    if "ses-1" in filename:
        candidates += [filename.replace("ses-1", session) for session in ("ses-2", "ses-3")]

    for candidate in candidates:
        dataset_path = f"ds004636/derivatives/mriqc/{candidate}"
        try:
            subprocess.run(["datalad", "get", "-d", "ds004636", dataset_path], check=True)
        except subprocess.CalledProcessError:
            # datalad exited with a non-zero status: try the next candidate
            continue
        included_files.append(candidate)
        print(f"{candidate}: ok")
        return True

    if len(candidates) == 1:
        # There was no "ses-1" field to rewrite, so no fallback was possible
        print("ses1 not in name")
    else:
        print(f"Failed to download: {candidates[-1]}")
    return False

# Download 10 randomly selected MRIQC-passed scans per task.

# The flag is compared numerically so the filter works whether pandas parsed
# the column as text ("0.0") or as floats (0.0); non-numeric cells become NaN
# and never match.
mriqc_fail_flag = pd.to_numeric(df["MRIQC_fail"], errors="coerce")

for task in tasks:
    # Scans that passed MRIQC and whose scan_id contains the task name.
    # The task is matched as a literal substring (not a regex) and rows with a
    # missing scan_id count as "no match" instead of propagating NaN.
    task_mask = df["scan_id"].str.contains(task, regex=False, na=False)
    passing_scans = df[(mriqc_fail_flag == 0) & task_mask]

    # Candidates still eligible for download
    available_scans = set(passing_scans["scan_id"])

    selected_samples = set()

    while len(selected_samples) < 10 and available_scans:
        # Sorted so the draw depends only on the random state, not on set ordering
        remaining_options = sorted(available_scans - selected_samples)
        if not remaining_options:
            break
        scan_id = random.choice(remaining_options)
        selected_samples.add(scan_id)

        filename = get_html_filename(scan_id)
        print(len(selected_samples))
        if not download_file(filename):
            # If all session replacements fail, drop this scan and pick a new sample
            selected_samples.remove(scan_id)
            available_scans.discard(scan_id)

In [32]:
# Write the name of every included file to all_included.txt, one per line
included_list_path = os.path.join(all_included_dir, "all_included.txt")
with open(included_list_path, "w") as f:
    f.writelines(file + "\n" for file in included_files)

In [None]:
# AGAIN, ALL THE FILES WERE COPIED FROM ANOTHER DIRECTORY (ON ULAS'S COMPUTER) INTO THIS REPO

# import shutil
# import os

# # Define source and destination directories
# source_dir = "ds004636/derivatives/mriqc/"
# dest_dir = "../self-regulation-dataset/ds004636/derivatives/mriqc_included/"

# # Ensure destination directory exists
# os.makedirs(dest_dir, exist_ok=True)

# # Copy files
# for scan_id in included_files:
#     filename = scan_id
#     source_path = os.path.join(source_dir, filename)
#     dest_path = os.path.join(dest_dir, filename)

#     if os.path.exists(source_path):
#         shutil.copy2(source_path, dest_path)  # copy2 preserves metadata
#         print(f"Copied {filename} to {dest_path}")
#     else:
#         if "ses-1" in filename:
#             filename = filename.replace("ses-1", "ses-2")
#             source_path = os.path.join(source_dir, filename)
#             dest_path = os.path.join(dest_dir, filename)
#             if os.path.exists(source_path):
#                 shutil.copy2(source_path, dest_path)  # copy2 preserves metadata
#                 print(f"Copied {filename} to {dest_path}")
#             else:
#                 print(f"File not found: {source_path}")

#         else:
#             print(f"ses1 not in name")

In [None]:
# The names of all downloaded files are listed in mriqc_failed.txt and all_included.txt.
# TODO: develop a website that allows you to select and open the visuals.

# Files
mriqc_failed = "mriqc_failed/mriqc_failed.txt"
all_included = "all_included/all_included.txt"

# Acquire the filenames by parsing the list file (one name per line; blank lines are skipped)
with open(all_included, "r") as f:
    included_filenames = [line.strip() for line in f if line.strip()]

# Location of each listed report inside the cloned dataset
dataset_paths = [f"ds004636/derivatives/mriqc/{name}" for name in included_filenames]

# Single-file names kept from the original sketch: they point at the first
# listed report, or are None when the list is empty.
filename = included_filenames[0] if included_filenames else None
dataset_path = dataset_paths[0] if dataset_paths else None

# TODO: display the document at dataset_path in html form


In [None]:
import os

# Paths to the mriqc directories
mriqc_failed_dir = "mriqc_failed"
mriqc_passed_dir = "mriqc_passed"

# NOTE(review): this cell used `mriqc_dir` and `txt_file` without defining
# them, so it raised NameError on a fresh kernel. They are assumed here to be
# the failed-scan directory and its list file -- confirm before relying on it.
mriqc_dir = mriqc_failed_dir
txt_file = os.path.join(mriqc_failed_dir, "mriqc_failed.txt")

with open(txt_file, "r") as f:
    lines = f.readlines()

# Keep only lines where the corresponding file exists in mriqc_dir
valid_lines = [line for line in lines if os.path.isfile(os.path.join(mriqc_dir, line.strip()))]

# Write back only valid lines (this overwrites the list file in place)
with open(txt_file, "w") as f:
    f.writelines(valid_lines)

print("Files have been cleaned up successfully.")

Files have been cleaned up successfully.


In [None]:
# Convert every entry listed in mriqc_failed.txt into its HTML report filename
with open("mriqc_failed/mriqc_failed.txt", "r") as f:
    lines = f.readlines()

# get_html_filename works on one scan_id string, so it is called per line
# (newline stripped, blank lines skipped) rather than on the whole list.
real_names = [get_html_filename(line.strip()) for line in lines if line.strip()]
    
