In [2]:
import pandas as pd
import os
import subprocess
import random

# Read the exclusion table. The file is read without a header row, so the
# two columns are named explicitly right after loading.
csv_path = "suggested_exclusions.csv"
df = pd.read_csv(csv_path, header=None).set_axis(["scan_id", "MRIQC_fail"], axis=1)

# Output folders: one for the failed-scan list, one for the included-scan list.
mriqc_failed_dir = "mriqc_failed"
all_included_dir = "all_included"

for output_dir in (mriqc_failed_dir, all_included_dir):
    # exist_ok=True keeps the cell safe to re-run.
    os.makedirs(output_dir, exist_ok=True)

In [3]:
# Function to construct filenames
def get_html_filename(scan_id):
    """Build the MRIQC HTML report filename for a scan identifier.

    The identifier is split on underscores: the first field is the subject
    token (its first character is dropped and an ``s`` is put back by the
    filename template), the second field is the task name. Any further
    fields are ignored.

    Args:
        scan_id: Underscore-separated identifier, e.g. ``"s061_ANT"``.

    Returns:
        The report name for session 1, run 1, e.g.
        ``"sub-s061_ses-1_task-ANT_run-1_bold.html"``.

    Raises:
        IndexError: If ``scan_id`` contains no underscore.
    """
    fields = scan_id.split("_", 2)
    subject_token, task_name = fields[0], fields[1]
    subject_number = subject_token[1:]  # drop the leading 's'

    return f"sub-s{subject_number}_ses-1_task-{task_name}_run-1_bold.html"

In [4]:
# **Step 1: Download failed scans (MRIQC_fail == 1.0)**
failed_files = []

# Compare numerically instead of against the string "1.0": if pandas parsed
# the column as numbers, a string comparison matches nothing and the failed
# list would silently come out empty. Non-numeric cells become NaN and are
# therefore never selected.
is_failed = pd.to_numeric(df["MRIQC_fail"], errors="coerce") == 1
failed_scans = df.loc[is_failed, "scan_id"]

# Save the list of failed scans, one scan ID per line.
with open(os.path.join(mriqc_failed_dir, "mriqc_failed.txt"), "w") as f:
    f.writelines(scan + "\n" for scan in failed_scans)

In [7]:
# Fetch the MRIQC report of every failed scan with DataLad. The session-1
# name is tried first; if that download fails, the session-2 name is tried.
for scan_id in failed_scans:
    ses1_name = get_html_filename(scan_id)
    candidates = [ses1_name, ses1_name.replace("ses-1", "ses-2")]

    for filename in candidates:
        dataset_path = f"ds004636/derivatives/mriqc/{filename}"
        try:
            subprocess.run(["datalad", "get", "-d", "ds004636", dataset_path], check=True)
        except FileNotFoundError as err:
            # subprocess raises this when the executable itself is missing;
            # re-raise with a message that says what to do about it.
            raise FileNotFoundError(
                "The 'datalad' executable was not found; install DataLad and "
                "make sure it is on PATH before running this cell."
            ) from err
        except subprocess.CalledProcessError:
            # This session's report could not be fetched; try the next one.
            continue
        failed_files.append(filename)
        break
    else:
        # No candidate succeeded; `filename` is the last name that was tried.
        print(f"Failed to download: {filename}")

FileNotFoundError: [Errno 2] No such file or directory: 'datalad'

In [None]:
# The files were downloaded in another repository and copied into this
# repository using the code below.
# When a dataset is cloned directly with `datalad clone {URL}`, it cannot be
# uploaded to GitHub because of its .datalad and .git files.
# `datalad clone {URL}` also downloads all of the dataset's metadata.
# Only the desired files from the ds004636 folder are included here.

# import shutil 
# import os

# #copies mriqc failed

# # Define source and destination directories
# source_dir = "ds004636/derivatives/mriqc/"
# dest_dir = "../self-regulation-dataset/ds004636/derivatives/mriqc/"

# # Ensure destination directory exists
# os.makedirs(dest_dir, exist_ok=True)

# # Copy files
# # Save list of failed files
# with open(os.path.join(mriqc_failed_dir, "mriqc_failed.txt"), "r") as f:
#     for scan_id in failed_scans:
#         filename = get_html_filename(scan_id)
#         source_path = os.path.join(source_dir, filename)
#         dest_path = os.path.join(dest_dir, filename)

#         if os.path.exists(source_path):
#             shutil.copy2(source_path, dest_path)  # copy2 preserves metadata
#             print(f"Copied {filename} to {dest_path}")
#         else:
#             if "ses-1" in filename:
#                 filename = filename.replace("ses-1", "ses-2")
#                 source_path = os.path.join(source_dir, filename)
#                 dest_path = os.path.join(dest_dir, filename)
#                 if os.path.exists(source_path):
#                     shutil.copy2(source_path, dest_path)  # copy2 preserves metadata
#                     print(f"Copied {filename} to {dest_path}")
#                 else:
#                     print(f"File not found: {source_path}")

#             else:
#                 print(f"ses1 not in name")

In [None]:
# **Step 2: Download 10 passing examples for each task**
import random
import subprocess

# Names of the report files that were actually downloaded (filled by download_file).
included_files = []
tasks = {"ANT", "CCTHot", "WATT3", "stopSignal", "twoByTwo", "DPX", "discountFix", "motorSelectiveStop", "stroop", "surveyMedley"}


def download_file(filename):
    """Download one MRIQC report with DataLad, falling back to later sessions.

    A name containing ``ses-1`` is tried as given, then with ``ses-2`` and
    finally with ``ses-3``. A name without ``ses-1`` is tried once as given.
    The first name that downloads is appended to ``included_files``.

    Args:
        filename: Report filename inside ``ds004636/derivatives/mriqc/``.

    Returns:
        True if any candidate was downloaded, False otherwise. (Previously a
        successful ses-2/ses-3 fallback returned None, so callers treated a
        downloaded file as a failure.)

    Raises:
        FileNotFoundError: Propagated from ``subprocess.run`` when the
            ``datalad`` executable cannot be found.
    """
    has_session_1 = "ses-1" in filename
    if has_session_1:
        candidates = [
            filename,
            filename.replace("ses-1", "ses-2"),
            filename.replace("ses-1", "ses-3"),
        ]
    else:
        candidates = [filename]

    for candidate in candidates:
        dataset_path = f"ds004636/derivatives/mriqc/{candidate}"
        try:
            subprocess.run(["datalad", "get", "-d", "ds004636", dataset_path], check=True)
        except subprocess.CalledProcessError:
            # Not available under this session name; try the next candidate.
            continue
        included_files.append(candidate)
        print(f"{candidate}: ok")
        return True

    if has_session_1:
        # Report the last name that was attempted.
        print(f"Failed to download: {candidates[-1]}")
    else:
        print(f"ses1 not in name")
    return False

# Number of passing reports to download per task, and a fixed seed so that a
# re-run draws the same scans (set iteration order alone is not reproducible).
N_PASSING_PER_TASK = 10
SAMPLING_SEED = 0
rng = random.Random(SAMPLING_SEED)

# Numeric comparison: a string comparison against "0.0" matches nothing when
# pandas has parsed the column as numbers.
is_passing = pd.to_numeric(df["MRIQC_fail"], errors="coerce") == 0

for task in sorted(tasks):
    # Plain substring match on the task name; rows without a scan ID are skipped.
    is_task = df["scan_id"].str.contains(task, regex=False, na=False)

    # Sort before shuffling so the seeded order does not depend on set ordering.
    candidates = sorted(set(df.loc[is_passing & is_task, "scan_id"]))
    rng.shuffle(candidates)

    n_downloaded = 0
    for scan_id in candidates:
        if n_downloaded >= N_PASSING_PER_TASK:
            break

        filename = get_html_filename(scan_id)
        print(n_downloaded + 1)  # progress: which sample is being attempted
        if download_file(filename):
            n_downloaded += 1
        # On failure the scan is simply skipped and the next candidate is tried.

In [32]:
# Write the name of every downloaded passing report to all_included.txt,
# one name per line.
included_list_path = os.path.join(all_included_dir, "all_included.txt")
with open(included_list_path, "w") as list_file:
    list_file.writelines(name + "\n" for name in included_files)

In [None]:
# Again, all the files were copied from another directory (on Ulas's computer) into this repo.

# import shutil
# import os

# # Define source and destination directories
# source_dir = "ds004636/derivatives/mriqc/"
# dest_dir = "../self-regulation-dataset/ds004636/derivatives/mriqc_included/"

# # Ensure destination directory exists
# os.makedirs(dest_dir, exist_ok=True)

# # Copy files
# for scan_id in included_files:
#     filename = scan_id
#     source_path = os.path.join(source_dir, filename)
#     dest_path = os.path.join(dest_dir, filename)

#     if os.path.exists(source_path):
#         shutil.copy2(source_path, dest_path)  # copy2 preserves metadata
#         print(f"Copied {filename} to {dest_path}")
#     else:
#         if "ses-1" in filename:
#             filename = filename.replace("ses-1", "ses-2")
#             source_path = os.path.join(source_dir, filename)
#             dest_path = os.path.join(dest_dir, filename)
#             if os.path.exists(source_path):
#                 shutil.copy2(source_path, dest_path)  # copy2 preserves metadata
#                 print(f"Copied {filename} to {dest_path}")
#             else:
#                 print(f"File not found: {source_path}")

#         else:
#             print(f"ses1 not in name")

In [None]:
# All downloaded files' names are in mriqc_failed.txt and all_included.txt.
# TODO: develop a website that allows you to select and open the visuals.

# Files
mriqc_failed = "mriqc_failed/mriqc_failed.txt"
all_included = "all_included/all_included.txt"

# Acquire the filenames by parsing the included list (one name per line;
# blank lines are skipped).
with open(all_included, "r") as f:
    included_report_names = [line.strip() for line in f if line.strip()]

# Path of every listed report inside the dataset.
dataset_paths = [f"ds004636/derivatives/mriqc/{name}" for name in included_report_names]

# Single-report variables point at the first listed report (None if the list is empty).
filename = included_report_names[0] if included_report_names else None
dataset_path = f"ds004636/derivatives/mriqc/{filename}" if filename else None

# TODO: display the document at dataset_path in HTML form.


In [None]:
import os

# Path to the mriqc directory
mriqc_failed_dir = "mriqc_failed"
mriqc_passed_dir = "mriqc_passed"

# NOTE(review): `mriqc_dir` and `txt_file` were used below without being
# defined in this notebook. `txt_file` is the list that is read and written
# back. `mriqc_dir` is ASSUMED to be the dataset's MRIQC derivatives folder
# used by the download cells -- confirm before running, because entries whose
# file is not found there are removed from the list.
mriqc_dir = "ds004636/derivatives/mriqc"
txt_file = "mriqc_failed/mriqc_failed.txt"


def prune_missing_reports(list_path, reports_dir):
    """Drop entries from a list file whose report is missing on disk.

    Each line of ``list_path`` is stripped and looked up as a file name
    inside ``reports_dir``; only lines whose file exists are written back.

    Args:
        list_path: Text file with one entry per line; rewritten in place.
        reports_dir: Directory in which each entry must exist as a file.

    Returns:
        The number of lines that were removed.
    """
    with open(list_path, "r") as f:
        lines = f.readlines()

    # Keep only lines where the corresponding file exists in reports_dir
    valid_lines = [
        line for line in lines
        if os.path.isfile(os.path.join(reports_dir, line.strip()))
    ]

    # Write back only valid lines
    with open(list_path, "w") as f:
        f.writelines(valid_lines)

    return len(lines) - len(valid_lines)


if not os.path.isfile(txt_file):
    print(f"List file not found, nothing to clean: {txt_file}")
elif not os.path.isdir(mriqc_dir):
    # Without this guard a wrong or missing directory would empty the list.
    print(f"Report directory not found, list left unchanged: {mriqc_dir}")
else:
    prune_missing_reports(txt_file, mriqc_dir)
    print("Files have been cleaned up successfully.")

Files have been cleaned up successfully.


In [None]:
# Convert every entry of mriqc_failed.txt into its HTML report filename.
# get_html_filename expects a single scan ID string, so it is applied per
# line (stripped of the trailing newline) rather than to the whole list.
# NOTE(review): assumes each line is a scan ID such as "s061_ANT" -- confirm
# the file has not been rewritten with report filenames. The names returned
# are the session-1 names; a report fetched under another session differs.
with open("mriqc_failed/mriqc_failed.txt", "r") as f:
    lines = f.readlines()

real_names = [get_html_filename(line.strip()) for line in lines if line.strip()]
    
