In [1]:
import os
import json
import csv
from bs4 import BeautifulSoup

# Directories and file paths
# Directories holding the MRIQC reports; the (commented-out) extraction loop
# further down joins each listed filename onto one of these two directories.
mriqc_failed_dir = "ds004636/derivatives/mriqc_failed"
mriqc_passed_dir = "ds004636/derivatives/mriqc_passed"

# CSV read with csv.DictReader; the columns used in this notebook are
# "Metric", "Low", "High" and "Median".
csv_file = "quality_metrics/BOLD_quality_metrics.csv"
# Text files read one filename per line by the (commented-out) extraction loop.
mriqc_passed_txt = "metadata/mriqc_passed/mriqc_passed_fullname.txt"
mriqc_failed_txt = "metadata/mriqc_failed/mriqc_failed_fullname.txt"

# Task names to filter
# parse_filename() looks for these names inside report filenames, and the
# task/metric plotting loop iterates over them (set order is not guaranteed).
TASKS = {"ANT", "CCTHot", "WATT3", "stopSignal", "twoByTwo", "DPX", "discountFix", "motorSelectiveStop", "stroop", "surveyMedley"}

In [2]:
def parse_filename(filename, tasks=None):
    """Extract the subject label and the task name from a report filename.

    The filename is split on "_". The subject is the first piece with every
    "sub-" occurrence removed. The task is resolved in two steps:

    1. an exact ``task-<label>`` piece whose label is one of the known tasks;
    2. otherwise, the first known task name that occurs anywhere in the
       filename, trying longer names first so the result does not depend on
       set iteration order.

    Args:
        filename (str): a filename taken from mriqc_failed.txt or
            mriqc_passed.txt (presumably shaped like
            ``sub-<id>_..._task-<name>_..._bold.html`` -- TODO confirm).
        tasks (iterable of str, optional): task names to look for.
            Defaults to the notebook-level ``TASKS``.

    Returns:
        tuple: ``(subject, task)`` where ``subject`` is a str and ``task`` is
        a str, or None when no known task name is found.
    """
    if tasks is None:
        tasks = TASKS

    parts = filename.split("_")
    subject = parts[0].replace("sub-", "")

    # Step 1: exact match on the "task-<label>" piece. Anything after a "."
    # is dropped so that a trailing piece like "task-ANT.html" still matches.
    for part in parts:
        if part.startswith("task-"):
            label = part[len("task-"):].split(".")[0]
            if label in tasks:
                return subject, label

    # Step 2: substring fallback, longest name first (ties broken
    # alphabetically) to keep the answer deterministic.
    ordered = sorted(tasks, key=lambda name: (-len(name), name))
    task = next((name for name in ordered if name in filename), None)
    return subject, task


def extract_metrics(html_path, metrics):
    """Scrape selected metric values out of one HTML report.

    The file is parsed with BeautifulSoup. The element with id "other" is
    located first, then the next <table> with id "iqms-table" after it. Each
    table row with two <td> cells is read as (name, value); each row with
    three cells is read as (name part 1, name part 2, value) with the two
    name parts joined by "_". Rows with any other cell count are ignored.

    Args:
        html_path (str): path of the HTML file to read (opened as UTF-8).
        metrics: container of metric names to keep; only membership tests
            (``name in metrics``) are performed on it.

    Returns:
        dict: metric name -> value. The value is a float when the cell text
        parses as one, otherwise the stripped cell text. The dict is empty
        when the "other" element or the "iqms-table" table is missing (a
        message is printed in that case).
    """
    with open(html_path, "r", encoding="utf-8") as handle:
        soup = BeautifulSoup(handle, "html.parser")

    extracted = {}

    # The metric table is looked up relative to the element with id "other".
    anchor = soup.find(id="other")
    if anchor is None:
        print(f" other table not found for {html_path}")
        return extracted

    # First table with id "iqms-table" that follows the anchor.
    iqms_table = anchor.find_next("table", {"id": "iqms-table"})
    if iqms_table is None:
        print(f"iqms table not found for {html_path}")
        return extracted

    for tr in iqms_table.find_all("tr"):
        texts = [td.text.strip() for td in tr.find_all("td")]

        if len(texts) == 2:
            name, raw_value = texts
        elif len(texts) == 3:
            # Two-part metric name: the parts are joined with "_".
            name = f"{texts[0]}_{texts[1]}"
            raw_value = texts[2]
        else:
            continue

        if name not in metrics:
            continue

        # Keep numbers as floats; fall back to the raw text otherwise.
        try:
            extracted[name] = float(raw_value)
        except ValueError:
            extracted[name] = raw_value

    return extracted

In [4]:

# Organize data into JSON
# quality_data = {"mriqc_failed": {}, "mriqc_passed": {}}

# Read metric names from CSV (BOLD_quality_metrics): collect every value of
# the "Metric" column into a set.
# NOTE: "Metric" must match the CSV header exactly; the thresholds cell
# further down reads the same column name.
# newline="" lets the csv module do its own newline handling, as its
# documentation requires for file objects passed to a reader.
with open(csv_file, "r", newline="") as f:
    metrics = {row["Metric"] for row in csv.DictReader(f)}
        
# # Process both mriqc_failed_txt and mriqc_passed_txt files.
# for category, txt_file in [("mriqc_failed", mriqc_failed_txt), ("mriqc_passed", mriqc_passed_txt)]:
#     with open(txt_file, "r") as f:

#         # filenames is a list of all full html file names in failed and passed combined
#         filenames = [line.strip() for line in f]

#     # sets mriqc_dir as the current directory
#     if category == "mriqc_failed": 
#         mriqc_dir = mriqc_failed_dir
#     else:
#         mriqc_dir = mriqc_passed_dir

#     # for each html in mriqc_passed or mriqc_failed, get the metrics using extract_metrics function 
#     for filename in filenames:
#         html_path = os.path.join(mriqc_dir, filename)
#         if os.path.isfile(html_path):
#             subject, task = parse_filename(filename)
#             if task:
#                 if task not in quality_data[category]:
#                     quality_data[category][task] = {}
#                 if subject not in quality_data[category][task]:
#                     quality_data[category][task][subject] = {}
#                 quality_data[category][task][subject] = extract_metrics(html_path, metrics)
#         else:
#             print(f"not a path name {filename}")
    
# # Save quality_data to json file.
# with open("quality_metrics/quality_metrics_all.json", "w") as f:
#     json.dump(quality_data, f, indent=4)

# print("JSON file created successfully!")

In [None]:
# def reformat_json(input_json, output_json):
#     """Reformats JSON with the following hierarchy:
#     from: category/task/subject/metric-value
#     to: category/task/metric/subject-value
#     creates output_json

#     Args:
#         input_json (_type_): _description_
#         output_json (_type_): _description_
#     """
#     with open(input_json, "r") as f:
#         data = json.load(f)
    
#     reformatted = {"mriqc_failed": {}, "mriqc_passed": {}}
    
#     for category in ["mriqc_failed", "mriqc_passed"]:
#         for task, subjects in data.get(category, {}).items():
#             if task not in reformatted[category]:
#                 reformatted[category][task] = {}
            
#             for subject, metrics in subjects.items():
#                 for metric, value in metrics.items():
#                     if metric not in reformatted[category][task]:
#                         reformatted[category][task][metric] = {}
                    
#                     reformatted[category][task][metric][subject] = value
    
#     with open(output_json, "w") as f:
#         json.dump(reformatted, f, indent=4)

In [None]:
# File paths
# Per the (commented-out) reformat_json docstring, the input is laid out as
# category/task/subject/metric-value ...
input_json = "quality_metrics/quality_metrics_all.json"
# ... and the output as category/task/metric/subject-value.
# NOTE: a later cell rebinds output_json to the non-reformatted file.
output_json = "quality_metrics/quality_metrics_all_reformatted.json"
# Same path string as csv_file defined in the first cell.
metrics_csv = "quality_metrics/BOLD_quality_metrics.csv"

# Run functions
# (disabled: the reformatted JSON is presumably already on disk -- TODO confirm)
# reformat_json(input_json, output_json)

In [7]:
# Thresholds used by the plot functions.
# Maps each metric name (column "Metric") to a tuple in the order
# (low, high, median), taken from the "Low", "High" and "Median" columns.
thresholds = {}

# newline="" lets the csv module do its own newline handling, as its
# documentation requires for file objects passed to a reader.
with open(metrics_csv, "r", newline="") as f:
    for row in csv.DictReader(f):
        # float() raises ValueError on a blank or non-numeric cell, which
        # surfaces a malformed CSV row immediately.
        low, high, median = float(row["Low"]), float(row["High"]), float(row["Median"])
        thresholds[row["Metric"]] = (low, high, median)

In [111]:
import pandas as pd
import matplotlib.pyplot as plt

# TO DO'S IN THIS FUNCTION:
# ADD MRIQC_FAILED FOR ABOVE THE LINE, PASSED FOR BELOW THE LINE
# CLARIFY SHADED GRAY REGION (WITHIN RANGE OF LOW AND HIGH)
# ADD LEGEND: RED DOT: OUTSIDE RANGE, GREEN DOT: IN RANGE

# always display -1.5 to 1.5 in znormalized values
def plot_task_metric(task, metric, data, thresholds, save_dir="quality_metrics/plots"):
    """Plot one metric for every subject of a task and save it as a PNG.

    Each subject is one row (y axis); the x axis is the metric value after
    normalisation with the metric's thresholds:
    ``(value - median) / ((high - low) / 2)``. Points inside [-1, 1] are drawn
    green, the others red. "mriqc_passed" subjects are listed first (bottom
    rows), "mriqc_failed" subjects after them, with a dashed line in between.
    The x axis is fixed to [-1.5, 1.5], so points outside it are not visible.

    Args:
        task (str): task name to plot.
        metric (str): metric name to plot.
        data (dict): nested as category -> task -> metric -> subject -> value,
            with categories "mriqc_passed" and "mriqc_failed". Missing
            categories, tasks or metrics are treated as "no data".
        thresholds (dict): metric name -> (low, high, median). If ``metric``
            is not a key, the values are plotted without normalisation.
        save_dir (str, optional): output directory, created if needed.
            Defaults to "quality_metrics/plots".

    Returns:
        None. Writes ``<save_dir>/<task>-<metric>.png``; when there is no data
        for the task/metric pair, prints a message and writes nothing.

    Raises:
        ZeroDivisionError: if the metric's high and low thresholds are equal.
    """
    subjects = []
    values = []
    categories = []

    # Passed runs are collected first so they occupy the rows below the
    # separator line; failed runs follow.
    for category in ["mriqc_passed", "mriqc_failed"]:
        per_subject = data.get(category, {}).get(task, {}).get(metric, {})
        for subject, value in per_subject.items():
            subjects.append(subject)
            values.append(value)
            categories.append(category)

    # Nothing to draw: bail out before creating a figure. Without this guard
    # the annotations below would index into an empty list and abort any
    # loop that plots all task/metric combinations.
    if not values:
        print(f"no data for {task} - {metric}, skipping plot")
        return

    # Normalise with half the low-high span as an approximate standard
    # deviation, so that low/high land near -1/+1 when the median is centred.
    # NOTE: without thresholds the raw values are plotted as they are.
    if metric in thresholds:
        low, high, median = thresholds[metric]
        std_dev = (high - low) / 2
        values = [(v - median) / std_dev for v in values]

    # Green inside the expected range, red outside.
    colors = ["green" if -1 <= v <= 1 else "red" for v in values]

    passed_count = categories.count("mriqc_passed")

    os.makedirs(save_dir, exist_ok=True)

    # Grow the figure with the number of subjects so tick labels stay legible.
    fig_height = max(6, len(subjects) * 0.3)
    fig, ax = plt.subplots(figsize=(8, fig_height))

    ax.scatter(values, range(len(subjects)), c=colors, s=50)

    if passed_count > 0:
        # Dashed separator between the passed rows and the failed rows.
        ax.axhline(y=passed_count - 0.5, color="black", linestyle="--")

        # Label the failed section whenever at least one failed row exists
        # (index passed_count is the first failed row). The label is placed on
        # the left unless that row's value is >= 1.0.
        if len(values) > passed_count:
            failed_x = -1.47 if values[passed_count] < 1.0 else 1.0
            ax.text(failed_x, passed_count + 0.5, "mriqc_failed", verticalalignment="top", fontsize=8)

        # Same placement rule for the passed label, based on the last passed row.
        passed_x = -1.47 if values[passed_count - 1] < 1.0 else 1.0
        ax.text(passed_x, passed_count - 1.5, "mriqc_passed", verticalalignment="bottom", fontsize=8)

    # One y tick per subject.
    ax.set_yticks(range(len(subjects)))
    ax.set_yticklabels(subjects)

    ax.set_title("Suggested Exclusions:\n" + f"{task} - {metric} Plot", fontsize=14, pad=20, y=1)
    ax.set_xlabel("Z-Normalized Value", fontsize=12)
    ax.set_ylabel("Subjects", fontsize=12)

    # Shaded band marks the acceptable range (-1 to 1).
    ax.axvspan(-1, 1, color="gray", alpha=0.2)

    # Put the "within range" note on the side opposite the first point.
    within_x = -0.5 if values[0] > 0.0 else 0.5
    ax.text(within_x, -0.4, "within range", ha="center", fontsize=8)
    ax.text(-1.0, -0.4, "low\nthreshold", ha="center", fontsize=8)
    ax.text(1.0, -0.4, "high\nthreshold", ha="center", fontsize=8)

    # Fixed x axis: always show -1.5 to 1.5.
    ax.set_xticks([-1.5, -1.0, -0.5, 0, 0.5, 1.0, 1.5])
    ax.set_xlim(-1.5, 1.5)

    # Empty scatters only provide the legend entries.
    ax.scatter([], [], color="green", label="In range")
    ax.scatter([], [], color="red", label="Outside range")
    ax.legend(loc="upper right")

    fig.subplots_adjust(left=0.2, right=0.8, top=0.8, bottom=0.2)

    fig.savefig(os.path.join(save_dir, f"{task}-{metric}.png"), dpi=300)
    # Close explicitly so figures do not pile up when plotting in a loop.
    plt.close(fig)

In [None]:
# Smoke test: draw a single task/metric plot to check the function works.
# The JSON is loaded from the path currently bound to output_json.
with open(output_json, "r") as f:
    data = json.load(f)  # json.load() reads from the open file object

plot_task_metric(
    "ANT",
    "dvars_vstd",
    data,
    thresholds,
    save_dir="quality_metrics/plots",
)

In [None]:
# Save one plot per (task, metric) combination.
with open(output_json, "r") as f:
    data = json.load(f)

for task in TASKS:
    # Every metric that has thresholds gets a plot for this task;
    # data is the loaded JSON, thresholds the dict built from the CSV.
    for metric in thresholds:
        plot_task_metric(
            task,
            metric,
            data,
            thresholds,
            save_dir="quality_metrics/plots",
        )


In [49]:
# plot subject task
import pandas as pd
import matplotlib.pyplot as plt

def plot_subject_task(category, task, subject, data, thresholds, save_dir):
    """Plot every metric of one subject/task run and save it as a PNG.

    Each metric is one column (x axis); the y axis is the metric value after
    normalisation with that metric's thresholds:
    ``(value - median) / ((high - low) / 2)``. Points inside [-1, 1] are drawn
    green, the others red. The y axis is fixed to [-2, 2], so points outside
    it are not visible.

    Args:
        category (str): top-level key of ``data`` (e.g. "mriqc_failed").
        task (str): task name.
        subject (str): subject label.
        data (dict): nested as category -> task -> subject -> metric -> value
            (the non-reformatted JSON). Missing keys are treated as "no data".
        thresholds (dict): metric name -> (low, high, median). Metrics without
            an entry cannot be normalised and are skipped with a message.
        save_dir (str): output directory, created if needed.

    Returns:
        None. Writes ``<save_dir>/<category>-<subject>-<task>.png``; when no
        metric can be plotted, prints a message and writes nothing.

    Raises:
        ZeroDivisionError: if a metric's high and low thresholds are equal.
    """
    metrics = []
    values = []

    subject_metrics = data.get(category, {}).get(task, {}).get(subject, {})
    for metric, value in subject_metrics.items():
        if metric not in thresholds:
            print(f"no thresholds for {metric}, skipping it")
            continue
        low, high, median = thresholds[metric]
        # Half the low-high span serves as an approximate standard deviation.
        std_dev = (high - low) / 2
        metrics.append(metric)
        values.append((value - median) / std_dev)

    # Nothing to draw: bail out before creating a figure, since the
    # annotations below index into values.
    if not values:
        print(f"no data for {category} - {subject} - {task}, skipping plot")
        return

    # Green inside the expected range, red outside.
    colors = ["green" if -1 <= v <= 1 else "red" for v in values]

    os.makedirs(save_dir, exist_ok=True)

    fig, ax = plt.subplots(figsize=(8, 6))

    positions = list(range(len(metrics)))
    ax.scatter(positions, values, c=colors, s=50)

    # Fixed y ticks on the normalised scale.
    ax.set_yticks([-2, -1, 0, 1, 2])

    ax.set_title("Suggested Exclusions:\n" + f"{subject} - {task} Plot - {category}", fontsize=14, pad=20, y=1)
    ax.set_ylabel("Z-Normalized Value", fontsize=12)
    ax.set_xlabel("Metrics", fontsize=12)

    # Shaded band marks the acceptable range (-1 to 1).
    ax.axhspan(-1, 1, color="gray", alpha=0.2)

    # Annotations sit just left of the last metric column (x = 4.8 when
    # there are six metrics). The "within range" note goes on the side
    # opposite the last point so it does not cover it.
    label_x = len(metrics) - 1.2
    within_y = 0.5 if values[-1] < 0 else -0.5
    ax.text(label_x, within_y, "within\nrange", verticalalignment="center", ha="center", fontsize=8)
    ax.text(label_x, -1, "low\nthreshold", verticalalignment="center", ha="center", fontsize=8)
    ax.text(label_x, 1, "high\nthreshold", verticalalignment="center", ha="center", fontsize=8)

    # One x tick per plotted metric, labelled with the metric name.
    ax.set_xticks(positions)
    ax.set_xticklabels(metrics)

    # Empty scatters only provide the legend entries.
    ax.scatter([], [], color="green", label="In range")
    ax.scatter([], [], color="red", label="Outside range")
    ax.legend(loc="upper right")

    ax.set_ylim(-2, 2)
    fig.subplots_adjust(left=0.2, right=0.8, top=0.8, bottom=0.2)

    fig.savefig(os.path.join(save_dir, f"{category}-{subject}-{task}.png"), dpi=300)
    # Close explicitly so figures do not pile up when plotting in a loop.
    plt.close(fig)


In [41]:
# Smoke test: draw a single subject/task plot to check the function works.
# NOTE: this rebinds output_json to the non-reformatted JSON; the cell below
# relies on that, while the earlier task/metric cells expect the reformatted
# path, so re-running those afterwards will load the wrong file.
output_json = "quality_metrics/quality_metrics_all.json"
with open(output_json, "r") as f:
    data = json.load(f)  # json.load() reads from the open file object

plot_subject_task(
    "mriqc_failed",
    "CCTHot",
    "s607",
    data,
    thresholds,
    save_dir="quality_metrics/plots/subject-task",
)

['dvars_vstd', 'fber', 'fd_mean', 'gsr_y', 'snr', 'tsnr']


In [None]:
# Save one plot per (category, task, subject) combination found in the JSON.
# Expects output_json to point at the non-reformatted file
# (category -> task -> subject -> metric -> value).
with open(output_json, "r") as f:
    data = json.load(f)

# Each nesting level gets its own name: the loop targets no longer shadow the
# dict being iterated, and the notebook-level `metrics` set is not overwritten.
for category, tasks_in_category in data.items():
    for task, subjects_in_task in tasks_in_category.items():
        for subject in subjects_in_task:
            plot_subject_task(category, task, subject, data, thresholds, save_dir="quality_metrics/plots/subject-task")