In [1]:
import pandas as pd
import numpy as np
import glob
from pathlib import Path
from tqdm import tqdm
import significantdigits as sd

# When True, cached CSV files are ignored and everything is recomputed.
force = False

# Every location below is derived from the kernel's current working directory.
# NOTE(review): `results/` and `datasets/` are looked up directly under the
# working directory — confirm which directory the notebook must be started from.
root_dir = Path().resolve()
input_path = root_dir.joinpath("results", "fuzzy")
output_path = root_dir.joinpath("results", "stats")
# Disabled: uncomment to create the output directory on first run.
#output_path.mkdir(parents=True, exist_ok=True)
print(f"Input path: {input_path}")
print(f"Output path: {output_path}")

Input path: /lustre09/project/6049200/ychatel/mriqc-fuzzy/results/fuzzy
Output path: /lustre09/project/6049200/ychatel/mriqc-fuzzy/results/stats


In [2]:
def parse_file(filename):
    """Load one JSON file into a single-row DataFrame.

    The file is read with ``pd.read_json``. The ``bids_meta`` column is
    removed from the result, and only its ``'subject'`` entry is kept, as a
    new ``subject`` column.

    Parameters
    ----------
    filename : path-like
        Location of the JSON file to read.

    Returns
    -------
    pandas.DataFrame
        One row holding the first row of the parsed JSON (without
        ``bids_meta``) plus the ``subject`` column.
    """
    raw = pd.read_json(filename)
    bids_meta = raw['bids_meta']
    # Only the first row of the parsed frame is kept.
    first_row = raw.drop(columns=['bids_meta']).iloc[0]
    record = pd.DataFrame([first_row])
    record['subject'] = bids_meta['subject']
    return record

def parse_files(directory, force=False):
    """Gather every ``anat/*.json`` file under ``directory`` into one DataFrame.

    The result is cached as ``abide_Caltech_metrics.csv`` inside the
    module-level ``output_path``. When that file exists and ``force`` is
    false, it is read back and returned without touching ``directory``.

    Parameters
    ----------
    directory : path-like
        Root directory searched recursively for ``anat/*.json`` files.
    force : bool, default False
        When true, ignore any cached CSV and parse the JSON files again.

    Returns
    -------
    pandas.DataFrame
        One row per JSON file, with the columns produced by ``parse_file``
        plus ``repetition`` and ``file``. The cached branch returns data that
        went through CSV, so dtypes can differ from a fresh parse (for
        example ``file`` holds strings instead of ``Path`` objects).

    Raises
    ------
    ValueError
        If no ``anat/*.json`` file is found under ``directory``.
    """
    cache_file = output_path / "abide_Caltech_metrics.csv"
    if not force and cache_file.exists():
        print("File already exists, skipping parsing.")
        return pd.read_csv(cache_file)

    # Sorted so the row order is deterministic; rglob order depends on the
    # underlying filesystem.
    json_paths = sorted(Path(directory).rglob('anat/*.json'))
    if not json_paths:
        # pd.concat would otherwise fail with an opaque "No objects to concatenate".
        raise ValueError(f"No 'anat/*.json' files found under {directory}")

    df_list = []
    for path in tqdm(json_paths):
        df = parse_file(path)
        # Fifth path component from the end; presumably the repetition
        # directory — the on-disk layout is not visible here, confirm.
        df['repetition'] = path.parts[-5]
        df['file'] = path
        df_list.append(df)

    df = pd.concat(df_list, ignore_index=True)
    # Create the destination only once there is something to write, so a
    # missing directory cannot discard the parsing work done above.
    cache_file.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(cache_file, index=False)
    return df

# Load the per-file metrics table (read from the CSV cache unless `force` is set).
df = parse_files(input_path, force=force)

File already exists, skipping parsing.


## Distribution of numerical variability across subjects

In [3]:
# Per-(subject, metric) summary statistics, cached on disk.
stats_file = output_path / "abide_Caltech_metrics_stats.csv"

# The cache check is keyed on the stats file itself. Testing the metrics CSV
# instead would never trigger a computation, because parse_files() has
# already read or written that file by the time this cell runs.
if force or not stats_file.exists():

    def _sig_digits(values):
        """Significant digits (base 10) of `values`, using their mean as reference."""
        return float(sd.significant_digits(values, reference=np.mean(values), basis=10))

    stats = (
        df.melt(id_vars=['subject', 'repetition', 'file'])
        .groupby(['subject', 'variable'])['value']
        .agg(['mean', 'std', 'var', 'min', 'max', _sig_digits])
        # NOTE(review): this clips every aggregate at 0, so negative
        # mean/min/max values are zeroed as well as sig_digits — confirm
        # that this is intended.
        .clip(lower=0)
    )
    stats.columns = ['mean', 'std', 'var', 'min', 'max', 'sig_digits']
    stats = stats.reset_index()
    stats.to_csv(stats_file, index=False)

# Always go through the CSV so a fresh run and a cached run yield the same dtypes.
stats = pd.read_csv(stats_file)

In [4]:
import plotly.express as px

# Order rows from the highest to the lowest number of significant digits.
stats = stats.sort_values(by='sig_digits', ascending=False)

# One strip per metric, one point per subject.
fig = px.strip(
    stats,
    x='sig_digits',
    y='variable',
    title='Significant digits distribution across subject per metric',
    height=1600,
)
fig

In [5]:
import plotly.express as px

# Work on a copy restricted to strictly positive means (presumably because
# the x-axis is logarithmic), ordered from the largest mean down.
stats_copy = (
    stats.copy()
    .loc[lambda frame: frame['mean'] > 0]
    .sort_values(by='mean', ascending=False)
)

# Means are drawn as a strip plot; the standard deviations are overlaid as red crosses.
fig = (
    px.strip(
        stats_copy,
        x='mean',
        y='variable',
        title='Mean + std distribution across subject per metric',
        log_x=True,
        height=1600,
    )
    .add_scatter(
        x=stats_copy['std'],
        y=stats_copy['variable'],
        mode='markers',
        name='std',
        marker=dict(color='red', symbol='x'),
    )
    .update_xaxes(
        exponentformat='power',
        title='Mean (circle) and standard deviation (cross)',
    )
)
display(fig)

## Statistics averaged across subjects

In [6]:
import plotly.graph_objects as go

# Per-metric mean of the subject-level means.
stats_mean = stats.groupby('variable')['mean'].mean().reset_index()
# Per-metric standard deviation: square root of the mean subject-level variance.
stats_std = np.sqrt(stats.groupby('variable')['var'].mean()).reset_index()
stats_avg = stats_mean.merge(stats_std, on='variable').rename(columns={'var': 'std'})

fig = go.Figure()
fig.update_layout(title='Mean + std averaged across subjects per metric', height=1600)
fig.update_xaxes(
    type='log',
    exponentformat='power',
    title='Mean (circle) and standard deviation (cross)',
)
# Trace order is kept: std first, then mean.
fig.add_scatter(
    x=stats_avg['std'],
    y=stats_avg['variable'],
    mode='markers',
    name='std',
    marker=dict(color='red', symbol='x'),
)
fig.add_scatter(
    x=stats_avg['mean'],
    y=stats_avg['variable'],
    mode='markers',
    name='mean',
    marker=dict(color='blue', symbol='circle'),
)
fig

## Compute the Numerical-Anatomical Variability Ratio (NAVR)

In [7]:
def _pooled_sigma(frame, group_key, label):
    """Per-metric pooled standard deviation of `frame`.

    The variance of each metric is taken within every `group_key` group,
    averaged over the groups, and square-rooted. The result has the columns
    ``variable`` and `label`.
    """
    long_form = frame.melt(id_vars=['subject', 'repetition', 'file'])
    within_group_var = (
        long_form.groupby([group_key, 'variable'])['value'].var().reset_index()
    )
    pooled = (
        within_group_var.groupby('variable')['value'].mean().apply(np.sqrt).reset_index()
    )
    pooled.columns = ['variable', label]
    return pooled


df = parse_files(input_path, force=force)
# Variability within each subject (grouped by subject, i.e. across its rows).
sigma_num = _pooled_sigma(df, 'subject', 'sigma_num')
# Variability within each repetition (grouped by repetition, i.e. across subjects).
sigma_anat = _pooled_sigma(df, 'repetition', 'sigma_anat')
navr = sigma_num.merge(sigma_anat, on='variable').assign(
    navr=lambda table: table['sigma_num'] / table['sigma_anat']
)

File already exists, skipping parsing.


In [8]:
# One point per metric: ratio of sigma_num to sigma_anat.
px.scatter(
    navr,
    x='navr',
    y='variable',
    title='Numerical vs Anatomical Variability Ratio (NAVR)',
    height=800,
)

In [9]:
# Participant table of the Caltech site.
filename = root_dir.joinpath("datasets", "abide", "RawDataBIDS", "Caltech", "participants.tsv")
meta = pd.read_csv(filename, sep='\t').astype({'participant_id': int})

# Join the per-subject statistics with the participant table on integer ids,
# then drop every column that contains at least one missing value.
stats_ = (
    meta
    .merge(stats.astype({'subject': int}), left_on='participant_id', right_on='subject')
    .dropna(axis=1)
)

In [10]:
import plotly.express as px

# Highest number of significant digits first.
stats_ = stats_.sort_values(by='sig_digits', ascending=False)

# Same strip plot as before, with points coloured by the DX_GROUP column.
fig = px.strip(
    stats_,
    x='sig_digits',
    y='variable',
    title='Significant digits per metric',
    color='DX_GROUP',
    height=1600,
)
fig

In [11]:
import plotly.express as px

# Highest number of significant digits first.
stats_ = stats_.sort_values(by='sig_digits', ascending=False)

# Strip plot of significant digits per metric, with points coloured by the SEX column.
fig = px.strip(
    stats_,
    x='sig_digits',
    y='variable',
    title='Significant digits per metric',
    color='SEX',
    height=1600,
)
fig

In [12]:
# Significant digits against age at scan, one colour per metric.
px.scatter(
    stats_,
    x='AGE_AT_SCAN',
    y='sig_digits',
    color='variable',
    title='Significant digits per age at scan',
    height=800,
)