In [1]:
from nilearn.maskers import NiftiLabelsMasker
from nilearn.image import index_img, concat_imgs, resample_img
from nilearn.decoding import Decoder
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from scipy.ndimage import center_of_mass
import nibabel as nib
import numpy as np
import pandas as pd
import os

In [2]:

# Define data directory and mapping file
data_dir = '/home/zaz22/research-data-store/fmri/fmri_beijing'
mapping_file = '/home/zaz22/research-data-store/rois/rois_3000_beijing/rois/brain_atoms.mnc.gz'
repetition_time = 2.0

# Define subject IDs
subject_ids = [
    '9640133', '9783279', '9887336', '9890726', '4095748', '4136226', '4221029',
    '4225073', '4241194', '4256491', '4265987', '4334113', '4383707', '4475709',
    '4921428', '5150328', '5193577', '5575344', '5600820', '5669389', '5993008',
    '6187322', '6383713', '6477085', '6500128', '7011503', '7093319', '7135128',
    '7253183', '7390867', '7407032', '7689953', '7994085', '8191384', '8278680',
    '8328877', '8838009', '9002207', '9093997', '9210521', '9221927', '9578631',
    '3494778', '3554582', '3561920', '3562883', '3587000', '3593327', '3610134',
    '3624598', '3655623', '3672300', '3672854', '3691107', '3707771', '3712305',
    '3732101', '3739175', '3803759', '3809753', '3827352', '3856956', '3870624',
    '3889095', '3910672', '3930512', '3967265', '3976121', '3983607', '3993793',
    '3994098', '4006710', '4028266', '4048810', '4053388', '4053836', '4055710',
    '4073815', '4075719', '4091983', '2538839', '2559537', '2601519', '2659769',
    '2697768', '2703336', '2714224', '2737106', '2780647', '2833684', '2884672',
    '2897046', '2907951', '2910270', '2919220', '2940712', '2950754', '2984158',
    '3004580', '3086074', '3157406', '3194757', '3205761', '3212536', '3224401'
]

# ROI Extraction
mapping_img = nib.load(mapping_file)
mapping_data = mapping_img.get_fdata()
affine = mapping_img.affine
roi_labels = np.unique(mapping_data)


In [None]:
# Extract ROI coordinates
roi_coords = []
for label in roi_labels:
    roi_mask = mapping_data == label
    com_voxel = center_of_mass(roi_mask)
    com_world = nib.affines.apply_affine(affine, com_voxel)
    roi_coords.append(com_world)

print(f'ROI coordinates: {roi_coords}')

# Initialize NiftiLabelsMasker
labels_masker = NiftiLabelsMasker(
    labels_img=mapping_img,
    standardize=True,
    detrend=True,
    t_r=repetition_time,
    verbose=1
)

In [None]:

# Load functional data and extract time series
all_tseries = []
for sub_id in subject_ids:
    func_file = os.path.join(data_dir, f'fmri_X_{sub_id}_session_1_run1.nii.gz')
    if os.path.exists(func_file):
        func_img = nib.load(func_file)
        tseries = labels_masker.fit_transform(func_img)
        all_tseries.append(tseries)
        print(f"Extracted time series for subject {sub_id}, shape: {tseries.shape}")
    else:
        print(f"File not found for subject {sub_id}: {func_file}")

# Concatenate all time series
X = np.vstack(all_tseries)

In [None]:

# Load behavioral data and filter based on diagnosis
behavioral_file = '/home/zaz22/repos/adhd-mri-using-ml/adhd200_preprocessed_phenotypics.tsv'
behavioral = pd.read_csv(behavioral_file, sep='\t')
conditions_map = {'0': 'Non-ADHD', '1': 'ADHD', '2': 'ADHD', '3': 'ADHD'}
behavioral['Diagnosis_Class'] = behavioral['DX'].map(conditions_map)
behavioral = behavioral[behavioral['Diagnosis_Class'].notna()]
y = behavioral['Diagnosis_Class'].values

# Dimensionality reduction and scaling pipeline
scaler = StandardScaler()
variance_selector = VarianceThreshold(threshold=0.01)
pca = PCA(n_components=50)  # Adjust based on explained variance ratio

# Decoder with GridSearchCV for hyperparameter tuning
decoder = Decoder(estimator='svc', cv=5, scoring='accuracy')
param_grid = {'decoder__estimator__C': [0.01, 0.1, 1, 10, 100]}  # Adjust hyperparameters here

# Combine preprocessing and decoding
pipeline = Pipeline([
    ('scaler', scaler),
    ('variance_selector', variance_selector),
    ('pca', pca),
    ('decoder', decoder)
])
print(pipeline.get_params())


grid_search = GridSearchCV(
    pipeline,
    param_grid={'decoder__estimator__C': [0.01, 0.1, 1, 10, 100]},
    cv=3,
    scoring='accuracy',
    n_jobs=-1
)


In [9]:
# Align fMRI data (X) with behavioral data (y)
valid_subject_ids = [
    sub_id for sub_id in subject_ids if sub_id in behavioral['ScanDir ID'].astype(str).tolist()
]

filtered_tseries = []
filtered_labels = []

for idx, sub_id in enumerate(subject_ids):
    if sub_id in valid_subject_ids:
        filtered_tseries.append(all_tseries[idx])  # Keep only valid time series
        filtered_labels.append(behavioral.loc[behavioral['ScanDir ID'] == int(sub_id), 'Diagnosis_Class'].values[0])

# Convert to numpy arrays
X = np.vstack(filtered_tseries)
y = np.array(filtered_labels)

# Verify alignment
print(f"Shape of X: {X.shape}")
print(f"Length of y: {len(y)}")

Shape of X: (24776, 2843)
Length of y: 105


In [10]:
# Aggregate time series data by subject
aggregated_tseries = []
for tseries in filtered_tseries:
    aggregated_tseries.append(np.mean(tseries, axis=0))  # Average across time points

# Convert to numpy arrays
X = np.vstack(aggregated_tseries)  # One row per subject
y = np.array(filtered_labels)      # Labels remain unchanged

# Verify alignment
print(f"Shape of X: {X.shape}")  # Should match (105, number_of_ROIs)
print(f"Length of y: {len(y)}")  # Should match the number of subjects (105)

# Cross-validation
kf = KFold(n_splits=10, shuffle=True, random_state=42)
fold_accuracies = []
training_losses = []
validation_losses = []

for fold, (train_idx, test_idx) in enumerate(kf.split(X), start=1):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Fit model
    grid_search.fit(X_train, y_train)

    # Training loss
    train_preds = grid_search.predict(X_train)
    train_loss = 1 - accuracy_score(y_train, train_preds)
    training_losses.append(train_loss)

    # Validation loss
    val_preds = grid_search.predict(X_test)
    val_loss = 1 - accuracy_score(y_test, val_preds)
    validation_losses.append(val_loss)

    # Accuracy
    accuracy = accuracy_score(y_test, val_preds)
    fold_accuracies.append(accuracy)
    print(f"Fold {fold}: Accuracy = {accuracy:.2f}, Training Loss = {train_loss:.2f}, Validation Loss = {val_loss:.2f}")

# Overall results
mean_accuracy = np.mean(fold_accuracies)
print(f"Mean Cross-Validation Accuracy: {mean_accuracy:.2f}")

Shape of X: (105, 2843)
Length of y: 105


AttributeError: 'str' object has no attribute 'set_params'