# SVM Classification of fMRI Data from the ds002336 Dataset

## Imports

In [None]:
from sklearn.svm import LinearSVC, SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.svm import SVC

from tqdm.notebook import tqdm
from copy import deepcopy
import seaborn as sns
import numpy as np
import pandas as pd
import nibabel as nib

## Main parameters

In [None]:
# Number of repeated train/validation splits evaluated per model
FOLDS = 5

# Trial types kept for the binary classification
SELECTED_CLASSES = ['Rest', 'Task-NF']

# Subject identifiers 101 through 110 (inclusive)
SELECTED_SUBJECTS = list(range(101, 111))

# Values of the 'runs' column that make up each split; both splits use the
# same placeholder because there are no runs in XP1
RUN_SET_SPLITS = dict(train=['-'], test=['-'])

# Semicolon-separated label table; need to be set
DATASET_FILE = r"..\labels.csv"

## Data handling

In [None]:
def load_set(set_df: pd.DataFrame, flatten: bool = True, time_first: bool = True, averaged_over_time: bool = False) -> tuple:
    """Load every image listed in ``set_df`` and stack the images and labels.

    For each row the file at ``row['ext_frmi_pths']`` is read with nibabel,
    optionally rearranged, and appended to the data; ``row['trial_ids']`` is
    repeated once per element along the first axis of the arranged array.

    Args:
        set_df: Table with an ``ext_frmi_pths`` column (image paths) and a
            ``trial_ids`` column (label of each image).
        flatten: If true, collapse all non-time axes into a single axis. The
            result is ``(times, voxels)`` with ``time_first`` and
            ``(voxels, times)`` without it.
        time_first: If true, move the last (time) axis to the front.
        averaged_over_time: If true, replace the time axis by its mean while
            keeping it as an axis of length one.

    Returns:
        A ``(data, labels)`` tuple of arrays, each concatenated along axis 0
        over all rows of ``set_df``.

    Raises:
        ValueError: If ``set_df`` has no rows.
    """
    if len(set_df) == 0:
        # Fail with an explicit message instead of the opaque error that
        # np.concatenate raises for an empty list.
        raise ValueError('load_set received a set_df with no rows; nothing to load')

    data = []
    labels = []
    for _, row in set_df.iterrows():
        # Loaded dimensions height x width x slices x times
        loaded_data = nib.load(row['ext_frmi_pths']).get_fdata()

        if time_first:
            loaded_data = loaded_data.transpose(3, 0, 1, 2)
            time_axis = 0
            if flatten:
                # One row per time point, all voxels of that time point in the columns
                loaded_data = loaded_data.reshape((loaded_data.shape[0], -1))
        else:
            time_axis = -1
            if flatten:
                # One row per voxel, the time course of that voxel in the columns
                loaded_data = loaded_data.reshape((-1, loaded_data.shape[-1]))

        if averaged_over_time:
            # NumPy spells the keyword `keepdims`; averaging is applied for
            # every layout so the flag is never silently ignored.
            loaded_data = loaded_data.mean(axis=time_axis, keepdims=True)

        data.append(loaded_data)
        # NOTE(review): labels follow the first axis, which is only the time
        # axis when time_first is true - confirm the intended behaviour for
        # time_first=False.
        labels.append(np.array([row['trial_ids']]).repeat(loaded_data.shape[0]))

    data = np.concatenate(data, axis=0)
    labels = np.concatenate(labels, axis=0)

    return data, labels

In [None]:
def load_and_split_dataset(dataset_file: str, selected_classes: list, selected_subjects: list, train_runs: list, test_runs: list):
    """Read the label table, filter it, and return standardized train/test arrays.

    Args:
        dataset_file: Path of a semicolon-separated CSV file with at least the
            columns ``subjects``, ``trial_types`` and ``runs`` (plus the
            columns that ``load_set`` reads).
        selected_classes: Values of ``trial_types`` to keep.
        selected_subjects: Values of ``subjects`` to keep.
        train_runs: Values of ``runs`` that form the training split.
        test_runs: Values of ``runs`` that form the test split.

    Returns:
        ``(train_data, train_labels, test_data, test_labels)``. Both data
        arrays are standardized with the per-feature mean and variance of the
        training data.
    """
    # Load dataset
    dataset = pd.read_csv(dataset_file, sep=';')

    # Filter subjects and classes; membership tests avoid building a query
    # string, so empty lists and values containing quotes are handled too
    keep_rows = dataset['subjects'].isin(selected_subjects) & dataset['trial_types'].isin(selected_classes)
    dataset = dataset.loc[keep_rows].reset_index(drop=True)

    # Split set file into train and test splits
    train_set_file = dataset.loc[dataset['runs'].isin(train_runs)]
    test_set_file = dataset.loc[dataset['runs'].isin(test_runs)]

    # Load data sets and labels
    train_data, train_labels = load_set(train_set_file)
    test_data, test_labels = load_set(test_set_file)

    # Standardize data: the scaler is fitted on the training split only and
    # then applied unchanged to the test split, so both splits share one
    # feature scale and no test statistics are used.
    scaler = StandardScaler()
    train_data = scaler.fit_transform(train_data)
    test_data = scaler.transform(test_data)

    return train_data, train_labels, test_data, test_labels

# Training sessions

## Subject-wise SVMs

### Linear SVM

In [None]:
# Per-subject linear SVMs: for every subject, train on FOLDS seeded random
# 80/20 splits and keep the accuracies plus the best-scoring model.
subject_linear_svcs = dict()

folds = FOLDS
with tqdm(total=folds * len(SELECTED_SUBJECTS), leave=True) as pbar:
    for subject in SELECTED_SUBJECTS:
        print(f'Subject {subject}')
        subject_linear_svcs[str(subject)] = {
            'model': None,
            'accuracies': []
        }

        train_data, train_labels, _, _ = \
            load_and_split_dataset(DATASET_FILE, SELECTED_CLASSES, [subject], RUN_SET_SPLITS['train'], RUN_SET_SPLITS['test'])

        best_acc = 0.
        for fold in range(folds):
            print(f'Fold {fold}')

            # Sample indices of each class, shuffled reproducibly per fold
            zero_indices = np.flatnonzero(train_labels == 0)
            one_indices = np.flatnonzero(train_labels == 1)
            rnd = np.random.RandomState(seed=fold)
            rnd.shuffle(zero_indices)
            rnd.shuffle(one_indices)
            # Interleave the classes (0, 1, 0, 1, ...) so any prefix is balanced.
            # NOTE(review): the assignments below require both classes to
            # have the same number of samples.
            fold_indices = np.zeros(zero_indices.shape[0]*2, dtype=np.longlong)
            fold_indices[::2] = zero_indices
            fold_indices[1::2] = one_indices

            # First 80% of the interleaved order trains, the rest validates;
            # slicing the index array first avoids copying the data twice
            train_length = int(len(train_data) * 0.8)
            fold_train_indices = fold_indices[:train_length]
            fold_test_indices = fold_indices[train_length:]
            fold_train_data, fold_train_labels = train_data[fold_train_indices], train_labels[fold_train_indices]
            fold_test_data, fold_test_labels = train_data[fold_test_indices], train_labels[fold_test_indices]

            linear_svc = SVC(kernel='linear', verbose=1, C=0.1)
            linear_svc.fit(fold_train_data, fold_train_labels)

            predictions = linear_svc.predict(fold_test_data)
            # accuracy_score takes (y_true, y_pred), as in the inter-subject cells
            fold_acc = accuracy_score(fold_test_labels, predictions)
            print(f'Fold accuracy: {fold_acc * 100}%')
            if fold_acc > best_acc:
                best_acc = fold_acc
                subject_linear_svcs[str(subject)]['model'] = deepcopy(linear_svc)
            subject_linear_svcs[str(subject)]['accuracies'].append(fold_acc)

            pbar.update(1)

#### Plot accuracies

In [None]:
# Collect the fold accuracies of every subject-wise linear SVM in long format
for_df = {
    'accuracy': [],
    'subject': []
}
for subject in SELECTED_SUBJECTS:
    accuracies = subject_linear_svcs[str(subject)]['accuracies']
    for_df['accuracy'].extend(accuracies)
    # One label per recorded fold, so both columns keep the same length for
    # any number of folds
    for_df['subject'].extend([f'xp{subject}'] * len(accuracies))
df = pd.DataFrame.from_dict(for_df)

import seaborn as sns
sns.set_theme(style="whitegrid")

# Bar plot of the mean fold accuracy per subject; error bars show the
# standard deviation over the folds
g = sns.catplot(
    data=df, kind="bar",
    x="subject", y="accuracy",
    errorbar="sd", palette="dark", alpha=.7, height=8
)
g.despine(left=True)
g.set_axis_labels("Patient", "Accuracy")
g.fig.suptitle("Classification Accuracies of Separate SVMs per Subject")
# Same fixed accuracy range as the RBF plot, so the two figures are comparable
g.set(ylim=(0, 1))

### RBF SVM

In [None]:
# Per-subject RBF SVMs: for every subject, train on FOLDS seeded random
# 80/20 splits and keep the accuracies plus the best-scoring model.
subject_rbf_svcs = dict()

folds = FOLDS
with tqdm(total=folds * len(SELECTED_SUBJECTS), leave=True) as pbar:
    for subject in SELECTED_SUBJECTS:
        print(f'Subject {subject}')
        subject_rbf_svcs[str(subject)] = {
            'model': None,
            'accuracies': []
        }

        train_data, train_labels, _, _ = \
            load_and_split_dataset(DATASET_FILE, SELECTED_CLASSES, [subject], RUN_SET_SPLITS['train'], RUN_SET_SPLITS['test'])

        best_acc = 0.
        for fold in range(folds):
            print(f'Fold {fold}')

            # Sample indices of each class, shuffled reproducibly per fold
            zero_indices = np.flatnonzero(train_labels == 0)
            one_indices = np.flatnonzero(train_labels == 1)
            rnd = np.random.RandomState(seed=fold)
            rnd.shuffle(zero_indices)
            rnd.shuffle(one_indices)
            # Interleave the classes (0, 1, 0, 1, ...) so any prefix is balanced.
            # NOTE(review): the assignments below require both classes to
            # have the same number of samples.
            fold_indices = np.zeros(zero_indices.shape[0]*2, dtype=np.longlong)
            fold_indices[::2] = zero_indices
            fold_indices[1::2] = one_indices

            # First 80% of the interleaved order trains, the rest validates;
            # slicing the index array first avoids copying the data twice
            train_length = int(len(train_data) * 0.8)
            fold_train_indices = fold_indices[:train_length]
            fold_test_indices = fold_indices[train_length:]
            fold_train_data, fold_train_labels = train_data[fold_train_indices], train_labels[fold_train_indices]
            fold_test_data, fold_test_labels = train_data[fold_test_indices], train_labels[fold_test_indices]

            rbf_svc = SVC(kernel='rbf', verbose=1, C=0.1, max_iter=1000)
            rbf_svc.fit(fold_train_data, fold_train_labels)

            predictions = rbf_svc.predict(fold_test_data)
            # accuracy_score takes (y_true, y_pred), as in the inter-subject cells
            fold_acc = accuracy_score(fold_test_labels, predictions)
            print(f'Fold accuracy: {fold_acc * 100}%')
            if fold_acc > best_acc:
                best_acc = fold_acc
                subject_rbf_svcs[str(subject)]['model'] = deepcopy(rbf_svc)
            subject_rbf_svcs[str(subject)]['accuracies'].append(fold_acc)

            pbar.update(1)

#### Plot accuracies

In [None]:
# Collect the fold accuracies of every subject-wise RBF SVM in long format
for_df = {
    'accuracy': [],
    'subject': []
}
for subject in SELECTED_SUBJECTS:
    accuracies = subject_rbf_svcs[str(subject)]['accuracies']
    for_df['accuracy'].extend(accuracies)
    # One label per recorded fold, so both columns keep the same length for
    # any number of folds
    for_df['subject'].extend([f'xp{subject}'] * len(accuracies))
df = pd.DataFrame.from_dict(for_df)

sns.set_theme(style="whitegrid")
# Bar plot of the mean fold accuracy per subject; error bars show the
# standard deviation over the folds
g = sns.catplot(
    data=df, kind="bar",
    x="subject", y="accuracy",
    errorbar="sd", palette="dark", alpha=.7, height=8
)
g.despine(left=True)
g.set_axis_labels("Patient", "Accuracy")
g.fig.suptitle("Classification Accuracies of Separate SVMs per Subject")
g.set(ylim=(0, 1))

## Inter-subject SVMs

### Linear SVM

In [None]:
# Inter-subject linear SVM: all selected subjects are pooled, then FOLDS
# seeded random 80/20 splits are trained and scored.
mixed_linear_svcs = {
    'model': None,
    'accuracies': []
}

folds = FOLDS
with tqdm(total=folds, leave=True) as pbar:
    train_data, train_labels, _, _ = \
        load_and_split_dataset(DATASET_FILE, SELECTED_CLASSES, SELECTED_SUBJECTS, RUN_SET_SPLITS['train'], RUN_SET_SPLITS['test'])

    best_acc = 0.
    for fold in range(folds):
        print(f'Fold {fold}')

        # Sample indices of each class, shuffled reproducibly per fold
        zero_indices = np.flatnonzero(train_labels == 0)
        one_indices = np.flatnonzero(train_labels == 1)
        rnd = np.random.RandomState(seed=fold)
        rnd.shuffle(zero_indices)
        rnd.shuffle(one_indices)
        # Interleave the classes (0, 1, 0, 1, ...) so any prefix is balanced.
        # NOTE(review): the assignments below require both classes to have
        # the same number of samples.
        fold_indices = np.zeros(zero_indices.shape[0]*2, dtype=np.longlong)
        fold_indices[::2] = zero_indices
        fold_indices[1::2] = one_indices

        # First 80% of the interleaved order trains, the rest validates;
        # slicing the index array first avoids copying the pooled data twice
        train_length = int(len(train_data) * 0.8)
        fold_train_indices = fold_indices[:train_length]
        fold_test_indices = fold_indices[train_length:]
        fold_train_data, fold_train_labels = train_data[fold_train_indices], train_labels[fold_train_indices]
        fold_test_data, fold_test_labels = train_data[fold_test_indices], train_labels[fold_test_indices]

        linear_svc = SVC(verbose=1, C=0.1, kernel='linear')
        linear_svc.fit(fold_train_data, fold_train_labels)

        predictions = linear_svc.predict(fold_test_data)
        fold_acc = accuracy_score(fold_test_labels, predictions)
        print(f'Fold accuracy: {fold_acc * 100}%')
        if fold_acc > best_acc:
            best_acc = fold_acc
            mixed_linear_svcs['model'] = deepcopy(linear_svc)
        mixed_linear_svcs['accuracies'].append(fold_acc)

        pbar.update(1)

#### Plot metrics

In [None]:
# Refit one inter-subject linear SVM on the seed-1 split so that its
# predictions and coefficients can be inspected in the following cells.
train_data, train_labels, _, _ = \
    load_and_split_dataset(DATASET_FILE, SELECTED_CLASSES, SELECTED_SUBJECTS, RUN_SET_SPLITS['train'], RUN_SET_SPLITS['test'])

# Sample indices of each class, shuffled reproducibly
zero_indices = np.flatnonzero(train_labels == 0)
one_indices = np.flatnonzero(train_labels == 1)
rnd = np.random.RandomState(seed=1)
rnd.shuffle(zero_indices)
rnd.shuffle(one_indices)
# Interleave the classes (0, 1, 0, 1, ...) so any prefix is balanced.
# NOTE(review): the assignments below require both classes to have the same
# number of samples.
fold_indices = np.zeros(zero_indices.shape[0]*2, dtype=np.longlong)
fold_indices[::2] = zero_indices
fold_indices[1::2] = one_indices

# First 80% of the interleaved order trains, the rest validates; slicing the
# index array first avoids copying the pooled data twice
train_length = int(len(train_data) * 0.8)
fold_train_indices = fold_indices[:train_length]
fold_test_indices = fold_indices[train_length:]
fold_train_data, fold_train_labels = train_data[fold_train_indices], train_labels[fold_train_indices]
fold_test_data, fold_test_labels = train_data[fold_test_indices], train_labels[fold_test_indices]

linear_svc = SVC(verbose=1, C=0.1, kernel='linear')
linear_svc.fit(fold_train_data, fold_train_labels)

In [None]:
# Score the refitted linear SVM on the held-out part of the split
predictions = linear_svc.predict(fold_test_data)
acc = accuracy_score(fold_test_labels, predictions)
print(f'Accuracy: {acc * 100}%')

# Rows of the confusion matrix are true labels, columns are predictions
conf_mat = confusion_matrix(fold_test_labels, predictions)
# fmt='d' prints the integer counts verbatim; the default '.2g' format would
# render counts of 100 or more in scientific notation
ax = sns.heatmap(conf_mat, annot=True, fmt='d')
ax.set(xlabel='Predicted label', ylabel='True label')

### RBF SVM

In [None]:
# Inter-subject RBF SVM: all selected subjects are pooled, then FOLDS seeded
# random 80/20 splits are trained and scored.
mixed_rbf_svcs = {
    'model': None,
    'accuracies': []
}

folds = FOLDS
with tqdm(total=folds, leave=True) as pbar:
    train_data, train_labels, _, _ = \
        load_and_split_dataset(DATASET_FILE, SELECTED_CLASSES, SELECTED_SUBJECTS, RUN_SET_SPLITS['train'], RUN_SET_SPLITS['test'])

    best_acc = 0.
    for fold in range(folds):
        print(f'Fold {fold}')

        # Sample indices of each class, shuffled reproducibly per fold
        zero_indices = np.flatnonzero(train_labels == 0)
        one_indices = np.flatnonzero(train_labels == 1)
        rnd = np.random.RandomState(seed=fold)
        rnd.shuffle(zero_indices)
        rnd.shuffle(one_indices)
        # Interleave the classes (0, 1, 0, 1, ...) so any prefix is balanced.
        # NOTE(review): the assignments below require both classes to have
        # the same number of samples.
        fold_indices = np.zeros(zero_indices.shape[0]*2, dtype=np.longlong)
        fold_indices[::2] = zero_indices
        fold_indices[1::2] = one_indices

        # First 80% of the interleaved order trains, the rest validates;
        # slicing the index array first avoids copying the pooled data twice
        train_length = int(len(train_data) * 0.8)
        fold_train_indices = fold_indices[:train_length]
        fold_test_indices = fold_indices[train_length:]
        fold_train_data, fold_train_labels = train_data[fold_train_indices], train_labels[fold_train_indices]
        fold_test_data, fold_test_labels = train_data[fold_test_indices], train_labels[fold_test_indices]

        rbf_svc = SVC(verbose=1, C=0.1, kernel='rbf', max_iter=1000)
        rbf_svc.fit(fold_train_data, fold_train_labels)

        predictions = rbf_svc.predict(fold_test_data)
        fold_acc = accuracy_score(fold_test_labels, predictions)
        print(f'Fold accuracy: {fold_acc * 100}%')
        if fold_acc > best_acc:
            best_acc = fold_acc
            mixed_rbf_svcs['model'] = deepcopy(rbf_svc)
        mixed_rbf_svcs['accuracies'].append(fold_acc)

        pbar.update(1)

In [None]:
# Refit one inter-subject RBF SVM on the seed-1 split so that its predictions
# can be inspected in the following cell.
train_data, train_labels, _, _ = \
    load_and_split_dataset(DATASET_FILE, SELECTED_CLASSES, SELECTED_SUBJECTS, RUN_SET_SPLITS['train'], RUN_SET_SPLITS['test'])

# Sample indices of each class, shuffled reproducibly
zero_indices = np.flatnonzero(train_labels == 0)
one_indices = np.flatnonzero(train_labels == 1)
rnd = np.random.RandomState(seed=1)
rnd.shuffle(zero_indices)
rnd.shuffle(one_indices)
# Interleave the classes (0, 1, 0, 1, ...) so any prefix is balanced.
# NOTE(review): the assignments below require both classes to have the same
# number of samples.
fold_indices = np.zeros(zero_indices.shape[0]*2, dtype=np.longlong)
fold_indices[::2] = zero_indices
fold_indices[1::2] = one_indices

# First 80% of the interleaved order trains, the rest validates; slicing the
# index array first avoids copying the pooled data twice
train_length = int(len(train_data) * 0.8)
fold_train_indices = fold_indices[:train_length]
fold_test_indices = fold_indices[train_length:]
fold_train_data, fold_train_labels = train_data[fold_train_indices], train_labels[fold_train_indices]
fold_test_data, fold_test_labels = train_data[fold_test_indices], train_labels[fold_test_indices]

# NOTE(review): no max_iter cap is passed here, whereas the cross-validation
# cell uses max_iter=1000 - confirm that the difference is intended.
rbf_svc = SVC(verbose=1, C=0.1, kernel='rbf')
rbf_svc.fit(fold_train_data, fold_train_labels)

In [None]:
# Score the refitted RBF SVM on the held-out part of the split
predictions = rbf_svc.predict(fold_test_data)
acc = accuracy_score(fold_test_labels, predictions)
print(f'Accuracy: {acc * 100}%')

# Rows of the confusion matrix are true labels, columns are predictions
conf_mat = confusion_matrix(fold_test_labels, predictions)
# fmt='d' prints the integer counts verbatim; the default '.2g' format would
# render counts of 100 or more in scientific notation
ax = sns.heatmap(conf_mat, annot=True, fmt='d')
ax.set(xlabel='Predicted label', ylabel='True label')

## Visualize linear SVM coefficients on top of the original fMRI data

In [None]:
import matplotlib.pyplot as plt
from skimage.util import montage
import nibabel as nib
from nilearn.plotting import plot_glass_brain
from skimage.transform import rescale

# Reference volume used as anatomical background
sample_volume = nib.load(r"..\data\ds002338\PreProcessed\Trials\XP1\sub-101\1_Task-NF_fmri.nii.gz").get_fdata()  # need to be set
# Third axis first, then average over the last (time) axis
base = sample_volume.transpose(2, 0, 1, 3).mean(-1)

# The classifier saw each time point flattened from its three spatial axes,
# so the weight vector is folded back with the spatial shape of the loaded
# volume instead of hard-coded dimensions.
# NOTE(review): assumes this file has the same spatial shape as the training
# images - confirm.
subject_coeffs = linear_svc.coef_
subject_coeffs = subject_coeffs.squeeze(0).reshape(sample_volume.shape[:3]).transpose(2, 0, 1)
subj_coefs_pos = np.where(subject_coeffs > 0, subject_coeffs, 0)
subj_coefs_neg = np.where(subject_coeffs < 0, np.abs(subject_coeffs), 0)

# Visualize activation maps based on SVM Coefficients
plt.figure(figsize=(20, 20))
# Zero entries (including the tiles montage pads with fill=0) are masked, so
# each layer only paints its own voxels and the negative layer no longer
# hides the positive one.
pos_layer = np.ma.masked_equal(montage(subj_coefs_pos[1:], fill=0), 0)
neg_layer = np.ma.masked_equal(montage(subj_coefs_neg[1:], fill=0), 0)
plt.imshow(pos_layer, cmap="Reds")
plt.colorbar(label="Positive coefficient")
plt.imshow(neg_layer, cmap="Blues")
plt.colorbar(label="|Negative coefficient|")
plt.imshow(montage(base[1:]), cmap="gray", alpha=0.5)
plt.grid(False)
plt.show()

In [None]:
# Visualize activation maps based on SVM Coefficients, weighted voxel-wise by
# the time-averaged reference volume
plt.figure(figsize=(20, 20))
# Zero entries (including the tiles montage pads with fill=0) are masked, so
# each layer only paints its own voxels and the negative layer no longer
# hides the positive one.
pos_layer = np.ma.masked_equal(montage(subj_coefs_pos[1:] * base[1:], fill=0), 0)
neg_layer = np.ma.masked_equal(montage(subj_coefs_neg[1:] * base[1:], fill=0), 0)
plt.imshow(pos_layer, cmap="Reds")
plt.colorbar(label="Positive coefficient x base")
plt.imshow(neg_layer, cmap="Blues")
plt.colorbar(label="|Negative coefficient| x base")
plt.imshow(montage(base[1:]), cmap="gray", alpha=0.5)
plt.grid(False)
plt.show()