In [None]:
import os
import glob
import time
import numpy as np
import pandas as pd
from tqdm import trange
from collections import defaultdict
import tensorflow as tf
import matplotlib.pyplot as plt

import skfda
from skfda import FDataGrid
from skfda.preprocessing.registration import ElasticRegistration
from skfda.preprocessing.registration.validation import AmplitudePhaseDecomposition
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from config import param_lasso, param_ridge, param_rf
from model_utils import align_funcdata, prep_data, eval_model, train

In [None]:
# Directory containing the cleaned fMRI CSV files (placeholder path; set for your environment).
data_dir = ".../Clean_fMRI"

# glob.glob returns matches in arbitrary, filesystem-dependent order. Sort the
# paths so the order of X / y is the same on every machine and every run;
# otherwise results that depend on sample order are not reproducible.
file_paths = sorted(glob.glob(os.path.join(data_dir, "*.csv")))
if not file_paths:
    # Fail here with a clear message instead of an IndexError on X[0] below.
    raise FileNotFoundError(f"No CSV files found in {data_dir!r}")

# One DataFrame per CSV file; the first CSV column is used as the index.
data = [pd.read_csv(file_path, index_col=0) for file_path in file_paths]

# Features: every column from position 16 onward.
# NOTE(review): presumably the first 16 columns are metadata/phenotype fields — confirm.
X = [df.iloc[:, 16:] for df in data]
# Label: 1 when the first row's 'Phenotype_Diagnosis_L1' equals 'ADHD Diagnosed', else 0.
y = [int(df['Phenotype_Diagnosis_L1'].iloc[0] == 'ADHD Diagnosed') for df in data]

# Sanity check: number of files, then rows and feature columns of the first file.
len(X), X[0].shape[0], X[0].shape[1]

In [None]:
# Number of rows per Region_ID in the first file's table, as a two-column frame
# (Region_ID, Count) sorted by Region_ID with a fresh 0..n-1 index.
region_data = (
    data[0]["Region_ID"]
    .value_counts()
    .rename_axis("Region_ID")
    .reset_index(name="Count")
    .sort_values(by="Region_ID")
    .reset_index(drop=True)
)

# Align the full data set `X`. The original passed `X_test`, which is only
# created later inside the training loop, so the cell failed on a fresh kernel
# (or silently used a stale split left over from a previous run).
# The original also assigned to the misspelled name `aligend_funclist` and then
# read `aligned_funclist`, which raised NameError.
aligned_funclist = align_funcdata(X, region_data, grid_points=X[0].shape[1])

# NOTE(review): `calc_curvelen` is not imported in the visible import cell —
# presumably it lives in model_utils next to align_funcdata; confirm and add it
# to that import, otherwise this line raises NameError.
# NOTE(review): this overwrites `X`, so the cell is not idempotent — re-run the
# data-loading cell before re-running this one.
X = calc_curvelen(aligned_funclist)

In [None]:
# Number of repeated train/test splits; split i is generated with seed=i.
N_ITERATIONS = 100

# metrics[model_name][metric_name] -> list with one value per iteration.
metrics = {
    'lda': defaultdict(list),
    'lasso': defaultdict(list),
    'ridge': defaultdict(list),
    'rf': defaultdict(list)
}


def _record_metrics(model_name, fitted_model, X_eval, y_eval):
    """Evaluate `fitted_model` with eval_model and append each metric to `metrics`.

    Parameters
    ----------
    model_name : str
        Key of the model in the module-level `metrics` dict.
    fitted_model : estimator
        Already-fitted model passed to eval_model.
    X_eval, y_eval :
        Held-out features and labels passed to eval_model.
    """
    for metric_name, value in eval_model(fitted_model, X_eval, y_eval).items():
        metrics[model_name][metric_name].append(value)


start = time.time()

for i in trange(N_ITERATIONS, desc="Running Iterations"):
    X_train, X_test, y_train, y_test = prep_data(X, y, seed=i)

    # LDA: no hyper-parameter search, fit directly.
    lda = LinearDiscriminantAnalysis()
    lda.fit(X_train, y_train)
    _record_metrics('lda', lda, X_test, y_test)

    # The stochastic estimators below are seeded with the iteration index so
    # that, together with the seeded split, every iteration is reproducible.
    # Unseeded, the reported averages change from run to run.
    # NOTE(review): assumes train() keeps the estimator's random_state
    # (e.g. by cloning it for a grid search) — confirm.

    # Lasso (L1-penalised logistic regression)
    lasso = LogisticRegression(penalty='l1', solver='liblinear', random_state=i)
    best_lasso = train(lasso, param_lasso, X_train, y_train)
    _record_metrics('lasso', best_lasso, X_test, y_test)

    # Ridge (L2-penalised logistic regression)
    ridge = LogisticRegression(penalty='l2', solver='liblinear', random_state=i)
    best_ridge = train(ridge, param_ridge, X_train, y_train)
    _record_metrics('ridge', best_ridge, X_test, y_test)

    # Random Forest
    rf = RandomForestClassifier(random_state=i)
    best_rf = train(rf, param_rf, X_train, y_train)
    _record_metrics('rf', best_rf, X_test, y_test)

end = time.time()

# Summary table: mean of every metric over all iterations, one row per model.
divider = "-" * 55

print(f"{'Model':<10} {'Accuracy':>10} {'F1 Score':>10} {'Precision':>10} {'Recall':>10}")
print(divider)

for model_name in ('lda', 'lasso', 'ridge', 'rf'):
    per_model = metrics[model_name]
    # Column order matches the header: accuracy, F1, precision, recall.
    averages = [
        np.mean(per_model[metric_key])
        for metric_key in ('accuracy', 'f1_score', 'precision', 'recall')
    ]
    value_cells = " ".join(f"{average:10.4f}" for average in averages)
    print(f"{model_name.upper():<10} {value_cells}")

print(divider)

# Wall-clock duration of the training loop, rounded to whole minutes.
elapsed_minutes = round((end - start) / 60)
print(f"Total Time: {elapsed_minutes} min")