### Packages

In [None]:
# import cupy as cp
import torch
import hcp_utils as hcp # https://rmldj.github.io/hcp-utils/
from statsmodels.stats.multitest import multipletests
import matlab.engine
import os
import json
import psutil
import random
import numpy as np
import pandas as pd
import re
import psutil
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.linalg import eigh, svd, logm
from scipy.stats import norm
from sklearn.decomposition import FastICA, PCA
from sklearn.covariance import LedoitWolf
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from nilearn import image as nimg
from nilearn import plotting
import nibabel as nib
from pyriemann.estimation import Covariances
from pyriemann.utils.mean import mean_covariance
from pyriemann.utils.tangentspace import tangent_space, untangent_space, log_map_riemann, unupper
from pyriemann.utils.distance import distance_riemann, distance
from pyriemann.utils.base import logm, expm
from concurrent.futures import ProcessPoolExecutor, TimeoutError
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold

### Options

In [None]:
# Analysis settings. They are written to <outputfolder>/settings.json below so
# every results folder records the configuration that produced it.
settings = {
    "phenotype": "ReadEng_AgeAdj",
    "percentile": 0.2,
    "outputfolder": "regression_pmat",
    "n_folds": 5,
    "TanSVM_C": 1,
    "random_state": 42,
    "n_filters_per_group": 1,
    "Tangent_Class": True,
    "metric": "logeuclid"
}

# Create the output folder if needed. exist_ok=True avoids the
# check-then-create race of `if not os.path.exists(...)`.
outputfolder = settings["outputfolder"]
os.makedirs(outputfolder, exist_ok=True)

# Define the path for the settings file
settings_filepath = os.path.join(outputfolder, "settings.json")

# Save the settings to a JSON file
with open(settings_filepath, "w") as f:
    json.dump(settings, f, indent=4)

print(f"Settings have been saved to {settings_filepath}")

# Unpack the individual settings into module-level names used by later cells
phenotype = settings["phenotype"]
percentile = settings["percentile"]
n_folds = settings["n_folds"]
TanSVM_C = settings["TanSVM_C"]
random_state = settings["random_state"]
n_filters_per_group = settings["n_filters_per_group"]
Tangent_Class = settings["Tangent_Class"]
# Pyriemannian Mean https://github.com/pyRiemann/pyRiemann/blob/master/pyriemann/utils/mean.py#L633 Metric for mean estimation, can be: "ale", "alm", "euclid", "harmonic", "identity", "kullback_sym", "logdet", "logeuclid", "riemann", "wasserstein", or a callable function.
# https://link.springer.com/article/10.1007/s12021-020-09473-9 <---- best descriptions/plots
# Geometric means in a novel vector space structure on symmetric positive-definite matrices <https://epubs.siam.org/doi/abs/10.1137/050637996?journalCode=sjmael>`_
metric = settings["metric"]

def load_array_from_outputfolder(filename):
    """Load and return the NumPy array stored as `filename` in the output folder."""
    return np.load(os.path.join(outputfolder, filename))
# Persist an array inside the output folder
def save_array_to_outputfolder(filename, array):
    """Write `array` with np.save to `filename` inside the output folder."""
    np.save(os.path.join(outputfolder, filename), array)

def save_text_results(text, filename="results.txt"):
    """Append `text` followed by a newline to `filename` in the output folder."""
    destination = os.path.join(outputfolder, filename)
    # Append mode: earlier results in the same file are kept.
    with open(destination, "a") as handle:
        handle.write(text + "\n")

### Memory and Processor Usage/Limits Checks

In [None]:
# cgroup-v1 memory controller reference: https://www.kernel.org/doc/Documentation/cgroup-v1/memory.txt
# To open a terminal inside an already-running job:
# srun --jobid=68974 --overlap --pty /bin/bash

# #SLURM RAM
!cgget -r memory.limit_in_bytes /slurm/uid_$SLURM_JOB_UID/job_$SLURM_JOB_ID

#SLURM VM
!cgget -r memory.memsw.limit_in_bytes /slurm/uid_$SLURM_JOB_UID/job_$SLURM_JOB_ID

#SLURM USAGE
!cgget -r memory.memsw.usage_in_bytes /slurm/uid_$SLURM_JOB_UID/job_$SLURM_JOB_ID

!echo "SLURM_JOB_ID: $SLURM_JOB_ID"
!echo "SLURM_JOB_NAME: $SLURM_JOB_NAME"
!echo "SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST"
!echo "SLURM_MEM_PER_NODE: $SLURM_MEM_PER_NODE"
!echo "SLURM_CPUS_ON_NODE: $SLURM_CPUS_ON_NODE"
!echo "SLURM_MEM_PER_CPU: $SLURM_MEM_PER_CPU"

!free -h

import resource


def format_rlimit(limit_bytes):
    """Format an rlimit value in GB, or "unlimited" for RLIM_INFINITY.

    Without this check an unlimited limit (RLIM_INFINITY, typically -1)
    would be printed as "-0.00 GB".
    """
    if limit_bytes == resource.RLIM_INFINITY:
        return "unlimited"
    return f"{limit_bytes / (1024 ** 3):.2f} GB"


# Get the soft and hard limits of virtual memory (address space)
soft, hard = resource.getrlimit(resource.RLIMIT_AS)
print(f"Soft limit: {format_rlimit(soft)}")
print(f"Hard limit: {format_rlimit(hard)}")

# Get the soft and hard limits of the process data segment (heap size),
# which is not the same thing as physical memory usage
soft, hard = resource.getrlimit(resource.RLIMIT_DATA)
print(f"Soft limit: {format_rlimit(soft)}")
print(f"Hard limit: {format_rlimit(hard)}")

#TORQUE Virtual Memory
# !cgget -r memory.memsw.limit_in_bytes /torque/$PBS_JOBID

# #TORQUE RAM
# !cgget -r memory.limit_in_bytes /torque/$PBS_JOBID

# #TORQUE USAGE
# !cgget -r memory.memsw.usage_in_bytes /torque/$PBS_JOBID
# print(int(os.environ['PBS_NP']))
!nvidia-smi

def gpu_mem():
    """Print total, allocated and reserved memory of the CUDA device in GB.

    On a host without a usable CUDA device a short notice is printed
    instead of raising, so the notebook cell still runs on CPU-only nodes.
    """
    if not torch.cuda.is_available():
        print("CUDA is not available; skipping GPU memory report.")
        return

    gib = 1024**3
    total = torch.cuda.get_device_properties('cuda').total_memory
    print(f"Total memory available: {(total / gib):.2f} GB")
    print(f"Allocated memory: {torch.cuda.memory_allocated() / gib:.2f} GB")
    print(f"Reserved memory: {torch.cuda.memory_reserved() / gib:.2f} GB")

def cpu_mem():
    """Print total, available and used host memory (GB) and percent usage.

    A single psutil snapshot is taken so the four printed figures describe
    the same instant rather than four separate measurements.
    """
    snapshot = psutil.virtual_memory()
    gib = 1024**3
    print(f"Total Memory: {snapshot.total / gib:.2f} GB")
    print(f"Available Memory: {snapshot.available / gib:.2f} GB")
    print(f"Used Memory: {snapshot.used / gib:.2f} GB")
    print(f"Memory Usage: {snapshot.percent}%")

# Print the GPU and host memory reports once when this cell runs
gpu_mem()
cpu_mem()

### Load Paths & Parcellated

In [None]:
# Load the saved per-group data arrays and their path arrays from the
# output folder, in the same order as before (A data, B data, A paths, B paths).
(
    groupA_parcellated_array,
    groupB_parcellated_array,
    groupA_paths_filtered,
    groupB_paths_filtered,
) = [
    load_array_from_outputfolder(saved_name)
    for saved_name in (
        "groupA_parcellated_array.npy",
        "groupB_parcellated_array.npy",
        "groupA_paths_filtered.npy",
        "groupB_paths_filtered.npy",
    )
]

### Regression

In [None]:
from  sklearn.svm import LinearSVR, SVR
from sklearn.linear_model import LassoCV, LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import KBinsDiscretizer
from scipy.stats import pearsonr
import scipy.stats as stats
import statsmodels.api as sm

def analyze_residuals(y_pred, y_true):
    """Plot residual diagnostics and return the R^2 of `y_pred` against `y_true`.

    Shows, in order: residuals vs predicted values, residuals vs true values,
    a QQ plot of the residuals against a fitted normal distribution, and a
    histogram of the residuals.

    Returns:
        1 - SS_res / SS_tot, where SS_tot is taken around the mean of `y_true`.
    """
    residuals = y_true - y_pred

    # R^2 computed by hand, relative to the mean of y_true
    residual_ss = np.sum((y_true - y_pred) ** 2)
    total_ss = np.sum((y_true - np.mean(y_true)) ** 2)
    r_squared = 1 - (residual_ss / total_ss)

    # The two scatter diagnostics differ only in x-axis data, colour and labels
    scatter_specs = (
        (y_pred, "blue", f"Residuals vs Fitted Values (R^2 = {r_squared})", "Predicted Values"),
        (y_true, "red", f"Residuals vs True Values (R^2 = {r_squared})", "True Values"),
    )
    for x_values, colour, heading, x_label in scatter_specs:
        plt.figure(figsize=(8, 5))
        plt.scatter(x_values, residuals, color=colour, label="Residuals")
        plt.axhline(y=0, color="black", linestyle="--")
        plt.title(heading)
        plt.xlabel(x_label)
        plt.ylabel("Residuals")
        plt.legend()
        plt.show()

    # QQ plot against a normal distribution fitted to the residuals
    sm.graphics.qqplot(residuals, dist=stats.norm, fit=True, line="45")
    plt.title("QQ Plot of Standardized Residuals")
    plt.show()

    plt.figure(figsize=(8, 5))
    plt.hist(residuals, bins=30, edgecolor='black', color='purple')
    plt.title("Histogram of Residuals")
    plt.xlabel("Residuals")
    plt.ylabel("Frequency")
    plt.show()

    return r_squared


def FKT_proj(data, filters, method="basic", alpha=1, beta=0, l1_ratio=0.5, lambda1=.01, lambda2=.01):
    """Estimate a projection that maps filtered components back to `data`.

    The components are ``S = data @ filters``; each method then estimates a
    matrix relating ``S`` to ``data``.

    Args:
        data: 2-D array; multiplied on the right by `filters`.
        filters: 2-D array whose columns are the filters.
        method: one of "basic" (pseudo-inverse), "covs" (covariance based),
            "linreg", "grouplassolinreg", "lassolinreg", "elasticlinreg"
            (regression of `data` on `S`) or "growl" (proximal gradient).
        alpha: regularisation strength for the lasso / elastic-net variants.
        beta: unused; kept for interface compatibility.
        l1_ratio: elastic-net mixing parameter ("elasticlinreg" only).
        lambda1, lambda2: base and rank-dependent weights of the "growl"
            proximal step.

    Returns:
        The estimated projection matrix.

    Raises:
        ValueError: if `method` is not one of the supported names.
    """
    S = data @ filters

    if method == "basic":
        proj = np.linalg.pinv(S) @ data
    elif method == "covs":
        cov_est_scm = Covariances(estimator='scm')
        s_cov = cov_est_scm.transform(S.T[np.newaxis, :, :])[0, :, :]
        data_cov = cov_est_scm.transform(data.T[np.newaxis, :, :])[0, :, :]
        proj = (data_cov @ filters @ np.linalg.inv(s_cov)).T
    elif method == "linreg":
        reg = LinearRegression()
        reg.fit(S, data)
        proj = reg.coef_.T
    elif method == "grouplassolinreg":
        # Imported here because the notebook's import cells never bring
        # MultiTaskLasso into scope (this branch used to raise NameError).
        from sklearn.linear_model import MultiTaskLasso
        reg = MultiTaskLasso(alpha=alpha)  # fixed alpha, no cross-validation
        reg.fit(S, data)
        proj = reg.coef_.T
    elif method == "lassolinreg":
        reg = Lasso(alpha=alpha)  # fixed alpha, no cross-validation
        reg.fit(S, data)
        proj = reg.coef_.T
    elif method == "elasticlinreg":
        reg = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
        reg.fit(S, data)
        proj = reg.coef_.T
    elif method == "growl":
        # Proximal operator applied column-wise: columns are ranked by norm
        # and shrunk with a weight that decreases with rank.
        def prox_growl(V, lambda1, lambda2, tau):
            p, r = V.shape
            norms = np.linalg.norm(V, axis=0)  # Norms of columns
            indices = np.argsort(-norms)  # Sort indices by descending norms
            weights = lambda1 + lambda2 * np.linspace(1, 0, r)  # Weights decrease
            V_new = np.zeros_like(V)
            for i in range(r):
                idx = indices[i]
                # Columns whose norm does not exceed the threshold stay zero
                if norms[idx] > weights[i] * tau:
                    V_new[:, idx] = (1 - tau * weights[i] / norms[idx]) * V[:, idx]
            return V_new

        # Initialization
        B = np.zeros((filters.shape[1], data.shape[1]))

        # Proximal gradient loop on the least-squares loss ||S @ B - data||^2 / 2
        max_iter = 100
        learning_rate = 0.01
        for _ in range(max_iter):
            gradient = S.T @ (S @ B - data)
            B -= learning_rate * gradient
            B = prox_growl(B, lambda1, lambda2, tau=learning_rate)
            if np.linalg.norm(gradient) < 1e-1:
                break

        proj = B.T
    else:
        # Previously an unknown method fell through to `return proj` and
        # failed with an UnboundLocalError.
        raise ValueError(f"Unknown FKT_proj method: {method!r}")

    return proj

# Combine group A and B data and paths
def combine_groups(groupA_parcellated_array, groupB_parcellated_array, groupA_paths_filtered, groupB_paths_filtered):
    """Concatenate group A followed by group B along the first axis.

    Returns:
        (combined_data, combined_paths), each with group A entries first.
    """
    data_parts = (groupA_parcellated_array, groupB_parcellated_array)
    path_parts = (groupA_paths_filtered, groupB_paths_filtered)
    return np.concatenate(data_parts, axis=0), np.concatenate(path_parts, axis=0)

# Extract subject IDs from the combined paths
def extract_subject_ids(combined_paths):
    """Return the numeric subject-ID directory found in each entry's first path.

    For every entry, the first element (``path[0]``) is searched for the first
    all-digit path component (``/<digits>/``).

    Args:
        combined_paths: iterable of indexable entries whose first element is
            a path string.

    Returns:
        NumPy array of subject IDs as strings, in input order.

    Raises:
        ValueError: if an entry's first path contains no ``/<digits>/``
            component (previously an opaque AttributeError on None).
    """
    subject_dir = re.compile(r'/(\d+)/')
    subject_ids = []
    for path in combined_paths:
        match = subject_dir.search(path[0])
        if match is None:
            raise ValueError(f"No subject ID directory found in path: {path[0]!r}")
        subject_ids.append(match.group(1))
    return np.array(subject_ids)

def extract_phenotype(subids, phenotype,
                      file_path_restricted='../HCP/RESTRICTED_zainsou_8_6_2024_2_11_21.csv',
                      file_path_unrestricted='../HCP/unrestricted_zainsou_8_2_2024_6_13_22.csv'):
    """Look up one phenotype column for the given subjects, in the given order.

    The restricted and unrestricted CSV tables are outer-merged on the
    'Subject' column, and the rows are then selected and ordered by `subids`.

    Args:
        subids: sequence of subject IDs (any type; compared as strings).
        phenotype: name of the column to return.
        file_path_restricted: path to the restricted CSV table.
        file_path_unrestricted: path to the unrestricted CSV table.

    Returns:
        pandas Series of `phenotype` values, one per entry of `subids`.

    Raises:
        FileNotFoundError: if either CSV file is missing.
        KeyError: if a subject in `subids` or the `phenotype` column is absent.
    """
    try:
        data_r = pd.read_csv(file_path_restricted)
        data_ur = pd.read_csv(file_path_unrestricted)
    except FileNotFoundError:
        print(f"File not found: {file_path_restricted} or {file_path_unrestricted}")
        raise

    # Combine restricted and unrestricted data on Subject ID
    data = pd.merge(data_r, data_ur, on='Subject', how='outer')

    # Compare IDs as strings on both sides; asarray also accepts plain lists
    data['Subject'] = data['Subject'].astype(str)
    subids = np.asarray(subids).astype(str)

    # Name the missing subjects up front instead of relying on the generic
    # KeyError raised by .loc
    missing = sorted(set(subids.tolist()) - set(data['Subject']))
    if missing:
        raise KeyError(f"Subjects not found in phenotype tables: {missing}")

    # .loc with the ID array both filters the rows and puts them in `subids` order
    train_data = data.set_index('Subject').loc[subids].reset_index()
    pheno_score = train_data[phenotype]
    return pheno_score

# Regress out age from predictors
def regress_out_age(predictor, age):
    """Fit a Ridge(alpha=1) model that predicts `predictor` from `age`.

    `age` is reshaped to a single feature column. The fitted model is
    returned; subtracting its predictions is left to the caller.
    """
    age_model = Ridge(alpha=1)
    age_column = age.reshape(-1, 1)  # age is the single independent variable
    age_model.fit(age_column, predictor)
    return age_model

def preproc(train, test, method="zscore"):
    """Normalise `train` and `test` using statistics from `train` only.

    With method "zscore" a StandardScaler is fitted on `train` and applied to
    both arrays; any other value subtracts the column means of `train`.

    Returns:
        (processed_train, processed_test)
    """
    if method != "zscore":
        # Mean-centring only, using the training column means
        train_mean = train.mean(axis=0)
        return train - train_mean, test - train_mean

    scaler = StandardScaler()
    scaled_train = scaler.fit_transform(train)
    scaled_test = scaler.transform(test)
    return scaled_train, scaled_test

# Helper function to plot regression results
def plot_predictions(true_values, predicted_values, title, train_or_test):
    """Show a scatter plot of predictions (y-axis) against true values (x-axis).

    `train_or_test` is only used to build the legend label.
    """
    legend_label = f'{train_or_test} Predictions'
    plt.scatter(true_values, predicted_values, label=legend_label, color='blue', alpha=0.6)

    plt.title(title)
    plt.xlabel('True Values')
    plt.ylabel('Predictions')
    plt.legend()
    plt.show()

def regression(train, test, age_train, age_test, values_train, values_test,pre="center"):
    """Fit an age-residualised Ridge regression and evaluate it on the test split.

    Steps, all using statistics fitted on the training split only:
      1. Preprocess features according to `pre`: "center" subtracts the training
         column means, "znorm" calls preproc() with its default method, any other
         value leaves the features unchanged.
      2. Fit a model of the features on age (regress_out_age) and subtract its
         predictions from both the train and the test features.
      3. Fit Ridge(alpha=1) from the residualised features to `values_train`.
      4. Rescale predictions by `kappa`, the slope of a no-intercept linear
         regression of `values_train` on the training predictions.
      5. Show residual diagnostics and prediction scatter plots for the test
         split first, then for the training split.

    Returns:
        (train_centered, coefficients, test_score, fold_corr): the preprocessed,
        age-residualised training features; the fitted Ridge coefficients; the
        R^2 returned by analyze_residuals for the test split; and the Pearson
        correlation between `values_test` and the rescaled test predictions.
    """
    if pre == "center":
        # Mean center the tangent space data
        mean = np.mean(train, axis=0)
        train_centered = train - mean
        test_centered = test - mean
    elif pre == "znorm":
        train_centered,  test_centered = preproc(train,test)
    else:
        # No preprocessing requested
        train_centered = train
        test_centered = test


    # Regress out age using only the training data
    age_reg = regress_out_age(train_centered, age_train)
    # tan_train_centered = tan_train_centered - (tan_train_centered@np.linalg.pinv(age_reg.coef_[np.newaxis,:]))@age_reg.coef_[np.newaxis,:]
    # tan_test_centered = tan_test_centered - (tan_test_centered@np.linalg.pinv(age_reg.coef_[np.newaxis,:]))@age_reg.coef_[np.newaxis,:]
    # Remove the age-predicted part from both splits (model fitted on train only)
    train_centered = train_centered - age_reg.predict(age_train.reshape(-1, 1))
    test_centered = test_centered - age_reg.predict(age_test.reshape(-1, 1))

    # Choose the regression model
    # reg_model = LinearSVR(C=1,fit_intercept=True)
    # reg_model = LinearSVR(C=1)
    # reg_model = LinearSVR(C=.01,loss='squared_epsilon_insensitive')
    # reg_model = Lasso(alpha=0.001,fit_intercept=False)
    reg_model = Ridge(alpha=1)
    # reg_model = ElasticNet(alpha=1, l1_ratio=0.0001)
    # reg_model = LinearRegression(fit_intercept=False)
    # reg_model = LassoCV(cv=5)  # Use 5-fold cross-validation within each fold to tune Lasso
    # reg_model = SVR(kernel="poly")
    reg_model.fit(train_centered, values_train)

    # Get the predicted values from the regularized model
    predictions_train = reg_model.predict(train_centered)
    # Adjust for the regularization bias using Zou & Hastie's method
    # Regress the true values on the predicted values
    kappa_reg = LinearRegression(fit_intercept=False)
    kappa_reg.fit(predictions_train.reshape(-1, 1), values_train)
    # kappa is a single scale factor estimated on the training split
    kappa = kappa_reg.coef_[0]
    predictions_train_adj = kappa*predictions_train

    # The same training-derived kappa rescales the test predictions
    predictions = kappa*reg_model.predict(test_centered)
    # Plot predictions vs true values for test fold
    test_score = analyze_residuals(predictions, values_test)
    plot_predictions(values_test, predictions,f'Test Predictions vs True Values - Fold R²: {test_score}', 'Test')

    # Same diagnostics for the training split
    train_score = analyze_residuals(predictions_train_adj, values_train)
    plot_predictions(values_train, predictions_train_adj, f'Train Predictions vs True Values - Fold R²: {train_score}', 'Train')

    # Pearson correlation on the test split; the p-value is discarded
    fold_corr, _ = pearsonr(values_test, predictions)

    return train_centered, reg_model.coef_, test_score, fold_corr


# Tangent space regression with centering
def tan_regression_folds(data, values, age, metric="riemann", pre="znorm", n_splits=5):
    """Cross-validated tangent-space regression with a reduced-filter comparison.

    For each stratified fold (strata are 5 uniform-width bins of `values`):
      1. Centre `values` and `age` with training-split means.
      2. Map the covariance matrices to the tangent space at the training mean
         covariance and run regression() on them.
      3. Turn the regression coefficients into a pattern via
         FKT_proj(method="linreg"), map it back with untangent_space, and solve
         the generalised eigenproblem against the training mean to get filters.
      4. Add filters one at a time, ordered by decreasing |eigenvalue|,
         re-running regression() on the filtered data until the test score
         stops improving or all filters are used.

    Args:
        data: array indexed by sample on axis 0; axes 1 and 2 are swapped
            before covariance estimation.
        values: 1-D NumPy array of the regression target.
        age: 1-D NumPy array of the covariate regressed out of the features.
        metric: pyriemann metric for the mean covariance and tangent maps.
        pre: feature preprocessing mode passed to regression().
        n_splits: number of cross-validation folds.

    Returns:
        (mean_score, mean_corr, mean_filter_score, mean_filter_corr,
         fold_scores, fold_corrs, filter_fold_scores, filter_fold_corrs)
    """
    # Binning the continuous target variable for stratified splits
    n_bins = 5  # Adjust this based on your target distribution
    binner = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='uniform')
    values_binned = binner.fit_transform(values.reshape(-1, 1))

    cov_est = Covariances(estimator='lwf')
    # cov_est = Covariances(estimator='corr')
    covs = cov_est.transform(np.transpose(data, (0, 2, 1)))

    # Initialize Stratified K-Fold cross-validation
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=10)

    fold_scores = []
    fold_corrs = []
    filter_fold_scores = []
    filter_fold_corrs = []

    count = 0
    # Perform Stratified K-Fold Cross-Validation
    for train_index, test_index in kf.split(data, values_binned):
        print("Fold", count)
        count += 1
        train, test = data[train_index], data[test_index]

        values_train, values_test = values[train_index], values[test_index]
        values_train, values_test = preproc(values_train[:,np.newaxis],values_test[:,np.newaxis], method="center")
        values_train = values_train[:,0]
        values_test = values_test[:,0]

        # Plot histogram of the (centred) training target values
        plt.figure(figsize=(10, 6))
        plt.hist(values_train, bins=n_bins, alpha=0.7, color='blue', edgecolor='black', label='Train Data')
        plt.title(f'Train Data Histogram and Weights - Fold {count}')
        plt.xlabel('Target Value')
        plt.ylabel('Frequency / Weight')
        plt.legend()
        plt.show()

        age_train, age_test = age[train_index], age[test_index]
        age_train, age_test = preproc(age_train[:,np.newaxis], age_test[:,np.newaxis], method="center")
        age_train = age_train[:,0]
        age_test = age_test[:,0]

        # Tangent-space features, referenced to the training mean covariance
        train_covs, test_covs = covs[train_index], covs[test_index]
        train_mean = mean_covariance(train_covs, metric=metric)
        tan_train = tangent_space(train_covs,train_mean,metric=metric)
        tan_test = tangent_space(test_covs,train_mean,metric=metric)

        tan_train_centered, model_coefs, test_score, fold_corr = regression(tan_train, tan_test, age_train, age_test, values_train, values_test,pre=pre)
        fold_scores.append(test_score)
        fold_corrs.append(fold_corr)
        print(f"Fold Test R² Score: {test_score}")
        print(f"Fold Test R Score: {fold_corr}")

        # (tan_train@np.linalg.pinv(reg_model.coef_))@reg_model.coef_
        # hauf_coef = FKT_proj(tan_train,reg_model.coef_[:,np.newaxis],method="basic")
        # hauf_coef = FKT_proj(tan_train,reg_model.coef_[:,np.newaxis],method="covs")
        hauf_coef = FKT_proj(tan_train_centered, model_coefs[:,np.newaxis],method="linreg").T[:,0]

        weights_matrix = untangent_space(hauf_coef,train_mean,metric=metric)
        # weights_matrix = untangent_space(reg_model.coef_,train_mean,metric=metric)
        eigs, filters_all = eigh(weights_matrix,train_mean)
        # plt.scatter(range(0,eigs.shape[0]),np.abs(eigs))
        # plt.show()

        # Greedy forward selection over filters ranked by |eigenvalue|.
        # The search is capped at the number of available filters: the former
        # unbounded `while` loop could never exit once every filter was in use,
        # because the score stops changing and `x <= |x|` is always true.
        # NOTE(review): the acceptance test keeps np.abs() on the new score, so
        # a strongly negative R² counts as an improvement, and the filter count
        # is chosen on the test split — confirm both are intended.
        eig_order = np.argsort(np.abs(eigs))
        max_filters = filters_all.shape[1]
        test_score_reduced = 0
        test_corr_reduced = 0
        best_n_filters = 0
        for n_filters in range(1, max_filters + 1):
            inds = eig_order[-n_filters:]
            filters = filters_all[:,inds]

            train_transformed = train @ filters
            test_transformed = test @ filters

            train_transformed_cov = cov_est.transform(np.transpose(train_transformed, (0, 2, 1)))
            test_transformed_cov = cov_est.transform(np.transpose(test_transformed, (0, 2, 1)))

            reduced_mean = mean_covariance(train_transformed_cov, metric=metric)
            tangent_transform_train = tangent_space(train_transformed_cov, reduced_mean, metric=metric)
            tangent_transform_test = tangent_space(test_transformed_cov, reduced_mean, metric=metric)

            _, _, iter_test_score_reduced, iter_test_corr_reduced = regression(tangent_transform_train, tangent_transform_test, age_train, age_test, values_train, values_test,pre=pre)

            # Same stopping rule as before; written with `not (<=)` so a NaN
            # score still ends the search
            if not (test_score_reduced <= np.abs(iter_test_score_reduced)):
                break
            test_score_reduced = iter_test_score_reduced
            test_corr_reduced = iter_test_corr_reduced
            best_n_filters = n_filters

        filter_fold_scores.append(test_score_reduced)
        filter_fold_corrs.append(test_corr_reduced)
        # Report the filter count that produced the recorded score (previously
        # the count of the rejected attempt was printed)
        print(best_n_filters)
        print(f"Fold Reduced Test R² Score: {test_score_reduced}")
        print(f"Fold Reduced Test R Score: {test_corr_reduced}")


    # Output the average R² score across all folds
    mean_score = np.mean(fold_scores)
    mean_corr = np.mean(fold_corrs)

    mean_filter_score = np.mean(filter_fold_scores)
    mean_filter_corr = np.mean(filter_fold_corrs)

    print(f"Mean R² Score across {n_splits} folds: {mean_score}")
    print(f"Mean R Score across {n_splits} folds: {mean_corr}")

    print(f"Mean R² Filter Score across {n_splits} folds: {mean_filter_score}")
    print(f"Mean R Filter Score across {n_splits} folds: {mean_filter_corr}")
    return mean_score, mean_corr, mean_filter_score, mean_filter_corr, fold_scores, fold_corrs, filter_fold_scores, filter_fold_corrs



def tan_quantile_regression_folds(data, values, age, metric="riemann", pre="znorm", n_splits=5):
    """Run the tangent-space regression separately within each quantile bin of `values`.

    `values` is split into 3 quantile bins. For every bin, the same stratified
    K-fold split is iterated and, inside each fold, only the train/test samples
    that fall in the current bin are used. Each fold runs regression() on the
    tangent-space features and again on data reduced to the 5 filters with the
    largest |eigenvalue|.

    Per-fold and per-bin mean scores are printed; the function has no return
    statement, so it returns None.
    """
    # Binning the continuous target variable for stratified splits
    n_bins = 3  # Adjust this based on your target distribution
    binner = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='quantile')
    values_binned = binner.fit_transform(values.reshape(-1, 1))

    # Covariances are estimated once for all samples, after swapping axes 1 and 2
    cov_est = Covariances(estimator='lwf')
    # cov_est = Covariances(estimator='corr')
    covs = cov_est.transform(np.transpose(data, (0, 2, 1)))
    
    # Initialize Stratified K-Fold cross-validation
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=10)
    
    for quantile in range(0,n_bins):
        # Score accumulators are reset for every quantile bin
        fold_scores = []
        fold_corrs = []
        filter_fold_scores = []
        filter_fold_corrs = []

        count = 0
        # Perform Stratified K-Fold Cross-Validation
        for train_index, test_index in kf.split(data, values_binned):
            print("Fold", count)
            count += 1
            
            # Row positions, within this fold's train/test subsets, of the
            # samples whose bin label equals the current quantile
            group_train_ind = np.where(values_binned[train_index] == quantile)[0]
            group_test_ind = np.where(values_binned[test_index] == quantile)[0]
            
            train, test = data[train_index][group_train_ind], data[test_index][group_test_ind]

            # Centre the targets with the training-subset mean
            values_train, values_test = values[train_index][group_train_ind], values[test_index][group_test_ind]
            values_train, values_test = preproc(values_train[:,np.newaxis],values_test[:,np.newaxis], method="center")
            values_train = values_train[:,0]
            values_test = values_test[:,0]


            # Plot a histogram of the (centred) training target values
            plt.figure(figsize=(10, 6))
            # NOTE(review): the title uses `count` after it was incremented, so it
            # is one higher than the "Fold" number printed above
            plt.hist(values_train, bins=n_bins, alpha=0.7, color='blue', edgecolor='black', label='Train Data')
            plt.title(f'Train Data Histogram and Weights - Fold {count}')
            plt.xlabel('Target Value')
            plt.ylabel('Frequency / Weight')
            plt.legend()
            plt.show()

            # Centre age with the training-subset mean
            age_train, age_test = age[train_index][group_train_ind], age[test_index][group_test_ind]
            age_train, age_test = preproc(age_train[:,np.newaxis], age_test[:,np.newaxis], method="center")
            age_train = age_train[:,0]
            age_test = age_test[:,0]

            # Tangent-space features, referenced to the training-subset mean covariance
            train_covs, test_covs = covs[train_index][group_train_ind], covs[test_index][group_test_ind]
            train_mean = mean_covariance(train_covs, metric=metric)
            tan_train = tangent_space(train_covs,train_mean,metric=metric)
            tan_test = tangent_space(test_covs,train_mean,metric=metric)

            print(tan_train.shape,tan_test.shape,age_train.shape,age_test.shape,values_train.shape,values_test.shape)
            tan_train_centered, model_coefs, test_score, fold_corr = regression(tan_train, tan_test, age_train, age_test, values_train, values_test,pre=pre)
            fold_scores.append(test_score)
            fold_corrs.append(fold_corr)
            print(f"Fold Test R² Score: {test_score}")
            print(f"Fold Test R Score: {fold_corr}")

            # (tan_train@np.linalg.pinv(reg_model.coef_))@reg_model.coef_
            # hauf_coef = FKT_proj(tan_train,reg_model.coef_[:,np.newaxis],method="basic")
            # hauf_coef = FKT_proj(tan_train,reg_model.coef_[:,np.newaxis],method="covs")
            # Convert the regression coefficients into a pattern by regressing the
            # training features on the model output (FKT_proj "linreg" method)
            hauf_coef = FKT_proj(tan_train_centered, model_coefs[:,np.newaxis],method="linreg")

            # Map the pattern back to a matrix and solve the generalised
            # eigenproblem against the training mean covariance
            weights_matrix = untangent_space(hauf_coef.T[:,0],train_mean,metric=metric)
            # weights_matrix = untangent_space(reg_model.coef_,train_mean,metric=metric)
            eigs, filters_all = eigh(weights_matrix,train_mean)
            # plt.scatter(range(0,eigs.shape[0]),np.abs(eigs))
            # plt.show()

            # Keep the 5 eigenvectors with the largest absolute eigenvalue
            n_filters = 5
            inds = np.argsort(np.abs(eigs))[-n_filters:]
            filters = filters_all[:,inds]

            train_transformed = train @ filters
            test_transformed = test @ filters

            # Re-estimate covariances and tangent features on the filtered data
            train_transformed_cov = cov_est.transform(np.transpose(train_transformed, (0, 2, 1)))
            test_transformed_cov = cov_est.transform(np.transpose(test_transformed, (0, 2, 1)))

            reduced_mean = mean_covariance(train_transformed_cov, metric=metric)
            tangent_transform_train = tangent_space(train_transformed_cov, reduced_mean, metric=metric)
            tangent_transform_test = tangent_space(test_transformed_cov, reduced_mean, metric=metric)

            _, _, test_score_reduced, test_corr_reduced, = regression(tangent_transform_train, tangent_transform_test, age_train, age_test, values_train, values_test,pre=pre)
                    
            filter_fold_scores.append(test_score_reduced)
            filter_fold_corrs.append(test_corr_reduced)
            print(n_filters)
            print(f"Fold Reduced Test R² Score: {test_score_reduced}")
            print(f"Fold Reduced Test R Score: {test_corr_reduced}")


        # Output the average R² score across all folds (for the current quantile bin)
        mean_score = np.mean(fold_scores)
        mean_corr = np.mean(fold_corrs)

        mean_filter_score = np.mean(filter_fold_scores)
        mean_filter_corr = np.mean(filter_fold_corrs)

        print(f"Mean R² Score across {n_splits} folds: {mean_score}")
        print(f"Mean R Score across {n_splits} folds: {mean_corr}")

        print(f"Mean R² Filter Score across {n_splits} folds: {mean_filter_score}")
        print(f"Mean R Filter Score across {n_splits} folds: {mean_filter_corr}")

def tan_quantile_regression_II(data, values, age, metric="riemann", pre="znorm", n_splits=5):
    """Cross-validate a tangent-space regression built on quantile-specific filters.

    The target is discretised into three quantile bins. The bins stratify the
    folds and also split every fold into groups. For each group a tangent-space
    regression is fitted, its coefficients are passed through ``FKT_proj``,
    mapped back to a matrix with ``untangent_space`` and decomposed with a
    generalized eigenproblem against the group mean covariance. The
    eigenvectors with the largest absolute eigenvalues are kept as filters.
    The filters of all groups are concatenated, applied to the data, and a
    second regression is run on the tangent space of the filtered covariances.

    Scores are printed per group, per fold and as means; nothing is returned.

    Args:
        data: Array with subjects on axis 0. Axes 1 and 2 are swapped before
            covariance estimation, and the last axis is multiplied by the
            filter matrix.
        values: 1-D array with the regression target, one entry per subject.
        age: 1-D array with the covariate forwarded to ``regression``.
        metric: Metric name forwarded to the pyriemann mean / tangent-space calls.
        pre: Preprocessing mode forwarded to ``regression``.
        n_splits: Number of stratified folds.
    """
    # Binning the continuous target variable for stratified splits
    n_bins = 3  # Adjust this based on your target distribution
    n_filters = 3  # eigenvectors kept per quantile group
    binner = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='quantile')
    values_binned = binner.fit_transform(values.reshape(-1, 1))

    cov_est = Covariances(estimator='lwf')
    # cov_est = Covariances(estimator='corr')
    covs = cov_est.transform(np.transpose(data, (0, 2, 1)))

    # Initialize Stratified K-Fold cross-validation
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=10)

    fold_scores = []
    fold_corrs = []
    filter_fold_scores = []
    filter_fold_corrs = []

    # Perform Stratified K-Fold Cross-Validation
    for fold, (train_index, test_index) in enumerate(kf.split(data, values_binned)):
        print("Fold", fold)
        filters = []

        # Center the target with preproc (fit/apply semantics live in preproc)
        values_train, values_test = preproc(
            values[train_index][:, np.newaxis],
            values[test_index][:, np.newaxis],
            method="center",
        )
        values_train = values_train[:, 0]
        values_test = values_test[:, 0]

        # Histogram of the training targets; the title uses the same fold
        # number that was printed above
        plt.figure(figsize=(10, 6))
        plt.hist(values_train, bins=n_bins, alpha=0.7, color='blue', edgecolor='black', label='Train Data')
        plt.title(f'Train Data Histogram and Weights - Fold {fold}')
        plt.xlabel('Target Value')
        plt.ylabel('Frequency / Weight')
        plt.legend()
        plt.show()

        age_train, age_test = preproc(
            age[train_index][:, np.newaxis],
            age[test_index][:, np.newaxis],
            method="center",
        )
        age_train = age_train[:, 0]
        age_test = age_test[:, 0]

        train, test = data[train_index], data[test_index]
        train_covs, test_covs = covs[train_index], covs[test_index]

        # Bin label of every subject in this fold (column vector, as returned by the binner)
        train_bins = values_binned[train_index]
        test_bins = values_binned[test_index]

        for quantile in range(n_bins):
            # Row positions, relative to the fold, of the subjects in this bin
            group_train_ind = np.where(train_bins == quantile)[0]
            group_test_ind = np.where(test_bins == quantile)[0]

            values_train_quantile = values_train[group_train_ind]
            values_test_quantile = values_test[group_test_ind]
            age_train_quantile = age_train[group_train_ind]
            age_test_quantile = age_test[group_test_ind]
            train_covs_quantile = train_covs[group_train_ind]
            test_covs_quantile = test_covs[group_test_ind]

            # Reference point and tangent space are estimated on the training group only
            train_mean_quantile = mean_covariance(train_covs_quantile, metric=metric)
            tan_train_quantile = tangent_space(train_covs_quantile, train_mean_quantile, metric=metric)
            tan_test_quantile = tangent_space(test_covs_quantile, train_mean_quantile, metric=metric)

            tan_train_centered_quantile, model_coefs, test_score, fold_corr = regression(
                tan_train_quantile,
                tan_test_quantile,
                age_train_quantile,
                age_test_quantile,
                values_train_quantile,
                values_test_quantile,
                pre=pre,
            )
            fold_scores.append(test_score)
            fold_corrs.append(fold_corr)
            print(f"Fold Test R² Score: {test_score}")
            print(f"Fold Test R Score: {fold_corr}")

            hauf_coef = FKT_proj(tan_train_centered_quantile, model_coefs[:, np.newaxis], method="linreg")

            # Map the coefficient vector back to a matrix and solve the
            # generalized eigenproblem against the group mean covariance
            weights_matrix = untangent_space(hauf_coef.T[:, 0], train_mean_quantile, metric=metric)
            eigs, filters_all = eigh(weights_matrix, train_mean_quantile)

            # Keep the eigenvectors with the largest absolute eigenvalues
            inds = np.argsort(np.abs(eigs))[-n_filters:]
            filters.append(filters_all[:, inds])

        # Concatenate along the filter dimension
        filters = np.concatenate(filters, axis=1)
        train_transformed = train @ filters
        test_transformed = test @ filters

        train_transformed_cov = cov_est.transform(np.transpose(train_transformed, (0, 2, 1)))
        test_transformed_cov = cov_est.transform(np.transpose(test_transformed, (0, 2, 1)))

        reduced_mean = mean_covariance(train_transformed_cov, metric=metric)
        tangent_transform_train = tangent_space(train_transformed_cov, reduced_mean, metric=metric)
        tangent_transform_test = tangent_space(test_transformed_cov, reduced_mean, metric=metric)

        _, _, test_score_reduced, test_corr_reduced = regression(
            tangent_transform_train,
            tangent_transform_test,
            age_train,
            age_test,
            values_train,
            values_test,
            pre=pre,
        )

        filter_fold_scores.append(test_score_reduced)
        filter_fold_corrs.append(test_corr_reduced)
        print(n_filters)
        print(f"Fold Reduced Test R² Score: {test_score_reduced}")
        print(f"Fold Reduced Test R Score: {test_corr_reduced}")

    # Output the average scores; the unfiltered lists hold one entry per
    # quantile group and fold, the filtered lists one entry per fold
    mean_score = np.mean(fold_scores)
    mean_corr = np.mean(fold_corrs)

    mean_filter_score = np.mean(filter_fold_scores)
    mean_filter_corr = np.mean(filter_fold_corrs)

    print(f"Mean R² Score across {n_splits} folds: {mean_score}")
    print(f"Mean R Score across {n_splits} folds: {mean_corr}")

    print(f"Mean R² Filter Score across {n_splits} folds: {mean_filter_score}")
    print(f"Mean R Filter Score across {n_splits} folds: {mean_filter_corr}")

### Create 1 Fold and Filters for Rest of Pipeline

In [None]:
# Pool both groups, then look up the configured phenotype for every subject.
data, combined_paths = combine_groups(
    groupA_parcellated_array,
    groupB_parcellated_array,
    groupA_paths_filtered,
    groupB_paths_filtered,
)
subject_ids = extract_subject_ids(combined_paths)
values = np.array(extract_phenotype(subject_ids, phenotype))

# Boolean mask that is True where the phenotype value is present (not NaN)
nan_mask = ~np.isnan(values)
data = data[nan_mask]
combined_paths = combined_paths[nan_mask]
subject_ids = subject_ids[nan_mask]
values = values[nan_mask]

# Age is looked up after filtering so it stays aligned with the kept subjects
age = np.array(extract_phenotype(subject_ids, 'Age_in_Yrs'))

In [None]:
# Quantile-wise filter regression on the full cohort (prints its scores)
tan_quantile_regression_II(
    data,
    values,
    age,
    metric=metric,
    pre="center",
    n_splits=5,
)

In [None]:
# NOTE(review): the visible end of tan_regression_folds only prints its
# summary and has no `return`; confirm it really yields these eight values,
# otherwise this unpacking raises a TypeError.
(
    mean_score,
    mean_corr,
    mean_filter_score,
    mean_filter_corr,
    fold_scores,
    fold_corrs,
    filter_fold_scores,
    filter_fold_corrs,
) = tan_regression_folds(
    data,
    values,
    age,
    metric=metric,
    pre="center",
    n_splits=5,
)

In [None]:
# Discretise the continuous target into quantile bins so the folds can be
# stratified on it.
n_bins = 3  # Adjust this based on your target distribution
binner = KBinsDiscretizer(
    n_bins=n_bins,
    encode='ordinal',
    strategy='quantile',
)
values_binned = binner.fit_transform(values.reshape(-1, 1))

# Materialise the (train_index, test_index) pairs once so later cells can
# pick a fold by position.
kf = StratifiedKFold(
    n_splits=n_folds,
    shuffle=True,
    random_state=10,
)
splits = [*kf.split(data, values_binned)]

In [None]:
# Select the fold used by the rest of the pipeline and make sure its output
# folders exist.
fold = 0
fold_outputfolder = f"fold_{fold}"

# Creating ".../fold_<n>/results" also creates the fold folder itself;
# exist_ok avoids the check-then-create pattern.
os.makedirs(os.path.join(outputfolder, fold_outputfolder, "results"), exist_ok=True)

train_index, test_index = splits[fold]

In [None]:
# Slice data and file paths for the selected fold
train = data[train_index]
test = data[test_index]
combined_paths_train = combined_paths[train_index]
combined_paths_test = combined_paths[test_index]

# preproc is fed column vectors, so add an axis going in and take the single
# column back out afterwards
values_train, values_test = preproc(
    values[train_index][:, np.newaxis],
    values[test_index][:, np.newaxis],
    method="center",
)
values_train, values_test = values_train[:, 0], values_test[:, 0]

age_train, age_test = preproc(
    age[train_index][:, np.newaxis],
    age[test_index][:, np.newaxis],
    method="center",
)
age_train, age_test = age_train[:, 0], age_test[:, 0]

In [None]:
# Row positions, relative to the training fold, of the subjects in each
# quantile bin
group0_train_ind, group1_train_ind, group2_train_ind = (
    np.where(values_binned[train_index] == bin_label)[0] for bin_label in range(3)
)

# Data, centered targets and centered ages per bin
group0_train, group0_values_train, group0_age_train = (
    train[group0_train_ind],
    values_train[group0_train_ind],
    age_train[group0_train_ind],
)
group1_train, group1_values_train, group1_age_train = (
    train[group1_train_ind],
    values_train[group1_train_ind],
    age_train[group1_train_ind],
)
group2_train, group2_values_train, group2_age_train = (
    train[group2_train_ind],
    values_train[group2_train_ind],
    age_train[group2_train_ind],
)

# Report the shapes per group: targets, data, ages (throwaway loop names)
for _values, _data, _ages in (
    (group0_values_train, group0_train, group0_age_train),
    (group1_values_train, group1_train, group1_age_train),
    (group2_values_train, group2_train, group2_age_train),
):
    print(_values.shape)
    print(_data.shape)
    print(_ages.shape)

# Plot the histogram for each group's 'values_train'
plt.figure(figsize=(8, 6))

for _label, _values in enumerate(
    (group0_values_train, group1_values_train, group2_values_train)
):
    plt.hist(_values, bins=10, alpha=0.5, label=f"Group {_label} (Values = {_label})")

plt.xlabel('Values')
plt.ylabel('Frequency')
plt.title('Histogram of Values for Different Groups')
plt.legend(loc='upper right')

plt.show()

In [None]:
# Covariances of the training fold and their tangent-space projection around
# the training mean
cov_est = Covariances(estimator='lwf')
train_covs = cov_est.transform(np.transpose(train, (0, 2, 1)))
train_mean = mean_covariance(train_covs, metric=metric)
tan_train = tangent_space(train_covs, train_mean, metric=metric)

# Center the tangent vectors across subjects
tan_mean = tan_train.mean(axis=0)
tan_train_centered = tan_train - tan_mean

# Remove the part of the tangent vectors predicted from age
age_reg = regress_out_age(tan_train_centered, age_train)
age_prediction = age_reg.predict(age_train.reshape(-1, 1))
tan_train_centered = tan_train_centered - age_prediction

# Ridge regression of the target on the residualised tangent vectors
reg_model = Ridge(alpha=1)
reg_model.fit(tan_train_centered, values_train)

hauf_coef = FKT_proj(
    tan_train_centered,
    reg_model.coef_[:, np.newaxis],
    method="linreg",
)

# Back to matrix form, then a generalized eigenproblem against the mean;
# the four eigenvectors with the largest |eigenvalue| become the filters
weights_matrix = untangent_space(hauf_coef.T[:, 0], train_mean, metric=metric)
eigs, filters_all = eigh(weights_matrix, train_mean)
inds = np.argsort(np.abs(eigs))[-4:]
filters_parcellated = filters_all[:, inds]

In [None]:
from pyriemann.utils.tangentspace import log_map_riemann, log_map_euclid, log_map_logeuclid

In [None]:
# Tangent vectors of the first ten training covariances under the Riemannian
# metric. NOTE(review): train_mean was estimated with the configured `metric`,
# while this projection always uses "riemann"; confirm that is intended.
riem_covs = tangent_space(
    train_covs[:10],
    train_mean,
    metric="riemann",
)
# logeuc_covs = log_map_logeuclid(train_covs[:10], train_mean)
# euc_covs = log_map_euclid(train_covs[:10], train_mean)

In [None]:
# np.corrcoef treats rows as variables, so transposing correlates the
# tangent-space coordinates with each other across the selected subjects
correlation_matrix = np.corrcoef(np.transpose(riem_covs))
print(correlation_matrix.shape)

# sns.heatmap((riem_covs.T@riem_covs))
# plt.show()
# sns.heatmap((logeuc_covs[0].T@logeuc_covs[0]))
# plt.show()
# sns.heatmap((euc_covs[0].T@euc_covs[0]))
# plt.show()

In [None]:
# Strict upper triangle only: the diagonal holds self-correlations and the
# lower triangle repeats the upper one
upper_tri_indices = np.triu_indices_from(correlation_matrix, k=1)
off_diagonal = correlation_matrix[upper_tri_indices]
average_correlation = np.abs(off_diagonal).mean()

print("Average correlation between projected matrices:", average_correlation)

In [None]:
import numpy as np

# Smoke-test each candidate metric on the first two training covariances:
# distance, mean, tangent-space mapping, inverse mapping and generalized
# eigendecomposition. Every step reports its own failure.
metrics = ["euclid", "logeuclid", "riemann"]
A = train_covs[0]
B = train_covs[1]

# Set precision level for numpy printing (16 decimal places)
np.set_printoptions(precision=16, suppress=True)

# Print shapes of A and B
print(f"A shape: {A.shape}")
print(f"B shape: {B.shape}")

# Print determinants of A and B with high precision (16 decimal places)
print(f"Determinant of A: {np.linalg.det(A):.16f}")
print(f"Determinant of B: {np.linalg.det(B):.16f}")

for met in metrics:
    print(f"\nTesting metric: {met}")
    # A dedicated flag is used because the name `test` already refers to the
    # held-out data array of the selected fold and must not be overwritten.
    all_passed = True
    # Reset intermediates so a failed step cannot silently reuse the result
    # that a previous metric left behind.
    av = tanA = tanB = untanA = untanB = None

    try:
        # Calculate distance with high precision
        dist = distance(A, B, metric=met)
        print(f"Distance ({met}): {dist:.16f}")
    except Exception as e:
        all_passed = False
        print(f"{met} failed during distance calculation: {e}")

    try:
        # Calculate mean covariance
        av = mean_covariance(np.array([A, B]), metric=met)
        print(f"Mean covariance ({met}) calculated successfully.")
        # Eigenvalue decomposition of the mean covariance matrix
        eigenvalues, _ = np.linalg.eigh(av)

        # Check if any eigenvalues are zero (or very close to zero)
        if np.any(eigenvalues <= 1e-10):
            print("Matrix is singular or nearly singular.")
        else:
            print("Matrix is positive definite.")

    except Exception as e:
        all_passed = False
        print(f"{met} failed during mean covariance calculation: {e}")

    if av is None:
        all_passed = False
        print(f"{met} skipped tangent space transformation: no mean covariance available.")
    else:
        try:
            # Convert to tangent space
            tanA = tangent_space(A, av, metric=met)
            tanB = tangent_space(B, av, metric=met)
            print(f"Tangent space transformations ({met}) successful.")
        except Exception as e:
            all_passed = False
            print(f"{met} failed during tangent space transformation: {e}")

    if tanA is None or tanB is None:
        all_passed = False
        print(f"{met} skipped untangent space transformation: no tangent vectors available.")
    else:
        try:
            # Convert back from tangent space
            untanA = untangent_space(tanA, av, metric=met)
            untanB = untangent_space(tanB, av, metric=met)
            print(f"Untangent space transformations ({met}) successful.")
        except Exception as e:
            all_passed = False
            print(f"{met} failed during untangent space transformation: {e}")

    if av is None or untanA is None or untanB is None:
        all_passed = False
        print(f"{met} skipped GEVD: required inputs are missing.")
    else:
        try:
            # Print determinants of the mean covariance and eigenvalue decompositions with high precision
            print(f"Determinant of mean covariance: {np.linalg.det(av):.16f}")
            _, _ = eigh(A, av)
            _, _ = eigh(B, av)
            _, _ = eigh(untanA, av)
            _, _ = eigh(untanB, av)
            print(f"GEVD ({met}) successful.")
        except Exception as e:
            all_passed = False
            print(f"{met} failed during GEVD calculation: {e}")

    if all_passed:
        print(f"{met} passed all tests.")
    else:
        print(f"{met} encountered errors.")


In [None]:
def migp(subs, batch_size=2, m=4800):
    """Incrementally reduce many subjects' data to at most ``m`` rows on the GPU.

    Subjects are processed in batches. Every file of a batch is loaded with
    nibabel, demeaned along axis 1 and passed through ``hcp.normalize``; the
    batch is concatenated along axis 0 and normalised again. The batch is
    stacked under the running reduction ``W``, the Gram matrix of the stack is
    eigendecomposed, and ``W`` becomes the projection of the stack onto the
    last ``m`` eigenvectors (``torch.linalg.eigh`` returns eigenvalues in
    ascending order, so these carry the largest eigenvalues).

    Args:
        subs: Sequence with one entry per subject, each an iterable of file
            paths readable by ``nib.load``.
        batch_size: Number of subjects combined per update.
        m: Maximum number of rows kept in the running reduction.

    Returns:
        The final reduction as a NumPy array, or ``None`` when ``subs`` is
        empty, when a batch contains NaNs, or when the GPU step fails.
    """
    # Nothing to reduce: without this guard the final `.cpu()` call would be
    # made on None.
    if len(subs) == 0:
        return None

    W_gpu = None
    for batch_start in range(0, len(subs), batch_size):
        # Select the current batch of subjects and flatten their path lists
        batch_subs = subs[batch_start:batch_start + batch_size]
        batch_paths = [path for sublist in batch_subs for path in sublist]

        concatenated_data = []
        for task in batch_paths:
            X = nib.load(task).get_fdata()
            Xn = hcp.normalize(X - X.mean(axis=1, keepdims=True))
            concatenated_data.append(Xn)
            del X, Xn

        # Pre-bind the GPU temporaries so the error path can always drop them
        batch_gpu = combined_data_gpu = Q = None
        try:
            # Concatenate data along the first axis using numpy
            batch = np.concatenate(concatenated_data, axis=0)
            batch = hcp.normalize(batch - batch.mean(axis=1, keepdims=True))
            del concatenated_data

            with torch.no_grad():
                # Convert to torch tensor and move to GPU
                batch_gpu = torch.tensor(batch, dtype=torch.float32, device="cuda")
                del batch
                if torch.isnan(batch_gpu).any():
                    print("NaNs detected in the batch data. Aborting SVD operation.")
                    del batch_gpu
                    torch.cuda.empty_cache()
                    return None

                if W_gpu is None:
                    combined_data_gpu = batch_gpu
                else:
                    combined_data_gpu = torch.cat([W_gpu, batch_gpu], dim=0)
                batch_gpu = None
                torch.cuda.empty_cache()

                # Eigendecomposition of the Gram matrix instead of an SVD of
                # the stacked data; eigenvalues come back in ascending order
                _, Q = torch.linalg.eigh(combined_data_gpu @ combined_data_gpu.T)
                W_gpu = Q[:, -m:].T @ combined_data_gpu
                Q = combined_data_gpu = None  # Free up GPU memory
                torch.cuda.empty_cache()
                print(batch_start, "done")
        except Exception as e:
            print(f"Failed during GPU processing: {e}")
            # Drop every GPU reference before asking the allocator to release memory
            batch_gpu = combined_data_gpu = Q = W_gpu = None
            torch.cuda.empty_cache()
            return None

    # Transfer W back to CPU only at the end
    W = W_gpu.cpu().numpy()
    del W_gpu  # Free up GPU memory

    return W

In [None]:
# Group-level reduction of the training subjects, cached on disk for the fold
reducedsubs = migp(combined_paths_train)
save_array_to_outputfolder(
    os.path.join(fold_outputfolder, 'reducedsubs.npy'),
    reducedsubs,
)

In [None]:
# Reload the cached reduction instead of recomputing it
reducedsubs = load_array_from_outputfolder(
    os.path.join(fold_outputfolder, 'reducedsubs.npy')
)

In [None]:
def process_subject_haufe(sub,pinv_TF):
    """Project one subject's normalised data through its slice of ``pinv_TF``.

    Every file in ``sub`` is loaded with nibabel as float32, demeaned along
    axis 1 and passed through ``hcp.normalize``. The files are concatenated
    along axis 0, the result is demeaned and normalised once more, and then
    left-multiplied by ``pinv_TF``.

    Args:
        sub: Iterable of file paths readable by ``nib.load``.
        pinv_TF: Matrix whose column count matches the row count of the
            subject's concatenated data.

    Returns:
        ``pinv_TF @ Xp``, or ``None`` if any step raised; the error and its
        traceback are printed in that case.
    """
    try:
        concatenated_data = []
        for task in sub:
            X = nib.load(task).get_fdata(dtype=np.float32)
            Xn = hcp.normalize(X-X.mean(axis=1, keepdims=True))
            concatenated_data.append(Xn)
            del X, Xn

        # Concatenate data along the first axis
        subject = np.concatenate(concatenated_data, axis=0)
        del concatenated_data  # Explicitly delete the concatenated data list

        Xp = hcp.normalize(subject - subject.mean(axis=1, keepdims=True))
        del subject
        Xpf = pinv_TF@Xp
        del Xp
        return Xpf

    except Exception as e:
        # Imported here so the handler cannot fail with a NameError and hide
        # the original error when the module is not imported at file level.
        import traceback

        print(f"Error processing subject: {e}")
        traceback.print_exc()  # Print the full traceback
        return None


def haufe_transform(F, parcellated,paths):
    """Carry parcellated filters over to the subjects' full-resolution data.

    ``parcellated`` is flattened over all leading axes and projected with
    ``pinv(F.T)``; the pseudo-inverse of that product is split column-wise
    into ``len(paths)`` chunks, one per subject. Each subject's chunk is
    applied to its data in a worker process via ``process_subject_haufe`` and
    the per-subject results are summed.

    NOTE(review): ``np.array_split`` produces (nearly) equal chunks, so this
    assumes every subject contributes the same number of rows; confirm.

    Args:
        F: Filter matrix; ``F.T`` is pseudo-inverted.
        parcellated: Parcellated training data, reshaped to 2-D keeping the
            last axis.
        paths: One entry per subject, forwarded to ``process_subject_haufe``.

    Returns:
        Sum over subjects of the per-subject projections.

    Raises:
        RuntimeError: If any subject failed to process.
    """
    pinv_TF = np.linalg.pinv(parcellated.reshape(-1,parcellated.shape[-1]) @ np.linalg.pinv(F.T))

    # One column block of the pseudo-inverse per subject
    pinv_TF_list = np.array_split(pinv_TF, len(paths), axis=1)

    # Half of the cores, but never fewer than one worker; os.cpu_count() may
    # return None or 1, both of which broke the previous int(count * .5).
    n_workers = max(1, (os.cpu_count() or 1) // 2)

    with ProcessPoolExecutor(max_workers=n_workers) as executor:
        # Use map to process subjects in parallel
        results = list(executor.map(process_subject_haufe, paths, pinv_TF_list))

    # process_subject_haufe signals failure with None; fail clearly instead
    # of letting the sum below break on a mixed array.
    failed = [index for index, result in enumerate(results) if result is None]
    if failed:
        raise RuntimeError(f"haufe_transform: processing failed for subject indices {failed}")

    blocks = np.array(results)
    print(blocks.shape)
    return blocks.sum(axis=0)

In [None]:
# Bring the parcellated filters to full resolution and cache the result
filters_transform = haufe_transform(
    filters_parcellated,
    train,
    combined_paths_train,
)
save_array_to_outputfolder(
    os.path.join(fold_outputfolder, "filters_transform.npy"),
    filters_transform,
)

In [None]:
# Reload the cached full-resolution filters
filters_transform = load_array_from_outputfolder(
    os.path.join(fold_outputfolder, "filters_transform.npy")
)

In [None]:
# Shape check: filters_parcellated holds one column per selected eigenvector
print(filters_parcellated.shape)

In [None]:
# Interactive surface view of the first parcellated filter, thresholded at the
# median absolute weight
plotting.view_surf(
    hcp.mesh.inflated,
    hcp.cortex_data(hcp.unparcellate(filters_parcellated[:, 0], hcp.mmp)),
    threshold=np.percentile(np.abs(filters_parcellated[:, 0]), 50),
    bg_map=hcp.mesh.sulc,
)

In [None]:
# Interactive surface view of the first full-resolution filter, keeping only
# the top 10% of absolute weights
plotting.view_surf(
    hcp.mesh.inflated,
    hcp.cortex_data(filters_transform[0, :]),
    threshold=np.percentile(np.abs(filters_transform[0, :]), 90),
    bg_map=hcp.mesh.sulc,
)

### Orthonormalize Filters

In [None]:
def orthonormalize_filters(W):
    """Orthonormalize the columns of ``W`` with a (reduced) QR decomposition.

    The input and output shapes are printed, followed by a short check of the
    result: the inner product of the first two columns and the squared norm
    of each. Checks that need a second column are skipped when ``W`` has only
    one, so a single filter no longer raises an IndexError.

    Args:
        W: 2-D array whose columns are the filters.

    Returns:
        ``Q`` from ``np.linalg.qr(W)``: orthonormal columns spanning the
        column space of ``W``.
    """
    print(W.shape)

    # Perform QR decomposition to orthonormalize the filters
    Q, _ = np.linalg.qr(W)

    print(Q.shape)

    n_columns = Q.shape[1]

    # Verify that the inner product between the two orthonormalized vectors is 0 (orthogonality)
    if n_columns > 1:
        print(f'Inner product between Q[:, 0] and Q[:, 1]: {np.dot(Q[:, 0].T, Q[:, 1])} (should be 0)')

    # Verify that the inner product within each vector is 1 (normalization)
    if n_columns > 0:
        print(f'Norm of Q[:, 0]: {np.dot(Q[:, 0].T, Q[:, 0])} (should be 1)')
    if n_columns > 1:
        print(f'Norm of Q[:, 1]: {np.dot(Q[:, 1].T, Q[:, 1])} (should be 1)')

    return Q
# Example usage
# filters_transform stores one filter per ROW (it is indexed as
# filters_transform[0, :] when plotted), while the QR step orthonormalizes
# COLUMNS. Passing the transpose yields one orthonormal filter per column, so
# `filters.T` can later be stacked under `vt`.
filters = orthonormalize_filters(filters_transform.T)

### PPCA

In [None]:
def call_pca_dim(Data=None,eigs=None,N=None):
    """Run the MATLAB model-order estimators and return the chosen order.

    Starts a MATLAB engine, adds the melodic folder to its path and calls
    ``pca_dim`` on ``Data`` or, when ``Data`` is not given, ``pca_dim_eigs``
    on ``eigs`` and ``N``. The engine is always shut down, even when a MATLAB
    call fails. The eigenspectra and the standardized criteria are plotted.

    Args:
        Data: Data matrix passed to ``pca_dim``; takes precedence over ``eigs``.
        eigs: NumPy array of eigenvalues passed to ``pca_dim_eigs``.
        N: Value passed to ``pca_dim_eigs`` next to ``eigs``.

    Returns:
        One plus the index of the maximum of the standardized ``rrn`` criterion.

    Raises:
        ValueError: If neither ``Data`` nor ``eigs`` is provided.
    """
    # Validate before paying for a MATLAB start-up
    if Data is None and eigs is None:
        raise ValueError("call_pca_dim requires either `Data` or `eigs`.")

    # Start MATLAB engine
    eng = matlab.engine.start_matlab()
    try:
        # Add the path to the MATLAB function
        eng.addpath("/project/3022057.01/IFA/melodic", nargout=0)

        if Data is not None:
            prob = eng.pca_dim(matlab.double(Data))
        else:
            prob = eng.pca_dim_eigs(matlab.double(eigs.tolist()), matlab.double([N]))

        # Extract and convert each variable; the criteria become column vectors
        lap = np.array(prob['lap']).flatten().reshape(-1, 1)
        bic = np.array(prob['bic']).flatten().reshape(-1, 1)
        rrn = np.array(prob['rrn']).flatten().reshape(-1, 1)
        AIC = np.array(prob['AIC']).flatten().reshape(-1, 1)
        MDL = np.array(prob['MDL']).flatten().reshape(-1, 1)
        eig = np.array(prob['eig']).flatten()
        orig_eig = np.array(prob['orig_eig']).flatten()

        eng.eval('clearvars', nargout=0)
    finally:
        # Stop MATLAB engine no matter what happened above
        eng.quit()

    plt.figure(figsize=(10, 6))
    plt.scatter(np.arange(len(eig)),eig,label="Adjusted Eigenspectrum")
    plt.scatter(np.arange(len(orig_eig)),orig_eig,label="Eigenspectrum")
    plt.xlabel('Index')
    plt.ylabel('Eigenvalue')
    plt.legend()
    plt.title('Scree Plot')
    plt.show()

    # Mean-impute missing values, then standardize every criterion on its own
    imputer = SimpleImputer(strategy='mean')
    scaler = StandardScaler()

    def _standardize(column):
        return scaler.fit_transform(imputer.fit_transform(column))

    lap_std = _standardize(lap)
    bic_std = _standardize(bic)
    rrn_std = _standardize(rrn)
    AIC_std = _standardize(AIC)
    MDL_std = _standardize(MDL)

    # Plot the results
    plt.figure(figsize=(10, 6))
    for label, series in (
        ('Laplacian', lap_std),
        ('BIC', bic_std),
        ('RRN', rrn_std),
        ('AIC', AIC_std),
        ('MDL', MDL_std),
    ):
        plt.scatter(np.arange(len(series)), series, label=label)

    plt.xlabel('Index')
    plt.ylabel('Standardized Value')
    plt.legend()
    plt.title('Scatter Plot of Standardized Eigenvalues and Model Order Selection Values')
    plt.show()

    # Index is zero-based, the model order is a count
    return np.argmax(rrn_std)+1

def get_n_and_some(data):
    """Estimate the model order of ``data`` and return its right singular vectors.

    ``data`` is moved to the GPU when one is available (CPU otherwise), each
    row is demeaned, and an SVD is taken. The squared singular values divided
    by ``data.shape[1] - 1`` are handed to ``call_pca_dim`` as eigenvalues.

    Args:
        data: 2-D torch tensor.

    Returns:
        Tuple ``(n_components, vt)``: the estimated order as an int32 tensor
        on the compute device, and the transposed ``V`` factor of the SVD.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    on_device = data.to(device, dtype=torch.float32)
    groupN = on_device.shape[1] - 1

    # Remove the mean of every row
    centered = on_device - on_device.mean(dim=1, keepdim=True)
    del on_device  # Free up GPU memory
    torch.cuda.empty_cache()

    _, singular_values, v = torch.svd(centered)
    del centered  # Free up GPU memory
    torch.cuda.empty_cache()

    # Singular values -> eigenvalues, handed over as a NumPy array
    eigenvalues = singular_values.pow(2) / groupN
    eigenvalues_np = eigenvalues.cpu().numpy()
    del eigenvalues, singular_values  # Free up GPU memory
    torch.cuda.empty_cache()

    order = call_pca_dim(eigs=eigenvalues_np, N=groupN)
    n_components = torch.tensor(order, device=device, dtype=torch.int32)

    return n_components, v.T

def PPCA(data, filters=None, threshold=1.6, niters=10, n=-1):
    """Variance-normalise ``data`` by the residual noise left outside a basis.

    In every pass a basis is chosen: the transposed ``filters`` when given,
    otherwise the leading right singular vectors returned by
    ``get_n_and_some`` (the first ``n`` rows when ``n`` is positive, else the
    estimated number of components). The part of the data not explained by
    the basis is treated as noise, and each column of the data is divided by
    the standard deviation of its noise, using the rank of the basis as the
    degrees-of-freedom correction. Passes repeat until the component count
    stops changing or ``niters`` is reached.

    Args:
        data: 2-D array-like that can be turned into a float32 tensor.
        filters: Optional array whose transpose is used as a fixed basis.
        threshold: Accepted for compatibility; not used by the body.
        niters: Maximum number of passes.
        n: If positive, number of singular vectors to keep instead of the
            estimated order.

    Returns:
        Tuple ``(data, basis)`` as NumPy arrays: the normalised data and the
        basis from the last pass.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    data_gpu = torch.tensor(data, device=device, dtype=torch.float32)

    n_components, n_prev = -1, -2
    passes_done = 0

    while n_components != n_prev and passes_done < niters:
        n_prev = n_components

        if filters is None:
            n_components, vt = get_n_and_some(data_gpu)
            if n > 0:
                print(n)
                basis_gpu = vt[:n, :]
            else:
                basis_gpu = vt[:n_components, :]
            del vt
            torch.cuda.empty_cache()
        else:
            basis_gpu = torch.tensor(filters.T, device=device, dtype=torch.float32)

        print(n_prev, n_components)

        # Residual after projecting the data onto the basis
        est_noise = data_gpu - (data_gpu @ torch.linalg.pinv(basis_gpu)) @ basis_gpu
        basis_rank = torch.linalg.matrix_rank(basis_gpu)
        est_residual_std = torch.std(est_noise, dim=0, correction=basis_rank)
        del est_noise
        torch.cuda.empty_cache()

        # Scale every column by its residual standard deviation
        data_gpu = data_gpu / est_residual_std
        passes_done += 1

    data = data_gpu.cpu().numpy()
    basis = basis_gpu.cpu().numpy()
    del data_gpu, basis_gpu
    torch.cuda.empty_cache()
    return data, basis

In [None]:
# One normalisation pass on a copy of the reduction; both outputs are cached
subs_data_VN, vt = PPCA(
    reducedsubs.copy(),
    threshold=0.0,
    niters=1,
)
for _filename, _array in (("subs_data_VN.npy", subs_data_VN), ("vt.npy", vt)):
    save_array_to_outputfolder(os.path.join(fold_outputfolder, _filename), _array)

In [None]:
# Columns are samples i.e. XXT is the covariance matrix formed
def whiten(X,n_components, method="SVD", visualize=False):
    """Demean the rows of ``X`` and whiten them.

    The input array is left untouched: demeaning is done on a new array
    rather than in place.

    Args:
        X: 2-D array with variables in rows and samples in columns.
        n_components: Number of whitened components to keep; only the
            ``"SVD"`` method uses it.
        method: ``"SVD"`` (scaled left singular vectors), ``"Cholesky"``
            (inverse Cholesky factor of the covariance) or ``"InvCov"``
            (symmetric inverse square root of the covariance).
        visualize: Accepted for compatibility; not used by the body.

    Returns:
        Tuple ``(whitened_data, whitening_matrix)`` where ``whitened_data``
        is ``whitening_matrix @ X_demeaned``.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    # -1 to account for demean
    n_samples = X.shape[-1]-1
    X_mean = X.mean(axis=-1)
    # Out-of-place so the caller's array is not modified
    X = X - X_mean[:, np.newaxis]

    if method == "SVD":
        u, d = svd(X, full_matrices=False, check_finite=False)[:2]
        K = (u / d).T[:n_components]  # see (6.33) p.140
        del u, d
        whitening_matrix = np.sqrt(n_samples)*K
    elif method == "Cholesky":
        # Does not orthogonalize, just gives unit covariance
        L = np.linalg.cholesky(np.cov(X,ddof=1))
        whitening_matrix = np.linalg.inv(L)
    elif method == "InvCov":
        # Calculate the covariance matrix of the centered data
        cov_matrix = np.cov(X)
        # Perform eigenvalue decomposition of the covariance matrix
        eigvals, eigvecs = np.linalg.eigh(cov_matrix)
        # Calculate the whitening matrix
        D_inv_sqrt = np.diag(1.0 / np.sqrt(eigvals))
        whitening_matrix = eigvecs @ D_inv_sqrt @ eigvecs.T
    else:
        raise ValueError(f"Unknown whitening method: {method!r}")

    whitened_data = whitening_matrix@X

    return whitened_data, whitening_matrix

# Combine Basis: stack the transposed filters under the PPCA basis
combined_spatial = np.vstack([vt, filters.T])

# Whiten the combined basis, keeping every row
whitened_basis, whitening_matrix_pre = whiten(
    combined_spatial,
    n_components=combined_spatial.shape[0],
    method="InvCov",
    visualize=True,
)

# Re-normalise the reduction against the whitened combined basis
subs_data_com_VN, _ = PPCA(
    reducedsubs.copy(),
    filters=whitened_basis.T,
    threshold=0.0,
    niters=1,
)

# tempbasis = np.linalg.pinv(subs_data_com_VN@np.linalg.pinv(whitened_basis))@subs_data_com_VN
# whitened_basis, _ = whiten(tempbasis,n_components=tempbasis.shape[0],method="InvCov",visualize=True)

# for i in range(0,3):
#     # Readjust the MiGP data based on the new basis
#     subs_data_com_VN, _ = PPCA(subs_data_com_VN.copy(), filters=whitened_basis.T, threshold=0.0, niters=1)

#     # Recalculate the basis via Haufe transform based on adjusted MIGP data
#     tempbasis = np.linalg.pinv(subs_data_com_VN@np.linalg.pinv(whitened_basis))@subs_data_com_VN

#     # Rewhiten the basis
#     whitened_basis, whitening_matrix = whiten(tempbasis,n_components=combined_spatial.shape[0],method="InvCov",visualize=True)
    

In [None]:
def ICA(data,whitened_data):
    """Run FastICA on a pre-whitened basis and derive mixing/unmixing matrices.

    FastICA (with its own whitening disabled) is fitted on the transposed
    ``whitened_data``. The mixing matrix is obtained by regressing ``data``
    on the components, the unmixing matrix is its pseudo-inverse. Two
    consistency checks are printed and heat maps of ``W @ data`` and of the
    components are shown.

    Args:
        data: Matrix that the components are related back to.
        whitened_data: Whitened basis with components in rows.

    Returns:
        Tuple ``(IFA_components, A, W)``: components, mixing matrix and
        unmixing matrix.
    """
    estimator = FastICA(whiten=False)
    # Takes in array-like of shape (n_samples, n_features) and returns ndarray of shape (n_samples, n_components)
    IFA_components = estimator.fit_transform(whitened_data.T).T

    A = data @ np.linalg.pinv(IFA_components)
    W = np.linalg.pinv(A)
    recovered = W @ data

    print("The combined unmixing matrix correctly calculates the components: ", np.allclose(recovered, IFA_components))
    print("The combined mixing matrix correctly reconstructs the low rank data_demean: ", np.allclose(A@IFA_components, A@recovered))

    # Side-by-side heat maps: recovered components (left), ICA output (right)
    fig, axes = plt.subplots(1, 2, figsize=(14, 6))
    panels = (
        (recovered, 'Combined Unmixing Matrix (W @ data)'),
        (IFA_components, 'IFA Components'),
    )
    for axis, (matrix, title) in zip(axes, panels):
        sns.heatmap(matrix, cmap='viridis', ax=axis)
        axis.set_title(title)
        axis.set_xlabel('Components')
        axis.set_ylabel('Samples')

    # Adjust layout
    plt.tight_layout()
    plt.show()

    return IFA_components, A, W

In [None]:
# ICA on the whitened combined basis (PPCA basis plus filters)
raw_components_combined, A_combined, W_combined = ICA(
    subs_data_com_VN,
    whitened_basis,
)

In [None]:
# Comparison decomposition with the same number of components as the
# combined basis, but taken from the PPCA basis alone
n_total_components = vt.shape[0] + filters.shape[1]
subs_data_VN_more, vtmore = PPCA(
    reducedsubs.copy(),
    threshold=0.0,
    niters=1,
    n=n_total_components,
)
vtmorewhiten, _ = whiten(
    vtmore,
    n_components=vtmore.shape[0],
    method="SVD",
)
subs_data_VN_more, _ = PPCA(
    reducedsubs.copy(),
    filters=vtmorewhiten.T,
    threshold=0.0,
    niters=1,
)

raw_components_major_more, A_major_more, W_major_more = ICA(
    subs_data_VN_more,
    vtmorewhiten,
)

In [None]:
# Cache both decompositions for the fold, in the original order
for _filename, _array in (
    ("raw_components_combined.npy", raw_components_combined),
    ("A_combined.npy", A_combined),
    ("W_combined.npy", W_combined),
    ("raw_components_major_more.npy", raw_components_major_more),
    ("A_major_more.npy", A_major_more),
    ("W_major_more.npy", W_major_more),
):
    save_array_to_outputfolder(os.path.join(fold_outputfolder, _filename), _array)

In [None]:
def noise_projection(W,data, visualize=True):
    """Estimate the per-column noise level left after projecting through ``W``.

    The data are split into the part reproduced by ``pinv(W) @ W`` and the
    residual. The returned value is the standard deviation of the residual
    along axis 0, with the rank of ``W`` as the degrees-of-freedom correction.

    Args:
        W: Unmixing matrix.
        data: Data matrix that ``W`` is applied to from the left.
        visualize: When true, plot the first column of signal and residual
            (first 1000 rows) and a scatter of the residual standard
            deviations.

    Returns:
        1-D array with one residual standard deviation per column of ``data``.
    """
    components = W @ data
    Signals = np.linalg.pinv(W) @ components
    Residuals = data - Signals
    rank_of_W = np.linalg.matrix_rank(W)
    residual_std = np.std(Residuals, axis=0, ddof=rank_of_W)
    # Trace of I-pinv(W)(W) is equal to the nullity (n-m given n > m) of the reconstructed matrix

    if not visualize:
        return residual_std

    n = 1000
    plt.figure()
    for series in (Signals, Residuals):
        plt.plot(series[:n, 0:1])
    plt.legend(['Signal','Noise', 'Data' ,'Reconstruction Error'])
    plt.title("Calculations based on pinv(W)W Projection Matrix")
    plt.show()

    plt.scatter(range(0, residual_std.shape[0]), residual_std)
    plt.title("Noise std Per Voxel based on pinv(W)W Projection Matrix")
    plt.show()
    return residual_std


def threshold_and_visualize(data, W, components,visualize=False):
    """Convert components to z-scores and mask them with an FDR threshold.

    Each column of ``components`` is divided element-wise by the per-column
    noise std of ``data`` (from ``noise_projection``), turned into two-sided
    normal p-values, and corrected per component with Benjamini-Hochberg FDR
    at alpha = 0.05.

    Parameters:
    data (array): Data handed to ``noise_projection`` (n x V).
    W (array): Projection matrix handed to ``noise_projection``.
    components (array): One component per column (V x C); its rows must line
        up with the columns of ``data``.
    visualize (bool): When True, also show one z-score histogram per component.

    Returns:
    z_scores (array): Unthresholded z-scores (V x C).
    z_scores_array (array): z-scores with non-significant entries set to 0 (V x C).

    The two summary heat maps at the end are always drawn.
    """
    # NOTE(review): noise_projection is called with its own default
    # (visualize=True), so its plots appear even when visualize=False here —
    # confirm whether the flag should be forwarded.
    voxel_noise = noise_projection(W,data)[:, np.newaxis]
    z_scores_array = np.zeros_like(components)
    z_scores = np.zeros_like(components)

    # Process each component individually
    for i in range(components.shape[1]):
        z_score = components[:, i:i+1] / voxel_noise
        # Two-sided p-value: P(|Z| > z) = 2 * SF(|z|). The survival function is
        # used instead of 1 - CDF so very large |z| do not round to exactly 0.
        p_values = 2 * norm.sf(np.abs(z_score))
        # Multiple-comparisons correction within the current component
        # https://www.statsmodels.org/dev/generated/statsmodels.stats.multitest.multipletests.html
        reject, _, _, _ = multipletests(p_values.flatten(), alpha=0.05, method='fdr_bh')
        z_scores_array[:, i:i+1] = z_score * reject[:, np.newaxis]
        z_scores[:, i:i+1] = z_score

        if not visualize:
            continue

        # One histogram per component, shown immediately so figures do not
        # pile up until the final plt.show().
        if np.any(reject):
            hist_title = f"Histogram for Filter {i}"
        else:
            print(f'Component {i} did not contain any significant values')
            hist_title = f"Histogram for Filter {i} NO SIGNIFICANT VALUES"
        plt.figure()
        plt.hist(z_score, bins=30, color='blue', alpha=0.7)
        plt.title(hist_title)
        plt.xlabel('Value')
        plt.ylabel('Frequency')
        plt.show()

    fig, axes = plt.subplots(1, 2, figsize=(14, 6))
    # Heat map of the raw z-scores
    sns.heatmap(z_scores, cmap='viridis', ax=axes[0])
    axes[0].set_title('z_score')
    axes[0].set_xlabel('Components')
    axes[0].set_ylabel('Samples')

    # Heat map of the FDR-thresholded z-scores
    sns.heatmap(z_scores_array, cmap='viridis', ax=axes[1])
    axes[1].set_title('z_score thresh')
    axes[1].set_xlabel('Components')
    axes[1].set_ylabel('Samples')

    # Adjust layout
    plt.tight_layout()
    plt.show()

    return z_scores, z_scores_array

In [None]:
# z-score the combined components (transposed so each component is a column);
# returns the raw z-scores and the FDR-masked z-scores.
z_scores_unthresh, z_scores_thresh = threshold_and_visualize(subs_data_com_VN, W_combined, raw_components_combined.T, visualize=False)

In [None]:
# Same z-scoring for the "major_more" decomposition.
z_scores_unthresh_major_more, z_scores_thresh_major_more = threshold_and_visualize(subs_data_VN_more, W_major_more, raw_components_major_more.T, visualize=False)

In [None]:
# Store raw and thresholded z-score maps of both decompositions in the fold folder.
save_array_to_outputfolder(os.path.join(fold_outputfolder,"z_scores_unthresh.npy"), z_scores_unthresh)
save_array_to_outputfolder(os.path.join(fold_outputfolder,"z_scores_thresh.npy"), z_scores_thresh)
save_array_to_outputfolder(os.path.join(fold_outputfolder,"z_scores_unthresh_major_more.npy"), z_scores_unthresh_major_more)
save_array_to_outputfolder(os.path.join(fold_outputfolder,"z_scores_thresh_major_more.npy"), z_scores_thresh_major_more)

In [None]:
# Reload the z-score maps from disk so the following cells can run without
# recomputing the cells above.
z_scores_unthresh = load_array_from_outputfolder(os.path.join(fold_outputfolder,"z_scores_unthresh.npy"))
z_scores_thresh = load_array_from_outputfolder(os.path.join(fold_outputfolder,"z_scores_thresh.npy"))
z_scores_unthresh_major_more = load_array_from_outputfolder(os.path.join(fold_outputfolder,"z_scores_unthresh_major_more.npy"))
z_scores_thresh_major_more = load_array_from_outputfolder(os.path.join(fold_outputfolder,"z_scores_thresh_major_more.npy"))

In [None]:
from functools import partial

# https://www.frontiersin.org/journals/neuroscience/articles/10.3389/fnins.2017.00115/full

def calculate_netmat_and_spatial_map(Xn, z_maps):
    """
    Dual-regress one subject against a set of group maps.

    Stage 1 regresses the column-demeaned maps onto the data to obtain one
    time course per component; stage 2 regresses the normalized time courses
    back onto the data to obtain subject-level spatial maps.

    Parameters:
    Xn (array): Time x Grayordinates normalized data matrix (T x V)
    z_maps (array): Grayordinates x Components map (V x C)

    Returns:
    An (array): Time x Components time courses after hcp.normalize (T x C)
    netmat (array): Components x Components network matrix (C x C)
    spatial_map (array): Components x Grayordinates matrix (C x V)
    """
    # Remove each component map's mean across grayordinates before using it as a regressor.
    centered_maps = z_maps - z_maps.mean(axis=0, keepdims=True)

    # Stage 1: least-squares time courses via the pseudo-inverse, then normalized.
    timecourses = hcp.normalize(Xn @ np.linalg.pinv(centered_maps.T))

    # Cross-product of the normalized time courses, scaled by (T - 1).
    n_timepoints = Xn.shape[0]
    netmat = (timecourses.T @ timecourses) / (n_timepoints - 1)

    # Stage 2: least-squares spatial maps from the normalized time courses.
    spatial_map = np.linalg.pinv(timecourses) @ Xn

    return timecourses, netmat, spatial_map

def dual_regress_sub(sub_path, z_maps_1, z_maps_2):
    """
    Run dual regression for one subject against two sets of maps.

    Parameters:
    sub_path (iterable): File paths of this subject's runs, each loaded with nibabel.
    z_maps_1 (array): First set of maps passed to calculate_netmat_and_spatial_map.
    z_maps_2 (array): Second set of maps passed to calculate_netmat_and_spatial_map.

    Returns:
    Two (An, netmat, spatial_map) tuples, one per map set, or (None, None)
    when anything raises while processing the subject (the error is printed).
    """
    try:
        runs = []
        for task_file in sub_path:
            raw = nib.load(task_file).get_fdata(dtype=np.float32)
            # Subtract the mean along axis 1, then normalize each run on its own.
            # NOTE(review): the original notes call this array Grayordinates x Time,
            # yet the runs are stacked on axis 0 as if rows were time — confirm the layout.
            runs.append(hcp.normalize(raw - raw.mean(axis=1, keepdims=True)))
            del raw

        # Stack every run of the subject along the first axis.
        stacked = np.concatenate(runs, axis=0)
        del runs

        # Demean/normalize once more over the concatenated data.
        Xn = hcp.normalize(stacked - stacked.mean(axis=1, keepdims=True))
        del stacked

        # Same regression for both map sets, first set first.
        return tuple(
            calculate_netmat_and_spatial_map(Xn, maps)
            for maps in (z_maps_1, z_maps_2)
        )

    except Exception as e:
        print(f"Error processing subject: {e}")
        return None, None

def dual_regress(group_paths, z_maps_1, z_maps_2):
    """
    Run dual_regress_sub for every subject in parallel and stack the results.

    Parameters:
    group_paths (iterable): One entry per subject, each passed to dual_regress_sub.
    z_maps_1 (array): First set of maps.
    z_maps_2 (array): Second set of maps.

    Returns:
    Two tuples (An, netmats, spatial_maps) of stacked arrays, one per map set.
    Subjects for which dual_regress_sub returned (None, None) are left out, so
    the first axis can be shorter than group_paths; a warning is printed then.

    Raises:
    ValueError: if no subject could be processed.
    """
    # Use ~70% of the cores but never fewer than one worker: int(1 * 0.7) is 0,
    # which ProcessPoolExecutor rejects, and os.cpu_count() may return None.
    n_workers = max(1, int((os.cpu_count() or 1) * 0.7))

    # partial only binds the map arguments for executor.map; with a process pool
    # the bound arrays are still pickled and sent to the workers.
    worker = partial(dual_regress_sub, z_maps_1=z_maps_1, z_maps_2=z_maps_2)

    with ProcessPoolExecutor(max_workers=n_workers) as executor:
        results = list(executor.map(worker, group_paths))

    # A failed subject comes back as (None, None); keep only complete results.
    succeeded = [res for res in results if res[0] is not None and res[1] is not None]
    n_failed = len(results) - len(succeeded)
    if n_failed:
        print(f"Warning: {n_failed} of {len(results)} subjects failed and were dropped; "
              "outputs no longer align one-to-one with group_paths")
    if not succeeded:
        raise ValueError("dual_regress: no subject could be processed")

    # Separate the results for the two bases, collecting An, netmat, and spatial_map
    An_1, netmats_1, spatial_maps_1 = zip(*[res[0] for res in succeeded])
    An_2, netmats_2, spatial_maps_2 = zip(*[res[1] for res in succeeded])

    return (
        (np.array(An_1), np.array(netmats_1), np.array(spatial_maps_1)),
        (np.array(An_2), np.array(netmats_2), np.array(spatial_maps_2)),
    )

# Save function for An, netmats, and spatial maps
def save_numpy_arrays(output_prefix, An_1, netmats_1, spatial_maps_1, An_2, netmats_2, spatial_maps_2):
    """
    Write the six dual-regression outputs to disk via save_array_to_outputfolder.

    Each array goes to "<output_prefix>_<name>.npy", where <name> is one of
    An_1, netmats_1, spatial_maps_1, An_2, netmats_2, spatial_maps_2.

    Parameters:
    output_prefix (str): Prefix for the output files.
    An_1 (np.array): Time x Components matrix for z_maps_1.
    netmats_1 (np.array): Network matrices for z_maps_1.
    spatial_maps_1 (np.array): Spatial maps for z_maps_1.
    An_2 (np.array): Time x Components matrix for z_maps_2.
    netmats_2 (np.array): Network matrices for z_maps_2.
    spatial_maps_2 (np.array): Spatial maps for z_maps_2.
    """
    # (file suffix, array) pairs, written in this order.
    outputs = (
        ("An_1", An_1),
        ("netmats_1", netmats_1),
        ("spatial_maps_1", spatial_maps_1),
        ("An_2", An_2),
        ("netmats_2", netmats_2),
        ("spatial_maps_2", spatial_maps_2),
    )
    for suffix, array in outputs:
        save_array_to_outputfolder(f"{output_prefix}_{suffix}.npy", array)


In [None]:
# For Group A - Training Set
# Training set: dual regression against both sets of unthresholded z-score maps
# (1 = combined, 2 = major_more), then save the results with the "group_train" prefix.
(group_An_1_train, group_netmats_1_train, group_spatial_maps_1_train), (group_An_2_train, group_netmats_2_train, group_spatial_maps_2_train) = dual_regress(combined_paths_train, z_scores_unthresh, z_scores_unthresh_major_more)
save_numpy_arrays(os.path.join(fold_outputfolder,"group_train"), group_An_1_train, group_netmats_1_train, group_spatial_maps_1_train, group_An_2_train, group_netmats_2_train, group_spatial_maps_2_train)
# Test set: same maps, results saved with the "group_test" prefix.
(group_An_1_test, group_netmats_1_test, group_spatial_maps_1_test), (group_An_2_test, group_netmats_2_test, group_spatial_maps_2_test) = dual_regress(combined_paths_test, z_scores_unthresh, z_scores_unthresh_major_more)
save_numpy_arrays(os.path.join(fold_outputfolder,"group_test"), group_An_1_test, group_netmats_1_test, group_spatial_maps_1_test, group_An_2_test, group_netmats_2_test, group_spatial_maps_2_test)

In [None]:
# Reload only the network matrices (train/test, both map sets) written by save_numpy_arrays.
group_netmats_1_train = load_array_from_outputfolder(os.path.join(fold_outputfolder,"group_train_netmats_1.npy"))
group_netmats_2_train = load_array_from_outputfolder(os.path.join(fold_outputfolder,"group_train_netmats_2.npy"))
group_netmats_1_test = load_array_from_outputfolder(os.path.join(fold_outputfolder,"group_test_netmats_1.npy"))
group_netmats_2_test = load_array_from_outputfolder(os.path.join(fold_outputfolder,"group_test_netmats_2.npy"))

In [None]:
# Sanity check on the shape of the stacked training netmats for the second map set.
print(group_netmats_2_train.shape)

In [None]:
def tan_regression(train_netmats,test_netmats, train_values, test_values, train_age, test_age, metric=metric):
    """
    Tangent-space regression of a target value from network matrices.

    The netmats are projected to the tangent space at the training-set mean,
    centered with the training mean, age is regressed out with a model fitted
    on the training data, and a LinearSVR is fitted on the training features.
    Scores are printed and prediction/residual plots are drawn; nothing is
    returned.

    Relies on names defined elsewhere in the notebook: n_bins, regress_out_age,
    LinearSVR, pearsonr, plot_predictions and plot_residuals.

    Parameters:
    train_netmats (array): Training netmats, one matrix per subject.
    test_netmats (array): Test netmats, one matrix per subject.
    train_values (array): Training targets.
    test_values (array): Test targets.
    train_age (array): Ages of the training subjects (reshaped to a column here).
    test_age (array): Ages of the test subjects (reshaped to a column here).
    metric (str): Metric passed to mean_covariance and tangent_space.
    """
    # Histogram of the training targets
    plt.figure(figsize=(10, 6))
    plt.hist(train_values, bins=n_bins, alpha=0.7, color='blue', edgecolor='black', label='Train Data')
    plt.title(f'Train Data Histogram and Weights')
    plt.xlabel('Target Value')
    plt.ylabel('Frequency / Weight')
    plt.legend()
    plt.show()

    # Reference point and tangent projection come from the training set only.
    train_mean = mean_covariance(train_netmats, metric=metric)
    tan_train = tangent_space(train_netmats, train_mean, metric=metric)
    tan_test = tangent_space(test_netmats, train_mean, metric=metric)

    # Center both sets with the training mean of the tangent vectors.
    tan_mean = np.mean(tan_train, axis=0)
    tan_train_centered = tan_train - tan_mean
    tan_test_centered = tan_test - tan_mean

    # Remove the age effect using a model fitted on the training data.
    age_reg = regress_out_age(tan_train_centered, train_age)
    tan_train_centered = tan_train_centered - age_reg.predict(train_age.reshape(-1, 1))
    # Start from the centered test features so train and test receive the same
    # preprocessing (previously the uncentered tan_test was used here).
    tan_test_centered = tan_test_centered - age_reg.predict(test_age.reshape(-1, 1))

    # reg_model = Ridge(alpha=1)
    reg_model = LinearSVR(C=1,fit_intercept=False)

    reg_model.fit(tan_train_centered, train_values)

    # Evaluate the model on the test set
    test_score = reg_model.score(tan_test_centered, test_values)
    predictions = reg_model.predict(tan_test_centered)
    fold_corr, _ = pearsonr(test_values, predictions)

    print(f"Fold Test R² Score: {test_score}")
    print(f"Fold Test R Score: {fold_corr}")

    # Plot predictions vs true values for test fold
    plot_predictions(test_values, predictions,f'Test Predictions vs True Values - Fold R²: {test_score:.2f}', 'Test')
    # Now plot for train fold, using the train_values argument rather than the
    # notebook-level values_train.
    predictions_train = reg_model.predict(tan_train_centered)
    plot_predictions(train_values, predictions_train, f'Train Predictions vs True Values - Fold R²: {test_score:.2f}', 'Train')
    # Plot residuals vs true values for test fold
    plot_residuals(test_values, predictions, f'Test Residuals vs True Values - Fold R²: {test_score:.2f}', 'Test')

    # Plot residuals vs true values for train fold
    plot_residuals(train_values, predictions_train, f'Train Residuals vs True Values - Fold R²: {test_score:.2f}', 'Train')


# Fit and evaluate the tangent-space regression once per map set
# (1 = combined, 2 = major_more) with identical targets and ages.
tan_regression(group_netmats_1_train,group_netmats_1_test, values_train, values_test, age_train, age_test, metric=metric)
tan_regression(group_netmats_2_train,group_netmats_2_test, values_train, values_test, age_train, age_test, metric=metric)