In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from typing import List, Tuple

import torch
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_curve, auc, confusion_matrix

SETUP

In [7]:
code_directory = '/Users/tereza/nishant/atlas/atlas_work_terez/atlas_harmonization/code'
os.chdir(code_directory)

# set the paths relative to this new current directory
base_path_data = '../Data'
base_path_results = '../results'

print("Current Directory:", os.getcwd())

print('Files available at results path:')
files = os.listdir(base_path_results)
# for file in files:
    # print(file)

Current Directory: /Users/tereza/nishant/atlas/atlas_work_terez/atlas_harmonization/code
Files available at results path:


In [3]:
base_path_results = '../results'
figures_path = '../figures'
results_path = '../results'

In [27]:
hup_region_features = pd.read_csv(os.path.join(results_path, 'ge_go_hup_region_features.csv'), index_col=0)
mni_region_features = pd.read_csv(os.path.join(results_path, 'mni_region_features.csv'), index_col=0)

In [37]:
# print(hup_region_features.head())
# print(mni_region_features.head())

hup_region_features.reset_index(inplace=True)
mni_region_features.reset_index(inplace=True)
print(hup_region_features['patient_id'].nunique())
print(mni_region_features['patient_id'].nunique())

28
106


In [17]:
# def prepare_classification_data(hup_features: pd.DataFrame, 
#                               mni_features: pd.DataFrame,
#                               significant_features: List[str] = None) -> Tuple[np.ndarray, np.ndarray]:
#     """
#     Prepare data for site classification.
    
#     Args:
#         hup_features: Region-level features for HUP patients
#         mni_features: Region-level features for MNI patients
#         significant_features: List of features to use (from statistical analysis)
#     """
#     # Group by patient and compute mean across regions
#     hup_patient_features = hup_features.groupby('patient_id')[significant_features].mean()
#     mni_patient_features = mni_features.groupby('patient_id')[significant_features].mean()
    
#     # Create labels
#     hup_labels = np.ones(len(hup_patient_features))
#     mni_labels = np.zeros(len(mni_patient_features))
    
#     # Combine features and labels
#     X = pd.concat([hup_patient_features, mni_patient_features])
#     y = np.concatenate([hup_labels, mni_labels])
    
#     # Scale features
#     scaler = StandardScaler()
#     X_scaled = scaler.fit_transform(X)
    
#     return X_scaled, y

# def perform_pca(X: np.ndarray, variance_threshold: float = 0.95) -> np.ndarray:
#     """
#     Perform PCA while retaining specified variance
#     """
#     pca = PCA()
#     X_pca = pca.fit_transform(X)
    
#     # Calculate cumulative variance ratio
#     cum_var_ratio = np.cumsum(pca.explained_variance_ratio_)
#     n_components = np.argmax(cum_var_ratio >= variance_threshold) + 1
    
#     return X_pca[:, :n_components]

In [35]:
hup_region_features['site'] = 'HUP'
mni_region_features['site'] = 'MNI'

# Combine the datasets
region_features = pd.concat([hup_region_features, mni_region_features], ignore_index=True)

num_unique_patients = region_features['patient_id'].nunique() # doesn't work without reset_index
print(f"Number of unique patients: {num_unique_patients}")

# feature columns to aggregate
feature_columns = ['deltaRel_mean', 'thetaRel_mean', 'alphaRel_mean', 'betaRel_mean', 'gammaRel_mean', 'entropy_1min_mean', 'entropy_fullts_mean']

# aggregate features per patient
patient_features = region_features.groupby(['patient_id', 'site'])[feature_columns].mean().reset_index() # doesn't work without reset_index

Number of unique patients: 106


In [39]:
def diagnose_region_features(region_features: pd.DataFrame):
    print("Dataset Overview:")
    print(f"Total rows: {len(region_features)}")
    print(f"Total columns: {region_features.columns.tolist()}")
    
    print("\nMissing Values:")
    print(region_features.isnull().sum())
    
    print("\nPatients per site:")
    print(region_features.groupby('site')['patient_id'].nunique())
    
    print("\nRegions per patient:")
    patient_region_counts = region_features.groupby(['site', 'patient_id']).size()
    print("\nSummary statistics of regions per patient:")
    print(patient_region_counts.groupby('site').describe())
    
    print("\nSample of data structure:")
    print(region_features.groupby(['site', 'patient_id']).head(2))

# diagnose_region_features(region_features)

# Verify feature matrix after aggregation
# print("\nFeature matrix verification:")
# print(patient_features.groupby('site').size())
# print("\nFeature value ranges:")
# print(patient_features[feature_columns].describe())

In [38]:
print("\nPatient-level feature matrix shape:")
print(patient_features.shape)
print("\nSample of aggregated features per patient:")
print(patient_features.head())
print("\nVerify unique patients:")
print(patient_features.groupby('site')['patient_id'].nunique())


Patient-level feature matrix shape:
(134, 9)

Sample of aggregated features per patient:
   patient_id site  deltaRel_mean  thetaRel_mean  alphaRel_mean  betaRel_mean  \
0           1  MNI       0.169199       0.150309       0.147994      0.175702   
1           2  MNI       0.187945       0.207479       0.143997      0.133608   
2           3  HUP       0.129081       0.122709       0.236676      0.179459   
3           3  MNI       0.177230       0.179795       0.166074      0.151208   
4           4  HUP       0.170417       0.182097       0.158259      0.159255   

   gammaRel_mean  entropy_1min_mean  entropy_fullts_mean  
0       0.124942           0.663203             0.792426  
1       0.087666           0.667673             0.797214  
2       0.057972           0.676374             0.806519  
3       0.102935           0.663542             0.792790  
4       0.114277           0.665870             0.795281  

Verify unique patients:
site
HUP     28
MNI    106
Name: patient_id,

LOGISTIC REGRESSION

In [41]:
# Feature matrix X
X = patient_features[feature_columns]

# Label vector y
y = patient_features['site'].map({'HUP': 1, 'MNI': 0})

# aggregate then standardize
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)