In [2]:
import pandas as pd
import numpy as np
from scipy import stats
from statsmodels.stats.multitest import fdrcorrection
import logging
import os
from typing import Dict, List, Tuple

# Global configurations
# NOTE: this is an absolute, machine-specific path; os.chdir() below raises
# if the directory does not exist on the current machine.
code_directory = '/Users/tereza/nishant/atlas/atlas_work_terez/atlas_harmonization/code'
# Change the process working directory so the relative paths below resolve
# against `code_directory`.
os.chdir(code_directory)

# Directories relative to the working directory set above.
# RESULTS_DIR is where the region feature CSVs are read from and where the
# analysis output is written; DATA_DIR and FIGURES_DIR are not referenced in
# the visible part of this file.
RESULTS_DIR = '../results'
DATA_DIR = '../Data'
FIGURES_DIR = '../figures'

class RegionalAnalysis:
    """Region-by-region comparison of spectral/entropy features (HUP vs MNI).

    Intended call order: ``load_data()`` -> ``analyze_regions()`` ->
    ``summarize_results()``. ``load_data()`` populates ``hup_features``,
    ``mni_features`` and ``common_regions``, which the other methods read.
    """

    def __init__(self):
        self.logger = logging.getLogger(__name__)
        # Feature columns tested for every region.
        self.feature_columns = [
            'deltaRel_mean',
            'thetaRel_mean',
            'alphaRel_mean',
            'betaRel_mean',
            'gammaRel_mean',
            'entropy_1min_mean',
            'entropy_fullts_mean'
        ]

        # Human-readable labels used when printing the summary.
        self.feature_names = {
            'deltaRel_mean': 'Delta Band (0.5-4 Hz)',
            'thetaRel_mean': 'Theta Band (4-8 Hz)',
            'alphaRel_mean': 'Alpha Band (8-13 Hz)',
            'betaRel_mean': 'Beta Band (13-30 Hz)',
            'gammaRel_mean': 'Gamma Band (30-80 Hz)',
            'entropy_1min_mean': 'Signal Entropy (1-min)',
            'entropy_fullts_mean': 'Signal Entropy (full)'
        }

    def load_data(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Load the regional feature tables for both cohorts.

        Reads the two CSV files from ``RESULTS_DIR``, stores them on the
        instance and records the set of ``roi`` values present in both.

        Returns:
            ``(hup_features, mni_features)`` as loaded.

        Raises:
            FileNotFoundError: if either CSV is missing (logged, with the
                working directory printed to help diagnose path problems).
            Exception: any other loading error is logged and re-raised.
        """
        try:
            # Load data with correct paths
            hup_path = os.path.join(RESULTS_DIR, 'ge_go_hup_region_features.csv')
            mni_path = os.path.join(RESULTS_DIR, 'mni_region_features.csv')

            print(f"Loading HUP features from: {hup_path}")
            print(f"Loading MNI features from: {mni_path}")

            self.hup_features = pd.read_csv(hup_path)
            self.mni_features = pd.read_csv(mni_path)

            # Only regions present in both tables can be compared.
            self.common_regions = (
                set(self.hup_features['roi'].unique())
                & set(self.mni_features['roi'].unique())
            )

            print("\nLoaded data:")
            print(f"HUP features shape: {self.hup_features.shape}")
            print(f"MNI features shape: {self.mni_features.shape}")
            print(f"Number of common regions: {len(self.common_regions)}")

            return self.hup_features, self.mni_features

        except FileNotFoundError as e:
            self.logger.error("File not found error: %s", e)
            print(f"\nCurrent working directory: {os.getcwd()}")
            print(f"RESULTS_DIR path: {os.path.abspath(RESULTS_DIR)}")
            raise
        except Exception as e:
            self.logger.error("Error loading data: %s", e)
            raise

    def prepare_paired_data(self, region: str, feature: str) -> Tuple[np.ndarray, np.ndarray]:
        """
        Build two equal-length arrays of per-patient means for one region/feature.

        Rows for ``region`` are averaged per ``patient_id`` (several rows may
        exist per patient), patients whose mean is NaN are dropped, and the
        longer array is truncated to the length of the shorter one.

        NOTE(review): the i-th value of each array belongs to whichever
        patient comes i-th in sorted ``patient_id`` order within its own
        table; nothing here matches subjects across the two tables. If the
        tables hold different people, a signed-rank test on these "pairs" is
        not meaningful and an independent-samples test (e.g. Mann-Whitney U)
        would be appropriate -- confirm against the study design.

        Args:
            region: value of the ``roi`` column to select.
            feature: name of the feature column to average.

        Returns:
            ``(hup_data, mni_data)``, both of the same length (>= 5).

        Raises:
            ValueError: if either cohort has fewer than 5 usable patients.
        """
        # Get data for specific region
        hup_region_data = self.hup_features[self.hup_features['roi'] == region]
        mni_region_data = self.mni_features[self.mni_features['roi'] == region]

        # One value per patient. A patient whose rows are all NaN yields a NaN
        # mean; drop it, otherwise it can turn the test's p-value into NaN and
        # then invalidate the FDR correction over the whole result table.
        hup_data = hup_region_data.groupby('patient_id')[feature].mean().dropna().to_numpy()
        mni_data = mni_region_data.groupby('patient_id')[feature].mean().dropna().to_numpy()

        # Get minimum length to ensure equal-length arrays
        min_length = min(len(hup_data), len(mni_data))

        if min_length < 5:  # Minimum sample size requirement
            raise ValueError(
                f"Insufficient samples for region {region} "
                f"(HUP: {len(hup_data)}, MNI: {len(mni_data)})"
            )

        return hup_data[:min_length], mni_data[:min_length]

    def compute_effect_size(self, hup_data: np.ndarray, mni_data: np.ndarray) -> float:
        """
        Compute Cohen's d for paired data: mean(diff) / sample_std(diff).

        The sample standard deviation (``ddof=1``) is used. The result is NaN
        when the effect size is undefined: fewer than two pairs, or all
        differences identical (zero spread).
        """
        diff = np.asarray(hup_data, dtype=float) - np.asarray(mni_data, dtype=float)

        # Guard the cases where the denominator is undefined or exactly zero
        # instead of dividing and emitting inf/NaN with a RuntimeWarning.
        # max == min detects constant differences without rounding error.
        if diff.size < 2 or diff.max() == diff.min():
            return float('nan')

        return float(np.mean(diff) / np.std(diff, ddof=1))

    def analyze_regions(self) -> pd.DataFrame:
        """
        Run a Wilcoxon signed-rank test for every common region and feature.

        Region/feature combinations that cannot be tested are logged and
        skipped. Benjamini-Hochberg FDR correction is applied across all
        finite p-values; rows with a non-finite p-value get NaN in
        ``pvalue_fdr``.

        Returns:
            One row per tested region/feature with the test statistic, raw
            and FDR-adjusted p-values, effect size and descriptive statistics.
            Empty if nothing could be tested.
        """
        results = []

        # Sets have no stable order; sort so output rows are reproducible.
        # key=str keeps sorting safe even if a non-string label slipped in.
        for region in sorted(self.common_regions, key=str):
            self.logger.info("Analyzing region: %s", region)

            for feature in self.feature_columns:
                try:
                    # Get equal-length data
                    hup_data, mni_data = self.prepare_paired_data(region, feature)

                    # Perform Wilcoxon signed-rank test (paired test)
                    statistic, pvalue = stats.wilcoxon(hup_data, mni_data)

                    # Compute effect size
                    effect_size = self.compute_effect_size(hup_data, mni_data)

                    results.append({
                        'region': region,
                        'feature': feature,
                        'statistic': statistic,
                        'pvalue': pvalue,
                        'effect_size': effect_size,
                        'hup_mean': np.mean(hup_data),
                        'mni_mean': np.mean(mni_data),
                        'hup_std': np.std(hup_data),
                        'mni_std': np.std(mni_data),
                        'n_samples': len(hup_data)
                    })

                except ValueError as e:
                    # Expected: too few samples, or the test rejecting the input.
                    self.logger.warning("Skipping %s-%s: %s", region, feature, e)
                except Exception as e:
                    # Best-effort: one bad combination must not stop the run.
                    self.logger.error("Error processing %s-%s: %s", region, feature, e)

        # Create results DataFrame
        results_df = pd.DataFrame(results)

        if len(results_df) > 0:
            # Apply FDR correction to finite p-values only: a single NaN would
            # otherwise propagate into every adjusted p-value.
            pvalues = results_df['pvalue'].to_numpy(dtype=float)
            finite = np.isfinite(pvalues)
            adjusted = np.full(pvalues.shape, np.nan)
            if finite.any():
                adjusted[finite] = fdrcorrection(pvalues[finite])[1]
            if not finite.all():
                self.logger.warning(
                    "%d tests produced a non-finite p-value and were excluded "
                    "from FDR correction",
                    int((~finite).sum()),
                )
            results_df['pvalue_fdr'] = adjusted
        else:
            self.logger.warning("No results generated from analysis")

        return results_df

    def summarize_results(self, results_df: pd.DataFrame):
        """
        Print, per feature, how many regions are significant after FDR
        correction (``pvalue_fdr < 0.05``) and the three with the largest
        absolute effect size.

        Args:
            results_df: table as returned by ``analyze_regions()``.
        """
        if len(results_df) == 0:
            print("No results to summarize")
            return

        print("\nRegional Analysis Summary")
        print("=" * 50)

        for feature in self.feature_columns:
            feature_results = results_df[results_df['feature'] == feature]
            # NaN adjusted p-values compare False and are never significant.
            sig_results = feature_results[feature_results['pvalue_fdr'] < 0.05]

            print(f"\n{self.feature_names[feature]}:")
            print(f"- {len(sig_results)} regions show significant differences")

            if len(sig_results) > 0:
                # Rank by magnitude of the effect, regardless of direction.
                sig_results = sig_results.sort_values('effect_size', key=abs, ascending=False)
                print("\nTop regions with largest differences:")

                for _, row in sig_results.head(3).iterrows():
                    # Positive d means the HUP values exceed the MNI values.
                    direction = "higher in HUP" if row['effect_size'] > 0 else "higher in MNI"
                    effect_mag = "large" if abs(row['effect_size']) > 0.8 else \
                                "medium" if abs(row['effect_size']) > 0.5 else "small"

                    print(f"  * {row['region']}: {effect_mag} effect {direction}")
                    print(f"    (p={row['pvalue_fdr']:.3e}, d={row['effect_size']:.2f}, n={row['n_samples']})")

def main():
    """Run the full pipeline: load, analyze, summarize, then write the CSV.

    Any failure is reported on stdout and re-raised to the caller.
    """
    # INFO level so the per-region progress messages are shown.
    logging.basicConfig(level=logging.INFO)

    try:
        runner = RegionalAnalysis()
        runner.load_data()

        # Statistical comparison followed by the printed report.
        table = runner.analyze_regions()
        runner.summarize_results(table)

        # Persist the per-region results next to the input feature files.
        destination = os.path.join(RESULTS_DIR, 'regional_analysis_results.csv')
        table.to_csv(destination, index=False)
        print(f"\nResults saved to: {destination}")

    except Exception as err:
        print(f"Error in main execution: {str(err)}")
        raise

# Run the pipeline only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()

INFO:__main__:Analyzing region: ctx-rh-precuneus
INFO:__main__:Analyzing region: ctx-rh-rostralmiddlefrontal
INFO:__main__:Analyzing region: ctx-rh-superiortemporal
INFO:__main__:Analyzing region: ctx-lh-rostralanteriorcingulate
INFO:__main__:Analyzing region: ctx-lh-parstriangularis
INFO:__main__:Analyzing region: ctx-lh-rostralmiddlefrontal
INFO:__main__:Analyzing region: ctx-lh-lateraloccipital
INFO:__main__:Analyzing region: Right-Amygdala
INFO:__main__:Analyzing region: Right-Putamen
INFO:__main__:Analyzing region: ctx-lh-precentral
INFO:__main__:Analyzing region: ctx-rh-bankssts
INFO:__main__:Analyzing region: ctx-rh-precentral


Loading HUP features from: ../results/ge_go_hup_region_features.csv
Loading MNI features from: ../results/mni_region_features.csv

Loaded data:
HUP features shape: (398, 10)
MNI features shape: (639, 10)
Number of common regions: 57


INFO:__main__:Analyzing region: ctx-rh-rostralanteriorcingulate
INFO:__main__:Analyzing region: ctx-rh-caudalmiddlefrontal
INFO:__main__:Analyzing region: ctx-rh-medialorbitofrontal
INFO:__main__:Analyzing region: ctx-rh-lateraloccipital
INFO:__main__:Analyzing region: Left-Hippocampus
INFO:__main__:Analyzing region: ctx-rh-lateralorbitofrontal
INFO:__main__:Analyzing region: ctx-rh-lingual
INFO:__main__:Analyzing region: ctx-lh-postcentral
INFO:__main__:Analyzing region: ctx-lh-superiorfrontal
INFO:__main__:Analyzing region: ctx-lh-lingual
INFO:__main__:Analyzing region: Left-Putamen
INFO:__main__:Analyzing region: ctx-lh-pericalcarine
INFO:__main__:Analyzing region: ctx-lh-fusiform
INFO:__main__:Analyzing region: ctx-lh-parsorbitalis
INFO:__main__:Analyzing region: ctx-lh-posteriorcingulate
INFO:__main__:Analyzing region: ctx-lh-cuneus
INFO:__main__:Analyzing region: ctx-lh-inferiortemporal
INFO:__main__:Analyzing region: ctx-lh-caudalmiddlefrontal
INFO:__main__:Analyzing region: ctx


Regional Analysis Summary

Delta Band (0.5-4 Hz):
- 0 regions show significant differences

Theta Band (4-8 Hz):
- 0 regions show significant differences

Alpha Band (8-13 Hz):
- 0 regions show significant differences

Beta Band (13-30 Hz):
- 0 regions show significant differences

Gamma Band (30-80 Hz):
- 0 regions show significant differences

Signal Entropy (1-min):
- 0 regions show significant differences

Signal Entropy (full):
- 0 regions show significant differences

Results saved to: ../results/regional_analysis_results.csv
