In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import warnings
import seaborn as sns
import networkx as nx
import matplotlib.colors as mcolors
from scipy import stats
from scipy.stats import f_oneway, kruskal
from statsmodels.stats.multicomp import MultiComparison
from scikit_posthocs import posthoc_dunn
# pd.options.display.max_rows = None
# pd.options.display.max_columns = None
import glob as glob
import math
pd.set_option('display.max_rows', None)  # To display all rows
pd.set_option('display.max_columns', None)  # To display all columns
from scipy.stats import shapiro

In [2]:
#load in master dataset csv 
gm_3q = pd.read_csv('graph_metrics_adni_a4_bootstrapped_3quant.csv')
print(gm_3q['Centiloid Quantile'].unique())

[0 1 2]


In [3]:
adni_gm = gm_3q[gm_3q['Dataset']=='ADNI']
a4_gm = gm_3q[gm_3q['Dataset']=='A4']
print(adni_gm['Dataset'].nunique()) #looks good
print(a4_gm['Dataset'].nunique())

1
1


In [4]:
print(adni_gm.head())

  Dataset  Centiloid Quantile  Weighted Clustering Coefficient  \
0    ADNI                   0                         0.068934   
1    ADNI                   0                         0.083235   
2    ADNI                   0                         0.058079   
3    ADNI                   0                         0.070969   
4    ADNI                   0                         0.067504   

   Weighted Avg. Shortest Path Length  Weighted Small World  
0                            1.842079              0.325390  
1                            2.203289              0.329465  
2                            2.307050              0.207186  
3                            2.092218              0.308219  
4                            2.123366              0.281602  


Do One Way ANOVA test between the quartile groups for weighted clustering coefficent and shortest path leng, do KS for small world 

In [6]:
from scipy.stats import f_oneway, kruskal
from statsmodels.stats.multicomp import MultiComparison
# from scikit_posthocs import posthoc_dunn
import numpy as np
import pandas as pd

def perform_anova_kruskal_dunn(df1, df2, metrics, group_column):
    anova_results = []
    tukey_results = []
    dunn_results = []
    
    for df, name in [(df1, 'ADNI'), (df2, 'A4')]:
        for metric in metrics:
            group_data = [df[df[group_column] == group][metric] for group in df[group_column].unique()]
            num_comparisons = len(group_data)
            
            # ANOVA
            if metric != 'Weighted Small World':
                result = f_oneway(*group_data)
                p_adjusted_anova = result.pvalue * num_comparisons  # Applying Bonferroni Correction
                significant_anova = 'Yes' if p_adjusted_anova < 0.05 else 'No'
                
                mc = MultiComparison(df[metric], df[group_column])
                tukey_result = mc.tukeyhsd()
                tukey_summary = tukey_result.summary()
                
                anova_results.append({
                    'Dataset': name,
                    'Metric': metric,
                    'Test': 'ANOVA',
                    'Statistic': result.statistic,
                    'p-value': result.pvalue,
                    'p-value (adjusted)': p_adjusted_anova,  # Including adjusted p-value
                    'Significant': significant_anova,
                    'Tukey_HSD': tukey_summary
                })
                
            # Kruskal-Wallis
            else:
                result = kruskal(*group_data)
                p_adjusted_kw = result.pvalue * num_comparisons  # Applying Bonferroni Correction
                significant_kw = 'Yes' if p_adjusted_kw < 0.05 else 'No'
                
                dunn_result = posthoc_dunn(df, val_col=metric, group_col=group_column)
                
                dunn_results.append({
                    'Dataset': name,
                    'Metric': metric,
                    'Test': 'Kruskal-Wallis',
                    'Statistic': result.statistic,
                    'p-value': result.pvalue,
                    'p-value (adjusted)': p_adjusted_kw,  # Including adjusted p-value
                    'Significant': significant_kw,
                    'Dunn_HSD': dunn_result
                })
                
    return pd.DataFrame(anova_results), pd.DataFrame(dunn_results)


def extract_tukey_results(tukey_summary):
    """
    Extracts relevant information from Tukey's HSD test summary.
    Returns a dictionary containing mean differences, confidence intervals,
    and p-values for each pairwise comparison.
    """
    tukey_results = {}
    for line in tukey_summary.data:
        groups, meandiff, _, p_adj = line[:4]
        tukey_results[groups] = {'Mean Difference': meandiff,
                                 'Confidence Interval': line[4],
                                 'P-value (adjusted)': p_adj}
    return tukey_results

def modify_results_df_with_tukey_and_dunn(results):
    results['Tukey_Mean_Difference'] = None
    results['Tukey_Confidence_Interval'] = None
    results['Tukey_P-value (adjusted)'] = None
    results['Dunn_p-value (adjusted)'] = None
    
    for i, row in results.iterrows():
        if row['Test'] == 'ANOVA':
            tukey_summary = row['Tukey_HSD']
            if tukey_summary:
                tukey_results = extract_tukey_results(tukey_summary)
                for groups, data in tukey_results.items():
                    results.at[i, 'Tukey_Mean_Difference'] = data['Mean Difference']
                    results.at[i, 'Tukey_Confidence_Interval'] = data['Confidence Interval']
                    results.at[i, 'Tukey_P-value (adjusted)'] = data['P-value (adjusted)']
                    
        elif row['Test'] == 'Kruskal-Wallis':
            dunn_result = row['Dunn_HSD']
            if dunn_result is not None:
                dunn_p_adjusted = dunn_result * len(results)  # Bonferroni correction
                results.at[i, 'Dunn_p-value (adjusted)'] = dunn_p_adjusted
    
    # Drop the 'Tukey_HSD' and 'Dunn_HSD' columns as they're no longer needed
    results.drop(columns=['Tukey_HSD', 'Dunn_HSD'], inplace=True)
    
    return results


# Test
metrics = ['Weighted Clustering Coefficient', 'Weighted Avg. Shortest Path Length', 'Weighted Small World']
anova_results, dunn_results = perform_anova_kruskal_dunn(adni_gm, a4_gm, metrics, 'Centiloid Quantile')
final_results = modify_results_df_with_tukey_and_dunn(pd.concat([anova_results, dunn_results]))
print(final_results)


  Dataset                              Metric            Test    Statistic  \
0    ADNI     Weighted Clustering Coefficient           ANOVA   567.441841   
1    ADNI  Weighted Avg. Shortest Path Length           ANOVA   598.080717   
2      A4     Weighted Clustering Coefficient           ANOVA   570.450427   
3      A4  Weighted Avg. Shortest Path Length           ANOVA  2085.018052   
0    ADNI                Weighted Small World  Kruskal-Wallis  1528.089792   
1      A4                Weighted Small World  Kruskal-Wallis  1954.486242   

         p-value  p-value (adjusted) Significant Tukey_Mean_Difference  \
0  1.040341e-209       3.121024e-209         Yes                     2   
1  2.732263e-219       8.196789e-219         Yes                     2   
2  1.175259e-210       3.525778e-210         Yes                     2   
3   0.000000e+00        0.000000e+00         Yes                     2   
0   0.000000e+00        0.000000e+00         Yes                     2   
1   0.000

In [7]:
print(anova_results)

  Dataset                              Metric   Test    Statistic  \
0    ADNI     Weighted Clustering Coefficient  ANOVA   567.441841   
1    ADNI  Weighted Avg. Shortest Path Length  ANOVA   598.080717   
2      A4     Weighted Clustering Coefficient  ANOVA   570.450427   
3      A4  Weighted Avg. Shortest Path Length  ANOVA  2085.018052   

         p-value  p-value (adjusted) Significant  \
0  1.040341e-209       3.121024e-209         Yes   
1  2.732263e-219       8.196789e-219         Yes   
2  1.175259e-210       3.525778e-210         Yes   
3   0.000000e+00        0.000000e+00         Yes   

                                           Tukey_HSD  
0  [[group1, group2, meandiff, p-adj, lower, uppe...  
1  [[group1, group2, meandiff, p-adj, lower, uppe...  
2  [[group1, group2, meandiff, p-adj, lower, uppe...  
3  [[group1, group2, meandiff, p-adj, lower, uppe...  


split into adni and a4 in case

In [8]:
def split_results_by_dataset(df):
    adni_results = df[df['Dataset'] == 'ADNI']
    a4_results = df[df['Dataset'] == 'A4']
    return adni_results, a4_results

adni_anova_results, a4_anova_results = split_results_by_dataset(anova_results)
adni_tukey_results, a4_tukey_results = split_results_by_dataset(tukey_results)


NameError: name 'tukey_results' is not defined

In [None]:
print(adni_anova_results)

  Dataset                              Metric            Test    Statistic  \
0    ADNI     Weighted Clustering Coefficient           ANOVA   567.441841   
1    ADNI  Weighted Avg. Shortest Path Length           ANOVA   598.080717   
2    ADNI                Weighted Small World  Kruskal-Wallis  1528.089792   

         p-value  p-value (adjusted) Significant  
0  1.040341e-209       3.121024e-209         Yes  
1  2.732263e-219       8.196789e-219         Yes  
2   0.000000e+00        0.000000e+00         Yes  


In [None]:
print(a4_anova_results)

  Dataset                              Metric            Test    Statistic  \
3      A4     Weighted Clustering Coefficient           ANOVA   570.450427   
4      A4  Weighted Avg. Shortest Path Length           ANOVA  2085.018052   
5      A4                Weighted Small World  Kruskal-Wallis  1954.486242   

         p-value  p-value (adjusted) Significant  
3  1.175259e-210       3.525778e-210         Yes  
4   0.000000e+00        0.000000e+00         Yes  
5   0.000000e+00        0.000000e+00         Yes  


In [None]:
print(adni_tukey_results)

  Dataset                              Metric Tukey_Mean_Difference  \
0    ADNI     Weighted Clustering Coefficient                     2   
1    ADNI  Weighted Avg. Shortest Path Length                     2   
2    ADNI                Weighted Small World                  None   

  Tukey_Confidence_Interval Tukey_P-value (adjusted)  
0                    0.0098                      0.0  
1                    -0.349                      0.0  
2                      None                     None  


In [None]:
print(a4_tukey_results)

  Dataset                              Metric Tukey_Mean_Difference  \
3      A4     Weighted Clustering Coefficient                     2   
4      A4  Weighted Avg. Shortest Path Length                     2   
5      A4                Weighted Small World                  None   

  Tukey_Confidence_Interval Tukey_P-value (adjusted)  
3                    0.0053                      0.0  
4                   -0.8566                      0.0  
5                      None                     None  


Interpretation of A4 Tukey: 