# Check for potential confounding within metadata variables

In [766]:
# Import Python packages
import pandas as pd
import numpy as np
from scipy.stats import mannwhitneyu
from scipy.stats.contingency import chi2_contingency
import statsmodels.formula.api as smf
from skbio import DistanceMatrix
from skbio.stats.distance import permanova, permdisp


### Prepare data for confounding assessment

In [767]:
# Load metadata
metadata_path = metadata_path = '../Metadata/16S_AD_South-Africa_metadata_subset.tsv'
metadata = pd.read_csv(metadata_path, sep='\t', index_col=0)
print("\nMetadata shape:", metadata.shape)
print("Metadata columns:", metadata.columns.tolist())


Metadata shape: (462, 29)
Metadata columns: ['PlateNumber', 'PlateLocation', 'i5', 'i5Sequence', 'i7', 'i7Sequence', 'identifier', 'Sequence', 'Plate ID', 'Well location', 'Volume (ul)', 'Lysozyme pretreatment', 'DNA extraction method', 'Purification method', 'Date of DNA extraction', 'pid', 'case_type', 'participant', 'area', 'sample_type', 'specimen', 'age_months', 'sex', 'enrolment_date', 'enrolment_season', 'hiv_exposure', 'hiv_status', 'household_size', 'o_scorad']


In [768]:
# Load Faith's PD
faith_pd = pd.read_csv('../Data/Alpha_Diversity/faith_pd_results.tsv', sep='\t', index_col=0)
print("Faith PD shape:", faith_pd.shape)
print("Faith PD columns:", faith_pd.columns.tolist())
print(faith_pd.head())

# Load Shannon
shannon = pd.read_csv('../Data/Alpha_Diversity/shannon_results.tsv', sep='\t', index_col=0)
print("\nShannon shape:", shannon.shape)
print("Shannon columns:", shannon.columns.tolist())
print(shannon.head())

Faith PD shape: (151, 1)
Faith PD columns: ['Faith_PD']
        Faith_PD
900221  3.134932
900570  8.753575
900091  3.333254
900245  7.948009
900581  8.964010

Shannon shape: (151, 1)
Shannon columns: ['Shannon']
         Shannon
900221  1.327170
900570  4.581213
900091  3.732909
900245  5.102464
900581  4.391689


In [769]:
# Convert to series
faith_pd_series = faith_pd.iloc[:, 0]
shannon_series = shannon.iloc[:, 0]

# Merge together to a df
analysis_df = metadata.copy()
analysis_df['faith_pd'] = faith_pd_series
analysis_df['shannon'] = shannon_series

print("MERGED DATASET")
print("="*60)
print(f"Total samples: {analysis_df.shape[0]}")
print(f"\nColumns available: {analysis_df.columns.tolist()}")
print(f"\nMissing data:")
print(analysis_df[['faith_pd', 'shannon', 'age_months', 'sex', 'area', 'enrolment_season']].isnull().sum())

MERGED DATASET
Total samples: 462

Columns available: ['PlateNumber', 'PlateLocation', 'i5', 'i5Sequence', 'i7', 'i7Sequence', 'identifier', 'Sequence', 'Plate ID', 'Well location', 'Volume (ul)', 'Lysozyme pretreatment', 'DNA extraction method', 'Purification method', 'Date of DNA extraction', 'pid', 'case_type', 'participant', 'area', 'sample_type', 'specimen', 'age_months', 'sex', 'enrolment_date', 'enrolment_season', 'hiv_exposure', 'hiv_status', 'household_size', 'o_scorad', 'faith_pd', 'shannon']

Missing data:
faith_pd            311
shannon             311
age_months            5
sex                  11
area                  0
enrolment_season      5
dtype: int64


### Confounding assessment for alpha diversity analysis

For a testing variable to be a confounder, it must satisfy both fo the following:

1) Confounder must be associated with exposure (area)

2) Confounder must be associated with outcome (alpha/beta diversity)

In [770]:
# Use the full analysis_df instead of healthy_df
full_df = analysis_df.dropna(subset=['faith_pd', 'shannon', 'age_months', 'sex', 'area', 'enrolment_season'])
full_df = full_df.drop_duplicates(subset='pid', keep='first')
full_df

Unnamed: 0_level_0,PlateNumber,PlateLocation,i5,i5Sequence,i7,i7Sequence,identifier,Sequence,Plate ID,Well location,...,age_months,sex,enrolment_date,enrolment_season,hiv_exposure,hiv_status,household_size,o_scorad,faith_pd,shannon
#sample-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
900221,1,B1,SA502,ACTATCTG,SA701,CGAGAGTT,SA701SA502,CGAGAGTT-ACTATCTG,1.010000e+21,B1,...,9.0,female,8/11/2015,Winter,Unexposed,negative,7.0,34,3.134932,1.327170
900460,1,D1,SA504,CTGCGTGT,SA701,CGAGAGTT,SA701SA504,CGAGAGTT-CTGCGTGT,1.010000e+21,D1,...,18.0,female,9/23/2015,Spring,Unexposed,,4.0,40,4.952225,4.680957
900226,1,F1,SA506,CGTGAGTG,SA701,CGAGAGTT,SA701SA506,CGAGAGTT-CGTGAGTG,1.010000e+21,F1,...,18.0,male,8/13/2015,Winter,Unexposed,negative,6.0,34,5.501234,1.877964
900241,1,B4,SA502,ACTATCTG,SA704,ACTCACTG,SA704SA502,ACTCACTG-ACTATCTG,1.010000e+21,B4,...,21.0,female,8/17/2015,Winter,,,7.0,,8.904218,4.355834
900243,1,D4,SA504,CTGCGTGT,SA704,ACTCACTG,SA704SA504,ACTCACTG-CTGCGTGT,1.010000e+21,D4,...,17.0,male,8/13/2015,Winter,,,10.0,,10.280636,3.789278
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
900508,6,A10,SA501,ATCGTACG,SB710,TGCTCGTA,SB710SA501,TGCTCGTA-ATCGTACG,1.010000e+21,A10,...,13.0,female,10/1/2015,Spring,Unexposed,,4.0,21,6.361441,3.542446
900394,6,F12,SA506,CGTGAGTG,SB712,CGTAGCGA,SB712SA506,CGTAGCGA-CGTGAGTG,1.010000e+21,F12,...,21.0,male,9/17/2015,Spring,,,7.0,28,12.560509,5.422440
900392,4,D12,SA504,CTGCGTGT,SA712,TATAGCGA,SA712SA504,TATAGCGA-CTGCGTGT,1.010000e+21,D12,...,16.0,female,9/14/2015,Spring,,,9.0,,1.822421,1.028145
900397,5,A11,SB501,CTACTATA,SB711,AACGCTGA,SB711SB501,AACGCTGA-CTACTATA,1.010000e+21,A11,...,25.0,female,42264,Spring,Exposed,negative,5.0,54,4.916750,3.531621


In [771]:
print("\n--- First check: Do confounders differ between areas? ---")

# Age difference
cape_age = full_df.loc[full_df['area'] == 'Cape Town', 'age_months']
umtata_age = full_df.loc[full_df['area'] == 'Umtata', 'age_months']
u_age, p_age = mannwhitneyu(cape_age, umtata_age, alternative='two-sided')

print(f"\n***Age (months)***")
print(f"Cape Town:  median={cape_age.median():.1f}, mean={cape_age.mean():.1f}, n={len(cape_age)}")
print(f"Umtata:     median={umtata_age.median():.1f}, mean={umtata_age.mean():.1f}, n={len(umtata_age)}")
print(f"  Mann-Whitney U test: U={u_age:.1f}, p={p_age:.3f}")
if p_age < 0.05:
    print(f" ❗Age differs significantly between areas")
else:
    print("  Age does NOT differ significantly between areas")    

# Sex difference
sex_table = pd.crosstab(full_df['area'], full_df['sex'])
chi2_sex, p_sex, _, _ = chi2_contingency(sex_table)

print(f"\n***Sex distribution***")
print(sex_table)
print(f"  Chi-square test: χ²={chi2_sex:.2f}, p={p_sex:.3f}")
if p_sex < 0.05:
    print(f" ❗Sex distribution differs significantly between areas")
else:
    print("  Sex distribution does NOT differ significantly between areas")    




--- First check: Do confounders differ between areas? ---

***Age (months)***
Cape Town:  median=25.0, mean=23.9, n=43
Umtata:     median=21.0, mean=21.1, n=71
  Mann-Whitney U test: U=1870.0, p=0.045
 ❗Age differs significantly between areas

***Sex distribution***
sex        female  male
area                   
Cape Town      20    23
Umtata         31    40
  Chi-square test: χ²=0.01, p=0.919
  Sex distribution does NOT differ significantly between areas


In [772]:
# Continued

# Enrollment season difference
sex_table = pd.crosstab(full_df['area'], full_df['enrolment_season'])
chi2_sex, p_sex, _, _ = chi2_contingency(sex_table)

print(f"\n***Enrollment season distribution***")
print(sex_table)
print(f"  Chi-square test: χ²={chi2_sex:.2f}, p={p_sex:.3f}")
if p_sex < 0.05:
    print(f" ❗Enrollment season distribution differs significantly between areas")
else:
    print("  Enrollment season distribution does NOT differ significantly between areas")   

# Participant type difference
participant_table = pd.crosstab(full_df['area'], full_df['participant'])
chi2_part, p_part, _, _ = chi2_contingency(participant_table)

print(f"\n***Case/Control type distribution***")
print(participant_table)
print(f"  Chi-square test: χ²={chi2_part:.2f}, p={p_part:.3f}")
if p_part < 0.05:
    print(f" ❗Case/Control type differs significantly between areas")
else:
    print(f'  Case/Control type does NOT differ significantly between areas')    



***Enrollment season distribution***
enrolment_season  Autumn  Autumn   Spring  Spring   Summer  Winter
area                                                              
Cape Town              9       14       1        1       4      14
Umtata                 1        0      34        4       0      32
  Chi-square test: χ²=61.17, p=0.000
 ❗Enrollment season distribution differs significantly between areas

***Case/Control type distribution***
participant  case  control
area                      
Cape Town      30       13
Umtata         41       30
  Chi-square test: χ²=1.18, p=0.278
  Case/Control type does NOT differ significantly between areas


Based on the above, there is a statistically significant difference between age and season by area. Therefore, we must see if they are also associated with the outcome (faith pd, shannon, and rpca). If they are associated with an outcome, that would make it a confounder needing to be accounted for in the analysis or interpretation.

In [773]:
# Check if confounders associate with Faith PD
print("\n--- Do confounders associate with Faith PD? ---")

# Age → Faith PD
model_age_outcome = smf.ols("faith_pd ~ age_months", data=full_df).fit()
print(f"\nAge → Faith PD:")
print(f"  β={model_age_outcome.params['age_months']:.4f}, p={model_age_outcome.pvalues['age_months']:.3f}")
if model_age_outcome.pvalues['age_months'] < 0.05:
    print(f"Age is significantly associated with Faith PD")
else:
    print(f"Age is NOT significantly associated with Faith PD")

# Sex → Faith PD
model_sex_outcome = smf.ols("faith_pd ~ C(sex)", data=full_df).fit()
sex_param_name = [p for p in model_sex_outcome.params.index if 'sex' in p][0]
print(f"\nSex → Faith PD:")
print(f"  p={model_sex_outcome.pvalues[sex_param_name]:.3f}")
if model_sex_outcome.pvalues[sex_param_name] < 0.05:
    print(f"Sex is significantly associated with Faith PD")
else:
    print(f"Sex is NOT significantly associated with Faith PD")

# Enrollment_season → Faith PD
model_season_outcome = smf.ols("faith_pd ~ C(enrolment_season)", data=full_df).fit()
season_param_name = [p for p in model_season_outcome.params.index if 'enrolment_season' in p][0]
print(f"\nEnrollment season → Faith PD:")
print(f"  p={model_season_outcome.pvalues[season_param_name]:.3f}")
if model_season_outcome.pvalues[season_param_name] < 0.05:
    print(f"Enrolment season is significantly associated with Faith PD")
else:
    print("Enrolment season is NOT significantly associated with Faith PD")

# Participant type → Faith PD
model_part_outcome = smf.ols("faith_pd ~ C(participant)", data=full_df).fit()
part_param_name = [p for p in model_part_outcome.params.index if 'participant' in p][0]
print(f"\nParticipant type → Faith PD:")
print(f"  p={model_part_outcome.pvalues[part_param_name]:.3f}")
if model_part_outcome.pvalues[part_param_name] < 0.05:
    print(f"Participant type significantly associated with Faith PD")
else:
    print("Participant type is NOT significantly associated with Faith PD")


--- Do confounders associate with Faith PD? ---

Age → Faith PD:
  β=0.0287, p=0.305
Age is NOT significantly associated with Faith PD

Sex → Faith PD:
  p=0.059
Sex is NOT significantly associated with Faith PD

Enrollment season → Faith PD:
  p=0.222
Enrolment season is NOT significantly associated with Faith PD

Participant type → Faith PD:
  p=0.222
Participant type is NOT significantly associated with Faith PD


In [774]:
# Check if confounders associate with Shannon diversity
print("\n--- Do confounders associate with Shannon diversity? ---")

# Age → Shannon
model_age_outcome = smf.ols("shannon ~ age_months", data=full_df).fit()
print(f"\nAge → Shannon:")
print(f"  β={model_age_outcome.params['age_months']:.4f}, p={model_age_outcome.pvalues['age_months']:.3f}")
if model_age_outcome.pvalues['age_months'] < 0.05:
    print(f"Age significantly associated with Shannon")
else:
    print(f"Age is NOT significantly associated with Shannon")

# Sex → Shannon
model_sex_outcome = smf.ols("shannon ~ C(sex)", data=full_df).fit()
sex_param_name = [p for p in model_sex_outcome.params.index if 'sex' in p][0]
print(f"\nSex → Shannon:")
print(f"  p={model_sex_outcome.pvalues[sex_param_name]:.3f}")
if model_sex_outcome.pvalues[sex_param_name] < 0.05:
    print(f"Sex significantly associated with Shannon")
else:
    print(f"Sex is NOT significantly associated with Shannon")

# Enrollment_season type → Shannon
model_part_outcome = smf.ols("shannon ~ C(enrolment_season)", data=full_df).fit()
part_param_name = [p for p in model_part_outcome.params.index if 'enrolment_season' in p][0]
print(f"\nEnrollment season → Shannon:")
print(f"  p={model_part_outcome.pvalues[part_param_name]:.3f}")
if model_part_outcome.pvalues[part_param_name] < 0.05:
    print(f"Enrolment season is significantly associated with Shannon")
else:
    print("Enrolment season is NOT significantly associated with Shannon")

# Participant type → Shannon
model_part_outcome = smf.ols("shannon ~ C(participant)", data=full_df).fit()
part_param_name = [p for p in model_part_outcome.params.index if 'participant' in p][0]
print(f"\nParticipant type → Shannon:")
print(f"  p={model_part_outcome.pvalues[part_param_name]:.3f}")
if model_part_outcome.pvalues[part_param_name] < 0.05:
    print(f"Participant type significantly associated with Shannon")
else:
    print("Participant type is NOT significantly associated with Shannon")



--- Do confounders associate with Shannon diversity? ---

Age → Shannon:
  β=0.0009, p=0.954
Age is NOT significantly associated with Shannon

Sex → Shannon:
  p=0.843
Sex is NOT significantly associated with Shannon

Enrollment season → Shannon:
  p=0.402
Enrolment season is NOT significantly associated with Shannon

Participant type → Shannon:
  p=0.375
Participant type is NOT significantly associated with Shannon


Conclusion:  ✅ None of the tested variables (including age and season) are confounding factors with Faith PD or Shannon diversity

### Confounding assessment for beta diversity analysis

In [775]:
# Load as DataFrame
rpca_df = pd.read_csv('../Data/Beta_Diversity/rpca_distance_matrix.tsv', sep='\t', index_col=0)
rpca_df.index.name = None

# Ensure data are C-contiguous before creating DistanceMatrix
rpca_dm = DistanceMatrix(np.ascontiguousarray(rpca_df.values), ids=rpca_df.index)

In [777]:
# Load metadata
metadata_path = metadata_path = '../Metadata/16S_AD_South-Africa_metadata_subset.tsv'
metadata = pd.read_csv(metadata_path, sep='\t', index_col=0)
metadata.index = metadata.index.str.replace('_', '', regex=False)
metadata

Unnamed: 0_level_0,PlateNumber,PlateLocation,i5,i5Sequence,i7,i7Sequence,identifier,Sequence,Plate ID,Well location,...,sample_type,specimen,age_months,sex,enrolment_date,enrolment_season,hiv_exposure,hiv_status,household_size,o_scorad
#sample-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ca009STL,1,A1,SA501,ATCGTACG,SA701,CGAGAGTT,SA701SA501,CGAGAGTT-ATCGTACG,1.010000e+21,A1,...,lesional skin,skin,24.0,male,4/16/2015,Autumn,Unexposed,negative,4.0,40
900221,1,B1,SA502,ACTATCTG,SA701,CGAGAGTT,SA701SA502,CGAGAGTT-ACTATCTG,1.010000e+21,B1,...,lesional skin,skin,9.0,female,8/11/2015,Winter,Unexposed,negative,7.0,34
Ca010EBL,1,C1,SA503,TAGCGAGT,SA701,CGAGAGTT,SA701SA503,CGAGAGTT-TAGCGAGT,1.010000e+21,C1,...,lesional skin,skin,24.0,female,11/20/2014,Spring,Unexposed,negative,7.0,21
900460,1,D1,SA504,CTGCGTGT,SA701,CGAGAGTT,SA701SA504,CGAGAGTT-CTGCGTGT,1.010000e+21,D1,...,lesional skin,skin,18.0,female,9/23/2015,Spring,Unexposed,,4.0,40
900051,1,E1,SA505,TCATCGAG,SA701,CGAGAGTT,SA701SA505,CGAGAGTT-TCATCGAG,1.010000e+21,E1,...,lesional skin,skin,31.0,male,4/21/2015,Autumn,Unexposed,negative,7.0,41
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
900401,5,C12,SB503,AGAGTCAC,SB712,CGTAGCGA,SB712SB503,CGTAGCGA-AGAGTCAC,1.010000e+21,C12,...,nonlesional skin,skin,21.0,female,9/17/2015,Spring,Exposed,negative,12.0,38
900402,6,B4,SA502,ACTATCTG,SB704,TCTCTATG,SB704SA502,TCTCTATG-ACTATCTG,1.010000e+21,B4,...,anterior nares,nasal,21.0,,,,,,,
Ca006ONL,6,F1,SA506,CGTGAGTG,SB701,CTCGACTT,SB701SA506,CTCGACTT-CGTGAGTG,1.010000e+21,F1,...,lesional skin,skin,35.0,female,3/25/2015,Autumn,Unexposed,negative,3.0,34
Ca006ONNL,6,F2,SA506,CGTGAGTG,SB702,CGAAGTAT,SB702SA506,CGAAGTAT-CGTGAGTG,1.010000e+21,F2,...,nonlesional skin,skin,35.0,female,3/25/2015,Autumn,Unexposed,negative,3.0,34


In [782]:
exposure_var = 'area'
confounders = ['age_months', 'sex', 'enrolment_season']

print(f"Samples: {len(common_samples)}")
print(f"Testing confounders: {confounders}\n")

# ============================================================================
# 1. PERMDISP (Assumption Check for exposure variable)
# ============================================================================
print("="*70)
print("PERMDISP: Testing Dispersion Homogeneity for exposure variable")
print("="*70)

temp_meta = metadata_aligned[[exposure_var]].dropna()
common_ids = [sid for sid in dm.ids if sid in temp_meta.index]
dm_filtered = dm.filter(common_ids)
grouping = temp_meta.loc[common_ids, exposure_var]

disp_result = permdisp(dm_filtered, grouping, permutations=999)
print(f"\nTest statistic: {disp_result['test statistic']:.4f}")
print(f"P-value: {disp_result['p-value']:.3f}")

if disp_result['p-value'] < 0.05:
    print(f"⚠️  WARNING: Dispersions differ (p={disp_result['p-value']:.3f})")
    print("   PERMANOVA results for 'area' should be interpreted with caution!\n")
else:
    print(f"✓ Dispersions are homogeneous - PERMANOVA assumption met\n")

# ============================================================================
# 2. Define helper functions
# ============================================================================

def test_confounder_with_exposure(metadata, exposure, confounder):
    temp_df = metadata[[exposure, confounder]].dropna()
    if len(temp_df) < 10:
        return np.nan
    
    is_categorical = temp_df[confounder].dtype == 'object' or temp_df[confounder].nunique() < 10
    
    if is_categorical:
        ct = pd.crosstab(temp_df[confounder], temp_df[exposure])
        if ct.shape[0] > 1 and ct.shape[1] > 1:
            try:
                _, p_val, _, _ = chi2_contingency(ct)
                return p_val
            except Exception as e:
                print(f"  ⚠️  Chi-square failed for {confounder}: {e}")
                return np.nan
    else:
        groups = [temp_df[temp_df[exposure] == g][confounder] for g in temp_df[exposure].unique()]
        if len(groups) == 2 and all(len(g) > 0 for g in groups):
            try:
                _, p_val = mannwhitneyu(groups[0], groups[1])
                return p_val
            except Exception as e:
                print(f"  ⚠️  Mann–Whitney failed for {confounder}: {e}")
                return np.nan
    return np.nan


def run_permanova(dm, metadata, column, permutations=999):
    temp_meta = metadata[[column]].dropna()
    common_ids = [sid for sid in dm.ids if sid in temp_meta.index]
    dm_filtered = dm.filter(common_ids)
    temp_meta = temp_meta.loc[common_ids]
    
    if len(temp_meta) < 10:
        return None
    
    try:
        result = permanova(dm_filtered, temp_meta, column=column, permutations=permutations)
        return result
    except Exception as e:
        print(f"  ⚠️  PERMANOVA failed for {column}: {e}")
        return None


# ============================================================================
# 3. LOOP THROUGH CONFOUNDERS (THIS WAS MISSING!)
# ============================================================================

print("="*70)
print("BIVARIATE TESTING: Evaluating Confounder Criteria")
print("="*70)
print(f"\n{'Confounder':<20} {'With Area':<18} {'With Beta Div':<18} {'Status'}")
print("-"*70)

confounder_results = {}

for conf in confounders:
    if conf not in metadata_aligned.columns:
        print(f"{conf:<20} {'Not in metadata':<18} {'—':<18} Skipped")
        continue
    
    # Test 1: Confounder → Exposure (area)
    p_exposure = test_confounder_with_exposure(metadata_aligned, exposure_var, conf)
    
    # Test 2: Confounder → Outcome (beta diversity)
    perm_result = run_permanova(dm, metadata_aligned, conf, permutations=999)
    
    if perm_result is not None:
        if isinstance(perm_result, pd.Series):
            p_outcome = perm_result['p-value']
        else:
            p_outcome = perm_result['p-value'].iloc[0]
    else:
        p_outcome = np.nan
    
    # Determine confounder status
    if not np.isnan(p_exposure) and not np.isnan(p_outcome):
        if p_exposure < 0.05 and p_outcome < 0.05:
            status = "YES ⚠️"
        elif p_exposure < 0.1 and p_outcome < 0.1:
            status = "YES (p<0.1)"
        else:
            status = "No"
    else:
        status = "Unable to test"
    
    # Format p-values for display
    p_exp_str = f"p={p_exposure:.3f}" if not np.isnan(p_exposure) else "N/A"
    p_out_str = f"p={p_outcome:.3f}" if not np.isnan(p_outcome) else "N/A"
    
    print(f"{conf:<20} {p_exp_str:<18} {p_out_str:<18} {status}")
    
    confounder_results[conf] = {
        'p_exposure': p_exposure,
        'p_outcome': p_outcome,
        'status': status
    }

print("\n" + "="*70)
print("SUMMARY:")
true_confounders = [c for c, r in confounder_results.items() if 'YES' in r['status']]
print(f"True confounders (associated with BOTH area and beta diversity): {true_confounders}")
print("="*70)

Samples: 282
Testing confounders: ['age_months', 'sex', 'enrolment_season']

PERMDISP: Testing Dispersion Homogeneity for exposure variable

Test statistic: 0.9447
P-value: 0.346
✓ Dispersions are homogeneous - PERMANOVA assumption met

BIVARIATE TESTING: Evaluating Confounder Criteria

Confounder           With Area          With Beta Div      Status
----------------------------------------------------------------------
age_months           p=0.000            p=0.001            YES ⚠️
sex                  p=0.301            p=0.291            No
enrolment_season     p=0.000            p=0.001            YES ⚠️

SUMMARY:
True confounders (associated with BOTH area and beta diversity): ['age_months', 'enrolment_season']


In [783]:
# ============================================================================
# 3. Confounder evaluation loop
# ============================================================================
confounder_results = {}

print("\n" + "="*70)
print("Evaluating confounders: association with beta diversity")
print("="*70)

for conf in confounders:
    print(f"\n--- Checking confounder: {conf} ---")

    if conf not in metadata_aligned.columns:
        print(f"  ⚠️  Skipping {conf} (not in metadata)")
        continue

    # Test 1: Confounder ↔ Exposure (area)
    p_exposure = test_confounder_with_exposure(metadata_aligned, exposure_var, conf)
    print(f"  Association with exposure ('{exposure_var}') p = {p_exposure:.3f}" if not np.isnan(p_exposure) else "  Could not compute exposure test")

    # Test 2: Confounder ↔ Outcome (PERMANOVA)
    perm_result = run_permanova(dm, metadata_aligned, conf, permutations=999)

    if perm_result is None:
        p_outcome = np.nan
    elif isinstance(perm_result, (float, np.floating)):
        p_outcome = perm_result
    else:
        p_outcome = float(perm_result.get('p-value', np.nan))

    print(f"  Association with outcome (PERMANOVA) p = {p_outcome:.3f}" if not np.isnan(p_outcome) else "  Could not compute PERMANOVA")

    # Determine confounding status
    if not np.isnan(p_exposure) and not np.isnan(p_outcome):
        is_confounder = (p_exposure < 0.1 and p_outcome < 0.1)
        
        if p_exposure < 0.05 and p_outcome < 0.05:
            conf_status = "YES ⚠️"
        elif is_confounder:
            conf_status = "YES"
        else:
            conf_status = "No"
    else:
        conf_status = "N/A"

    print(f"  → Confounder status: {conf_status}")

    confounder_results[conf] = {
        'p_exposure': p_exposure,
        'p_outcome': p_outcome,
        'status': conf_status
    }


Evaluating confounders: association with beta diversity

--- Checking confounder: age_months ---
  Association with exposure ('area') p = 0.000
  Association with outcome (PERMANOVA) p = 0.001
  → Confounder status: YES ⚠️

--- Checking confounder: sex ---
  Association with exposure ('area') p = 0.301
  Association with outcome (PERMANOVA) p = 0.286
  → Confounder status: No

--- Checking confounder: enrolment_season ---
  Association with exposure ('area') p = 0.000
  Association with outcome (PERMANOVA) p = 0.001
  → Confounder status: YES ⚠️
