## Scatter plot showing the relative abundances of shared ASVs (present in both skin and nares) across paired samples. Each point represents a single ASV, with its average relative abundance in skin samples on the x-axis and nares samples on the y-axis.

In [18]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import biom
from biom import load_table
import numpy as np
import scipy.stats as stats
from scipy.stats import mannwhitneyu
from statsmodels.stats.multitest import multipletests
import itertools
import matplotlib.patches
from scipy.stats import pearsonr



In [19]:
# Load the metadata
metadata_path = '../Data/Metadata/updated_clean_ant_skin_metadata_microbiome_type.tab'
metadata = pd.read_csv(metadata_path, sep='\t')

metadata['#sample-id'] = metadata['#sample-id'].str.replace('_', '')
# Set Sample-ID as the index for the metadata dataframe 
metadata = metadata.set_index('#sample-id')


# Create group column based on case_type to simplify group names
metadata['group'] = metadata['case_type'].map({
    'case-lesional skin': 'skin-ADL',
    'case-nonlesional skin': 'skin-ADNL', 
    'control-nonlesional skin': 'skin-H',
    'case-anterior nares': 'nares-AD',
    'control-anterior nares': 'nares-H'
})

metadata

Unnamed: 0_level_0,PlateNumber,PlateLocation,i5,i5Sequence,i7,i7Sequence,identifier,Sequence,Plate ID,Well location,...,enrolment_date,enrolment_season,hiv_exposure,hiv_status,household_size,o_scorad,FWD_filepath,REV_filepath,group,microbiome_type
#sample-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ca009STL,1,A1,SA501,ATCGTACG,SA701,CGAGAGTT,SA701SA501,CGAGAGTT-ATCGTACG,1.010000e+21,A1,...,4/16/2015,Autumn,Unexposed,negative,4.0,40,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL,skin
900221,1,B1,SA502,ACTATCTG,SA701,CGAGAGTT,SA701SA502,CGAGAGTT-ACTATCTG,1.010000e+21,B1,...,8/11/2015,Winter,Unexposed,negative,7.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL,skin
Ca010EBL,1,C1,SA503,TAGCGAGT,SA701,CGAGAGTT,SA701SA503,CGAGAGTT-TAGCGAGT,1.010000e+21,C1,...,11/20/2014,Spring,Unexposed,negative,7.0,21,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL,skin
900460,1,D1,SA504,CTGCGTGT,SA701,CGAGAGTT,SA701SA504,CGAGAGTT-CTGCGTGT,1.010000e+21,D1,...,9/23/2015,Spring,Unexposed,,4.0,40,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL,skin
900051,1,E1,SA505,TCATCGAG,SA701,CGAGAGTT,SA701SA505,CGAGAGTT-TCATCGAG,1.010000e+21,E1,...,4/21/2015,Autumn,Unexposed,negative,7.0,41,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL,skin
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Ca006ONL2,6,H1,SA508,GACACCGT,SB701,CTCGACTT,SB701SA508,CTCGACTT-GACACCGT,1.010000e+21,H1,...,3/25/2015,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL,skin
Ca006ONNL,6,F2,SA506,CGTGAGTG,SB702,CGAAGTAT,SB702SA506,CGAAGTAT-CGTGAGTG,1.010000e+21,F2,...,3/25/2015,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADNL,skin
Ca006ONNL2,6,H2,SA508,GACACCGT,SB702,CGAAGTAT,SB702SA508,CGAAGTAT-GACACCGT,1.010000e+21,H2,...,3/25/2015,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADNL,skin
Ca006ONPN,6,F3,SA506,CGTGAGTG,SB703,TAGCAGCT,SB703SA506,TAGCAGCT-CGTGAGTG,1.010000e+21,F3,...,3/25/2015,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,nares-AD,nares


In [20]:
# Get participants with both skin and nares samples
skin_pids = metadata[metadata['microbiome_type'] == 'skin']['pid']
nares_pids = metadata[metadata['microbiome_type'] == 'nares']['pid']

# Find overlapping participants
shared_pids = set(skin_pids).intersection(set(nares_pids))

print(f"Number of participants with both skin and nares samples: {len(shared_pids)}")

# Display the participant IDs
print("\nParticipant IDs with both sample types:")
print(sorted(list(shared_pids)))
print(len(list(shared_pids)))


Number of participants with both skin and nares samples: 187

Participant IDs with both sample types:
['Ca-006-ON', 'Ca-007-NK', 'Ca-008-HN', 'Ca-009-ST', 'Ca-009-ZN', 'Ca-010-EB', 'Ca-011-LQ', 'Ca-013-NN', 'Ca-014-LB', 'Ca-015-AM', 'Ca-016-YT', 'Ca-017-OM', 'Ca-018-AS', 'Ca-019-EC', 'Ca-020-AZ', 'Ca-021-IM', 'Ca-022-MS', 'Ca-023-EJ', 'Ca-024-ZM', 'Ca-025-AC', 'Ca-026-KM', 'Ca-027-IM', 'Ca-028-PN', 'Ca-029-TD', 'Ca-030-LM', 'Ca-031-AN', 'Ca-032-LN', 'Ca-033-UD', 'Ca-034-LS', 'Ca-035-AR', 'Ca-036-TK', 'Ca-037-HM', 'Ca-042-AM', 'Ca-043-SS', 'Ca-045-HN', 'Ca-046-MT', 'Ca-047-MD', 'Ca-049-LM', 'Ca-050-MM', 'Ca-051-AM', 'Ca-052-LM', 'Ca-053-NN', 'Ca-054-RN', 'Ca-055-LN', 'Ca-056-DH', 'Ca-101-ID', 'Ca-103-BT', 'Ca-104-SB', 'Ca-105-LX', 'Ca-106-NM', 'Ca-107-BM', 'Ca-108-EJ', 'Ca-110-LN', 'Ca-111-IN', 'Ca-112-AM', 'Ca-113-MN', 'Ca-114-QT', 'Ca-115-OM', 'Ca-116-AN', 'Ca-117-SM', 'Ca-118-AS', 'Ca-119-AM', 'Ca-120-ST', 'Ca-121-BT', 'Ca-122-UK', 'Ca-123-PG', 'Ca-124-AB', 'Ca-125-IM', 'Ca-126-KB', 

In [21]:
# Read in table at ASV level
# biom_path = '../Data/Tables/Absolute_Abundance_Tables/feature_table_with_tax_labels_Genus.biom'
biom_path = '../Data/Tables/Absolute_Abundance_Tables/209766_filtered_by_prevalence_1pct_rare_Genus-ASV-non-collapse.biom'
biom_tbl = load_table(biom_path)
df = pd.DataFrame(biom_tbl.to_dataframe().T)

# delete the prefix from the index
df.index = df.index.str.replace('15564.', '')

df

Unnamed: 0,g__Streptococcus_ASV-1,g__Streptococcus_ASV-2,g__Corynebacterium_ASV-1,g__Corynebacterium_ASV-4,g___ASV-5,g__Cutibacterium_ASV-1,g___ASV-33,g___ASV-30,g___ASV-22,g__Cutibacterium_ASV-2,...,g__Copromonas_ASV-3,g__Microvirga_ASV-1,g__Leptotrichia_A_993758_ASV-17,g___ASV-103,g__Leptotrichia_A_993758_ASV-13,g__Capnocytophaga_820688_ASV-8,g__UBA6175_ASV-2,g__Comamonas_F_589250_ASV-3,g__Bosea_ASV-2,g___ASV-145
900344,984.0,611.0,114.0,82.0,22.0,8.0,8.0,6.0,3.0,2.0,...,0,0,0,0,0,0,0,0,0,0
900459,118.0,106.0,0,0,0,0,0,0,0,3.0,...,0,0,0,0,0,0,0,0,0,0
900221,22.0,0,0,0,0,16.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
900570,389.0,0,0,0,8.0,11.0,0,0,0,5.0,...,0,0,0,0,0,0,0,0,0,0
900092,3106.0,1707.0,59.0,32.0,3.0,0,0,0,7.0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9003972,1168.0,593.0,16.0,0,28.0,736.0,0,0,36.0,388.0,...,0,0,0,0,0,0,0,0,0,17.0
900097,24.0,0,0,0,0,33.0,0,0,0,12.0,...,0,0,0,0,0,0,0,0,0,0
900498,15.0,17.0,0,0,0,34.0,0,14.0,0,25.0,...,0,0,0,0,0,0,0,0,0,0
900276,0,0,30.0,0,0,151.0,0,0,0,79.0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
# # Get the column names before _ASV
# base_names = [col.split('_ASV')[0] for col in df.columns if '_ASV' in col]
# base_names = list(set(base_names))  # Get unique base names

# # Create a new dataframe to store collapsed values
# collapsed_df = pd.DataFrame(index=df.index)

# # For each base name, sum all columns that start with it
# for base in base_names:
#     matching_cols = [col for col in df.columns if col.startswith(base + '_ASV')]
#     collapsed_df[base] = df[matching_cols].sum(axis=1)

# # Add back any columns that don't contain _ASV
# non_asv_cols = [col for col in df.columns if '_ASV' not in col]
# collapsed_df[non_asv_cols] = df[non_asv_cols]

# # Replace the original df with collapsed version
# df = collapsed_df

# # Calculate column sums and sort columns by sum in descending order
# col_sums = df.sum()
# df = df[col_sums.sort_values(ascending=False).index]

# df

In [23]:
# Map the 'microbiome_type', 'group', 'o_scorad', and 'area' columns from metadata to df based on matching index
df['group'] = metadata.loc[df.index, 'group']
df['microbiome_type'] = metadata.loc[df.index, 'microbiome_type']
df['o_scorad'] = metadata.loc[df.index, 'o_scorad']
df['pid'] = metadata.loc[df.index, 'pid']
df['area'] = metadata.loc[df.index, 'area']
df

Unnamed: 0,g__Streptococcus_ASV-1,g__Streptococcus_ASV-2,g__Corynebacterium_ASV-1,g__Corynebacterium_ASV-4,g___ASV-5,g__Cutibacterium_ASV-1,g___ASV-33,g___ASV-30,g___ASV-22,g__Cutibacterium_ASV-2,...,g__Capnocytophaga_820688_ASV-8,g__UBA6175_ASV-2,g__Comamonas_F_589250_ASV-3,g__Bosea_ASV-2,g___ASV-145,group,microbiome_type,o_scorad,pid,area
900344,984.0,611.0,114.0,82.0,22.0,8.0,8.0,6.0,3.0,2.0,...,0,0,0,0,0,skin-H,skin,,Co-130-MM,Umtata
900459,118.0,106.0,0,0,0,0,0,0,0,3.0,...,0,0,0,0,0,nares-AD,nares,44,Ca-145-LM,Umtata
900221,22.0,0,0,0,0,16.0,0,0,0,0,...,0,0,0,0,0,skin-ADL,skin,34,Ca-101-ID,Umtata
900570,389.0,0,0,0,8.0,11.0,0,0,0,5.0,...,0,0,0,0,0,skin-ADNL,skin,36,Ca-046-MT,Cape Town
900092,3106.0,1707.0,59.0,32.0,3.0,0,0,0,7.0,0,...,0,0,0,0,0,nares-AD,nares,53,Ca-023-EJ,Cape Town
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9003972,1168.0,593.0,16.0,0,28.0,736.0,0,0,36.0,388.0,...,0,0,0,0,17.0,skin-ADL,skin,54,Ca-125-IM,Umtata
900097,24.0,0,0,0,0,33.0,0,0,0,12.0,...,0,0,0,0,0,skin-ADNL,skin,44,Ca-025-AC,Cape Town
900498,15.0,17.0,0,0,0,34.0,0,14.0,0,25.0,...,0,0,0,0,0,skin-ADNL,skin,,Ca-157-LM,Umtata
900276,0,0,30.0,0,0,151.0,0,0,0,79.0,...,0,0,0,0,0,skin-ADL,skin,52,Ca-111-IN,Umtata


In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

# --- STEP 1: Prepare the data ---

# Separate features and metadata
features = df.drop(columns=['microbiome_type', 'group'])
metadata_cols = df[['microbiome_type', 'group', 'area']]

# Coerce all to numeric and drop non-numeric columns
features_numeric = features.apply(pd.to_numeric, errors='coerce')
features_numeric = features_numeric.dropna(axis=1, how='all')

# Normalize to relative abundances
features_rel = features_numeric.div(features_numeric.sum(axis=1), axis=0)

# Add metadata back
features_rel['microbiome_type'] = metadata_cols['microbiome_type']
features_rel['group'] = metadata_cols['group']
features_rel['area'] = metadata_cols['area']

# --- STEP 2: Define subsets by area ---

# Cape Town samples
skin_CT = features_rel[(features_rel['microbiome_type'] == 'skin') & (features_rel['area'] == 'Cape Town')].drop(columns=['microbiome_type', 'group', 'area'])
nares_CT = features_rel[(features_rel['microbiome_type'] == 'nares') & (features_rel['area'] == 'Cape Town')].drop(columns=['microbiome_type', 'group', 'area'])

# Umtata samples
skin_UT = features_rel[(features_rel['microbiome_type'] == 'skin') & (features_rel['area'] == 'Umtata')].drop(columns=['microbiome_type', 'group', 'area'])
nares_UT = features_rel[(features_rel['microbiome_type'] == 'nares') & (features_rel['area'] == 'Umtata')].drop(columns=['microbiome_type', 'group', 'area'])

# --- STEP 3: Calculate mean relative abundances ---

mean_skin_CT = skin_CT.mean().astype(float)
mean_nares_CT = nares_CT.mean().astype(float)

mean_skin_UT = skin_UT.mean().astype(float)
mean_nares_UT = nares_UT.mean().astype(float)

# --- STEP 4: Make scatterplots ---

fig, axes = plt.subplots(1, 2, figsize=(10, 5))

# --- Plot 1: Cape Town ---

scatter_df_CT = pd.DataFrame({
    'mean_skin': mean_skin_CT,
    'mean_nares': mean_nares_CT
}).replace(0, 1e-6)

x_CT = np.where(mean_nares_CT.to_numpy() == 0, 1e-6, mean_nares_CT.to_numpy())
y_CT = np.where(mean_skin_CT.to_numpy() == 0, 1e-6, mean_skin_CT.to_numpy())

log_nares_CT = np.log10(x_CT)
log_skin_CT = np.log10(y_CT)

mask_CT = np.isfinite(log_nares_CT) & np.isfinite(log_skin_CT)
log_nares_CT = log_nares_CT[mask_CT]
log_skin_CT = log_skin_CT[mask_CT]

r_CT, pval_CT = stats.pearsonr(log_nares_CT, log_skin_CT)

axes[0].scatter(
    scatter_df_CT['mean_nares'],
    scatter_df_CT['mean_skin'],
    alpha=0.7,
    edgecolors='k',
    linewidths=0.5,
    color='#1f77b4'
)

lims_CT = [1e-6, scatter_df_CT.max().max()]
axes[0].plot(lims_CT, lims_CT, 'k--', alpha=0.75)

axes[0].set_xscale('log')
axes[0].set_yscale('log')
axes[0].set_xlabel('Mean Relative Abundance in Nares (Cape Town)', fontsize=12)
axes[0].set_ylabel('Mean Relative Abundance in Skin (Cape Town)', fontsize=12)
n_CT = len(set(features_rel[(features_rel['area'] == 'Cape Town')]['group'])) // 2
axes[0].set_title(f'Urban (Cape Town)', fontsize=14)

axes[0].text(
    0.05, 0.95,
    f'Pearson r = {r_CT:.2f}\n$p$ = {pval_CT:.2e}',
    ha='left', va='top',
    transform=axes[0].transAxes,
    fontsize=12
)

axes[0].plot(0.05, 0.82, marker='o', markersize=6, color='#1f77b4', transform=axes[0].transAxes, clip_on=False)
axes[0].text(0.07, 0.82, '16S ASV (n=798)', ha='left', va='center', transform=axes[0].transAxes, fontsize=11)

# --- Plot 2: Umtata ---

scatter_df_UT = pd.DataFrame({
    'mean_skin': mean_skin_UT,
    'mean_nares': mean_nares_UT
}).replace(0, 1e-6)

x_UT = np.where(mean_nares_UT.to_numpy() == 0, 1e-6, mean_nares_UT.to_numpy())
y_UT = np.where(mean_skin_UT.to_numpy() == 0, 1e-6, mean_skin_UT.to_numpy())

log_nares_UT = np.log10(x_UT)
log_skin_UT = np.log10(y_UT)

mask_UT = np.isfinite(log_nares_UT) & np.isfinite(log_skin_UT)
log_nares_UT = log_nares_UT[mask_UT]
log_skin_UT = log_skin_UT[mask_UT]

r_UT, pval_UT = stats.pearsonr(log_nares_UT, log_skin_UT)

axes[1].scatter(
    scatter_df_UT['mean_nares'],
    scatter_df_UT['mean_skin'],
    alpha=0.7,
    edgecolors='k',
    linewidths=0.5,
    color='#ff7f0e'
)

lims_UT = [1e-6, scatter_df_UT.max().max()]
axes[1].plot(lims_UT, lims_UT, 'k--', alpha=0.75)

axes[1].set_xscale('log')
axes[1].set_yscale('log')
axes[1].set_xlabel('Mean Relative Abundance in Nares (Umtata)', fontsize=12)
axes[1].set_ylabel('Mean Relative Abundance in Skin (Umtata)', fontsize=12)
n_UT = len(set(features_rel[(features_rel['area'] == 'Umtata')]['group'])) // 2
axes[1].set_title(f'Rural (Umtata)', fontsize=14)

axes[1].text(
    0.05, 0.95,
    f'Pearson r = {r_UT:.2f}\n$p$ = {pval_UT:.2e}',
    ha='left', va='top',
    transform=axes[1].transAxes,
    fontsize=12
)

axes[1].plot(0.05, 0.82, marker='o', markersize=6, color='#ff7f0e', transform=axes[1].transAxes, clip_on=False)
axes[1].text(0.07, 0.82, '16S ASV (n=798)', ha='left', va='center', transform=axes[1].transAxes, fontsize=11)

# --- Final layout ---
plt.suptitle('ASV Correlation between Skin and Nares by Location (Urban vs Rural)', fontsize=16)
plt.tight_layout()
plt.savefig('../Plots_draft/Analysis_figures/Co-occurrence/ASV-correlation_scatterplots_CapeTown_vs_Umtata.png', dpi=600)


##  2 × 2 stratified comparison by both disease status (AD+ vs AD−) and environment (Urban vs Rural)

In [25]:
features_rel['is_AD'] = features_rel['group'].isin(['skin-ADL', 'nares-AD'])
features_rel['area'] = metadata_cols['area']  # if not already included
features_rel


Unnamed: 0,g__Streptococcus_ASV-1,g__Streptococcus_ASV-2,g__Corynebacterium_ASV-1,g__Corynebacterium_ASV-4,g___ASV-5,g__Cutibacterium_ASV-1,g___ASV-33,g___ASV-30,g___ASV-22,g__Cutibacterium_ASV-2,...,g__Capnocytophaga_820688_ASV-8,g__UBA6175_ASV-2,g__Comamonas_F_589250_ASV-3,g__Bosea_ASV-2,g___ASV-145,o_scorad,microbiome_type,group,area,is_AD
900344,0.534202,0.331705,0.061889,0.044517,0.011944,0.004343,0.004343,0.003257,0.001629,0.001086,...,0.0,0.0,0.0,0.0,0.0,,skin,skin-H,Umtata,False
900459,0.06678,0.059989,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001698,...,0.0,0.0,0.0,0.0,0.0,0.024901,nares,nares-AD,Umtata,True
900221,0.000744,0.0,0.0,0.0,0.0,0.000541,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00115,skin,skin-ADL,Umtata,True
900570,0.059562,0.0,0.0,0.0,0.001225,0.001684,0.0,0.0,0.0,0.000766,...,0.0,0.0,0.0,0.0,0.0,0.005512,skin,skin-ADNL,Cape Town,False
900092,0.617372,0.339296,0.011727,0.006361,0.000596,0.0,0.0,0.0,0.001391,0.0,...,0.0,0.0,0.0,0.0,0.0,0.010535,nares,nares-AD,Cape Town,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9003972,0.116823,0.059312,0.0016,0.0,0.002801,0.073615,0.0,0.0,0.003601,0.038808,...,0.0,0.0,0.0,0.0,0.0017,0.005401,skin,skin-ADL,Umtata,True
900097,0.011869,0.0,0.0,0.0,0.0,0.01632,0.0,0.0,0.0,0.005935,...,0.0,0.0,0.0,0.0,0.0,0.021761,skin,skin-ADNL,Cape Town,False
900498,0.017689,0.020047,0.0,0.0,0.0,0.040094,0.0,0.016509,0.0,0.029481,...,0.0,0.0,0.0,0.0,0.0,,skin,skin-ADNL,Umtata,False
900276,0.0,0.0,0.04065,0.0,0.0,0.204607,0.0,0.0,0.0,0.107046,...,0.0,0.0,0.0,0.0,0.0,0.070461,skin,skin-ADL,Umtata,True


In [26]:
def get_group_data(df, area_val, is_AD_val, microbiome_type):
    return df[
        (df['area'] == area_val) &
        (df['is_AD'] == is_AD_val) &
        (df['microbiome_type'] == microbiome_type)
    ].drop(columns=['microbiome_type', 'group', 'area', 'is_AD'])


In [27]:
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats

# Make sure you have these helper columns
features_rel['is_AD'] = features_rel['group'].isin(['skin-ADL', 'nares-AD'])

# Define plotting setup for 1x4 layout with custom figure size
fig, axes = plt.subplots(1, 4, figsize=(15, 5), sharex=True, sharey=True)

# Order: Cape Town Healthy, Cape Town AD+, Umtata Healthy, Umtata AD+
panel_info = [
    ('Cape Town', False, 'Cape Town Healthy', '#A7C7E7', 0),
    ('Cape Town', True,  'Cape Town AD+',     '#d2b48c', 1),
    ('Umtata',    False, 'Umtata Healthy',     '#ADD8E6', 2),
    ('Umtata',    True,  'Umtata AD+',        '#fa8072', 3) 
]

for area, is_AD, title, color, idx in panel_info:
    # Subset skin samples
    skin = features_rel[
        (features_rel['area'] == area) &
        (features_rel['is_AD'] == is_AD) &
        (features_rel['microbiome_type'] == 'skin')
    ].drop(columns=['microbiome_type', 'group', 'area', 'is_AD'])

    # Subset nares samples
    nares = features_rel[
        (features_rel['area'] == area) &
        (features_rel['is_AD'] == is_AD) &
        (features_rel['microbiome_type'] == 'nares')
    ].drop(columns=['microbiome_type', 'group', 'area', 'is_AD'])

    # Average across samples
    mean_skin = skin.mean()
    mean_nares = nares.mean()

    # Log transform
    log_skin = np.log10(mean_skin)
    log_nares = np.log10(mean_nares)

    # Remove NaNs and infs
    mask = np.isfinite(log_skin) & np.isfinite(log_nares)
    log_skin = log_skin[mask]
    log_nares = log_nares[mask]
    mean_skin = mean_skin[mask]
    mean_nares = mean_nares[mask]

    # Correlation
    r, pval = stats.pearsonr(log_skin, log_nares)

    # Plot
    ax = axes[idx]
    ax.scatter(
        mean_nares,
        mean_skin,
        alpha=0.7,
        edgecolors='k',
        linewidths=0.5,
        color=color
    )

    # Identity line
    #lims = [1e-6, max(max(mean_nares), max(mean_skin))]
    lims = [1e-6, 1e0]
    ax.plot(lims, lims, 'k--', alpha=0.75)
    ax.set_xscale('log')
    ax.set_yscale('log')
    ax.set_xlim(lims)
    ax.set_ylim(lims)
    ax.set_title(f'{title}', fontsize=16)
    ax.set_xlabel(f'Nares ({area})', fontsize=14)
    ax.set_ylabel(f'Skin ({area})', fontsize=14)

    ax.text(
        0.05, 0.95,
        f'Pearson r = {r:.2f}\n$p$ = {pval:.1e}',
        ha='left', va='top',
        transform=ax.transAxes,
        fontsize=11
    )

    # Annotation markers
    ax.plot(0.05, 0.80, marker='o', markersize=6, color=color, transform=ax.transAxes)
    ax.text(0.07, 0.80, '16S ASV (n=798)', ha='left', va='center', transform=ax.transAxes, fontsize=10)

    # --- Label selected ASVs with short names and position adjustments ---
    label_map = {
        0: {  # Cape Town Healthy
            'g__Streptococcus_ASV-1': ('Strep', 0.99, 1.6),
            'g__Staphylococcus_ASV-1': ('Staph', 0.99, 1.6)
        },
        1: {  # Cape Town AD+
            'g__Streptococcus_ASV-1': ('Strep', 0.99, 0.6),
            'g__Staphylococcus_ASV-1': ('Staph', 0.99, 1.6)
        },
        2: {  # Umtata Healthy
            'g__Streptococcus_ASV-1': ('Strep', 0.99, 0.6),
            'g__Staphylococcus_ASV-1': ('Staph', 0.99, 1.65)
        },
        3: {  # Umtata AD+
            'g__Streptococcus_ASV-1': ('Strep', 0.99, 0.6),
            'g__Staphylococcus_ASV-1': ('Staph', 0.99, 1.6)        }
    }

    # Get label positions for this panel
    panel_labels = label_map.get(idx, {})

    for asv, (short_label, x_shift_factor, y_shift_factor) in panel_labels.items():
        if asv in mean_skin.index and asv in mean_nares.index:
            x = mean_nares[asv]
            y = mean_skin[asv]

            if np.isfinite(np.log10(x)) and np.isfinite(np.log10(y)):
                x_offset = x * x_shift_factor
                y_offset = y * y_shift_factor

                align = 'right' if x_shift_factor < 1 else 'left'

                # Draw thick black outline around the ASV point
                ax.scatter(
                    [x], [y],
                    s=80,
                    facecolors='none',
                    edgecolors='black',
                    linewidths=1
                )

                # Label
                ax.text(
                    x_offset,
                    y_offset,
                    short_label,
                    fontsize=10,
                    ha=align,
                    va='center',
                    color='black'
                )

# Final layout
plt.suptitle('Mean Relative Abundance of Taxa between Skin and Nares', fontsize=20, y=0.99)
plt.tight_layout(rect=[0, 0.02, 1, 1])
plt.savefig('../Plots_draft/Analysis_figures/Co-occurrence/ASV-correlation_1x4_AD_Urban_Rural.png', dpi=600)


  return sp_sum / ct
  sp_values = getattr(ufunc, method)(self.sp_values, **kwargs)
  sp_values = getattr(ufunc, method)(self.sp_values, **kwargs)
  return sp_sum / ct
  sp_values = getattr(ufunc, method)(self.sp_values, **kwargs)
  sp_values = getattr(ufunc, method)(self.sp_values, **kwargs)
