# Relative Abundance of ASVs in Skin versus Nares, by Region

In [43]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import biom
from biom import load_table
import numpy as np
import scipy.stats as stats
from scipy.stats import mannwhitneyu
from statsmodels.stats.multitest import multipletests
import itertools
import matplotlib.patches
from scipy.stats import pearsonr



In [44]:
# Load the sample metadata (tab-separated) and prepare it in one pass:
#   1. strip underscores from the sample IDs,
#   2. index the table by sample ID,
#   3. translate the verbose case_type labels into short group names.
metadata_path = '../Data/Metadata/updated_clean_ant_skin_metadata_microbiome_type.tab'

metadata = (
    pd.read_csv(metadata_path, sep='\t')
    .assign(**{'#sample-id': lambda tbl: tbl['#sample-id'].str.replace('_', '')})
    .set_index('#sample-id')
    .assign(
        # case_type values not listed here map to NaN
        group=lambda tbl: tbl['case_type'].map({
            'case-lesional skin': 'skin-ADL',
            'case-nonlesional skin': 'skin-ADNL',
            'control-nonlesional skin': 'skin-H',
            'case-anterior nares': 'nares-AD',
            'control-anterior nares': 'nares-H',
        })
    )
)

metadata

Unnamed: 0_level_0,PlateNumber,PlateLocation,i5,i5Sequence,i7,i7Sequence,identifier,Sequence,Plate ID,Well location,...,enrolment_date,enrolment_season,hiv_exposure,hiv_status,household_size,o_scorad,FWD_filepath,REV_filepath,group,microbiome_type
#sample-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ca009STL,1,A1,SA501,ATCGTACG,SA701,CGAGAGTT,SA701SA501,CGAGAGTT-ATCGTACG,1.010000e+21,A1,...,4/16/2015,Autumn,Unexposed,negative,4.0,40,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL,skin
900221,1,B1,SA502,ACTATCTG,SA701,CGAGAGTT,SA701SA502,CGAGAGTT-ACTATCTG,1.010000e+21,B1,...,8/11/2015,Winter,Unexposed,negative,7.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL,skin
Ca010EBL,1,C1,SA503,TAGCGAGT,SA701,CGAGAGTT,SA701SA503,CGAGAGTT-TAGCGAGT,1.010000e+21,C1,...,11/20/2014,Spring,Unexposed,negative,7.0,21,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL,skin
900460,1,D1,SA504,CTGCGTGT,SA701,CGAGAGTT,SA701SA504,CGAGAGTT-CTGCGTGT,1.010000e+21,D1,...,9/23/2015,Spring,Unexposed,,4.0,40,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL,skin
900051,1,E1,SA505,TCATCGAG,SA701,CGAGAGTT,SA701SA505,CGAGAGTT-TCATCGAG,1.010000e+21,E1,...,4/21/2015,Autumn,Unexposed,negative,7.0,41,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL,skin
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Ca006ONL2,6,H1,SA508,GACACCGT,SB701,CTCGACTT,SB701SA508,CTCGACTT-GACACCGT,1.010000e+21,H1,...,3/25/2015,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL,skin
Ca006ONNL,6,F2,SA506,CGTGAGTG,SB702,CGAAGTAT,SB702SA506,CGAAGTAT-CGTGAGTG,1.010000e+21,F2,...,3/25/2015,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADNL,skin
Ca006ONNL2,6,H2,SA508,GACACCGT,SB702,CGAAGTAT,SB702SA508,CGAAGTAT-GACACCGT,1.010000e+21,H2,...,3/25/2015,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADNL,skin
Ca006ONPN,6,F3,SA506,CGTGAGTG,SB703,TAGCAGCT,SB703SA506,TAGCAGCT-CGTGAGTG,1.010000e+21,F3,...,3/25/2015,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,nares-AD,nares


In [45]:
# Participant IDs observed at each sampling site
skin_pids = metadata.loc[metadata['microbiome_type'].eq('skin'), 'pid']
nares_pids = metadata.loc[metadata['microbiome_type'].eq('nares'), 'pid']

# Participants that contributed at least one sample from both sites
shared_pids = set(skin_pids) & set(nares_pids)

print(f"Number of participants with both skin and nares samples: {len(shared_pids)}")

# List the overlapping participant IDs, followed by their count
print("\nParticipant IDs with both sample types:")
print(sorted(shared_pids))
print(len(shared_pids))


Number of participants with both skin and nares samples: 187

Participant IDs with both sample types:
['Ca-006-ON', 'Ca-007-NK', 'Ca-008-HN', 'Ca-009-ST', 'Ca-009-ZN', 'Ca-010-EB', 'Ca-011-LQ', 'Ca-013-NN', 'Ca-014-LB', 'Ca-015-AM', 'Ca-016-YT', 'Ca-017-OM', 'Ca-018-AS', 'Ca-019-EC', 'Ca-020-AZ', 'Ca-021-IM', 'Ca-022-MS', 'Ca-023-EJ', 'Ca-024-ZM', 'Ca-025-AC', 'Ca-026-KM', 'Ca-027-IM', 'Ca-028-PN', 'Ca-029-TD', 'Ca-030-LM', 'Ca-031-AN', 'Ca-032-LN', 'Ca-033-UD', 'Ca-034-LS', 'Ca-035-AR', 'Ca-036-TK', 'Ca-037-HM', 'Ca-042-AM', 'Ca-043-SS', 'Ca-045-HN', 'Ca-046-MT', 'Ca-047-MD', 'Ca-049-LM', 'Ca-050-MM', 'Ca-051-AM', 'Ca-052-LM', 'Ca-053-NN', 'Ca-054-RN', 'Ca-055-LN', 'Ca-056-DH', 'Ca-101-ID', 'Ca-103-BT', 'Ca-104-SB', 'Ca-105-LX', 'Ca-106-NM', 'Ca-107-BM', 'Ca-108-EJ', 'Ca-110-LN', 'Ca-111-IN', 'Ca-112-AM', 'Ca-113-MN', 'Ca-114-QT', 'Ca-115-OM', 'Ca-116-AN', 'Ca-117-SM', 'Ca-118-AS', 'Ca-119-AM', 'Ca-120-ST', 'Ca-121-BT', 'Ca-122-UK', 'Ca-123-PG', 'Ca-124-AB', 'Ca-125-IM', 'Ca-126-KB', 

In [46]:
# Read in the ASV-level count table (samples become rows after the transpose).
# The genus-collapsed alternative is
# '../Data/Tables/Absolute_Abundance_Tables/feature_table_with_tax_labels_Genus.biom'.
biom_path = '../Data/Tables/Absolute_Abundance_Tables/209766_filtered_by_prevalence_1pct_rare_Genus-ASV-non-collapse.biom'
biom_tbl = load_table(biom_path)
df = biom_tbl.to_dataframe().T

# Strip the '15564.' prefix from the sample IDs.
# The pattern is anchored and the dot escaped, with regex=True spelled out:
# the default of `regex` differs between pandas versions, and an unescaped
# '.' under regex=True would match any character, anywhere in the ID.
df.index = df.index.str.replace(r'^15564\.', '', regex=True)

# Fail early, with a readable message, if the table contains samples that the
# metadata does not describe (the .loc look-ups below need every ID to exist).
missing_samples = df.index.difference(metadata.index)
if len(missing_samples) > 0:
    raise KeyError(
        f"{len(missing_samples)} sample(s) in the BIOM table have no metadata row, "
        f"e.g. {list(missing_samples[:5])}"
    )

# Attach the sample annotations used downstream, aligned on the sample ID index.
# NOTE: these columns live alongside the ASV counts in `df`; any step that
# treats the columns of `df` as features must drop all five of them first.
for annotation_col in ['group', 'microbiome_type', 'o_scorad', 'pid', 'area']:
    df[annotation_col] = metadata.loc[df.index, annotation_col]
df

Unnamed: 0,g__Streptococcus_ASV-1,g__Streptococcus_ASV-2,g__Corynebacterium_ASV-1,g__Corynebacterium_ASV-3,g___ASV-3,g__Cutibacterium_ASV-1,g___ASV-16,g___ASV-25,g___ASV-27,g__Cutibacterium_ASV-2,...,g__Capnocytophaga_820688_ASV-7,g__UBA6175_ASV-2,g__Comamonas_F_589250_ASV-4,g__Bosea_ASV-2,g___ASV-119,group,microbiome_type,o_scorad,pid,area
900344,156.0,95.0,23.0,17.0,1.0,2.0,1.0,0,0,0,...,0,0,0,0,0,skin-H,skin,,Co-130-MM,Umtata
900459,21.0,30.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,nares-AD,nares,44,Ca-145-LM,Umtata
900221,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,skin-ADL,skin,34,Ca-101-ID,Umtata
900570,18.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,skin-ADNL,skin,36,Ca-046-MT,Cape Town
900092,174.0,104.0,10.0,1.0,0,0,0,0,2.0,0,...,0,0,0,0,0,nares-AD,nares,53,Ca-023-EJ,Cape Town
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9003972,38.0,21.0,0,0,1.0,18.0,0,0,0,12.0,...,0,0,0,0,0,skin-ADL,skin,54,Ca-125-IM,Umtata
900097,3.0,0,0,0,0,3.0,0,0,0,0,...,0,0,0,0,0,skin-ADNL,skin,44,Ca-025-AC,Cape Town
900498,4.0,6.0,0,0,0,13.0,0,6.0,0,13.0,...,0,0,0,0,0,skin-ADNL,skin,,Ca-157-LM,Umtata
900276,0,0,10.0,0,0,69.0,0,0,0,26.0,...,0,0,0,0,0,skin-ADL,skin,52,Ca-111-IN,Umtata


In [47]:
def get_group_data(df, area_val, is_AD_val, microbiome_type,
                   metadata_columns=('microbiome_type', 'group', 'area',
                                     'is_AD', 'o_scorad', 'pid')):
    """Return the feature columns of the samples in one area / AD status / site.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per sample. Must contain the columns 'area', 'is_AD' and
        'microbiome_type', which are used for filtering.
    area_val : object
        Value that the 'area' column must equal.
    is_AD_val : bool
        Value that the 'is_AD' column must equal.
    microbiome_type : object
        Value that the 'microbiome_type' column must equal.
    metadata_columns : iterable of str, optional
        Annotation columns to remove from the result so that only feature
        columns remain. Names that are not present in ``df`` are skipped.

    Returns
    -------
    pandas.DataFrame
        The matching rows with every column in ``metadata_columns`` removed.
        Empty (zero rows) when no sample matches.

    Raises
    ------
    KeyError
        If one of the three filter columns is missing from ``df``.
    """
    matches = (
        (df['area'] == area_val)
        & (df['is_AD'] == is_AD_val)
        & (df['microbiome_type'] == microbiome_type)
    )
    # errors='ignore': the helper works whether or not the optional annotation
    # columns (e.g. 'o_scorad', 'pid') were carried along with the features,
    # and never lets them through as if they were abundances.
    return df[matches].drop(columns=list(metadata_columns), errors='ignore')


In [48]:
# --- STEP 1: Prepare the data ---

# Separate features and metadata.
# Every annotation column attached to `df` has to be removed here, not only
# the text ones: 'o_scorad' is numeric, so it would survive the numeric
# coercion below, be added into each sample's total count, and show up as an
# extra "ASV" in the relative-abundance table.
features = df.drop(columns=['microbiome_type', 'group', 'o_scorad', 'pid', 'area'])
metadata_cols = df[['microbiome_type', 'group', 'area']]

# Coerce everything to numeric and drop columns that contain no numeric value
# at all (safety net for any remaining non-count column).
features_numeric = features.apply(pd.to_numeric, errors='coerce')
features_numeric = features_numeric.dropna(axis=1, how='all')

# Normalize to relative abundances: each row is divided by its own total,
# so every sample with a non-zero total sums to 1.
features_rel = features_numeric.div(features_numeric.sum(axis=1), axis=0)

# Add the annotations needed for grouping back onto the normalized table
features_rel['microbiome_type'] = metadata_cols['microbiome_type']
features_rel['group'] = metadata_cols['group']
features_rel['area'] = metadata_cols['area']

In [49]:
# Figure 4B: mean relative abundance of each ASV on skin (y) versus in the
# nares (x), one panel per region x disease status, with the Pearson
# correlation of the log10 means.

# Groups that make up each disease status in the figure.
ad_groups = ['skin-ADL', 'nares-AD']
healthy_groups = ['skin-H', 'nares-H']

# Make sure you have these helper columns
features_rel['is_AD'] = features_rel['group'].isin(ad_groups)

# Only samples from the four groups above belong in a panel. Without this
# filter, is_AD == False also matches 'skin-ADNL' (non-lesional skin of AD
# cases) and samples with an unmapped group, and they would be averaged into
# the "Healthy" panels.
# NOTE(review): confirm that excluding skin-ADNL from this figure is intended.
panel_samples = features_rel[features_rel['group'].isin(ad_groups + healthy_groups)]


def _mean_abundance(samples):
    """Per-feature mean over the rows of `samples`, as a dense float Series."""
    # The stderr output of this cell mentions `sp_values`, i.e. the
    # biom-derived table appears to be sparse-backed; converting to a plain
    # float array keeps the comparisons and log10 below on ordinary NumPy data.
    means = samples.mean()
    return pd.Series(np.asarray(means, dtype=float), index=means.index)


# Define plotting setup for 1x4 layout with custom figure size
fig, axes = plt.subplots(1, 4, figsize=(15, 5), sharex=True, sharey=True)

# Order: Cape Town Healthy, Cape Town AD+, Umtata Healthy, Umtata AD+
# (area, is_AD, panel title, marker colour, axes index)
panel_info = [
    ('Cape Town', False, 'Cape Town Healthy', '#A7C7E7', 0),
    ('Cape Town', True,  'Cape Town AD+',     '#d2b48c', 1),
    ('Umtata',    False, 'Umtata Healthy',     '#ADD8E6', 2),
    ('Umtata',    True,  'Umtata AD+',        '#fa8072', 3)
]

# --- Label selected ASVs with short names and position adjustments ---
# Per panel index: ASV name -> (label, x multiplier, y multiplier). The
# multipliers act on the data coordinates, i.e. they are shifts on the log axes.
# Defined once here because it does not depend on the loop variables.
label_map = {
    0: {  # Cape Town Healthy
        'g__Streptococcus_ASV-1': ('Strep', 0.99, 1.6),
        'g__Staphylococcus_ASV-1': ('Staph', 0.99, 1.6)
    },
    1: {  # Cape Town AD+
        'g__Streptococcus_ASV-1': ('Strep', 0.99, 0.6),
        'g__Staphylococcus_ASV-1': ('Staph', 0.99, 1.6)
    },
    2: {  # Umtata Healthy
        'g__Streptococcus_ASV-1': ('Strep', 0.99, 0.6),
        'g__Staphylococcus_ASV-1': ('Staph', 0.99, 1.65)
    },
    3: {  # Umtata AD+
        'g__Streptococcus_ASV-1': ('Strep', 0.99, 0.6),
        'g__Staphylococcus_ASV-1': ('Staph', 0.99, 1.6)
    }
}

# Shared axis range (relative abundance), also used for the identity line
lims = [1e-5, 1e0]

for area, is_AD, title, color, idx in panel_info:
    # Feature-only tables for the skin and nares samples of this panel
    skin = get_group_data(panel_samples, area, is_AD, 'skin')
    nares = get_group_data(panel_samples, area, is_AD, 'nares')

    # Average across samples
    mean_skin = _mean_abundance(skin)
    mean_nares = _mean_abundance(nares)

    # Keep only features with a finite, strictly positive mean at BOTH sites.
    # These are exactly the features whose log10 is finite, and selecting them
    # before the transform avoids taking log10 of zero.
    mask = (
        np.isfinite(mean_skin) & np.isfinite(mean_nares)
        & (mean_skin > 0) & (mean_nares > 0)
    )
    mean_skin = mean_skin[mask]
    mean_nares = mean_nares[mask]
    n_features = int(mask.sum())

    # Log transform
    log_skin = np.log10(mean_skin)
    log_nares = np.log10(mean_nares)

    # Correlation (pearsonr needs at least two points; report NaN otherwise,
    # e.g. when a panel has no samples at one of the sites)
    if n_features >= 2:
        r, pval = stats.pearsonr(log_skin, log_nares)
    else:
        r, pval = np.nan, np.nan

    # Plot
    ax = axes[idx]
    ax.scatter(
        mean_nares,
        mean_skin,
        alpha=0.7,
        edgecolors='k',
        linewidths=0.5,
        color=color
    )

    # Identity line
    ax.plot(lims, lims, 'k--', alpha=0.75)
    ax.set_xscale('log')
    ax.set_yscale('log')
    ax.set_xlim(lims)
    ax.set_ylim(lims)
    ax.set_title(title, fontsize=16)
    ax.set_xlabel(f'Nares ({area})', fontsize=14)
    ax.set_ylabel(f'Skin ({area})', fontsize=14)

    ax.text(
        0.05, 0.95,
        f'Pearson r = {r:.2f}\n$p$ = {pval:.1e}',
        ha='left', va='top',
        transform=ax.transAxes,
        fontsize=11
    )

    # Hand-made legend entry: a marker plus text, both in axes coordinates
    ax.plot(0.05, 0.80, marker='o', markersize=6, color=color, transform=ax.transAxes)
    ax.text(
        0.07, 0.80,
        f'16S ASV (n={n_features})',
        ha='left', va='center',
        transform=ax.transAxes,
        fontsize=10
    )

    # Highlight and label the selected ASVs for this panel
    for asv, (short_label, x_shift_factor, y_shift_factor) in label_map.get(idx, {}).items():
        # After masking, both mean Series share the same index, and membership
        # already implies a positive, finite mean at both sites.
        if asv not in mean_skin.index:
            continue

        x = mean_nares[asv]
        y = mean_skin[asv]

        align = 'right' if x_shift_factor < 1 else 'left'

        # Draw thick black outline around the ASV point
        ax.scatter(
            [x], [y],
            s=80,
            facecolors='none',
            edgecolors='black',
            linewidths=1
        )

        # Label
        ax.text(
            x * x_shift_factor,
            y * y_shift_factor,
            short_label,
            fontsize=10,
            ha=align,
            va='center',
            color='black'
        )

# Final layout
plt.suptitle('Mean Relative Abundance of Taxa between Skin and Nares', fontsize=20, y=0.99)
plt.tight_layout(rect=[0, 0.02, 1, 1])
plt.savefig('../Figures/Main/Fig_4B.png', dpi=600)


  return sp_sum / ct
  sp_values = getattr(ufunc, method)(self.sp_values, **kwargs)
  sp_values = getattr(ufunc, method)(self.sp_values, **kwargs)
  return sp_sum / ct
  sp_values = getattr(ufunc, method)(self.sp_values, **kwargs)
  sp_values = getattr(ufunc, method)(self.sp_values, **kwargs)
