## Scatter plot showing the relative abundances of shared ASVs (present in both skin and nares) across paired samples. Each point represents a single ASV, with its average relative abundance in skin samples on the x-axis and nares samples on the y-axis.

In [479]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import biom
from biom import load_table
import numpy as np
import scipy.stats as stats
from scipy.stats import mannwhitneyu
from statsmodels.stats.multitest import multipletests
import itertools
import matplotlib.patches
from scipy.stats import pearsonr



In [480]:
# Load the metadata
metadata_path = '../Data/Metadata/updated_clean_ant_skin_metadata_microbiome_type.tab'
metadata = pd.read_csv(metadata_path, sep='\t')

metadata['#sample-id'] = metadata['#sample-id'].str.replace('_', '')
# Set Sample-ID as the index for the metadata dataframe 
metadata = metadata.set_index('#sample-id')


# Create group column based on case_type to simplify group names
metadata['group'] = metadata['case_type'].map({
    'case-lesional skin': 'skin-ADL',
    'case-nonlesional skin': 'skin-ADNL', 
    'control-nonlesional skin': 'skin-H',
    'case-anterior nares': 'nares-AD',
    'control-anterior nares': 'nares-H'
})

metadata

Unnamed: 0_level_0,PlateNumber,PlateLocation,i5,i5Sequence,i7,i7Sequence,identifier,Sequence,Plate ID,Well location,...,enrolment_date,enrolment_season,hiv_exposure,hiv_status,household_size,o_scorad,FWD_filepath,REV_filepath,group,microbiome_type
#sample-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ca009STL,1,A1,SA501,ATCGTACG,SA701,CGAGAGTT,SA701SA501,CGAGAGTT-ATCGTACG,1.010000e+21,A1,...,4/16/2015,Autumn,Unexposed,negative,4.0,40,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL,skin
900221,1,B1,SA502,ACTATCTG,SA701,CGAGAGTT,SA701SA502,CGAGAGTT-ACTATCTG,1.010000e+21,B1,...,8/11/2015,Winter,Unexposed,negative,7.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL,skin
Ca010EBL,1,C1,SA503,TAGCGAGT,SA701,CGAGAGTT,SA701SA503,CGAGAGTT-TAGCGAGT,1.010000e+21,C1,...,11/20/2014,Spring,Unexposed,negative,7.0,21,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL,skin
900460,1,D1,SA504,CTGCGTGT,SA701,CGAGAGTT,SA701SA504,CGAGAGTT-CTGCGTGT,1.010000e+21,D1,...,9/23/2015,Spring,Unexposed,,4.0,40,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL,skin
900051,1,E1,SA505,TCATCGAG,SA701,CGAGAGTT,SA701SA505,CGAGAGTT-TCATCGAG,1.010000e+21,E1,...,4/21/2015,Autumn,Unexposed,negative,7.0,41,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL,skin
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Ca006ONL2,6,H1,SA508,GACACCGT,SB701,CTCGACTT,SB701SA508,CTCGACTT-GACACCGT,1.010000e+21,H1,...,3/25/2015,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL,skin
Ca006ONNL,6,F2,SA506,CGTGAGTG,SB702,CGAAGTAT,SB702SA506,CGAAGTAT-CGTGAGTG,1.010000e+21,F2,...,3/25/2015,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADNL,skin
Ca006ONNL2,6,H2,SA508,GACACCGT,SB702,CGAAGTAT,SB702SA508,CGAAGTAT-GACACCGT,1.010000e+21,H2,...,3/25/2015,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADNL,skin
Ca006ONPN,6,F3,SA506,CGTGAGTG,SB703,TAGCAGCT,SB703SA506,TAGCAGCT-CGTGAGTG,1.010000e+21,F3,...,3/25/2015,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,nares-AD,nares


In [481]:
# Get participants with both skin and nares samples
skin_pids = metadata[metadata['microbiome_type'] == 'skin']['pid']
nares_pids = metadata[metadata['microbiome_type'] == 'nares']['pid']

# Find overlapping participants
shared_pids = set(skin_pids).intersection(set(nares_pids))

print(f"Number of participants with both skin and nares samples: {len(shared_pids)}")

# Display the participant IDs
print("\nParticipant IDs with both sample types:")
print(sorted(list(shared_pids)))
print(len(list(shared_pids)))


Number of participants with both skin and nares samples: 187

Participant IDs with both sample types:
['Ca-006-ON', 'Ca-007-NK', 'Ca-008-HN', 'Ca-009-ST', 'Ca-009-ZN', 'Ca-010-EB', 'Ca-011-LQ', 'Ca-013-NN', 'Ca-014-LB', 'Ca-015-AM', 'Ca-016-YT', 'Ca-017-OM', 'Ca-018-AS', 'Ca-019-EC', 'Ca-020-AZ', 'Ca-021-IM', 'Ca-022-MS', 'Ca-023-EJ', 'Ca-024-ZM', 'Ca-025-AC', 'Ca-026-KM', 'Ca-027-IM', 'Ca-028-PN', 'Ca-029-TD', 'Ca-030-LM', 'Ca-031-AN', 'Ca-032-LN', 'Ca-033-UD', 'Ca-034-LS', 'Ca-035-AR', 'Ca-036-TK', 'Ca-037-HM', 'Ca-042-AM', 'Ca-043-SS', 'Ca-045-HN', 'Ca-046-MT', 'Ca-047-MD', 'Ca-049-LM', 'Ca-050-MM', 'Ca-051-AM', 'Ca-052-LM', 'Ca-053-NN', 'Ca-054-RN', 'Ca-055-LN', 'Ca-056-DH', 'Ca-101-ID', 'Ca-103-BT', 'Ca-104-SB', 'Ca-105-LX', 'Ca-106-NM', 'Ca-107-BM', 'Ca-108-EJ', 'Ca-110-LN', 'Ca-111-IN', 'Ca-112-AM', 'Ca-113-MN', 'Ca-114-QT', 'Ca-115-OM', 'Ca-116-AN', 'Ca-117-SM', 'Ca-118-AS', 'Ca-119-AM', 'Ca-120-ST', 'Ca-121-BT', 'Ca-122-UK', 'Ca-123-PG', 'Ca-124-AB', 'Ca-125-IM', 'Ca-126-KB', 

In [482]:
# Read in table at ASV level
# biom_path = '../Data/Tables/Absolute_Abundance_Tables/feature_table_with_tax_labels_Genus.biom'
biom_path = '../Data/Tables/Absolute_Abundance_Tables/209766_filtered_by_prevalence_1pct_rare_Genus-ASV-non-collapse.biom'
biom_tbl = load_table(biom_path)
df = pd.DataFrame(biom_tbl.to_dataframe().T)

# delete the prefix from the index
df.index = df.index.str.replace('15564.', '')

# Map the 'microbiome_type', 'group', 'o_scorad' columns from metadata to df based on matching index
df['group'] = metadata.loc[df.index, 'group']
df['microbiome_type'] = metadata.loc[df.index, 'microbiome_type']
df['o_scorad'] = metadata.loc[df.index, 'o_scorad']
df['pid'] = metadata.loc[df.index, 'pid']

df

Unnamed: 0,g__Streptococcus_ASV-1,g__Streptococcus_ASV-2,g__Corynebacterium_ASV-1,g__Corynebacterium_ASV-3,g___ASV-3,g__Cutibacterium_ASV-1,g___ASV-18,g___ASV-28,g___ASV-26,g__Cutibacterium_ASV-2,...,g__Leptotrichia_A_993758_ASV-14,g__Capnocytophaga_820688_ASV-7,g__UBA6175_ASV-2,g__Comamonas_F_589250_ASV-4,g__Bosea_ASV-2,g___ASV-140,group,microbiome_type,o_scorad,pid
900344,188.0,115.0,23.0,19.0,2.0,2.0,1.0,0,0,0,...,0,0,0,0,0,0,skin-H,skin,,Co-130-MM
900459,20.0,37.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,nares-AD,nares,44,Ca-145-LM
900221,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,skin-ADL,skin,34,Ca-101-ID
900570,18.0,0,0,0,1.0,0,0,0,0,0,...,0,0,0,0,0,0,skin-ADNL,skin,36,Ca-046-MT
900092,221.0,116.0,3.0,1.0,0,0,0,0,1.0,0,...,0,0,0,0,0,0,nares-AD,nares,53,Ca-023-EJ
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9003972,45.0,20.0,1.0,0,1.0,21.0,0,0,1.0,18.0,...,0,0,0,0,0,1.0,skin-ADL,skin,54,Ca-125-IM
900097,3.0,0,0,0,0,6.0,0,0,0,1.0,...,0,0,0,0,0,0,skin-ADNL,skin,44,Ca-025-AC
900498,5.0,4.0,0,0,0,11.0,0,6.0,0,13.0,...,0,0,0,0,0,0,skin-ADNL,skin,,Ca-157-LM
900276,0,0,13.0,0,0,63.0,0,0,0,51.0,...,0,0,0,0,0,0,skin-ADL,skin,52,Ca-111-IN


In [483]:
# --- STEP 1: Prepare the data ---

# Separate features and metadata
features = df.drop(columns=['microbiome_type', 'group'])
metadata = df[['microbiome_type', 'group']]

# Normalize features to relative abundances
features_rel = features.div(features.sum(axis=1), axis=0)

# Add metadata back
features_rel['microbiome_type'] = metadata['microbiome_type']
features_rel['group'] = metadata['group']

# --- STEP 2: Define subsets ---

# AD group
skin_ADL = features_rel[features_rel['group'] == 'skin-ADL'].drop(columns=['microbiome_type', 'group'])
nares_AD = features_rel[features_rel['group'] == 'nares-AD'].drop(columns=['microbiome_type', 'group'])

# Healthy group
skin_H = features_rel[features_rel['group'] == 'skin-H'].drop(columns=['microbiome_type', 'group'])
nares_H = features_rel[features_rel['group'] == 'nares-H'].drop(columns=['microbiome_type', 'group'])

# --- STEP 3: Calculate mean relative abundances ---

mean_skin_ADL = skin_ADL.mean().astype(float)
mean_nares_AD = nares_AD.mean().astype(float)

mean_skin_H = skin_H.mean().astype(float)
mean_nares_H = nares_H.mean().astype(float)

# --- STEP 4: Make scatterplots ---

fig, axes = plt.subplots(1, 2, figsize=(10, 5))

# --- Scatterplot 1: AD group ---

scatter_df_AD = pd.DataFrame({
    'mean_skin': mean_skin_ADL,
    'mean_nares': mean_nares_AD
}).replace(0, 1e-6)

r_AD, pval_AD = stats.pearsonr(
    np.log10(scatter_df_AD['mean_nares']),
    np.log10(scatter_df_AD['mean_skin'])
)

# Plot
scatter_ad = axes[0].scatter(
    scatter_df_AD['mean_nares'],
    scatter_df_AD['mean_skin'],
    alpha=0.7,
    edgecolors='k',
    linewidths=0.5,
    color='#E31A1C'  # RED for AD
)

lims_AD = [1e-6, scatter_df_AD.max().max()]
axes[0].plot(lims_AD, lims_AD, 'k--', alpha=0.75)

axes[0].set_xscale('log')
axes[0].set_yscale('log')
axes[0].set_xlabel('Mean Relative Abundance in Nares-AD', fontsize=12)
axes[0].set_ylabel('Mean Relative Abundance in Skin-ADL', fontsize=12)
n_AD = metadata['group'].isin(['skin-ADL', 'nares-AD']).sum() // 2
axes[0].set_title(f'AD Individuals (n={n_AD})', fontsize=18)

# Pearson r and p-value
axes[0].text(
    0.05, 0.95,
    f'Pearson r = {r_AD:.2f}\n$p$ = {pval_AD:.2e}',
    ha='left', va='top',
    transform=axes[0].transAxes,
    fontsize=12
)

# Plot a red dot manually
axes[0].plot(
    0.05, 0.82,  # x, y in axis coordinates
    marker='o',
    markersize=6,
    color='#E31A1C',  # red dot for AD
    transform=axes[0].transAxes,
    clip_on=False
)

# Text next to dot
axes[0].text(
    0.07, 0.82,
    '16S ASV (n=798)',
    ha='left', va='center',
    transform=axes[0].transAxes,
    fontsize=11
)

# Add dot legend
# axes[0].legend(loc='lower right', frameon=False, markerscale=1.5)

# --- Scatterplot 2: Healthy group ---

scatter_df_H = pd.DataFrame({
    'mean_skin': mean_skin_H,
    'mean_nares': mean_nares_H
}).replace(0, 1e-6)

r_H, pval_H = stats.pearsonr(
    np.log10(scatter_df_H['mean_nares']),
    np.log10(scatter_df_H['mean_skin'])
)

scatter_h = axes[1].scatter(
    scatter_df_H['mean_nares'],
    scatter_df_H['mean_skin'],
    alpha=0.7,
    edgecolors='k',
    linewidths=0.5,
    color='#ADD8E6',  # light blue dot
)

lims_H = [1e-6, scatter_df_H.max().max()]
axes[1].plot(lims_H, lims_H, 'k--', alpha=0.75)

axes[1].set_xscale('log')
axes[1].set_yscale('log')
axes[1].set_xlabel('Mean Relative Abundance in Nares-H', fontsize=12)
axes[1].set_ylabel('Mean Relative Abundance in Skin-H', fontsize=12)
n_H = metadata['group'].isin(['skin-H', 'nares-H']).sum() // 2
axes[1].set_title(f'Healthy Individuals (n={n_H})', fontsize=18)

# Pearson r and p-value
axes[1].text(
    0.05, 0.95,
    f'Pearson r = {r_H:.2f}\n$p$ = {pval_H:.2e}',
    ha='left', va='top',
    transform=axes[1].transAxes,
    fontsize=12
)

# Plot a blue dot manually
axes[1].plot(
    0.05, 0.82,
    marker='o',
    markersize=6,
    color='#ADD8E6',  # blue dot for Healthy
    transform=axes[1].transAxes,
    clip_on=False
)

# Text next to dot
axes[1].text(
    0.07, 0.82,
    '16S ASV (n=798)',
    ha='left', va='center',
    transform=axes[1].transAxes,
    fontsize=11
)

# Add dot legend
# axes[1].legend(loc='lower right', frameon=False, markerscale=1.5)

# --- Final layout ---
plt.suptitle('Microbial Correlation between Skin and Nares by Disease Status', fontsize=18)
plt.tight_layout()
plt.savefig('../Plots/Analysis_figures/Individual_Analyses/ASV-correlation_scatterplots_AD_vs_H.png', dpi=600)

TypeError: agg function failed [how->sum,dtype->Sparse[object, 0]]