(A) Venn diagram showing the overlap in genera detected in paired nasal and skin microbiome samples across individuals. (B) Bar plot displaying the number of unique amplicon sequence variants (ASVs) assigned to each overlapping genus, highlighting the microbial diversity shared across sites.

In [16]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import biom
from biom import load_table
import numpy as np
from scipy.stats import pearsonr
from matplotlib_venn import venn2
import matplotlib.patches as mpatches


In [17]:
# Load the metadata
metadata_path = '../Data/Metadata/updated_clean_ant_skin_metadata_microbiome_type.tab'
metadata = pd.read_csv(metadata_path, sep='\t')

metadata['#sample-id'] = metadata['#sample-id'].str.replace('_', '')
# Set Sample-ID as the index for the metadata dataframe 
metadata = metadata.set_index('#sample-id')


# Create group column based on case_type to simplify group names
metadata['group'] = metadata['case_type'].map({
    'case-lesional skin': 'skin-ADL',
    'case-nonlesional skin': 'skin-ADNL', 
    'control-nonlesional skin': 'skin-H',
    'case-anterior nares': 'nares-AD',
    'control-anterior nares': 'nares-H'
})

metadata

Unnamed: 0_level_0,PlateNumber,PlateLocation,i5,i5Sequence,i7,i7Sequence,identifier,Sequence,Plate ID,Well location,...,enrolment_date,enrolment_season,hiv_exposure,hiv_status,household_size,o_scorad,FWD_filepath,REV_filepath,group,microbiome_type
#sample-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ca009STL,1,A1,SA501,ATCGTACG,SA701,CGAGAGTT,SA701SA501,CGAGAGTT-ATCGTACG,1.010000e+21,A1,...,4/16/2015,Autumn,Unexposed,negative,4.0,40,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL,skin
900221,1,B1,SA502,ACTATCTG,SA701,CGAGAGTT,SA701SA502,CGAGAGTT-ACTATCTG,1.010000e+21,B1,...,8/11/2015,Winter,Unexposed,negative,7.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL,skin
Ca010EBL,1,C1,SA503,TAGCGAGT,SA701,CGAGAGTT,SA701SA503,CGAGAGTT-TAGCGAGT,1.010000e+21,C1,...,11/20/2014,Spring,Unexposed,negative,7.0,21,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL,skin
900460,1,D1,SA504,CTGCGTGT,SA701,CGAGAGTT,SA701SA504,CGAGAGTT-CTGCGTGT,1.010000e+21,D1,...,9/23/2015,Spring,Unexposed,,4.0,40,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL,skin
900051,1,E1,SA505,TCATCGAG,SA701,CGAGAGTT,SA701SA505,CGAGAGTT-TCATCGAG,1.010000e+21,E1,...,4/21/2015,Autumn,Unexposed,negative,7.0,41,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL,skin
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Ca006ONL2,6,H1,SA508,GACACCGT,SB701,CTCGACTT,SB701SA508,CTCGACTT-GACACCGT,1.010000e+21,H1,...,3/25/2015,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL,skin
Ca006ONNL,6,F2,SA506,CGTGAGTG,SB702,CGAAGTAT,SB702SA506,CGAAGTAT-CGTGAGTG,1.010000e+21,F2,...,3/25/2015,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADNL,skin
Ca006ONNL2,6,H2,SA508,GACACCGT,SB702,CGAAGTAT,SB702SA508,CGAAGTAT-GACACCGT,1.010000e+21,H2,...,3/25/2015,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADNL,skin
Ca006ONPN,6,F3,SA506,CGTGAGTG,SB703,TAGCAGCT,SB703SA506,TAGCAGCT-CGTGAGTG,1.010000e+21,F3,...,3/25/2015,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,nares-AD,nares


In [18]:
# Read in table at collapsed genera  level
biom_path = '../Data/Tables/Absolute_Abundance_Tables/209766_filtered_by_prevalence_1pct_rare_Genus-ASV-non-collapse.biom'
# biom_path = '../Data/Tables/Absolute_Abundance_Tables/feature_table_with_tax_labels_Genus.biom'
# biom_path = '../Data/Tables/Absolute_Abundance_Tables/209766_filtered_by_prevalence_0pct_rare_Genus.biom'
biom_tbl = load_table(biom_path)
df = pd.DataFrame(biom_tbl.to_dataframe().T)

# delete the prefix from the index
df.index = df.index.str.replace('15564.', '')

df

Unnamed: 0,g__Streptococcus_ASV-1,g__Streptococcus_ASV-2,g__Corynebacterium_ASV-1,g__Corynebacterium_ASV-3,g___ASV-3,g__Cutibacterium_ASV-1,g___ASV-18,g___ASV-28,g___ASV-26,g__Cutibacterium_ASV-2,...,g__Copromonas_ASV-2,g__Microvirga_ASV-1,g__Leptotrichia_A_993758_ASV-19,g___ASV-104,g__Leptotrichia_A_993758_ASV-14,g__Capnocytophaga_820688_ASV-7,g__UBA6175_ASV-2,g__Comamonas_F_589250_ASV-4,g__Bosea_ASV-2,g___ASV-140
900344,188.0,115.0,23.0,19.0,2.0,2.0,1.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
900459,20.0,37.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
900221,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
900570,18.0,0,0,0,1.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
900092,221.0,116.0,3.0,1.0,0,0,0,0,1.0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9003972,45.0,20.0,1.0,0,1.0,21.0,0,0,1.0,18.0,...,0,0,0,0,0,0,0,0,0,1.0
900097,3.0,0,0,0,0,6.0,0,0,0,1.0,...,0,0,0,0,0,0,0,0,0,0
900498,5.0,4.0,0,0,0,11.0,0,6.0,0,13.0,...,0,0,0,0,0,0,0,0,0,0
900276,0,0,13.0,0,0,63.0,0,0,0,51.0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# Map pid, group, and microbiome_type from metadata to df based on matching indexes
df['pid'] = metadata.loc[df.index, 'pid']
df['group'] = metadata.loc[df.index, 'group']
df['microbiome_type'] = metadata.loc[df.index, 'microbiome_type']
df

Unnamed: 0,g__Streptococcus_ASV-1,g__Streptococcus_ASV-2,g__Corynebacterium_ASV-1,g__Corynebacterium_ASV-3,g___ASV-3,g__Cutibacterium_ASV-1,g___ASV-18,g___ASV-28,g___ASV-26,g__Cutibacterium_ASV-2,...,g___ASV-104,g__Leptotrichia_A_993758_ASV-14,g__Capnocytophaga_820688_ASV-7,g__UBA6175_ASV-2,g__Comamonas_F_589250_ASV-4,g__Bosea_ASV-2,g___ASV-140,pid,group,microbiome_type
900344,188.0,115.0,23.0,19.0,2.0,2.0,1.0,0,0,0,...,0,0,0,0,0,0,0,Co-130-MM,skin-H,skin
900459,20.0,37.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,Ca-145-LM,nares-AD,nares
900221,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,Ca-101-ID,skin-ADL,skin
900570,18.0,0,0,0,1.0,0,0,0,0,0,...,0,0,0,0,0,0,0,Ca-046-MT,skin-ADNL,skin
900092,221.0,116.0,3.0,1.0,0,0,0,0,1.0,0,...,0,0,0,0,0,0,0,Ca-023-EJ,nares-AD,nares
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9003972,45.0,20.0,1.0,0,1.0,21.0,0,0,1.0,18.0,...,0,0,0,0,0,0,1.0,Ca-125-IM,skin-ADL,skin
900097,3.0,0,0,0,0,6.0,0,0,0,1.0,...,0,0,0,0,0,0,0,Ca-025-AC,skin-ADNL,skin
900498,5.0,4.0,0,0,0,11.0,0,6.0,0,13.0,...,0,0,0,0,0,0,0,Ca-157-LM,skin-ADNL,skin
900276,0,0,13.0,0,0,63.0,0,0,0,51.0,...,0,0,0,0,0,0,0,Ca-111-IN,skin-ADL,skin


In [20]:
print(f'Number of unique individuals: ' + str(df['pid'].nunique()))

Number of unique individuals: 197


In [21]:
# # Get taxa columns (all except pid and microbiome_type)
# taxa_cols = df.columns[:-2]

# # Convert abundance table to presence/absence
# presence_absence = df[taxa_cols] > 0
# presence_absence
# # Calculate percentage of TRUE values for each taxon
# presence_percentages = (presence_absence.sum() / len(presence_absence)) * 100

# # Get taxa that are present in at least 10% of samples
# features_10_prev = presence_percentages[presence_percentages >= .2].index.tolist()
# print(f'Number of taxa present in at least 10% of samples: {len(features_10_prev)}')

In [22]:
# presence_absence

In [23]:
# # Calculate percentage of TRUE values for each column
# percent_true = (presence_absence.sum() / len(presence_absence)) * 100
# percent_true.to_csv('percent_true.csv')
# # Count columns with >= 10% TRUE values 
# # num_cols_10pct = (percent_true >= 10).sum()

# # print(f'Number of taxa present in at least 10% of samples: {num_cols_10pct}')


In [24]:
# # Filter taxa columns to only include features with >=10% prevalence
# ### note: would be good to check how confident these taxa assignments were (should be 100% due to gg2 mapping)
# ### note: probably need to check confidence of ASV assignments
# taxa_cols_filtered = df[features_10_prev]
# taxa_cols_filtered

In [25]:
# Get taxa columns (all except pid, groupmicrobiome_type)
taxa_cols = df.columns[:-3]


# # Filter taxa_cols_filtered to keep only prevalent taxa + metadata
# df_filtered = taxa_cols_filtered[features_10_prev + ['pid', 'microbiome_type']]

# Split into skin and nares samples
skin_samples = df[df['microbiome_type'] == 'skin']
nares_samples = df[df['microbiome_type'] == 'nares']

# Find individuals with both skin and nares samples
shared_pids = set(skin_samples['pid']) & set(nares_samples['pid'])

# Initialize sets for taxa present in each body site
skin_taxa = set()
nares_taxa = set()
overlapping_taxa = set()

# For each individual with both sample types
for pid in shared_pids:
    # Get samples for this individual
    skin_sample = skin_samples[skin_samples['pid'] == pid]
    nares_sample = nares_samples[nares_samples['pid'] == pid]
    
    # Find taxa present (abundance > 0) in each sample type
    skin_present = set(taxa_cols[skin_sample[taxa_cols].gt(0).any()])
    nares_present = set(taxa_cols[nares_sample[taxa_cols].gt(0).any()])
    
    # Add to overall sets
    skin_taxa.update(skin_present)
    nares_taxa.update(nares_present)
    # Add taxa present in both sites for this individual
    overlapping_taxa.update(skin_present & nares_present)

# Plot Venn diagram
plt.figure(figsize=(6, 6))
v = venn2(
    [skin_taxa, nares_taxa],
    set_labels=('', ''),
    set_colors=('#2a00ff', '#ffa501'),
    alpha=0.5
)

# Adjust font size for subset labels
for label in v.subset_labels:
    if label:
        label.set_fontsize(16)

# Customize circle outlines to match fill color
region_colors = {
    '10': '#2a00ff',    # Skin only
    '01': '#ffa501',     # Nares only
    '11': '#955280'   # Overlap
}
for region_id, color in region_colors.items():
    patch = v.get_patch_by_id(region_id)
    if patch:
        patch.set_linewidth(2)
        patch.set_edgecolor(color)

# Create matching legend
# Create matching legend patches (colors match Venn regions)
handles = [
    mpatches.Patch(color='#2a00ff', alpha=0.5, label='Skin only'),
    mpatches.Patch(color='#955280', alpha=0.5, label='Both'), 
    mpatches.Patch(color='#ffa501', alpha=0.5, label='Nares only')
    
]

# Add legend with specified font size
plt.legend(
    handles=handles,
    loc='lower center',
    bbox_to_anchor=(0.5, -0.05),
    ncol=3,
    frameon=False,
    fontsize=16  # Adjust font size here
)

plt.suptitle('Unique ASV Overlap between Skin and Nares', fontsize=18)
plt.title('(Paired Per-Individual and (>1% Sample Prevalence))', fontsize=16)

plt.tight_layout()
plt.savefig('../Plots/Analysis_figures/Individual_Analyses/skin-nares_Venn_individual.png', dpi=600)

# Print stats
print(f"Number of taxa unique to skin: {len(skin_taxa - nares_taxa)}")
print(f"Number of taxa unique to nares: {len(nares_taxa - skin_taxa)}")
print(f"Number of taxa shared between skin and nares: {len(skin_taxa & nares_taxa)}")
print(f"Number of unique individuals: {len(shared_pids)}")

Number of taxa unique to skin: 266
Number of taxa unique to nares: 16
Number of taxa shared between skin and nares: 505
Number of unique individuals: 187


In [26]:
import matplotlib.pyplot as plt
from matplotlib_venn import venn2
import matplotlib.patches as mpatches

# --- STEP 1: Prepare ---

# Only keep numeric columns (i.e., ASV features)
taxa_cols = df.select_dtypes(include=[np.number]).columns.tolist()


# Split into skin and nares samples
skin_samples = df[df['microbiome_type'] == 'skin']
nares_samples = df[df['microbiome_type'] == 'nares']

# --- STEP 2: Separate by disease group ---

# AD individuals
skin_ADL_samples = skin_samples[skin_samples['group'] == 'skin-ADL']
nares_AD_samples = nares_samples[nares_samples['group'] == 'nares-AD']

# Healthy individuals
skin_H_samples = skin_samples[skin_samples['group'] == 'skin-H']
nares_H_samples = nares_samples[nares_samples['group'] == 'nares-H']

# --- STEP 3: Find taxa for each group ---

# For AD
skin_present_AD = skin_ADL_samples[taxa_cols].gt(0).any()
nares_present_AD = nares_AD_samples[taxa_cols].gt(0).any()

skin_taxa_AD = set(skin_present_AD[skin_present_AD].index)
nares_taxa_AD = set(nares_present_AD[nares_present_AD].index)

# For Healthy
skin_present_H = skin_H_samples[taxa_cols].gt(0).any()
nares_present_H = nares_H_samples[taxa_cols].gt(0).any()

skin_taxa_H = set(skin_present_H[skin_present_H].index)
nares_taxa_H = set(nares_present_H[nares_present_H].index)


# --- STEP 4: Plot Venn diagrams ---

fig, axes = plt.subplots(1, 2, figsize=(12, 6))

# Colors
skin_color = '#2a00ff'
nares_color = '#ffa501'
overlap_color = '#955280'

# --- Venn for AD Individuals ---
plt.sca(axes[0])  # set current axis
v_AD = venn2(
    [skin_taxa_AD, nares_taxa_AD],
    set_labels=('', ''),
    set_colors=(skin_color, nares_color),
    alpha=0.5
)

# Customize font sizes
for label in v_AD.subset_labels:
    if label:
        label.set_fontsize(16)

# Customize circle outlines
region_colors = {
    '10': skin_color,
    '01': nares_color,
    '11': overlap_color
}
for region_id, color in region_colors.items():
    patch = v_AD.get_patch_by_id(region_id)
    if patch:
        patch.set_linewidth(2)
        patch.set_edgecolor(color)

# Title for AD
n_AD = skin_ADL_samples['pid'].nunique()  # Number of unique AD individuals
# axes[0].set_title(f'AD Individuals (n={n_AD})', fontsize=18)
axes[0].set_title(f'AD Individuals (n=107)', fontsize=20, y= 0.94)

# --- Venn for Healthy Individuals ---
plt.sca(axes[1])  # set current axis
v_H = venn2(
    [skin_taxa_H, nares_taxa_H],
    set_labels=('', ''),
    set_colors=(skin_color, nares_color),
    alpha=0.5
)

for label in v_H.subset_labels:
    if label:
        label.set_fontsize(16)

for region_id, color in region_colors.items():
    patch = v_H.get_patch_by_id(region_id)
    if patch:
        patch.set_linewidth(2)
        patch.set_edgecolor(color)

# Title for Healthy
n_H = skin_H_samples['pid'].nunique()     # Number of unique Healthy individuals
# axes[1].set_title(f'Healthy Individuals (n={n_H})', fontsize=18)
axes[1].set_title(f'Healthy Individuals (n=88)', fontsize=20, y= 0.94)

# --- Add Legend ---
handles = [
    mpatches.Patch(color=skin_color, alpha=0.5, label='Skin only'),
    mpatches.Patch(color=overlap_color, alpha=0.5, label='Both'),
    mpatches.Patch(color=nares_color, alpha=0.5, label='Nares only')
]

fig.legend(
    handles=handles,
    loc='lower center',
    bbox_to_anchor=(0.5, 0),
    ncol=3,
    frameon=False,
    fontsize=16
)


# --- Final layout tweaks ---
plt.tight_layout()
plt.subplots_adjust(top=0.9, bottom=0.1)  # Shift top lower and bottom higher

# --- Supertitle and subtitle ---
fig.suptitle('Paired Per-Individual ASV Overlap Between Skin and Nares by Disease Status', fontsize=20, y=0.99)
# fig.text(
#     0.5, 0.91,  # x=centered, y=slightly lower
#     'Paired Per-Individual and (>1% Sample Prevalence)',
#     ha='center',
#     fontsize=14
# )

plt.savefig('../Plots/Analysis_figures/Individual_Analyses/skin-nares_Venn_AD_vs_H.png', dpi=600)

# --- Print numbers ---
print("AD Individuals:")
print(f" - Skin only: {len(skin_taxa_AD - nares_taxa_AD)}")
print(f" - Nares only: {len(nares_taxa_AD - skin_taxa_AD)}")
print(f" - Shared: {len(skin_taxa_AD & nares_taxa_AD)}\n")

print("Healthy Individuals:")
print(f" - Skin only: {len(skin_taxa_H - nares_taxa_H)}")
print(f" - Nares only: {len(nares_taxa_H - skin_taxa_H)}")
print(f" - Shared: {len(skin_taxa_H & nares_taxa_H)}")
print(f"Number of unique individuals: {len(shared_pids)}")

AD Individuals:
 - Skin only: 247
 - Nares only: 52
 - Shared: 381

Healthy Individuals:
 - Skin only: 343
 - Nares only: 63
 - Shared: 260
Number of unique individuals: 187


In [27]:
# Taxa shared between skin and nares in each group
shared_AD = skin_taxa_AD & nares_taxa_AD
shared_H = skin_taxa_H & nares_taxa_H

# Taxa shared in AD but not in healthy
unique_to_AD = shared_AD - shared_H

# Print number and names
print(f"Taxa shared in skin & nares of AD but not in healthy: {len(unique_to_AD)}")
print(unique_to_AD)


Taxa shared in skin & nares of AD but not in healthy: 190
{'g__Carnobacterium_A_320743_ASV-1', 'g___ASV-92', 'g__Rhodoferax_A_585629_ASV-1', 'g__Pauljensenia_ASV-5', 'g__Leptotrichia_A_993758_ASV-5', 'g___ASV-64', 'g__Alloprevotella_ASV-2', 'g__Bacteroides_H_ASV-3', 'g__Corynebacterium_ASV-18', 'g___ASV-86', 'g__Prevotella_ASV-8', 'g__Pseudomonas_E_647464_ASV-1', 'g__Streptococcus_ASV-10', 'g__Exiguobacterium_A_ASV-3', 'g___ASV-139', 'g__Alloprevotella_ASV-3', 'g__Abiotrophia_ASV-2', 'g__Mesorhizobium_F_498388_ASV-2', 'g__Acinetobacter_ASV-8', 'g__Prevotella_ASV-41', 'g__Corynebacterium_ASV-20', 'g__Sphingomonas_L_486704_ASV-3', 'g__Mucilaginibacter_A_ASV-1', 'g___ASV-24', 'g__Microbacterium_A_383184_ASV-1', 'g__Mediterraneibacter_A_155507_ASV-1', 'g__Dorea_A_ASV-2', 'g___ASV-90', 'g__Brevibacterium_ASV-4', 'g__Veillonella_A_ASV-8', 'g___ASV-114', 'g__Lautropia_ASV-1', 'g__Actinomyces_ASV-4', 'g__Anaerococcus_ASV-9', 'g__Paracoccus_ASV-2', 'g__Paracoccus_ASV-6', 'g__Gemmiger_A_73129_AS

In [28]:
# Taxa only in skin (not in nares)
skin_only_taxa = skin_taxa - nares_taxa

# Taxa only in nares (not in skin)
nares_only_taxa = nares_taxa - skin_taxa

# Taxa shared between both
shared_taxa = overlapping_taxa

# Print them
# print("\n--- Taxa unique to skin ---")
# for taxon in sorted(skin_only_taxa):
#     print(taxon)

print("\n--- Taxa unique to nares ---")
for taxon in sorted(nares_only_taxa):
    print(taxon)

print("\n--- Taxa shared between skin and nares ---")
for taxon in sorted(shared_taxa):
    print(taxon)



--- Taxa unique to nares ---
g__Bacteroides_H_ASV-5
g__Clostridium_P_ASV-2
g__Leptotrichia_A_993758_ASV-11
g__Leptotrichia_A_993758_ASV-12
g__Leptotrichia_A_993758_ASV-17
g__Paraprevotella_ASV-1
g__Prevotella_ASV-35
g__Prevotella_ASV-42
g__Prevotella_ASV-44
g__Streptococcus_ASV-12
g__Streptococcus_ASV-13
g__UMGS1994_ASV-2
g___ASV-131
g___ASV-141
g___ASV-145
g___ASV-95

--- Taxa shared between skin and nares ---
g__Acetobacter_ASV-1
g__Acinetobacter_ASV-1
g__Acinetobacter_ASV-16
g__Acinetobacter_ASV-2
g__Acinetobacter_ASV-3
g__Acinetobacter_ASV-4
g__Acinetobacter_ASV-5
g__Acinetobacter_ASV-7
g__Acinetobacter_ASV-8
g__Actinomyces_ASV-1
g__Actinomyces_ASV-2
g__Actinomyces_ASV-3
g__Aeromonas_ASV-1
g__Agathobacter_164117_ASV-1
g__Aggregatibacter_736122_ASV-1
g__Aliicoccus_ASV-1
g__Alloprevotella_ASV-1
g__Alloprevotella_ASV-2
g__Anaerobutyricum_ASV-1
g__Anaerostipes_ASV-1
g__Bifidobacterium_388775_ASV-1
g__Bifidobacterium_388775_ASV-3
g__Bifidobacterium_388775_ASV-4
g__Bifidobacterium_38877

In [29]:
from collections import defaultdict

# Initialize structure: subject_taxa_dict[pid]['skin'] or ['nares'] = set of taxa
subject_taxa_dict = defaultdict(lambda: {'skin': set(), 'nares': set()})

for _, row in df.iterrows():
    subject = row['pic']
    site = row['site']  # must be 'skin' or 'nares'
    taxa_set = set(row['taxa'])  # make sure this is already a list or string split into taxa
    subject_taxa_dict[subject][site] |= taxa_set
subject_taxa_dict

KeyError: 'pic'

In [None]:
# Define subject lists by group
ad_subjects = [s for s in subject_taxa_dict if 'AD' in s]
healthy_subjects = [s for s in subject_taxa_dict if 'H' in s]

print(f"AD subjects: {len(ad_subjects)}")
print(f"Healthy subjects: {len(healthy_subjects)}")


In [14]:

import random
import numpy as np
import matplotlib.pyplot as plt

# Number of bootstrap iterations
n_iterations = 1000

# Equalize group sizes
n = min(len(ad_subjects), len(healthy_subjects))

bootstrap_counts = []

for _ in range(n_iterations):
    # Randomly sample n subjects from each group
    ad_sample = random.sample(ad_subjects, n)
    h_sample = random.sample(healthy_subjects, n)

    # Get taxa per site and group for sampled individuals
    ad_skin_taxa = set.union(*[subject_taxa_dict[s]['skin'] for s in ad_sample])
    ad_nares_taxa = set.union(*[subject_taxa_dict[s]['nares'] for s in ad_sample])
    h_skin_taxa = set.union(*[subject_taxa_dict[s]['skin'] for s in h_sample])
    h_nares_taxa = set.union(*[subject_taxa_dict[s]['nares'] for s in h_sample])

    # Shared taxa in each group
    shared_ad = ad_skin_taxa & ad_nares_taxa
    shared_h = h_skin_taxa & h_nares_taxa

    # Taxa shared only in AD
    unique_to_ad = shared_ad - shared_h

    # Record count
    bootstrap_counts.append(len(unique_to_ad))

# Summary statistics
mean_count = np.mean(bootstrap_counts)
ci_low, ci_high = np.percentile(bootstrap_counts, [2.5, 97.5])

print(f"Mean AD-specific shared taxa (bootstrapped): {mean_count:.1f}")
print(f"95% CI: ({ci_low:.1f}, {ci_high:.1f})")

# Optional: visualize
plt.hist(bootstrap_counts, bins=30, color='skyblue', edgecolor='black')
plt.axvline(mean_count, color='red', linestyle='--', label='Mean')
plt.xlabel('Shared taxa unique to AD')
plt.ylabel('Frequency')
plt.title('Bootstrapped distribution of AD-only shared taxa')
plt.legend()
plt.tight_layout()
plt.savefig("bootstrap_shared_taxa_AD.png", dpi=300)


NameError: name 'ad_subjects' is not defined