<!-- # ASV overlap Venn Diagram -->

In [21]:
# Import Python packages
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import biom
from biom import load_table
import numpy as np
from scipy.stats import pearsonr
from matplotlib_venn import venn2
import matplotlib.patches as mpatches
from scipy.stats import mannwhitneyu
from biom import Table
from gemelli.rpca import rpca
from scipy.spatial.distance import euclidean
import re
import matplotlib.colors as mcolors

In [22]:
# Read the tab-separated sample metadata
metadata_path = '../Data/Metadata/updated_clean_ant_skin_metadata_microbiome_type.tab'
metadata = pd.read_csv(metadata_path, sep='\t')

# Strip underscores from the sample IDs, then use the IDs as the row index
metadata['#sample-id'] = metadata['#sample-id'].str.replace('_', '')
metadata = metadata.set_index('#sample-id')

# Short group label for each case_type value (case_type values not listed here map to NaN)
case_type_to_group = {
    'case-lesional skin': 'skin-ADL',
    'case-nonlesional skin': 'skin-ADNL',
    'control-nonlesional skin': 'skin-H',
    'case-anterior nares': 'nares-AD',
    'control-anterior nares': 'nares-H',
}
metadata['group'] = metadata['case_type'].map(case_type_to_group)

# AD status is read from the part of the group label after the last '-':
# labels starting with 'AD' (ADL, ADNL, AD) become 'AD', everything else becomes 'H'
metadata['ad_status'] = [
    'AD' if group_label.split('-')[-1].startswith('AD') else 'H'
    for group_label in metadata['group']
]

metadata

Unnamed: 0_level_0,PlateNumber,PlateLocation,i5,i5Sequence,i7,i7Sequence,identifier,Sequence,Plate ID,Well location,...,enrolment_season,hiv_exposure,hiv_status,household_size,o_scorad,FWD_filepath,REV_filepath,group,microbiome_type,ad_status
#sample-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ca009STL,1,A1,SA501,ATCGTACG,SA701,CGAGAGTT,SA701SA501,CGAGAGTT-ATCGTACG,1.010000e+21,A1,...,Autumn,Unexposed,negative,4.0,40,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL,skin,AD
900221,1,B1,SA502,ACTATCTG,SA701,CGAGAGTT,SA701SA502,CGAGAGTT-ACTATCTG,1.010000e+21,B1,...,Winter,Unexposed,negative,7.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL,skin,AD
Ca010EBL,1,C1,SA503,TAGCGAGT,SA701,CGAGAGTT,SA701SA503,CGAGAGTT-TAGCGAGT,1.010000e+21,C1,...,Spring,Unexposed,negative,7.0,21,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL,skin,AD
900460,1,D1,SA504,CTGCGTGT,SA701,CGAGAGTT,SA701SA504,CGAGAGTT-CTGCGTGT,1.010000e+21,D1,...,Spring,Unexposed,,4.0,40,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL,skin,AD
900051,1,E1,SA505,TCATCGAG,SA701,CGAGAGTT,SA701SA505,CGAGAGTT-TCATCGAG,1.010000e+21,E1,...,Autumn,Unexposed,negative,7.0,41,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL,skin,AD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Ca006ONL2,6,H1,SA508,GACACCGT,SB701,CTCGACTT,SB701SA508,CTCGACTT-GACACCGT,1.010000e+21,H1,...,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL,skin,AD
Ca006ONNL,6,F2,SA506,CGTGAGTG,SB702,CGAAGTAT,SB702SA506,CGAAGTAT-CGTGAGTG,1.010000e+21,F2,...,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADNL,skin,AD
Ca006ONNL2,6,H2,SA508,GACACCGT,SB702,CGAAGTAT,SB702SA508,CGAAGTAT-GACACCGT,1.010000e+21,H2,...,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADNL,skin,AD
Ca006ONPN,6,F3,SA506,CGTGAGTG,SB703,TAGCAGCT,SB703SA506,TAGCAGCT-CGTGAGTG,1.010000e+21,F3,...,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,nares-AD,nares,AD


In [23]:
# Read the ASV-level feature table (file name indicates: 1% prevalence filter, rarefied,
# ASVs labelled with their genus and not collapsed)
taxa_level = 'Genus'
biom_path = f'../Data/Tables/Absolute_Abundance_Tables/209766_filtered_by_prevalence_1pct_rare_{taxa_level}-ASV-non-collapse.biom'

biom_tbl = load_table(biom_path)

# Transpose the BIOM dataframe so that rows are samples and columns are ASVs
df = biom_tbl.to_dataframe().T

# Delete the '15564.' prefix from the sample IDs so they can be looked up in the metadata index.
# regex=False treats the '.' as a literal dot on every pandas version; under regex matching
# it would match any character.
df.index = df.index.str.replace('15564.', '', regex=False)

df

Unnamed: 0,g__Streptococcus_ASV-1,g__Streptococcus_ASV-2,g__Corynebacterium_ASV-1,g__Corynebacterium_ASV-4,g___ASV-5,g__Cutibacterium_ASV-1,g___ASV-33,g___ASV-30,g___ASV-22,g__Cutibacterium_ASV-2,...,g__Copromonas_ASV-3,g__Microvirga_ASV-1,g__Leptotrichia_A_993758_ASV-17,g___ASV-103,g__Leptotrichia_A_993758_ASV-13,g__Capnocytophaga_820688_ASV-8,g__UBA6175_ASV-2,g__Comamonas_F_589250_ASV-3,g__Bosea_ASV-2,g___ASV-145
900344,984.0,611.0,114.0,82.0,22.0,8.0,8.0,6.0,3.0,2.0,...,0,0,0,0,0,0,0,0,0,0
900459,118.0,106.0,0,0,0,0,0,0,0,3.0,...,0,0,0,0,0,0,0,0,0,0
900221,22.0,0,0,0,0,16.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
900570,389.0,0,0,0,8.0,11.0,0,0,0,5.0,...,0,0,0,0,0,0,0,0,0,0
900092,3106.0,1707.0,59.0,32.0,3.0,0,0,0,7.0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9003972,1168.0,593.0,16.0,0,28.0,736.0,0,0,36.0,388.0,...,0,0,0,0,0,0,0,0,0,17.0
900097,24.0,0,0,0,0,33.0,0,0,0,12.0,...,0,0,0,0,0,0,0,0,0,0
900498,15.0,17.0,0,0,0,34.0,0,14.0,0,25.0,...,0,0,0,0,0,0,0,0,0,0
900276,0,0,30.0,0,0,151.0,0,0,0,79.0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
# Copy the per-sample pid, group and microbiome_type annotations from the metadata,
# looked up by the sample IDs in df's index
for annotation in ('pid', 'group', 'microbiome_type'):
    df[annotation] = metadata.loc[df.index, annotation]
df

Unnamed: 0,g__Streptococcus_ASV-1,g__Streptococcus_ASV-2,g__Corynebacterium_ASV-1,g__Corynebacterium_ASV-4,g___ASV-5,g__Cutibacterium_ASV-1,g___ASV-33,g___ASV-30,g___ASV-22,g__Cutibacterium_ASV-2,...,g___ASV-103,g__Leptotrichia_A_993758_ASV-13,g__Capnocytophaga_820688_ASV-8,g__UBA6175_ASV-2,g__Comamonas_F_589250_ASV-3,g__Bosea_ASV-2,g___ASV-145,pid,group,microbiome_type
900344,984.0,611.0,114.0,82.0,22.0,8.0,8.0,6.0,3.0,2.0,...,0,0,0,0,0,0,0,Co-130-MM,skin-H,skin
900459,118.0,106.0,0,0,0,0,0,0,0,3.0,...,0,0,0,0,0,0,0,Ca-145-LM,nares-AD,nares
900221,22.0,0,0,0,0,16.0,0,0,0,0,...,0,0,0,0,0,0,0,Ca-101-ID,skin-ADL,skin
900570,389.0,0,0,0,8.0,11.0,0,0,0,5.0,...,0,0,0,0,0,0,0,Ca-046-MT,skin-ADNL,skin
900092,3106.0,1707.0,59.0,32.0,3.0,0,0,0,7.0,0,...,0,0,0,0,0,0,0,Ca-023-EJ,nares-AD,nares
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9003972,1168.0,593.0,16.0,0,28.0,736.0,0,0,36.0,388.0,...,0,0,0,0,0,0,17.0,Ca-125-IM,skin-ADL,skin
900097,24.0,0,0,0,0,33.0,0,0,0,12.0,...,0,0,0,0,0,0,0,Ca-025-AC,skin-ADNL,skin
900498,15.0,17.0,0,0,0,34.0,0,14.0,0,25.0,...,0,0,0,0,0,0,0,Ca-157-LM,skin-ADNL,skin
900276,0,0,30.0,0,0,151.0,0,0,0,79.0,...,0,0,0,0,0,0,0,Ca-111-IN,skin-ADL,skin


In [25]:
# Count the distinct participant IDs among the samples in the table
n_individuals = df['pid'].nunique()
print('Number of unique individuals: ' + str(n_individuals))

Number of unique individuals: 197


<!-- ## Combined Venn Diagram with all samples, all individuals -->

In [26]:
# Taxa (ASV) columns: every column except the three annotation columns.
# Dropping them by name does not depend on where they sit in the frame.
taxa_cols = df.columns.drop(['pid', 'group', 'microbiome_type'])

# Split into skin and nares samples
skin_samples = df[df['microbiome_type'] == 'skin']
nares_samples = df[df['microbiome_type'] == 'nares']

# Individuals that have at least one skin sample and at least one nares sample
shared_pids = set(skin_samples['pid']) & set(nares_samples['pid'])

# skin_taxa / nares_taxa: ASVs detected at that site in any paired individual (pooled).
# overlapping_taxa: ASVs detected at both sites of the same individual at least once.
skin_taxa = set()
nares_taxa = set()
overlapping_taxa = set()

for pid in shared_pids:
    # All samples of this individual, per body site
    skin_sample = skin_samples[skin_samples['pid'] == pid]
    nares_sample = nares_samples[nares_samples['pid'] == pid]

    # ASVs with abundance > 0 in any of the individual's samples at each site
    skin_present = set(taxa_cols[skin_sample[taxa_cols].gt(0).any()])
    nares_present = set(taxa_cols[nares_sample[taxa_cols].gt(0).any()])

    skin_taxa.update(skin_present)
    nares_taxa.update(nares_present)
    overlapping_taxa.update(skin_present & nares_present)

# --- Plot Venn diagram of the pooled sets ---
skin_color, nares_color, overlap_color = '#2a00ff', '#ffa501', '#955280'

fig, ax = plt.subplots(figsize=(6, 6))
v = venn2(
    [skin_taxa, nares_taxa],
    set_labels=('', ''),
    set_colors=(skin_color, nares_color),
    alpha=0.5,
    ax=ax,
)

# Enlarge the region counts (a label is None when its region is empty)
for label in v.subset_labels:
    if label:
        label.set_fontsize(16)

# Outline every region in its own color (a patch is None when its region is empty)
region_colors = {
    '10': skin_color,     # Skin only
    '01': nares_color,    # Nares only
    '11': overlap_color,  # Overlap
}
for region_id, color in region_colors.items():
    patch = v.get_patch_by_id(region_id)
    if patch:
        patch.set_linewidth(2)
        patch.set_edgecolor(color)

# Legend patches use the same colors as the Venn regions
handles = [
    mpatches.Patch(color=skin_color, alpha=0.5, label='Skin only'),
    mpatches.Patch(color=overlap_color, alpha=0.5, label='Both'),
    mpatches.Patch(color=nares_color, alpha=0.5, label='Nares only'),
]
ax.legend(
    handles=handles,
    loc='lower center',
    bbox_to_anchor=(0.5, -0.05),
    ncol=3,
    frameon=False,
    fontsize=16,
)

fig.suptitle('Unique ASV Overlap between Skin and Nares', fontsize=18)
# Subtitle previously had unbalanced nested parentheses
ax.set_title('(Paired Per-Individual, >1% Sample Prevalence)', fontsize=16)

fig.tight_layout()
fig.savefig('../Plots/Analysis_figures/Co-occurrence/skin-nares_Venn_individual.png', dpi=600)

# Print stats
print(f"Number of taxa unique to skin: {len(skin_taxa - nares_taxa)}")
print(f"Number of taxa unique to nares: {len(nares_taxa - skin_taxa)}")
print(f"Number of taxa shared between skin and nares: {len(skin_taxa & nares_taxa)}")
# The pooled intersection above also counts ASVs found on one person's skin and another
# person's nares; this line counts only within-individual sharing.
print(f"Number of taxa shared within at least one individual: {len(overlapping_taxa)}")
print(f"Number of unique individuals: {len(shared_pids)}")

Number of taxa unique to skin: 98
Number of taxa unique to nares: 1
Number of taxa shared between skin and nares: 699
Number of unique individuals: 187


<!-- ## Separate Venn Diagram by AD status -->

In [27]:
# --- STEP 1: Prepare ---
# The numeric columns are the ASV features; pid/group/microbiome_type are excluded by dtype
taxa_cols = df.select_dtypes(include=[np.number]).columns.tolist()

# One frame per body site
skin_samples = df.loc[df['microbiome_type'].eq('skin')]
nares_samples = df.loc[df['microbiome_type'].eq('nares')]

# --- STEP 2: Separate by disease group ---
# AD: lesional skin ('skin-ADL') and nares ('nares-AD') samples
skin_ADL_samples = skin_samples.loc[skin_samples['group'].eq('skin-ADL')]
nares_AD_samples = nares_samples.loc[nares_samples['group'].eq('nares-AD')]

# Healthy: 'skin-H' and 'nares-H' samples
skin_H_samples = skin_samples.loc[skin_samples['group'].eq('skin-H')]
nares_H_samples = nares_samples.loc[nares_samples['group'].eq('nares-H')]

# --- STEP 3: Find taxa for each group ---
# Boolean Series per group: True where the ASV has abundance > 0 in at least one sample
skin_present_AD = skin_ADL_samples[taxa_cols].gt(0).any()
nares_present_AD = nares_AD_samples[taxa_cols].gt(0).any()
skin_present_H = skin_H_samples[taxa_cols].gt(0).any()
nares_present_H = nares_H_samples[taxa_cols].gt(0).any()

# Names of the ASVs flagged as present, as sets
skin_taxa_AD = set(skin_present_AD.index[skin_present_AD])
nares_taxa_AD = set(nares_present_AD.index[nares_present_AD])
skin_taxa_H = set(skin_present_H.index[skin_present_H])
nares_taxa_H = set(nares_present_H.index[nares_present_H])

skin_ADL_samples

Unnamed: 0,g__Streptococcus_ASV-1,g__Streptococcus_ASV-2,g__Corynebacterium_ASV-1,g__Corynebacterium_ASV-4,g___ASV-5,g__Cutibacterium_ASV-1,g___ASV-33,g___ASV-30,g___ASV-22,g__Cutibacterium_ASV-2,...,g___ASV-103,g__Leptotrichia_A_993758_ASV-13,g__Capnocytophaga_820688_ASV-8,g__UBA6175_ASV-2,g__Comamonas_F_589250_ASV-3,g__Bosea_ASV-2,g___ASV-145,pid,group,microbiome_type
900221,22.0,0,0,0,0,16.0,0,0,0,0,...,0,0,0,0,0,0,0,Ca-101-ID,skin-ADL,skin
900129,0,0,0,0,0,25.0,2.0,0,2.0,3.0,...,0,0,0,0,0,0,0,Ca-035-AR,skin-ADL,skin
900423,0,0,0,0,0,40.0,19.0,0,0,22.0,...,0,0,0,0,0,0,0,Ca-133-LT,skin-ADL,skin
900581,198.0,0,0,0,32.0,46.0,0,0,0,23.0,...,0,0,0,0,0,0,0,Ca-049-LM,skin-ADL,skin
900460,722.0,493.0,0,0,200.0,40.0,0,0,50.0,28.0,...,0,0,0,0,0,0,0,Ca-146-SM,skin-ADL,skin
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
900081,1006.0,0,37.0,0,137.0,40.0,0,0,0,0,...,0,0,0,0,0,0,0,Ca-020-AZ,skin-ADL,skin
900093,18.0,0,0,0,0,0,0,0,0,0,...,0,0,0,2.0,0,0,0,Ca-024-ZM,skin-ADL,skin
900057,111.0,0,0,0,0,3.0,17.0,0,0,4.0,...,0,0,0,0,0,0,0,Ca-013-NN,skin-ADL,skin
9003972,1168.0,593.0,16.0,0,28.0,736.0,0,0,36.0,388.0,...,0,0,0,0,0,0,17.0,Ca-125-IM,skin-ADL,skin


In [28]:
# Participant IDs seen among the AD lesional-skin samples and among the AD nares samples
skin_ADL_pids = set(skin_ADL_samples['pid'].unique())
nares_AD_pids = set(nares_AD_samples['pid'].unique())

# Participants contributing at least one of the two sample types
n_AD = len(skin_ADL_pids | nares_AD_pids)
n_AD

106

In [29]:
# Participant IDs seen among the healthy skin samples and among the healthy nares samples
skin_H_pids = set(skin_H_samples['pid'].unique())
nares_H_pids = set(nares_H_samples['pid'].unique())

# Participants contributing at least one of the two sample types
n_H = len(skin_H_pids | nares_H_pids)
n_H

87

In [47]:
# --- STEP 4: Plot Venn diagrams ---

# Colors
skin_color = '#2a00ff'
nares_color = '#ffa501'
overlap_color = '#955280'

# Outline color per Venn region id ('10' skin only, '01' nares only, '11' both)
region_colors = {
    '10': skin_color,
    '01': nares_color,
    '11': overlap_color
}

# Individuals contributing at least one skin or nares sample to each panel.
# Derived from the plotted samples so the titles cannot drift from the data
# (the previously hard-coded n=107 / n=88 disagreed with these counts).
# NOTE(review): if 107 / 88 are enrolment numbers rather than sequenced individuals,
# say so explicitly in the title text instead of using them as n here.
n_AD = len(set(skin_ADL_samples['pid'].unique()) | set(nares_AD_samples['pid'].unique()))
n_H = len(set(skin_H_samples['pid'].unique()) | set(nares_H_samples['pid'].unique()))


def _draw_site_venn(ax, skin_set, nares_set, title):
    """Draw a styled skin-vs-nares Venn diagram on ``ax`` and return the venn2 object."""
    venn = venn2(
        [skin_set, nares_set],
        set_labels=('', ''),
        set_colors=(skin_color, nares_color),
        alpha=0.5,
        ax=ax,
    )
    # A label / patch is None when its region is empty
    for label in venn.subset_labels:
        if label:
            label.set_fontsize(16)
    for region_id, color in region_colors.items():
        patch = venn.get_patch_by_id(region_id)
        if patch:
            patch.set_linewidth(2)
            patch.set_edgecolor(color)
    ax.set_title(title, fontsize=20, y=0.94)
    return venn


fig, axes = plt.subplots(2, 1, figsize=(6, 12))

# --- Venn for AD Individuals (top) and Healthy Individuals (bottom) ---
v_AD = _draw_site_venn(axes[0], skin_taxa_AD, nares_taxa_AD, f'AD Individuals (n={n_AD})')
v_H = _draw_site_venn(axes[1], skin_taxa_H, nares_taxa_H, f'Healthy Individuals (n={n_H})')

# --- Add Legend (shared by both panels) ---
handles = [
    mpatches.Patch(color=skin_color, alpha=0.5, label='Skin only'),
    mpatches.Patch(color=overlap_color, alpha=0.5, label='Both'),
    mpatches.Patch(color=nares_color, alpha=0.5, label='Nares only')
]

fig.legend(
    handles=handles,
    loc='lower center',
    bbox_to_anchor=(0.5, 0.04),
    ncol=3,
    frameon=False,
    fontsize=16
)

# --- Final layout tweaks ---
fig.tight_layout()
fig.subplots_adjust(top=0.9, bottom=0.1)  # Leave room for the supertitle and the legend

# --- Supertitle ---
fig.suptitle('ASV Overlap Skin and Nares', fontsize=26, y=0.97)

fig.savefig('../Plots/Analysis_figures/Co-occurrence/skin-nares_Venn_AD_vs_H.png', dpi=600)

# --- Print numbers ---
print("AD Individuals:")
print(f" - Skin only: {len(skin_taxa_AD - nares_taxa_AD)}")
print(f" - Nares only: {len(nares_taxa_AD - skin_taxa_AD)}")
print(f" - Shared: {len(skin_taxa_AD & nares_taxa_AD)}\n")

print("Healthy Individuals:")
print(f" - Skin only: {len(skin_taxa_H - nares_taxa_H)}")
print(f" - Nares only: {len(nares_taxa_H - skin_taxa_H)}")
print(f" - Shared: {len(skin_taxa_H & nares_taxa_H)}")


AD Individuals:
 - Skin only: 168
 - Nares only: 19
 - Shared: 602

Healthy Individuals:
 - Skin only: 250
 - Nares only: 43
 - Shared: 479


In [31]:
# AD: split the detected ASVs into skin-only, nares-only and shared sets
ad_shared_asvs = skin_taxa_AD.intersection(nares_taxa_AD)
ad_skin_only_asvs = skin_taxa_AD.difference(nares_taxa_AD)
ad_nose_only_asvs = nares_taxa_AD.difference(skin_taxa_AD)

# Healthy: same three-way split
h_shared_asvs = skin_taxa_H.intersection(nares_taxa_H)
h_skin_only_asvs = skin_taxa_H.difference(nares_taxa_H)
h_nose_only_asvs = nares_taxa_H.difference(skin_taxa_H)


In [32]:
# Define group-to-color mappings
group_colors = {
    "AD_skin_only": "#947fff",
    "AD_nose_only": "#ffd17f",
    "AD_shared": "#e7b9d8",
}

# ASVs belonging to each group (AD individuals)
asvs_by_group = {
    "AD_skin_only": ad_skin_only_asvs,
    "AD_nose_only": ad_nose_only_asvs,
    "AD_shared": ad_shared_asvs,
}

# One (ASV, color, label) tuple per ASV. The ASVs are sorted within each group so the
# file content is identical between runs (set iteration order is not stable).
itol_entries = [
    (asv, group_colors[group], group)
    for group, asvs in asvs_by_group.items()
    for asv in sorted(asvs)
]

# iTOL expects one legend shape per legend entry, so derive the count from the
# color mapping (the header previously listed 6 shapes for 3 colors/labels).
legend_shapes = "\t".join(["1"] * len(group_colors))

# Write to iTOL file
with open("../Data/Trees/itol_AD_asv_group_colors.txt", "w") as f:
    f.write("DATASET_COLORSTRIP\n")
    f.write("SEPARATOR TAB\n")
    f.write("DATASET_LABEL\tASV Groups\n")
    f.write("COLOR\t#ff0000\n\n")

    f.write("LEGEND_TITLE\tASV Group\n")
    f.write("LEGEND_SHAPES\t" + legend_shapes + "\n")
    f.write("LEGEND_COLORS\t" + "\t".join(group_colors.values()) + "\n")
    f.write("LEGEND_LABELS\t" + "\t".join(group_colors.keys()) + "\n\n")

    f.write("DATA\n")
    for asv, color, group in itol_entries:
        f.write(f"{asv}\t{color}\t{group}\n")


In [33]:
# Define group-to-color mappings
group_colors = {
    "H_skin_only": "#947fff",
    "H_nose_only": "#ffd17f",
    "H_shared": "#e7b9d8",
}

# ASVs belonging to each group (healthy individuals)
asvs_by_group = {
    "H_skin_only": h_skin_only_asvs,
    "H_nose_only": h_nose_only_asvs,
    "H_shared": h_shared_asvs,
}

# One (ASV, color, label) tuple per ASV. The ASVs are sorted within each group so the
# file content is identical between runs (set iteration order is not stable).
itol_entries = [
    (asv, group_colors[group], group)
    for group, asvs in asvs_by_group.items()
    for asv in sorted(asvs)
]

# iTOL expects one legend shape per legend entry, so derive the count from the
# color mapping (the header previously listed 6 shapes for 3 colors/labels).
legend_shapes = "\t".join(["1"] * len(group_colors))

# Write to iTOL file
with open("../Data/Trees/itol_H_asv_group_colors.txt", "w") as f:
    f.write("DATASET_COLORSTRIP\n")
    f.write("SEPARATOR TAB\n")
    f.write("DATASET_LABEL\tASV Groups\n")
    f.write("COLOR\t#ff0000\n\n")

    f.write("LEGEND_TITLE\tASV Group\n")
    f.write("LEGEND_SHAPES\t" + legend_shapes + "\n")
    f.write("LEGEND_COLORS\t" + "\t".join(group_colors.values()) + "\n")
    f.write("LEGEND_LABELS\t" + "\t".join(group_colors.keys()) + "\n\n")

    f.write("DATA\n")
    for asv, color, group in itol_entries:
        f.write(f"{asv}\t{color}\t{group}\n")


In [34]:
# Every ASV detected on skin or in the nares of AD individuals
all_asvs_AD = skin_taxa_AD.union(nares_taxa_AD)
print(f"Total unique ASVs in AD individuals: {len(all_asvs_AD)}")

Total unique ASVs in AD individuals: 789


In [35]:
from Bio import SeqIO

# Path to your full FASTA file
input_fasta = "../Data/Fasta/209766_filtered_by_prevalence_1pct.fasta"

# Output FASTA with only AD ASVs
output_fasta = "../Data/Fasta/209766_filtered_by_prevalence_1pct_AD_only.fasta"

# Convert to set for fast lookup
asv_ids_to_keep = set(all_asvs_AD)

# Keep the records whose ID is one of the requested ASVs
kept_records = [
    record
    for record in SeqIO.parse(input_fasta, "fasta")
    if record.id in asv_ids_to_keep
]

# SeqIO.write returns the number of records it wrote
with open(output_fasta, "w") as out_f:
    n_written = SeqIO.write(kept_records, out_f, "fasta")

# Requested IDs with no matching FASTA record would otherwise be dropped silently
missing_ids = asv_ids_to_keep - {record.id for record in kept_records}
if missing_ids:
    print(f"WARNING: {len(missing_ids)} of {len(asv_ids_to_keep)} requested ASV IDs were not found in {input_fasta}")

# Report the number of records actually written, not the number requested
print(f"Filtered FASTA written to {output_fasta} with {n_written} AD ASVs.")


Filtered FASTA written to ../Data/Fasta/209766_filtered_by_prevalence_1pct_AD_only.fasta with 789 AD ASVs.


In [36]:
# Every ASV detected on skin or in the nares of healthy individuals
all_asvs_Healthy = skin_taxa_H.union(nares_taxa_H)
print(f"Total unique ASVs in Healthy individuals: {len(all_asvs_Healthy)}")

Total unique ASVs in Healthy individuals: 772


In [37]:
# Path to your full FASTA file
input_fasta = "../Data/Fasta/209766_filtered_by_prevalence_1pct.fasta"

# Output FASTA with only Healthy ASVs
output_fasta = "../Data/Fasta/209766_filtered_by_prevalence_1pct_Healthy_only.fasta"

# Convert to set for fast lookup
asv_ids_to_keep = set(all_asvs_Healthy)

# Keep the records whose ID is one of the requested ASVs
kept_records = [
    record
    for record in SeqIO.parse(input_fasta, "fasta")
    if record.id in asv_ids_to_keep
]

# SeqIO.write returns the number of records it wrote
with open(output_fasta, "w") as out_f:
    n_written = SeqIO.write(kept_records, out_f, "fasta")

# Requested IDs with no matching FASTA record would otherwise be dropped silently
missing_ids = asv_ids_to_keep - {record.id for record in kept_records}
if missing_ids:
    print(f"WARNING: {len(missing_ids)} of {len(asv_ids_to_keep)} requested ASV IDs were not found in {input_fasta}")

# Report the number of records actually written, not the number requested
print(f"Filtered FASTA written to {output_fasta} with {n_written} Healthy ASVs.")

Filtered FASTA written to ../Data/Fasta/209766_filtered_by_prevalence_1pct_Healthy_only.fasta with 772 Healthy ASVs.


<!-- ## Bootstrapped plot -->

In [38]:
# 1. Find common subjects with both skin and nares samples
def get_paired_subjects(skin_df, nares_df):
    """Return the set of ``pid`` values that occur in both ``skin_df`` and ``nares_df``."""
    skin_ids = set(skin_df['pid'])
    nares_ids = set(nares_df['pid'])
    return skin_ids.intersection(nares_ids)

# Fixed seed so the bootstrap distributions are the same after Restart & Run All
BOOTSTRAP_SEED = 42
rng = np.random.default_rng(BOOTSTRAP_SEED)

# Sorted so the sampling pools have a stable order: set iteration order is not stable
# between sessions, which would defeat the seed.
ad_subjects = sorted(get_paired_subjects(skin_ADL_samples, nares_AD_samples), key=str)
healthy_subjects = sorted(get_paired_subjects(skin_H_samples, nares_H_samples), key=str)

n_iterations = 100
n_paired = min(len(ad_subjects), len(healthy_subjects))  # smallest group size
print(f'Bootstrapping {n_iterations} times with {n_paired} paired individuals')

# Number of ASVs shared between skin and nares in each bootstrap replicate
ad_overlap = np.zeros(n_iterations)
healthy_overlap = np.zeros(n_iterations)

taxa_cols = df.select_dtypes(include=[np.number]).columns

for i in range(n_iterations):
    # 2. Draw the same number of paired individuals from each cohort, with replacement.
    #    A subject drawn more than once adds no weight: selection is by membership
    #    (isin) and the statistic is presence/absence.
    sampled_ad_subjects = rng.choice(ad_subjects, size=n_paired, replace=True)
    sampled_healthy_subjects = rng.choice(healthy_subjects, size=n_paired, replace=True)

    # 3. Skin and nares samples of the drawn subjects
    ad_skin = skin_ADL_samples[skin_ADL_samples['pid'].isin(sampled_ad_subjects)][taxa_cols]
    ad_nares = nares_AD_samples[nares_AD_samples['pid'].isin(sampled_ad_subjects)][taxa_cols]

    h_skin = skin_H_samples[skin_H_samples['pid'].isin(sampled_healthy_subjects)][taxa_cols]
    h_nares = nares_H_samples[nares_H_samples['pid'].isin(sampled_healthy_subjects)][taxa_cols]

    # 4. ASVs present (abundance > 0 in any sample) at each body site
    present_ad_skin = (ad_skin > 0).any(axis=0)
    present_ad_nares = (ad_nares > 0).any(axis=0)
    present_h_skin = (h_skin > 0).any(axis=0)
    present_h_nares = (h_nares > 0).any(axis=0)

    # 5. Store the number of ASVs present at both sites
    ad_overlap[i] = np.sum(present_ad_skin & present_ad_nares)
    healthy_overlap[i] = np.sum(present_h_skin & present_h_nares)


Bootstrapping 100 times with 84 paired individuals


In [39]:
# Mean and (population, ddof=0) standard deviation of the bootstrap overlap counts
ad_mean, ad_sd = np.mean(ad_overlap), np.std(ad_overlap)
healthy_mean, healthy_sd = np.mean(healthy_overlap), np.std(healthy_overlap)

print(f"AD overlap (mean ± SD): {ad_mean:.1f} ± {ad_sd:.1f}")
print(f"Healthy overlap (mean ± SD): {healthy_mean:.1f} ± {healthy_sd:.1f}")


AD overlap (mean ± SD): 474.6 ± 23.4
Healthy overlap (mean ± SD): 382.3 ± 20.6


In [40]:
# Shared 2-unit-wide bins covering both bootstrap distributions, padded by 2 on each side
all_vals = np.concatenate([ad_overlap, healthy_overlap])
bin_min = int(np.floor(all_vals.min()) - 2)
bin_max = int(np.ceil(all_vals.max()) + 2)
bins = np.arange(bin_min, bin_max + 2, 2)

fig, ax = plt.subplots(figsize=(8, 5))

# Density histogram plus KDE curve for each cohort, AD first and Healthy second
for overlap_values, cohort_color, cohort_label in (
    (ad_overlap, '#e31a1c', 'AD'),
    (healthy_overlap, '#78a4c1', 'Healthy'),
):
    sns.histplot(overlap_values, bins=bins, color=cohort_color, stat='density',
                 alpha=0.6, label=cohort_label, ax=ax)
    sns.kdeplot(overlap_values, color=cohort_color, linewidth=2, alpha=1.0, ax=ax)

# Axis labels, title and legend
ax.set_xlabel("Shared Taxa (Skin ∩ Nares)", fontsize=14)
ax.set_ylabel("Probability Density", fontsize=14)
ax.set_title("Overlap of Shared ASVs Between Skin and Nares Per Child", fontsize=16)
ax.legend(title="Skin Status")

# Mann–Whitney U test between the two sets of bootstrap replicates, shown in the top-left corner.
# NOTE(review): the replicates are resamples of the same subjects, so this p-value
# presumably shrinks as n_iterations grows — confirm it is meant as inference.
stat, p = mannwhitneyu(ad_overlap, healthy_overlap)
ax.text(
    0.02, 0.97,
    f"Mann–Whitney U\np={p:.1e}",
    transform=ax.transAxes,
    ha='left', va='top', fontsize=10,
    bbox=dict(boxstyle='round', facecolor='white', edgecolor='none', alpha=0.7)
)

fig.tight_layout()

# Save figure
fig.savefig("../Plots/Analysis_figures/Co-occurrence/bootstrap_overlap_taxa.png", dpi=600)

  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
