# Strep and severity score correlations

In [278]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import biom
from biom import load_table
import numpy as np
from scipy.stats import spearmanr
from scipy.stats import mannwhitneyu
from statsmodels.stats.multitest import multipletests
import itertools
import matplotlib.patches
from scipy.stats import pearsonr



In [279]:
# Location of the tab-separated sample metadata
metadata_path = '../Data/Metadata/updated_clean_ant_skin_metadata.tab'

# Read the table, strip underscores from the sample IDs, index by sample ID,
# and derive a short 'group' label from case_type (unmapped case types -> NaN)
metadata = (
    pd.read_csv(metadata_path, sep='\t')
    .assign(**{'#sample-id': lambda tbl: tbl['#sample-id'].str.replace('_', '')})
    .set_index('#sample-id')
    .assign(group=lambda tbl: tbl['case_type'].map({
        'case-lesional skin': 'skin-ADL',
        'case-nonlesional skin': 'skin-ADNL',
        'control-nonlesional skin': 'skin-H',
        'case-anterior nares': 'nares-AD',
        'control-anterior nares': 'nares-H'
    }))
)

metadata

Unnamed: 0_level_0,PlateNumber,PlateLocation,i5,i5Sequence,i7,i7Sequence,identifier,Sequence,Plate ID,Well location,...,sex,enrolment_date,enrolment_season,hiv_exposure,hiv_status,household_size,o_scorad,FWD_filepath,REV_filepath,group
#sample-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ca009STL,1,A1,SA501,ATCGTACG,SA701,CGAGAGTT,SA701SA501,CGAGAGTT-ATCGTACG,1.010000e+21,A1,...,male,4/16/2015,Autumn,Unexposed,negative,4.0,40,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL
900221,1,B1,SA502,ACTATCTG,SA701,CGAGAGTT,SA701SA502,CGAGAGTT-ACTATCTG,1.010000e+21,B1,...,female,8/11/2015,Winter,Unexposed,negative,7.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL
Ca010EBL,1,C1,SA503,TAGCGAGT,SA701,CGAGAGTT,SA701SA503,CGAGAGTT-TAGCGAGT,1.010000e+21,C1,...,female,11/20/2014,Spring,Unexposed,negative,7.0,21,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL
900460,1,D1,SA504,CTGCGTGT,SA701,CGAGAGTT,SA701SA504,CGAGAGTT-CTGCGTGT,1.010000e+21,D1,...,female,9/23/2015,Spring,Unexposed,,4.0,40,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL
900051,1,E1,SA505,TCATCGAG,SA701,CGAGAGTT,SA701SA505,CGAGAGTT-TCATCGAG,1.010000e+21,E1,...,male,4/21/2015,Autumn,Unexposed,negative,7.0,41,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Ca006ONL2,6,H1,SA508,GACACCGT,SB701,CTCGACTT,SB701SA508,CTCGACTT-GACACCGT,1.010000e+21,H1,...,female,3/25/2015,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL
Ca006ONNL,6,F2,SA506,CGTGAGTG,SB702,CGAAGTAT,SB702SA506,CGAAGTAT-CGTGAGTG,1.010000e+21,F2,...,female,3/25/2015,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADNL
Ca006ONNL2,6,H2,SA508,GACACCGT,SB702,CGAAGTAT,SB702SA508,CGAAGTAT-GACACCGT,1.010000e+21,H2,...,female,3/25/2015,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADNL
Ca006ONPN,6,F3,SA506,CGTGAGTG,SB703,TAGCAGCT,SB703SA506,TAGCAGCT-CGTGAGTG,1.010000e+21,F3,...,female,3/25/2015,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,nares-AD


In [280]:
# Read in table at ASV level and orient it as samples (rows) x features (columns)
biom_path = '../Data/Tables/Absolute_Abundance_Tables/feature_table_with_tax_labels_Genus.biom'
# biom_path = '../Data/Tables/Absolute_Abundance_Tables/209766_filtered_by_prevalence_1pct_rare_Genus-ASV-non-collapse.biom'
biom_tbl = load_table(biom_path)
df = biom_tbl.to_dataframe().T

# Delete the study prefix from the sample IDs. regex=False treats the '.' as a
# literal dot regardless of the pandas version's default for `regex`.
df.index = df.index.str.replace('15564.', '', regex=False)

# Sample IDs whose metadata group starts with 'skin'
# (na=False: samples without a group label are excluded instead of raising)
skin_samples = metadata[metadata['group'].str.startswith('skin', na=False)].index

# Filter df to keep only skin samples (KeyError if a skin sample is absent from the table)
df = df.loc[skin_samples]
df

Unnamed: 0_level_0,g__Streptococcus_ASV-1,g__Streptococcus_ASV-2,g__Corynebacterium_ASV-1,g__Corynebacterium_ASV-3,g___ASV-3,g__Bacillus_P_294101_ASV-2,g__Cutibacterium_ASV-1,g___ASV-18,g___ASV-28,g___ASV-25,...,g__Blautia_A_141781_ASV-7,g___ASV-358,g__Peptoniphilus_A_ASV-7,g___ASV-154,g___ASV-169,g__UBA952_ASV-1,g__Petroclostridium_ASV-1,g__Capnocytophaga_820690_ASV-4,g___ASV-202,g__Streptococcus_ASV-37
#sample-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ca009STL,15.0,0,0,0,0,0,13.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
900221,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Ca010EBL,8.0,0,0,0,0,0,3.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
900460,66.0,44.0,0,0,15.0,0,4.0,0,0,7.0,...,0,0,0,0,0,0,0,0,0,0
900051,30.0,0,0,0,0,0,10.0,5.0,0,2.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9004032,66.0,0,99.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Ca006ONL,12.0,0,0,0,0,0,27.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Ca006ONL2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Ca006ONNL,9.0,0,0,0,0,0,3.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [281]:
# Feature (column) labels in their current order
list(df.columns)

['g__Streptococcus_ASV-1',
 'g__Streptococcus_ASV-2',
 'g__Corynebacterium_ASV-1',
 'g__Corynebacterium_ASV-3',
 'g___ASV-3',
 'g__Bacillus_P_294101_ASV-2',
 'g__Cutibacterium_ASV-1',
 'g___ASV-18',
 'g___ASV-28',
 'g___ASV-25',
 'g__Cutibacterium_ASV-2',
 'g__Bifidobacterium_388775_ASV-4',
 'g__Streptococcus_ASV-36',
 'g__Haemophilus_D_734546_ASV-1',
 'g__Dolosigranulum_ASV-1',
 'g__Dolosigranulum_ASV-2',
 'g__Haemophilus_D_734546_ASV-2',
 'g__Staphylococcus_ASV-2',
 'g__Staphylococcus_ASV-1',
 'g__Micrococcus_ASV-2',
 'g__Lactococcus_A_346120_ASV-1',
 'g__Limosilactobacillus_ASV-1',
 'g__Prevotella_ASV-1',
 'g__Corynebacterium_ASV-11',
 'g__Prevotella_ASV-2',
 'g__Fusobacterium_C_ASV-3',
 'g__Neisseria_563205_ASV-4',
 'g__Arthrobacter_D_ASV-1',
 'g__Gemella_ASV-2',
 'g___ASV-51',
 'g__Leptotrichia_A_993758_ASV-1',
 'g__Porphyromonas_A_859423_ASV-2',
 'g__Fusobacterium_C_ASV-2',
 'g___ASV-6',
 'g___ASV-65',
 'g__Brachybacterium_ASV-2',
 'g__Neisseria_563205_ASV-3',
 'g__Veillonella_A_

In [282]:
# Total count of every feature across the samples, then reorder the columns
# from the most to the least abundant feature
col_sums = df.sum()
df = df.loc[:, col_sums.sort_values(ascending=False).index]
list(df.columns)

['g__Staphylococcus_ASV-1',
 'g__Staphylococcus_ASV-2',
 'g__Streptococcus_ASV-1',
 'g__Micrococcus_ASV-1',
 'g__Corynebacterium_ASV-2',
 'g__Streptococcus_ASV-2',
 'g___ASV-1',
 'g__Prevotella_ASV-1',
 'g___ASV-2',
 'g__Streptococcus_ASV-3',
 'g__Acinetobacter_ASV-1',
 'g__Cutibacterium_ASV-1',
 'g__Prevotella_ASV-2',
 'g__Haemophilus_D_734546_ASV-1',
 'g__Chryseobacterium_796614_ASV-1',
 'g__Acinetobacter_ASV-2',
 'g__Micrococcus_ASV-2',
 'g___ASV-4',
 'g__Acinetobacter_ASV-3',
 'g__Corynebacterium_ASV-4',
 'g___ASV-5',
 'g__Veillonella_A_ASV-1',
 'g__Escherichia_710834_ASV-1',
 'g__Corynebacterium_ASV-5',
 'g__Cutibacterium_ASV-2',
 'g__Neisseria_563205_ASV-1',
 'g__Staphylococcus_ASV-3',
 'g__SIO2C1_ASV-1',
 'g__Psychrobacter_ASV-1',
 'g___ASV-7',
 'g___ASV-8',
 'g__Massilia_ASV-1',
 'g__Acinetobacter_ASV-4',
 'g__Chryseobacterium_796614_ASV-2',
 'g__Corynebacterium_ASV-6',
 'g___ASV-10',
 'g__Neisseria_563205_ASV-2',
 'g___ASV-9',
 'g__Streptococcus_ASV-5',
 'g__Aeromonas_ASV-1',


In [283]:
def rclr_transform(df, pseudocount=1e-6):
    """
    Apply a Robust Centered Log-Ratio (RCLR) transform to a samples x features table.

    Zeros are treated as missing: they are masked to NaN before the log, so they
    are left out of each sample's mean log-abundance and stay NaN in the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric abundance table with one row per sample and one column per
        feature. Columns may be dense, pandas-sparse, or a mix of both.
    pseudocount : float, default 1e-6
        Added to the remaining (non-zero) values before taking the log. Zeros
        are masked first, so the pseudocount never stands in for a zero.

    Returns
    -------
    pandas.DataFrame
        Same index and columns as ``df``. Each entry is
        ``log(value + pseudocount)`` minus the mean of those logs over the
        non-zero entries of the same row; NaN where the input was zero or missing.
    """
    # Densify only when there is something sparse: the DataFrame-level `.sparse`
    # accessor raises AttributeError unless every column is sparse.
    sparse_flags = [isinstance(dtype, pd.SparseDtype) for dtype in df.dtypes]
    if sparse_flags and all(sparse_flags):
        df = df.sparse.to_dense()
    elif any(sparse_flags):
        df = df.apply(
            lambda col: col.sparse.to_dense() if isinstance(col.dtype, pd.SparseDtype) else col
        )

    # Replace 0 with NaN so zeros are ignored by the log and by the row mean
    df_masked = df.replace(0, np.nan)

    # Log of the non-zero entries
    log_df = np.log(df_masked + pseudocount)

    # Centre each row (sample) on the mean of its own observed log-abundances
    row_means = log_df.mean(axis=1, skipna=True)
    rclr_df = log_df.sub(row_means, axis=0)

    return rclr_df



# RCLR-transform the abundance-sorted skin-sample count table
rclr_df = rclr_transform(df)

In [284]:
# Attach each sample's group label, looked up in the metadata by sample ID
rclr_df = rclr_df.assign(group=metadata.loc[rclr_df.index, 'group'])
rclr_df

Unnamed: 0_level_0,g__Staphylococcus_ASV-1,g__Staphylococcus_ASV-2,g__Streptococcus_ASV-1,g__Micrococcus_ASV-1,g__Corynebacterium_ASV-2,g__Streptococcus_ASV-2,g___ASV-1,g__Prevotella_ASV-1,g___ASV-2,g__Streptococcus_ASV-3,...,g__Fluviicola_ASV-2,g___ASV-369,g__Flavobacterium_ASV-25,g___ASV-434,g__Staphylococcus_ASV-33,g__Flavobacterium_ASV-8,g___ASV-489,g__Staphylococcus_ASV-34,g__Streptococcus_ASV-37,group
#sample-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ca009STL,-0.203199,-1.301811,0.713092,0.713092,,,2.639770,,-0.385521,1.140536,...,,,,,,,,,,skin-ADL
900221,,,,,-2.027367,,,,,,...,,,,,,,,,,skin-ADL
Ca010EBL,3.054638,2.731238,0.803347,,,,2.019742,1.02649,-1.276094,0.515665,...,,,,,,,,,,skin-ADL
900460,1.657654,1.657654,3.139258,-0.357249,,2.733793,,,,1.945336,...,,,,,,,,,,skin-ADL
900051,3.756053,2.448896,2.021452,,,,,,,-1.379744,...,,,,,,,,,,skin-ADL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9004032,,,2.749671,,,,,,,-0.746836,...,,,,,,,,,,skin-ADL
Ca006ONL,3.315374,1.783898,1.090750,,-1.394155,,2.243430,,-1.394155,0.551754,...,,,,,,,,,,skin-ADL
Ca006ONL2,,,,,,,,,,,...,,,,,,,,,,skin-ADL
Ca006ONNL,3.708304,1.790251,0.729379,-0.774698,0.611596,,1.751030,,-0.369233,-1.467845,...,,,,,,,,,,skin-ADNL


In [285]:
# Number of samples in each group
rclr_df['group'].value_counts()

# Disabled: optional renaming that would shorten the Haemophilus genus label
# ("g__Haemophilus_D_734546" -> "g__Haemophilus"). Kept for reference only; if
# re-enabled, the column names listed in taxa_list must be updated to match.
# rclr_df = rclr_df.rename(columns={
#     col: col.replace("g__Haemophilus_D_734546", "g__Haemophilus")
#     for col in df.columns
#     if col.startswith("g__Haemophilus_D_734546")
# })


# Disabled: same idea for Veillonella ("g__Veillonella_A" -> "g__Veillonella")
# rclr_df = rclr_df.rename(columns={
#     col: col.replace("g__Veillonella_A", "g__Veillonella")
#     for col in df.columns
#     if col.startswith("g__Veillonella_A")
# })



group
skin-ADNL    111
skin-ADL     107
skin-H        87
Name: count, dtype: int64

In [286]:
def pairwise_mannwhitney_bh(df, feature, group_col='group'):
    """
    Two-sided Mann-Whitney U test for every pair of groups, with
    Benjamini-Hochberg correction across the pairs that were tested.

    NOTE: this notebook defines a function with this name again in a later
    cell; when the notebook runs top to bottom the later definition is the one
    in effect.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the `feature` column and the `group_col` column.
    feature : str
        Column whose values are compared between groups; NaNs are dropped
        per group before testing.
    group_col : str, default 'group'
        Column holding the group labels. Rows with a missing label are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per tested pair with the columns 'Comparison', 'Pair',
        'Raw p-value' and 'BH-corrected p-value'. Pairs where either group has
        no non-missing values are skipped; if nothing could be tested an empty
        frame with those columns is returned.
    """
    result_columns = ['Comparison', 'Pair', 'Raw p-value', 'BH-corrected p-value']

    # Missing labels cannot form a meaningful group, so leave them out
    groups = df[group_col].dropna().unique()

    raw_pvals = []
    labels = []
    pairs = []

    for g1, g2 in itertools.combinations(groups, 2):
        x = df.loc[df[group_col] == g1, feature].dropna()
        y = df.loc[df[group_col] == g2, feature].dropna()

        # The test is undefined for an empty sample, so skip the pair
        if x.empty or y.empty:
            continue

        _, p = mannwhitneyu(x, y, alternative='two-sided')
        raw_pvals.append(p)
        labels.append(f"{g1} vs {g2}")
        pairs.append((g1, g2))

    # Nothing to correct: return an empty, correctly-shaped result
    if not raw_pvals:
        return pd.DataFrame(columns=result_columns)

    _, pvals_corrected, _, _ = multipletests(raw_pvals, method='fdr_bh')

    results_df = pd.DataFrame({
        'Comparison': labels,
        'Pair': pairs,
        'Raw p-value': raw_pvals,
        'BH-corrected p-value': pvals_corrected
    })

    return results_df

In [287]:
# Keep only the rows of the three skin groups. From here on `skin_samples` is a
# DataFrame (the earlier cell used the same name for an index of sample IDs).
skin_samples = rclr_df.loc[rclr_df['group'].isin(['skin-ADL', 'skin-ADNL', 'skin-H'])]
skin_samples

Unnamed: 0_level_0,g__Staphylococcus_ASV-1,g__Staphylococcus_ASV-2,g__Streptococcus_ASV-1,g__Micrococcus_ASV-1,g__Corynebacterium_ASV-2,g__Streptococcus_ASV-2,g___ASV-1,g__Prevotella_ASV-1,g___ASV-2,g__Streptococcus_ASV-3,...,g__Fluviicola_ASV-2,g___ASV-369,g__Flavobacterium_ASV-25,g___ASV-434,g__Staphylococcus_ASV-33,g__Flavobacterium_ASV-8,g___ASV-489,g__Staphylococcus_ASV-34,g__Streptococcus_ASV-37,group
#sample-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ca009STL,-0.203199,-1.301811,0.713092,0.713092,,,2.639770,,-0.385521,1.140536,...,,,,,,,,,,skin-ADL
900221,,,,,-2.027367,,,,,,...,,,,,,,,,,skin-ADL
Ca010EBL,3.054638,2.731238,0.803347,,,,2.019742,1.02649,-1.276094,0.515665,...,,,,,,,,,,skin-ADL
900460,1.657654,1.657654,3.139258,-0.357249,,2.733793,,,,1.945336,...,,,,,,,,,,skin-ADL
900051,3.756053,2.448896,2.021452,,,,,,,-1.379744,...,,,,,,,,,,skin-ADL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9004032,,,2.749671,,,,,,,-0.746836,...,,,,,,,,,,skin-ADL
Ca006ONL,3.315374,1.783898,1.090750,,-1.394155,,2.243430,,-1.394155,0.551754,...,,,,,,,,,,skin-ADL
Ca006ONL2,,,,,,,,,,,...,,,,,,,,,,skin-ADL
Ca006ONNL,3.708304,1.790251,0.729379,-0.774698,0.611596,,1.751030,,-0.369233,-1.467845,...,,,,,,,,,,skin-ADNL


In [288]:
def pairwise_mannwhitney_bh(df, feature, group_col='group'):
    """
    Run a two-sided Mann-Whitney U test on `feature` for every pair of values
    found in `group_col`, then Benjamini-Hochberg-correct the p-values of the
    pairs that were tested.

    NOTE: an earlier cell of this notebook defines a function with the same
    name; this definition replaces it once the cell has run.

    NaNs in `feature` are dropped per group. A pair is skipped when either
    group has no values left. Returns a DataFrame with the columns
    'Comparison', 'Pair', 'Raw p-value' and 'BH-corrected p-value' (empty,
    with those columns, when no pair could be tested).
    """
    column_names = ['Comparison', 'Pair', 'Raw p-value', 'BH-corrected p-value']

    tested = []
    for first, second in itertools.combinations(df[group_col].unique(), 2):
        first_vals = df.loc[df[group_col] == first, feature].dropna()
        second_vals = df.loc[df[group_col] == second, feature].dropna()

        # A group without data cannot be tested
        if first_vals.empty or second_vals.empty:
            continue

        _, p_raw = mannwhitneyu(first_vals, second_vals, alternative='two-sided')
        tested.append((f"{first} vs {second}", (first, second), p_raw))

    if not tested:
        return pd.DataFrame(columns=column_names)

    comparison_labels = [entry[0] for entry in tested]
    group_pairs = [entry[1] for entry in tested]
    p_raw_all = [entry[2] for entry in tested]

    _, p_adjusted, _, _ = multipletests(p_raw_all, method='fdr_bh')

    return pd.DataFrame({
        'Comparison': comparison_labels,
        'Pair': group_pairs,
        'Raw p-value': p_raw_all,
        'BH-corrected p-value': p_adjusted
    })


In [289]:
# Define custom color palettes, keyed by group label.
# group_palette is passed to the plotting function as the box fill colours;
# strip_palette holds a darker shade of each hue for the overlaid points.
group_palette = {
    'skin-H': '#ADD8E6',     # baby blue
    'skin-ADNL': '#FFDAB9',  # peach
    'skin-ADL': '#E31A1C'    # red
}

strip_palette = {
    'skin-H': '#6CA6CD',     # darker baby blue
    'skin-ADNL': '#E6AC8F',  # darker peach
    'skin-ADL': '#A50000'    # darker red
}


In [290]:
def plot_multi_taxa_boxplots(data, taxa_list, group_palette, strip_palette, title_name, order=['skin-H', 'skin-ADNL', 'skin-ADL']):
    """
    Draw one box (with overlaid points) per taxon and group, side by side, and
    annotate every group pair with its BH-corrected Mann-Whitney p-value.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per sample; must contain a 'group' column and one column per
        entry of `taxa_list`.
    taxa_list : list of str
        Columns of `data` to plot, left to right.
    group_palette : dict
        Group label -> box colour.
    strip_palette : dict
        Group label -> point colour.
    title_name : str
        Figure title.
    order : sequence of str
        Group labels in left-to-right drawing order within each taxon. Rows of
        `data` whose group is not listed are ignored.

    Returns
    -------
    matplotlib.figure.Figure

    Notes
    -----
    P-values come from `pairwise_mannwhitney_bh`, which is called once per
    taxon, so the BH correction covers the group pairs within a taxon, not all
    taxa together.
    """
    order = list(order)  # private copy; the default list object is shared between calls
    n_taxa = len(taxa_list)
    group_count = len(order)

    # Only the groups that are drawn take part in counts, statistics and scaling
    data = data[data['group'].isin(order)]

    # Prepare long format dataframe: every (taxon, group) pair gets its own x slot
    plot_data = [
        {
            'x': i * group_count + j,
            'taxon': taxon.replace('g__', '').split('_')[0],
            'value': v,
            'group': group
        }
        for i, taxon in enumerate(taxa_list)
        for j, group in enumerate(order)
        for v in data.loc[data['group'] == group, taxon]
    ]
    plot_df = pd.DataFrame(plot_data, columns=['x', 'taxon', 'value', 'group'])

    # Plot
    fig, ax = plt.subplots(figsize=(n_taxa * 2.5, 5))

    # hue_order pins the hue/legend order to `order`, so the legend labels
    # built below always line up with the handles
    sns.boxplot(
        data=plot_df,
        x='x',
        y='value',
        hue='group',
        hue_order=order,
        palette=group_palette,
        width=0.5,
        fliersize=0,
        dodge=False,
        ax=ax
    )

    sns.stripplot(
        data=plot_df,
        x='x',
        y='value',
        hue='group',
        hue_order=order,
        palette=strip_palette,
        dodge=False,
        jitter=True,
        size=4,
        alpha=0.75,
        ax=ax
    )

    # Both layers add legend entries; keep only the first set (one per group)
    handles, labels = ax.get_legend_handles_labels()

    # Count samples per group
    group_counts = {g: data[data['group'] == g].shape[0] for g in order}

    # Create updated legend labels
    legend_labels = [f"{g.replace('skin-', '')} (n={group_counts[g]})" for g in order]

    # Add updated legend, laid out horizontally below the plot
    ax.legend(
        handles[:group_count],
        legend_labels,
        # title='Sample',
        loc='lower left',
        bbox_to_anchor=(0.35, -0.35),  # adjust y for spacing below plot
        ncol=group_count,              # show horizontally
        fontsize=12,
        title_fontsize=10,
        frameon=True,
        borderaxespad=0
    )

    # One x-tick per taxon, centred under that taxon's run of boxes
    xtick_positions = [i * group_count + (group_count - 1) / 2 for i in range(n_taxa)]
    # xtick_labels = [taxon.replace('g__', '').split('_')[0] for taxon in taxa_list]
    xtick_labels = [taxon.replace('g__', '').replace('_ASV-', '\nASV-') for taxon in taxa_list]
    ax.set_xticks(xtick_positions)
    ax.set_xticklabels(xtick_labels, fontsize=14)

    ax.set_xlim(-0.5, n_taxa * group_count - 0.5)
    ax.set_ylabel("RCLR-transformed abundance", fontsize=14)
    ax.tick_params(axis='y', labelsize=14)
    ax.set_xlabel(" ")

    ax.set_title(title_name, fontsize=18, pad=14)

    # ------------------------
    # Annotate p-values
    # ------------------------
    for i, taxon in enumerate(taxa_list):
        stats = pairwise_mannwhitney_bh(data, taxon)
        y_max = data[taxon].max()

        # No observed values for this taxon: nothing to anchor the brackets to
        if pd.isna(y_max):
            continue

        # Vertical spacing between stacked brackets; always positive so the
        # brackets climb upwards even when the largest value is <= 0
        offset = abs(y_max) * 0.15
        if offset == 0:
            offset = 0.15

        for rank, (_, row) in enumerate(stats.iterrows()):
            g1, g2 = row['Pair']
            pval = row['BH-corrected p-value']
            xpos1 = i * group_count + order.index(g1)
            xpos2 = i * group_count + order.index(g2)
            x = (xpos1 + xpos2) / 2
            y = y_max + offset * rank

            stars = '***' if pval < 0.001 else '**' if pval < 0.01 else '*' if pval < 0.05 else ''
            label = f"{pval:.1e} {stars}"

            ax.plot([xpos1, xpos1, xpos2, xpos2], [y - 0.01, y, y, y - 0.01], lw=1, color='black')
            ax.text(x, y + 0.01, label, ha='center', va='bottom', fontsize=12)

    sns.despine()
    fig.tight_layout()
    return fig

In [291]:
# Taxa shown in the group-comparison figure (and reused by the severity plots below)
taxa_list = [
    'g__Streptococcus_ASV-1',
    'g__Streptococcus_ASV-2',
    'g__Staphylococcus_ASV-1',
    'g__Micrococcus_ASV-1',
    'g__Veillonella_A_ASV-1',
    'g__Haemophilus_D_734546_ASV-1',
]

fig = plot_multi_taxa_boxplots(
    skin_samples,
    taxa_list,
    group_palette,
    strip_palette,
    title_name='Differential Taxa on Skin by Atopic Dermatitis Status',
)
fig.savefig('../Plots/Analysis_figures/RCLR_Abundance/rclr_multi_taxa_boxplot_all_taxa_ASV.png', dpi=600)


  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  data_subset = grouped_data.get_group(pd_key)


## Show correlation with lesion severity

In [292]:
# Map o_scorad and pid from metadata to skin_samples based on matching indexes.
# Both columns come from a single inner join on the sample ID. Any copies left
# over from a previous run are dropped first, so re-running the cell does not
# create o_scorad_x / o_scorad_y style duplicates.
skin_samples = (
    skin_samples
    .drop(columns=['o_scorad', 'pid'], errors='ignore')
    .merge(metadata[['o_scorad', 'pid']], left_index=True, right_index=True)
)

skin_samples

Unnamed: 0_level_0,g__Staphylococcus_ASV-1,g__Staphylococcus_ASV-2,g__Streptococcus_ASV-1,g__Micrococcus_ASV-1,g__Corynebacterium_ASV-2,g__Streptococcus_ASV-2,g___ASV-1,g__Prevotella_ASV-1,g___ASV-2,g__Streptococcus_ASV-3,...,g__Flavobacterium_ASV-25,g___ASV-434,g__Staphylococcus_ASV-33,g__Flavobacterium_ASV-8,g___ASV-489,g__Staphylococcus_ASV-34,g__Streptococcus_ASV-37,group,o_scorad,pid
#sample-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ca009STL,-0.203199,-1.301811,0.713092,0.713092,,,2.639770,,-0.385521,1.140536,...,,,,,,,,skin-ADL,40,Ca-009-ST
900221,,,,,-2.027367,,,,,,...,,,,,,,,skin-ADL,34,Ca-101-ID
Ca010EBL,3.054638,2.731238,0.803347,,,,2.019742,1.02649,-1.276094,0.515665,...,,,,,,,,skin-ADL,21,Ca-010-EB
900460,1.657654,1.657654,3.139258,-0.357249,,2.733793,,,,1.945336,...,,,,,,,,skin-ADL,40,Ca-146-SM
900051,3.756053,2.448896,2.021452,,,,,,,-1.379744,...,,,,,,,,skin-ADL,41,Ca-011-LQ
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9004032,,,2.749671,,,,,,,-0.746836,...,,,,,,,,skin-ADL,78,Ca-126-KB
Ca006ONL,3.315374,1.783898,1.090750,,-1.394155,,2.243430,,-1.394155,0.551754,...,,,,,,,,skin-ADL,34,Ca-006-ON
Ca006ONL2,,,,,,,,,,,...,,,,,,,,skin-ADL,34,Ca-006-ON
Ca006ONNL,3.708304,1.790251,0.729379,-0.774698,0.611596,,1.751030,,-0.369233,-1.467845,...,,,,,,,,skin-ADNL,34,Ca-006-ON


In [293]:
# Convert 'o_scorad' to numeric (coerce errors to NaN). The column already
# merged into skin_samples is converted directly, rather than re-reading it
# from metadata and relying on implicit index alignment.
skin_samples = skin_samples.assign(
    o_scorad=pd.to_numeric(skin_samples['o_scorad'], errors='coerce')
)

# Drop rows with missing SCORAD values
skin_samples = skin_samples.dropna(subset=['o_scorad'])
skin_samples

Unnamed: 0_level_0,g__Staphylococcus_ASV-1,g__Staphylococcus_ASV-2,g__Streptococcus_ASV-1,g__Micrococcus_ASV-1,g__Corynebacterium_ASV-2,g__Streptococcus_ASV-2,g___ASV-1,g__Prevotella_ASV-1,g___ASV-2,g__Streptococcus_ASV-3,...,g__Flavobacterium_ASV-25,g___ASV-434,g__Staphylococcus_ASV-33,g__Flavobacterium_ASV-8,g___ASV-489,g__Staphylococcus_ASV-34,g__Streptococcus_ASV-37,group,o_scorad,pid
#sample-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ca009STL,-0.203199,-1.301811,0.713092,0.713092,,,2.639770,,-0.385521,1.140536,...,,,,,,,,skin-ADL,40.0,Ca-009-ST
900221,,,,,-2.027367,,,,,,...,,,,,,,,skin-ADL,34.0,Ca-101-ID
Ca010EBL,3.054638,2.731238,0.803347,,,,2.019742,1.02649,-1.276094,0.515665,...,,,,,,,,skin-ADL,21.0,Ca-010-EB
900460,1.657654,1.657654,3.139258,-0.357249,,2.733793,,,,1.945336,...,,,,,,,,skin-ADL,40.0,Ca-146-SM
900051,3.756053,2.448896,2.021452,,,,,,,-1.379744,...,,,,,,,,skin-ADL,41.0,Ca-011-LQ
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9004032,,,2.749671,,,,,,,-0.746836,...,,,,,,,,skin-ADL,78.0,Ca-126-KB
Ca006ONL,3.315374,1.783898,1.090750,,-1.394155,,2.243430,,-1.394155,0.551754,...,,,,,,,,skin-ADL,34.0,Ca-006-ON
Ca006ONL2,,,,,,,,,,,...,,,,,,,,skin-ADL,34.0,Ca-006-ON
Ca006ONNL,3.708304,1.790251,0.729379,-0.774698,0.611596,,1.751030,,-0.369233,-1.467845,...,,,,,,,,skin-ADNL,34.0,Ca-006-ON


In [294]:
# NOTE: no group filter is applied in this cell. The filter line below is
# commented out (and it targets rclr_df, not skin_samples), so the correlation
# cell that follows uses every row of skin_samples that has a SCORAD value.
# rclr_df = rclr_df[(rclr_df['group'] == 'skin-ADL') | (rclr_df['group'] == 'skin-ADNL')]
skin_samples

Unnamed: 0_level_0,g__Staphylococcus_ASV-1,g__Staphylococcus_ASV-2,g__Streptococcus_ASV-1,g__Micrococcus_ASV-1,g__Corynebacterium_ASV-2,g__Streptococcus_ASV-2,g___ASV-1,g__Prevotella_ASV-1,g___ASV-2,g__Streptococcus_ASV-3,...,g__Flavobacterium_ASV-25,g___ASV-434,g__Staphylococcus_ASV-33,g__Flavobacterium_ASV-8,g___ASV-489,g__Staphylococcus_ASV-34,g__Streptococcus_ASV-37,group,o_scorad,pid
#sample-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ca009STL,-0.203199,-1.301811,0.713092,0.713092,,,2.639770,,-0.385521,1.140536,...,,,,,,,,skin-ADL,40.0,Ca-009-ST
900221,,,,,-2.027367,,,,,,...,,,,,,,,skin-ADL,34.0,Ca-101-ID
Ca010EBL,3.054638,2.731238,0.803347,,,,2.019742,1.02649,-1.276094,0.515665,...,,,,,,,,skin-ADL,21.0,Ca-010-EB
900460,1.657654,1.657654,3.139258,-0.357249,,2.733793,,,,1.945336,...,,,,,,,,skin-ADL,40.0,Ca-146-SM
900051,3.756053,2.448896,2.021452,,,,,,,-1.379744,...,,,,,,,,skin-ADL,41.0,Ca-011-LQ
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9004032,,,2.749671,,,,,,,-0.746836,...,,,,,,,,skin-ADL,78.0,Ca-126-KB
Ca006ONL,3.315374,1.783898,1.090750,,-1.394155,,2.243430,,-1.394155,0.551754,...,,,,,,,,skin-ADL,34.0,Ca-006-ON
Ca006ONL2,,,,,,,,,,,...,,,,,,,,skin-ADL,34.0,Ca-006-ON
Ca006ONNL,3.708304,1.790251,0.729379,-0.774698,0.611596,,1.751030,,-0.369233,-1.467845,...,,,,,,,,skin-ADNL,34.0,Ca-006-ON


In [295]:
# One regression panel per taxon: RCLR abundance against SCORAD severity.
# squeeze=False keeps `axes` an array even when taxa_list has a single entry.
fig, axes = plt.subplots(
    1, len(taxa_list),
    figsize=(len(taxa_list) * 2, 3.5),
    sharey=True,
    squeeze=False
)
axes = axes.ravel()

# NOTE(review): several samples share one pid and therefore one SCORAD value,
# so the points are not independent; the p-values are also not corrected
# across taxa. Interpret them as descriptive.
for i, taxon in enumerate(taxa_list):
    ax = axes[i]

    # Drop missing values. A dedicated name is used so the global count
    # table `df` is not overwritten by this loop.
    severity_df = skin_samples[['o_scorad', taxon]].dropna()

    if len(severity_df) >= 2:
        # Plot regression
        sns.regplot(
            data=severity_df,
            x='o_scorad',
            y=taxon,
            scatter_kws={'alpha': 0.5, 's': 20},
            line_kws={'color': 'black'},
            ax=ax
        )

        # Compute Pearson correlation
        r, pval = pearsonr(severity_df['o_scorad'], severity_df[taxon])
        r_label = f"Pearson r = {r:.2f}\np = {pval:.1e}"
    else:
        # pearsonr needs at least two observations
        r_label = f"n = {len(severity_df)}\n(too few samples)"

    ax.text(0.05, 0.95, r_label, transform=ax.transAxes,
            fontsize=10, va='top', ha='left', bbox=dict(facecolor='white', alpha=0.7, edgecolor='none'))

    # Format axes
    # ax.set_title(taxon.replace(' g__', '').split('_')[0], fontsize=12)
    ax.set_title(taxon.replace('g__', ''), fontsize=10)
    ax.set_xlabel("SCORAD Severity", fontsize=12)
    if i == 0:
        ax.set_ylabel("RCLR-transformed abundance", fontsize=12)
    else:
        ax.set_ylabel("")
        ax.set_yticklabels([])

    # Fixed y-range shared by all panels; values outside it are not shown
    ax.set_ylim(-3, 7)

fig.tight_layout(rect=[0, 0, 1, 0.92])  # Leaves space for suptitle
fig.suptitle("Correlation Between Taxa on Skin and Atopic Dermatitis Severity", fontsize=14, y=0.98)
fig.savefig('../Plots/Analysis_figures/Severity_Correlations/rclr_abundance_vs_severity_ASV_skin.png', dpi=600)