# Strep and severity score correlations

In [2]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import biom
from biom import load_table
import numpy as np
from scipy.stats import spearmanr
from scipy.stats import mannwhitneyu
from statsmodels.stats.multitest import multipletests
import itertools
import matplotlib.patches
from scipy.stats import pearsonr



In [3]:
# Read the tab-separated sample metadata, strip underscores from the sample
# IDs, and index the frame by sample ID.
# NOTE(review): the underscores are presumably removed so the IDs match the
# feature-table sample IDs — confirm.
metadata_path = '../Data/Metadata/updated_clean_ant_skin_metadata.tab'
metadata = (
    pd.read_csv(metadata_path, sep='\t')
    .assign(**{'#sample-id': lambda frame: frame['#sample-id'].str.replace('_', '')})
    .set_index('#sample-id')
)

# Short group labels keyed by the verbose `case_type` strings; any case_type
# not listed here maps to NaN.
case_type_to_group = {
    'case-lesional skin': 'skin-ADL',
    'case-nonlesional skin': 'skin-ADNL',
    'control-nonlesional skin': 'skin-H',
    'case-anterior nares': 'nares-AD',
    'control-anterior nares': 'nares-H',
}
metadata['group'] = metadata['case_type'].map(case_type_to_group)

metadata

Unnamed: 0_level_0,PlateNumber,PlateLocation,i5,i5Sequence,i7,i7Sequence,identifier,Sequence,Plate ID,Well location,...,sex,enrolment_date,enrolment_season,hiv_exposure,hiv_status,household_size,o_scorad,FWD_filepath,REV_filepath,group
#sample-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ca009STL,1,A1,SA501,ATCGTACG,SA701,CGAGAGTT,SA701SA501,CGAGAGTT-ATCGTACG,1.010000e+21,A1,...,male,4/16/2015,Autumn,Unexposed,negative,4.0,40,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL
900221,1,B1,SA502,ACTATCTG,SA701,CGAGAGTT,SA701SA502,CGAGAGTT-ACTATCTG,1.010000e+21,B1,...,female,8/11/2015,Winter,Unexposed,negative,7.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL
Ca010EBL,1,C1,SA503,TAGCGAGT,SA701,CGAGAGTT,SA701SA503,CGAGAGTT-TAGCGAGT,1.010000e+21,C1,...,female,11/20/2014,Spring,Unexposed,negative,7.0,21,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL
900460,1,D1,SA504,CTGCGTGT,SA701,CGAGAGTT,SA701SA504,CGAGAGTT-CTGCGTGT,1.010000e+21,D1,...,female,9/23/2015,Spring,Unexposed,,4.0,40,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL
900051,1,E1,SA505,TCATCGAG,SA701,CGAGAGTT,SA701SA505,CGAGAGTT-TCATCGAG,1.010000e+21,E1,...,male,4/21/2015,Autumn,Unexposed,negative,7.0,41,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Ca006ONL2,6,H1,SA508,GACACCGT,SB701,CTCGACTT,SB701SA508,CTCGACTT-GACACCGT,1.010000e+21,H1,...,female,3/25/2015,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL
Ca006ONNL,6,F2,SA506,CGTGAGTG,SB702,CGAAGTAT,SB702SA506,CGAAGTAT-CGTGAGTG,1.010000e+21,F2,...,female,3/25/2015,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADNL
Ca006ONNL2,6,H2,SA508,GACACCGT,SB702,CGAAGTAT,SB702SA508,CGAAGTAT-GACACCGT,1.010000e+21,H2,...,female,3/25/2015,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADNL
Ca006ONPN,6,F3,SA506,CGTGAGTG,SB703,TAGCAGCT,SB703SA506,TAGCAGCT-CGTGAGTG,1.010000e+21,F3,...,female,3/25/2015,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,nares-AD


In [4]:
# Read in the genus-level feature table (per the file name: prevalence-filtered,
# rarefied, collapsed to Genus)
# biom_path = '../Data/Tables/Absolute_Abundance_Tables/df_16S_filtered_feature_table_rare_Genus_absolute.biom'
biom_path = '../Data/Tables/Absolute_Abundance_Tables/209766_filtered_by_prevalence_1pct_rare_Genus.biom'
biom_tbl = load_table(biom_path)
# Transpose so that rows are samples and columns are genera
df = biom_tbl.to_dataframe().T

# Delete the '15564.' prefix from the sample IDs. The pattern is anchored and
# the dot escaped so that only a leading literal '15564.' is removed, and the
# result does not depend on the pandas version's default for `regex`.
df.index = df.index.str.replace(r'^15564\.', '', regex=True)

# IDs of the samples whose metadata group starts with 'skin'. `na=False` treats
# samples without a group label as non-skin instead of putting NaN in the mask
# (which boolean indexing rejects).
skin_samples = metadata[metadata['group'].str.startswith('skin', na=False)].index

# Keep only skin samples (raises KeyError if a skin sample is missing from the table)
df = df.loc[skin_samples]
df

Unnamed: 0_level_0,g__Streptococcus,g__Staphylococcus,g__,g__Haemophilus_D_734546,g__Corynebacterium,g__Acinetobacter,g__Prevotella,g__Micrococcus,g__Dolosigranulum,g__Cutibacterium,...,g__Eubacterium_M,g__Lachnospira,g__Helcococcus,g__Enterenecus,g__Evtepia,g__Fimenecus,g__CAG-41,g__Selenomonas_B_42762,g__Facklamia_A_322620,g__Eubacterium_G
#sample-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ca009STL,65.0,7.0,170.0,0,5.0,20.0,0,14.0,0,19.0,...,0,0,0,0,0,0,0,0,0,0
900221,0,27.0,0,0,2.0,3.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Ca010EBL,22.0,132.0,65.0,0,19.0,31.0,24.0,0,0,6.0,...,0,0,0,0,0,0,0,0,0,0
900460,150.0,32.0,50.0,3.0,0,2.0,28.0,4.0,0,13.0,...,0,0,0,0,0,0,0,0,0,0
900051,33.0,222.0,24.0,0,10.0,0,0,0,0,13.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9004032,78.0,15.0,2.0,0,102.0,0,28.0,0,62.0,2.0,...,0,0,0,0,0,0,0,0,0,0
Ca006ONL,19.0,135.0,59.0,1.0,14.0,16.0,0,0,0,29.0,...,0,0,0,0,0,0,0,0,0,0
Ca006ONL2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Ca006ONNL,13.0,203.0,33.0,0,10.0,17.0,3.0,1.0,0,3.0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
def rclr_transform(df, pseudocount=1e-6):
    """
    Apply a robust centered log-ratio (RCLR) transform to a count table.

    Each row is centered on the mean of the logs of its own non-zero entries;
    zeros are treated as missing and come back as NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric table, dense or pandas-sparse. Rows are transformed
        independently (rows are samples in this notebook).
    pseudocount : float, default 1e-6
        Added to the non-zero entries before taking the log. Zeros are masked
        first, so the pseudocount is not what keeps the log defined.

    Returns
    -------
    pandas.DataFrame
        Same index and columns as `df`. Entries that were 0 are NaN, so a row
        made only of zeros is entirely NaN.
    """
    # Densify only when needed: the `.sparse` accessor raises AttributeError
    # on a frame that is dense or only partly sparse.
    sparse_flags = [isinstance(dtype, pd.SparseDtype) for dtype in df.dtypes]
    if sparse_flags and all(sparse_flags):
        df = df.sparse.to_dense()
    elif any(sparse_flags):
        df = df.apply(
            lambda col: col.sparse.to_dense() if isinstance(col.dtype, pd.SparseDtype) else col
        )

    # Replace 0 with NaN so zeros are ignored by both the log and the mean
    df_masked = df.replace(0, np.nan)

    # Log of the non-zero entries (NaN stays NaN)
    log_df = np.log(df_masked + pseudocount)

    # Center each row on the mean of its observed log values
    row_means = log_df.mean(axis=1, skipna=True)
    rclr_df = log_df.sub(row_means, axis=0)

    return rclr_df



# RCLR-transform the skin-sample table; zero counts become NaN in `rclr_df`
rclr_df = rclr_transform(df)

In [6]:
# Look up each sample's group label in metadata by sample ID and store it as a
# 'group' column alongside the transformed abundances
sample_groups = metadata.loc[rclr_df.index, 'group']
rclr_df['group'] = sample_groups
rclr_df

Unnamed: 0_level_0,g__Streptococcus,g__Staphylococcus,g__,g__Haemophilus_D_734546,g__Corynebacterium,g__Acinetobacter,g__Prevotella,g__Micrococcus,g__Dolosigranulum,g__Cutibacterium,...,g__Lachnospira,g__Helcococcus,g__Enterenecus,g__Evtepia,g__Fimenecus,g__CAG-41,g__Selenomonas_B_42762,g__Facklamia_A_322620,g__Eubacterium_G,group
#sample-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ca009STL,1.744767,-0.483710,2.706178,,-0.820182,0.566112,,0.209437,,0.514819,...,,,,,,,,,,skin-ADL
900221,,1.126537,,,-1.476152,-1.070687,,,,,...,,,,,,,,,,skin-ADL
Ca010EBL,1.055786,2.847545,2.139130,,0.909182,1.398730,1.142797,,,-0.243497,...,,,,,,,,,,skin-ADL
900460,3.510481,1.965581,2.411868,-0.401542,,-0.807007,1.832050,-0.113860,,1.064795,...,,,,,,,,,,skin-ADL
900051,1.435857,3.342027,1.117404,,0.241935,,,,,0.504299,...,,,,,,,,,,skin-ADL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9004032,2.337271,0.688612,-1.326291,,2.605535,,1.312766,,2.107696,-1.326291,...,,,,,,,,,,skin-ADL
Ca006ONL,1.128272,3.089107,2.261370,-1.816166,0.822890,0.956421,,,,1.551128,...,,,,,,,,,,skin-ADL
Ca006ONL2,,,,,,,,,,,...,,,,,,,,,,skin-ADL
Ca006ONNL,0.635278,3.383534,1.566836,,0.372913,0.903542,-0.831059,-1.929671,,-0.831059,...,,,,,,,,,,skin-ADNL


In [7]:
# Number of samples in each group
rclr_df['group'].value_counts()

group
skin-ADNL    111
skin-ADL     107
skin-H        87
Name: count, dtype: int64

In [8]:
def pairwise_mannwhitney_bh(df, feature, group_col='group'):
    """
    Compare `feature` between every pair of groups with a two-sided
    Mann-Whitney U test and adjust the p-values with Benjamini-Hochberg FDR.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the `feature` and `group_col` columns.
    feature : str
        Column holding the values to compare.
    group_col : str, default 'group'
        Column holding the group labels.

    Returns
    -------
    pandas.DataFrame
        One row per unordered pair of groups with the columns 'Comparison'
        ("g1 vs g2"), 'Pair' ((g1, g2) tuple), 'Raw p-value' and
        'BH-corrected p-value'. Empty when fewer than two groups are present.

    Notes
    -----
    Rows with a missing group label are ignored and missing feature values are
    dropped within each group. A pair in which either group has no values left
    cannot be tested: it gets NaN for both p-values and is left out of the
    multiple-testing correction.
    """
    # A NaN label never matches `==`, so it would only yield an empty sample
    groups = df[group_col].dropna().unique()
    values_by_group = {
        g: df.loc[df[group_col] == g, feature].dropna() for g in groups
    }

    pairs = list(itertools.combinations(groups, 2))
    labels = [f"{g1} vs {g2}" for g1, g2 in pairs]

    raw_pvals = np.full(len(pairs), np.nan)
    for idx, (g1, g2) in enumerate(pairs):
        x, y = values_by_group[g1], values_by_group[g2]
        if len(x) > 0 and len(y) > 0:
            _, p = mannwhitneyu(x, y, alternative='two-sided')
            raw_pvals[idx] = p

    # Correct only the p-values that could actually be computed
    pvals_corrected = np.full(len(pairs), np.nan)
    testable = ~np.isnan(raw_pvals)
    if testable.any():
        _, adjusted, _, _ = multipletests(raw_pvals[testable], method='fdr_bh')
        pvals_corrected[testable] = adjusted

    results_df = pd.DataFrame({
        'Comparison': labels,
        'Pair': pairs,
        'Raw p-value': raw_pvals,
        'BH-corrected p-value': pvals_corrected
    })

    return results_df

In [9]:
# Keep only the rows that belong to one of the three skin groups
skin_groups = ['skin-ADL', 'skin-ADNL', 'skin-H']
is_skin = rclr_df['group'].isin(skin_groups)
skin_samples = rclr_df.loc[is_skin]
skin_samples

Unnamed: 0_level_0,g__Streptococcus,g__Staphylococcus,g__,g__Haemophilus_D_734546,g__Corynebacterium,g__Acinetobacter,g__Prevotella,g__Micrococcus,g__Dolosigranulum,g__Cutibacterium,...,g__Lachnospira,g__Helcococcus,g__Enterenecus,g__Evtepia,g__Fimenecus,g__CAG-41,g__Selenomonas_B_42762,g__Facklamia_A_322620,g__Eubacterium_G,group
#sample-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ca009STL,1.744767,-0.483710,2.706178,,-0.820182,0.566112,,0.209437,,0.514819,...,,,,,,,,,,skin-ADL
900221,,1.126537,,,-1.476152,-1.070687,,,,,...,,,,,,,,,,skin-ADL
Ca010EBL,1.055786,2.847545,2.139130,,0.909182,1.398730,1.142797,,,-0.243497,...,,,,,,,,,,skin-ADL
900460,3.510481,1.965581,2.411868,-0.401542,,-0.807007,1.832050,-0.113860,,1.064795,...,,,,,,,,,,skin-ADL
900051,1.435857,3.342027,1.117404,,0.241935,,,,,0.504299,...,,,,,,,,,,skin-ADL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9004032,2.337271,0.688612,-1.326291,,2.605535,,1.312766,,2.107696,-1.326291,...,,,,,,,,,,skin-ADL
Ca006ONL,1.128272,3.089107,2.261370,-1.816166,0.822890,0.956421,,,,1.551128,...,,,,,,,,,,skin-ADL
Ca006ONL2,,,,,,,,,,,...,,,,,,,,,,skin-ADL
Ca006ONNL,0.635278,3.383534,1.566836,,0.372913,0.903542,-0.831059,-1.929671,,-0.831059,...,,,,,,,,,,skin-ADNL


In [10]:
# # Define custom color palette
# group_palette = {
#     'skin-H': '#ADD8E6',     # baby blue
#     'skin-ADNL': '#FFDAB9',  # peach
#     'skin-ADL': '#E31A1C'    # red
# }

# strip_palette = {
#     'skin-H': '#6CA6CD',     # darker baby blue
#     'skin-ADNL': '#E6AC8F',  # darker peach
#     'skin-ADL': '#A50000'    # darker red
# }


# # -----------------------
# # Streptococcus plot
# # -----------------------
# fig_strep, ax1 = plt.subplots(figsize=(4, 6))
# sns.boxplot(
#     data=skin_samples,
#     x='group',
#     y=' g__Streptococcus',
#     ax=ax1,
#     palette=group_palette,
#     width=0.8,
#     order=['skin-H', 'skin-ADNL', 'skin-ADL']  # desired order
# )

# sns.stripplot(
#     data=skin_samples,
#     x='group',
#     y=' g__Streptococcus',
#     ax=ax1,
#     order=['skin-H', 'skin-ADNL', 'skin-ADL'],
#     hue='group',
#     palette=strip_palette,
#     dodge=False,
#     jitter=True,
#     size=5,
#     alpha=0.8,
#     legend=False
# )


# ax1.set_title('Strep abundance', fontsize=14)
# ax1.set_xlabel('')
# ax1.set_ylabel('RCLR-transformed abundance', fontsize=12)
# ax1.set_xticklabels([f'H\n(n=87)', f'AD-NL\n(n=111)', f'AD-L\n(n=107)'], fontsize=12)


# # Annotate Streptococcus stats
# strep_stats = pairwise_mannwhitney_bh(skin_samples, ' g__Streptococcus')

# y_max = skin_samples[' g__Streptococcus'].max()
# offset = (y_max * 0.1)

# for i, row in strep_stats.iterrows():
#     g1, g2 = row['Pair']
#     pval = row['BH-corrected p-value']
#     xpos1, xpos2 = ['skin-H', 'skin-ADNL', 'skin-ADL'].index(g1), ['skin-H', 'skin-ADNL', 'skin-ADL'].index(g2)
#     x = (xpos1 + xpos2) / 2
#     y = y_max + offset * i

#     stars = '***' if pval < 0.001 else '**' if pval < 0.01 else '*' if pval < 0.05 else 'ns'
#     label = f"p = {pval:.3e} {stars}"
#     ax1.plot([xpos1, xpos1, xpos2, xpos2], [y-0.01, y, y, y-0.01], lw=1.5, color='black')
#     ax1.text(x, y + 0.01, label, ha='center', va='bottom', fontsize=10)

# fig_strep.tight_layout()
# fig_strep.savefig('../Plots/Analysis_figures/RCLR_Abundance/Strep_boxplot_annotated.png', dpi=600)


# # -----------------------
# # Staphylococcus plot
# # -----------------------
# fig_staph, ax2 = plt.subplots(figsize=(4, 6))
# sns.boxplot(
#     data=skin_samples,
#     x='group',
#     y=' g__Staphylococcus',
#     ax=ax2,
#     palette=group_palette,
#     width=0.75,
#     order=['skin-H', 'skin-ADNL', 'skin-ADL']  # desired order
# )

# sns.stripplot(
#     data=skin_samples,
#     x='group',
#     y=' g__Staphylococcus',
#     ax=ax2,
#     order=['skin-H', 'skin-ADNL', 'skin-ADL'],
#     hue='group',
#     palette=strip_palette,
#     dodge=False,
#     jitter=True,
#     size=5,
#     alpha=0.75,
#     legend=False
# )



# ax2.set_title('Staph abundance', fontsize=14)
# ax2.set_ylabel('RCLR-transformed abundance', fontsize=12)
# ax2.set_xlabel('')
# ax2.set_xticklabels([f'H\n(n=87)', f'AD-NL\n(n=111)', f'AD-L\n(n=107)'], fontsize=12)

# # Annotate Staphylococcus stats
# staph_stats = pairwise_mannwhitney_bh(skin_samples, ' g__Staphylococcus')

# y_max = skin_samples[' g__Staphylococcus'].max()
# offset = (y_max * 0.1)

# for i, row in staph_stats.iterrows():
#     g1, g2 = row['Pair']
#     pval = row['BH-corrected p-value']
#     xpos1, xpos2 = ['skin-H', 'skin-ADNL', 'skin-ADL'].index(g1), ['skin-H', 'skin-ADNL', 'skin-ADL'].index(g2)
#     x = (xpos1 + xpos2) / 2
#     y = y_max + offset * i

#     stars = '***' if pval < 0.001 else '**' if pval < 0.01 else '*' if pval < 0.05 else 'ns'
#     label = f"p = {pval:.3e} {stars}"
#     ax2.plot([xpos1, xpos1, xpos2, xpos2], [y-0.01, y, y, y-0.01], lw=1.5, color='black')
#     ax2.text(x, y + 0.01, label, ha='center', va='bottom', fontsize=10)

# fig_staph.tight_layout()
# fig_staph.savefig('../Plots/Analysis_figures/RCLR_Abundance/Staph_boxplot_annotated.png', dpi=600)


In [11]:
# def plot_multi_taxa_boxplots(data, taxa_list, group_palette, strip_palette, title_name, order=['skin-H', 'skin-ADNL', 'skin-ADL']):
#     n_taxa = len(taxa_list)
#     group_count = len(order)

#     # Prepare long format dataframe
#     plot_data = []
#     for i, taxon in enumerate(taxa_list):
#         for j, group in enumerate(order):
#             xpos = i * group_count + j
#             values = data.loc[data['group'] == group, taxon]
#             for v in values:
#                 plot_data.append({
#                     'x': xpos,
#                     'taxon': taxon.replace(' g__', '').split('_')[0],
#                     'value': v,
#                     'group': group
#                 })

#     plot_df = pd.DataFrame(plot_data)

#     # Plot
#     fig, ax = plt.subplots(figsize=(n_taxa * 2.5, 5))

#     sns.boxplot(
#         data=plot_df,
#         x='x',
#         y='value',
#         hue='group',
#         palette=group_palette,
#         width=0.5,
#         fliersize=0,
#         dodge=False,
#         ax=ax
#     )

#     sns.stripplot(
#         data=plot_df,
#         x='x',
#         y='value',
#         hue='group',
#         palette=strip_palette,
#         dodge=False,
#         jitter=True,
#         size=4,
#         alpha=0.75,
#         ax=ax
#     )

#     # Remove duplicated legends
#     handles, labels = ax.get_legend_handles_labels()
#     ax.legend(
#     handles[:3],
#     [g.replace('skin-', '') for g in order],
#     title='Sample',
#     loc='upper left',
#     fontsize=12, 
#     title_fontsize = 12,
#     bbox_to_anchor=(0.95, 1.1),  # (x, y) offset from the axes
#     borderaxespad=0,
#     frameon=True
# )

#     # X-tick labels per taxon group
#     xtick_positions = [i * group_count + 1 for i in range(n_taxa)]
#     xtick_labels = [taxon.replace(' g__', '').split('_')[0] for taxon in taxa_list]
#     ax.set_xticks(xtick_positions)
#     ax.set_xticklabels(xtick_labels, fontsize=15)

#     ax.set_xlim(-0.5, n_taxa * group_count - 0.5)
#     ax.set_ylabel("RCLR-transformed abundance", fontsize=14)
#     ax.tick_params(axis='y', labelsize=11)
#     ax.set_xlabel(" ")

#     ax.set_title(title_name, fontsize=18)

#     # ------------------------
#     # Annotate p-values
#     # ------------------------
#     for i, taxon in enumerate(taxa_list):
#         stats = pairwise_mannwhitney_bh(data, taxon)
#         y_max = data[taxon].max()
#         offset = y_max * 0.15

#         for k, row in stats.iterrows():
#             g1, g2 = row['Pair']
#             pval = row['BH-corrected p-value']
#             xpos1 = i * group_count + order.index(g1)
#             xpos2 = i * group_count + order.index(g2)
#             x = (xpos1 + xpos2) / 2
#             y = y_max + offset * k

#             stars = '***' if pval < 0.001 else '**' if pval < 0.01 else '*' if pval < 0.05 else ''
#             label = f"{pval:.1e} {stars}"

#             ax.plot([xpos1, xpos1, xpos2, xpos2], [y - 0.01, y, y, y - 0.01], lw=1, color='black')
#             ax.text(x, y + 0.01, label, ha='center', va='bottom', fontsize=11)

#     sns.despine()
#     fig.tight_layout()
#     return fig

In [12]:
# taxa_list = [' g__Staphylococcus', ' g__Streptococcus', ' g__Haemophilus_D_734546', ' g__Veillonella_A', ' g__Corynebacterium', 
#              ' g__Acinetobacter', ' g__Micrococcus', ' g__Veillonella_A']
# fig = plot_multi_taxa_boxplots(skin_samples, taxa_list, group_palette, strip_palette, 'Differential Bacteria on Skin by AD Status')
# fig.savefig('../Plots/Analysis_figures/RCLR_Abundance/rclr_multi_taxa_boxplot_all_taxa.png', dpi=600)


In [13]:
# taxa_list = [' g__Streptococcus', ' g__Haemophilus_D_734546', ' g__Veillonella_A', ' g__Corynebacterium', 
#              ' g__Acinetobacter', ' g__Micrococcus', ' g__Veillonella_A']
# fig = plot_multi_taxa_boxplots(skin_samples, taxa_list, group_palette, strip_palette, 'Significant Differential Taxa by AD Status')
# fig.savefig('../Plots/Analysis_figures/RCLR_Abundance/rclr_multi_taxa_boxplot_significant_taxa.png', dpi=600)


In [14]:
# taxa_list = [' g__Staphylococcus', ' g__Prevotella']
# fig = plot_multi_taxa_boxplots(skin_samples, taxa_list, group_palette, strip_palette, 'Non-Sig Differential Taxa by AD status')
# fig.savefig('../Plots/Analysis_figures/RCLR_Abundance/rclr_multi_taxa_boxplot_non-significant_taxa.png', dpi=600)


## Show correlation with lesion severity

In [15]:
# Attach each sample's 'o_scorad' value from metadata by sample ID (inner join
# on the index, so samples absent from metadata are dropped).
# An 'o_scorad' column left over from an earlier run of this cell is removed
# first; otherwise the merge would produce 'o_scorad_x'/'o_scorad_y' and the
# cells below, which read 'o_scorad', would fail.
skin_samples = (
    skin_samples
    .drop(columns='o_scorad', errors='ignore')
    .merge(metadata[['o_scorad']], left_index=True, right_index=True)
)
skin_samples

Unnamed: 0_level_0,g__Streptococcus,g__Staphylococcus,g__,g__Haemophilus_D_734546,g__Corynebacterium,g__Acinetobacter,g__Prevotella,g__Micrococcus,g__Dolosigranulum,g__Cutibacterium,...,g__Helcococcus,g__Enterenecus,g__Evtepia,g__Fimenecus,g__CAG-41,g__Selenomonas_B_42762,g__Facklamia_A_322620,g__Eubacterium_G,group,o_scorad
#sample-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ca009STL,1.744767,-0.483710,2.706178,,-0.820182,0.566112,,0.209437,,0.514819,...,,,,,,,,,skin-ADL,40
900221,,1.126537,,,-1.476152,-1.070687,,,,,...,,,,,,,,,skin-ADL,34
Ca010EBL,1.055786,2.847545,2.139130,,0.909182,1.398730,1.142797,,,-0.243497,...,,,,,,,,,skin-ADL,21
900460,3.510481,1.965581,2.411868,-0.401542,,-0.807007,1.832050,-0.113860,,1.064795,...,,,,,,,,,skin-ADL,40
900051,1.435857,3.342027,1.117404,,0.241935,,,,,0.504299,...,,,,,,,,,skin-ADL,41
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9004032,2.337271,0.688612,-1.326291,,2.605535,,1.312766,,2.107696,-1.326291,...,,,,,,,,,skin-ADL,78
Ca006ONL,1.128272,3.089107,2.261370,-1.816166,0.822890,0.956421,,,,1.551128,...,,,,,,,,,skin-ADL,34
Ca006ONL2,,,,,,,,,,,...,,,,,,,,,skin-ADL,34
Ca006ONNL,0.635278,3.383534,1.566836,,0.372913,0.903542,-0.831059,-1.929671,,-0.831059,...,,,,,,,,,skin-ADNL,34


In [16]:
# Parse 'o_scorad' as a number (anything unparseable becomes NaN), then keep
# only the samples that have a SCORAD value
scorad_numeric = pd.to_numeric(skin_samples['o_scorad'], errors='coerce')
skin_samples = (
    skin_samples
    .assign(o_scorad=scorad_numeric)
    .dropna(subset=['o_scorad'])
)



In [17]:
# Severity correlations use lesional AD skin only: keep the 'skin-ADL' rows
is_lesional = skin_samples['group'].eq('skin-ADL')
skin_samples = skin_samples.loc[is_lesional]
skin_samples

Unnamed: 0_level_0,g__Streptococcus,g__Staphylococcus,g__,g__Haemophilus_D_734546,g__Corynebacterium,g__Acinetobacter,g__Prevotella,g__Micrococcus,g__Dolosigranulum,g__Cutibacterium,...,g__Helcococcus,g__Enterenecus,g__Evtepia,g__Fimenecus,g__CAG-41,g__Selenomonas_B_42762,g__Facklamia_A_322620,g__Eubacterium_G,group,o_scorad
#sample-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ca009STL,1.744767,-0.483710,2.706178,,-0.820182,0.566112,,0.209437,,0.514819,...,,,,,,,,,skin-ADL,40.0
900221,,1.126537,,,-1.476152,-1.070687,,,,,...,,,,,,,,,skin-ADL,34.0
Ca010EBL,1.055786,2.847545,2.139130,,0.909182,1.398730,1.142797,,,-0.243497,...,,,,,,,,,skin-ADL,21.0
900460,3.510481,1.965581,2.411868,-0.401542,,-0.807007,1.832050,-0.113860,,1.064795,...,,,,,,,,,skin-ADL,40.0
900051,1.435857,3.342027,1.117404,,0.241935,,,,,0.504299,...,,,,,,,,,skin-ADL,41.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9004002,2.208196,3.189025,2.043893,-0.189699,1.419739,-1.288311,1.602060,0.097983,,1.014274,...,,,,,-1.288311,,,,skin-ADL,38.0
900403,2.789983,3.905690,1.987636,-0.171848,-0.171848,0.338978,0.926765,1.368597,,0.808982,...,,,,,,,,,skin-ADL,78.0
9004032,2.337271,0.688612,-1.326291,,2.605535,,1.312766,,2.107696,-1.326291,...,,,,,,,,,skin-ADL,78.0
Ca006ONL,1.128272,3.089107,2.261370,-1.816166,0.822890,0.956421,,,,1.551128,...,,,,,,,,,skin-ADL,34.0


In [18]:
# skin_samples = skin_samples.rename(columns={
#     col: col.replace(" g__Veillonella_A", " g__Veillonella")
#     for col in skin_samples.columns
#     if col.startswith(" g__Veillonella_A")
# })

In [19]:
# Genera to plot against SCORAD (column names carry a leading space)
taxa_list = [' g__Streptococcus', ' g__Staphylococcus', ' g__Micrococcus', ' g__Veillonella_A',
              ' g__Haemophilus_D_734546',]

# One panel per taxon with a shared y-axis
fig, axes = plt.subplots(1, len(taxa_list), figsize=(len(taxa_list) * 1.75, 4), sharey=True)
# plt.subplots returns a bare Axes when there is a single panel; make indexing uniform
axes = np.atleast_1d(axes)

# NOTE(review): IDs such as 'Ca006ONL' / 'Ca006ONL2' share a SCORAD value in the
# table displayed above, which looks like repeated samples from one participant.
# If so, the points are not independent — confirm before interpreting the p-values.
for i, taxon in enumerate(taxa_list):
    ax = axes[i]

    # Samples with both a SCORAD value and a non-zero (non-NaN) abundance.
    # Deliberately not named `df`: that name holds the feature table loaded
    # earlier, and overwriting it breaks re-running the RCLR cell.
    taxon_df = skin_samples[['o_scorad', taxon]].dropna()

    if len(taxon_df) < 2:
        # Neither a regression line nor a correlation is defined
        r_label = "not enough data"
    else:
        # Scatter plus linear regression fit
        sns.regplot(
            data=taxon_df,
            x='o_scorad',
            y=taxon,
            scatter_kws={'alpha': 0.5, 's': 20},
            line_kws={'color': 'black'},
            ax=ax
        )

        # Pearson correlation between SCORAD and the transformed abundance
        r, pval = pearsonr(taxon_df['o_scorad'], taxon_df[taxon])
        r_label = f"Pearson r = {r:.2f}\np = {pval:.2}"

    ax.text(0.05, 0.95, r_label, transform=ax.transAxes,
            fontsize=10, va='top', ha='left', bbox=dict(facecolor='white', alpha=0.7, edgecolor='none'))

    # Format axes
    ax.set_title(taxon.strip(), fontsize=10)

    ax.set_xlabel("SCORAD Severity", fontsize=10)
    if i == 0:
        ax.set_ylabel("RCLR-transformed abundance", fontsize=10)
    else:
        ax.set_ylabel("")
        # Hide tick labels on this panel only. set_yticklabels([]) would replace
        # the formatter that sharey=True shares between panels and can blank
        # the first panel's labels as well.
        ax.tick_params(labelleft=False)

    # Fixed range so every panel is directly comparable
    ax.set_ylim(-3, 7)

fig.tight_layout(rect=[0, 0, 1, 0.92])  # Leaves space for suptitle
fig.suptitle("Correlation Between Bacteria on Skin and AD Lesion Severity", fontsize=14, y=0.98)
fig.savefig('../Plots/Analysis_figures/Severity_Correlations/rclr_abundance_vs_severity_skin.png', dpi=600)