# Strep and severity score correlations

In [95]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import biom
from biom import load_table
import numpy as np
from scipy.stats import spearmanr
from scipy.stats import mannwhitneyu
from statsmodels.stats.multitest import multipletests
import itertools
import matplotlib.patches
from scipy.stats import pearsonr



In [96]:
# Location of the tab-separated sample metadata
metadata_path = '../Data/Metadata/updated_clean_ant_skin_metadata.tab'

# Short group labels keyed by the raw `case_type` values
case_type_to_group = {
    'case-lesional skin': 'skin-ADL',
    'case-nonlesional skin': 'skin-ADNL',
    'control-nonlesional skin': 'skin-H',
    'case-anterior nares': 'nares-AD',
    'control-anterior nares': 'nares-H'
}

# Read the table, strip underscores from the sample ids, then index by sample id
metadata = (
    pd.read_csv(metadata_path, sep='\t')
    .assign(**{'#sample-id': lambda frame: frame['#sample-id'].str.replace('_', '')})
    .set_index('#sample-id')
)

# Simplified group name per sample; case_type values missing from the mapping become NaN
metadata['group'] = metadata['case_type'].map(case_type_to_group)

metadata

Unnamed: 0_level_0,PlateNumber,PlateLocation,i5,i5Sequence,i7,i7Sequence,identifier,Sequence,Plate ID,Well location,...,sex,enrolment_date,enrolment_season,hiv_exposure,hiv_status,household_size,o_scorad,FWD_filepath,REV_filepath,group
#sample-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ca009STL,1,A1,SA501,ATCGTACG,SA701,CGAGAGTT,SA701SA501,CGAGAGTT-ATCGTACG,1.010000e+21,A1,...,male,4/16/2015,Autumn,Unexposed,negative,4.0,40,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL
900221,1,B1,SA502,ACTATCTG,SA701,CGAGAGTT,SA701SA502,CGAGAGTT-ACTATCTG,1.010000e+21,B1,...,female,8/11/2015,Winter,Unexposed,negative,7.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL
Ca010EBL,1,C1,SA503,TAGCGAGT,SA701,CGAGAGTT,SA701SA503,CGAGAGTT-TAGCGAGT,1.010000e+21,C1,...,female,11/20/2014,Spring,Unexposed,negative,7.0,21,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL
900460,1,D1,SA504,CTGCGTGT,SA701,CGAGAGTT,SA701SA504,CGAGAGTT-CTGCGTGT,1.010000e+21,D1,...,female,9/23/2015,Spring,Unexposed,,4.0,40,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL
900051,1,E1,SA505,TCATCGAG,SA701,CGAGAGTT,SA701SA505,CGAGAGTT-TCATCGAG,1.010000e+21,E1,...,male,4/21/2015,Autumn,Unexposed,negative,7.0,41,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Ca006ONL2,6,H1,SA508,GACACCGT,SB701,CTCGACTT,SB701SA508,CTCGACTT-GACACCGT,1.010000e+21,H1,...,female,3/25/2015,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL
Ca006ONNL,6,F2,SA506,CGTGAGTG,SB702,CGAAGTAT,SB702SA506,CGAAGTAT-CGTGAGTG,1.010000e+21,F2,...,female,3/25/2015,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADNL
Ca006ONNL2,6,H2,SA508,GACACCGT,SB702,CGAAGTAT,SB702SA508,CGAAGTAT-GACACCGT,1.010000e+21,H2,...,female,3/25/2015,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADNL
Ca006ONPN,6,F3,SA506,CGTGAGTG,SB703,TAGCAGCT,SB703SA506,TAGCAGCT-CGTGAGTG,1.010000e+21,F3,...,female,3/25/2015,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,nares-AD


In [97]:
# Read in table at ASV level; transposed so that rows are samples and columns are ASVs
# biom_path = '../Data/Tables/Absolute_Abundance_Tables/feature_table_with_tax_labels_Genus.biom'
# biom_path = '../Data/Tables/Absolute_Abundance_Tables/209766_filtered_by_prevalence_1pct_rare_Genus.biom'
biom_path = '../Data/Tables/Absolute_Abundance_Tables/209766_filtered_by_prevalence_1pct_rare_Genus-ASV-non-collapse.biom'
biom_tbl = load_table(biom_path)
df = biom_tbl.to_dataframe().T

# Delete the '15564.' prefix from the sample ids. regex=False makes the '.'
# a literal dot on every pandas version (older versions default to regex).
df.index = df.index.str.replace('15564.', '', regex=False)

# Subset to samples from one area. All body sites are kept here (nares
# samples included); skin-only filtering is done later on the 'group' column.
# na=False treats a missing area as "not Cape Town" instead of raising.
cape_town_samples = metadata[metadata['area'].str.startswith('Cape Town', na=False)].index

# Keep only the selected samples (raises KeyError if one is absent from the table)
df = df.loc[cape_town_samples]


# Group columns by genus (everything before _ASV) and sum the values
# df = df.groupby(lambda x: x.split('_ASV')[0], axis=1).sum()
df

Unnamed: 0_level_0,g__Streptococcus_ASV-1,g__Streptococcus_ASV-2,g__Corynebacterium_ASV-1,g__Corynebacterium_ASV-4,g___ASV-5,g__Cutibacterium_ASV-1,g___ASV-33,g___ASV-30,g___ASV-22,g__Cutibacterium_ASV-2,...,g__Copromonas_ASV-3,g__Microvirga_ASV-1,g__Leptotrichia_A_993758_ASV-17,g___ASV-103,g__Leptotrichia_A_993758_ASV-13,g__Capnocytophaga_820688_ASV-8,g__UBA6175_ASV-2,g__Comamonas_F_589250_ASV-3,g__Bosea_ASV-2,g___ASV-145
#sample-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ca009STL,41.0,0,0,0,0,29.0,0,0,0,9.0,...,0,0,0,0,0,0,0,0,0,0
Ca010EBL,26.0,0,0,0,0,27.0,0,0,0,16.0,...,0,0,0,0,0,0,0,0,0,0
900051,120.0,0,0,0,0,31.0,10.0,0,5.0,15.0,...,0,0,0,0,0,0,0,0,0,0
900057,111.0,0,0,0,0,3.0,17.0,0,0,4.0,...,0,0,0,0,0,0,0,0,0,0
Ca009STNL,7.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Ca006ONL2,0,1.0,0,0,0,5.0,0,4.0,0,7.0,...,0,0,0,0,0,0,0,0,0,0
Ca006ONNL,23.0,0,0,0,0,28.0,0,0,0,3.0,...,0,0,0,0,0,0,0,0,0,0
Ca006ONNL2,31.0,20.0,0,0,0,14.0,0,0,0,6.0,...,0,0,0,0,0,0,0,0,0,0
Ca006ONPN,0,0,286.0,0,13.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [98]:
def rclr_transform(df, pseudocount=1e-6):
    """
    Applies Robust Centered Log-Ratio (RCLR) transformation to a DataFrame.

    Zeros are treated as missing: they are excluded from the per-row mean of
    the log values and come back as NaN in the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric table; centering is done within each row (axis=1). Columns may
        be sparse, dense, or a mixture of both.
    pseudocount : float, default 1e-6
        Added to every non-zero value before taking the log. Zeros are masked
        beforehand, so the pseudocount never replaces a zero.

    Returns
    -------
    pandas.DataFrame
        Same index/columns as `df`, holding log(value + pseudocount) minus the
        mean of those logs over the non-zero entries of the same row.
    """
    # Densify only what is actually sparse: the DataFrame-level `.sparse`
    # accessor raises AttributeError unless every column is sparse.
    sparse_flags = [isinstance(dtype, pd.SparseDtype) for dtype in df.dtypes]
    if sparse_flags and all(sparse_flags):
        dense = df.sparse.to_dense()
    elif any(sparse_flags):
        dense = df.apply(
            lambda col: col.sparse.to_dense() if isinstance(col.dtype, pd.SparseDtype) else col
        )
    else:
        dense = df

    # Replace 0 with np.nan so zeros are ignored by the log and the mean
    df_masked = dense.replace(0, np.nan)

    # Log of the non-zero values (pseudocount is added to each of them)
    log_df = np.log(df_masked + pseudocount)

    # Subtract each row's mean log value, computed over non-NaN entries only
    rclr_df = log_df.sub(log_df.mean(axis=1, skipna=True), axis=0)

    return rclr_df



# RCLR-transform the feature table with the default pseudocount; zero counts become NaN
rclr_df = rclr_transform(df)

In [99]:
# Map the 'group' column from metadata to rclr_df based on matching index.
# .loc raises KeyError if a sample id in rclr_df is missing from metadata.
rclr_df['group'] = metadata.loc[rclr_df.index, 'group']
rclr_df

Unnamed: 0_level_0,g__Streptococcus_ASV-1,g__Streptococcus_ASV-2,g__Corynebacterium_ASV-1,g__Corynebacterium_ASV-4,g___ASV-5,g__Cutibacterium_ASV-1,g___ASV-33,g___ASV-30,g___ASV-22,g__Cutibacterium_ASV-2,...,g__Microvirga_ASV-1,g__Leptotrichia_A_993758_ASV-17,g___ASV-103,g__Leptotrichia_A_993758_ASV-13,g__Capnocytophaga_820688_ASV-8,g__UBA6175_ASV-2,g__Comamonas_F_589250_ASV-3,g__Bosea_ASV-2,g___ASV-145,group
#sample-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ca009STL,0.826373,,,,,0.480097,,,,-0.689974,...,,,,,,,,,,skin-ADL
Ca010EBL,0.804844,,,,,0.842585,,,,0.319337,...,,,,,,,,,,skin-ADL
900051,2.173682,,,,,0.820178,-0.311224,,-1.004371,0.094241,...,,,,,,,,,,skin-ADL
900057,2.355164,,,,,-1.255754,0.478847,,,-0.968072,...,,,,,,,,,,skin-ADL
Ca009STNL,-1.334073,,,,,,,,,,...,,,,,,,,,,skin-ADNL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Ca006ONL2,,-1.390625,,,,0.218813,,-0.004331,,0.555285,...,,,,,,,,,,skin-ADL
Ca006ONNL,0.252379,,,,,0.449089,,,,-1.784503,...,,,,,,,,,,skin-ADNL
Ca006ONNL2,0.580610,0.142356,,,,-0.214319,,,,-1.061617,...,,,,,,,,,,skin-ADNL
Ca006ONPN,,,2.152846,,-0.938196,,,,,,...,,,,,,,,,,nares-AD


In [100]:
def pairwise_mannwhitney_bh(df, feature, group_col='group'):
    """
    Two-sided Mann-Whitney U test for every pair of groups, BH-corrected.

    NOTE: a later cell in this notebook defines a function with the same name,
    which replaces this one once that cell has run.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the `feature` column and the `group_col` column.
    feature : str
        Column whose values are compared; NaN values are dropped per group.
    group_col : str, default 'group'
        Column holding the group label of each row.

    Returns
    -------
    pandas.DataFrame
        One row per tested pair with columns 'Comparison', 'Pair',
        'Raw p-value' and 'BH-corrected p-value'. Empty (columns only) when no
        pair could be tested.
    """
    result_columns = ['Comparison', 'Pair', 'Raw p-value', 'BH-corrected p-value']
    groups = df[group_col].unique()
    comparisons = list(itertools.combinations(groups, 2))

    raw_pvals = []
    labels = []
    pairs = []

    for g1, g2 in comparisons:
        x = df[df[group_col] == g1][feature].dropna()
        y = df[df[group_col] == g2][feature].dropna()

        # A group with no observed values cannot be tested; skip the pair
        # instead of handing an empty sample to mannwhitneyu
        if len(x) == 0 or len(y) == 0:
            continue

        _, p = mannwhitneyu(x, y, alternative='two-sided')
        raw_pvals.append(p)
        labels.append(f"{g1} vs {g2}")
        pairs.append((g1, g2))

    # Nothing to correct: return an empty table with the usual columns
    if len(raw_pvals) == 0:
        return pd.DataFrame(columns=result_columns)

    # Benjamini-Hochberg correction across the tested pairs of this feature
    _, pvals_corrected, _, _ = multipletests(raw_pvals, method='fdr_bh')

    results_df = pd.DataFrame({
        'Comparison': labels,
        'Pair': pairs,
        'Raw p-value': raw_pvals,
        'BH-corrected p-value': pvals_corrected
    })

    return results_df

In [101]:
# Filter for skin samples: keep rows whose group is one of the three skin
# groups (rows with any other group label, e.g. nares, are dropped)
skin_samples = rclr_df[rclr_df['group'].isin(['skin-ADL', 'skin-ADNL', 'skin-H'])]
# Alternative kept for reference: AD skin groups only
# skin_samples = rclr_df[rclr_df['group'].isin(['skin-ADL', 'skin-ADNL'])]

skin_samples

Unnamed: 0_level_0,g__Streptococcus_ASV-1,g__Streptococcus_ASV-2,g__Corynebacterium_ASV-1,g__Corynebacterium_ASV-4,g___ASV-5,g__Cutibacterium_ASV-1,g___ASV-33,g___ASV-30,g___ASV-22,g__Cutibacterium_ASV-2,...,g__Microvirga_ASV-1,g__Leptotrichia_A_993758_ASV-17,g___ASV-103,g__Leptotrichia_A_993758_ASV-13,g__Capnocytophaga_820688_ASV-8,g__UBA6175_ASV-2,g__Comamonas_F_589250_ASV-3,g__Bosea_ASV-2,g___ASV-145,group
#sample-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ca009STL,0.826373,,,,,0.480097,,,,-0.689974,...,,,,,,,,,,skin-ADL
Ca010EBL,0.804844,,,,,0.842585,,,,0.319337,...,,,,,,,,,,skin-ADL
900051,2.173682,,,,,0.820178,-0.311224,,-1.004371,0.094241,...,,,,,,,,,,skin-ADL
900057,2.355164,,,,,-1.255754,0.478847,,,-0.968072,...,,,,,,,,,,skin-ADL
Ca009STNL,-1.334073,,,,,,,,,,...,,,,,,,,,,skin-ADNL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
900135,1.204565,0.916883,,,,0.799100,,,-0.587195,0.565485,...,,,,,,,,,,skin-H
Ca006ONL,0.730854,,,,,1.870289,,,,0.260851,...,,,,,,,,,,skin-ADL
Ca006ONL2,,-1.390625,,,,0.218813,,-0.004331,,0.555285,...,,,,,,,,,,skin-ADL
Ca006ONNL,0.252379,,,,,0.449089,,,,-1.784503,...,,,,,,,,,,skin-ADNL


In [102]:
def pairwise_mannwhitney_bh(df, feature, group_col='group'):
    """
    Run a two-sided Mann-Whitney U test on `feature` for each pair of groups
    found in `group_col` and adjust the p-values with Benjamini-Hochberg.

    Pairs in which either group has no non-NaN values are skipped. Returns a
    DataFrame with columns 'Comparison', 'Pair', 'Raw p-value' and
    'BH-corrected p-value'; it has no rows when no pair could be tested.
    """
    column_names = ['Comparison', 'Pair', 'Raw p-value', 'BH-corrected p-value']

    # One (label, pair, p-value) record per pair that could be tested
    tested = []
    for first, second in itertools.combinations(df[group_col].unique(), 2):
        first_vals = df[df[group_col] == first][feature].dropna()
        second_vals = df[df[group_col] == second][feature].dropna()

        # Nothing to compare when a group has no observed values
        if not len(first_vals) or not len(second_vals):
            continue

        _, p_value = mannwhitneyu(first_vals, second_vals, alternative='two-sided')
        tested.append((f"{first} vs {second}", (first, second), p_value))

    if not tested:
        return pd.DataFrame(columns=column_names)

    labels = [record[0] for record in tested]
    pairs = [record[1] for record in tested]
    raw_pvals = [record[2] for record in tested]

    # BH adjustment over the pairs tested for this feature
    _, pvals_corrected, _, _ = multipletests(raw_pvals, method='fdr_bh')

    return pd.DataFrame(dict(zip(column_names, [labels, pairs, raw_pvals, pvals_corrected])))


In [103]:
# Define custom color palette: one fill colour per skin group (hex RGB)
group_palette = {
    'skin-H': '#ADD8E6',     # baby blue
    'skin-ADNL': '#FFDAB9',  # peach
    'skin-ADL': '#E31A1C'    # red
}

# Darker shade of each group colour; passed as `strip_palette` to the plotting function
strip_palette = {
    'skin-H': '#6CA6CD',     # darker baby blue
    'skin-ADNL': '#E6AC8F',  # darker peach
    'skin-ADL': '#A50000'    # darker red
}


In [104]:
# def plot_multi_taxa_boxplots(data, taxa_list, group_palette, strip_palette, title_name, order=['skin-H', 'skin-ADNL', 'skin-ADL']):
#     n_taxa = len(taxa_list)
#     group_count = len(order)

#     # Prepare long format dataframe
#     plot_data = []
#     for i, taxon in enumerate(taxa_list):
#         for j, group in enumerate(order):
#             xpos = i * group_count + j
#             values = data.loc[data['group'] == group, taxon]
#             for v in values:
#                 plot_data.append({
#                     'x': xpos,
#                     'taxon': taxon.replace(' g__', '').split('_')[0],
#                     'value': v,
#                     'group': group
#                 })

#     plot_df = pd.DataFrame(plot_data)

#     # Dynamically adjust figure width
#     width_per_taxon = 3
#     fig_width = max(5, min(n_taxa * width_per_taxon, 25))
#     fig, ax = plt.subplots(figsize=(fig_width, 5))

#     sns.boxplot(
#         data=plot_df,
#         x='x',
#         y='value',
#         hue='group',
#         palette=group_palette,
#         width=0.5,
#         fliersize=0,
#         dodge=False,
#         ax=ax
#     )

#     sns.stripplot(
#         data=plot_df,
#         x='x',
#         y='value',
#         hue='group',
#         palette=strip_palette,
#         dodge=False,
#         jitter=True,
#         size=4,
#         alpha=0.75,
#         ax=ax
#     )

#     # Remove duplicated legends
#     handles, labels = ax.get_legend_handles_labels()

#     # Count samples per group
#     group_counts = {g: data[data['group'] == g].shape[0] for g in order}

#     # Create updated legend labels
#     legend_labels = [f"{g.replace('skin-', '')} (n={group_counts[g]})" for g in order]

#     # Dynamically adjust legend position
#     legend_y_offset = -0.35
#     ax.legend(
#         handles[:3],
#         legend_labels,
#         loc='lower center',
#         bbox_to_anchor=(0.5, legend_y_offset),
#         ncol=len(order),
#         fontsize=12,
#         title_fontsize=10,
#         frameon=True,
#         borderaxespad=0
#     )
#     # ax.legend('')

#     # X-tick labels per taxon group
#     xtick_positions = [i * group_count + 1 for i in range(n_taxa)]
#     xtick_labels = [taxon.replace('g__', '').replace('_ASV-', '\nASV-') for taxon in taxa_list]
#     xtick_fontsize = 16 #if n_taxa > 2 else 16
#     ax.set_xticks(xtick_positions)
#     ax.set_xticklabels(xtick_labels, fontsize=xtick_fontsize)

#     ax.set_xlim(-0.5, n_taxa * group_count - 0.5)
#     ax.set_ylabel("RCLR-transformed abundance", fontsize=16)
#     ax.tick_params(axis='y', labelsize=16)
#     ax.set_xlabel(" ")

#     ax.set_title(title_name, fontsize=18, pad=16)

#     # ------------------------
#     # Annotate p-values
#     # ------------------------
#     for i, taxon in enumerate(taxa_list):
#         stats = pairwise_mannwhitney_bh(data, taxon)
#         y_max = data[taxon].max()
#         offset = y_max * 0.25

#         for k, row in stats.iterrows():
#             g1, g2 = row['Pair']
#             pval = row['BH-corrected p-value']
#             xpos1 = i * group_count + order.index(g1)
#             xpos2 = i * group_count + order.index(g2)
#             x = (xpos1 + xpos2) / 2
#             y = y_max + offset * k

#             stars = '***' if pval < 0.001 else '**' if pval < 0.01 else '*' if pval < 0.05 else ''
#             label = f"{pval:.1e} {stars}"

#             ax.plot([xpos1, xpos1, xpos2, xpos2], [y - 0.01, y, y, y - 0.01], lw=1, color='black')
#             ax.text(x, y + 0.01, label, ha='center', va='bottom', fontsize=14)

#     sns.despine()
#     fig.subplots_adjust(bottom=0.25)
#     return fig


In [105]:
# taxa_list = ['g__Streptococcus_ASV-1', 'g__Streptococcus_ASV-2', 'g__Staphylococcus_ASV-1', 'g__Micrococcus_ASV-1', 'g__Veillonella_A_ASV-1']

# # fig = plot_multi_taxa_boxplots(skin_samples, taxa_list, group_palette, strip_palette, 'Differential Skin Taxa of Umtata Samples')
# # fig.savefig('../Plots/Analysis_figures/RCLR_Abundance/rclr_multi_taxa_boxplot_all_taxa_ASV_skin_Umtata.png', dpi=600)


In [106]:
# def plot_multi_taxa_boxplots_vertical(data, taxa_list, group_palette, strip_palette, title_name, order=['skin-H', 'skin-ADNL', 'skin-ADL']):

#     n_taxa = len(taxa_list)
#     fig, axes = plt.subplots(nrows=n_taxa, ncols=1, figsize=(3, 3 * n_taxa), sharex=True)

#     if n_taxa == 1:
#         axes = [axes]

#     for i, taxon in enumerate(taxa_list):
#         ax = axes[i]
#         df = data[['group', taxon]].dropna().copy()
        
#         df['taxon'] = taxon.replace('g__', '').replace('_ASV-', ' ASV-')


#         sns.boxplot(
#             data=df,
#             x='group',
#             y=taxon,
#             order=order,
#             palette=group_palette,
#             ax=ax,
#             width=0.5,
#             fliersize=0
#         )

#         sns.stripplot(
#             data=df,
#             x='group',
#             y=taxon,
#             order=order,
#             palette=strip_palette,
#             ax=ax,
#             jitter=True,
#             size=4,
#             alpha=0.7
#         )

#         ax.set_title(df['taxon'].iloc[0], fontsize=16, y = 1.05)
#         # ax.set_ylabel("RCLR abundance Cape Town", fontsize=14)
#         ax.set_ylabel("")

#         ax.tick_params(axis='y', labelsize=10)
#         # Increase x-tick label size
#         ax.tick_params(axis='x', labelsize=10)
#         ax.set_xlabel("")
#         ax.set_xticklabels(['H', 'ADNL', 'ADL'], fontsize=14)


#         # P-values
#         stats = pairwise_mannwhitney_bh(data, taxon)
#         y_max = data[taxon].max()
#         offset = y_max * 0.2
#         for k, row in stats.iterrows():
#             g1, g2 = row['Pair']
#             pval = row['BH-corrected p-value']
#             stars = '***' if pval < 0.001 else '**' if pval < 0.01 else '*' if pval < 0.05 else ''
#             xpos1 = order.index(g1)
#             xpos2 = order.index(g2)
#             x = (xpos1 + xpos2) / 2
#             y = y_max + offset * k
#             ax.plot([xpos1, xpos1, xpos2, xpos2], [y-0.01, y, y, y-0.01], lw=1, color='black')
#             ax.text(x, y + 0.01, f"{pval:.1e} {stars}", ha='center', fontsize=12)

#     # fig.suptitle(title_name, fontsize=18, y=0.95)
#     fig.text(0.05, 0.5, 'RCLR Relative Abundance Umtata', va='center', rotation='vertical', fontsize=16)
#     plt.tight_layout(rect=[0.08, 0, 1, 0.96])
#     plt.subplots_adjust(hspace=0.3)

#     sns.despine()

#     return fig


In [107]:
# taxa_list = ['g__Streptococcus_ASV-1', 'g__Streptococcus_ASV-2', 'g__Staphylococcus_ASV-1', 'g__Micrococcus_ASV-1', 'g__Veillonella_A_ASV-1']

# fig = plot_multi_taxa_boxplots_vertical(skin_samples, taxa_list, group_palette, strip_palette, 'Differential Skin Taxa of Umtata Samples')
# fig.savefig('../Plots/Analysis_figures/RCLR_Abundance/rclr_multi_taxa_boxplot_all_taxa_ASV_skin_Umtata.png', dpi=600)


In [108]:
# taxa_list = ['g__Streptococcus_ASV-1', 'g__Streptococcus_ASV-2']

# fig = plot_multi_taxa_boxplots_vertical(skin_samples, taxa_list, group_palette, strip_palette, 'Differential Taxa')
# fig.savefig('../Plots/Analysis_figures/RCLR_Abundance/rclr_multi_taxa_boxplot_all_taxa_ASV_skin_Cape Town.png', dpi=600)


## Show correlation with lesion severity

In [109]:
# Attach o_scorad, pid and area from metadata to skin_samples, matched on the
# sample-id index (inner join, done in a single merge).
# Any copies left by a previous run of this cell are dropped first, so
# re-running it does not create suffixed duplicates (o_scorad_x / o_scorad_y).
severity_columns = ['o_scorad', 'pid', 'area']
skin_samples = (
    skin_samples
    .drop(columns=severity_columns, errors='ignore')
    .merge(metadata[severity_columns], left_index=True, right_index=True)
)

skin_samples

Unnamed: 0_level_0,g__Streptococcus_ASV-1,g__Streptococcus_ASV-2,g__Corynebacterium_ASV-1,g__Corynebacterium_ASV-4,g___ASV-5,g__Cutibacterium_ASV-1,g___ASV-33,g___ASV-30,g___ASV-22,g__Cutibacterium_ASV-2,...,g__Leptotrichia_A_993758_ASV-13,g__Capnocytophaga_820688_ASV-8,g__UBA6175_ASV-2,g__Comamonas_F_589250_ASV-3,g__Bosea_ASV-2,g___ASV-145,group,o_scorad,pid,area
#sample-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ca009STL,0.826373,,,,,0.480097,,,,-0.689974,...,,,,,,,skin-ADL,40,Ca-009-ST,Cape Town
Ca010EBL,0.804844,,,,,0.842585,,,,0.319337,...,,,,,,,skin-ADL,21,Ca-010-EB,Cape Town
900051,2.173682,,,,,0.820178,-0.311224,,-1.004371,0.094241,...,,,,,,,skin-ADL,41,Ca-011-LQ,Cape Town
900057,2.355164,,,,,-1.255754,0.478847,,,-0.968072,...,,,,,,,skin-ADL,33,Ca-013-NN,Cape Town
Ca009STNL,-1.334073,,,,,,,,,,...,,,,,,,skin-ADNL,40,Ca-009-ST,Cape Town
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
900135,1.204565,0.916883,,,,0.799100,,,-0.587195,0.565485,...,,,,,,,skin-H,,Co-016-CB,Cape Town
Ca006ONL,0.730854,,,,,1.870289,,,,0.260851,...,,,,,,,skin-ADL,34,Ca-006-ON,Cape Town
Ca006ONL2,,-1.390625,,,,0.218813,,-0.004331,,0.555285,...,,,,,,,skin-ADL,34,Ca-006-ON,Cape Town
Ca006ONNL,0.252379,,,,,0.449089,,,,-1.784503,...,,,,,,,skin-ADNL,34,Ca-006-ON,Cape Town


In [110]:
# Convert 'o_scorad' to numeric (coerce errors to NaN).
# The values are read from metadata and aligned to skin_samples on the
# sample-id index when assigned.
skin_samples['o_scorad'] = pd.to_numeric(metadata['o_scorad'], errors='coerce')

# Dropping rows with missing SCORAD values is disabled here, so rows with a
# NaN o_scorad stay in skin_samples
# skin_samples = skin_samples.dropna(subset=['o_scorad'])
skin_samples

Unnamed: 0_level_0,g__Streptococcus_ASV-1,g__Streptococcus_ASV-2,g__Corynebacterium_ASV-1,g__Corynebacterium_ASV-4,g___ASV-5,g__Cutibacterium_ASV-1,g___ASV-33,g___ASV-30,g___ASV-22,g__Cutibacterium_ASV-2,...,g__Leptotrichia_A_993758_ASV-13,g__Capnocytophaga_820688_ASV-8,g__UBA6175_ASV-2,g__Comamonas_F_589250_ASV-3,g__Bosea_ASV-2,g___ASV-145,group,o_scorad,pid,area
#sample-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ca009STL,0.826373,,,,,0.480097,,,,-0.689974,...,,,,,,,skin-ADL,40.0,Ca-009-ST,Cape Town
Ca010EBL,0.804844,,,,,0.842585,,,,0.319337,...,,,,,,,skin-ADL,21.0,Ca-010-EB,Cape Town
900051,2.173682,,,,,0.820178,-0.311224,,-1.004371,0.094241,...,,,,,,,skin-ADL,41.0,Ca-011-LQ,Cape Town
900057,2.355164,,,,,-1.255754,0.478847,,,-0.968072,...,,,,,,,skin-ADL,33.0,Ca-013-NN,Cape Town
Ca009STNL,-1.334073,,,,,,,,,,...,,,,,,,skin-ADNL,40.0,Ca-009-ST,Cape Town
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
900135,1.204565,0.916883,,,,0.799100,,,-0.587195,0.565485,...,,,,,,,skin-H,,Co-016-CB,Cape Town
Ca006ONL,0.730854,,,,,1.870289,,,,0.260851,...,,,,,,,skin-ADL,34.0,Ca-006-ON,Cape Town
Ca006ONL2,,-1.390625,,,,0.218813,,-0.004331,,0.555285,...,,,,,,,skin-ADL,34.0,Ca-006-ON,Cape Town
Ca006ONNL,0.252379,,,,,0.449089,,,,-1.784503,...,,,,,,,skin-ADNL,34.0,Ca-006-ON,Cape Town


In [111]:
# Genus-level names (note the leading space in each string).
# NOTE(review): only the commented-out regression cell below reads this list;
# taxa_list is reassigned to ASV-level names before the final figure is drawn.
taxa_list = [' g__Streptococcus', ' g__Staphylococcus']


In [112]:
# fig, axes = plt.subplots(1, len(taxa_list), figsize=(len(taxa_list) * 2, 3.5), sharey=True)

# for i, taxon in enumerate(taxa_list):
#     ax = axes[i]
    
#     # Drop missing values
#     df = skin_samples[['o_scorad', taxon]].dropna()

#     # Plot regression
#     sns.regplot(
#         data=df,
#         x='o_scorad',
#         y=taxon,
#         scatter_kws={'alpha': 0.5, 's': 20},
#         line_kws={'color': 'black'},
#         ax=ax
#     )

#     # Compute Pearson correlation
#     r, pval = pearsonr(df['o_scorad'], df[taxon])
#     r_label = f"Pearson r = {r:.2f}\np = {pval:.1e}"
#     ax.text(0.05, 0.95, r_label, transform=ax.transAxes,
#             fontsize=10, va='top', ha='left', bbox=dict(facecolor='white', alpha=0.7, edgecolor='none'))

#     # Format axes
#     # ax.set_title(taxon.replace(' g__', '').split('_')[0], fontsize=12)
#     ax.set_title(taxon.replace('g__', ''), fontsize=10)
#     ax.set_xlabel("SCORAD Severity", fontsize=12)
#     if i == 0:
#         ax.set_ylabel("RCLR-transformed abundance", fontsize=12)
#     else:
#         ax.set_ylabel("")
#         ax.set_yticklabels([])

#     ax.set_ylim(-3, 7)

# plt.tight_layout(rect=[0, 0, 1, 0.92])  # Leaves space for suptitle
# plt.suptitle("Correlation Between Taxa on Skin and Atopic Dermatitis Severity", fontsize=14, y=0.98)
# plt.savefig('../Plots/Analysis_figures/Severity_Correlations/rclr_abundance_vs_severity_skin_x.png', dpi=600)

In [113]:
def plot_combined_box_and_severity(data, taxa_list, group_palette, strip_palette, title_name, order=['skin-H', 'skin-ADNL', 'skin-ADL']):
    """
    Draw one figure row per taxon: a per-group boxplot with pairwise p-values
    (left) and a regression of the taxon against SCORAD (right).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'group' column, an 'o_scorad' column and one column per
        entry of `taxa_list`.
    taxa_list : list of str
        Columns to plot, one row each. Must not be empty.
    group_palette, strip_palette : dict
        Passed to seaborn as `palette` for the boxes and for the overlaid
        points, respectively.
    title_name : str
        Figure suptitle.
    order : list of str
        Groups shown in the boxplot, left to right. Rows of `data` whose group
        is not listed are ignored for the boxplot and its p-values. Tick
        labels are the group names with the 'skin-' prefix removed.

    Returns
    -------
    matplotlib.figure.Figure

    Raises
    ------
    ValueError
        If `taxa_list` is empty.

    Notes
    -----
    P-values come from `pairwise_mannwhitney_bh` (BH-corrected within each
    taxon). The correlation panel always uses the 'skin-ADNL' and 'skin-ADL'
    rows and is left without a fit when fewer than two complete
    (o_scorad, taxon) pairs are available.
    """
    # Function-scope imports kept so the function also runs when its cell is
    # executed on its own
    import matplotlib.pyplot as plt
    import seaborn as sns
    from scipy.stats import pearsonr
    import pandas as pd

    order = list(order)
    n_taxa = len(taxa_list)
    if n_taxa == 0:
        raise ValueError("taxa_list must contain at least one taxon")

    # squeeze=False always returns a 2-D (n_taxa, 2) array of axes, so a single
    # taxon is indexed exactly like several
    fig, axes = plt.subplots(
        nrows=n_taxa,
        ncols=2,
        figsize=(6, 3 * n_taxa),
        sharey='row',
        gridspec_kw={'width_ratios': [2, 1.5]},  # left : right panel width = 2 : 1.5
        squeeze=False
    )

    # Only groups that appear on the x-axis take part in the boxplot and tests
    data_in_order = data[data['group'].isin(order)]
    severity_groups = ['skin-ADNL', 'skin-ADL']
    tick_labels = [group.replace('skin-', '') for group in order]
    row_titles = []

    for i, taxon in enumerate(taxa_list):
        ax_box = axes[i, 0]
        ax_corr = axes[i, 1]

        row_titles.append(taxon.replace('g__', '').replace('_ASV-', ' ASV-'))

        # Prepare boxplot data
        df_box = data_in_order[['group', taxon]].dropna(subset=[taxon]).copy()
        df_box['group'] = pd.Categorical(df_box['group'], categories=order, ordered=True)

        # Prepare severity correlation data (only AD samples)
        df_corr = data[data['group'].isin(severity_groups)][['o_scorad', taxon]].dropna()

        # --- Left panel: Boxplot ---
        if df_box.empty:
            # Nothing to draw: still set up the categorical axis so the tick
            # labels below have ticks to attach to
            ax_box.set_xlim(-0.5, len(order) - 0.5)
            ax_box.set_xticks(list(range(len(order))))
        else:
            sns.boxplot(
                data=df_box,
                x='group',
                y=taxon,
                order=order,
                palette=group_palette,
                ax=ax_box,
                width=0.5,
                fliersize=0
            )

            sns.stripplot(
                data=df_box,
                x='group',
                y=taxon,
                order=order,
                palette=strip_palette,
                ax=ax_box,
                jitter=True,
                size=4,
                alpha=0.7
            )

            # P-values: one bracket per tested pair, stacked upwards from y_max
            stats = pairwise_mannwhitney_bh(data_in_order, taxon)
            y_max = df_box[taxon].max()
            # abs() keeps the brackets stacking upwards when y_max is negative;
            # the constant fallback avoids overlapping brackets when y_max is 0
            offset = abs(y_max) * 0.1
            if offset == 0:
                offset = 0.1

            for k, row in stats.iterrows():
                g1, g2 = row['Pair']
                pval = row['BH-corrected p-value']
                stars = '***' if pval < 0.001 else '**' if pval < 0.01 else '*' if pval < 0.05 else ''
                xpos1 = order.index(g1)
                xpos2 = order.index(g2)
                x = (xpos1 + xpos2) / 2
                y = y_max + offset * k
                ax_box.plot([xpos1, xpos1, xpos2, xpos2], [y - 0.01, y, y, y - 0.01], lw=1, color='black')
                ax_box.text(x, y, f"{pval:.1e} {stars}", ha='center', fontsize=12)

        ax_box.set_ylabel("RCLR Abundance", fontsize=14)
        ax_box.set_xlabel("")
        ax_box.set_xticklabels(tick_labels, fontsize=14)
        ax_box.tick_params(axis='y', labelsize=10)

        # --- Right: Correlation with severity ---
        # pearsonr needs at least two observations
        if len(df_corr) >= 2:
            r, pval = pearsonr(df_corr['o_scorad'], df_corr[taxon])

            # Salmon when p <= 0.05, grey otherwise; a NaN p-value matches
            # neither test and falls back to seaborn's default colour
            if pval <= 0.05:
                dot_color = 'salmon'
            elif pval > 0.05:
                dot_color = 'grey'
            else:
                dot_color = None

            scatter_kws = {'alpha': 0.5, 's': 20}
            if dot_color:
                scatter_kws['color'] = dot_color

            sns.regplot(
                data=df_corr,
                x='o_scorad',
                y=taxon,
                scatter_kws=scatter_kws,
                line_kws={'color': 'black'},
                ax=ax_corr
            )

            # Add correlation label
            r_label = f"Pearson r = {r:.2f}\np = {pval:.1e}"
            ax_corr.text(0.05, 0.90, r_label, transform=ax_corr.transAxes,
                        fontsize=12, va='top', ha='left',
                        bbox=dict(facecolor='white', alpha=0.7, edgecolor='none'))

        ax_corr.set_xlabel("SCORAD", fontsize=14)
        ax_corr.tick_params(axis='x', labelsize=10)
        # The y-axis is shared within a row, so this fixes the range of both panels
        ax_corr.set_ylim(-3, 7)
        ax_corr.set_ylabel("")

    fig.suptitle(title_name, fontsize=18, y=0.97)

    # Leave room for the suptitle, then open up space between rows for the
    # per-row titles
    fig.tight_layout(rect=[0, 0, 1, 0.96])
    fig.subplots_adjust(hspace=1)

    # --- Centered taxon name above each row ---
    # The hand-tuned positions are kept for the five-row layout they were made
    # for; any other row count gets its positions from the final axes boxes.
    legacy_title_y = [0.91, 0.73, 0.55, 0.36, 0.175]
    if n_taxa == len(legacy_title_y):
        title_y_positions = legacy_title_y
    else:
        pad = 0.15 / fig.get_figheight()  # 0.15 inch, as a fraction of figure height
        title_y_positions = [axes[row, 0].get_position().y1 + pad for row in range(n_taxa)]

    for y_pos, row_title in zip(title_y_positions, row_titles):
        fig.text(0.5, y_pos, row_title, ha='center', fontsize=16)

    sns.despine(fig=fig)

    return fig


In [114]:
# ASV-level columns to plot, one figure row each (five entries)
taxa_list = ['g__Streptococcus_ASV-1', 'g__Streptococcus_ASV-2', 'g__Staphylococcus_ASV-1', 'g__Micrococcus_ASV-1', 'g__Veillonella_A_ASV-1']

# taxa_list = ['g__Streptococcus_ASV-1', 'g__Streptococcus_ASV-2', 'g__Staphylococcus_ASV-1']

# Boxplot + SCORAD correlation per taxon; the default group order is used
fig = plot_combined_box_and_severity(
    data=skin_samples,
    taxa_list=taxa_list,
    group_palette=group_palette,
    strip_palette=strip_palette,
    title_name='\nDifferential Skin Taxa in Cape Town Children')
# Write the figure to disk as a PNG at 1000 dpi
fig.savefig('../Plots/Analysis_figures/Severity_Correlations/combined_box_scatter_Cape Town.png', dpi=1000)


  grouped_vals = vals.groupby(grouper)
  sns.stripplot(
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  data_subset = grouped_data.get_group(pd_key)
  grouped_vals = vals.groupby(grouper)
  sns.stripplot(
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  data_subset = grouped_data.get_group(pd_key)
  grouped_vals = vals.groupby(grouper)
  sns.stripplot(
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  data_subset = grouped_data.get_group(pd_key)
  grouped_vals = vals.groupby(grouper)
  sns.stripplot(
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  data_subset = grouped_data.get_group(pd_key)
  grouped_vals = vals.groupby(grouper)
  sns.stripplot(
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na',