# Differential Taxa and Correlation with Severity in Cape Town and Umtata Nares Samples

In [97]:
# Import Python packages
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import biom
from biom import load_table
import numpy as np
from scipy.stats import spearmanr
from scipy.stats import mannwhitneyu
from statsmodels.stats.multitest import multipletests
import itertools
import matplotlib.patches



In [98]:
# Read the tab-separated sample metadata table
metadata_path = '../Data/Metadata/updated_clean_ant_skin_metadata.tab'
metadata = pd.read_csv(metadata_path, sep='\t')

# Strip underscores from the sample IDs, then use them as the row index
metadata = metadata.assign(
    **{'#sample-id': metadata['#sample-id'].str.replace('_', '')}
).set_index('#sample-id')

# Derive a short 'group' label from the verbose case_type value;
# a case_type that is not listed below maps to NaN.
metadata = metadata.assign(
    group=metadata['case_type'].map({
        'case-lesional skin': 'skin-ADL',
        'case-nonlesional skin': 'skin-ADNL',
        'control-nonlesional skin': 'skin-H',
        'case-anterior nares': 'nares-AD',
        'control-anterior nares': 'nares-H',
    })
)


In [99]:
def rclr_transform(df, pseudocount=1e-6):
    """
    Applies Robust Centered Log-Ratio (RCLR) transformation to a DataFrame.

    Zeros are treated as missing: they are excluded from each row's log-mean
    (i.e. the log of the row's geometric mean over non-zero entries) and come
    back as NaN in the result. Centering is done per row; the callers in this
    file pass tables with samples as rows.

    Parameters:
        df (pd.DataFrame): Abundance table, dense or sparse-backed.
        pseudocount (float): Constant added to the non-zero entries before
            taking the log. Zero entries are masked first, so they never
            receive the pseudocount.

    Returns:
        pd.DataFrame: Row-centered log values with NaN where the input was 0.
    """
    # Densify only when the frame is sparse-backed: the `.sparse` accessor
    # raises AttributeError on an ordinary dense frame.
    if hasattr(df, 'sparse'):
        df = df.sparse.to_dense()

    # Mask zeros so they are ignored by both the log and the row mean
    observed = df.replace(0, np.nan)

    # Log of the observed (non-zero) entries
    log_df = np.log(observed + pseudocount)

    # Center each row on the mean of its own observed log values
    row_log_means = log_df.mean(axis=1, skipna=True)
    return log_df.sub(row_log_means, axis=0)

In [100]:
def get_rclr_transformed_by_region(biom_path, metadata, regions, nares_groups=None):
    """
    Reads a BIOM table, subsets nares samples by region, applies rclr transformation,
    and appends group and SCORAD metadata to each resulting DataFrame.

    Parameters:
        biom_path (str): Path to the BIOM table.
        metadata (pd.DataFrame): Metadata indexed by sample ID with 'group',
            'area' and 'o_scorad' columns.
        regions (list): List of region names (e.g., ['Cape Town', 'Umtata']).
            A sample belongs to a region when its 'area' value starts with
            the region name.
        nares_groups (list, optional): List of nares sample group labels to
            include (e.g., ['nares-AD', 'nares-H']). None keeps every group
            whose label starts with 'nares'.

    Returns:
        dict: Dictionary of rclr-transformed DataFrames (samples as rows) with
            'group' and 'o_scorad' columns appended, keyed by region.
    """
    # The table does not depend on the region, so read and format it once
    biom_tbl = load_table(biom_path)
    counts = pd.DataFrame(biom_tbl.to_dataframe().T)
    counts.index = counts.index.str.replace('15564.', '', regex=False)

    # na=False keeps rows with a missing group/area out of the selection
    # instead of leaking NaN into the boolean mask
    is_nares = metadata['group'].str.startswith('nares', na=False)

    rclr_dfs = {}
    for location in regions:
        # Nares samples collected in this region
        in_region = metadata['area'].str.startswith(location, na=False)
        nares_samples = metadata[is_nares & in_region].index

        # Keep only the samples that are present in the table
        region_counts = counts.loc[counts.index.intersection(nares_samples)]

        # Apply rclr transformation
        rclr_df = rclr_transform(region_counts)

        # Append group information
        rclr_df['group'] = metadata.loc[rclr_df.index, 'group']

        # Optional: filter for specific nares groups. Copy so that the column
        # assignment below writes to an independent frame, not a slice.
        if nares_groups is not None:
            rclr_df = rclr_df[rclr_df['group'].isin(nares_groups)].copy()

        # Append o_scorad information
        rclr_df['o_scorad'] = metadata.loc[rclr_df.index, 'o_scorad']

        rclr_dfs[location] = rclr_df

    return rclr_dfs


In [101]:
def log_transform(df, pseudocount=1e-6):
    """
    Log-transforms the per-row relative abundances of a table.

    Each row is divided by its own total, the pseudocount is added to every
    resulting proportion, and the natural log is taken.

    Parameters:
        df (pd.DataFrame): Abundance table, dense or sparse-backed.
        pseudocount (float): Constant added to each proportion before the log.

    Returns:
        pd.DataFrame: log(relative abundance + pseudocount), same shape as df.
    """
    # Densify first when the frame exposes the sparse accessor
    if hasattr(df, 'sparse'):
        df = df.sparse.to_dense()

    # Row-wise proportions
    row_totals = df.sum(axis=1)
    proportions = df.div(row_totals, axis=0)

    return np.log(proportions + pseudocount)


In [102]:
def get_log_transformed_by_region(biom_path, metadata, regions, nares_groups=None):
    """
    Reads a BIOM table, subsets nares samples by region, applies log transformation
    to relative abundance data, and appends group and SCORAD metadata.

    Parameters:
        biom_path (str): Path to the BIOM table.
        metadata (pd.DataFrame): Metadata indexed by sample ID with 'group',
            'area' and 'o_scorad' columns.
        regions (list): List of region names (e.g., ['Cape Town', 'Umtata']).
            A sample belongs to a region when its 'area' value starts with
            the region name.
        nares_groups (list, optional): List of nares sample group labels to
            include. None keeps every group whose label starts with 'nares'.

    Returns:
        dict: Dictionary of log-transformed DataFrames (samples as rows) with
            'group' and 'o_scorad' columns appended, keyed by region.
    """
    # The table does not depend on the region, so read and format it once
    biom_tbl = load_table(biom_path)
    counts = pd.DataFrame(biom_tbl.to_dataframe().T)
    counts.index = counts.index.str.replace('15564.', '', regex=False)

    # na=False keeps rows with a missing group/area out of the selection
    # instead of leaking NaN into the boolean mask
    is_nares = metadata['group'].str.startswith('nares', na=False)

    log_dfs = {}
    for location in regions:
        # Nares samples collected in this region
        in_region = metadata['area'].str.startswith(location, na=False)
        nares_samples = metadata[is_nares & in_region].index

        # Keep only the samples that are present in the table
        region_counts = counts.loc[counts.index.intersection(nares_samples)]

        # Apply log transformation
        log_df = log_transform(region_counts)

        # Append group information
        log_df['group'] = metadata.loc[log_df.index, 'group']

        # Optional: filter for specific nares groups. Copy so that the column
        # assignment below writes to an independent frame, not a slice.
        if nares_groups is not None:
            log_df = log_df[log_df['group'].isin(nares_groups)].copy()

        # Append SCORAD information
        log_df['o_scorad'] = metadata.loc[log_df.index, 'o_scorad']

        log_dfs[location] = log_df

    return log_dfs


In [103]:
def pairwise_mannwhitney_bh(df, feature, group_col='group'):
    """
    Runs two-sided Mann-Whitney U tests for every pair of groups and applies
    Benjamini-Hochberg correction across the pairs that were tested.

    Parameters:
        df (pd.DataFrame): Table holding the group labels and the feature.
        feature (str): Column with the values to compare.
        group_col (str): Column with the group labels.

    Returns:
        pd.DataFrame: One row per tested pair with columns 'Comparison',
            'Pair', 'Raw p-value' and 'BH-corrected p-value'. Empty (columns
            only) when no pair had data in both groups.
    """
    tested = []
    for first, second in itertools.combinations(df[group_col].unique(), 2):
        values_first = df.loc[df[group_col] == first, feature].dropna()
        values_second = df.loc[df[group_col] == second, feature].dropna()

        # A pair is only testable when both groups have observations
        if values_first.empty or values_second.empty:
            continue

        _, p_value = mannwhitneyu(values_first, values_second, alternative='two-sided')
        tested.append((first, second, p_value))

    if not tested:
        return pd.DataFrame(columns=['Comparison', 'Pair', 'Raw p-value', 'BH-corrected p-value'])

    raw_pvals = [p_value for _, _, p_value in tested]
    corrected = multipletests(raw_pvals, method='fdr_bh')[1]

    return pd.DataFrame({
        'Comparison': [f"{first} vs {second}" for first, second, _ in tested],
        'Pair': [(first, second) for first, second, _ in tested],
        'Raw p-value': raw_pvals,
        'BH-corrected p-value': corrected
    })


In [104]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from scipy.stats import pearsonr, mannwhitneyu

def plot_combined_box_and_severity(region, rclr_df, log_df, taxa_list, group_palette,
                                   strip_palette, title_name,
                                   order=('nares-H', 'nares-AD'),
                                   min_abundance_rank=25):
    """
    Draws one row per taxon: a box/strip plot of RCLR abundance by group
    (left) and a ranked abundance-vs-SCORAD scatter with fit line (right).

    Parameters:
        region (str): Region name; only used to decide the highlight color
            (the 'g__Streptococcus_ASV-1' box is drawn red for 'Umtata').
        rclr_df (pd.DataFrame): RCLR table with a 'group' column and one
            column per taxon.
        log_df (pd.DataFrame): Log relative-abundance table with 'group' and
            'o_scorad' columns and one column per taxon.
        taxa_list (list): Taxon column names to plot; must not be empty.
        group_palette (dict): Accepted for interface compatibility; not used
            by this function.
        strip_palette (dict): Accepted for interface compatibility; not used
            by this function.
        title_name (str): Figure title.
        order (sequence): The two group labels to compare, in plotting order.
            The x tick labels are fixed to 'H' and 'AD'.
        min_abundance_rank (float): Samples whose abundance rank (computed
            over all samples with numeric SCORAD and abundance) is below this
            value are dropped from the correlation panel.

    Returns:
        matplotlib.figure.Figure: The assembled figure.

    Raises:
        ValueError: If taxa_list is empty.
    """
    order = list(order)
    n_taxa = len(taxa_list)
    if n_taxa == 0:
        raise ValueError("taxa_list must contain at least one taxon")

    # squeeze=False always yields a 2-D axes array, so a single taxon is
    # indexed exactly like several.
    fig, axes = plt.subplots(
        nrows=n_taxa,
        ncols=2,
        figsize=(6, 3 * n_taxa),
        sharey=False,
        gridspec_kw={'width_ratios': [2, 1.5]},
        squeeze=False
    )

    # Figure-relative heights of the per-row titles; only the first five rows
    # get a title.
    title_y_positions = [0.94, 0.75, 0.56, 0.37, 0.18]

    for i, taxon in enumerate(taxa_list):
        ax_box, ax_corr = axes[i, 0], axes[i, 1]
        taxon_label = taxon.replace('g__', '').replace('_ASV-', ' ASV-')

        df_box = rclr_df.loc[rclr_df['group'].isin(order), ['group', taxon]].copy()
        df_box['group'] = pd.Categorical(df_box['group'], categories=order, ordered=True)
        df_box = df_box.dropna(subset=[taxon])

        # Every box in a row shares one color: red for the highlighted
        # taxon/region combination, gray otherwise.
        if taxon == 'g__Streptococcus_ASV-1' and region == 'Umtata':
            box_color = '#ff5050'
        else:
            box_color = '#c0c0c0'  # gray

        # Mann-Whitney U test for the significance annotation; skipped when a
        # group has no observations (the test raises on empty input).
        group1 = df_box.loc[df_box['group'] == order[0], taxon]
        group2 = df_box.loc[df_box['group'] == order[1], taxon]
        if group1.empty or group2.empty:
            pval = None
        else:
            _, pval = mannwhitneyu(group1, group2, alternative='two-sided')

        df_corr = log_df.loc[log_df['group'].isin(order), ['o_scorad', taxon]].copy()
        df_corr['o_scorad'] = pd.to_numeric(df_corr['o_scorad'], errors='coerce')
        df_corr[taxon] = pd.to_numeric(df_corr[taxon], errors='coerce')
        df_corr = df_corr.dropna()
        # Ranks are assigned before the low-rank samples are removed.
        df_corr['rank_scorad'] = df_corr['o_scorad'].rank()
        df_corr['rank_abundance'] = df_corr[taxon].rank()
        df_corr = df_corr[df_corr['rank_abundance'] >= min_abundance_rank]

        # Plot box and dots
        if not df_box.empty:
            sns.boxplot(data=df_box, x='group', y=taxon, order=order, color=box_color,
                        ax=ax_box, width=0.5, fliersize=0)
            sns.stripplot(data=df_box, x='group', y=taxon, order=order, color='#3b3b3b',
                          ax=ax_box, jitter=True, size=4, alpha=0.7)

        ax_box.set_ylabel("RCLR Rel. Abundance", fontsize=13)
        ax_box.set_xlabel("")
        ax_box.set_xticklabels(['H', 'AD'], fontsize=13)
        ax_box.tick_params(axis='y', labelsize=10)
        ax_box.set_ylim(-2, 8)

        # Add significance annotation (bracket and p-value shown for p < 0.1,
        # stars only below 0.05)
        if pval is not None and pval < 0.1:
            stars = '***' if pval < 0.001 else '**' if pval < 0.01 else '*' if pval < 0.05 else ''
            y = df_box[taxon].max() + 0.6
            ax_box.plot([0, 0, 1, 1], [y, y + 0.05, y + 0.05, y], lw=1, color='black')
            ax_box.text(0.5, y + 0.1, f"{pval:.1e} {stars}", ha='center', va='bottom', fontsize=10)

        # Correlation plot; pearsonr needs at least two points
        if len(df_corr) >= 2:
            # NOTE(review): this is Pearson's r on ranks computed before the
            # rank filter, but the annotation says "Spearman r". After the
            # filter it is not identical to scipy.stats.spearmanr on the
            # retained samples - confirm the label is intended.
            r, corr_pval = pearsonr(df_corr['rank_scorad'], df_corr['rank_abundance'])
            dot_color = 'grey' if corr_pval > 0.05 else '#ff5050'

            sns.regplot(
                data=df_corr,
                x='rank_scorad',
                y='rank_abundance',
                scatter_kws={'alpha': 0.5, 's': 20, 'color': dot_color},
                line_kws={'color': 'black'},
                ax=ax_corr
            )

            r_label = f"Spearman r = {r:.2f}\n*p* = {corr_pval:.1e}"
            ax_corr.text(0.05, 0.90, r_label, transform=ax_corr.transAxes,
                         fontsize=12, va='top', ha='left',
                         bbox=dict(facecolor='white', alpha=0.7, edgecolor='none'))

        ax_corr.set_xlabel("SCORAD (ranked)", fontsize=13)
        ax_corr.set_ylabel("Rank Rel. Abundance", fontsize=13)
        ax_corr.tick_params(axis='x', labelsize=10)
        ax_corr.tick_params(axis='y', labelsize=10)

        if i < len(title_y_positions):
            fig.text(0.5, title_y_positions[i], taxon_label, ha='center', fontsize=16)

    fig.suptitle(title_name, fontsize=18, y=0.98)
    # Act on this figure explicitly rather than on pyplot's current figure
    fig.tight_layout(rect=[0, 0, 1, 0.96])
    fig.subplots_adjust(hspace=0.5, wspace=0.5)
    sns.despine(fig=fig)

    return fig


In [105]:
# Input table and the cohorts to analyse
biom_path = '../Data/Tables/Absolute_Abundance_Tables/209766_filtered_by_prevalence_1pct_rare_Genus-ASV-non-collapse.biom'
regions = ['Cape Town', 'Umtata']
nares_groups = ['nares-H', 'nares-AD']

# One RCLR-transformed table per region, restricted to the chosen nares groups
rclr_dfs = get_rclr_transformed_by_region(
    biom_path, metadata, regions, nares_groups=nares_groups
)

In [106]:
# Custom colors per nares group.
# group_palette: baby blue for healthy, red for AD.
group_palette = {'nares-H': '#ADD8E6', 'nares-AD': '#E31A1C'}

# strip_palette: darker shades of the same two colors.
strip_palette = {'nares-H': '#6CA6CD', 'nares-AD': '#A50000'}

In [107]:
# Log relative-abundance tables for the severity correlation panels.
# Reuse the same `regions` and `nares_groups` as the RCLR tables: the plots
# pair rclr_dfs[region] with log_dfs[region], so both dictionaries must cover
# the same regions and groups.
log_dfs = get_log_transformed_by_region(
    biom_path=biom_path,
    metadata=metadata,
    regions=regions,
    nares_groups=nares_groups
)


In [108]:
# Supplementary Fig. 4A: differential taxa and severity correlation, Cape Town
fig = plot_combined_box_and_severity(
    title_name='Differential Taxa in Cape Town Children',
    region='Cape Town',
    rclr_df=rclr_dfs['Cape Town'],
    log_df=log_dfs['Cape Town'],
    group_palette=group_palette,
    strip_palette=strip_palette,
    taxa_list=[
        'g__Streptococcus_ASV-1', 'g__Streptococcus_ASV-2',
        'g__Staphylococcus_ASV-1', 'g__Micrococcus_ASV-1',
        'g__Veillonella_A_ASV-1',
    ],
)

# Write the panel to disk at high resolution
fig.savefig('../FIgures/Supplementary/Suppl_Fig_4A.png', dpi=1000)


  grouped_vals = vals.groupby(grouper)
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  data_subset = grouped_data.get_group(pd_key)
  grouped_vals = vals.groupby(grouper)
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  data_subset = grouped_data.get_group(pd_key)
  grouped_vals = vals.groupby(grouper)
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  data_subset = grouped_data.get_group(pd_key)
  grouped_vals = vals.groupby(grouper)
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  data_subset = grouped_data.get_group(pd_key)
  grouped_vals = vals.groupby(grouper)
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  data_subset = grouped_data.get_group(pd_key)


In [109]:
# Supplementary Fig. 4B: differential taxa and severity correlation, Umtata
fig = plot_combined_box_and_severity(
    title_name='Differential Taxa in Umtata Children',
    region='Umtata',
    rclr_df=rclr_dfs['Umtata'],
    log_df=log_dfs['Umtata'],
    group_palette=group_palette,
    strip_palette=strip_palette,
    taxa_list=[
        'g__Streptococcus_ASV-1', 'g__Streptococcus_ASV-2',
        'g__Staphylococcus_ASV-1', 'g__Micrococcus_ASV-1',
        'g__Veillonella_A_ASV-1',
    ],
)

# Write the panel to disk at high resolution
fig.savefig('../FIgures/Supplementary/Suppl_Fig_4B.png', dpi=1000)


  grouped_vals = vals.groupby(grouper)
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  data_subset = grouped_data.get_group(pd_key)
  grouped_vals = vals.groupby(grouper)
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  data_subset = grouped_data.get_group(pd_key)
  grouped_vals = vals.groupby(grouper)
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  data_subset = grouped_data.get_group(pd_key)
  grouped_vals = vals.groupby(grouper)
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  data_subset = grouped_data.get_group(pd_key)
  grouped_vals = vals.groupby(grouper)
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  data_subset = grouped_data.get_group(pd_key)
