# Differential Taxa and Correlation with Severity in Cape Town and Umtata Skin Samples

In [1]:
# Import Python packages
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import biom
from biom import load_table
import numpy as np
from scipy.stats import spearmanr
from scipy.stats import mannwhitneyu
from statsmodels.stats.multitest import multipletests
import itertools
import matplotlib.patches
from scipy.stats import pearsonr

In [2]:
# Rarefaction depth. The value is interpolated into the feature-table
# filename (the `rare-{depth}` part of `biom_path`) later in this notebook.
depth = 350

In [3]:
# Read the sample metadata, strip underscores from the sample IDs, index the
# table by sample ID and derive a short `group` label from `case_type`.
# `case_type` values without an entry in the mapping end up as NaN.
metadata_path = '../Metadata/16S_AD_South-Africa_metadata_subset.tsv'
metadata = (
    pd.read_csv(metadata_path, sep='\t')
    .assign(**{'#sample-id': lambda frame: frame['#sample-id'].str.replace('_', '')})
    .set_index('#sample-id')
    .assign(group=lambda frame: frame['case_type'].map({
        'case-lesional_skin': 'skin-ADL',
        'case-nonlesional_skin': 'skin-ADNL',
        'control-nonlesional_skin': 'skin-H',
        'case-anterior_nares': 'nares-AD',
        'control-anterior_nares': 'nares-H',
    }))
)

# Display the table as the cell output
metadata

Unnamed: 0_level_0,PlateNumber,PlateLocation,i5,i5Sequence,i7,i7Sequence,identifier,Sequence,Plate ID,Well location,...,specimen,age_months,sex,enrolment_date,enrolment_season,hiv_exposure,hiv_status,household_size,o_scorad,group
#sample-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ca009STL,1,A1,SA501,ATCGTACG,SA701,CGAGAGTT,SA701SA501,CGAGAGTT-ATCGTACG,1.010000e+21,A1,...,skin,24.0,male,4/16/2015,Autumn,Unexposed,negative,4.0,40,skin-ADL
900221,1,B1,SA502,ACTATCTG,SA701,CGAGAGTT,SA701SA502,CGAGAGTT-ACTATCTG,1.010000e+21,B1,...,skin,9.0,female,8/11/2015,Winter,Unexposed,negative,7.0,34,skin-ADL
Ca010EBL,1,C1,SA503,TAGCGAGT,SA701,CGAGAGTT,SA701SA503,CGAGAGTT-TAGCGAGT,1.010000e+21,C1,...,skin,24.0,female,11/20/2014,Spring,Unexposed,negative,7.0,21,skin-ADL
900460,1,D1,SA504,CTGCGTGT,SA701,CGAGAGTT,SA701SA504,CGAGAGTT-CTGCGTGT,1.010000e+21,D1,...,skin,18.0,female,9/23/2015,Spring,Unexposed,,4.0,40,skin-ADL
900051,1,E1,SA505,TCATCGAG,SA701,CGAGAGTT,SA701SA505,CGAGAGTT-TCATCGAG,1.010000e+21,E1,...,skin,31.0,male,4/21/2015,Autumn,Unexposed,negative,7.0,41,skin-ADL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
900401,5,C12,SB503,AGAGTCAC,SB712,CGTAGCGA,SB712SB503,CGTAGCGA-AGAGTCAC,1.010000e+21,C12,...,skin,21.0,female,9/17/2015,Spring,Exposed,negative,12.0,38,skin-ADNL
900402,6,B4,SA502,ACTATCTG,SB704,TCTCTATG,SB704SA502,TCTCTATG-ACTATCTG,1.010000e+21,B4,...,nasal,21.0,,,,,,,,nares-AD
Ca006ONL,6,F1,SA506,CGTGAGTG,SB701,CTCGACTT,SB701SA506,CTCGACTT-CGTGAGTG,1.010000e+21,F1,...,skin,35.0,female,3/25/2015,Autumn,Unexposed,negative,3.0,34,skin-ADL
Ca006ONNL,6,F2,SA506,CGTGAGTG,SB702,CGAAGTAT,SB702SA506,CGAAGTAT-CGTGAGTG,1.010000e+21,F2,...,skin,35.0,female,3/25/2015,Autumn,Unexposed,negative,3.0,34,skin-ADNL


In [4]:
def rclr_transform(df, pseudocount=1e-5):
    """
    Applies Robust Centered Log-Ratio (RCLR) transformation to a DataFrame.

    Every row is centred on its own mean log value. Zeros are masked as NaN
    first, so they are ignored in that mean and remain NaN in the result.

    Parameters:
        df (pd.DataFrame): Table of abundances with one row per sample.
            Both pandas-sparse and ordinary dense frames are accepted.
        pseudocount (float): Value added to the non-zero entries before the
            logarithm is taken.

    Returns:
        pd.DataFrame: Dense frame with the same index and columns as `df`.
    """
    # Densify only when the frame is actually sparse: the `.sparse` accessor
    # is unavailable on dense frames, so the check keeps dense input working.
    if hasattr(df, 'sparse'):
        df = df.sparse.to_dense()

    # Mask zeros so they contribute neither to the log nor to the row mean
    masked = df.replace(0, np.nan)

    # The pseudocount is applied to the remaining (non-zero) entries only
    logged = np.log(masked + pseudocount)

    # Centre each row (sample) on the mean of its own non-missing log values
    row_means = logged.mean(axis=1, skipna=True)
    return logged.sub(row_means, axis=0)

In [5]:
def get_rclr_transformed_by_region(biom_path, metadata, regions, skin_groups=None):
    """
    Reads a BIOM table, subsets skin samples by region, applies the rclr
    transformation, and appends group and oSCORAD metadata to each result.

    Parameters:
        biom_path (str): Path to the BIOM table.
        metadata (pd.DataFrame): Metadata indexed by sample ID with 'group',
            'area' and 'o_scorad' columns.
        regions (list): List of region names (e.g., ['Cape Town', 'Umtata']).
            A sample belongs to a region when its 'area' starts with the name.
        skin_groups (list, optional): List of skin sample group labels to include
            (e.g., ['skin-ADL', 'skin-ADNL', 'skin-H']). None keeps every skin group.

    Returns:
        dict: Dictionary of rclr-transformed DataFrames (samples as rows, with
        extra 'group' and 'o_scorad' columns), keyed by region.
    """
    # The table is the same for every region, so read and format it once
    # instead of re-parsing the file on each loop iteration.
    biom_tbl = load_table(biom_path)
    counts = biom_tbl.to_dataframe().T
    # Drop the '15564.' prefix from the table's sample IDs before matching
    # them against the metadata index.
    counts.index = counts.index.str.replace('15564.', '', regex=False)

    # Missing group/area values are explicitly treated as "no match"
    is_skin = metadata['group'].str.startswith('skin', na=False)

    rclr_dfs = {}
    for location in regions:
        # Skin samples whose area starts with this region name
        in_region = metadata['area'].str.startswith(location, na=False)
        skin_samples = metadata.index[is_skin & in_region]

        # Keep only samples present in both the table and the metadata subset
        region_counts = counts.loc[counts.index.intersection(skin_samples)]

        # Apply rclr transformation
        rclr_df = rclr_transform(region_counts)

        # Append group information
        rclr_df['group'] = metadata.loc[rclr_df.index, 'group']

        # Optional: filter for specific skin groups. Copy so the column
        # assignment below writes to an independent frame, not a slice.
        if skin_groups is not None:
            rclr_df = rclr_df[rclr_df['group'].isin(skin_groups)].copy()

        # Append o_scorad information
        rclr_df['o_scorad'] = metadata.loc[rclr_df.index, 'o_scorad']

        rclr_dfs[location] = rclr_df

    return rclr_dfs


In [6]:
def log_transform(df, pseudocount=1e-6):
    """
    Log-transform the relative abundances of a table.

    Each row is divided by its own total, the pseudocount is added to every
    resulting proportion, and the natural logarithm is taken.
    """
    # Sparse frames are densified first; dense frames are used as they are
    if hasattr(df, 'sparse'):
        dense = df.sparse.to_dense()
    else:
        dense = df

    # Row-wise proportions
    row_totals = dense.sum(axis=1)
    proportions = dense.div(row_totals, axis=0)

    # Natural log of the shifted proportions
    return np.log(proportions + pseudocount)


In [7]:
def get_log_transformed_by_region(biom_path, metadata, regions, skin_groups=None):
    """
    Reads a BIOM table, subsets skin samples by region, applies log transformation
    to relative abundance data, and appends group and SCORAD metadata.

    Parameters:
        biom_path (str): Path to the BIOM table.
        metadata (pd.DataFrame): Metadata indexed by sample ID with 'group',
            'area' and 'o_scorad' columns.
        regions (list): List of region names (e.g., ['Cape Town', 'Umtata']).
            A sample belongs to a region when its 'area' starts with the name.
        skin_groups (list, optional): List of skin sample group labels to include.
            None keeps every skin group.

    Returns:
        dict: Dictionary of log-transformed DataFrames (samples as rows, with
        extra 'group' and 'o_scorad' columns), keyed by region.
    """
    # The table is the same for every region, so read and format it once
    # instead of re-parsing the file on each loop iteration.
    biom_tbl = load_table(biom_path)
    counts = biom_tbl.to_dataframe().T
    # Drop the '15564.' prefix from the table's sample IDs before matching
    # them against the metadata index.
    counts.index = counts.index.str.replace('15564.', '', regex=False)

    # Missing group/area values are explicitly treated as "no match"
    is_skin = metadata['group'].str.startswith('skin', na=False)

    log_dfs = {}
    for location in regions:
        # Skin samples whose area starts with this region name
        in_region = metadata['area'].str.startswith(location, na=False)
        skin_samples = metadata.index[is_skin & in_region]

        # Keep only samples present in both the table and the metadata subset
        region_counts = counts.loc[counts.index.intersection(skin_samples)]

        # Apply log transformation
        log_df = log_transform(region_counts)

        # Append group information
        log_df['group'] = metadata.loc[log_df.index, 'group']

        # Optional: filter for specific skin groups. Copy so the column
        # assignment below writes to an independent frame, not a slice.
        if skin_groups is not None:
            log_df = log_df[log_df['group'].isin(skin_groups)].copy()

        # Append SCORAD information
        log_df['o_scorad'] = metadata.loc[log_df.index, 'o_scorad']

        log_dfs[location] = log_df

    return log_dfs


In [8]:
def pairwise_mannwhitney_bh(df, feature, group_col='group'):
    """
    Run two-sided Mann-Whitney U tests on `feature` for every pair of groups
    in `group_col` and adjust the p-values with Benjamini-Hochberg.

    Missing feature values are dropped per group; a pair is skipped when
    either side has no values left.

    Returns:
        pd.DataFrame: One row per tested pair with the columns 'Comparison',
        'Pair', 'Raw p-value' and 'BH-corrected p-value'. The frame is empty
        (columns only) when no pair could be tested.
    """
    result_columns = ['Comparison', 'Pair', 'Raw p-value', 'BH-corrected p-value']

    tested = []
    for first, second in itertools.combinations(df[group_col].unique(), 2):
        values_first = df[df[group_col] == first][feature].dropna()
        values_second = df[df[group_col] == second][feature].dropna()

        # Nothing to compare when one side is empty
        if values_first.empty or values_second.empty:
            continue

        _statistic, p_value = mannwhitneyu(values_first, values_second, alternative='two-sided')
        tested.append((f"{first} vs {second}", (first, second), p_value))

    if not tested:
        return pd.DataFrame(columns=result_columns)

    comparison_labels = [entry[0] for entry in tested]
    group_pairs = [entry[1] for entry in tested]
    p_raw = [entry[2] for entry in tested]

    # Second element of the multipletests result holds the adjusted p-values
    p_adjusted = multipletests(p_raw, method='fdr_bh')[1]

    return pd.DataFrame({
        'Comparison': comparison_labels,
        'Pair': group_pairs,
        'Raw p-value': p_raw,
        'BH-corrected p-value': p_adjusted,
    })


In [9]:
def _significance_stars(p_value):
    """Return '***', '**', '*' or '' for p < 0.001, p < 0.01, p < 0.05 or otherwise."""
    if p_value < 0.001:
        return '***'
    if p_value < 0.01:
        return '**'
    if p_value < 0.05:
        return '*'
    return ''


def plot_combined_box_and_severity(region, rclr_df, log_df, taxa_list, group_palette, strip_palette, title_name, order=None, min_abundance_rank=25):
    """
    Draw one figure row per taxon: a box/strip plot of the RCLR values per
    group (left) and a rank-rank scatter of oSCORAD against log abundance for
    the AD samples (right).

    Parameters:
        region (str): Region name. Only used for colouring: a significant
            'Micrococcus ASV-1' box plot is drawn in blue for 'Umtata';
            every other significant box plot is drawn in red.
        rclr_df (pd.DataFrame): RCLR table with a 'group' column and one
            column per taxon (box plots and pairwise tests).
        log_df (pd.DataFrame): Log-abundance table with 'group' and 'o_scorad'
            columns and one column per taxon (correlation panels).
        taxa_list (list): Taxon column names, one figure row each.
        group_palette (dict): Accepted for interface compatibility; the
            current implementation does not use it.
        strip_palette (dict): Group -> colour mapping for the strip-plot points.
        title_name (str): Figure title.
        order (list, optional): Groups shown on the x axis, left to right.
            Defaults to ['skin-H', 'skin-ADNL', 'skin-ADL'].
        min_abundance_rank (float): Samples whose abundance rank is below this
            value are left out of the correlation panel.

    Returns:
        matplotlib.figure.Figure: The assembled figure.

    Raises:
        ValueError: If `taxa_list` is empty.
    """
    order = ['skin-H', 'skin-ADNL', 'skin-ADL'] if order is None else list(order)

    n_taxa = len(taxa_list)
    if n_taxa == 0:
        raise ValueError("taxa_list must contain at least one taxon")

    # squeeze=False keeps `axes` two-dimensional, so a single taxon is
    # indexed exactly like several taxa.
    fig, axes = plt.subplots(
        nrows=n_taxa,
        ncols=2,
        figsize=(6, 3 * n_taxa),
        sharey=False,
        squeeze=False,
        gridspec_kw={'width_ratios': [2, 1.5]}
    )

    # Figure-level y coordinates of the per-row taxon titles. Only five
    # positions are defined; rows beyond the fifth get no title.
    title_y_positions = [0.94, 0.75, 0.56, 0.37, 0.18]

    # Restrict to the plotted groups once. The pairwise tests use the same
    # subset, so every tested pair can be placed on the x axis.
    rclr_in_order = rclr_df[rclr_df['group'].isin(order)]
    severity_rows = log_df[log_df['group'].isin(['skin-ADNL', 'skin-ADL'])]

    for i, taxon in enumerate(taxa_list):
        ax_box, ax_corr = axes[i]
        display_name = taxon.replace('g__', '').replace('_ASV-', ' ASV-')

        # Boxplot data (RCLR)
        df_box = rclr_in_order[['group', taxon]].copy()
        df_box['group'] = pd.Categorical(df_box['group'], categories=order, ordered=True)
        df_box = df_box.dropna(subset=[taxon])

        # Correlation data (Rank SCORAD vs. Rank Abundance)
        df_corr = severity_rows[['o_scorad', taxon]].copy()
        df_corr['o_scorad'] = pd.to_numeric(df_corr['o_scorad'], errors='coerce')
        df_corr[taxon] = pd.to_numeric(df_corr[taxon], errors='coerce')
        df_corr = df_corr.dropna()
        # Ranks are computed over all remaining AD samples BEFORE the
        # low-rank samples are removed, so the kept ranks are not re-numbered.
        df_corr['rank_scorad'] = df_corr['o_scorad'].rank()
        df_corr['rank_abundance'] = df_corr[taxon].rank()
        # NOTE(review): presumably this removes the lowest-abundance samples
        # from the correlation - confirm the rationale for the threshold.
        df_corr = df_corr[df_corr['rank_abundance'] >= min_abundance_rank]

        # Pairwise group tests, reused for both the colour rule and the brackets
        stats = pairwise_mannwhitney_bh(rclr_in_order, taxon)
        significant = (stats['BH-corrected p-value'] < 0.05).any()

        # Colour rule: grey when no comparison is significant, otherwise red,
        # except the blue highlight for Micrococcus ASV-1 in Umtata.
        if not significant:
            box_color = 'grey'
        elif region == 'Umtata' and display_name == 'Micrococcus ASV-1':
            box_color = '#66CCFF'
        else:
            box_color = '#ff5050'
        custom_palette = dict.fromkeys(order, box_color)

        # Boxplot (RCLR)
        sns.boxplot(
            data=df_box,
            x='group',
            y=taxon,
            order=order,
            palette=custom_palette,
            ax=ax_box,
            width=0.5,
            fliersize=0
        )

        sns.stripplot(
            data=df_box,
            x='group',
            y=taxon,
            order=order,
            palette=strip_palette,
            ax=ax_box,
            jitter=True,
            size=4,
            alpha=0.7
        )

        ax_box.set_ylabel("RCLR Rel. Abundance", fontsize=13)
        ax_box.set_xlabel("")
        # Tick labels follow `order`, without the 'skin-' prefix
        tick_labels = [
            name[len('skin-'):] if name.startswith('skin-') else name
            for name in order
        ]
        ax_box.set_xticklabels(tick_labels, fontsize=13)
        ax_box.tick_params(axis='y', labelsize=10)

        # Significance brackets: smallest adjusted p-value lowest, one `gap`
        # apart, starting just above the highest data point.
        stats_sorted = stats.sort_values(by='BH-corrected p-value')
        gap = 1.5
        y_base = df_box[taxon].max() + 0.5
        y_positions = [y_base + level * gap for level in range(len(stats_sorted))]

        if y_positions:
            ax_box.set_ylim(top=y_positions[-1] + gap * 0.7)

        for y, (_, row) in zip(y_positions, stats_sorted.iterrows()):
            g1, g2 = row['Pair']
            adjusted_p = row['BH-corrected p-value']
            stars = _significance_stars(adjusted_p)

            xpos1 = order.index(g1)
            xpos2 = order.index(g2)
            x_center = (xpos1 + xpos2) / 2

            ax_box.plot([xpos1, xpos1, xpos2, xpos2],
                        [y, y - 0.05, y - 0.05, y],
                        lw=1, color='black')

            ax_box.text(x_center, y + 0.05, f"{adjusted_p:.1e} {stars}",
                        ha='center', va='bottom', fontsize=10)

        # Correlation plot (rank_scorad vs rank_abundance); pearsonr needs at
        # least two observations, otherwise the panel is left empty.
        if len(df_corr) >= 2:
            # NOTE(review): the value is a Pearson correlation of the retained
            # ranks; it is labelled "Spearman r" in the figure - confirm that
            # this is the intended statistic given the rank filter above.
            r, corr_p = pearsonr(df_corr['rank_scorad'], df_corr['rank_abundance'])
            # Highlight only p < 0.05, the same threshold as the box plots;
            # an undefined (NaN) p-value stays grey.
            dot_color = '#ff5050' if corr_p < 0.05 else 'grey'

            sns.regplot(
                data=df_corr,
                x='rank_scorad',
                y='rank_abundance',
                scatter_kws={'alpha': 0.5, 's': 20, 'color': dot_color},
                line_kws={'color': 'black'},
                ax=ax_corr
            )

            r_label = f"Spearman r = {r:.2f}\np = {corr_p:.1e}"
            ax_corr.text(0.05, 0.90, r_label, transform=ax_corr.transAxes,
                         fontsize=12, va='top', ha='left',
                         bbox=dict(facecolor='white', alpha=0.7, edgecolor='none'))

        ax_corr.set_xlabel("oSCORAD (ranked)", fontsize=13)
        ax_corr.set_ylabel("Rank Rel. Abundance", fontsize=13)
        ax_corr.tick_params(axis='x', labelsize=10)
        ax_corr.tick_params(axis='y', labelsize=10)

        if i < len(title_y_positions):
            fig.text(0.5, title_y_positions[i], display_name, ha='center', fontsize=16)

    fig.suptitle(title_name, fontsize=18, y=0.98)
    plt.tight_layout(rect=[0, 0, 1, 0.96])
    plt.subplots_adjust(hspace=0.5, wspace=0.5)
    sns.despine()

    return fig


In [10]:
# Feature table to analyse; the rarefaction depth is part of the filename
biom_path = (
    '../Data/Tables/Count_Tables/'
    f'6_209766_feature_table_dedup_prev-filt-1pct_rare-{depth}_Genus-ASV_skin.biom'
)
regions = ['Cape Town', 'Umtata']
skin_groups = ['skin-H', 'skin-ADNL', 'skin-ADL']

# RCLR-transformed tables, one per region
rclr_dfs = get_rclr_transformed_by_region(biom_path, metadata, regions, skin_groups)

In [11]:
# Colour per skin group
group_palette = {
    'skin-H': '#ADD8E6',     # light blue
    'skin-ADNL': '#FFDAB9',  # peach
    'skin-ADL': '#E31A1C',   # red
}

# Strip-plot points: the same dark grey for every group
strip_palette = dict.fromkeys(('skin-H', 'skin-ADNL', 'skin-ADL'), '#3b3b3b')

In [12]:
# Log-transformed relative abundances, one table per region. The `regions`
# and `skin_groups` lists defined for the RCLR tables are reused so both sets
# of tables always cover the same regions and groups.
log_dfs = get_log_transformed_by_region(
    biom_path=biom_path,
    metadata=metadata,
    regions=regions,
    skin_groups=skin_groups,
)


In [13]:
# Cape Town: box plots and severity correlations for the selected ASVs
fig = plot_combined_box_and_severity(
    title_name='Skin Differential Taxa in Cape Town Children',
    region='Cape Town',
    rclr_df=rclr_dfs['Cape Town'],
    log_df=log_dfs['Cape Town'],
    group_palette=group_palette,
    strip_palette=strip_palette,
    taxa_list=[
        'g__Streptococcus_ASV-1', 'g__Streptococcus_ASV-2',
        'g__Staphylococcus_ASV-1', 'g__Micrococcus_ASV-1',
        'g__Veillonella_A_ASV-1',
    ],
)
# Save the figure as a JPEG at 1000 dpi
fig.savefig('../Figures/Main/Fig_4E.jpg', dpi=1000)


  grouped_vals = vals.groupby(grouper)
  sns.stripplot(
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  data_subset = grouped_data.get_group(pd_key)
  grouped_vals = vals.groupby(grouper)
  sns.stripplot(
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  data_subset = grouped_data.get_group(pd_key)
  grouped_vals = vals.groupby(grouper)
  sns.stripplot(
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  data_subset = grouped_data.get_group(pd_key)
  grouped_vals = vals.groupby(grouper)
  sns.stripplot(
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  data_subset = grouped_data.get_group(pd_key)
  grouped_vals = vals.groupby(grouper)
  sns.stripplot(
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na',

In [14]:
# Umtata: box plots and severity correlations for the selected ASVs
fig = plot_combined_box_and_severity(
    title_name='Skin Differential Taxa in Umtata Children',
    region='Umtata',
    rclr_df=rclr_dfs['Umtata'],
    log_df=log_dfs['Umtata'],
    group_palette=group_palette,
    strip_palette=strip_palette,
    taxa_list=[
        'g__Streptococcus_ASV-1', 'g__Streptococcus_ASV-2',
        'g__Staphylococcus_ASV-1', 'g__Micrococcus_ASV-1',
        'g__Veillonella_A_ASV-1',
    ],
)
# Save the figure as a JPEG at 1000 dpi
fig.savefig('../Figures/Main/Fig_4D.jpg', dpi=1000)


  grouped_vals = vals.groupby(grouper)
  sns.stripplot(
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  data_subset = grouped_data.get_group(pd_key)
  grouped_vals = vals.groupby(grouper)
  sns.stripplot(
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  data_subset = grouped_data.get_group(pd_key)
  grouped_vals = vals.groupby(grouper)
  sns.stripplot(
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  data_subset = grouped_data.get_group(pd_key)
  grouped_vals = vals.groupby(grouper)
  sns.stripplot(
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  data_subset = grouped_data.get_group(pd_key)
  grouped_vals = vals.groupby(grouper)
  sns.stripplot(
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na',