In [541]:
# Import Python packages
import pandas as pd
import numpy as np
import biom
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import cycle
import os
from matplotlib.colors import ListedColormap
from matplotlib.colors import to_rgba
import itertools


In [542]:
# A list of colors (hex strings) to use for taxa that have no predefined color.
# NOTE(review): 20 colors are requested from seaborn's "deep" palette; if that
# palette defines fewer base colors the list will contain repeats - confirm
# before relying on every entry being distinct.
unique_colors = sns.color_palette("deep", n_colors=20).as_hex()
unique_color_iter = cycle(unique_colors)  # Endless iterator over unique_colors (wraps around when exhausted)

In [543]:
def load_biom_table(biom_path, metadata_path, sample_id_prefix='15564.'):
    """
    Load a BIOM table and its metadata, keep skin samples only, and return a
    relative-abundance table together with the matching metadata columns.

    Parameters:
    - biom_path: path to the BIOM feature table (observations x samples).
    - metadata_path: path to a tab-separated metadata file containing the
      columns '#sample-id', 'area' and 'case_type'.
    - sample_id_prefix: (Optional) literal prefix stripped from the start of
      each BIOM sample ID so that IDs match the metadata index. Pass '' or
      None to leave the IDs untouched.

    Returns:
    - df_rel_abund: taxa x sample DataFrame (relative abundance, taxa as rows),
      with samples ordered by decreasing total abundance.
    - metadata_filtered: DataFrame with the 'area' and 'case_type' columns of
      the retained samples (indexed by sample ID). Its rows keep the order
      they had before the abundance sort, so align on the index, not on
      position.
    """

    # Load metadata
    metadata = pd.read_csv(metadata_path, sep='\t')
    metadata = metadata.set_index('#sample-id')

    # Load BIOM table
    table = biom.load_table(biom_path)
    df = pd.DataFrame(table.matrix_data.toarray(),
                      index=table.ids(axis='observation'),
                      columns=table.ids(axis='sample'))

    # Strip the study prefix literally and only at the start of the ID.
    # (`Index.str.replace('15564.', '')` depends on the pandas version: with
    # regex matching the '.' is a wildcard, and in either mode the pattern may
    # match in the middle of an ID.)
    if sample_id_prefix:
        df.columns = [
            sample_id[len(sample_id_prefix):] if sample_id.startswith(sample_id_prefix) else sample_id
            for sample_id in df.columns
        ]

    df = df.T  # samples as rows

    # Clean taxon labels: trim whitespace and name the empty genus 'g__Unknown'
    df.columns = df.columns.map(lambda x: 'g__Unknown' if x.strip() == 'g__' else x.strip())

    # Join metadata (left join: samples without metadata get NaN and are
    # removed by the skin filter below)
    df = df.join(metadata[['area', 'case_type']], how='left')

    # Filter for skin samples only
    df = df[df['case_type'].str.endswith('skin', na=False)]

    # Separate metadata from abundance data
    metadata_filtered = df[['area', 'case_type']]
    df = df.drop(columns=['area', 'case_type'])

    # Sort rows by total abundance
    df['row_sum'] = df.sum(axis=1)
    df = df.sort_values(by='row_sum', ascending=False).drop(columns=['row_sum'])

    # Convert to relative abundance (each sample row sums to 1)
    df_rel_abund = df.div(df.sum(axis=1), axis=0)

    return df_rel_abund.T, metadata_filtered


In [544]:
# Function to keep the top 15 genera (by total abundance) and collapse the rest as "Others"
def collapse_top_15(df, n=15):
    """
    Keep the most abundant taxa and pool every other taxon into one row.

    Parameters:
    - df: taxa x sample DataFrame (taxa as rows).
    - n: (Optional) number of top taxa to keep; defaults to 15.

    Returns:
    - A new DataFrame holding the n taxa with the largest row sums (in
      decreasing order of row sum) followed by an 'Others' row containing the
      per-sample sum of all remaining taxa. The input frame is not modified.
    """
    top_taxa = df.sum(axis=1).nlargest(n).index
    # Explicit copy: the new row is added to an independent frame rather than
    # to a selection of the caller's data (avoids chained-assignment issues).
    df_top = df.loc[top_taxa].copy()
    df_top.loc['Others'] = df.loc[~df.index.isin(top_taxa)].sum()
    return df_top

In [545]:
def get_taxa_colors(taxa_list, global_taxa_color_map, taxa_colors=None, unique_color_iter=None):
    """
    Make sure every taxon in taxa_list has a color in the shared color map.

    Taxa already present in the map are left untouched. For each new taxon the
    color is taken, in order of preference, from the predefined dictionary,
    from the color iterator, or from a fixed gray fallback.

    Parameters:
    - taxa_list: Iterable of taxa names that need a color.
    - global_taxa_color_map: Dictionary of already assigned colors; it is
      updated in place.
    - taxa_colors: (Optional) dictionary of predefined colors for known taxa.
    - unique_color_iter: (Optional) iterator that yields a new color on each
      call to next().

    Returns:
    - The same global_taxa_color_map object, now covering all of taxa_list.
    """
    fallback_color = '#cccccc'

    for name in taxa_list:
        # Never overwrite a color that was assigned earlier
        if name in global_taxa_color_map:
            continue

        if taxa_colors and name in taxa_colors:
            chosen = taxa_colors[name]
        elif unique_color_iter:
            chosen = next(unique_color_iter)
        else:
            chosen = fallback_color

        global_taxa_color_map[name] = chosen

    return global_taxa_color_map


In [546]:
def plot_relative_abundance_two_panels(df, metadata, output_dir, taxa_color_map,
                                       output_name='Fig_3A.png'):
    """
    Create a two-panel stacked bar plot of mean relative abundance per group
    for Cape Town (top panel) and Umtata (bottom panel), with sample counts
    (n=) in the x-tick labels, and save it as a PNG.

    Parameters:
    - df: sample x taxa DataFrame of relative abundances (samples as rows).
    - metadata: DataFrame indexed by sample ID with 'area' and 'case_type'
      columns. It is not modified.
    - output_dir: directory the figure is written to (created if missing).
    - taxa_color_map: dictionary mapping taxon name to color; taxa without an
      entry are drawn in '#ADD8E6'.
    - output_name: (Optional) file name of the saved figure inside output_dir.

    Raises:
    - ValueError: if 'area' or 'case_type' is missing for any shared sample.
    - KeyError: if any of the six expected area/case_type groups has no samples.
    """
    # Restrict both tables to the samples they have in common
    shared_samples = df.index.intersection(metadata.index)
    df = df.loc[shared_samples]
    metadata = metadata.loc[shared_samples]

    # Ensure no missing values
    if metadata[['case_type', 'area']].isnull().any().any():
        raise ValueError("Missing values in 'area' or group column in metadata.")

    # Composite grouping label, built as a standalone Series so that no column
    # is written onto a slice of the caller's metadata.
    site_group = (metadata['area'] + ' | ' + metadata['case_type']).rename('site_group')
    print(site_group.value_counts())

    # Number of samples per group, used in the tick labels
    group_counts = site_group.value_counts().to_dict()
    print(group_counts)

    # Mean relative abundance per group; taxa as rows, groups as columns
    df_grouped = df.groupby(site_group).mean().T

    # Plotting order inside each panel and the short tick label of each group
    case_types = ['control-nonlesional skin', 'case-nonlesional skin', 'case-lesional skin']
    short_labels = ['H', 'ADNL', 'ADL']
    panel_groups = {
        area: [f'{area} | {case_type}' for case_type in case_types]
        for area in ('Cape Town', 'Umtata')
    }

    expected = [group for groups in panel_groups.values() for group in groups]
    missing = [col for col in expected if col not in df_grouped.columns]
    if missing:
        raise KeyError(f"Missing expected site_group columns in df_grouped: {missing}")

    # Create output directory
    os.makedirs(output_dir, exist_ok=True)
    output_png = os.path.join(output_dir, output_name)

    fig, axes = plt.subplots(2, 1, figsize=(7.5, 8.5), sharex=False)

    # Both panels are drawn identically; only the area and its groups differ
    for ax, (area, groups) in zip(axes, panel_groups.items()):
        panel = df_grouped[groups]
        tick_labels = [f'{short} (n={group_counts.get(group, 0)})'
                       for short, group in zip(short_labels, groups)]

        panel.T.plot(kind='bar', stacked=True, ax=ax, width=0.6,
                     color=[taxa_color_map.get(taxon, '#ADD8E6') for taxon in panel.index],
                     legend=False)
        ax.set_title(area, fontsize=18)
        ax.set_ylabel('Relative Abundance', fontsize=14)
        ax.set_xlabel('')  # drop the 'site_group' axis label added by pandas
        ax.set_xlim(-0.4, 2.4)
        ax.set_xticks(range(3))
        ax.set_xticklabels(tick_labels, rotation=0, fontsize=14)

    # Shared legend (both panels contain the same taxa in the same order)
    handles, labels = axes[0].get_legend_handles_labels()
    fig.legend(
        handles, labels,
        loc='center left',
        bbox_to_anchor=(0.65, 0.5),
        fontsize=12,
        title_fontsize=14,
        frameon=True
    )

    fig.suptitle('Relative Abundance Skin', fontsize=20, y=1.05)
    # Leave the right part of the figure free for the legend
    fig.subplots_adjust(left=0.15, right=0.65, top=0.93, bottom=0.08)

    # Save and close this specific figure rather than pyplot's "current" one
    fig.savefig(output_png, dpi=600, bbox_inches='tight')
    plt.close(fig)

    print(f"sFigure saved to: {output_png}")

In [547]:
# Input locations: genus-level relative abundance table and sample metadata
biom_path = '../Data/Tables/Relative_Abundance_Tables/df_16S_filtered_feature_table_rare_Genus_relative_abundance.biom'
# Alternative input (absolute abundance table):
# biom_path = '../Data/Tables/Absolute_Abundance_Tables/209766_filtered_by_prevalence_5pct_rare_Genus-ASV-non-collapse.biom'
metadata_path = '../Data/Metadata/updated_clean_ant_skin_metadata_microbiome_type-location.tab'

# Load the skin samples and reduce the table to the top taxa plus 'Others'
df, metadata = load_biom_table(biom_path, metadata_path)
df = collapse_top_15(df)

# Fold the unclassified genus into 'Others' and remove its own row
if 'g__Unknown' in df.index:
    df.loc['Others'] = df.loc['Others'].add(df.loc['g__Unknown'])
    df = df.drop(index='g__Unknown')

# Swap the first two taxa so that the second-ranked one is listed first
taxa_order = list(df.index)
taxa_order[0], taxa_order[1] = taxa_order[1], taxa_order[0]
df = df.reindex(taxa_order)

# Samples as rows, taxa as columns (the layout expected by the plotting function)
df = df.T
df

Unnamed: 0,g__Streptococcus,g__Staphylococcus,g__Corynebacterium,g__Acinetobacter,g__Prevotella,g__Micrococcus,g__Haemophilus_D_734546,g__Veillonella_A,g__Psychrobacter,g__Neisseria_563205,g__SIO2C1,g__Cutibacterium,g__Chryseobacterium_796614,g__Pseudomonas_E_647464,Others
900360,0.128946,0.069174,0.069510,0.046004,0.032908,0.145064,0.051377,0.003694,0.034923,0.038281,0.011081,0.008059,0.000000,0.000000,0.360981
900262,0.066533,0.774657,0.018723,0.014376,0.011033,0.009027,0.001337,0.019057,0.001672,0.001672,0.007021,0.010699,0.000000,0.000000,0.064193
900446,0.107333,0.089333,0.108333,0.013000,0.024333,0.020667,0.000000,0.020000,0.000000,0.002000,0.000000,0.005667,0.360000,0.000333,0.249000
900263,0.153000,0.103000,0.132333,0.050000,0.071667,0.022000,0.001667,0.013667,0.000000,0.081000,0.011333,0.010333,0.001000,0.000000,0.349000
900555,0.152451,0.119208,0.201478,0.048355,0.044661,0.114171,0.000000,0.065144,0.005037,0.010745,0.000000,0.000000,0.000000,0.000000,0.238751
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
900426,0.037679,0.094365,0.001000,0.030343,0.396465,0.012671,0.001667,0.000667,0.006002,0.001000,0.006669,0.016339,0.038680,0.000000,0.356452
900243,0.058333,0.639333,0.045667,0.010667,0.018000,0.021000,0.000333,0.021667,0.002333,0.005333,0.003333,0.004667,0.000000,0.000667,0.168667
900570,0.120454,0.236904,0.012346,0.018352,0.215215,0.000000,0.000667,0.028695,0.002002,0.010010,0.000667,0.003337,0.001001,0.000000,0.350350
900572,0.486667,0.039333,0.011333,0.008333,0.012667,0.004333,0.007000,0.018667,0.000333,0.068667,0.001000,0.041000,0.018000,0.000000,0.282667


In [548]:
# Reload the full metadata table, discard the leftover 'Unnamed: 0' column
# and index the rows by sample ID
metadata = (
    pd.read_csv('../Data/Metadata/updated_clean_ant_skin_metadata_microbiome_type-location.tab', sep='\t')
    .drop(columns='Unnamed: 0')
    .set_index('#sample-id')
)
# Clear the index label so that no '#sample-id' header is shown
metadata.index.name = None

metadata

Unnamed: 0,PlateNumber,PlateLocation,i5,i5Sequence,i7,i7Sequence,identifier,Sequence,Plate ID,Well location,...,enrolment_season,hiv_exposure,hiv_status,household_size,o_scorad,FWD_filepath,REV_filepath,o_scorad_adj,individual_case,individual_case_location
Ca009ST_L,1,A1,SA501,ATCGTACG,SA701,CGAGAGTT,SA701SA501,CGAGAGTT-ATCGTACG,1.010000e+21,A1,...,Autumn,Unexposed,negative,4.0,40,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,40,AD skin,AD skin Cape Town
900221,1,B1,SA502,ACTATCTG,SA701,CGAGAGTT,SA701SA502,CGAGAGTT-ACTATCTG,1.010000e+21,B1,...,Winter,Unexposed,negative,7.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,34,AD skin,AD skin Umtata
Ca010EB_L,1,C1,SA503,TAGCGAGT,SA701,CGAGAGTT,SA701SA503,CGAGAGTT-TAGCGAGT,1.010000e+21,C1,...,Spring,Unexposed,negative,7.0,21,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,21,AD skin,AD skin Cape Town
900460,1,D1,SA504,CTGCGTGT,SA701,CGAGAGTT,SA701SA504,CGAGAGTT-CTGCGTGT,1.010000e+21,D1,...,Spring,Unexposed,,4.0,40,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,40,AD skin,AD skin Umtata
900051,1,E1,SA505,TCATCGAG,SA701,CGAGAGTT,SA701SA505,CGAGAGTT-TCATCGAG,1.010000e+21,E1,...,Autumn,Unexposed,negative,7.0,41,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,41,AD skin,AD skin Cape Town
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Ca006ON_L_2,6,H1,SA508,GACACCGT,SB701,CTCGACTT,SB701SA508,CTCGACTT-GACACCGT,1.010000e+21,H1,...,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,34,AD skin,AD skin Cape Town
Ca006ON_NL,6,F2,SA506,CGTGAGTG,SB702,CGAAGTAT,SB702SA506,CGAAGTAT-CGTGAGTG,1.010000e+21,F2,...,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,34,AD skin,AD skin Cape Town
Ca006ON_NL_2,6,H2,SA508,GACACCGT,SB702,CGAAGTAT,SB702SA508,CGAAGTAT-GACACCGT,1.010000e+21,H2,...,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,34,AD skin,AD skin Cape Town
Ca006ON_PN,6,F3,SA506,CGTGAGTG,SB703,TAGCAGCT,SB703SA506,TAGCAGCT-CGTGAGTG,1.010000e+21,F3,...,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,34,AD nares,AD nares Cape Town


In [549]:
# Number of samples per case_type across the whole metadata table (skin and nares)
metadata['case_type'].value_counts()

case_type
case-nonlesional skin       111
case-anterior nares         108
case-lesional skin          107
control-anterior nares       89
control-nonlesional skin     87
Name: count, dtype: int64

In [550]:
# Directory the main figures are written to
output_dir = '../Figures/Main/'


# Fixed color per genus. Keys must match the column names of `df` exactly;
# any taxon without an entry is drawn in the plotting function's default color.
taxa_colors = {
    'g__Streptococcus': '#ff0000',          # bright red
    'g__Corynebacterium': '#a5d8ff',        # light sky blue
    'g__Acinetobacter': '#ffe59a',          # soft pastel yellow
    'g__Prevotella': '#a5c9a1',             # muted greenish mint
    'g__Micrococcus': '#0000FF',            # blue
    'g__Haemophilus_D_734546': '#E6E6FA',   # pale lavender
    'g__Staphylococcus': '#000000',         # black
    'g__Veillonella_A': '#FA5F55',          # sunset orange
    'g__Psychrobacter': '#accbe1',          # cool steel blue
    'g__Neisseria_563205': '#b0d9b1',       # light sage green
    'g__SIO2C1': '#f7c59f',                 # warm peach
    'g__Cutibacterium': '#FADADD',          # pale pink
    # Key corrected from 'g__Chryseobacterium_7966' so that it matches the
    # column name in `df`; the truncated key was never looked up.
    'g__Chryseobacterium_796614': '#dfc5a5',  # muted tan
    'g__Pseudomonas_E_647464': '#d4a5a5',   # muted rose
    'Others': '#d9d9d9'                     # neutral gray
}


# Run the plotting function
plot_relative_abundance_two_panels(
    df=df,
    metadata=metadata,
    output_dir=output_dir,
    taxa_color_map=taxa_colors)

site_group
Umtata | case-lesional skin             37
Umtata | control-nonlesional skin       27
Cape Town | case-lesional skin          24
Umtata | case-nonlesional skin          19
Cape Town | case-nonlesional skin       19
Cape Town | control-nonlesional skin    14
Name: count, dtype: int64
{'Umtata | case-lesional skin': 37, 'Umtata | control-nonlesional skin': 27, 'Cape Town | case-lesional skin': 24, 'Umtata | case-nonlesional skin': 19, 'Cape Town | case-nonlesional skin': 19, 'Cape Town | control-nonlesional skin': 14}
sFigure saved to: ../Figures/Main/Fig_3A.png
