In [47]:
# Import Python packages
import pandas as pd
import numpy as np
import biom
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import cycle
import os
from matplotlib.colors import ListedColormap
from matplotlib.colors import to_rgba
import itertools


In [48]:
# Taxonomic rank for this analysis. It selects the predefined colour palette in the
# next cell; the plotting cell further down assigns the same name again.
taxa_level = 'Genus'

In [49]:
# Predefined colour palette for specific genera.
# Keys carry no leading/trailing whitespace: load_biom_table() strips every taxon
# label, so a padded key such as ' g__Cutibacterium' could never match a row name.
# The palette starts empty so the name is defined for every taxa_level.
taxa_colors = {}
if taxa_level == "Genus":
    taxa_colors = {
        'g__Cutibacterium': '#ffa505',    # Bright orange
        'g__Staphylococcus': '#92f0f0',   # Fluorescent light blue
        'g__Streptococcus': '#FF0000',    # Red
        'g__Corynebacterium': '#ffe59a',  # Pastel yellow
        'g__Lawsonella': '#70a8dc',       # Light blue
        'g__Veillonella': '#c5bce0',      # Pastel purplish
        'g__Micrococcus': '#f4cccd',      # Pale pink
        'g__Alloprevotella': '#bcbcbc',   # Light gray
        'g__Lactobacillus': '#daead3',    # Pale mint green
        'g__Neisseria': '#f6475f',        # Reddish pink
        'Others': '#ededed'               # Very light gray
    }

In [50]:
# Fallback colours for taxa that have no predefined colour.
# "tab20" provides 20 distinct colours. The previous "deep" palette only has 10, so
# requesting n_colors=20 made seaborn cycle it and return every colour twice.
unique_colors = sns.color_palette("tab20", n_colors=20).as_hex()
# Endless iterator: after the 20th colour it starts again from the first one
unique_color_iter = cycle(unique_colors)

In [51]:
def load_biom_table(biom_path, metadata_path):
    """
    Load a BIOM table and its metadata, keep only nares samples, and return a
    relative abundance table together with the matching metadata.

    Parameters:
    - biom_path: path to the BIOM feature table (observations x samples).
    - metadata_path: path to a tab-separated metadata file that has a '#sample-id'
      column plus 'area' and 'case_type' columns.

    Returns:
    - df_rel_abund: taxa x sample DataFrame (relative abundance, taxa as rows);
      samples (columns) are ordered by decreasing total abundance.
    - metadata_filtered: DataFrame indexed by sample ID with the 'area' and
      'case_type' columns of the retained samples (in pre-sort order).
    """

    # Load metadata and index it by sample ID
    metadata = pd.read_csv(metadata_path, sep='\t')
    metadata = metadata.set_index('#sample-id')

    # Load BIOM table into a dense observation x sample frame
    table = biom.load_table(biom_path)
    df = pd.DataFrame(table.matrix_data.toarray(),
                      index=table.ids(axis='observation'),
                      columns=table.ids(axis='sample'))

    # Remove the literal '15564.' tag from sample IDs so they match the metadata index.
    # regex=False is explicit because the default differs between pandas versions;
    # as a regular expression the '.' would match any character.
    df.columns = df.columns.str.replace('15564.', '', regex=False)

    df = df.T  # samples as rows

    # Clean taxon labels: trim whitespace and name the empty genus 'g__Unknown'
    df.columns = df.columns.map(lambda x: 'g__Unknown' if x.strip() == 'g__' else x.strip())

    # Attach the metadata columns needed for filtering
    df = df.join(metadata[['area', 'case_type']], how='left')

    # Keep nares samples only; samples without metadata (NaN case_type) are dropped
    df = df[df['case_type'].str.endswith('nares', na=False)]

    # Separate metadata from abundance data
    metadata_filtered = df[['area', 'case_type']]
    df = df.drop(columns=['area', 'case_type'])

    # Sort samples by total abundance (largest first) using a temporary column
    df['row_sum'] = df.sum(axis=1)
    df = df.sort_values(by='row_sum', ascending=False).drop(columns=['row_sum'])

    # Convert to relative abundance; a sample whose counts sum to 0 becomes NaN
    df_rel_abund = df.div(df.sum(axis=1), axis=0)

    return df_rel_abund.T, metadata_filtered


In [52]:
# Keep the most abundant taxa and collapse the rest into a single "Others" row
def collapse_top_15(df, n=15):
    """
    Keep the `n` taxa with the largest total abundance and sum all remaining
    taxa into one 'Others' row.

    Parameters:
    - df: taxa x sample DataFrame (taxa as rows).
    - n: number of top taxa to keep (default 15).

    Returns:
    - New DataFrame with the top taxa ordered by decreasing total abundance,
      followed by an 'Others' row. The input frame is not modified.
    """
    top_taxa = df.sum(axis=1).nlargest(n).index
    # .copy() so that adding the 'Others' row never writes into a view of `df`
    df_top = df.loc[top_taxa].copy()
    # Column-wise sum of every taxon outside the top set (all zeros if there are none)
    df_top.loc['Others'] = df.loc[~df.index.isin(top_taxa)].sum()
    return df_top

In [53]:
def get_taxa_colors(taxa_list, global_taxa_color_map, taxa_colors=None, unique_color_iter=None):
    """
    Make sure every taxon in `taxa_list` has an entry in the shared colour map.

    Taxa already present in the map keep their colour. For a new taxon the colour
    is taken, in order of preference, from `taxa_colors`, from the next value of
    `unique_color_iter`, or from the gray fallback '#cccccc'.

    Parameters:
    - taxa_list: iterable of taxon names.
    - global_taxa_color_map: dict of taxon -> colour; updated in place.
    - taxa_colors: (Optional) dict of predefined colours for known taxa.
    - unique_color_iter: (Optional) iterator that yields colours for other taxa.

    Returns:
    - The same `global_taxa_color_map` object, now covering all of `taxa_list`.
    """
    for name in taxa_list:
        # Colours are sticky: never reassign a taxon that was seen before
        if name in global_taxa_color_map:
            continue

        if taxa_colors and name in taxa_colors:
            chosen = taxa_colors[name]
        elif unique_color_iter:
            chosen = next(unique_color_iter)
        else:
            chosen = '#cccccc'  # fallback color

        global_taxa_color_map[name] = chosen

    return global_taxa_color_map


In [54]:
metadata_path = '../Data/Metadata/updated_clean_ant_skin_metadata.tab'
metadata = pd.read_csv(metadata_path, sep='\t')
metadata = metadata.set_index('#sample-id')

# Parse o_scorad as a number; entries that cannot be parsed become NaN
metadata['o_scorad'] = pd.to_numeric(metadata['o_scorad'], errors='coerce')

# Derive severity_group from o_scorad:
#   o_scorad <= 25 -> 'mild', 25 < o_scorad <= 50 -> 'moderate', o_scorad > 50 -> 'severe'.
# pd.cut leaves a missing o_scorad as NaN. The previous comparison-based lambda
# labelled those rows 'mild', because every comparison with NaN is False.
metadata['severity_group'] = pd.cut(
    metadata['o_scorad'],
    bins=[-np.inf, 25, 50, np.inf],
    labels=['mild', 'moderate', 'severe'],
).astype(object)  # plain strings rather than a categorical column

metadata

Unnamed: 0_level_0,PlateNumber,PlateLocation,i5,i5Sequence,i7,i7Sequence,identifier,Sequence,Plate ID,Well location,...,sex,enrolment_date,enrolment_season,hiv_exposure,hiv_status,household_size,o_scorad,FWD_filepath,REV_filepath,severity_group
#sample-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ca009ST_L,1,A1,SA501,ATCGTACG,SA701,CGAGAGTT,SA701SA501,CGAGAGTT-ATCGTACG,1.010000e+21,A1,...,male,4/16/2015,Autumn,Unexposed,negative,4.0,40.0,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,moderate
900221,1,B1,SA502,ACTATCTG,SA701,CGAGAGTT,SA701SA502,CGAGAGTT-ACTATCTG,1.010000e+21,B1,...,female,8/11/2015,Winter,Unexposed,negative,7.0,34.0,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,moderate
Ca010EB_L,1,C1,SA503,TAGCGAGT,SA701,CGAGAGTT,SA701SA503,CGAGAGTT-TAGCGAGT,1.010000e+21,C1,...,female,11/20/2014,Spring,Unexposed,negative,7.0,21.0,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,mild
900460,1,D1,SA504,CTGCGTGT,SA701,CGAGAGTT,SA701SA504,CGAGAGTT-CTGCGTGT,1.010000e+21,D1,...,female,9/23/2015,Spring,Unexposed,,4.0,40.0,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,moderate
900051,1,E1,SA505,TCATCGAG,SA701,CGAGAGTT,SA701SA505,CGAGAGTT-TCATCGAG,1.010000e+21,E1,...,male,4/21/2015,Autumn,Unexposed,negative,7.0,41.0,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,moderate
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Ca006ON_L_2,6,H1,SA508,GACACCGT,SB701,CTCGACTT,SB701SA508,CTCGACTT-GACACCGT,1.010000e+21,H1,...,female,3/25/2015,Autumn,Unexposed,negative,3.0,34.0,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,moderate
Ca006ON_NL,6,F2,SA506,CGTGAGTG,SB702,CGAAGTAT,SB702SA506,CGAAGTAT-CGTGAGTG,1.010000e+21,F2,...,female,3/25/2015,Autumn,Unexposed,negative,3.0,34.0,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,moderate
Ca006ON_NL_2,6,H2,SA508,GACACCGT,SB702,CGAAGTAT,SB702SA508,CGAAGTAT-GACACCGT,1.010000e+21,H2,...,female,3/25/2015,Autumn,Unexposed,negative,3.0,34.0,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,moderate
Ca006ON_PN,6,F3,SA506,CGTGAGTG,SB703,TAGCAGCT,SB703SA506,TAGCAGCT-CGTGAGTG,1.010000e+21,F3,...,female,3/25/2015,Autumn,Unexposed,negative,3.0,34.0,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,moderate


In [55]:
# Number of samples in each severity group (value_counts() omits missing values by default)
metadata['severity_group'].value_counts()

severity_group
mild        240
moderate    196
severe       66
Name: count, dtype: int64

In [56]:
# Number of samples per household size (value_counts() omits missing values by default)
metadata['household_size'].value_counts()

household_size
4.0     104
5.0      90
7.0      67
3.0      65
6.0      50
8.0      38
9.0      21
11.0     14
10.0     12
2.0      10
12.0      8
15.0      6
13.0      2
20.0      2
18.0      1
Name: count, dtype: int64

In [57]:
# Number of samples per case_type, i.e. case/control status combined with body site
metadata['case_type'].value_counts()

case_type
case-nonlesional skin       111
case-anterior nares         108
case-lesional skin          107
control-anterior nares       89
control-nonlesional skin     87
Name: count, dtype: int64

In [58]:
def plot_relative_abundance_two_panels(df, metadata, group_column, output_dir, key, taxa_color_map, taxa_level):
    """
    Create a two-panel stacked bar plot of mean relative abundances by region
    (top: Cape Town, bottom: Umtata), with sample counts (n=) in the x-tick
    labels, and save it as PNG and SVG.

    Parameters:
    - df: taxa x sample DataFrame of relative abundances (taxa as rows).
    - metadata: DataFrame indexed by sample ID; must contain 'area' and
      `group_column` for every column of `df`.
    - group_column: metadata column with the case/control label. The panel order
      is hard-coded, so its values must include 'control-anterior nares' and
      'case-anterior nares'.
    - output_dir: directory for the figure files (created if missing).
    - key: dataset identifier used as the file-name prefix.
    - taxa_color_map: dict of taxon -> colour; unmapped taxa are drawn in '#ADD8E6'.
    - taxa_level: taxonomic rank, used in the file name and as the legend title.

    Raises:
    - KeyError: a sample of `df` or a required column is absent from `metadata`,
      or one of the four expected area/group combinations has no samples.
    - ValueError: 'area' or `group_column` contains missing values.
    """
    # Align metadata rows with the sample order of df.
    # .copy() so that adding 'site_group' below never touches the caller's frame.
    metadata = metadata.loc[df.columns].copy()

    # Ensure no missing values
    if metadata[[group_column, 'area']].isnull().any().any():
        raise ValueError("Missing values in 'area' or group column in metadata.")

    # Composite "<area> | <group>" label used for grouping
    metadata['site_group'] = metadata['area'] + ' | ' + metadata[group_column]

    # Number of samples per group, shown in the tick labels
    group_counts = metadata['site_group'].value_counts().to_dict()

    # Mean abundance of every taxon within each site_group
    df_grouped = df.T.groupby(metadata['site_group']).mean().T

    # Plotting order: control first, then case
    ct_order = ['Cape Town | control-anterior nares', 'Cape Town | case-anterior nares']
    um_order = ['Umtata | control-anterior nares', 'Umtata | case-anterior nares']

    missing = [col for col in ct_order + um_order if col not in df_grouped.columns]
    if missing:
        raise KeyError(f"Missing expected site_group columns in df_grouped: {missing}")

    # Output locations
    os.makedirs(output_dir, exist_ok=True)
    output_png = os.path.join(output_dir, f'{key}_{taxa_level}_relative_abundance_CapeTown_Umtata_nares.png')
    output_svg = os.path.join(output_dir, f'{key}_{taxa_level}_relative_abundance_CapeTown_Umtata_nares.svg')

    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(5.5, 8.5), sharex=False)

    short_labels = ['H', 'AD']  # control, case: same order as ct_order / um_order
    # One colour per taxon row; identical for both panels
    bar_colors = [taxa_color_map.get(taxon, '#ADD8E6') for taxon in df_grouped.index]

    for ax, panel_title, order in ((ax1, 'Cape Town', ct_order), (ax2, 'Umtata', um_order)):
        df_grouped[order].T.plot(kind='bar', stacked=True, ax=ax, width=0.6,
                                 color=bar_colors, legend=False)
        ax.set_title(panel_title, fontsize=18)
        ax.set_ylabel('Relative Abundance', fontsize=14)
        ax.set_xlim(-0.4, 1.5)
        ax.set_xticks(range(2))
        tick_labels = [f'{short} (n={group_counts.get(group, 0)})'
                       for short, group in zip(short_labels, order)]
        ax.set_xticklabels(tick_labels, rotation=0, fontsize=14)
        # pandas labels the x-axis with the index name ('site_group'); hide it
        ax.set_xlabel('')

    # Shared legend, placed in the margin reserved by subplots_adjust(right=0.65)
    handles, labels = ax1.get_legend_handles_labels()
    fig.legend(
        handles, labels,
        loc='center left',
        bbox_to_anchor=(0.65, 0.5),
        fontsize=12,
        title=taxa_level,
        title_fontsize=14,
        frameon=True
    )

    fig.suptitle('Relative Abundance Nares', fontsize=20, y=1.01)
    fig.subplots_adjust(left=0.15, right=0.65, top=0.93, bottom=0.08)

    # bbox_inches='tight' on both files: the suptitle sits at y=1.01, i.e. outside
    # the nominal canvas, and would be cut off otherwise.
    fig.savefig(output_png, dpi=600, bbox_inches='tight')
    fig.savefig(output_svg, bbox_inches='tight')
    plt.close(fig)

    print(f"✅ Figure saved to: {output_png}")


In [59]:
# Input feature table (a genus-level relative abundance table, going by its file name)
biom_path = '../Data/Tables/Relative_Abundance_Tables/df_16S_filtered_feature_table_rare_Genus_relative_abundance.biom'
# Alternative input, kept for reference:
# biom_path = '../Data/Tables/Absolute_Abundance_Tables/209766_filtered_by_prevalence_5pct_rare_Genus-ASV-non-collapse.biom'

# NOTE(review): `metadata_path` is whatever an earlier cell assigned, and `metadata`
# is rebound here to the two-column ('area', 'case_type') frame returned by
# load_biom_table(), replacing the full metadata table loaded earlier.
df, metadata = load_biom_table(biom_path, metadata_path)
df = collapse_top_15(df)

# If the unclassified genus made it into the top taxa, fold it into 'Others'
# and remove its own row.
if 'g__Unknown' in df.index:
    df.loc['Others'] += df.loc['g__Unknown']
    df = df.drop(index='g__Unknown')

df


Unnamed: 0,900112,900583,900322,900396,900402,900359,900282,900267,900456,900126,...,900609,900257,900136,900053,900248,900134,900324,900466,900286,900333
g__Streptococcus,0.097667,0.280333,0.433,0.030333,0.07,0.93,0.016,0.57,0.222333,0.477667,...,0.381667,0.628667,0.536333,0.545667,0.84,0.971638,0.853,0.971667,0.534667,0.358932
g__Haemophilus_D_734546,0.275333,0.299333,0.037,0.004333,0.0,0.0,0.418667,0.135,0.0,0.331,...,0.537,0.002,0.375667,0.378333,0.010333,0.0,0.0,0.0,0.084667,0.0
g__Corynebacterium,0.048667,0.014333,0.048,0.055333,0.326667,0.045333,0.209333,0.013667,0.008667,0.074333,...,0.027667,0.185667,0.045333,0.001,0.069667,0.011345,0.0,0.000333,0.059,0.003005
g__Staphylococcus,0.021667,0.049333,0.001,0.189333,0.193,0.001667,0.012,0.001333,0.074667,0.002,...,0.0,0.013,0.0,0.0,0.000667,0.001335,0.002,0.001,0.005333,0.0
g__Dolosigranulum,0.016667,0.015333,0.002333,0.0,0.0,0.005,0.07,0.007,0.000667,0.037667,...,0.047,0.061,0.002333,0.005333,0.047667,0.001668,0.040667,0.022333,0.020333,0.0
g__SIO2C1,0.002667,0.000333,0.0,0.0,0.007667,0.0,0.0,0.0,0.554333,0.001667,...,0.0,0.0,0.001333,0.0,0.001333,0.0,0.001,0.001333,0.0,0.008681
g__Neisseria_563205,0.023333,0.046,0.067,0.004333,0.011333,0.0,0.0,0.111333,0.0,0.014667,...,0.0,0.016,0.000333,0.026333,0.0,0.000334,0.0,0.0,0.111333,0.104508
g__Pseudomonas_E_647464,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
g__Prevotella,0.011333,0.081333,0.026,0.115667,0.005,0.0,0.005667,0.003667,0.002667,0.018,...,0.000333,0.004333,0.0,0.002333,0.002,0.000667,0.002,0.000333,0.009,0.065776
g__Veillonella_A,0.008,0.002667,0.025,0.004333,0.003667,0.0,0.002,0.013667,0.003333,0.000667,...,0.0,0.002,0.0,0.004333,0.001,0.000667,0.001667,0.001,0.0,0.067112


In [60]:
# NOTE(review): metadata_path is reassigned here but not read again in the cells shown;
# the metadata passed to the plot below was loaded earlier by load_biom_table().
metadata_path = '../Data/Metadata/updated_clean_ant_nares_metadata_microbiome_type.tab'

# Output directory for plots
output_dir = '../Plots/Analysis_figures/Relative_Abundance'

# Dataset key and taxonomic level
key = '16S_V4'
taxa_level = 'Genus'

# Colour per genus.
# NOTE(review): 'g__Prevotella' and 'g__Gemella' share '#a5c9a1' and cannot be told
# apart if both are plotted — consider giving one of them a different colour.
taxa_colors = {
    'g__Staphylococcus': '#000000',         # black
    'g__Streptococcus': '#ff0000',          # bright red
    'g__Corynebacterium': '#a5d8ff',        # light sky blue
    'g__Acinetobacter': '#ffe59a',          # soft pastel yellow
    'g__Prevotella': '#a5c9a1',             # muted greenish mint
    'g__Micrococcus': '#0000FF',            # blue
    'g__Haemophilus_D_734546': '#E6E6FA',   # pale lavender
    'g__Veillonella_A': '#FA5F55',          # sunset orange
    'g__Psychrobacter': '#accbe1',          # cool steel blue
    'g__Neisseria_563205': '#b0d9b1',       # light sage green
    'g__SIO2C1': '#f7c59f',                 # warm peach
    'g__Cutibacterium': '#FADADD',          # pale pink
    'g__Chryseobacterium_7966': '#dfc5a5',  # muted tan
    'g__Dolosigranulum': '#d4a5a5',         # muted rose
    'g__Gemella': '#a5c9a1',                # muted greenish mint
    'g__Granulicatella': '#CCCCFF',         # periwinkle blue
    'Others': '#d9d9d9'                     # neutral gray
}


# Run the plotting function.
# group_column must be 'case_type': the metadata returned by load_biom_table() only
# has the 'area' and 'case_type' columns, and the plotting function orders its bars
# by case_type labels ('control-anterior nares', 'case-anterior nares').
# Passing 'severity_group' raises a KeyError.
plot_relative_abundance_two_panels(
    df=df,
    metadata=metadata,
    group_column='case_type',
    output_dir=output_dir,
    key=key,
    taxa_color_map=taxa_colors,
    taxa_level=taxa_level
)


✅ Figure saved to: ../Plots/Analysis_figures/Relative_Abundance/16S_V4_Genus_relative_abundance_CapeTown_Umtata_nares.png


In [61]:
def plot_relative_abundance_two_panels(df, metadata, group_column, output_dir, key, taxa_color_map, taxa_level):
    """
    Create a two-panel stacked bar plot of mean relative abundances by region
    (top: Cape Town, bottom: Umtata), with sample counts (n=) in the x-tick
    labels, and save it as PNG and SVG.

    Parameters:
    - df: taxa x sample DataFrame of relative abundances (taxa as rows).
    - metadata: DataFrame indexed by sample ID; must contain 'area' and
      `group_column` for every column of `df`.
    - group_column: metadata column with the case/control label. The panel order
      is hard-coded, so its values must include 'control-anterior nares' and
      'case-anterior nares'.
    - output_dir: directory for the figure files (created if missing).
    - key: dataset identifier used as the file-name prefix.
    - taxa_color_map: dict of taxon -> colour; unmapped taxa are drawn in '#ADD8E6'.
    - taxa_level: taxonomic rank, used in the file name and as the legend title.

    Raises:
    - KeyError: a sample of `df` or a required column is absent from `metadata`,
      or one of the four expected area/group combinations has no samples.
    - ValueError: 'area' or `group_column` contains missing values.
    """
    # NOTE(review): a function with this name is also defined in an earlier cell and
    # this later definition shadows it — keep a single definition in the notebook.

    # Align metadata rows with the sample order of df.
    # .copy() so that adding 'site_group' below never touches the caller's frame.
    metadata = metadata.loc[df.columns].copy()

    # Ensure no missing values
    if metadata[[group_column, 'area']].isnull().any().any():
        raise ValueError("Missing values in 'area' or group column in metadata.")

    # Composite "<area> | <group>" label used for grouping
    metadata['site_group'] = metadata['area'] + ' | ' + metadata[group_column]

    # Number of samples per group, shown in the tick labels
    group_counts = metadata['site_group'].value_counts().to_dict()

    # Mean abundance of every taxon within each site_group
    df_grouped = df.T.groupby(metadata['site_group']).mean().T

    # Plotting order: control first, then case
    ct_order = ['Cape Town | control-anterior nares', 'Cape Town | case-anterior nares']
    um_order = ['Umtata | control-anterior nares', 'Umtata | case-anterior nares']

    missing = [col for col in ct_order + um_order if col not in df_grouped.columns]
    if missing:
        raise KeyError(f"Missing expected site_group columns in df_grouped: {missing}")

    # Output locations
    os.makedirs(output_dir, exist_ok=True)
    output_png = os.path.join(output_dir, f'{key}_{taxa_level}_relative_abundance_CapeTown_Umtata_nares.png')
    output_svg = os.path.join(output_dir, f'{key}_{taxa_level}_relative_abundance_CapeTown_Umtata_nares.svg')

    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(5.5, 8.5), sharex=False)

    short_labels = ['H', 'AD']  # control, case: same order as ct_order / um_order
    # One colour per taxon row; identical for both panels
    bar_colors = [taxa_color_map.get(taxon, '#ADD8E6') for taxon in df_grouped.index]

    for ax, panel_title, order in ((ax1, 'Cape Town', ct_order), (ax2, 'Umtata', um_order)):
        df_grouped[order].T.plot(kind='bar', stacked=True, ax=ax, width=0.6,
                                 color=bar_colors, legend=False)
        ax.set_title(panel_title, fontsize=18)
        ax.set_ylabel('Relative Abundance', fontsize=14)
        ax.set_xlim(-0.4, 1.5)
        ax.set_xticks(range(2))
        tick_labels = [f'{short} (n={group_counts.get(group, 0)})'
                       for short, group in zip(short_labels, order)]
        ax.set_xticklabels(tick_labels, rotation=0, fontsize=14)
        # pandas labels the x-axis with the index name ('site_group'); hide it
        ax.set_xlabel('')

    # Shared legend, placed in the margin reserved by subplots_adjust(right=0.65)
    handles, labels = ax1.get_legend_handles_labels()
    fig.legend(
        handles, labels,
        loc='center left',
        bbox_to_anchor=(0.65, 0.5),
        fontsize=12,
        title=taxa_level,
        title_fontsize=14,
        frameon=True
    )

    fig.suptitle('Relative Abundance Nares', fontsize=20, y=1.01)
    fig.subplots_adjust(left=0.15, right=0.65, top=0.93, bottom=0.08)

    # bbox_inches='tight' on both files: the suptitle sits at y=1.01, i.e. outside
    # the nominal canvas, and would be cut off otherwise.
    fig.savefig(output_png, dpi=600, bbox_inches='tight')
    fig.savefig(output_svg, bbox_inches='tight')
    plt.close(fig)

    print(f"✅ Figure saved to: {output_png}")
