# Relative Abundance Barplot of Skin Samples by Region

In [205]:
# Import Python packages
import pandas as pd
import numpy as np
import biom
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import cycle
import os
from matplotlib.colors import ListedColormap
from matplotlib.colors import to_rgba
import itertools

In [206]:
# Colors to hand out to taxa that have no predefined color.
# seaborn's "deep" palette only holds 10 colors and repeats them when more are
# requested, so the 20-color qualitative "tab20" palette is used instead to get
# 20 distinct entries.
unique_colors = sns.color_palette("tab20", n_colors=20).as_hex()
unique_color_iter = cycle(unique_colors)  # Endless iterator; colors repeat after 20 draws

In [207]:
def load_biom_table(biom_path, metadata_path):
    """
    Load a BIOM table and its metadata, keep skin samples only, and return
    per-sample relative abundances together with the matching metadata.

    Parameters:
    - biom_path: path to the BIOM feature table (observations x samples).
    - metadata_path: path to a tab-separated metadata file that has a
      '#sample-id' column plus 'area' and 'case_type' columns.

    Returns:
    - df_rel_abund: taxa x sample DataFrame (relative abundance, taxa as rows).
      Samples are ordered by decreasing total abundance.
    - metadata_filtered: DataFrame with the 'area' and 'case_type' columns,
      indexed by sample ID. It holds the same samples as df_rel_abund but keeps
      the table's original sample order, not the sorted one.
    """

    # Load metadata, indexed by sample ID
    metadata = pd.read_csv(metadata_path, sep='\t').set_index('#sample-id')

    # Load BIOM table as a dense observations x samples DataFrame
    table = biom.load_table(biom_path)
    df = pd.DataFrame(table.matrix_data.toarray(),
                      index=table.ids(axis='observation'),
                      columns=table.ids(axis='sample'))

    # Remove the '15564.' text from sample IDs. regex=False keeps the '.'
    # literal; as a regex it would match any character after '15564'.
    df.columns = df.columns.str.replace('15564.', '', regex=False)

    df = df.T  # samples as rows

    # Clean taxon labels: trim whitespace and name the empty genus 'g__Unknown'
    df.columns = df.columns.map(lambda x: 'g__Unknown' if x.strip() == 'g__' else x.strip())

    # Join metadata (left join keeps every sample present in the table)
    df = df.join(metadata[['area', 'case_type']], how='left')

    # Filter for skin samples only; samples without a case_type are dropped
    df = df[df['case_type'].str.endswith('skin', na=False)]

    # Separate metadata from abundance data. The copy makes the returned frame
    # independent of df, so callers can add columns to it without pandas
    # chained-assignment warnings.
    metadata_filtered = df[['area', 'case_type']].copy()
    df = df.drop(columns=['area', 'case_type'])

    # Sort rows (samples) by total abundance
    df['row_sum'] = df.sum(axis=1)
    df = df.sort_values(by='row_sum', ascending=False).drop(columns=['row_sum'])

    # Convert to relative abundance (each sample row divided by its own total)
    df_rel_abund = df.div(df.sum(axis=1), axis=0)

    return df_rel_abund.T, metadata_filtered


In [208]:
def collapse_top_15(df, n=15, other_label='Others'):
    """
    Keep the most abundant taxa and collapse all remaining taxa into one row.

    Parameters:
    - df: taxa x sample DataFrame (taxa as rows).
    - n: number of top taxa to keep, ranked by their summed abundance across
      all samples (default 15).
    - other_label: index label of the row that receives the per-sample sum of
      every taxon outside the top n (default 'Others').

    Returns:
    - A new DataFrame with the top taxa (most abundant first) followed by the
      other_label row. The input DataFrame is left unchanged.
    """
    top_taxa = df.sum(axis=1).nlargest(n).index
    # Explicit copy so the new row is added to an independent frame,
    # never to a view of the caller's data.
    df_top = df.loc[top_taxa].copy()
    df_top.loc[other_label] = df.loc[~df.index.isin(top_taxa)].sum()
    return df_top

In [209]:
def get_taxa_colors(taxa_list, global_taxa_color_map, taxa_colors=None, unique_color_iter=None):
    """
    Make sure every taxon in taxa_list has a color in the shared color map.

    Taxa that are already in the map keep their color. For a new taxon the
    color comes from, in order of preference: the predefined taxa_colors
    dictionary, the next value of unique_color_iter, or the gray fallback
    '#cccccc'.

    Parameters:
    - taxa_list: iterable of taxa names that need a color.
    - global_taxa_color_map: dictionary of already assigned colors; it is
      updated in place.
    - taxa_colors: (Optional) predefined color dictionary for known taxa.
    - unique_color_iter: (Optional) iterator that yields new colors.

    Returns:
    - The same global_taxa_color_map object, now covering all of taxa_list.
    """
    for name in taxa_list:
        # Already colored: never reassign
        if name in global_taxa_color_map:
            continue

        if taxa_colors and name in taxa_colors:
            chosen = taxa_colors[name]
        elif unique_color_iter:
            chosen = next(unique_color_iter)
        else:
            chosen = '#cccccc'  # fallback color

        global_taxa_color_map[name] = chosen

    return global_taxa_color_map


In [210]:
metadata_path = '../Data/Metadata/updated_clean_ant_skin_metadata.tab'

# Read the tab-separated metadata and index it by sample ID
metadata = pd.read_csv(metadata_path, sep='\t').set_index('#sample-id')

# Make o_scorad numeric; entries that cannot be parsed become NaN
metadata['o_scorad'] = pd.to_numeric(metadata['o_scorad'], errors='coerce')

metadata

Unnamed: 0_level_0,PlateNumber,PlateLocation,i5,i5Sequence,i7,i7Sequence,identifier,Sequence,Plate ID,Well location,...,age_months,sex,enrolment_date,enrolment_season,hiv_exposure,hiv_status,household_size,o_scorad,FWD_filepath,REV_filepath
#sample-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ca009ST_L,1,A1,SA501,ATCGTACG,SA701,CGAGAGTT,SA701SA501,CGAGAGTT-ATCGTACG,1.010000e+21,A1,...,24,male,4/16/2015,Autumn,Unexposed,negative,4.0,40.0,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...
900221,1,B1,SA502,ACTATCTG,SA701,CGAGAGTT,SA701SA502,CGAGAGTT-ACTATCTG,1.010000e+21,B1,...,9,female,8/11/2015,Winter,Unexposed,negative,7.0,34.0,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...
Ca010EB_L,1,C1,SA503,TAGCGAGT,SA701,CGAGAGTT,SA701SA503,CGAGAGTT-TAGCGAGT,1.010000e+21,C1,...,24,female,11/20/2014,Spring,Unexposed,negative,7.0,21.0,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...
900460,1,D1,SA504,CTGCGTGT,SA701,CGAGAGTT,SA701SA504,CGAGAGTT-CTGCGTGT,1.010000e+21,D1,...,18,female,9/23/2015,Spring,Unexposed,,4.0,40.0,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...
900051,1,E1,SA505,TCATCGAG,SA701,CGAGAGTT,SA701SA505,CGAGAGTT-TCATCGAG,1.010000e+21,E1,...,31,male,4/21/2015,Autumn,Unexposed,negative,7.0,41.0,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Ca006ON_L_2,6,H1,SA508,GACACCGT,SB701,CTCGACTT,SB701SA508,CTCGACTT-GACACCGT,1.010000e+21,H1,...,35,female,3/25/2015,Autumn,Unexposed,negative,3.0,34.0,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...
Ca006ON_NL,6,F2,SA506,CGTGAGTG,SB702,CGAAGTAT,SB702SA506,CGAAGTAT-CGTGAGTG,1.010000e+21,F2,...,35,female,3/25/2015,Autumn,Unexposed,negative,3.0,34.0,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...
Ca006ON_NL_2,6,H2,SA508,GACACCGT,SB702,CGAAGTAT,SB702SA508,CGAAGTAT-GACACCGT,1.010000e+21,H2,...,35,female,3/25/2015,Autumn,Unexposed,negative,3.0,34.0,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...
Ca006ON_PN,6,F3,SA506,CGTGAGTG,SB703,TAGCAGCT,SB703SA506,TAGCAGCT-CGTGAGTG,1.010000e+21,F3,...,35,female,3/25/2015,Autumn,Unexposed,negative,3.0,34.0,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...


In [211]:
def _draw_region_panel(ax, df_region, tick_labels, title, taxa_color_map, xlabel):
    """Draw one stacked relative-abundance bar panel (one bar per group) on ax."""
    # Taxa without an entry in the color map fall back to light blue
    bar_colors = [taxa_color_map.get(taxon, '#ADD8E6') for taxon in df_region.index]
    df_region.T.plot(kind='bar', stacked=True, ax=ax, width=0.6,
                     color=bar_colors, legend=False)
    ax.set_title(title, fontsize=18)
    ax.set_ylabel('Relative Abundance', fontsize=14)
    ax.set_xlim(-0.4, 2.5)
    ax.set_xticks(range(len(tick_labels)))
    ax.set_xticklabels(tick_labels, rotation=0, fontsize=14)
    # Replace the x label that pandas derives from the index name
    ax.set_xlabel(xlabel)


def plot_relative_abundance_two_panels(df, metadata, group_column, output_dir, key, taxa_color_map, taxa_level):
    """
    Create a two-panel stacked bar plot of mean relative abundances by region
    (Cape Town on top, Umtata below) and save it as 'Fig_3A.png' in output_dir.

    Lesional samples with SCORAD >= 50 are removed before averaging. Lesional
    samples whose o_scorad is missing or non-numeric are kept, because NaN
    never satisfies the >= 50 test. Each bar is labeled with its sample count
    (n=) on the x-axis.

    Parameters:
    - df: taxa x sample DataFrame of relative abundances (taxa as rows).
    - metadata: DataFrame indexed by sample ID with 'area', 'case_type' and
      'o_scorad' columns plus group_column. It is not modified.
    - group_column: metadata column combined with 'area' to form the groups.
    - output_dir: directory for the figure; created if it does not exist.
    - key: unused by this function; kept so existing calls keep working.
    - taxa_color_map: dictionary mapping taxon name to color.
    - taxa_level: text used as the legend title.

    Raises:
    - ValueError: if 'area' or group_column has missing values after filtering.
    - KeyError: if any of the six expected 'area | group' combinations has no
      samples.
    """
    # Work on a private copy so the caller's metadata is never altered
    metadata = metadata.copy()

    # Convert o_scorad to float, replacing any non-numeric values with NaN
    metadata['o_scorad'] = pd.to_numeric(metadata['o_scorad'], errors='coerce')

    # Drop lesional samples with o_scorad >= 50; keep everything else
    severe_lesional = (
        (metadata['case_type'] == 'case-lesional skin') &
        (metadata['o_scorad'] >= 50)
    )
    metadata = metadata[~severe_lesional]
    df = df.loc[:, df.columns.intersection(metadata.index)]

    # Make sure sample order matches between df and metadata
    metadata = metadata.loc[df.columns].copy()

    # Ensure no missing values in grouping columns
    if metadata[[group_column, 'area']].isnull().any().any():
        raise ValueError("Missing values in 'area' or group column in metadata.")

    # Create composite label for grouping
    metadata['site_group'] = metadata['area'] + ' | ' + metadata[group_column]

    # Count number of samples per group for labeling
    group_counts = metadata['site_group'].value_counts().to_dict()

    # Group and average by site_group
    df_grouped = df.T.groupby(metadata['site_group']).mean().T

    # Define plotting order
    ct_order = ['Cape Town | control-nonlesional skin', 'Cape Town | case-nonlesional skin', 'Cape Town | case-lesional skin']
    um_order = ['Umtata | control-nonlesional skin', 'Umtata | case-nonlesional skin', 'Umtata | case-lesional skin']

    missing = [col for col in ct_order + um_order if col not in df_grouped.columns]
    if missing:
        raise KeyError(f"Missing expected site_group columns in df_grouped: {missing}")

    # Create output directory
    os.makedirs(output_dir, exist_ok=True)
    output_png = os.path.join(output_dir, 'Fig_3A.png')

    # Create figure
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(5.5, 8.5), sharex=False)

    # Define x-axis labels with n counts
    def get_labels(order_list):
        short_labels = ['H', 'AD', 'AD']  # includes AD non-lesional and AD lesional
        return [f'{short}\n(n={group_counts.get(group, 0)})' for short, group in zip(short_labels, order_list)]

    # One panel per region, drawn by the same helper
    _draw_region_panel(ax1, df_grouped[ct_order], get_labels(ct_order),
                       'Cape Town', taxa_color_map, xlabel=' ')
    _draw_region_panel(ax2, df_grouped[um_order], get_labels(um_order),
                       'Umtata', taxa_color_map, xlabel='')

    # Shared legend, built from the top panel (both panels plot the same taxa)
    handles, labels = ax1.get_legend_handles_labels()
    fig.legend(
        handles, labels,
        loc='center left',
        bbox_to_anchor=(0.65, 0.5),
        fontsize=12,
        title=taxa_level,
        title_fontsize=14,
        frameon=True
    )

    fig.suptitle('Relative Abundance skin  (SCORAD < 50)', fontsize=20, y=1.01)
    # Leave the right part of the figure free for the legend
    fig.subplots_adjust(left=0.15, right=0.65, top=0.93, bottom=0.08)

    fig.savefig(output_png, dpi=600, bbox_inches='tight')
    plt.close(fig)

    print(f"Figure saved to: {output_png}")

In [212]:
# Paths to input files
biom_path = '../Data/Tables/Relative_Abundance_Tables/df_16S_filtered_feature_table_rare_Genus_relative_abundance.biom'
# biom_path = '../Data/Tables/Absolute_Abundance_Tables/209766_filtered_by_prevalence_5pct_rare_Genus-ASV-non-collapse.biom'

# Load skin samples as relative abundances, then keep the top 15 taxa + Others
df, metadata = load_biom_table(biom_path, metadata_path)
df = collapse_top_15(df)

# If g__Unknown made it into the top taxa, fold it into the Others row
# and remove its own row
if 'g__Unknown' in df.index:
    unknown_row = df.loc['g__Unknown']
    df.loc['Others'] = df.loc['Others'] + unknown_row
    df = df.drop('g__Unknown')


df

Unnamed: 0,900360,900262,900446,900263,900555,900116,900460,900627,900358,900137,...,900102,900253,900494,900285,900337,900426,900243,900570,900572,900323
g__Staphylococcus,0.069174,0.774657,0.089333,0.103,0.119208,0.620667,0.084,0.000333,0.054333,0.968333,...,0.925592,0.022,0.0,0.155831,0.019667,0.094365,0.639333,0.236904,0.039333,0.021021
g__Streptococcus,0.128946,0.066533,0.107333,0.153,0.152451,0.067,0.423667,0.001333,0.003,0.004,...,0.00634,0.034,0.521667,0.033512,0.459667,0.037679,0.058333,0.120454,0.486667,0.005339
g__Corynebacterium,0.06951,0.018723,0.108333,0.132333,0.201478,0.034,0.003333,0.092333,0.010667,0.005,...,0.004338,0.001,0.031333,0.391086,0.017333,0.001,0.045667,0.012346,0.011333,0.001335
g__Acinetobacter,0.046004,0.014376,0.013,0.05,0.048355,0.017667,0.010667,0.0,0.643333,0.001667,...,0.01001,0.006333,0.008333,0.036528,0.003667,0.030343,0.010667,0.018352,0.008333,0.530864
g__Prevotella,0.032908,0.011033,0.024333,0.071667,0.044661,0.027333,0.068667,0.178333,0.019,0.000667,...,0.001668,0.477667,0.019,0.007038,0.039333,0.396465,0.018,0.215215,0.012667,0.003337
g__Micrococcus,0.145064,0.009027,0.020667,0.022,0.114171,0.0,0.008667,0.0,0.006333,0.0,...,0.004671,0.168,0.026,0.071381,0.002333,0.012671,0.021,0.0,0.004333,0.0
g__Haemophilus_D_734546,0.051377,0.001337,0.0,0.001667,0.0,0.064333,0.006667,0.01,0.0,0.0,...,0.000667,0.001,0.0,0.007373,0.012,0.001667,0.000333,0.000667,0.007,0.0
g__Veillonella_A,0.003694,0.019057,0.02,0.013667,0.065144,0.015333,0.060333,0.0,0.0,0.0,...,0.000667,0.008333,0.048667,0.0,0.273,0.000667,0.021667,0.028695,0.018667,0.0
g__Psychrobacter,0.034923,0.001672,0.0,0.0,0.005037,0.0,0.0,0.0,0.026667,0.0,...,0.001001,0.0,0.0,0.030161,0.0,0.006002,0.002333,0.002002,0.000333,0.11011
g__Neisseria_563205,0.038281,0.001672,0.002,0.081,0.010745,0.021333,0.011667,0.019,0.0,0.0,...,0.000334,0.006,0.128667,0.016421,0.001667,0.001,0.005333,0.01001,0.068667,0.001668


In [213]:
metadata_path = '../Data/Metadata/updated_clean_ant_skin_metadata.tab'

# Full metadata table, indexed by sample ID
md = pd.read_csv(metadata_path, sep='\t').set_index('#sample-id')

# Make o_scorad numeric; entries that cannot be parsed become NaN
md['o_scorad'] = pd.to_numeric(md['o_scorad'], errors='coerce')

# Copy o_scorad from md into the filtered metadata (pandas aligns on sample ID)
metadata['o_scorad'] = md['o_scorad']

# Samples without a SCORAD value get 0
metadata['o_scorad'] = metadata['o_scorad'].fillna(0)

metadata

Unnamed: 0,area,case_type,o_scorad
900221,Umtata,case-lesional skin,34.0
900570,Cape Town,case-nonlesional skin,36.0
900091,Cape Town,case-nonlesional skin,53.0
900245,Umtata,control-nonlesional skin,0.0
900581,Cape Town,case-lesional skin,67.0
...,...,...,...
900063,Cape Town,case-lesional skin,30.0
900263,Umtata,control-nonlesional skin,0.0
900081,Cape Town,case-lesional skin,44.0
900501,Umtata,case-nonlesional skin,54.0


In [214]:
# NOTE(review): this path is not used again in the visible part of the notebook;
# the plotted metadata comes from the cells above. Confirm whether it is needed.
metadata_path = '../Data/Metadata/updated_clean_ant_skin_metadata_microbiome_type.tab'

# Output directory for plots
output_dir = '../Figures/Main/'

# Dataset key and taxonomic level
key = '16S_V4'
taxa_level = 'Genus'

# Fixed color per taxon. Every taxon needs its own color; otherwise two taxa
# cannot be told apart in the stacked bars and the legend.
taxa_colors = {
    'g__Staphylococcus': '#000000',         # black
    'g__Streptococcus': '#ff0000',          # bright red
    'g__Corynebacterium': '#a5d8ff',        # light sky blue
    'g__Acinetobacter': '#ffe59a',          # soft pastel yellow
    'g__Prevotella': '#a5c9a1',             # muted greenish mint
    'g__Micrococcus': '#0000FF',            # blue
    'g__Haemophilus_D_734546': '#E6E6FA',   # pale lavender
    'g__Veillonella_A': '#FA5F55',          # sunset orange
    'g__Psychrobacter': '#accbe1',          # cool steel blue
    'g__Neisseria_563205': '#b0d9b1',       # light sage green
    'g__SIO2C1': '#f7c59f',                 # warm peach
    'g__Cutibacterium': '#FADADD',          # pale pink
    'g__Chryseobacterium_7966': '#dfc5a5',  # muted tan
    'g__Dolosigranulum': '#d4a5a5',         # muted rose
    'g__Gemella': '#2a9d8f',                # teal (was identical to g__Prevotella)
    'g__Granulicatella': '#CCCCFF',         # periwinkle blue
    'Others': '#d9d9d9'                     # neutral gray
}


# Run the plotting function
plot_relative_abundance_two_panels(
    df=df,
    metadata=metadata,
    group_column='case_type',
    output_dir=output_dir,
    key=key,
    taxa_color_map=taxa_colors,
    taxa_level=taxa_level
)

Figure saved to: ../Figures/Main/Fig_4A.png
