# Relative Abundance and Alpha Diversity Plots

Date created: 11/15/2024

This notebook plots the following:

- 16S V4 relative abundance plots at the genus level
- 16S V4 Shannon alpha diversity plots at the genus level
- 16S V4 Faith PD alpha diversity plots at the genus level

In [215]:
# Import Python packages
import pandas as pd
import numpy as np
import biom
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import cycle
import os
from matplotlib.colors import ListedColormap
from matplotlib.colors import to_rgba

In [216]:
# Taxonomic rank used throughout the notebook: it selects the predefined colour
# palette and is interpolated into legend titles and output file names.
taxa_level = 'Genus'

In [217]:
# Load the tab-separated sample metadata and display the number of samples in
# each 'case_type' group (the grouping variable used by the plots below).
metadata_path = '../Data/Metadata/updated_clean_ant_skin_metadata.tab'
metadata = pd.read_csv(metadata_path, sep='\t')
metadata['case_type'].value_counts()

case_type
case-nonlesional skin       111
case-anterior nares         108
case-lesional skin          107
control-anterior nares       89
control-nonlesional skin     87
Name: count, dtype: int64

In [218]:
# Map each dataset key to its BIOM table. The key is passed to the plotting code,
# where it selects the plot title and becomes part of the output file names.
# NOTE(review): the file name hard-codes "Genus" independently of taxa_level —
# confirm the two are kept in sync when another rank is analysed.
biom_paths = {
    '16S_V4': '../Data/Tables/Relative_Abundance_Tables/df_16S_filtered_feature_table_rare_Genus_relative_abundance.biom'
}

In [219]:
# Predefined colours for specific genera so they look the same in every figure.
# Keys carry a leading space, presumably to match the observation IDs in the
# BIOM table exactly — verify against the table if a colour is not picked up.
if taxa_level == "Genus":
    taxa_colors = {
        ' g__Cutibacterium': '#ffa505',     # Bright orange
        ' g__Staphylococcus': '#92f0f0',    # Light cyan
        ' g__Streptococcus': '#FF0000',     # Red
        ' g__Corynebacterium': '#ffe59a',   # Pastel yellow
        ' g__Lawsonella': '#70a8dc',        # Light blue
        ' g__Veillonella': '#c5bce0',       # Pastel purple
        ' g__Micrococcus': '#f4cccd',       # Pastel pink
        ' g__Alloprevotella': '#bcbcbc',    # Gray
        ' g__Lactobacillus': '#daead3',     # Pale mint green
        ' g__Neisseria': '#f6475f',         # Reddish pink
        'Others': '#ededed'                 # Very light gray
    }
else:
    # No predefined palette for other ranks: keep the name defined so that
    # get_taxa_colors falls back to the generic colours instead of raising
    # a NameError.
    taxa_colors = {}

In [220]:
# Fallback colours for taxa that have no predefined colour.
# "tab20" provides 20 distinct colours; seaborn's "deep" palette contains only
# 10, so requesting 20 from it yields every colour twice.
unique_colors = sns.color_palette("tab20", n_colors=20).as_hex()
# Endless iterator: colours repeat only after all 20 have been handed out
unique_color_iter = cycle(unique_colors)

In [221]:
# Function to load a BIOM table as a DataFrame with rows sorted by row sum
def load_biom_table(biom_path, metadata_path=None):
    """Load a BIOM table into a DataFrame ordered by total abundance.

    Parameters:
    - biom_path: Path to the BIOM file.
    - metadata_path: Not used by this function. It is still accepted
      (and optional) so that existing calls passing it keep working.

    Returns:
    - A DataFrame with observation IDs as the index and sample IDs as the
      columns, rows sorted by descending row sum. Values are returned exactly
      as stored in the file: no samples are removed and nothing is collapsed
      or normalised here. The observation ID ' g__' is renamed ' g__Unknown'.
    """
    # Load BIOM table and convert the sparse matrix to a dense DataFrame
    table = biom.load_table(biom_path)
    df = pd.DataFrame(table.matrix_data.toarray(),
                      index=table.ids(axis='observation'),
                      columns=table.ids(axis='sample'))

    # Sort rows by row sum in descending order; the helper column exists only
    # inside this chained expression and is dropped again before returning
    df = (
        df.assign(row_sum=df.sum(axis=1))
          .sort_values(by='row_sum', ascending=False)
          .drop(columns=['row_sum'])
    )

    # Give the empty genus label a readable name (exact match only)
    df = df.rename(index={' g__': ' g__Unknown'})

    return df


In [222]:
# Keep the 15 most abundant taxa and pool every other taxon into "Others"
def collapse_top_15(df):
    """Reduce a taxa-by-sample table to its 15 largest rows plus 'Others'.

    Rows are ranked by their sum across all columns. The result holds the
    15 highest-ranked rows in descending order, followed by a row labelled
    'Others' containing the column-wise sum of all remaining rows.
    """
    row_totals = df.sum(axis=1)
    leaders = row_totals.nlargest(15).index
    is_leader = df.index.isin(leaders)

    collapsed = df.loc[leaders]
    collapsed.loc['Others'] = df.loc[~is_leader].sum()
    return collapsed

In [223]:
# Function to look up or assign a colour for every taxon
def get_taxa_colors(families, global_taxa_color_map):
    """Make sure every taxon in ``families`` has a colour in the map.

    Taxa already present in ``global_taxa_color_map`` keep their colour.
    A new taxon takes its colour from the module-level ``taxa_colors`` when
    it is listed there, otherwise the next colour from ``unique_color_iter``.
    The map is updated in place and also returned.
    """
    for taxon in families:
        if taxon in global_taxa_color_map:
            continue  # colour was assigned earlier; keep it stable
        # The fallback iterator is only advanced when no predefined colour exists
        global_taxa_color_map[taxon] = (
            taxa_colors[taxon] if taxon in taxa_colors else next(unique_color_iter)
        )
    return global_taxa_color_map

## Relative abundance plots

In [224]:
def plot_relative_abundance(df, metadata, group_column, output_dir, key, taxa_color_map):
    """Plot the mean relative abundance per group as a stacked bar chart.

    Parameters:
    - df: Taxa (rows) by samples (columns) table.
    - metadata: Sample metadata indexed by sample ID, so that
      ``metadata[group_column]`` aligns with the columns of ``df``.
    - group_column: Metadata column holding the group of every sample.
    - output_dir: Directory the figure is written to (created if missing).
    - key: Dataset key; selects the title and prefixes the file names.
    - taxa_color_map: Mapping taxon -> colour; taxa missing from it are
      drawn in '#ADD8E6'.

    The figure is shown and saved as PNG (600 dpi) and SVG. Returns None.

    Raises:
    - ValueError: if none of the expected groups occurs in the data.
    """
    # Display label for every group, listed in the order the bars are drawn
    group_labels = {
        'control-nonlesional skin': 'Healthy skin',
        'case-nonlesional skin': 'AD NL skin',
        'case-lesional skin': 'AD L skin',
        'control-anterior nares': 'Healthy anterior nares',
        'case-anterior nares': 'AD anterior nares',
    }
    plot_titles = {'16S_V4': '16S rRNA (V4) Relative Abundance'}

    # Average the samples of each group. Grouping the transposed table gives
    # the same result as DataFrame.groupby(axis=1), which pandas has deprecated.
    df_grouped = df.T.groupby(metadata[group_column]).mean().T

    # Keep the expected groups that are actually present, in the desired order
    present_groups = [group for group in group_labels if group in df_grouped.columns]
    if not present_groups:
        raise ValueError(
            f"None of the expected groups {list(group_labels)} were found in '{group_column}'"
        )
    df_grouped = df_grouped[present_groups]

    # Create output file paths
    os.makedirs(output_dir, exist_ok=True)
    output_png_file = os.path.join(output_dir, f'{key}_{taxa_level}_relative_abundance_plot.png')
    output_svg_file = os.path.join(output_dir, f'{key}_{taxa_level}_relative_abundance_plot.svg')

    # Fall back to a generic title for keys without a dedicated one
    plot_title = plot_titles.get(key, f'{key} Relative Abundance')

    # Plot: one stacked bar per group, one segment per taxon
    ax = df_grouped.T.plot(kind='bar', stacked=True, figsize=(13, 10),
                           width=0.8,  # Bars closer together
                           color=[taxa_color_map.get(taxon, '#ADD8E6') for taxon in df_grouped.index])
    fig = ax.figure

    ax.set_ylabel('Relative Abundance', fontsize=16)
    ax.set_xlabel(' ')
    ax.set_title(plot_title, fontsize=18)

    # Label every bar from its group name so labels cannot get out of step
    # with the bar order
    ax.set_xticks(range(len(present_groups)))
    ax.set_xticklabels([group_labels[group] for group in present_groups],
                       rotation=45, ha='center', fontsize=16)

    ax.legend(loc='center left', bbox_to_anchor=(1.0, 0.5), fontsize=14,
              title=taxa_level, title_fontsize=16)
    fig.tight_layout()

    fig.savefig(output_png_file, format='png', dpi=600)  # Save as png
    fig.savefig(output_svg_file, format='svg')  # Save as svg
    plt.show()

    plt.close(fig)


In [225]:
# Preview the 16S V4 table exactly as load_biom_table returns it
load_biom_table(biom_paths['16S_V4'], metadata_path)

Unnamed: 0,900221,900570,900092,900466,9003932,900091,900556,900301,900245,900581,...,900547,900263,Ca008HNL,900081,900501,900279,900304,900580,900484,9003972
g__Streptococcus,0.000667,0.120454,0.973333,0.971667,0.700333,0.017391,0.632667,0.001667,0.006333,0.049000,...,0.209333,0.153000,0.000333,0.242000,0.539513,0.021667,0.904667,0.934667,0.703000,0.225343
g__Staphylococcus,0.058333,0.236904,0.000000,0.001000,0.000000,0.081271,0.001667,0.980667,0.017667,0.517333,...,0.004000,0.103000,0.977667,0.072333,0.058686,0.251667,0.006667,0.043333,0.000000,0.127048
g__Haemophilus_D_734546,0.000000,0.000667,0.000000,0.000000,0.283333,0.000000,0.252000,0.000000,0.000000,0.008000,...,0.750000,0.001667,0.000000,0.154333,0.014672,0.015667,0.022667,0.000333,0.132333,0.015714
g__Unknown,0.002333,0.175843,0.006667,0.000333,0.000000,0.411371,0.022333,0.003000,0.280667,0.090667,...,0.011667,0.106667,0.003667,0.147000,0.093698,0.093000,0.008000,0.000333,0.007333,0.041792
g__Corynebacterium,0.002333,0.012346,0.015333,0.000333,0.006000,0.090301,0.036333,0.012333,0.170000,0.010000,...,0.003000,0.132333,0.016333,0.027667,0.005335,0.182000,0.023333,0.014000,0.039000,0.015045
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
g__Pseudomonas_K,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
g__Herbaspirillum,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
g__Marinomonas,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
g__Tetragenococcus,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [226]:
# Build the relative-abundance figure for every table listed in biom_paths
global_taxa_color_map = {}  # Colours already assigned to taxa, shared across all datasets
for key, biom_path in biom_paths.items():
    # Load the BIOM table (rows come back sorted by total abundance)
    df = load_biom_table(biom_path, metadata_path)

    # Keep only the samples (columns) that also appear in the metadata
    df = df[df.columns.intersection(metadata['#sample-id'])]

    df_top_15 = collapse_top_15(df)  # 15 most abundant taxa plus an 'Others' row
    
    # Output directory for the figures (the same directory for every key)
    output_dir = '../Plots/Analysis_figures/Relative_Abundance/'
    os.makedirs(output_dir, exist_ok=True)  # Ensure the output directory exists
    
    # Metadata rows of the retained samples, indexed by sample ID and ordered like the table columns
    metadata_subset = metadata.set_index('#sample-id').loc[df.columns]  # Raises KeyError if a sample ID is missing from the metadata
    
    # Reuse colours assigned earlier and add colours for taxa seen for the first time
    global_taxa_color_map = get_taxa_colors(df_top_15.index, global_taxa_color_map)
    
    # Plot the per-group means and save the figure in output_dir
    plot_relative_abundance(df_top_15, metadata_subset, 'case_type', output_dir, key, global_taxa_color_map)

  df_grouped = df.groupby(metadata[group_column], axis=1).mean()
  plt.show()


## Shannon alpha diversity plots

In [227]:
# Additional Python packages for alpha diversity analyses
from skbio.diversity import alpha_diversity
from scipy.stats import mannwhitneyu

In [228]:
def darken_color(color, amount=0.3):
    """
    Return a darker version of a colour.

    Parameters:
    - color: The base colour, in any form accepted by matplotlib's to_rgba
      (e.g. a hex string or a colour name).
    - amount: Fraction by which each RGB channel is reduced (default: 0.3).

    Returns:
    - An (r, g, b, a) tuple: the RGB channels scaled by (1 - amount), with
      the alpha channel left unchanged.
    """
    red, green, blue, alpha = to_rgba(color)
    return (red * (1 - amount), green * (1 - amount), blue * (1 - amount), alpha)

In [231]:
def _significance_label(p):
    """Format a p-value for the plot, prefixed with stars when it is below 0.05."""
    if p < 0.001:
        stars = '***  '
    elif p < 0.01:
        stars = '**  '
    elif p < 0.05:
        stars = '*  '
    else:
        # Also reached for NaN, which must not be marked as significant
        stars = ''
    return stars + f"{p:.2e}"


def calculate_shannon_alpha_diversity_and_plot(biom_path, metadata_path, group_col, title_suffix,
                                               key='16S_V4',
                                               output_dir='../Plots/Analysis_figures/Diversity/'):
    """Compute the Shannon index per sample and plot it per group.

    Parameters:
    - biom_path: Path to the BIOM table (loaded with load_biom_table).
    - metadata_path: Path to the tab-separated metadata; must contain a
      '#sample-id' column and the column named by ``group_col``.
    - group_col: Metadata column holding the group of every sample.
    - title_suffix: Text placed in parentheses in the plot title.
    - key: Dataset key used as prefix of the output file names
      (default: '16S_V4').
    - output_dir: Directory the figure is written to; created if missing.

    Draws a box plot with overlaid points for the five skin/nares groups,
    annotates every pair of groups with its two-sided Mann-Whitney U p-value,
    saves the figure as PNG (600 dpi) and SVG, and prints the p-values.
    The p-values are not corrected for multiple comparisons. Returns None.
    """
    # Load metadata indexed by sample ID
    metadata = pd.read_csv(metadata_path, sep='\t').set_index('#sample-id')

    # Load the table and transpose it so rows are sample IDs
    feature_table = load_biom_table(biom_path, metadata_path).transpose()

    # Calculate Shannon diversity for every sample in the table
    shannon_values = alpha_diversity('shannon', feature_table.values, ids=feature_table.index)

    # Keep the samples present in both the metadata and the table
    common_samples = metadata.index.intersection(feature_table.index)
    metadata = metadata.loc[common_samples].copy()
    metadata['Shannon'] = shannon_values.loc[common_samples]

    # Group order on the x-axis: skin groups first, then anterior nares
    desired_order = ['control-nonlesional skin', 'case-nonlesional skin', 'case-lesional skin',
                     'control-anterior nares', 'case-anterior nares']

    # Box colour for every group
    palette = {
        'control-nonlesional skin': '#3333B3',   # Dark blue for healthy skin
        'case-nonlesional skin': '#5cbccb',      # Blue for AD non-lesional skin
        'case-lesional skin': '#f16c52',         # Red for AD lesional skin
        'control-anterior nares': '#008000',     # Green for healthy anterior nares
        'case-anterior nares': '#FFC0CB'         # Pink for AD anterior nares
    }

    # Display label for every group; labels are looked up by group name so
    # they cannot get out of step with the plotting order
    tick_labels = {
        'control-nonlesional skin': 'Healthy skin',
        'case-nonlesional skin': 'AD NL skin',
        'case-lesional skin': 'AD L skin',
        'control-anterior nares': 'Healthy anterior nares',
        'case-anterior nares': 'AD anterior nares',
    }

    fig, ax = plt.subplots(figsize=(10, 10))

    # Box plot of the Shannon index per group
    sns.boxplot(x=group_col, y='Shannon', data=metadata, palette=palette,
                order=desired_order, ax=ax)

    # Individual samples on top of the boxes, in a darker shade of the box colour.
    # hue is assigned explicitly because seaborn deprecated passing `palette`
    # without `hue`; legend=False keeps the plot free of a redundant legend.
    darker_palette = {group: darken_color(color) for group, color in palette.items()}
    sns.stripplot(x=group_col, y='Shannon', data=metadata,
                  hue=group_col, hue_order=desired_order, palette=darker_palette,
                  jitter=True, dodge=False, legend=False, ax=ax, linewidth=0.6,
                  order=desired_order)

    # Title and axis labels
    ax.set_title(f'16S rRNA ({title_suffix}) Alpha Diversity', fontsize=18)
    ax.set_xlabel(' ')
    ax.set_ylabel('Shannon Index', fontsize=16)
    ax.set_xticks(range(len(desired_order)))
    ax.set_xticklabels([tick_labels.get(group, group) for group in desired_order],
                       rotation=45, ha='center', fontsize=16)

    # Pairwise significance testing using the Mann-Whitney U test
    p_values = {}

    # Height of the first annotation bracket and the spacing between brackets
    y_max = metadata['Shannon'].max() + 0.1
    height_step = 0.15

    for i, group1 in enumerate(desired_order):
        for j, group2 in enumerate(desired_order[i + 1:], start=i + 1):
            group1_values = metadata.loc[metadata[group_col] == group1, 'Shannon']
            group2_values = metadata.loc[metadata[group_col] == group2, 'Shannon']

            # A group without samples cannot be tested; leave the pair out
            if group1_values.empty or group2_values.empty:
                continue

            stat, p = mannwhitneyu(group1_values, group2_values, alternative='two-sided')
            p_values[f'{group1} vs {group2}'] = p

            # Bracket between the two boxes (x positions follow desired_order)
            y = y_max + height_step
            ax.plot([i, i, j, j], [y, y + 0.01, y + 0.01, y], lw=1, color='black')
            ax.text((i + j) * 0.5, y + 0.02, _significance_label(p),
                    ha='center', va='bottom', fontsize=12)

            # Stack the next bracket above this one
            y_max += height_step + 0.05

    # Save the finished figure once, after all annotations have been drawn
    os.makedirs(output_dir, exist_ok=True)
    output_base = os.path.join(output_dir, f'{key}_{taxa_level}_Shannon_alpha-diversity')
    fig.savefig(f'{output_base}.png', dpi=600, bbox_inches='tight', pad_inches=0.1)  # Save as png
    fig.savefig(f'{output_base}.svg', bbox_inches='tight', pad_inches=0.1)  # Save as svg

    # Print pairwise p-values in scientific notation
    print("Pairwise Mann-Whitney U test p-values:")
    for comparison, p_value in p_values.items():
        print(f"{comparison}: p-value = {p_value:.2e}")

    plt.show()
    plt.close(fig)


In [232]:
# Draw the Shannon alpha-diversity plot for every table listed in biom_paths
for key, biom_path in biom_paths.items():
    calculate_shannon_alpha_diversity_and_plot(
        biom_path=biom_path,
        metadata_path=metadata_path,
        group_col='case_type',
        title_suffix='V4' if key == '16S_V4' else ''
    )


  sns.stripplot(x=group_col, y='Shannon', data=metadata, palette=darker_palette, jitter=True, dodge=False, ax=ax, linewidth=0.6, order=desired_order)
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  data_subset = grouped_data.get_group(pd_key)


Pairwise Mann-Whitney U test p-values:
control-nonlesional skin vs case-nonlesional skin: p-value = 9.49e-01
control-nonlesional skin vs case-lesional skin: p-value = 2.84e-01
control-nonlesional skin vs control-anterior nares: p-value = 9.83e-09
control-nonlesional skin vs case-anterior nares: p-value = 7.21e-08
case-nonlesional skin vs case-lesional skin: p-value = 3.86e-01
case-nonlesional skin vs control-anterior nares: p-value = 2.11e-08
case-nonlesional skin vs case-anterior nares: p-value = 2.60e-07
case-lesional skin vs control-anterior nares: p-value = 9.47e-07
case-lesional skin vs case-anterior nares: p-value = 3.30e-06
control-anterior nares vs case-anterior nares: p-value = 5.16e-01
