# Alpha Diversity (Faith PD)

In [161]:
# Import Python packages
import pandas as pd
import numpy as np
import biom
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import cycle
import os
from matplotlib.colors import ListedColormap
from matplotlib.colors import to_rgba
from scipy.stats import mannwhitneyu
from skbio.diversity import alpha_diversity
from matplotlib.patches import Ellipse
import matplotlib.transforms as transforms
from matplotlib.patches import Circle
from matplotlib.colors import to_hex
import statsmodels.api as sm
from biom import load_table

In [162]:
# Read in table at collapsed genera level
biom_path = f'../Data/Tables/Absolute_Abundance_Tables/209766_filtered_by_prevalence_1pct_rare_filtered_Genus-ASV-non-collapse.biom'

biom_tbl = load_table(biom_path)
df = pd.DataFrame(biom_tbl.to_dataframe().T)
df

Unnamed: 0,g__Streptococcus_ASV-1,g__Streptococcus_ASV-2,g__Corynebacterium_ASV-1,g__Corynebacterium_ASV-3,g___ASV-3,g__Cutibacterium_ASV-1,g___ASV-16,g___ASV-25,g___ASV-27,g__Cutibacterium_ASV-2,...,g__Copromonas_ASV-2,g__Microvirga_ASV-1,g__Leptotrichia_A_993758_ASV-15,g___ASV-126,g__Leptotrichia_A_993758_ASV-16,g__Capnocytophaga_820688_ASV-7,g__UBA6175_ASV-2,g__Comamonas_F_589250_ASV-4,g__Bosea_ASV-2,g___ASV-119
900344,156.0,95.0,23.0,17.0,1.0,2.0,1.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
900459,21.0,30.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
900221,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
900570,18.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
900092,174.0,104.0,10.0,1.0,0,0,0,0,2.0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
900294,6.0,2.0,0,0,0,41.0,0,0,0,32.0,...,0,0,0,0,0,0,0,0,0,0
9003972,38.0,21.0,0,0,1.0,18.0,0,0,0,12.0,...,0,0,0,0,0,0,0,0,0,0
900097,3.0,0,0,0,0,3.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
900498,4.0,6.0,0,0,0,13.0,0,6.0,0,13.0,...,0,0,0,0,0,0,0,0,0,0


In [163]:
# Load the metadata
metadata_path = '../Data/Metadata/updated_clean_ant_skin_metadata.tab'
metadata = pd.read_csv(metadata_path, sep='\t')
metadata['o_scorad_adj'] = metadata['o_scorad'].fillna(0)

metadata

Unnamed: 0,#sample-id,PlateNumber,PlateLocation,i5,i5Sequence,i7,i7Sequence,identifier,Sequence,Plate ID,...,sex,enrolment_date,enrolment_season,hiv_exposure,hiv_status,household_size,o_scorad,FWD_filepath,REV_filepath,o_scorad_adj
0,Ca009ST_L,1,A1,SA501,ATCGTACG,SA701,CGAGAGTT,SA701SA501,CGAGAGTT-ATCGTACG,1.010000e+21,...,male,4/16/2015,Autumn,Unexposed,negative,4.0,40,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,40
1,900221,1,B1,SA502,ACTATCTG,SA701,CGAGAGTT,SA701SA502,CGAGAGTT-ACTATCTG,1.010000e+21,...,female,8/11/2015,Winter,Unexposed,negative,7.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,34
2,Ca010EB_L,1,C1,SA503,TAGCGAGT,SA701,CGAGAGTT,SA701SA503,CGAGAGTT-TAGCGAGT,1.010000e+21,...,female,11/20/2014,Spring,Unexposed,negative,7.0,21,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,21
3,900460,1,D1,SA504,CTGCGTGT,SA701,CGAGAGTT,SA701SA504,CGAGAGTT-CTGCGTGT,1.010000e+21,...,female,9/23/2015,Spring,Unexposed,,4.0,40,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,40
4,900051,1,E1,SA505,TCATCGAG,SA701,CGAGAGTT,SA701SA505,CGAGAGTT-TCATCGAG,1.010000e+21,...,male,4/21/2015,Autumn,Unexposed,negative,7.0,41,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,41
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
497,Ca006ON_L_2,6,H1,SA508,GACACCGT,SB701,CTCGACTT,SB701SA508,CTCGACTT-GACACCGT,1.010000e+21,...,female,3/25/2015,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,34
498,Ca006ON_NL,6,F2,SA506,CGTGAGTG,SB702,CGAAGTAT,SB702SA506,CGAAGTAT-CGTGAGTG,1.010000e+21,...,female,3/25/2015,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,34
499,Ca006ON_NL_2,6,H2,SA508,GACACCGT,SB702,CGAAGTAT,SB702SA508,CGAAGTAT-GACACCGT,1.010000e+21,...,female,3/25/2015,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,34
500,Ca006ON_PN,6,F3,SA506,CGTGAGTG,SB703,TAGCAGCT,SB703SA506,TAGCAGCT-CGTGAGTG,1.010000e+21,...,female,3/25/2015,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,34


In [164]:
metadata['individual_case_location'] = metadata['case_type'] + ' ' + metadata['area']
metadata['individual_case_location'].value_counts() 

individual_case_location
case-lesional skin Umtata             61
case-nonlesional skin Umtata          61
case-anterior nares Umtata            61
control-anterior nares Umtata         55
control-nonlesional skin Umtata       52
case-nonlesional skin Cape Town       50
case-anterior nares Cape Town         47
case-lesional skin Cape Town          46
control-nonlesional skin Cape Town    35
control-anterior nares Cape Town      34
Name: count, dtype: int64

In [165]:
def darken_color(color, amount=0.3):
    """
    Darken a given color by a certain amount.
    
    Parameters:
    - color: The base color (as a hex string or color name).
    - amount: The amount to darken the color by (default: 0.3).
    
    Returns:
    - A darkened color as a hex string.
    """
    c = to_rgba(color)
    return (c[0] * (1 - amount), c[1] * (1 - amount), c[2] * (1 - amount), c[3])

In [166]:
def brighten_color(color, amount=0.3):
    """
    Brighten a color by increasing its RGB intensity without blending with white.
    
    Parameters:
    - color: color name or hex code
    - amount: brightness boost factor (0 = no change, 1 = full brightness)

    Returns:
    - Hex string of the brightened color.
    """
    rgba = to_rgba(color)
    r = min(rgba[0] * (1 + amount), 1.0)
    g = min(rgba[1] * (1 + amount), 1.0)
    b = min(rgba[2] * (1 + amount), 1.0)
    return to_hex((r, g, b, rgba[3]))

In [167]:
def plot_faith_pd_histo(metadata, group_col):
    metadata = metadata.set_index('#sample-id')

    # Load Faith PD data
    feature_table = pd.read_csv(
        "../Data/Faith_PD/vector.tsv", sep="\t"
    )

    feature_table = feature_table.set_index(feature_table.columns[0])

    # Filter feature table to only include samples that are in metadata
    feature_table = feature_table[feature_table.index.isin(metadata.index)] # filter only by samples over rarefaction depth

    # Keep only overlapping samples
    common_samples = metadata.index.intersection(feature_table.index)
    metadata = metadata.loc[common_samples].copy()
    metadata['Faith_PD'] = feature_table.loc[common_samples]
    metadata['o_scorad'] = pd.to_numeric(metadata['o_scorad'], errors='coerce').astype(float)

    # Remove samples with o_scorad > 50 only for case-lesional groups
    lesional_groups = ['case-lesional skin Cape Town', 'case-lesional skin Umtata']

    # UNCOMMENT FOR ALL SCORAD
    # for group in lesional_groups:
    #     condition = (metadata[group_col] == group) & (metadata['o_scorad'] > 50)
    #     metadata = metadata[~condition]


    # Desired group order
    # desired_order = ['H skin Cape Town', 'AD skin Cape Town', 'H skin Umtata', 'AD skin Umtata']
    desired_order = ['control-nonlesional skin Cape Town', 'case-lesional skin Cape Town', 'control-nonlesional skin Umtata', 'case-lesional skin Umtata']

    palette = {
        'control-nonlesional skin Cape Town': '#7FBCEB',   # Soft but clearer blue
        'case-lesional skin Cape Town': '#C9A34F',  # Warm golden tan, more vibrant than tan 
        'control-nonlesional skin Umtata': '#66C2EE',      # Light turquoise-blue with more saturation
        'case-lesional skin Umtata': '#F0806B'      # Coral-salmon tone, richer than original salmon
    }

    # Add group counts for x-axis labels
    group_counts = metadata[group_col].value_counts().to_dict()
    new_labels = [
        f"CT-H\n(n={group_counts.get('control-nonlesional skin Cape Town', 0)})",
        f"CT-ADL\n(n={group_counts.get('case-lesional skin Cape Town', 0)})",
        f"UM-H\n(n={group_counts.get('control-nonlesional skin Umtata', 0)})",
        f"UM-ADL\n(n={group_counts.get('case-lesional skin Umtata', 0)})"
    ]

    # Start figure
    plt.figure(figsize=(8, 4.5))
    ax = sns.boxplot(x=group_col, y='Faith_PD', data=metadata, palette=palette, order=desired_order)

    # Darker dots on top
    brighter_palette = {k: brighten_color(v, amount=0.3) for k, v in palette.items()}

    sns.stripplot(x=group_col, y='Faith_PD', data=metadata, palette=brighter_palette,
                  jitter=True, dodge=False, ax=ax, linewidth=0.6, order=desired_order)

    # Customize axes and labels
    plt.suptitle(f'Alpha Diversity by Region (SCORAD <50)', fontsize=20, y = 1.03)
    # plt.suptitle(f'Alpha Diversity by Region', fontsize=20, y = 1.03) # ALL SCORAD

    plt.title(f'Cape Town                      Umtata', fontsize=16, y = 1)

    plt.xlabel(' ')
    plt.ylabel('Faith PD', fontsize=16)
    plt.xticks(ticks=range(len(new_labels)), labels=new_labels, ha='center', fontsize=16)

    # Pairwise significance testing (only p <= 0.05 shown)
    groups = desired_order
    p_values = {}
    y_max = metadata['Faith_PD'].max()
    height_step = 3

    y_offset = 5

    for i, group1 in enumerate(groups):
        for j, group2 in enumerate(groups):
            if i < j:
                vals1 = metadata[metadata[group_col] == group1]['Faith_PD']
                vals2 = metadata[metadata[group_col] == group2]['Faith_PD']
                stat, p = mannwhitneyu(vals1, vals2, alternative='two-sided')
                p_values[f'{group1} vs {group2}'] = p

                if p < 0.001:
                    label = '***  ' + f"p={p:.2}" + "   " + f"U={stat:.2}"
                elif p < 0.01:
                    label = '**  ' + f"p={p:.2}" + "   " + f"U={stat:.2}"
                elif p < 0.05:
                    label = '*  ' + f"p={p:.2}" + "   " + f"U={stat:.2}"
                else:
                    label = f"p={p:.2}" + "   " + f"U={stat:.2}"

                x1, x2 = i, j
                y = y_max + height_step - y_offset
                plt.plot([x1, x1, x2, x2], [y, y + 0.1, y, y], lw=1, color='black')
                plt.text((x1 + x2) * 0.5, y, label, ha='center', va='bottom', fontsize=12)
                y_max += height_step + 1



    # Save the figure
    plt.savefig('../Figures/Main/Fig_2B.png', dpi=600, bbox_inches='tight', pad_inches=0.1)
    # plt.savefig('../Figures/Supplementary/Suppl_Fig_3A.png', dpi=600, bbox_inches='tight', pad_inches=0.1) # ALL SCORAD

    # Print p-values
    print("Pairwise Mann-Whitney U test p-values:")
    for comparison, p_value in p_values.items():
        print(f"{comparison}: p-value = {p_value:.2e}")

In [168]:
# Plot Alpha Diversity plots for both V1-V3 and V4
plot_faith_pd_histo(
    metadata=metadata,
    group_col='individual_case_location'
)


  sns.stripplot(x=group_col, y='Faith_PD', data=metadata, palette=brighter_palette,
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  data_subset = grouped_data.get_group(pd_key)


Pairwise Mann-Whitney U test p-values:
control-nonlesional skin Cape Town vs case-lesional skin Cape Town: p-value = 3.12e-01
control-nonlesional skin Cape Town vs control-nonlesional skin Umtata: p-value = 1.31e-05
control-nonlesional skin Cape Town vs case-lesional skin Umtata: p-value = 3.28e-02
case-lesional skin Cape Town vs control-nonlesional skin Umtata: p-value = 9.93e-06
case-lesional skin Cape Town vs case-lesional skin Umtata: p-value = 1.05e-01
control-nonlesional skin Umtata vs case-lesional skin Umtata: p-value = 3.47e-04
