# Alpha Diversity

In [49]:
# Import Python packages
import pandas as pd
import numpy as np
import biom
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import cycle
import os
from matplotlib.colors import ListedColormap
from matplotlib.colors import to_rgba
from scipy.stats import mannwhitneyu
from skbio.diversity import alpha_diversity
from matplotlib.patches import Ellipse
import matplotlib.transforms as transforms
from matplotlib.patches import Circle
from matplotlib.colors import to_hex
import matplotlib.ticker as ticker
import statsmodels.api as sm
from biom import load_table
from statsmodels.stats.multitest import multipletests
from skbio import TreeNode
from skbio.diversity.alpha import shannon
from skbio.diversity.alpha import faith_pd
import statsmodels.formula.api as smf
from scipy.stats import chi2_contingency
import matplotlib.colors as mc, colorsys

### Define parameters for inputs

In [50]:
# Both values are interpolated into the count-table and tree file names loaded below.
prevalence = '1pct'  # fills the "prev-filt-{prevalence}" part of the file names
depth = 2000         # fills the "rare-{depth}" part of the file names (presumably the rarefaction depth)

### Load inputs

In [51]:
# Read the tab-separated sample metadata and derive the helper columns in one chain
metadata_path = '../Metadata/16S_AD_South-Africa_metadata_subset.tsv'
metadata = (
    pd.read_csv(metadata_path, sep='\t')
      # a stray exported index column is dropped when it is present
      .drop(columns=["Unnamed: 0"], errors="ignore")
      .assign(
          # SCORAD with missing values replaced by 0
          o_scorad_adj=lambda tbl: tbl['o_scorad'].fillna(0),
          # combined "<case_type> <area>" group label
          individual_case_location=lambda tbl: tbl['case_type'] + ' ' + tbl['area'],
      )
      # keep skin specimens only
      .loc[lambda tbl: tbl['specimen'] == 'skin']
)
metadata


Unnamed: 0,#sample-id,PlateNumber,PlateLocation,i5,i5Sequence,i7,i7Sequence,identifier,Sequence,Plate ID,...,age_months,sex,enrolment_date,enrolment_season,hiv_exposure,hiv_status,household_size,o_scorad,o_scorad_adj,individual_case_location
0,Ca009ST_L,1,A1,SA501,ATCGTACG,SA701,CGAGAGTT,SA701SA501,CGAGAGTT-ATCGTACG,1.010000e+21,...,24.0,male,4/16/2015,Autumn,Unexposed,negative,4.0,40,40,case-lesional_skin Cape Town
1,900221,1,B1,SA502,ACTATCTG,SA701,CGAGAGTT,SA701SA502,CGAGAGTT-ACTATCTG,1.010000e+21,...,9.0,female,8/11/2015,Winter,Unexposed,negative,7.0,34,34,case-lesional_skin Umtata
2,Ca010EB_L,1,C1,SA503,TAGCGAGT,SA701,CGAGAGTT,SA701SA503,CGAGAGTT-TAGCGAGT,1.010000e+21,...,24.0,female,11/20/2014,Spring,Unexposed,negative,7.0,21,21,case-lesional_skin Cape Town
3,900460,1,D1,SA504,CTGCGTGT,SA701,CGAGAGTT,SA701SA504,CGAGAGTT-CTGCGTGT,1.010000e+21,...,18.0,female,9/23/2015,Spring,Unexposed,,4.0,40,40,case-lesional_skin Umtata
4,900051,1,E1,SA505,TCATCGAG,SA701,CGAGAGTT,SA701SA505,CGAGAGTT-TCATCGAG,1.010000e+21,...,31.0,male,4/21/2015,Autumn,Unexposed,negative,7.0,41,41,case-lesional_skin Cape Town
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
454,900398,5,C10,SB503,AGAGTCAC,SB710,TGCTCGTA,SB710SB503,TGCTCGTA-AGAGTCAC,1.010000e+21,...,25.0,female,9/17/2015,Spring,Exposed,negative,5.0,54,54,case-nonlesional_skin Umtata
456,900400,5,B12,SB502,CGTTACTA,SB712,CGTAGCGA,SB712SB502,CGTAGCGA-CGTTACTA,1.010000e+21,...,21.0,female,9/17/2015,Spring,Exposed,negative,12.0,38,38,case-lesional_skin Umtata
457,900401,5,C12,SB503,AGAGTCAC,SB712,CGTAGCGA,SB712SB503,CGTAGCGA-AGAGTCAC,1.010000e+21,...,21.0,female,9/17/2015,Spring,Exposed,negative,12.0,38,38,case-nonlesional_skin Umtata
459,Ca006ON_L,6,F1,SA506,CGTGAGTG,SB701,CTCGACTT,SB701SA506,CTCGACTT-CGTGAGTG,1.010000e+21,...,35.0,female,3/25/2015,Autumn,Unexposed,negative,3.0,34,34,case-lesional_skin Cape Town


In [52]:
# Load the Genus-ASV count table and transpose it so that samples are the rows
biom_path = f'../Data/Tables/Count_Tables/6_209766_feature_table_dedup_prev-filt-{prevalence}_rare-{depth}_Genus-ASV_skin.biom'
biom_tbl = load_table(biom_path)
df = pd.DataFrame(biom_tbl.to_dataframe().T)

# Feature names must match the tree tips: drop a leading "g__", then turn underscores into spaces
renamed_features = df.columns.str.replace(r"^g__", "", regex=True)
df.columns = renamed_features.str.replace("_", " ", regex=False)

# Retain only samples listed in the metadata
in_metadata = df.index.isin(metadata['#sample-id'])
df = df[in_metadata]

# Restrict to samples whose specimen type is skin
is_skin = metadata['specimen'] == 'skin'
skin_samples = metadata.loc[is_skin, '#sample-id']
df = df.loc[df.index.intersection(skin_samples)]

df


Unnamed: 0,Streptococcus ASV-1,Streptococcus ASV-2,Corynebacterium ASV-1,Corynebacterium ASV-3,ASV-2,Cutibacterium ASV-1,ASV-56,ASV-27,ASV-28,Cutibacterium ASV-2,...,ASV-109,Filifactor ASV-1,Leptotrichia A 993758 ASV-11,Capnocytophaga 820688 ASV-8,Dermacoccus ASV-2,Comamonas F 589250 ASV-4,Bosea ASV-2,Capnocytophaga 820690 ASV-3,Blautia A 141780 ASV-1,ASV-153
900221,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
900570,126.0,0,0,0,0,5.0,0,0,0,4.0,...,0,0,0,0,0,0,0,0,0,0
900091,14.0,7.0,0,0,0,0,0,0,8.0,0,...,0,0,0,0,0,0,0,0,0,0
900245,9.0,0,0,0,0,9.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
900581,95.0,0,0,0,13.0,23.0,0,0,0,9.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
900263,165.0,108.0,5.0,0,7.0,17.0,0,0,4.0,9.0,...,0,0,0,0,0,0,0,0,0,0
900081,453.0,0,21.0,0,77.0,19.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
900501,948.0,127.0,0,0,3.0,8.0,0,0,0,1.0,...,0,0,0,0,0,0,0,0,0,0
900279,39.0,0,0,0,3.0,33.0,0,8.0,0,35.0,...,0,0,0,0,0,0,0,0,0,0


In [53]:
# Keep only metadata rows whose sample is present in the count table, then show the group sizes
in_table = metadata['#sample-id'].isin(df.index)
metadata = metadata[in_table]
metadata['individual_case_location'].value_counts()

individual_case_location
case-lesional_skin Umtata             41
control-nonlesional_skin Umtata       30
case-lesional_skin Cape Town          26
case-nonlesional_skin Umtata          21
case-nonlesional_skin Cape Town       20
control-nonlesional_skin Cape Town    13
Name: count, dtype: int64

In [54]:
# Load the Newick phylogeny that belongs to the count table
tree_path = f'../Data/Trees/209766_feature_table_dedup_prev-filt-{prevalence}_rare-{depth}_Genus-ASV_skin_aln.nwk'

tree = TreeNode.read(tree_path)

# Strip a leading 'g  ' from every tip name that carries it
tip_prefix = "g  "
for tip in tree.tips():
    if tip.name.startswith(tip_prefix):
        tip.name = tip.name[len(tip_prefix):]

# Faith PD requires a rooted tree; root it at the midpoint
tree = tree.root_at_midpoint()

tree

<TreeNode, name: root, internal node count: 845, tips count: 847>

In [55]:
# Count the features that occur both as table columns and as tree tips
features_in_tree = set(node.name for node in tree.tips())
shared_features = df.columns.intersection(features_in_tree)

print(len(shared_features))

847


### Run Faith Phylogenetic Diversity

In [56]:
# Run Faith PD: one value per sample, computed from that sample's row of the count table
faith_pd_results = {
    sample: faith_pd(df.loc[sample].values, otu_ids=df.columns, tree=tree)
    for sample in df.index
}

faith_pd_series = pd.Series(faith_pd_results, name="Faith_PD")

# Create the output folder first so the write does not fail when it is missing
faith_pd_out = "../Data/Alpha_Diversity/faith_pd_results.tsv"
os.makedirs(os.path.dirname(faith_pd_out), exist_ok=True)
faith_pd_series.to_csv(faith_pd_out, sep="\t", header=True)

print("Faith PD successfully computed for all skin samples!")
faith_pd_series.head()

Faith PD successfully computed for all skin samples!


900221    3.134932
900570    8.753575
900091    3.333254
900245    7.948009
900581    8.964010
Name: Faith_PD, dtype: float64

In [57]:
# Index metadata by sample ID so it aligns with the Faith PD series.
# The guard keeps this cell re-runnable: once "#sample-id" has become the index
# it is no longer a column, and an unconditional set_index would raise KeyError.
if "#sample-id" in metadata.columns:
    metadata = metadata.set_index("#sample-id")
merged = metadata.join(faith_pd_series, how="left")

# Keep only the samples whose Faith PD is exactly 0.0
zero_pd_samples = merged[merged["Faith_PD"] == 0.0]

# Number of such samples per individual_case_location, largest group first
counts = (
    zero_pd_samples
    .groupby("individual_case_location")
    .size()
    .reset_index(name="num_zero_Faith_PD")
    .sort_values("num_zero_Faith_PD", ascending=False)
)

print("Counts of samples with Faith PD = 0.0 by individual_case_location:")
print(counts)

print(f"\nTotal samples with Faith PD = 0.0: {len(zero_pd_samples)} / {len(faith_pd_series)}")

Counts of samples with Faith PD = 0.0 by individual_case_location:
Empty DataFrame
Columns: [individual_case_location, num_zero_Faith_PD]
Index: []

Total samples with Faith PD = 0.0: 0 / 151


### Plot Faith PD from Umtata and Cape Town

In [58]:
def plot_faith_region(metadata, group_col, downsample=True):
    """
    Generate Faith PD box + strip plots for Umtata and Cape Town and run
    pairwise Mann–Whitney U tests between the three groups of each region.

    Faith PD values are read from "../Data/Alpha_Diversity/faith_pd_results.tsv";
    samples whose Faith PD equals 0 are excluded. One figure per region is saved
    under "../Figures/Supplementary/" (Suppl_Fig_5A.jpg for Umtata,
    Suppl_Fig_5D.jpg for Cape Town). Nothing is returned.

    Benjamini–Hochberg adjusted p-values are printed for reporting only; the
    brackets drawn on the figures are selected (p < 0.05) and labelled using the
    raw, unadjusted p-values.

    Parameters
    ----------
    metadata : pd.DataFrame
        Metadata table indexed by sample ID (it is matched against the index of
        the Faith PD results file). The caller's frame is not modified.
    group_col : str
        Column name containing group labels (e.g., "individual_case_location").
    downsample : bool, default=True
        Whether to cap each group at the per-region sizes defined in this
        function (random subsample without replacement, random_state=42).
    """

    # ------------------------------------------------------------------
    # Helper functions
    # ------------------------------------------------------------------
    def brighten_color(color, amount=0.3):
        """Return `color` as an RGB tuple with its lightness moved towards white."""
        try:
            c = mc.cnames[color]
        except KeyError:
            # Not a named colour (e.g. a hex string); let to_rgb interpret it
            c = color
        h, l, s = colorsys.rgb_to_hls(*mc.to_rgb(c))
        return colorsys.hls_to_rgb(h, 1 - amount * (1 - l), s)

    def add_pairwise_annotations(ax, groups, data, group_col, y_col):
        """Run all pairwise Mann–Whitney tests, print them, and annotate the axes."""
        comparisons, pvals, stats = [], [], []

        for i in range(len(groups)):
            for j in range(i + 1, len(groups)):
                g1, g2 = groups[i], groups[j]
                vals1 = data.loc[data[group_col] == g1, y_col]
                vals2 = data.loc[data[group_col] == g2, y_col]
                stat, p = mannwhitneyu(vals1, vals2, alternative='two-sided')
                comparisons.append((g1, g2))
                pvals.append(p)
                stats.append(stat)

        # Benjamini–Hochberg correction (for reporting only)
        reject, pvals_corr, _, _ = multipletests(pvals, method='fdr_bh')

        # ------------------------------------------------------------------
        # Print results
        # ------------------------------------------------------------------
        print(f"\nPairwise Mann–Whitney tests for {y_col}:")
        print("------------------------------------------------------------")
        for (g1, g2), stat, p, p_corr, rej in zip(comparisons, stats, pvals, pvals_corr, reject):
            sig_marker = "*" if rej else "ns"
            print(f"{g1} vs {g2} | U={stat:.0f}, p={p:.3g}, p_adj={p_corr:.3g}, {sig_marker}")
        print("------------------------------------------------------------")

        # ------------------------------------------------------------------
        # Annotate on figure (using raw p-values)
        # ------------------------------------------------------------------
        y_max = data[y_col].max()
        spacing = 0.5
        for ((g1, g2), p, stat) in zip(comparisons, pvals, stats):
            if p < 0.05:
                i, j = groups.index(g1), groups.index(g2)
                y = y_max + spacing
                ax.plot([i, i, j, j],
                        [y, y + 0.015, y + 0.015, y],
                        lw=1, color='black')
                label = f"p={p:.2g}, U={stat:.0f}"   # raw p-value
                ax.text((i + j) / 2, y, label,
                        ha='center', va='bottom', fontsize=9)
                # Stack the next bracket above this one
                y_max += spacing

    # ------------------------------------------------------------------
    # Load Faith PD results
    # ------------------------------------------------------------------
    feature_table = pd.read_csv("../Data/Alpha_Diversity/faith_pd_results.tsv", sep="\t", index_col=0)
    feature_table = feature_table[feature_table['Faith_PD'] != 0]

    # Match metadata and Faith PD samples (work on a copy of the caller's frame)
    common_samples = metadata.index.intersection(feature_table.index)
    metadata = metadata.loc[common_samples].copy()
    metadata['Faith_PD'] = feature_table.loc[common_samples, 'Faith_PD']

    # ------------------------------------------------------------------
    # Region-specific plotting function
    # ------------------------------------------------------------------
    def plot_region(region, groups, palette, output_path, ylim, ytick, downsample_sizes=None):
        """Plot, test and save the figure for one region."""
        region_data = metadata[metadata[group_col].isin(groups)].copy()

        if downsample and downsample_sizes:
            # Cap each group at its configured size; smaller groups are kept whole
            sampled = []
            for g in groups:
                n = downsample_sizes.get(g, None)
                subset = region_data[region_data[group_col] == g]
                if n and len(subset) > n:
                    subset = subset.sample(n=n, random_state=42, replace=False)
                sampled.append(subset)
            region_data = pd.concat(sampled)
        else:
            # Use all samples (no downsampling)
            print(f"No downsampling applied for {region}. Using all {len(region_data)} samples.")

        # Tick labels: short group name plus the number of plotted samples
        group_counts = region_data[group_col].value_counts().to_dict()
        short_labels = ['H', 'ADNL', 'ADL']
        new_labels = [
            f"{lab}\n(n={group_counts.get(g, 0)})"
            for lab, g in zip(short_labels, groups)
        ]

        # Plot
        fig, ax = plt.subplots(figsize=(2.5, 4.25))
        sns.boxplot(
            x=group_col, y='Faith_PD', data=region_data,
            palette=palette, order=groups, ax=ax
        )
        # hue is assigned explicitly (same variable as x, legend suppressed):
        # passing `palette` to stripplot without `hue` is deprecated in seaborn.
        sns.stripplot(
            x=group_col, y='Faith_PD', data=region_data,
            hue=group_col, hue_order=groups, legend=False,
            palette={k: brighten_color(v) for k, v in palette.items()},
            jitter=True, dodge=False, linewidth=0.6, order=groups, ax=ax
        )

        ax.set_title(region, fontsize=14, y=1)
        ax.set_xlabel('')
        ax.set_ylabel('Faith PD', fontsize=13)
        ax.set_xticks(range(len(new_labels)))
        ax.set_xticklabels(new_labels, ha='center', fontsize=13)
        ax.set_ylim(*ylim)
        ax.yaxis.set_major_locator(ticker.MultipleLocator(ytick))
        ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda x, _: f'{int(x)}'))

        add_pairwise_annotations(ax, groups, region_data, group_col, 'Faith_PD')

        fig.savefig(output_path, dpi=600, bbox_inches='tight', pad_inches=0.1)
        plt.close(fig)

    # ------------------------------------------------------------------
    # Umtata
    # ------------------------------------------------------------------
    umtata_groups = [
        'control-nonlesional_skin Umtata',
        'case-nonlesional_skin Umtata',
        'case-lesional_skin Umtata'
    ]
    umtata_palette = {
        'control-nonlesional_skin Umtata': '#66C2EE',
        'case-nonlesional_skin Umtata': '#FAD5A5',
        'case-lesional_skin Umtata': '#F0806B',
    }

    # Cap every Umtata group at 21 samples
    umtata_downsample = {
        'control-nonlesional_skin Umtata': 21,
        'case-nonlesional_skin Umtata': 21,
        'case-lesional_skin Umtata': 21
    } if downsample else None

    plot_region(
        'Umtata', umtata_groups, umtata_palette,
        '../Figures/Supplementary/Suppl_Fig_5A.jpg',
        ylim=(0, 12), ytick=2, downsample_sizes=umtata_downsample
    )

    # ------------------------------------------------------------------
    # Cape Town
    # ------------------------------------------------------------------
    cape_groups = [
        'control-nonlesional_skin Cape Town',
        'case-nonlesional_skin Cape Town',
        'case-lesional_skin Cape Town'
    ]
    cape_palette = {
        'control-nonlesional_skin Cape Town': '#7FBCEB',
        'case-nonlesional_skin Cape Town': '#FAD5A5',
        'case-lesional_skin Cape Town': '#C9A34F',
    }
    # Caps for Cape Town: 23 for controls, 20 for each case group
    cape_downsample = {
        'control-nonlesional_skin Cape Town': 23,
        'case-nonlesional_skin Cape Town': 20,
        'case-lesional_skin Cape Town': 20
    } if downsample else None

    plot_region(
        'Cape Town', cape_groups, cape_palette,
        '../Figures/Supplementary/Suppl_Fig_5D.jpg',
        ylim=(0, 12), ytick=2, downsample_sizes=cape_downsample
    )


    print(f"Done! Plots saved. Downsampling applied: {downsample}")


In [59]:
# Draw and save the per-region Faith PD figures (downsampling is on by default)
plot_faith_region(metadata, group_col='individual_case_location')


Pairwise Mann–Whitney tests for Faith_PD:
------------------------------------------------------------
control-nonlesional_skin Umtata vs case-nonlesional_skin Umtata | U=293, p=0.0701, p_adj=0.105, ns
control-nonlesional_skin Umtata vs case-lesional_skin Umtata | U=300, p=0.0469, p_adj=0.105, ns
case-nonlesional_skin Umtata vs case-lesional_skin Umtata | U=224, p=0.94, p_adj=0.94, ns
------------------------------------------------------------

Pairwise Mann–Whitney tests for Faith_PD:
------------------------------------------------------------
control-nonlesional_skin Cape Town vs case-nonlesional_skin Cape Town | U=109, p=0.45, p_adj=0.655, ns
control-nonlesional_skin Cape Town vs case-lesional_skin Cape Town | U=99, p=0.261, p_adj=0.655, ns
case-nonlesional_skin Cape Town vs case-lesional_skin Cape Town | U=183, p=0.655, p_adj=0.655, ns
------------------------------------------------------------
Done! Plots saved. Downsampling applied: True


  sns.stripplot(
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  data_subset = grouped_data.get_group(pd_key)
  sns.stripplot(
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  data_subset = grouped_data.get_group(pd_key)


### Faith PD between Umtata healthy and Cape Town healthy

In [60]:
def prepare_faith_pd_data(metadata, group_col, faith_pd_path="../Data/Alpha_Diversity/faith_pd_results.tsv"):
    """
    Attach Faith PD values and a 'region_group' label to the metadata.

    Parameters
    ----------
    metadata : pd.DataFrame
        Sample metadata. If it has a '#sample-id', '#SampleID' or 'sample_id'
        column, the first one found becomes the index; otherwise the existing
        index is taken to hold the sample IDs. The caller's frame is not modified.
    group_col : str
        Column holding labels such as 'control-nonlesional_skin Umtata'.
    faith_pd_path : str
        Tab-separated results file whose first column holds the sample IDs.

    Returns
    -------
    pd.DataFrame
        The metadata rows that also occur in the results file, indexed by
        sample ID, with two added columns: 'Faith_PD' and 'region_group'
        ('H_CapeTown', 'H_Umtata', 'AD_CapeTown', 'AD_Umtata', or None when
        the group label is not recognised).

    Raises
    ------
    ValueError
        If the results file has several value columns and none is named 'Faith_PD'.
    """
    # Load Faith PD results; the first column of the file is the sample ID
    feature_table = pd.read_csv(faith_pd_path, sep="\t")
    feature_table = feature_table.set_index(feature_table.columns[0])

    # Select the value column explicitly. Assigning the whole table to a single
    # metadata column only works while the file has exactly one value column.
    if 'Faith_PD' in feature_table.columns:
        value_col = 'Faith_PD'
    elif feature_table.shape[1] == 1:
        value_col = feature_table.columns[0]
    else:
        raise ValueError(
            f"Cannot identify the Faith PD column in {faith_pd_path}: "
            f"found columns {list(feature_table.columns)}"
        )

    # Align with metadata: index by the first sample-ID column that is present
    for possible_id in ('#sample-id', '#SampleID', 'sample_id'):
        if possible_id in metadata.columns:
            metadata = metadata.set_index(possible_id)
            break

    # Subset to samples present in both tables
    common_samples = metadata.index.intersection(feature_table.index)
    metadata = metadata.loc[common_samples].copy()
    metadata['Faith_PD'] = feature_table.loc[common_samples, value_col]

    # Map each group label to its region_group; unrecognised labels stay None
    region_labels = {
        'control-nonlesional_skin Cape Town': 'H_CapeTown',
        'control-nonlesional_skin Umtata': 'H_Umtata',
        'case-nonlesional_skin Cape Town': 'AD_CapeTown',
        'case-lesional_skin Cape Town': 'AD_CapeTown',
        'case-nonlesional_skin Umtata': 'AD_Umtata',
        'case-lesional_skin Umtata': 'AD_Umtata',
    }
    metadata['region_group'] = None
    for group_label, region_label in region_labels.items():
        metadata.loc[metadata[group_col] == group_label, 'region_group'] = region_label

    return metadata


In [61]:
def plot_faith_pd_groups(metadata):
    """
    Plot Faith PD of healthy (control) skin in Umtata versus Cape Town.

    Draws a box plot with an overlaid strip plot, annotates it with the result
    of a two-sided Mann–Whitney U test, and saves the figure to
    '../Figures/Main/Fig_4A.jpg'. Nothing is returned.

    Parameters
    ----------
    metadata : pd.DataFrame
        Table with a 'Faith_PD' column and a 'region_group' column containing
        the labels 'H_Umtata' and 'H_CapeTown'.
    """

    # Helper function for plotting one comparison
    def plot_box(data, order, palette, title, outfile):
        """Plot the groups listed in `order`, annotate the test result, save to `outfile`."""
        fig, ax = plt.subplots(figsize=(2.5, 4))
        sns.boxplot(
            x='region_group', y='Faith_PD',
            data=data, palette=palette, order=order, ax=ax
        )
        # hue is assigned explicitly (same variable as x, legend suppressed):
        # passing `palette` to stripplot without `hue` is deprecated in seaborn.
        sns.stripplot(
            x='region_group', y='Faith_PD',
            data=data, hue='region_group', hue_order=order, legend=False,
            palette=palette,
            jitter=True, dodge=False, linewidth=0.6, order=order, ax=ax
        )

        ax.set_title(title, fontsize=14, y=1.02)
        ax.set_xlabel('')
        ax.set_ylabel('Faith PD', fontsize=13)

        # Label x-axis with the region part of each group name and its sample count
        tick_labels = [
            f"{grp.split('_')[1]}\n(n={len(data[data['region_group'] == grp])})"
            for grp in order
        ]
        # Fix the tick positions before replacing their labels
        ax.set_xticks(range(len(order)))
        ax.set_xticklabels(tick_labels, fontsize=12)
        ax.yaxis.set_major_locator(ticker.MultipleLocator(2))

        # Mann–Whitney U Test between the first two groups of `order`
        group1 = data[data['region_group'] == order[0]]['Faith_PD'].dropna()
        group2 = data[data['region_group'] == order[1]]['Faith_PD'].dropna()
        stat, p = mannwhitneyu(group1, group2, alternative='two-sided')

        # Annotate with p and U, centred between the two boxes
        ax.text(
            0.5, data['Faith_PD'].max() * 0.95,
            f"p={p:.3f}\nU={stat:.0f}",
            ha='center', va='bottom', fontsize=9
        )

        fig.savefig(outfile, dpi=600, bbox_inches='tight', pad_inches=0.1)
        plt.close(fig)

    # --- Healthy Cape Town vs Healthy Umtata ---
    h_data = metadata[metadata['region_group'].isin(['H_CapeTown', 'H_Umtata'])].copy()

    h_palette = {
        'H_Umtata': '#66C2EE',   # light blue
        'H_CapeTown': '#7FBCEB'  # slightly deeper blue
    }

    plot_box(
        h_data,
        ['H_Umtata', 'H_CapeTown'],
        h_palette,
        'Healthy Skin Diversity',
        '../Figures/Main/Fig_4A.jpg'
    )

    print("Done")

In [62]:
# Step 1: attach Faith PD values and region_group labels to the metadata
md_faith = prepare_faith_pd_data(metadata, group_col='individual_case_location')

# Step 2: draw and save the healthy-skin Faith PD comparison
plot_faith_pd_groups(md_faith)


  sns.stripplot(
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  data_subset = grouped_data.get_group(pd_key)


Done


### Run Shannon Diversity

In [63]:
# Shannon diversity per sample, computed from that sample's row of the count table
shannon_results = {sample: shannon(df.loc[sample].values) for sample in df.index}

# Convert to pandas Series
shannon_series = pd.Series(shannon_results, name="Shannon")

# Save results; create the output folder first so the write does not fail when it is missing
shannon_out = "../Data/Alpha_Diversity/shannon_results.tsv"
os.makedirs(os.path.dirname(shannon_out), exist_ok=True)
shannon_series.to_csv(shannon_out, sep="\t", header=True)

print("Shannon diversity successfully computed for all skin samples!")
print(shannon_series.head())

Shannon diversity successfully computed for all skin samples!
900221    1.327170
900570    4.581213
900091    3.732909
900245    5.102464
900581    4.391689
Name: Shannon, dtype: float64


### Plot Shannon from Umtata healthy and Cape Town healthy

In [64]:
# Read the sample metadata again from disk and rebuild the derived columns
metadata_path = '../Metadata/16S_AD_South-Africa_metadata_subset.tsv'
metadata = (
    pd.read_csv(metadata_path, sep='\t')
      # a stray exported index column is dropped when it is present
      .drop(columns=["Unnamed: 0"], errors="ignore")
      .assign(
          # SCORAD with missing values replaced by 0
          o_scorad_adj=lambda tbl: tbl['o_scorad'].fillna(0),
          # combined "<case_type> <area>" group label
          individual_case_location=lambda tbl: tbl['case_type'] + ' ' + tbl['area'],
      )
      # keep skin specimens only
      .loc[lambda tbl: tbl['specimen'] == 'skin']
)
metadata


Unnamed: 0,#sample-id,PlateNumber,PlateLocation,i5,i5Sequence,i7,i7Sequence,identifier,Sequence,Plate ID,...,age_months,sex,enrolment_date,enrolment_season,hiv_exposure,hiv_status,household_size,o_scorad,o_scorad_adj,individual_case_location
0,Ca009ST_L,1,A1,SA501,ATCGTACG,SA701,CGAGAGTT,SA701SA501,CGAGAGTT-ATCGTACG,1.010000e+21,...,24.0,male,4/16/2015,Autumn,Unexposed,negative,4.0,40,40,case-lesional_skin Cape Town
1,900221,1,B1,SA502,ACTATCTG,SA701,CGAGAGTT,SA701SA502,CGAGAGTT-ACTATCTG,1.010000e+21,...,9.0,female,8/11/2015,Winter,Unexposed,negative,7.0,34,34,case-lesional_skin Umtata
2,Ca010EB_L,1,C1,SA503,TAGCGAGT,SA701,CGAGAGTT,SA701SA503,CGAGAGTT-TAGCGAGT,1.010000e+21,...,24.0,female,11/20/2014,Spring,Unexposed,negative,7.0,21,21,case-lesional_skin Cape Town
3,900460,1,D1,SA504,CTGCGTGT,SA701,CGAGAGTT,SA701SA504,CGAGAGTT-CTGCGTGT,1.010000e+21,...,18.0,female,9/23/2015,Spring,Unexposed,,4.0,40,40,case-lesional_skin Umtata
4,900051,1,E1,SA505,TCATCGAG,SA701,CGAGAGTT,SA701SA505,CGAGAGTT-TCATCGAG,1.010000e+21,...,31.0,male,4/21/2015,Autumn,Unexposed,negative,7.0,41,41,case-lesional_skin Cape Town
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
454,900398,5,C10,SB503,AGAGTCAC,SB710,TGCTCGTA,SB710SB503,TGCTCGTA-AGAGTCAC,1.010000e+21,...,25.0,female,9/17/2015,Spring,Exposed,negative,5.0,54,54,case-nonlesional_skin Umtata
456,900400,5,B12,SB502,CGTTACTA,SB712,CGTAGCGA,SB712SB502,CGTAGCGA-CGTTACTA,1.010000e+21,...,21.0,female,9/17/2015,Spring,Exposed,negative,12.0,38,38,case-lesional_skin Umtata
457,900401,5,C12,SB503,AGAGTCAC,SB712,CGTAGCGA,SB712SB503,CGTAGCGA-AGAGTCAC,1.010000e+21,...,21.0,female,9/17/2015,Spring,Exposed,negative,12.0,38,38,case-nonlesional_skin Umtata
459,Ca006ON_L,6,F1,SA506,CGTGAGTG,SB701,CTCGACTT,SB701SA506,CTCGACTT-CGTGAGTG,1.010000e+21,...,35.0,female,3/25/2015,Autumn,Unexposed,negative,3.0,34,34,case-lesional_skin Cape Town


In [65]:
def prepare_shannon_data(metadata, group_col, shannon_path="../Data/Alpha_Diversity/shannon_results.tsv"):
    """
    Attach Shannon diversity values and a 'region_group' label to the metadata.

    Parameters
    ----------
    metadata : pd.DataFrame
        Sample metadata. If it has a '#sample-id' column, that column becomes
        the index; otherwise the existing index is taken to hold the sample
        IDs. The caller's frame is not modified.
    group_col : str
        Column holding labels such as 'control-nonlesional_skin Umtata'.
    shannon_path : str
        Tab-separated results file whose first column holds the sample IDs.

    Returns
    -------
    pd.DataFrame
        The metadata rows that also occur in the results file, indexed by
        sample ID, with two added columns: 'Shannon' and 'region_group'
        ('H_CapeTown', 'H_Umtata', 'AD_CapeTown', 'AD_Umtata', or None when
        the group label is not recognised).

    Raises
    ------
    ValueError
        If the results file has several value columns and none is named 'Shannon'.
    """
    # Load Shannon results; the first column of the file is the sample ID
    feature_table = pd.read_csv(shannon_path, sep="\t")
    feature_table = feature_table.set_index(feature_table.columns[0])

    # Select the value column explicitly. Assigning the whole table to a single
    # metadata column only works while the file has exactly one value column.
    if 'Shannon' in feature_table.columns:
        value_col = 'Shannon'
    elif feature_table.shape[1] == 1:
        value_col = feature_table.columns[0]
    else:
        raise ValueError(
            f"Cannot identify the Shannon column in {shannon_path}: "
            f"found columns {list(feature_table.columns)}"
        )

    # Align with metadata: index by sample ID when it is still a column
    if '#sample-id' in metadata.columns:
        metadata = metadata.set_index('#sample-id')

    # Subset to samples present in both tables
    common_samples = metadata.index.intersection(feature_table.index)
    metadata = metadata.loc[common_samples].copy()
    metadata['Shannon'] = feature_table.loc[common_samples, value_col]

    # Map each group label to its region_group; unrecognised labels stay None
    region_labels = {
        'control-nonlesional_skin Cape Town': 'H_CapeTown',
        'control-nonlesional_skin Umtata': 'H_Umtata',
        'case-nonlesional_skin Cape Town': 'AD_CapeTown',
        'case-lesional_skin Cape Town': 'AD_CapeTown',
        'case-nonlesional_skin Umtata': 'AD_Umtata',
        'case-lesional_skin Umtata': 'AD_Umtata',
    }
    metadata['region_group'] = None
    for group_label, region_label in region_labels.items():
        metadata.loc[metadata[group_col] == group_label, 'region_group'] = region_label

    return metadata


In [66]:
def plot_shannon_groups(metadata):
    """
    Plot Shannon diversity of healthy (control) skin in Umtata versus Cape Town.

    Draws a box plot with an overlaid strip plot, annotates it with the result
    of a two-sided Mann–Whitney U test, and saves the figure to
    '../Figures/Supplementary/Suppl_Fig_4.jpg'. Nothing is returned.

    Parameters
    ----------
    metadata : pd.DataFrame
        Table with a 'Shannon' column and a 'region_group' column containing
        the labels 'H_Umtata' and 'H_CapeTown'.
    """

    # Helper function for plotting one comparison
    def plot_box(data, order, palette, title, outfile):
        """Plot the groups listed in `order`, annotate the test result, save to `outfile`."""
        fig, ax = plt.subplots(figsize=(2.5, 4))
        sns.boxplot(x='region_group', y='Shannon', data=data, palette=palette,
                    order=order, ax=ax)
        # hue is assigned explicitly (same variable as x, legend suppressed):
        # passing `palette` to stripplot without `hue` is deprecated in seaborn.
        sns.stripplot(x='region_group', y='Shannon', data=data,
                      hue='region_group', hue_order=order, legend=False,
                      palette=palette,
                      jitter=True, dodge=False, linewidth=0.6, order=order, ax=ax)
        ax.set_title(title, fontsize=14, y=1.02)
        ax.set_xlabel('')
        ax.set_ylabel('Shannon Diversity', fontsize=13)

        # Label x-axis with the region part of each group name and its sample count
        tick_labels = [
            f"{grp.split('_')[1]}\n(n={len(data[data['region_group'] == grp])})"
            for grp in order
        ]
        # Fix the tick positions before replacing their labels
        ax.set_xticks(range(len(order)))
        ax.set_xticklabels(tick_labels, fontsize=12)
        ax.yaxis.set_major_locator(ticker.MultipleLocator(2))

        # Mann–Whitney U Test between the first two groups of `order`
        stat, p = mannwhitneyu(
            data[data['region_group'] == order[0]]['Shannon'].dropna(),
            data[data['region_group'] == order[1]]['Shannon'].dropna(),
            alternative='two-sided'
        )
        ax.text(0.55, data['Shannon'].max() - 0.75, f"p={p:.2}\nU={stat:.0f}",
                ha='center', va='bottom', fontsize=9)
        fig.savefig(outfile, dpi=600, bbox_inches='tight', pad_inches=0.1)
        plt.close(fig)

    # --- Healthy vs Healthy ---
    h_data = metadata[metadata['region_group'].isin(['H_CapeTown', 'H_Umtata'])].copy()
    h_palette = {'H_Umtata': '#66C2EE', 'H_CapeTown': '#7FBCEB'}
    plot_box(h_data, ['H_Umtata', 'H_CapeTown'], h_palette,
             'Healthy Skin Diversity', '../Figures/Supplementary/Suppl_Fig_4.jpg')

    print("Done: all group counts printed above and Shannon plots saved.")


In [67]:
# Step 1: attach Shannon values and region_group labels to the metadata
md_shannon = prepare_shannon_data(metadata, group_col='individual_case_location')

# Step 2: draw and save the healthy-skin Shannon comparison
plot_shannon_groups(md_shannon)


Done: all group counts printed above and Shannon plots saved.


  sns.stripplot(x='region_group', y='Shannon', data=data, palette=palette,
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  data_subset = grouped_data.get_group(pd_key)


In [68]:
# Keep only metadata rows whose sample is present in the count table, then show the group sizes
in_table = metadata['#sample-id'].isin(df.index)
metadata = metadata[in_table]
metadata['individual_case_location'].value_counts()

individual_case_location
case-lesional_skin Umtata             41
control-nonlesional_skin Umtata       30
case-lesional_skin Cape Town          26
case-nonlesional_skin Umtata          21
case-nonlesional_skin Cape Town       20
control-nonlesional_skin Cape Town    13
Name: count, dtype: int64

### Plot Shannon Diversity from Umtata and Cape Town

In [69]:
# Load the saved Shannon diversity results (sample IDs are the index).
# The frame is not called `shannon`, so the skbio `shannon` function imported
# at the top of the notebook stays usable when earlier cells are re-run.
shannon_df = pd.read_csv("../Data/Alpha_Diversity/shannon_results.tsv", sep="\t", index_col=0)

# Ensure column name consistency
if "shannon" not in shannon_df.columns:
    shannon_df.columns = ["shannon"]

# Merge Shannon values into metadata by sample ID. While "#sample-id" is still a
# column, the join must use it as the key: joining on metadata's default integer
# index would match no sample and leave every Shannon value missing.
join_key = "#sample-id" if "#sample-id" in metadata.columns else None
merged = metadata.join(shannon_df["shannon"], on=join_key, how="left")

# Identify samples with Shannon = 0.0
zero_shannon_samples = merged[merged["shannon"] == 0.0]

# Summarize by location
counts = (
    zero_shannon_samples
    .groupby("individual_case_location")
    .size()
    .reset_index(name="num_zero_Shannon")
    .sort_values("num_zero_Shannon", ascending=False)
)

# Print results
print("Counts of samples with Shannon = 0.0 by individual_case_location:")
print(counts)

print(f"\nTotal samples with Shannon = 0.0: {len(zero_shannon_samples)} / {len(shannon_df)} total samples")


Counts of samples with Shannon = 0.0 by individual_case_location:
Empty DataFrame
Columns: [individual_case_location, num_zero_Shannon]
Index: []

Total samples with Shannon = 0.0: 0 / 151 total samples


In [70]:
# Load in Shannon results (sample IDs are the index).
# The frame is not called `shannon`, so the skbio `shannon` function imported
# at the top of the notebook stays usable when earlier cells are re-run.
shannon_df = pd.read_csv("../Data/Alpha_Diversity/shannon_results.tsv", sep="\t", index_col=0)

# Ensure column name consistency
if "shannon" not in shannon_df.columns:
    shannon_df.columns = ["shannon"]

# Index metadata by sample ID so it aligns with the Shannon results.
# The guard keeps this cell re-runnable: once "#sample-id" has become the index
# it is no longer a column, and an unconditional set_index would raise KeyError.
if "#sample-id" in metadata.columns:
    metadata = metadata.set_index("#sample-id")
merged = metadata.join(shannon_df["shannon"], how="left")

# Identify samples with Shannon = 0.0
zero_shannon_samples = merged[merged["shannon"] == 0.0]

# Summarize by location
counts = (
    zero_shannon_samples
    .groupby("individual_case_location")
    .size()
    .reset_index(name="num_zero_Shannon")
    .sort_values("num_zero_Shannon", ascending=False)
)

# Print results
print("Counts of samples with Shannon = 0.0 by individual_case_location:")
print(counts)

print(f"\nTotal samples with Shannon = 0.0: {len(zero_shannon_samples)} / {len(shannon_df)} total samples")


Counts of samples with Shannon = 0.0 by individual_case_location:
Empty DataFrame
Columns: [individual_case_location, num_zero_Shannon]
Index: []

Total samples with Shannon = 0.0: 0 / 151 total samples


In [71]:
def plot_shannon_region(metadata, group_col, downsample=True,
                        shannon_path="../Data/Alpha_Diversity/shannon_results.tsv"):
    """
    Generate Shannon diversity box + strip plots for Umtata and Cape Town,
    including pairwise Mann–Whitney U tests with p-value correction.

    One figure is saved per region
    ('../Figures/Supplementary/Suppl_Fig_5B.jpg' for Umtata and
    '../Figures/Supplementary/Suppl_Fig_5E.jpg' for Cape Town); the output
    directory is created if it does not exist. Test results are printed to
    the console. The function returns None.

    Parameters
    ----------
    metadata : pd.DataFrame
        Metadata table with sample information. Its index is intersected with
        the index of the Shannon table, so it must be indexed by sample ID.
        Must contain `group_col` and an 'o_scorad' column. The caller's frame
        is not modified (a filtered copy is used internally).
    group_col : str
        Column name containing group labels
        (e.g., "individual_case_location").
    downsample : bool, default=True
        Whether to perform equal downsampling of groups.
    shannon_path : str, default="../Data/Alpha_Diversity/shannon_results.tsv"
        Tab-separated file with a 'Shannon' column, indexed by its first column.
    """

    # ------------------------------------------------------------------
    # Helper functions
    # ------------------------------------------------------------------
    def brighten_color(color, amount=0.3):
        """Brighten a color by a given amount."""
        try:
            c = mc.cnames[color]
        except KeyError:
            # Not a named color (e.g. a hex string): use it as given.
            c = color
        h, l, s = colorsys.rgb_to_hls(*mc.to_rgb(c))
        return colorsys.hls_to_rgb(h, 1 - amount * (1 - l), s)

    def add_pairwise_annotations(ax, groups, df, group_col, y_col):
        """Perform pairwise Mann–Whitney tests and annotate figure."""
        comparisons, pvals, stats = [], [], []

        for i, g1 in enumerate(groups):
            for g2 in groups[i + 1:]:
                vals1 = df.loc[df[group_col] == g1, y_col]
                vals2 = df.loc[df[group_col] == g2, y_col]
                # mannwhitneyu cannot be computed on an empty sample; skip the
                # pair instead of aborting the whole figure.
                if vals1.empty or vals2.empty:
                    print(f"Skipping {g1} vs {g2}: at least one group has no samples.")
                    continue
                stat, p = mannwhitneyu(vals1, vals2, alternative='two-sided')
                comparisons.append((g1, g2))
                pvals.append(p)
                stats.append(stat)

        if not pvals:
            # Nothing to correct or annotate.
            print(f"\nNo pairwise Mann–Whitney tests could be run for {y_col}.")
            return

        # Benjamini-Hochberg FDR correction
        reject, pvals_corr, _, _ = multipletests(pvals, method='fdr_bh')

        # Print results to console
        print(f"\nPairwise Mann–Whitney tests for {y_col}:")
        print("------------------------------------------------------------")
        for (g1, g2), stat, p, p_corr, rej in zip(comparisons, stats, pvals, pvals_corr, reject):
            sig_marker = "*" if rej else "ns"
            print(f"{g1} vs {g2} | U={stat:.0f}, p={p:.3g}, p_adj={p_corr:.3g}, {sig_marker}")
        print("------------------------------------------------------------")

        # Annotate significant comparisons on plot (raw p-values).
        # NOTE: brackets are selected on the uncorrected p-value, whereas the
        # "*"/"ns" marker printed above reflects the BH-adjusted decision.
        y_max = df[y_col].max()
        spacing = 0.2
        bracket_height = 0.1
        annotation_top = None
        for ((g1, g2), p, stat) in zip(comparisons, pvals, stats):
            if p < 0.05:
                i, j = groups.index(g1), groups.index(g2)
                y = y_max + spacing
                ax.plot(
                    [i, i, j, j],
                    [y, y + bracket_height, y + bracket_height, y],
                    lw=1, color='black'
                )
                label = f"p={p:.2g}, U={stat:.0f}"
                ax.text((i + j) / 2, y + bracket_height, label,
                        ha='center', va='bottom', fontsize=9)
                annotation_top = y + bracket_height
                # Stack the next bracket above this one.
                y_max += spacing + bracket_height

        # The y-limits were fixed before annotating; extend the upper limit
        # when a bracket would otherwise fall outside the axes and be clipped.
        if annotation_top is not None:
            lower, upper = ax.get_ylim()
            needed_top = annotation_top + 0.3  # head-room for the text label
            if needed_top > upper:
                ax.set_ylim(lower, needed_top)

    # ------------------------------------------------------------------
    # Load Shannon diversity results
    # ------------------------------------------------------------------
    feature_table = pd.read_csv(shannon_path, sep="\t", index_col=0)
    # Samples with a Shannon value of exactly 0 are excluded from the plots.
    feature_table = feature_table[feature_table['Shannon'] != 0]

    # Match metadata and Shannon samples
    common_samples = metadata.index.intersection(feature_table.index)
    metadata = metadata.loc[common_samples].copy()
    metadata['Shannon'] = feature_table.loc[common_samples, 'Shannon']
    metadata['o_scorad'] = pd.to_numeric(metadata['o_scorad'], errors='coerce')

    # ------------------------------------------------------------------
    # Region-specific plotting function
    # ------------------------------------------------------------------
    def plot_region(region, groups, palette, output_path, ylim, ytick, downsample_sizes=None):
        """Generic plotting routine for each region."""
        region_data = metadata[metadata[group_col].isin(groups)].copy()

        if downsample and downsample_sizes:
            # Downsample each group to defined size (groups already at or
            # below their target size are kept whole).
            sampled = []
            for g in groups:
                n = downsample_sizes.get(g, None)
                subset = region_data[region_data[group_col] == g]
                if n and len(subset) > n:
                    # Fixed seed keeps the drawn subset reproducible.
                    subset = subset.sample(n=n, random_state=42, replace=False)
                sampled.append(subset)
            region_data = pd.concat(sampled)
        else:
            print(f"No downsampling applied for {region}. Using all {len(region_data)} samples.")

        # Count for labeling
        group_counts = region_data[group_col].value_counts().to_dict()
        # Short tick labels are paired with `groups` by position.
        short_labels = ['H', 'ADNL', 'ADL']
        new_labels = [
            f"{lab}\n(n={group_counts.get(g, 0)})"
            for lab, g in zip(short_labels, groups)
        ]

        # Plot
        fig, ax = plt.subplots(figsize=(2.5, 4.25))
        sns.boxplot(
            x=group_col, y='Shannon', data=region_data,
            palette=palette, order=groups, ax=ax
        )
        sns.stripplot(
            x=group_col, y='Shannon', data=region_data,
            palette={k: brighten_color(v) for k, v in palette.items()},
            jitter=True, dodge=False, linewidth=0.6, order=groups, ax=ax
        )

        ax.set_title(region, fontsize=14, y=1)
        ax.set_xlabel('')
        ax.set_ylabel('Shannon Diversity', fontsize=13)
        ax.set_xticks(range(len(new_labels)))
        ax.set_xticklabels(new_labels, ha='center', fontsize=13)
        ax.set_ylim(*ylim)
        ax.yaxis.set_major_locator(ticker.MultipleLocator(ytick))
        ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda x, _: f'{x:.1f}'))

        add_pairwise_annotations(ax, groups, region_data, group_col, 'Shannon')

        # Make sure the target directory exists so savefig does not fail.
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        fig.savefig(output_path, dpi=600, bbox_inches='tight', pad_inches=0.1)
        # Close the figure to release memory.
        plt.close(fig)

    # ------------------------------------------------------------------
    # Umtata
    # ------------------------------------------------------------------
    umtata_groups = [
        'control-nonlesional_skin Umtata',
        'case-nonlesional_skin Umtata',
        'case-lesional_skin Umtata'
    ]
    umtata_palette = {
        'control-nonlesional_skin Umtata': '#66C2EE',
        'case-nonlesional_skin Umtata': '#FAD5A5',
        'case-lesional_skin Umtata': '#F0806B',
    }

    # Per-group target sizes used when downsampling is enabled.
    umtata_downsample = {
        'control-nonlesional_skin Umtata': 21,
        'case-nonlesional_skin Umtata': 21,
        'case-lesional_skin Umtata': 21
    } if downsample else None

    plot_region(
        'Umtata', umtata_groups, umtata_palette,
        '../Figures/Supplementary/Suppl_Fig_5B.jpg',
        ylim=(0, 6), ytick=1, downsample_sizes=umtata_downsample
    )

    # ------------------------------------------------------------------
    # Cape Town
    # ------------------------------------------------------------------
    cape_groups = [
        'control-nonlesional_skin Cape Town',
        'case-nonlesional_skin Cape Town',
        'case-lesional_skin Cape Town'
    ]
    cape_palette = {
        'control-nonlesional_skin Cape Town': '#7FBCEB',
        'case-nonlesional_skin Cape Town': '#FAD5A5',
        'case-lesional_skin Cape Town': '#C9A34F',
    }

    # Per-group target sizes used when downsampling is enabled.
    cape_downsample = {
        'control-nonlesional_skin Cape Town': 23,
        'case-nonlesional_skin Cape Town': 20,
        'case-lesional_skin Cape Town': 20
    } if downsample else None

    plot_region(
        'Cape Town', cape_groups, cape_palette,
        '../Figures/Supplementary/Suppl_Fig_5E.jpg',
        ylim=(0, 6), ytick=1, downsample_sizes=cape_downsample
    )

    print(f"Done! Shannon plots saved. Downsampling applied: {downsample}")


In [72]:
# Render and save the per-region Shannon figures, grouping samples by the
# combined case-type + area label; `downsample` is left at its default (True).
plot_shannon_region(metadata, group_col='individual_case_location')


Pairwise Mann–Whitney tests for Shannon:
------------------------------------------------------------
control-nonlesional_skin Umtata vs case-nonlesional_skin Umtata | U=292, p=0.0741, p_adj=0.111, ns
control-nonlesional_skin Umtata vs case-lesional_skin Umtata | U=298, p=0.0527, p_adj=0.111, ns
case-nonlesional_skin Umtata vs case-lesional_skin Umtata | U=230, p=0.821, p_adj=0.821, ns
------------------------------------------------------------


  sns.stripplot(
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  data_subset = grouped_data.get_group(pd_key)
  sns.stripplot(
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  data_subset = grouped_data.get_group(pd_key)



Pairwise Mann–Whitney tests for Shannon:
------------------------------------------------------------
control-nonlesional_skin Cape Town vs case-nonlesional_skin Cape Town | U=82, p=0.0801, p_adj=0.152, ns
control-nonlesional_skin Cape Town vs case-lesional_skin Cape Town | U=85, p=0.101, p_adj=0.152, ns
case-nonlesional_skin Cape Town vs case-lesional_skin Cape Town | U=195, p=0.903, p_adj=0.903, ns
------------------------------------------------------------
Done! Shannon plots saved. Downsampling applied: True
