# Beta Diversity (RPCA)

In [1]:
# import Python packages
import pandas as pd
import numpy as np
from numpy import array
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.gridspec import GridSpec
import scipy
import scipy.stats as ss
from skbio.stats.distance import permanova
import biom
from biom import load_table
from biom.util import biom_open
from gemelli.rpca import rpca
from matplotlib.patches import Circle
from matplotlib.colors import to_hex
import statsmodels.api as sm
from skbio import DistanceMatrix

### See RPCA standalone python tutorial: 
# https://github.com/biocore/gemelli/blob/master/ipynb/tutorials/RPCA-moving-pictures-standalone-cli-and-api.ipynb

In [2]:
# Load a BIOM feature table and its sample metadata, order features by total count,
# drop nares samples, and keep only the samples shared by the table and the metadata.
def load_biom_table(biom_path, metadata_path, max_scorad=200):
    """
    Load a BIOM table as a DataFrame together with the matching sample metadata.

    Steps performed:
      1. Read the BIOM table into a features x samples DataFrame.
      2. Order the features (rows) by their total count, largest first.
      3. Strip the leading '15564.' from the sample IDs.
      4. Read the tab-separated metadata, remove '_' from '#sample-id' and use
         it as the index.
      5. Drop samples whose 'case_type' ends with 'nares'.
      6. Keep samples with no numeric SCORAD (NaN after coercion) or with
         SCORAD <= max_scorad.
      7. Restrict both outputs to the samples present in both.

    Counts are returned as loaded; no taxonomic collapsing or relative-abundance
    conversion is done here.

    Parameters:
    - biom_path: path to the BIOM file
    - metadata_path: path to the tab-separated metadata file; must contain the
      columns '#sample-id', 'case_type', 'area' and 'o_scorad'
    - max_scorad: upper SCORAD bound for samples with a numeric score. The
      default of 200 matches the original "ALL SCORAD" setting; use 40 to keep
      only SCORAD <= 40.

    Returns:
    - (df, metadata_sub): the filtered count table (features x samples) and the
      metadata rows for exactly those samples, in the same sample order.
      metadata_sub includes the derived column 'individual_case_location'
      ('case_type' + ' ' + 'area').
    """
    import warnings

    # Load BIOM table and convert to a DataFrame
    table = biom.load_table(biom_path)
    df = pd.DataFrame(table.matrix_data.toarray(),
                      index=table.ids(axis='observation'),
                      columns=table.ids(axis='sample'))

    # Sort rows by row sum in descending order. Sorting a positional Series
    # avoids adding a temporary 'row_sum' column that could clash with a sample ID.
    row_totals = df.sum(axis=1).reset_index(drop=True)
    df = df.iloc[row_totals.sort_values(ascending=False).index.to_numpy()]

    # Replace ' g__' rows with ' g__Unknown'
    df.index = df.index.map(lambda x: ' g__Unknown' if x == ' g__' else x)

    # Remove the '15564.' prefix from the sample IDs. The pattern is anchored and
    # the dot escaped so the result does not depend on the pandas default for `regex`.
    df.columns = df.columns.str.replace(r'^15564\.', '', regex=True)

    # Load metadata; read the ID column as text so purely numeric IDs stay strings
    metadata = pd.read_csv(metadata_path, sep='\t', dtype={'#sample-id': str})
    metadata['#sample-id'] = metadata['#sample-id'].str.replace('_', '', regex=False)

    # Set Sample-ID as index
    metadata = metadata.set_index('#sample-id')

    # BIOM samples without a metadata row cannot be classified; they are removed
    # by the final intersection below, so report them instead of failing on .loc
    missing = df.columns.difference(metadata.index)
    if len(missing) > 0:
        warnings.warn(
            f"{len(missing)} BIOM sample(s) have no metadata row and will be dropped",
            stacklevel=2,
        )

    # Get the nares samples among the samples that do have metadata.
    # na=False treats a missing case_type as "not nares" instead of producing a NaN mask.
    known_samples = df.columns.intersection(metadata.index)
    case_type = metadata.loc[known_samples, 'case_type']
    nares_samples = case_type[case_type.str.endswith('nares', na=False)].index.unique()

    # Drop nares samples from BIOM table
    df = df.drop(columns=nares_samples)

    # Compute individual_case_location
    metadata['individual_case_location'] = metadata['case_type'] + ' ' + metadata['area']

    # Convert SCORAD to numeric; non-numeric entries become NaN
    metadata['o_scorad'] = pd.to_numeric(metadata['o_scorad'], errors='coerce')

    # Keep samples without a SCORAD (NaN) and samples with SCORAD <= max_scorad
    metadata = metadata[(metadata['o_scorad'].isna()) | (metadata['o_scorad'] <= max_scorad)]

    # Subset both df and metadata to overlapping samples
    shared_samples = df.columns.intersection(metadata.index)
    df = df[shared_samples]
    metadata_sub = metadata.loc[shared_samples]

    return df, metadata_sub


In [3]:
# Turn the skin-only count DataFrame back into a BIOM table and save it, so it can be passed to RPCA
def convert_df_to_biom(table, biom_output_file):
    """
    Build a BIOM table from a DataFrame and write it to disk in HDF5 format.

    Parameters:
    - table: DataFrame whose index holds the observation IDs and whose columns
      hold the sample IDs
    - biom_output_file: path of the BIOM file to write

    Returns:
    - the in-memory biom Table that was written
    """
    biom_table = biom.table.Table(
        table.values,
        observation_ids=table.index,
        sample_ids=table.columns,
    )

    # Writing happens here; the handle is closed when the block exits
    with biom_open(biom_output_file, 'w') as handle:
        biom_table.to_hdf5(handle, generated_by="subsetted tables")

    return biom_table

In [4]:
# Paths to the input BIOM feature table and the tab-separated sample metadata
# (relative to the notebook's working directory)
biom_path = "../Data/Tables/Absolute_Abundance_Tables/209766_filtered_feature_table.biom"
metadata_path = '../Data/Metadata/updated_clean_ant_skin_metadata_microbiome_type-location.tab'
# load_biom_table returns the filtered count table and the metadata rows for the same samples
skin_only_table, metadata_sub = load_biom_table(biom_path, metadata_path)

# Display the filtered table (features as rows, samples as columns)
skin_only_table

Unnamed: 0,900344,900221,900570,900129,900321,900091,900245,900423,900581,900145,...,900094,900287,900225,900057,900294,9003972,900097,900498,900276,900406
GTGCCAGCAGCCGCGGTAATACGTAGGTCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGCAGGCGGTTAGATAAGTCTGAAGTTAAAGGCTG,984.0,22.0,389.0,0.0,26.0,27.0,40.0,0.0,198.0,0.0,...,0.0,57.0,44.0,111.0,12.0,1168.0,24.0,15.0,0.0,0.0
GTGCCAGCAGCCGCGGTAATACGTAGGTGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGCGCGCGTAGGCGGTTTTTTAAGTCTGATGTGAAAGCCCA,0.0,0.0,1294.0,182.0,9.0,296.0,159.0,49.0,1366.0,27.0,...,0.0,108.0,30.0,93.0,13.0,794.0,171.0,28.0,0.0,12.0
GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATAACTGGGCGTAAAGGGCACGCAGGCGGTTATTTAAGTGAGGTGTGAAAGCCCC,0.0,2.0,9.0,0.0,0.0,0.0,0.0,0.0,22.0,0.0,...,0.0,2.0,0.0,0.0,0.0,74.0,0.0,0.0,0.0,0.0
GTGCCAGCCGCCGCGGTAATACGTAGGTGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGCGCGCGTAGGCGGTTTTTTAAGTCTGATGTGAAAGCCCA,0.0,0.0,236.0,0.0,0.0,0.0,0.0,57.0,799.0,37.0,...,0.0,0.0,4.0,116.0,0.0,477.0,94.0,11.0,0.0,7.0
GTGCCAGCCGCCGCGGTAATACGTAGGTCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGCAGGCGGTTAGATAAGTCTGAAGTTAAAGGCTG,611.0,0.0,0.0,0.0,0.0,16.0,0.0,0.0,0.0,6.0,...,0.0,0.0,0.0,0.0,7.0,593.0,0.0,17.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTGCCAGCAGCCGCGGTAATACGGAGGGTCCGAGCGTTATCCGGAATCATTGGGTTTAAAGGGTCCGTAGGCGGCTTTATAAGTCAGTGGTGAAATCCGG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
GTGCCAGCAGCCGCGGTAATACGTAGGGTGCAAGCGTTGTCCGGAATTATTGGGCGTAAAGAGCTCGTAGGCGGTTTGTCGCGTCTGCTGTGAAATTCCG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
GTGCCAGCCGCCGCGGTAATACGTAGGGCGCAAGCGTTGTCCGGAATTATTGGGCGTAAAGAGCTCGTAGGCGGTTTGTCGCGTCTGGTGTGAAAACTCG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
GTGCCAGAGCCGCGGTAATACGTAGGTCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGCAGGCGGTTAGATAAGTCTGAAGTTAAAGGCTGT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Derive the output path from the input path: same location and stem, with a
# '_skin_only' suffix inserted before the '.biom' extension
skin_only_biom_path = f"{biom_path.removesuffix('.biom')}_skin_only.biom"

# Show the derived path
print(f'{skin_only_biom_path = }')

# Write the filtered DataFrame to that path and keep the BIOM table object for RPCA
biom_tbl = convert_df_to_biom(skin_only_table, skin_only_biom_path)

skin_only_biom_path = '../Data/Tables/Absolute_Abundance_Tables/209766_filtered_feature_table_skin_only.biom'


In [6]:
# Run RPCA on the skin-only BIOM table using gemelli's default settings.
# NOTE(review): the original comment said "auto rank estimation", but `rpca` is
# called with defaults and the recorded result has exactly three axes (PC1-PC3);
# gemelli exposes rank estimation separately (`auto_rpca`) - confirm which is intended.
# Divide warnings are silenced only for this call (presumably raised while
# log-transforming zero counts - confirm) so the kernel-wide NumPy error
# settings are left untouched for later cells.
with np.errstate(divide='ignore'):
    ordination, distance = rpca(biom_tbl)

# extract and view sample ordinations from RPCA result
spca_df = ordination.samples

# Attach each sample's group label ('individual_case_location') from the metadata;
# join matches rows on the sample-ID index
spca_df = spca_df.join(metadata_sub['individual_case_location'])

spca_df

Unnamed: 0,PC1,PC2,PC3,individual_case_location
900344,-0.031259,0.030585,-0.045341,control-nonlesional skin Umtata
900221,0.034560,-0.014311,-0.101067,case-lesional skin Umtata
900570,0.051177,0.101247,0.020742,case-nonlesional skin Cape Town
900129,0.061599,0.018120,0.002876,case-lesional skin Cape Town
900321,0.002420,-0.003792,-0.040013,control-nonlesional skin Umtata
...,...,...,...,...
9003972,-0.107095,0.089850,0.019500,case-lesional skin Umtata
900097,0.025972,-0.031322,0.000608,case-nonlesional skin Cape Town
900498,-0.033348,-0.048009,-0.030825,case-nonlesional skin Umtata
900276,-0.108307,0.102284,-0.005354,case-lesional skin Umtata


In [7]:
# Feature ordination from the RPCA result: one row per feature of the input
# table, one column per principal axis. Only the first rows are displayed.
fpca_df = ordination.features
fpca_df.head()

Unnamed: 0,PC1,PC2,PC3
GTGCCAGCAGCCGCGGTAATACGTAGGTCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGCAGGCGGTTAGATAAGTCTGAAGTTAAAGGCTG,-0.176233,0.363698,-0.052031
GTGCCAGCAGCCGCGGTAATACGTAGGTGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGCGCGCGTAGGCGGTTTTTTAAGTCTGATGTGAAAGCCCA,0.031293,0.18116,0.635457
GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATAACTGGGCGTAAAGGGCACGCAGGCGGTTATTTAAGTGAGGTGTGAAAGCCCC,-0.137862,0.125219,-0.075513
GTGCCAGCCGCCGCGGTAATACGTAGGTGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGCGCGCGTAGGCGGTTTTTTAAGTCTGATGTGAAAGCCCA,0.018601,0.079648,0.607929
GTGCCAGCCGCCGCGGTAATACGTAGGTCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGCAGGCGGTTAGATAAGTCTGAAGTTAAAGGCTG,-0.200034,0.269834,-0.054126


In [8]:
def permanova_on_case_type_subset(df, dist_matrix, case_type_subset,
                                  permutations=999, verbose=True):
    """
    Perform PERMANOVA on a subset of the data.

    Parameters:
    - df: DataFrame indexed by sample ID; must include the grouping column
      'individual_case_location'
    - dist_matrix: DistanceMatrix object from scikit-bio
    - case_type_subset: list of 'individual_case_location' groups to include in the test
    - permutations: number of permutations used for the p-value (default 999)
    - verbose: if True (default), print the subset and the per-group sample counts

    Returns:
    - PERMANOVA result (dict-like with p-value, test statistic, etc.)

    Raises:
    - ValueError: if fewer than two of the requested groups have samples
    """
    # Subset the DataFrame
    subset_df = df[df['individual_case_location'].isin(case_type_subset)]
    group_counts = subset_df['individual_case_location'].value_counts()
    if verbose:
        print(subset_df)
        print("Group counts:", group_counts)

    # A comparison needs at least two non-empty groups
    if (group_counts > 0).sum() < 2:
        raise ValueError(
            f"PERMANOVA needs at least two groups with samples; "
            f"found {dict(group_counts)} for {list(case_type_subset)}"
        )

    # Subset the distance matrix; strict=False ignores IDs that are not in it
    sub_dm = dist_matrix.filter(subset_df.index, strict=False)

    # Build the grouping vector in the exact order of the filtered matrix IDs.
    # Because IDs may have been dropped above, taking the labels straight from
    # subset_df could differ from the matrix in length or order.
    grouping = subset_df.loc[list(sub_dm.ids), 'individual_case_location']

    # Run PERMANOVA
    result = permanova(sub_dm, grouping=grouping.to_numpy(), permutations=permutations)

    return result

In [9]:
# PERMANOVA between healthy (control) skin and lesional AD skin, run separately
# for each location. Results are stored as pre-formatted strings keyed by a
# human-readable comparison label.

case_type_subsets = [
    ["control-nonlesional skin Cape Town", "case-lesional skin Cape Town"],
    ["control-nonlesional skin Umtata", "case-lesional skin Umtata"]]

# One label per entry of case_type_subsets, in the same order
comparison_labels = [
    "Skin of Healthy vs. AD Children in Cape Town",
    "Skin of Healthy vs. AD Children in Umtata",
]

perma_res = {}

for group_label, case_type_subset in zip(comparison_labels, case_type_subsets):
    print("Subset case_type:", case_type_subset)

    result = permanova_on_case_type_subset(spca_df, distance, case_type_subset)

    # p-value in scientific notation, pseudo-F statistic with two decimals
    perma_res[group_label] = {
        "p": f"{result['p-value']:.2e}",
        "f": f"{result['test statistic']:.2f}",
    }

perma_res

Subset case_type: ['control-nonlesional skin Cape Town', 'case-lesional skin Cape Town']
                PC1       PC2       PC3            individual_case_location
900129     0.061599  0.018120  0.002876        case-lesional skin Cape Town
900581    -0.007051  0.063078  0.054416        case-lesional skin Cape Town
900145     0.115958 -0.038398 -0.038217  control-nonlesional skin Cape Town
900544     0.053323  0.045220  0.041756  control-nonlesional skin Cape Town
900110     0.038821 -0.045479 -0.017278        case-lesional skin Cape Town
...             ...       ...       ...                                 ...
Co005SNNL -0.056991 -0.078442 -0.030159  control-nonlesional skin Cape Town
Ca008HNL  -0.061534 -0.089716  0.146310        case-lesional skin Cape Town
900081    -0.050282  0.029005 -0.022752        case-lesional skin Cape Town
900093     0.036764  0.008035 -0.035655        case-lesional skin Cape Town
900057    -0.037138 -0.021089  0.008198        case-lesional skin Cape Town

{'Skin of Healthy vs. AD Children in Cape Town': {'p': '2.00e-03',
  'f': '6.20'},
 'Skin of Healthy vs. AD Children in Umtata': {'p': '1.00e-03', 'f': '25.72'}}

In [10]:
# Colour for each 'individual_case_location' group, listed as (group, hex colour)
# pairs: controls first, then non-lesional cases, then lesional cases.
# Both non-lesional case groups share the same colour.
palette = dict([
    ('control-nonlesional skin Cape Town', '#7FBCEB'),
    ('control-nonlesional skin Umtata', '#66C2EE'),
    ('case-nonlesional skin Cape Town', '#FAD5A5'),
    ('case-nonlesional skin Umtata', '#FAD5A5'),
    ('case-lesional skin Cape Town', '#cd853f'),
    ('case-lesional skin Umtata', '#fa8072'),
])

In [11]:
# Prepare labels
group_short_labels = {
    "case-lesional skin Umtata": "UM-ADL",
    "case-lesional skin Cape Town": "CT-ADL",
    "case-nonlesional skin Umtata": "UM-ADNL",
    "case-nonlesional skin Cape Town": "CT-ADNL",
    "control-nonlesional skin Umtata": "UM-H",
    "control-nonlesional skin Cape Town": "CT-H"
}

# Add short label column
spca_df["short_label"] = spca_df["individual_case_location"].map(group_short_labels)

# Create label map with (n=#): short label -> "short label (n=count)"
label_map = spca_df["short_label"].value_counts().to_dict()
label_map = {k: f"{k} (n={v})" for k, v in label_map.items()}
spca_df["group_label"] = spca_df["short_label"].map(label_map)

# Full group name -> legend label, only for groups that actually have samples
legend_labels = {
    group: label_map[short]
    for group, short in group_short_labels.items()
    if short in label_map
}

# Define PC variance (percent explained by each axis)
pc1_var = ordination.proportion_explained['PC1'] * 100
pc2_var = ordination.proportion_explained['PC2'] * 100


def _plot_site_rpca(samples, groups, legend_labels, colors, pc_var,
                    title, stats, text_xy, out_paths):
    """
    Draw the PC1/PC2 RPCA scatter for one location and save it.

    Parameters:
    - samples: DataFrame with columns 'PC1', 'PC2', 'individual_case_location'
      and 'group_label'
    - groups: group names to draw, in legend order
    - legend_labels: dict mapping group name -> legend label
    - colors: dict mapping group name -> colour
    - pc_var: (PC1 percent, PC2 percent) shown in the axis labels
    - title: figure title
    - stats: dict with string entries 'p' and 'f' (PERMANOVA result to annotate)
    - text_xy: (x, y) data coordinates of the statistics annotation
    - out_paths: file paths the figure is saved to (dpi=600)

    Returns:
    - (fig, ax)
    """
    # Groups without any sample have no legend label and nothing to draw
    present = [g for g in groups if g in legend_labels]

    fig, ax = plt.subplots(figsize=(5.5, 5.5))
    sns.scatterplot(
        data=samples,
        x="PC1", y="PC2",
        hue="group_label",
        hue_order=[legend_labels[g] for g in present],
        s=50, edgecolor="black", linewidth=0.5,
        palette={legend_labels[g]: colors[g] for g in present},
        ax=ax
    )

    # Shaded circle per group: centred on the group centroid, radius = 90th
    # percentile of the sample distances to that centroid. This is a visual
    # guide to group spread, not a statistical confidence ellipse.
    for group in present:
        pts = samples.loc[samples["individual_case_location"] == group, ["PC1", "PC2"]].values
        center = pts.mean(axis=0)
        radius = np.percentile(np.linalg.norm(pts - center, axis=1), 90)
        ax.add_patch(Circle(center, radius, edgecolor=colors[group], facecolor=colors[group],
                            alpha=0.2, lw=1, zorder=0))

    # Same axis labels and tick positions for every location, so panels are comparable
    ax.set_xlabel(f"RPCA PC1 ({pc_var[0]:.1f}%)", fontsize=16)
    ax.set_ylabel(f"RPCA PC2 ({pc_var[1]:.1f}%)", fontsize=16)
    ax.set_xticks([-0.2, -0.1, 0.0, 0.1, 0.2, 0.3])
    ax.set_yticks([-0.2, -0.1, 0.0, 0.1, 0.2, 0.3])
    ax.tick_params(labelsize=12)
    ax.legend(frameon=False, fontsize=12, loc='upper right')
    ax.set_title(title, fontsize=20)

    # PERMANOVA annotation; the stored values are strings, hence float()
    ax.text(
        text_xy[0], text_xy[1],
        f"p={float(stats['p']):.3f}, "
        f"F={float(stats['f']):.2f}",
        fontsize=12
    )

    fig.tight_layout()
    for path in out_paths:
        fig.savefig(path, dpi=600)

    return fig, ax


# ===== CAPE TOWN FIGURE =====
ct_full_groups = ["control-nonlesional skin Cape Town", "case-nonlesional skin Cape Town", "case-lesional skin Cape Town"]
ct_subset = spca_df[spca_df["individual_case_location"].isin(ct_full_groups)].copy()

fig_ct, ax_ct = _plot_site_rpca(
    ct_subset, ct_full_groups, legend_labels, palette, (pc1_var, pc2_var),
    title="Cape Town (urban)",
    stats=perma_res['Skin of Healthy vs. AD Children in Cape Town'],
    text_xy=(0.04, -0.17),
    out_paths=[
        "../Figures/Supplementary/Suppl_Fig_3D.png",
        "../Figures/Main/Fig_2G.png",
    ],
)

# ===== UMTATA FIGURE =====
um_full_groups = ["control-nonlesional skin Umtata", "case-nonlesional skin Umtata", "case-lesional skin Umtata"]
um_subset = spca_df[spca_df["individual_case_location"].isin(um_full_groups)].copy()

# The main-figure copy ("../Figures/Main/Fig_2E.png") was disabled in the
# original cell; only the supplementary file is written.
fig_um, ax_um = _plot_site_rpca(
    um_subset, um_full_groups, legend_labels, palette, (pc1_var, pc2_var),
    title="Umtata (rural)",
    stats=perma_res['Skin of Healthy vs. AD Children in Umtata'],
    text_xy=(0.02, -0.22),
    out_paths=["../Figures/Supplementary/Suppl_Fig_3B.png"],
)



In [12]:
# Filter only ADL (lesional AD skin) samples from both regions
adl_groups = [
    "case-lesional skin Cape Town",
    "case-lesional skin Umtata"
]
adl_subset = spca_df[spca_df["individual_case_location"].isin(adl_groups)].copy()

# Prepare labels (same mapping as used for the per-location figures)
group_short_labels = {
    "case-lesional skin Umtata": "UM-ADL",
    "case-lesional skin Cape Town": "CT-ADL",
    "case-nonlesional skin Umtata": "UM-ADNL",
    "case-nonlesional skin Cape Town": "CT-ADNL",
    "control-nonlesional skin Umtata": "UM-H",
    "control-nonlesional skin Cape Town": "CT-H"
}

# Legend labels with sample counts, computed on the ADL subset only.
# Note: this rebinds `label_map` to the ADL-only mapping.
adl_subset["short_label"] = adl_subset["individual_case_location"].map(group_short_labels)
label_map = adl_subset["short_label"].value_counts().to_dict()
print(label_map)
label_map = {k: f"{k} (n={v})" for k, v in label_map.items()}
adl_subset["group_label"] = adl_subset["short_label"].map(label_map)

# Define PC variance (percent explained by each axis)
pc1_var = ordination.proportion_explained['PC1'] * 100
pc2_var = ordination.proportion_explained['PC2'] * 100

# Palette: take the lesional colours from the shared `palette`, so this figure
# really uses the same colours as the per-location figures. The previous
# hard-coded hex values were commented as "same as" those colours but differed.
adl_palette = {group: palette[group] for group in adl_groups}

# Only groups that have samples get a legend entry and a circle
adl_present = [g for g in adl_groups if group_short_labels[g] in label_map]
plot_palette = {
    label_map[group_short_labels[g]]: adl_palette[g] for g in adl_present
}

# Plot
fig, ax = plt.subplots(figsize=(5.5, 5.5))
sns.scatterplot(
    data=adl_subset,
    x="PC1", y="PC2",
    hue="group_label",
    hue_order=[label_map[group_short_labels[g]] for g in adl_present],
    s=50, edgecolor="black", linewidth=0.5,
    palette=plot_palette,
    ax=ax
)

# Shaded circle per group: centred on the group centroid, radius = 90th
# percentile of the sample distances to that centroid (a visual guide to the
# spread, not a statistical confidence ellipse)
for group in adl_present:
    group_pts = adl_subset.loc[adl_subset["individual_case_location"] == group, ["PC1", "PC2"]].values
    color = adl_palette[group]
    center = group_pts.mean(axis=0)
    radius = np.percentile(np.linalg.norm(group_pts - center, axis=1), 90)
    ax.add_patch(Circle(center, radius, edgecolor=color, facecolor=color, alpha=0.2, lw=1, zorder=0))

# Axis formatting
ax.set_xlabel(f"RPCA PC1 ({pc1_var:.1f}%)", fontsize=16)
ax.set_ylabel(f"RPCA PC2 ({pc2_var:.1f}%)", fontsize=16)
ax.set_xticks([-0.2, -0.1, 0.0, 0.1, 0.2, 0.3])
ax.set_yticks([-0.2, -0.1, 0.0, 0.1, 0.2, 0.3])
ax.tick_params(labelsize=12)
ax.legend(frameon=False, fontsize=12, loc='upper right')
ax.set_title("Lesional Skin: Cape Town vs Umtata", fontsize=18)

# Save through the figure object so the target does not depend on pyplot's current figure
fig.tight_layout()
fig.savefig("../Figures/Main/Fig_ADL_CapeTown_vs_Umtata.png", dpi=600)


{'UM-ADL': 61, 'CT-ADL': 46}
