# Beta Diversity (RPCA)

In [652]:
# import Python packages
import pandas as pd
import numpy as np
from numpy import array
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.gridspec import GridSpec
import scipy
import scipy.stats as ss
from skbio.stats.distance import permanova
import biom
from biom import load_table
from biom.util import biom_open
from gemelli.rpca import rpca
from matplotlib.patches import Circle
from matplotlib.colors import to_hex
import statsmodels.api as sm
### See RPCA standalone python tutorial: 
# https://github.com/biocore/gemelli/blob/master/ipynb/tutorials/RPCA-moving-pictures-standalone-cli-and-api.ipynb

In [653]:
# Load a BIOM table and its metadata, sort features by total count, drop nares
# samples, and keep only healthy samples plus AD samples with SCORAD <= 50
def load_biom_table(biom_path, metadata_path):
    """
    Load a BIOM feature table and its sample metadata, then filter both.

    Steps, in order:
      1. Read the BIOM table into a features x samples DataFrame.
      2. Sort features (rows) by their total count, largest first.
      3. Rename a feature labelled ' g__' to ' g__Unknown'.
      4. Strip a leading '15564.' from the sample IDs.
      5. Read the tab-separated metadata, remove underscores from '#sample-id'
         and use it as the index.
      6. Drop samples whose 'case_type' ends with 'nares'.
      7. Add 'individual_case_location' ('case_type' + ' ' + 'area').
      8. Keep samples whose 'o_scorad' is missing/non-numeric or <= 50.

    Parameters:
    - biom_path: path to the BIOM file
    - metadata_path: path to the tab-separated metadata file; must contain the
      columns '#sample-id', 'case_type', 'area' and 'o_scorad'

    Returns:
    - (df, metadata_sub): the filtered feature table and the metadata rows for
      exactly the same samples, in the same order as the table's columns

    Raises:
    - KeyError if a sample in the BIOM table has no row in the metadata
    """
    # Load BIOM table and convert to a DataFrame
    table = biom.load_table(biom_path)
    df = pd.DataFrame(table.matrix_data.toarray(),
                      index=table.ids(axis='observation'),
                      columns=table.ids(axis='sample'))

    # Sort rows by row sum in descending order (temporary helper column)
    df['row_sum'] = df.sum(axis=1)
    df = df.sort_values(by='row_sum', ascending=False)
    df = df.drop(columns=['row_sum'])

    # Replace ' g__' rows with ' g__Unknown'
    df.index = df.index.map(lambda x: ' g__Unknown' if x == ' g__' else x)

    # Remove the '15564.' prefix from the sample IDs. The pattern is anchored and
    # the dot escaped so the result does not depend on the pandas version's
    # default for `regex`, and so only a literal leading prefix is removed.
    df.columns = df.columns.str.replace(r'^15564\.', '', regex=True)

    # Load metadata and normalise the sample IDs (literal underscore removal)
    metadata = pd.read_csv(metadata_path, sep='\t')
    metadata['#sample-id'] = metadata['#sample-id'].str.replace('_', '', regex=False)

    # Set Sample-ID as index
    metadata = metadata.set_index('#sample-id')

    # Subset metadata to only samples in the BIOM table (KeyError if one is missing)
    metadata_sub = metadata.loc[df.columns]

    # Get a list of the nares samples; a missing case_type counts as "not nares"
    # instead of producing a NaN in the boolean mask
    is_nares = metadata_sub['case_type'].str.endswith('nares', na=False)
    nares_samples = metadata_sub[is_nares].index.tolist()

    # Drop nares samples from BIOM table
    df = df.drop(columns=nares_samples)

    # Compute individual_case_location
    metadata['individual_case_location'] = metadata['case_type'] + ' ' + metadata['area']

    # Convert SCORAD to numeric; unparseable values become NaN
    metadata['o_scorad'] = pd.to_numeric(metadata['o_scorad'], errors='coerce')

    # Keep healthy samples (NaN SCORAD) and AD samples with SCORAD <= 50
    metadata = metadata[(metadata['o_scorad'].isna()) | (metadata['o_scorad'] <= 50)]

    # Subset both df and metadata to overlapping samples
    shared_samples = df.columns.intersection(metadata.index)
    df = df[shared_samples]
    metadata_sub = metadata.loc[shared_samples]

    return df, metadata_sub


In [654]:
# Turn the skin-only feature DataFrame back into a BIOM table and save it for RPCA
def convert_df_to_biom(table, biom_output_file):
    """
    Build a BIOM table from a DataFrame and write it to disk as HDF5.

    Parameters:
    - table: DataFrame whose index holds the observation IDs and whose columns
      hold the sample IDs
    - biom_output_file: path of the BIOM file to write

    Returns:
    - the in-memory BIOM table that was written
    """
    biom_table = biom.table.Table(
        table.values,
        observation_ids=table.index,
        sample_ids=table.columns,
    )

    # Writing happens here; the file is closed when the block exits
    with biom_open(biom_output_file, 'w') as handle:
        biom_table.to_hdf5(handle, generated_by="subsetted tables")

    return biom_table

In [655]:
# Input locations: the sample metadata (tab-separated) and the feature table (BIOM)
metadata_path = '../Data/Metadata/updated_clean_ant_skin_metadata_microbiome_type-location.tab'
biom_path = "../Data/Tables/Absolute_Abundance_Tables/209766_filtered_feature_table.biom"

# Load and filter both inputs; the table and metadata come back with matching samples
skin_only_table, metadata_sub = load_biom_table(
    biom_path=biom_path,
    metadata_path=metadata_path,
)

skin_only_table

Unnamed: 0,900344,900221,900570,900129,900321,900245,900423,900145,900364,900329,...,900279,900093,900094,900287,900225,900057,900294,900097,900498,900406
GTGCCAGCAGCCGCGGTAATACGTAGGTCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGCAGGCGGTTAGATAAGTCTGAAGTTAAAGGCTG,984.0,22.0,389.0,0.0,26.0,40.0,0.0,0.0,100.0,91.0,...,133.0,18.0,0.0,57.0,44.0,111.0,12.0,24.0,15.0,0.0
GTGCCAGCAGCCGCGGTAATACGTAGGTGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGCGCGCGTAGGCGGTTTTTTAAGTCTGATGTGAAAGCCCA,0.0,0.0,1294.0,182.0,9.0,159.0,49.0,27.0,159.0,4953.0,...,908.0,0.0,0.0,108.0,30.0,93.0,13.0,171.0,28.0,12.0
GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATAACTGGGCGTAAAGGGCACGCAGGCGGTTATTTAAGTGAGGTGTGAAAGCCCC,0.0,2.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,17.0,...,53.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
GTGCCAGCCGCCGCGGTAATACGTAGGTGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGCGCGCGTAGGCGGTTTTTTAAGTCTGATGTGAAAGCCCA,0.0,0.0,236.0,0.0,0.0,0.0,57.0,37.0,0.0,1560.0,...,499.0,0.0,0.0,0.0,4.0,116.0,0.0,94.0,11.0,7.0
GTGCCAGCCGCCGCGGTAATACGTAGGTCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGCAGGCGGTTAGATAAGTCTGAAGTTAAAGGCTG,611.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,71.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,17.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTGCCAGCAGCCGCGGTAATACGGAGGGTCCGAGCGTTATCCGGAATCATTGGGTTTAAAGGGTCCGTAGGCGGCTTTATAAGTCAGTGGTGAAATCCGG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
GTGCCAGCAGCCGCGGTAATACGTAGGGTGCAAGCGTTGTCCGGAATTATTGGGCGTAAAGAGCTCGTAGGCGGTTTGTCGCGTCTGCTGTGAAATTCCG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
GTGCCAGCCGCCGCGGTAATACGTAGGGCGCAAGCGTTGTCCGGAATTATTGGGCGTAAAGAGCTCGTAGGCGGTTTGTCGCGTCTGGTGTGAAAACTCG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
GTGCCAGAGCCGCGGTAATACGTAGGTCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGCAGGCGGTTAGATAAGTCTGAAGTTAAAGGCTGT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [656]:
# Display the metadata rows returned by load_biom_table (indexed by sample ID)
metadata_sub

Unnamed: 0.1,Unnamed: 0,PlateNumber,PlateLocation,i5,i5Sequence,i7,i7Sequence,identifier,Sequence,Plate ID,...,enrolment_season,hiv_exposure,hiv_status,household_size,o_scorad,FWD_filepath,REV_filepath,o_scorad_adj,individual_case,individual_case_location
900344,314,4,C10,SA503,TAGCGAGT,SA710,GTCTGCTA,SA710SA503,GTCTGCTA-TAGCGAGT,1.010000e+21,...,Spring,,,7.0,,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,0,H skin,control-nonlesional skin Umtata
900221,1,1,B1,SA502,ACTATCTG,SA701,CGAGAGTT,SA701SA502,CGAGAGTT-ACTATCTG,1.010000e+21,...,Winter,Unexposed,negative,7.0,34.0,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,34,AD skin,case-lesional skin Umtata
900570,301,4,F8,SA506,CGTGAGTG,SA708,CGAGCGAC,SA708SA506,CGAGCGAC-CGTGAGTG,1.010000e+21,...,Autumn,Unexposed,negative,4.0,36.0,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,36,AD skin,case-nonlesional skin Cape Town
900129,250,4,A1,SA501,ATCGTACG,SA701,CGAGAGTT,SA701SA501,CGAGAGTT-ATCGTACG,1.010000e+21,...,Winter,Unexposed,negative,6.0,44.0,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,44,AD skin,case-lesional skin Cape Town
900321,273,4,B4,SA502,ACTATCTG,SA704,ACTCACTG,SA704SA502,ACTCACTG-ACTATCTG,1.010000e+21,...,Spring,,,5.0,,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,0,H skin,control-nonlesional skin Umtata
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
900057,6,1,G1,SA507,GGATATCT,SA701,CGAGAGTT,SA701SA507,CGAGAGTT-GGATATCT,1.010000e+21,...,Autumn,Unexposed,negative,9.0,33.0,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,33,AD skin,case-lesional skin Cape Town
900294,134,2,A8,SB501,CTACTATA,SB708,ATAGTACC,SB708SB501,ATAGTACC-CTACTATA,1.010000e+21,...,Winter,Unexposed,negative,4.0,23.0,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,23,AD skin,case-nonlesional skin Umtata
900097,141,2,H8,SB508,GTCAGATA,SB708,ATAGTACC,SB708SB508,ATAGTACC-GTCAGATA,1.010000e+21,...,Winter,Unexposed,negative,6.0,44.0,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,44,AD skin,case-nonlesional skin Cape Town
900498,464,6,H11,SA508,GACACCGT,SB711,AACGCTGA,SB711SA508,AACGCTGA-GACACCGT,1.010000e+21,...,,,,,,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,0,AD skin,case-nonlesional skin Umtata


In [657]:
# Derive the output path from the input BIOM path: swap the '.biom' suffix for '_skin_only.biom'
skin_only_biom_path = f"{biom_path.removesuffix('.biom')}_skin_only.biom"

# Echo the derived path
print(f'{skin_only_biom_path = }')

# Write the filtered table to that path and keep the in-memory BIOM table for RPCA
biom_tbl = convert_df_to_biom(
    table=skin_only_table,
    biom_output_file=skin_only_biom_path,
)

skin_only_biom_path = '../Data/Tables/Absolute_Abundance_Tables/209766_filtered_feature_table_skin_only.biom'


In [658]:
# Silence NumPy divide warnings (this setting stays in effect for later cells)
np.seterr(divide='ignore')

# Run RPCA on the skin-only BIOM table with gemelli's default settings
ordination, distance = rpca(biom_tbl)

# Sample ordination, with each sample's case type/location joined on from the
# metadata by sample ID
spca_df = ordination.samples.join(metadata_sub['individual_case_location'])

spca_df

Unnamed: 0,PC1,PC2,PC3,individual_case_location
900344,-0.042492,-0.030903,0.047629,control-nonlesional skin Umtata
900221,0.044481,0.009170,0.111885,case-lesional skin Umtata
900570,0.047257,-0.107631,-0.033293,case-nonlesional skin Cape Town
900129,0.065621,-0.016664,-0.012688,case-lesional skin Cape Town
900321,-0.002474,0.003395,0.042491,control-nonlesional skin Umtata
...,...,...,...,...
900057,-0.040740,0.021282,-0.012640,case-lesional skin Cape Town
900294,-0.036117,0.090884,0.039716,case-nonlesional skin Umtata
900097,0.027780,0.033129,-0.005649,case-nonlesional skin Cape Town
900498,-0.034955,0.050211,0.028312,case-nonlesional skin Umtata


In [659]:
# Pull the feature-level ordination out of the RPCA result and preview its first rows
fpca_df = ordination.features
fpca_df.head()

Unnamed: 0,PC1,PC2,PC3
GTGCCAGCAGCCGCGGTAATACGTAGGTCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGCAGGCGGTTAGATAAGTCTGAAGTTAAAGGCTG,-0.180829,-0.367289,0.036368
GTGCCAGCAGCCGCGGTAATACGTAGGTGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGCGCGCGTAGGCGGTTTTTTAAGTCTGATGTGAAAGCCCA,0.021655,-0.172791,-0.630078
GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATAACTGGGCGTAAAGGGCACGCAGGCGGTTATTTAAGTGAGGTGTGAAAGCCCC,-0.13994,-0.13127,0.078579
GTGCCAGCCGCCGCGGTAATACGTAGGTGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGCGCGCGTAGGCGGTTTTTTAAGTCTGATGTGAAAGCCCA,0.00323,-0.066083,-0.594591
GTGCCAGCCGCCGCGGTAATACGTAGGTCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGCAGGCGGTTAGATAAGTCTGAAGTTAAAGGCTG,-0.206663,-0.26379,0.052879


In [660]:
def permanova_on_case_type_subset(df, dist_matrix, case_type_subset, permutations=999):
    """
    Perform PERMANOVA on a subset of the data.

    Samples are kept only if their 'individual_case_location' is listed in
    `case_type_subset` AND they are present in `dist_matrix`, so the grouping
    and the filtered distance matrix always describe the same samples.

    Parameters:
    - df: DataFrame indexed by sample ID; must include the
      'individual_case_location' grouping column
    - dist_matrix: DistanceMatrix object from scikit-bio
    - case_type_subset: list of 'individual_case_location' groups to include
    - permutations: number of permutations used for the p-value (default 999)

    Returns:
    - PERMANOVA result (dict-like with p-value, test statistic, etc.)
    """
    group_col = 'individual_case_location'

    # Subset the DataFrame to the requested groups
    subset_df = df[df[group_col].isin(case_type_subset)]

    # Drop samples that have no entry in the distance matrix. Without this,
    # filter(strict=False) would silently skip them while the grouping still
    # contained them, and the two would no longer line up.
    subset_df = subset_df[subset_df.index.isin(dist_matrix.ids)]

    print(subset_df)
    group_counts = subset_df[group_col].value_counts()
    print("Group counts:", group_counts)

    # Subset the distance matrix to the retained sample IDs
    ids = subset_df.index
    sub_dm = dist_matrix.filter(ids, strict=False)

    # Pass the frame plus column name so scikit-bio matches labels to samples by ID
    result = permanova(sub_dm, grouping=subset_df, column=group_col,
                       permutations=permutations)

    return result

In [661]:
# PERMANOVA of healthy control skin vs. AD lesional skin, run separately for each location

case_type_subsets = [
    ["control-nonlesional skin Cape Town", "case-lesional skin Cape Town"],
    ["control-nonlesional skin Umtata", "case-lesional skin Umtata"]]

# Result label for each comparison, in the same order as case_type_subsets
group_labels = [
    "Skin of Healthy vs. AD Children in Cape Town",
    "Skin of Healthy vs. AD Children in Umtata",
]

perma_res = {}

for group_label, case_type_subset in zip(group_labels, case_type_subsets):
    print("Subset case_type:", case_type_subset)

    result = permanova_on_case_type_subset(spca_df, distance, case_type_subset)

    # Store the statistics as formatted strings: p in scientific notation, F to 2 decimals
    perma_res[group_label] = {
        "p": format(result["p-value"], ".2e"),
        "f": format(result["test statistic"], ".2f"),
    }

perma_res

Subset case_type: ['control-nonlesional skin Cape Town', 'case-lesional skin Cape Town']
                PC1       PC2       PC3            individual_case_location
900129     0.065621 -0.016664 -0.012688        case-lesional skin Cape Town
900145     0.109621  0.046086  0.016735  control-nonlesional skin Cape Town
900544     0.060183 -0.041247 -0.058251  control-nonlesional skin Cape Town
900110     0.041713  0.038921  0.001057        case-lesional skin Cape Town
900600     0.017252  0.069338  0.043534  control-nonlesional skin Cape Town
...             ...       ...       ...                                 ...
Co005SNNL -0.066133  0.084223  0.032097  control-nonlesional skin Cape Town
Ca008HNL  -0.071927  0.105776 -0.157505        case-lesional skin Cape Town
900081    -0.056060 -0.031927  0.016984        case-lesional skin Cape Town
900093     0.044290 -0.009791  0.037301        case-lesional skin Cape Town
900057    -0.040740  0.021282 -0.012640        case-lesional skin Cape Town

{'Skin of Healthy vs. AD Children in Cape Town': {'p': '2.00e-03',
  'f': '6.71'},
 'Skin of Healthy vs. AD Children in Umtata': {'p': '1.00e-03', 'f': '23.57'}}

In [662]:
# Colour assigned to each group; insertion order is controls first, then cases
palette = dict([
    ('control-nonlesional skin Cape Town', '#A7C7E7'),
    ('control-nonlesional skin Umtata', '#ADD8E6'),
    ('case-lesional skin Cape Town', '#d2b48c'),
    ('case-lesional skin Umtata', '#fa8072'),
])

In [663]:
# Create label map with (n=#): group name -> "group name (n=<sample count>)"
group_sizes = spca_df["individual_case_location"].value_counts().to_dict()
label_map = {k: f"{k} (n={v})" for k, v in group_sizes.items()}

# Add labeled group column (used as the legend text)
spca_df["group_label"] = spca_df["individual_case_location"].map(label_map)

# Create figure: one panel per location, sharing both axes so they are comparable
fig, axes = plt.subplots(1, 2, figsize=(8, 5.5), sharex=True, sharey=True)

# Define groups per panel
split_map = {
    "Cape Town": ["control-nonlesional skin Cape Town", "case-lesional skin Cape Town"],
    "Umtata": ["control-nonlesional skin Umtata", "case-lesional skin Umtata"]
}

# Plot loop
for ax, (title, groups) in zip(axes, split_map.items()):
    # The copy already carries the group_label column added above
    subset_df = spca_df[spca_df["individual_case_location"].isin(groups)].copy()

    ax.grid(False)

    sns.scatterplot(
        data=subset_df,
        x="PC1",
        y="PC2",
        hue="group_label",
        hue_order=[label_map[g] for g in groups],
        s=50,
        edgecolor="black",
        linewidth=0.5,
        palette={label_map[g]: palette[g] for g in groups},
        ax=ax
    )

    # Shade, per group, a circle centred on the group's PC1/PC2 centroid whose
    # radius is the 90th percentile of the points' distances to that centroid.
    # This is a descriptive spread region, not a statistical confidence interval.
    for case_type, case_type_df in subset_df.groupby("individual_case_location"):
        color = palette[case_type]
        points = case_type_df[["PC1", "PC2"]].values
        centroid = points.mean(axis=0)
        dists = np.linalg.norm(points - centroid, axis=1)
        radius = np.percentile(dists, 90)
        circle = Circle(centroid, radius, edgecolor=color, facecolor=color, alpha=0.2, lw=1, zorder=0)
        ax.add_patch(circle)

    ax.set_title(title, fontsize=16)
    ax.set_xlabel("")
    ax.set_ylabel("")
    ax.legend(frameon=False, fontsize=9, loc='upper right')

# === ADD % VARIANCE EXPLAINED TO AXIS LABELS ===
pc1_var = ordination.proportion_explained['PC1'] * 100
pc2_var = ordination.proportion_explained['PC2'] * 100

# Same fixed ticks on both axes of both panels
tick_values = [-0.2, -0.1, 0.0, 0.1, 0.2, 0.3]

# The y-axis is shared, so only the left panel gets a y label
axes[0].set_ylabel(f"PC2 ({pc2_var:.1f}%)", fontsize=14)
for ax in axes:
    ax.set_xlabel(f"PC1 ({pc1_var:.1f}%)", fontsize=14)
    ax.set_xticks(tick_values)
    ax.set_xticklabels(tick_values, fontsize=12)
    ax.set_yticks(tick_values)
    ax.set_yticklabels(tick_values, fontsize=12)

# PERMANOVA stats: perma_res stores formatted strings, so convert back to float
# before re-formatting for the annotation (placed in data coordinates)
for ax, location in zip(axes, split_map):
    stats = perma_res[f"Skin of Healthy vs. AD Children in {location}"]
    ax.text(
        0.04, -0.2,
        f"p={float(stats['p']):.3f}, "
        f"F={float(stats['f']):.2f}",
        fontsize=12
    )

# Final touches. The title states "≤ 50" because the loading step keeps AD
# samples with SCORAD <= 50 (a score of exactly 50 is included).
fig.suptitle("Beta Diversity by Region (SCORAD ≤ 50)", fontsize=18, y=0.93)
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.savefig("../Figures/Main/Fig_3C.png", dpi=600)
