In [881]:
# import Python packages
import pandas as pd
import numpy as np
from numpy import array
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.gridspec import GridSpec
import scipy
import scipy.stats as ss
from skbio.stats.distance import permanova
import biom
from biom import load_table
from biom.util import biom_open
from gemelli.rpca import rpca

### See RPCA standalone python tutorial: 
# https://github.com/biocore/gemelli/blob/master/ipynb/tutorials/RPCA-moving-pictures-standalone-cli-and-api.ipynb

In [882]:
# # read in biom table
# biom_tbl = biom.load_table("/Users/annanguyen/16S_AD_South-Africa/Data/Tables/Relative_Abundance_Tables/df_16S_filtered_feature_table_rare_Genus_relative_abundance.biom")

# print(biom_tbl.ids(axis= 'sample'))
# len(biom_tbl.ids(axis= 'sample'))
# type(biom_tbl.ids(axis = 'sample'))

def load_biom_table(biom_path, metadata_path):
    """Load a BIOM feature table with its metadata and drop all nares samples.

    Steps performed:
      1. Read the BIOM file into a DataFrame (rows: observations, columns: samples).
      2. Order the rows by their total across samples, largest first.
      3. Strip the literal '15564.' prefix from the sample IDs.
      4. Read the tab-separated metadata, remove underscores from '#sample-id'
         and use that column as the index.
      5. Drop every sample whose 'case_type' ends with 'nares'.

    The values are returned exactly as stored in the BIOM file: no taxonomic
    collapsing and no relative-abundance normalisation is done here.

    Parameters
    ----------
    biom_path : str
        Path to a BIOM file readable by ``biom.load_table``.
    metadata_path : str
        Path to a tab-separated metadata file with at least the columns
        '#sample-id' and 'case_type'.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The filtered feature table and the metadata rows for the retained
        samples, in the same sample order as the table columns.

    Raises
    ------
    KeyError
        If a sample in the BIOM table has no matching row in the metadata.
    """
    # Load BIOM table and convert to a DataFrame
    table = biom.load_table(biom_path)
    df = pd.DataFrame(
        table.matrix_data.toarray(),
        index=table.ids(axis='observation'),
        columns=table.ids(axis='sample'),
    )

    # Sort rows by row sum in descending order; the helper column is removed
    # again straight away so it never reaches the returned table.
    df['row_sum'] = df.sum(axis=1)
    df = df.sort_values(by='row_sum', ascending=False).drop(columns=['row_sum'])

    # Remove the '15564.' prefix from the sample IDs. regex=False makes the
    # match literal on every pandas version: before pandas 2.0 the default was
    # regex=True, where '.' matches any character.
    df.columns = df.columns.str.replace('15564.', '', regex=False)

    # Load metadata and normalise its sample IDs to the BIOM naming
    metadata = pd.read_csv(metadata_path, sep='\t')
    metadata['#sample-id'] = metadata['#sample-id'].str.replace('_', '', regex=False)
    metadata = metadata.set_index('#sample-id')

    # Restrict metadata to the samples present in the BIOM table
    metadata_sub = metadata.loc[df.columns]

    # Sample IDs whose case_type ends with 'nares'
    nares_ids = metadata_sub[metadata_sub['case_type'].str.endswith('nares')].index.tolist()

    # Drop the nares samples, then re-subset the metadata to what is left
    df = df.drop(columns=nares_ids)
    metadata_sub = metadata.loc[df.columns]

    return df, metadata_sub


def convert_df_to_biom(table, biom_output_file):
    """Turn a DataFrame into a BIOM table, save it as HDF5 and return it.

    The DataFrame index supplies the observation IDs and the DataFrame columns
    supply the sample IDs. The table is written to ``biom_output_file`` before
    being returned, so the caller gets both the on-disk file and the object.
    """
    biom_table = biom.table.Table(
        table.values,
        observation_ids=table.index,
        sample_ids=table.columns,
    )

    # Write the table to disk in HDF5 format
    with biom_open(biom_output_file, 'w') as handle:
        biom_table.to_hdf5(handle, generated_by="subsetted tables")

    return biom_table



In [883]:
# Paths (relative to the notebook) to the BIOM feature table and the sample metadata.
# Alternative input kept for reference (genus-level relative-abundance table):
# biom_path = "../Data/Tables/Relative_Abundance_Tables/df_16S_filtered_feature_table_rare_Genus_relative_abundance.biom"
biom_path = "../Data/Tables/Absolute_Abundance_Tables/209766_filtered_feature_table.biom"
metadata_path = '../Data/Metadata/updated_clean_ant_skin_metadata.tab'
# Load the table and its metadata; load_biom_table drops every sample whose case_type ends with 'nares'.
skin_only_table, metadata_sub = load_biom_table(biom_path, metadata_path)

In [884]:
# Preview the filtered feature table (rows: observations, columns: samples).
skin_only_table

Unnamed: 0,900344,900221,900570,900129,900321,900091,900245,900423,900581,900145,...,900094,900287,900225,900057,900294,9003972,900097,900498,900276,900406
GTGCCAGCAGCCGCGGTAATACGTAGGTCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGCAGGCGGTTAGATAAGTCTGAAGTTAAAGGCTG,984.0,22.0,389.0,0.0,26.0,27.0,40.0,0.0,198.0,0.0,...,0.0,57.0,44.0,111.0,12.0,1168.0,24.0,15.0,0.0,0.0
GTGCCAGCAGCCGCGGTAATACGTAGGTGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGCGCGCGTAGGCGGTTTTTTAAGTCTGATGTGAAAGCCCA,0.0,0.0,1294.0,182.0,9.0,296.0,159.0,49.0,1366.0,27.0,...,0.0,108.0,30.0,93.0,13.0,794.0,171.0,28.0,0.0,12.0
GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATAACTGGGCGTAAAGGGCACGCAGGCGGTTATTTAAGTGAGGTGTGAAAGCCCC,0.0,2.0,9.0,0.0,0.0,0.0,0.0,0.0,22.0,0.0,...,0.0,2.0,0.0,0.0,0.0,74.0,0.0,0.0,0.0,0.0
GTGCCAGCCGCCGCGGTAATACGTAGGTGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGCGCGCGTAGGCGGTTTTTTAAGTCTGATGTGAAAGCCCA,0.0,0.0,236.0,0.0,0.0,0.0,0.0,57.0,799.0,37.0,...,0.0,0.0,4.0,116.0,0.0,477.0,94.0,11.0,0.0,7.0
GTGCCAGCCGCCGCGGTAATACGTAGGTCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGCAGGCGGTTAGATAAGTCTGAAGTTAAAGGCTG,611.0,0.0,0.0,0.0,0.0,16.0,0.0,0.0,0.0,6.0,...,0.0,0.0,0.0,0.0,7.0,593.0,0.0,17.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTGCCAGCAGCCGCGGTAATACGGAGGGTCCGAGCGTTATCCGGAATCATTGGGTTTAAAGGGTCCGTAGGCGGCTTTATAAGTCAGTGGTGAAATCCGG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
GTGCCAGCAGCCGCGGTAATACGTAGGGTGCAAGCGTTGTCCGGAATTATTGGGCGTAAAGAGCTCGTAGGCGGTTTGTCGCGTCTGCTGTGAAATTCCG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
GTGCCAGCCGCCGCGGTAATACGTAGGGCGCAAGCGTTGTCCGGAATTATTGGGCGTAAAGAGCTCGTAGGCGGTTTGTCGCGTCTGGTGTGAAAACTCG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
GTGCCAGAGCCGCGGTAATACGTAGGTCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGCAGGCGGTTAGATAAGTCTGAAGTTAAAGGCTGT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [885]:
# Filter metadata to only AD cases
# ad_metadata = metadata_sub[metadata_sub['case_type'].str.startswith('case')]

# Subset the table to only include AD samples
# skin_only_table = skin_only_table[ad_metadata.index]
# skin_only_table

In [886]:
# Number of retained samples for each value of the 'area' metadata column.
metadata_sub['area'].value_counts()

area
Umtata       174
Cape Town    131
Name: count, dtype: int64

In [887]:
# Colour for each area code; the keys are the abbreviated 'area' values used as the plot hue.
palette = dict(
    CP="salmon",
    UT="#d2b48c",  # tan
)


In [888]:
# Output path for the filtered table: the input path with '.biom' swapped for '_skin_only.biom'
skin_only_biom_path = f"{biom_path.removesuffix('.biom')}_skin_only.biom"

# Show the derived path
print(f'{skin_only_biom_path = }')

# Build a BIOM table from the DataFrame; convert_df_to_biom also writes it to the path above
biom_tbl = convert_df_to_biom(skin_only_table, skin_only_biom_path)

skin_only_biom_path = '../Data/Tables/Absolute_Abundance_Tables/209766_filtered_feature_table_skin_only.biom'


## Run RPCA

In [889]:
# Perform RPCA with auto rank estimation.
# Divide-by-zero floating-point warnings are suppressed only while rpca runs
# (np.errstate restores the previous settings on exit) instead of being
# switched off for the rest of the kernel session via np.seterr.
with np.errstate(divide='ignore'):
    ordination, distance = rpca(biom_tbl)

# Sample ordination from the RPCA result, with the 'area' metadata column
# attached by matching on the sample-ID index.
spca_df = ordination.samples.join(metadata_sub['area'])

spca_df

Unnamed: 0,PC1,PC2,PC3,area
900344,-0.031259,-0.030585,0.045341,Umtata
900221,0.034560,0.014311,0.101067,Umtata
900570,0.051177,-0.101247,-0.020742,Cape Town
900129,0.061599,-0.018120,-0.002876,Cape Town
900321,0.002420,0.003792,0.040013,Umtata
...,...,...,...,...
9003972,-0.107095,-0.089850,-0.019500,Umtata
900097,0.025972,0.031322,-0.000608,Cape Town
900498,-0.033348,0.048009,0.030825,Umtata
900276,-0.108307,-0.102284,0.005354,Umtata


In [890]:
# Extract the feature ordination from the RPCA result and preview its first rows.
fpca_df = ordination.features
fpca_df.head()

Unnamed: 0,PC1,PC2,PC3
GTGCCAGCAGCCGCGGTAATACGTAGGTCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGCAGGCGGTTAGATAAGTCTGAAGTTAAAGGCTG,-0.176233,-0.363698,0.052031
GTGCCAGCAGCCGCGGTAATACGTAGGTGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGCGCGCGTAGGCGGTTTTTTAAGTCTGATGTGAAAGCCCA,0.031293,-0.18116,-0.635457
GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATAACTGGGCGTAAAGGGCACGCAGGCGGTTATTTAAGTGAGGTGTGAAAGCCCC,-0.137862,-0.125219,0.075513
GTGCCAGCCGCCGCGGTAATACGTAGGTGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGCGCGCGTAGGCGGTTTTTTAAGTCTGATGTGAAAGCCCA,0.018601,-0.079648,-0.607929
GTGCCAGCCGCCGCGGTAATACGTAGGTCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGCAGGCGGTTAGATAAGTCTGAAGTTAAAGGCTG,-0.200034,-0.269834,0.054126


In [891]:
def permanova_on_case_type_subset(df, dist_matrix, case_type_subset,
                                  column='area', permutations=999):
    """
    Perform PERMANOVA on a subset of the data.

    Parameters:
    - df: DataFrame indexed by sample ID; must include the grouping column
    - dist_matrix: DistanceMatrix object from scikit-bio
    - case_type_subset: list of group values (in `column`) to include in the test
    - column: name of the grouping column in `df` (default 'area')
    - permutations: number of permutations used for the p-value (default 999)

    Returns:
    - PERMANOVA result (dict-like with p-value, test statistic, etc.)
    """
    # Keep only the rows that belong to the requested groups
    subset_df = df[df[column].isin(case_type_subset)]

    # Restrict the distance matrix to those samples. strict=False skips IDs
    # that are absent from the matrix instead of raising.
    sub_dm = dist_matrix.filter(subset_df.index, strict=False)

    # Pass the DataFrame plus the column name rather than a bare Series:
    # scikit-bio then matches groups to distance-matrix IDs through the index,
    # so the result does not depend on row order and rows for IDs skipped by
    # the non-strict filter are ignored instead of causing a length mismatch.
    return permanova(sub_dm, grouping=subset_df, column=column,
                     permutations=permutations)

# Calculate PERMANOVA p-values between sampling areas (skin samples only)

In [892]:
# Calculate the PERMANOVA F-statistic and p-value between sampling areas.
# Each entry lists the 'area' values compared in one test.
case_type_subsets = [
    ["Cape Town", "Umtata"]
]

# Short codes used to build the result keys (e.g. "CP vs. UT")
area_abbreviations = {"Cape Town": "CP", "Umtata": "UT"}

perma_res = {}

for case_type_subset in case_type_subsets:
    result = permanova_on_case_type_subset(spca_df, distance, case_type_subset)

    # Build the key from the groups actually compared, so that adding another
    # comparison to case_type_subsets cannot overwrite an earlier result.
    group_label = " vs. ".join(
        area_abbreviations.get(area, area) for area in case_type_subset
    )

    perma_res[group_label] = {
        "p": f"{result['p-value']:.2}",
        "f": f"{result['test statistic']:.2f}",
    }

perma_res



{'CP vs. UT': {'p': '0.001', 'f': '11.87'}}

In [893]:
# # Replace the long names with short names in perma_res
# for i in range(len(perma_res)):
#     perma_res[i] = perma_res[i].replace('control-nonlesional skin', 'H')
#     perma_res[i] = perma_res[i].replace('case-nonlesional skin', 'AD-NL') 
#     perma_res[i] = perma_res[i].replace('case-lesional skin', 'AD-L')

# # View the updated list
# perma_res


In [894]:
# Swap the full area names for their two-letter codes (the keys of `palette`).
area_short_names = {'Cape Town': 'CP', 'Umtata': 'UT'}
spca_df['area'] = spca_df['area'].replace(area_short_names)

spca_df

Unnamed: 0,PC1,PC2,PC3,area
900344,-0.031259,-0.030585,0.045341,UT
900221,0.034560,0.014311,0.101067,UT
900570,0.051177,-0.101247,-0.020742,CP
900129,0.061599,-0.018120,-0.002876,CP
900321,0.002420,0.003792,0.040013,UT
...,...,...,...,...
9003972,-0.107095,-0.089850,-0.019500,UT
900097,0.025972,0.031322,-0.000608,CP
900498,-0.033348,0.048009,0.030825,UT
900276,-0.108307,-0.102284,0.005354,UT


In [895]:
# Number of samples carrying each area code
num_UT = int((spca_df['area'] == 'UT').sum())
num_CP = int((spca_df['area'] == 'CP').sum())


In [896]:
# Sanity check: list the distinct values in the 'area' column, in order of first appearance.
areas = spca_df['area'].unique()
print("Unique values in 'area':", areas)


Unique values in 'area': ['UT' 'CP']


# Plot the RPCA ordination with convex hulls for skin samples only

In [897]:
# Create the beta diversity plot: RPCA PC1 vs. PC2 scatter, one convex hull per area.
fig, ax = plt.subplots(1, 1, figsize=(4.5, 5))

fpca_df.columns = [f"PC{i+1}" for i in range(fpca_df.shape[1])]

# Pin the hue order to the palette's key order. Without hue_order seaborn
# orders string hue levels by first appearance in the data, so the legend
# handles would not be in a fixed order across datasets.
hue_order = list(palette)

sns.scatterplot(
    data=spca_df,
    x="PC1",
    y="PC2",
    hue="area",
    hue_order=hue_order,
    s=50,
    edgecolor="black",      # Thin black outline
    linewidth=0.5,          # Line thickness
    palette=palette,
    ax=ax
)

# Draw and shade the convex hull of each area's points
for case_type, case_type_df in spca_df.groupby("area"):
    color = palette[case_type]

    points = case_type_df[["PC1", "PC2"]].values
    hull = scipy.spatial.ConvexHull(points)

    # Hull outline; the first vertex is repeated at the end to close the polygon
    hull_plot_x = np.append(points[hull.vertices, 0], points[hull.vertices[0], 0])
    hull_plot_y = np.append(points[hull.vertices, 1], points[hull.vertices[0], 1])

    ax.plot(
        hull_plot_x,
        hull_plot_y,
        c=color,
        zorder=0
    )
    ax.fill(
        points[hull.vertices, 0],
        points[hull.vertices, 1],
        c=color,
        alpha=0.3
    )

# Legend: derive each label from the handle's own hue level so that a label
# (and its sample count) can never be attached to the wrong colour.
# NOTE(review): the data/palette code for Cape Town is "CP" while the legend
# text shows "CT" - the displayed text is kept as it was; confirm which is intended.
handles, labels = ax.get_legend_handles_labels()
legend_text = {
    "CP": f"CT (n={num_CP})",
    "UT": f"UT (n={num_UT})",
}
custom_labels = [legend_text.get(label, label) for label in labels]

ax.legend(
    handles=handles,
    labels=custom_labels,
    frameon=False,
    fontsize=10,
    loc='upper right'
)

# Axis labels with the proportion explained. Indexing (instead of unpacking
# into exactly three names) keeps this working if the RPCA rank is not 3.
pc_labels = [f"RPCA PC{i+1} ({x*100:.2f}%)" for i, x in enumerate(ordination.proportion_explained)]

ax.set_xlabel(pc_labels[0], fontsize = 14)
ax.set_ylabel(pc_labels[1], fontsize = 14)

# Same fixed tick positions on both axes; the labels are the tick values themselves
tick_values = [-0.2, -0.1, 0.0, 0.1, 0.2, 0.3]
ax.set_yticks(tick_values)
ax.set_yticklabels(tick_values, fontsize = 12)
ax.set_xticks(tick_values)
ax.set_xticklabels(tick_values, fontsize = 12)

# Annotate the plot with the PERMANOVA result for the area comparison
ax.text(
    -0.18, 0.26,
    f"p={perma_res['CP vs. UT']['p']},  F={perma_res['CP vs. UT']['f']}",
    fontsize=12
)


ax.set_title('Cape Town vs. Umtata', fontsize = 18)

# ax.spines["right"].set_visible(False)
# ax.spines["top"].set_visible(False)

fig.tight_layout()
fig.savefig("../Plots/Analysis_figures/Diversity/16S_Beta_Diversity_RPCA_skin_only_region.png", dpi = 600)
plt.show()

  plt.show()
