In [1]:
# import Python packages
import pandas as pd
import numpy as np
from numpy import array
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.gridspec import GridSpec
import scipy
import scipy.stats as ss
from skbio.stats.distance import permanova
import biom
from biom import load_table
from biom.util import biom_open
from gemelli.rpca import rpca

### See RPCA standalone python tutorial: 
# https://github.com/biocore/gemelli/blob/master/ipynb/tutorials/RPCA-moving-pictures-standalone-cli-and-api.ipynb

# Load metadata/BIOM table and subset for skin only samples


In [10]:
# # read in biom table
# biom_tbl = biom.load_table("/Users/annanguyen/16S_AD_South-Africa/Data/Tables/Relative_Abundance_Tables/df_16S_filtered_feature_table_rare_Genus_relative_abundance.biom")

# print(biom_tbl.ids(axis= 'sample'))
# len(biom_tbl.ids(axis= 'sample'))
# type(biom_tbl.ids(axis = 'sample'))

# Load a BIOM table as a DataFrame, sort its rows by total abundance, rename the
# unnamed genus, and drop the samples whose case type is excluded (nares by default).
def load_biom_table(biom_path, metadata_path,
                    exclude_case_types=('case-anterior nares', 'control-anterior nares')):
    """Load a BIOM table plus its metadata and keep only the wanted samples.

    The values are returned exactly as stored in the BIOM file: this function
    does not collapse taxa and does not normalise to relative abundance.

    Parameters
    ----------
    biom_path : str
        Path of the BIOM file to read.
    metadata_path : str
        Path of a tab-separated metadata file with a ``#sample-id`` column and
        a ``case_type`` column. Underscores are stripped from the sample ids
        before they are matched against the BIOM sample ids.
    exclude_case_types : iterable of str, optional
        ``case_type`` values whose samples are removed. Defaults to the two
        anterior-nares groups, which leaves the skin samples.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The table (rows = observations sorted by descending row sum,
        columns = retained samples) and the metadata rows for exactly those
        samples, in the same order as the table columns.

    Raises
    ------
    KeyError
        If a sample in the BIOM table has no row in the metadata.
    """
    # Load BIOM table and convert to a dense DataFrame
    table = biom.load_table(biom_path)
    df = pd.DataFrame(table.matrix_data.toarray(),
                      index=table.ids(axis='observation'),
                      columns=table.ids(axis='sample'))

    # Sort rows by row sum in descending order. The ordering is applied by
    # position so no helper column is added to the sample axis and duplicate
    # observation labels cannot be expanded by a label lookup.
    row_totals = df.sum(axis=1).reset_index(drop=True)
    df = df.iloc[row_totals.sort_values(ascending=False).index.to_numpy()]

    # Replace ' g__' rows with ' g__Unknown'
    df.index = df.index.map(lambda x: ' g__Unknown' if x == ' g__' else x)

    # Read the sample ids as text: BIOM sample ids are strings, and an id
    # column that is inferred as numeric would break `.str` and never match.
    metadata = pd.read_csv(metadata_path, sep='\t', dtype={'#sample-id': str})
    metadata['#sample-id'] = metadata['#sample-id'].str.replace('_', '', regex=False)
    metadata = metadata.set_index('#sample-id')

    # Fail early, and legibly, when the two files disagree on the samples
    missing = df.columns.difference(metadata.index)
    if len(missing) > 0:
        raise KeyError(
            f"{len(missing)} sample id(s) in the BIOM table have no row in the "
            f"metadata, e.g. {list(missing)[:10]}"
        )

    # Sample ids whose case type is excluded (nares by default)
    case_types = metadata.loc[df.columns, 'case_type']
    excluded_ids = case_types[case_types.isin(list(exclude_case_types))].index.tolist()

    # Drop those samples, then align the metadata with what is left
    df = df.drop(columns=excluded_ids)
    metadata_sub = metadata.loc[df.columns]

    return df, metadata_sub


# Turn a (subsetted) DataFrame back into a BIOM table and save it, so it can be fed to RPCA
def convert_df_to_biom(table, biom_output_file):
    """Build a BIOM table from a DataFrame and write it to disk as HDF5.

    Parameters
    ----------
    table : pandas.DataFrame
        Its index supplies the observation ids and its columns the sample ids.
    biom_output_file : str
        Destination path; the file is opened in write mode.

    Returns
    -------
    biom.table.Table
        The in-memory table that was written.
    """
    converted = biom.table.Table(
        table.values,
        observation_ids=table.index,
        sample_ids=table.columns,
    )

    # Persist the table; `generated_by` is stored in the BIOM file
    with biom_open(biom_output_file, 'w') as handle:
        converted.to_hdf5(handle, generated_by="subsetted tables")

    return converted



In [11]:
# Input files: the genus-level relative-abundance BIOM table and the sample metadata.
# NOTE(review): biom_path is an absolute, machine-specific path while metadata_path
# is relative to the notebook; consider deriving both from one data directory.
biom_path = "/Users/annanguyen/16S_AD_South-Africa/Data/Tables/Relative_Abundance_Tables/df_16S_filtered_feature_table_rare_Genus_relative_abundance.biom"
metadata_path = '../Data/Metadata/updated_clean_ant_skin_metadata.tab'

# Skin-only table together with the metadata rows of the same samples
skin_only_table, metadata_sub = load_biom_table(
    biom_path=biom_path,
    metadata_path=metadata_path,
)

In [28]:
# Check that the subsetted metadata only contains skin samples.
# Printed explicitly: only the last expression of a cell is displayed automatically.
print(metadata_sub['case_type'].value_counts())

# Sample ids of the skin-only table. They are taken from the DataFrame rather than
# from `biom_tbl`, which is only created in a later cell and therefore does not
# exist yet when the notebook is run top to bottom.
biom_tbl_ids = list(skin_only_table.columns)

# Check that the table and the metadata hold the same number of samples
len(biom_tbl_ids) == len(metadata_sub.index)

True

# Convert the subsetted BIOM table DataFrame back to a BIOM table

In [20]:
# Output path for the skin-only BIOM file: the input path with its '.biom'
# extension swapped for '_skin_only.biom'
skin_only_biom_path = f"{biom_path.removesuffix('.biom')}_skin_only.biom"

# Show the path that will be written
print(f'{skin_only_biom_path = }')

# Convert the skin-only DataFrame to a BIOM table and save it at that path
biom_tbl = convert_df_to_biom(
    table=skin_only_table,
    biom_output_file=skin_only_biom_path,
)

skin_only_biom_path = '/Users/annanguyen/16S_AD_South-Africa/Data/Tables/Relative_Abundance_Tables/df_16S_filtered_feature_table_rare_Genus_relative_abundance_skin_only.biom'


# Run RPCA on subsetted BIOM table

In [29]:
# Run RPCA with automatic rank estimation; divide-by-zero warnings from NumPy
# are switched off first (this setting stays in effect for the rest of the session)
np.seterr(divide = 'ignore')
ordination, distance = rpca(biom_tbl)

# Sample ordination from the RPCA result, with each sample's case type attached
# by joining the metadata on the sample-id index
spca_df = ordination.samples.join(metadata_sub['case_type'])

spca_df

Unnamed: 0,PC1,PC2,PC3,case_type
900221,-0.138159,0.119562,-0.061583,case-lesional skin
900570,0.008191,-0.092232,0.082358,case-nonlesional skin
900091,-0.015862,0.000555,-0.051489,case-nonlesional skin
900245,-0.121362,-0.081891,-0.084649,control-nonlesional skin
900581,0.037155,-0.027596,0.087532,case-lesional skin
...,...,...,...,...
Ca008HNL,-0.084536,0.198259,0.042356,case-lesional skin
900081,0.054839,0.013485,-0.018482,case-lesional skin
900501,0.142025,-0.035154,0.038437,case-nonlesional skin
900279,-0.002518,0.044130,0.012372,control-nonlesional skin


In [30]:
# Feature ordination from the RPCA result; preview the first five rows
fpca_df = ordination.features
fpca_df.head(n=5)

Unnamed: 0,PC1,PC2,PC3
g__Streptococcus,0.36899,-0.251468,0.138232
g__Staphylococcus,-0.110703,0.166092,0.847127
g__Haemophilus_D_734546,0.202138,-0.045132,-0.024013
g__Unknown,-0.03393,-0.388362,0.122917
g__Corynebacterium,-0.185973,-0.250482,0.130804


In [33]:
# Colour for each skin group. The keys are the exact `case_type` strings used in
# the metadata; the plotting cell looks colours up by those values.
palette = {
    'control-nonlesional skin': '#3333B3',     # dark blue: healthy controls
    'case-nonlesional skin': '#5cbccb',     # light blue: AD non-lesional
    'case-lesional skin': '#f16c52',       # red: AD lesional
}


# Calculate P-values for all combinations of skin case_types

In [46]:
# PERMANOVA p-value for every pairwise comparison between the three skin case types

# NOTE(review): this helper was called but never defined anywhere in the notebook,
# so the cell raised NameError on a fresh kernel. It is reconstructed here from the
# call site and the `permanova` import; confirm it matches the original helper
# (in particular the number of permutations, left at the scikit-bio default).
def permanova_on_case_type_subset(spca_df, distance, case_type_subset):
    """Run PERMANOVA on the samples whose case type is in `case_type_subset`.

    Parameters
    ----------
    spca_df : pandas.DataFrame
        Indexed by sample id, with a 'case_type' column.
    distance : skbio.DistanceMatrix
        Distance matrix whose ids include the index of `spca_df`.
    case_type_subset : list of str
        The case types to compare.

    Returns
    -------
    pandas.Series
        The scikit-bio PERMANOVA result; the p-value is under 'p-value'.
    """
    subset_ids = spca_df.index[spca_df['case_type'].isin(case_type_subset)]
    # Restrict the distance matrix to the samples of the groups being compared
    subset_distance = distance.filter(subset_ids)
    return permanova(subset_distance, spca_df.loc[subset_ids], column='case_type')


case_type_subsets = [
    ["control-nonlesional skin", "case-nonlesional skin"],
    ["control-nonlesional skin", "case-lesional skin"],
    ["case-lesional skin", "case-nonlesional skin"]
]

# One "<groups> = <p-value>" string per comparison; these strings are later
# written onto the ordination plot. The p-values come from a permutation test,
# so they can differ slightly between runs.
perma_res = [
    f"{case_type_subset} = {permanova_on_case_type_subset(spca_df, distance, case_type_subset)['p-value']}"
    for case_type_subset in case_type_subsets
]

# View the list
perma_res



["['control-nonlesional skin', 'case-nonlesional skin'] = 0.001",
 "['control-nonlesional skin', 'case-lesional skin'] = 0.001",
 "['case-lesional skin', 'case-nonlesional skin'] = 0.004"]

# Plot the RPCA ordination (PC1 vs PC2) with a convex hull per group, skin samples only

In [54]:
# Beta-diversity plot: RPCA sample ordination (PC1 vs PC2) coloured by case type,
# with a filled convex hull around each group and the PERMANOVA results as text.
mm = 1/25.4  # millimetres -> inches, so the figure size can be given in mm
fig, ax = plt.subplots(1, 1, figsize=(90*mm, 110*mm))

sns.scatterplot(
    data=spca_df,
    x="PC1",
    y="PC2",
    hue="case_type",
    edgecolor=None,
    palette=palette,
    ax=ax
)

for case_type, case_type_df in spca_df.groupby("case_type"):
    points = case_type_df[["PC1", "PC2"]].values
    # A 2-D convex hull needs at least three points; smaller groups keep
    # their scatter markers but get no hull instead of crashing the cell.
    if len(points) < 3:
        continue

    color = palette[case_type]
    hull = scipy.spatial.ConvexHull(points)

    # Repeat the first vertex at the end so the outline is closed
    outline = np.append(hull.vertices, hull.vertices[0])
    ax.plot(
        points[outline, 0],
        points[outline, 1],
        color=color,
        zorder=0
    )
    ax.fill(
        points[hull.vertices, 0],
        points[hull.vertices, 1],
        color=color,
        alpha=0.3
    )

# Legend with capitalised group names (a real list, not a one-shot iterator)
handles, labels = ax.get_legend_handles_labels()
ax.legend(
    handles=handles,
    labels=[label.capitalize() for label in labels],
    frameon=False,
    fontsize = 7
)

# Axis labels with the proportion of variance explained. Only the first two
# components are used, so this works whatever rank RPCA estimated.
explained = list(ordination.proportion_explained)
pc1_pct, pc2_pct = (f"PC{i+1} ({x*100:.2f}%)" for i, x in enumerate(explained[:2]))

ax.set_xlabel(pc1_pct, fontsize = 7)
ax.set_ylabel(pc2_pct, fontsize = 7)

# Same fixed tick locations and labels on both axes
ticks = [-0.2, -0.1, 0.0, 0.1, 0.2, 0.3]
ax.set_yticks(ticks)
ax.set_yticklabels(ticks, fontsize = 7)
ax.set_xticks(ticks)
ax.set_xticklabels(ticks, fontsize = 7)

# Write out the PERMANOVA result of each comparison (positions are in data coordinates)
ax.text(-0.18, 0.3, 'PERMANOVA', fontsize=7)
for y_pos, perma_text in zip((0.28, 0.26, 0.24), perma_res):
    ax.text(-0.18, y_pos, perma_text, fontsize=6)

ax.spines["right"].set_visible(False)
ax.spines["top"].set_visible(False)

fig.tight_layout()
fig.savefig("../plots/Beta_Plots/16S_Beta_Diversity_skin_only.png", dpi = 600)
plt.show()

  plt.show()
