In [2]:
# import Python packages
import pandas as pd
import numpy as np
from numpy import array
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.gridspec import GridSpec
import scipy
import scipy.stats as ss
from skbio.stats.distance import permanova
import biom
from biom import load_table
from biom.util import biom_open
from gemelli.rpca import rpca

### See RPCA standalone python tutorial: 
# https://github.com/biocore/gemelli/blob/master/ipynb/tutorials/RPCA-moving-pictures-standalone-cli-and-api.ipynb

# Load metadata/BIOM table and subset for nares only samples


In [3]:
# Function to load BIOM table, collapse by taxa, sort rows by row sum, remove specified samples, and convert to relative abundance
def load_biom_table(biom_path, metadata_path):
    # Load BIOM table and convert to a DataFrame
    table = biom.load_table(biom_path)
    df = pd.DataFrame(table.matrix_data.toarray(),
                      index=table.ids(axis='observation'),
                      columns=table.ids(axis='sample'))
    
    # Sort rows by row sum in descending order
    df['row_sum'] = df.sum(axis=1)
    df = df.sort_values(by='row_sum', ascending=False)
    
    # Drop the 'row_sum' column before proceeding
    df = df.drop(columns=['row_sum'])

    # Replace ' g__' rows with ' g__Unknown'
    df.index = df.index.map(lambda x: ' g__Unknown' if x == ' g__' else x)

     # Load metadata as a DataFrame from the file path
    metadata = pd.read_csv(metadata_path, sep='\t')
    metadata['#sample-id'] = metadata['#sample-id'].str.replace('_', '')

    # Set Sample-ID as the index for the metadata dataframe 
    metadata = metadata.set_index('#sample-id')

    # Ensures metadata only contains the ID in the BIOM file DF
    metadata_sub = metadata.loc[df.columns]

    # ~ makes it opposite so now we are just subsetting for only nares
    metadataindex = metadata_sub[~metadata_sub['case_type'].isin(['case-anterior nares', 'control-anterior nares'])].index.tolist()
    df = df.drop(columns= metadataindex)
    metadata_sub = metadata.loc[df.columns]

    return df, metadata_sub


# converts 
def convert_df_to_biom(table, biom_output_file):
    obs_ids = table.index
    samp_ids = table.columns
    biom_table = biom.table.Table(table.values, observation_ids=obs_ids, sample_ids=samp_ids) # Convert df back to a biom table
    #biom_output_file = f"../tables/{name}_subset.biom" # Path to output file

    with biom_open(biom_output_file, 'w') as f: # This does the actual saving!
        biom_table.to_hdf5(f, generated_by="subsetted tables")

    return biom_table


In [4]:
biom_path = "/Users/annanguyen/16S_AD_South-Africa/Data/Tables/Relative_Abundance_Tables/df_16S_filtered_feature_table_rare_Genus_relative_abundance.biom"
metadata_path = '../Data/Metadata/updated_clean_ant_skin_metadata.tab'
nares_only_table, metadata_sub = load_biom_table(biom_path, metadata_path)

In [9]:
# Check if metadata subsetting is successful 
metadata_sub['case_type'].value_counts()

case_type
case-anterior nares       60
control-anterior nares    46
Name: count, dtype: int64

# Converting the subsetted BIOM table DF back to BIOM table 

In [5]:
nares_only_biom_path = biom_path.removesuffix('.biom') + '_nares_only.biom'
print(f'{nares_only_biom_path = }')
biom_tbl = convert_df_to_biom(nares_only_table, nares_only_biom_path)

nares_only_biom_path = '/Users/annanguyen/16S_AD_South-Africa/Data/Tables/Relative_Abundance_Tables/df_16S_filtered_feature_table_rare_Genus_relative_abundance_nares_only.biom'


# Run RPCA on subsetted BIOM table

In [10]:
# perform RPCA with auto rank estimation
np.seterr(divide = 'ignore')
ordination, distance = rpca(biom_tbl)

# extract and view sample ordinations from RPCA result
spca_df = ordination.samples

# Add a case type column into the spca_df using the meta_data and matching by indices by the join function
spca_df = spca_df.join(metadata_sub['case_type'])

spca_df

Unnamed: 0,PC1,PC2,PC3,case_type
900092,-0.010856,0.110137,0.053213,case-anterior nares
900466,-0.032580,0.088337,0.119949,case-anterior nares
9003932,-0.127913,0.043697,-0.030330,control-anterior nares
900556,0.008655,-0.087404,0.097629,case-anterior nares
900301,0.065717,0.060684,-0.290518,case-anterior nares
...,...,...,...,...
900328a,0.169568,-0.089589,-0.078927,control-anterior nares
900547,-0.076549,-0.087305,0.043353,control-anterior nares
900304,0.012389,-0.006423,0.084750,case-anterior nares
900580,0.006267,0.226454,-0.038413,case-anterior nares


In [19]:
# view distance matrix
print(distance)


106x106 distance matrix
IDs:
'900092', '900466', '9003932', '900556', '900301', '900456', '900601', '900577', ...
Data:
[[0.         0.39065306 1.52903146 ... 1.0200469  0.90911368 1.54393113]
 [0.39065306 0.         1.42295563 ... 1.13912873 1.13367077 1.63841355]
 [1.52903146 1.42295563 0.         ... 1.38373918 2.27223275 1.41602222]
 ...
 [1.0200469  1.13912873 1.38373918 ... 0.         1.84654692 0.55371229]
 [0.90911368 1.13367077 2.27223275 ... 1.84654692 0.         2.36111768]
 [1.54393113 1.63841355 1.41602222 ... 0.55371229 2.36111768 0.        ]]


In [12]:
# extract and view feature ordinations from RPCA result
fpca_df = ordination.features
fpca_df.head()

Unnamed: 0,PC1,PC2,PC3
g__Streptococcus,-0.057559,0.133266,0.874041
g__Staphylococcus,0.320377,-0.057333,-0.050323
g__Haemophilus_D_734546,-0.406874,-0.762683,0.04417
g__Unknown,0.33366,-0.09045,0.072459
g__Corynebacterium,0.469125,-0.326052,0.009351


In [24]:
# Set the color palette for the groups in the correct order
palette = {
    'control-anterior nares': '#008000',   # Green for healthy Nasal
    'case-anterior nares': '#FFC0CB'        # Pink for AD Nasal
}


# Calculate P-values 

In [17]:
# calculate permanova F-statistic
pnova_res = permanova(distance, spca_df, "case_type")
print(pnova_res['p-value'])

0.218


# Plot the convex hull PCA for Nare samples only

In [27]:
# create beta diversity plot
mm = 1/25.4
fig, ax = plt.subplots(1, 1, figsize=(90*mm, 110*mm))

fpca_df.columns = [f"PC{i+1}" for i in range(fpca_df.shape[1])]

sns.scatterplot(
    data=spca_df,
    x="PC1",
    y="PC2",
    hue="case_type",
    edgecolor=None,
    palette=palette,
    ax=ax
)

for case_type, case_type_df in spca_df.groupby("case_type"):
    color = palette[case_type]

    points = case_type_df[["PC1", "PC2"]].values
    hull = scipy.spatial.ConvexHull(points)

    hull_plot_x = points[hull.vertices, 0]
    hull_plot_y = points[hull.vertices, 1]
    # Connect last point with first point
    hull_plot_x = np.append(hull_plot_x, points[hull.vertices[0], 0])
    hull_plot_y = np.append(hull_plot_y, points[hull.vertices[0], 1])

    ax.plot(
        hull_plot_x,
        hull_plot_y,
        c=color,
        zorder=0
    )
    ax.fill(
        points[hull.vertices, 0],
        points[hull.vertices, 1],
        c=color,
        alpha=0.3
    )


handles, labels = ax.get_legend_handles_labels()
plt.legend(
    handles=handles,
    labels=map(str.capitalize, labels),
    frameon=False,
    fontsize = 7
)

pc1_pct, pc2_pct, _ = [f"PC{i+1} ({x*100:.2f}%)" for i, x in enumerate(ordination.proportion_explained)]

ax.set_xlabel(pc1_pct, fontsize = 7)
ax.set_ylabel(pc2_pct, fontsize = 7)

yticklabels = [-0.2, -0.1, 0.0, 0.1, 0.2, 0.3]
yticklocations = yticklabels  # Assuming you want the locations to match the labels
ax.set_yticks(yticklocations)
ax.set_yticklabels(yticklabels, fontsize = 7)

xticklabels = [-0.2, -0.1, 0.0, 0.1, 0.2, 0.3]
xticklocations = xticklabels  # Assuming you want the locations to match the labels
ax.set_xticks(xticklocations)
ax.set_xticklabels(xticklabels, fontsize = 7)

ax.text(-0.18, 0.22, 'PERMANOVA', fontsize=7)
ax.text(-0.18, 0.2, f"p-val = {pnova_res['p-value']}", fontsize=7)

ax.spines["right"].set_visible(False)
ax.spines["top"].set_visible(False)

plt.tight_layout()
plt.savefig("../plots/Beta_Plots/16S_Beta_Diversity_nares_only.png", dpi = 600)
plt.show()

  plt.show()
