In [7]:
# import Python packages
import pandas as pd
import numpy as np
from numpy import array
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.gridspec import GridSpec
import scipy
import scipy.stats as ss
from skbio.stats.distance import permanova
import biom
from biom import load_table
from biom.util import biom_open
from gemelli.rpca import rpca

from matplotlib.patches import Ellipse #draws ellipse around points
from itertools import combinations #used to generate pairwise permanova for all combinations



### See RPCA standalone python tutorial: 
# https://github.com/biocore/gemelli/blob/master/ipynb/tutorials/RPCA-moving-pictures-standalone-cli-and-api.ipynb

# Load in BIOM table and Metadata

In [3]:
# Load the genus-level relative-abundance feature table from its BIOM file.
# NOTE(review): this is an absolute, user-specific path; the cell only runs on a
# machine where this exact location exists.
biom_tbl_path = "/Users/annanguyen/16S_AD_South-Africa/Data/Tables/Relative_Abundance_Tables/df_16S_filtered_feature_table_rare_Genus_relative_abundance.biom"
biom_tbl = biom.load_table(biom_tbl_path)

In [4]:
# Read the tab-separated sample metadata table.
metadata_path = '../Data/Metadata/updated_clean_ant_skin_metadata.tab'
raw_metadata = pd.read_csv(metadata_path, sep='\t')

# Strip underscores from the sample IDs and use the cleaned IDs as the row index.
# (presumably so the IDs match those in the BIOM table -- confirm)
cleaned_sample_ids = raw_metadata['#sample-id'].str.replace('_', '')
metadata = (
    raw_metadata
    .assign(**{'#sample-id': cleaned_sample_ids})
    .set_index('#sample-id')
)

# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Preview the first rows of the metadata, ordered by sample ID.
metadata_sorted = metadata.sort_values(by='#sample-id')
metadata_sorted.head()


Unnamed: 0_level_0,PlateNumber,PlateLocation,i5,i5Sequence,i7,i7Sequence,identifier,Sequence,Plate ID,Well location,Volume (ul),Lysozyme pretreatment,DNA extraction method,Purification method,Date of DNA extraction,pid,case_type,participant,area,sample_type,specimen,age_months,sex,enrolment_date,enrolment_season,hiv_exposure,hiv_status,household_size,o_scorad,FWD_filepath,REV_filepath
#sample-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
9000107,3,F1,SA506,CGTGAGTG,SB701,CTCGACTT,SB701SA506,CTCGACTT-CGTGAGTG,1.01e+21,F1,20,Yes,Zymo kit,QiaSymphony,21-Mar,Ca-029-TD,case-lesional skin,case,Cape Town,lesional skin,skin,18,male,6/23/2015,Winter,Unexposed,negative,4.0,25,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...
900051,1,E1,SA505,TCATCGAG,SA701,CGAGAGTT,SA701SA505,CGAGAGTT-TCATCGAG,1.01e+21,E1,20,Yes,Zymo kit,QiaSymphony,21-Mar,Ca-011-LQ,case-lesional skin,case,Cape Town,lesional skin,skin,31,male,4/21/2015,Autumn,Unexposed,negative,7.0,41,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...
900052,1,E2,SA505,TCATCGAG,SA702,GACATAGT,SA702SA505,GACATAGT-TCATCGAG,1.01e+21,E2,20,Yes,Zymo kit,QiaSymphony,21-Mar,Ca-011-LQ,case-nonlesional skin,case,Cape Town,nonlesional skin,skin,31,male,4/21/2015,Autumn,Unexposed,negative,7.0,41,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...
900053,1,E3,SA505,TCATCGAG,SA703,ACGCTACT,SA703SA505,ACGCTACT-TCATCGAG,1.01e+21,E3,20,Yes,Zymo kit,QiaSymphony,21-Mar,Ca-011-LQ,case-anterior nares,case,Cape Town,anterior nares,nasal,31,male,4/21/2015,Autumn,Unexposed,negative,7.0,41,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...
900055,3,D12,SA504,CTGCGTGT,SB712,CGTAGCGA,SB712SA504,CGTAGCGA-CTGCGTGT,1.01e+21,D12,20,Yes,Zymo kit,QiaSymphony,21-Mar,Ca-012-LM,case-nonlesional skin,case,Cape Town,nonlesional skin,skin,15,female,4/22/2015,Spring,Unexposed,negative,4.0,25,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...


# Run RPCA on BIOM table

In [11]:
# Silence NumPy divide-by-zero warnings before running RPCA.
np.seterr(divide='ignore')

# Run RPCA with its default (automatic) rank estimation; the call yields the
# ordination results and a sample-by-sample distance matrix.
rpca_result = rpca(biom_tbl)
ordination, distance = rpca_result

# Sample coordinates from the ordination, with each sample's case_type attached
# from the metadata by matching on the index.
spca_df = ordination.samples.join(metadata['case_type'])

spca_df

Unnamed: 0,PC1,PC2,PC3,case_type
900221,0.019447,0.094025,0.104781,case-lesional skin
900570,0.085857,-0.034721,0.000204,case-nonlesional skin
900092,-0.061123,-0.066987,-0.008467,case-anterior nares
900466,-0.096082,-0.036088,-0.042834,case-anterior nares
9003932,-0.127355,-0.024372,0.094411,control-anterior nares
...,...,...,...,...
900279,0.021523,0.041060,0.007289,control-nonlesional skin
900304,-0.023019,-0.004441,-0.057023,case-anterior nares
900580,-0.016790,-0.032797,0.125734,case-anterior nares
900484,-0.082562,0.030154,-0.029069,case-anterior nares


In [12]:
# Feature (taxon) coordinates from the RPCA ordination; preview the first five rows.
fpca_df = ordination.features
fpca_df.head(5)

Unnamed: 0,PC1,PC2,PC3
g__Streptococcus,-0.14668,-0.372132,-0.471029
g__Staphylococcus,0.488363,0.126948,-0.100208
g__Haemophilus_D_734546,-0.315586,-0.022995,-0.448794
g__,0.294072,-0.084855,-0.202775
g__Corynebacterium,0.121088,0.257571,-0.357388


# Function to format a p-value with significance symbols

In [14]:
def format_p_value(p_value):
    """
    Build a plot annotation for a p-value, prefixed with significance stars.

    Stars are assigned by the smallest cutoff the value does not exceed:
    '***' for <= 0.001, '**' for <= 0.01, '*' for <= 0.05, none otherwise.

    Parameters:
        p_value (float): The p-value to format; must lie in [0, 1].

    Returns:
        str: The stars (possibly none) immediately followed by "p-val = <p_value>".

    Raises:
        ValueError: If p_value is not within [0, 1] (this includes NaN).
    """
    # Kept as a negated chained comparison so that NaN is rejected as well.
    if not (0 <= p_value <= 1):
        raise ValueError("p-value must be between 0 and 1.")

    # Cutoffs ordered from most to least significant; the first match wins.
    star_cutoffs = ((0.001, '***'), (0.01, '**'), (0.05, '*'))
    stars = next(
        (marker for cutoff, marker in star_cutoffs if p_value <= cutoff),
        '',
    )

    return f"{stars}p-val = {p_value}"

# Plot PCA with confidence ellipses for all skin vs all nares samples

In [15]:
def generate_pca_plot_with_ellipses(spca_df, palette, filename,
                                    ordination_result=None, distance_matrix=None):
    """
    Plot PC1 vs PC2 coloured by "case_type", with one covariance ellipse per group,
    annotate the plot with a PERMANOVA p-value, save it to `filename` and show it.

    Parameters:
        spca_df (pd.DataFrame): Sample ordination indexed by sample ID, with
            "PC1", "PC2" and "case_type" columns. The PERMANOVA is computed on
            exactly these samples and this grouping.
        palette (dict): Maps every "case_type" value in `spca_df` to a colour.
        filename (str): Path the figure is saved to (600 dpi).
        ordination_result: Object whose `proportion_explained` provides the
            percentages shown in the axis labels. Defaults to the notebook-level
            `ordination`.
        distance_matrix: Distance matrix passed to `permanova`. Defaults to the
            notebook-level `distance`. It is restricted to the sample IDs present
            in `spca_df` before testing.

    Returns:
        None. The PERMANOVA result is printed and the figure is saved and shown.
    """
    # Fall back to the notebook-level RPCA results so existing three-argument
    # calls keep working unchanged.
    if ordination_result is None:
        ordination_result = ordination
    if distance_matrix is None:
        distance_matrix = distance

    def get_cov_ellipse(cov, n_std):
        """
        Return (width, height, angle in degrees) of the ellipse spanning `n_std`
        standard deviations along each principal axis of the 2x2 covariance `cov`.
        """
        eigenvalues, eigenvectors = np.linalg.eigh(cov)
        # Put the largest-variance axis first.
        order = eigenvalues.argsort()[::-1]
        eigenvalues, eigenvectors = eigenvalues[order], eigenvectors[:, order]

        # Orientation of the major axis: arctan2(y, x) of the leading eigenvector.
        angle = np.degrees(np.arctan2(*eigenvectors[:, 0][::-1]))
        # Round-off can make an eigenvalue of a near-singular covariance slightly
        # negative; clip at zero so the square root stays real.
        width, height = 2 * n_std * np.sqrt(np.clip(eigenvalues, 0, None))
        return width, height, angle

    mm = 1 / 25.4  # millimetres -> inches, so the figure size is 90 mm x 110 mm
    fig, ax = plt.subplots(1, 1, figsize=(90 * mm, 110 * mm))

    sns.scatterplot(
        data=spca_df,
        x="PC1",
        y="PC2",
        hue="case_type",
        edgecolor=None,
        palette=palette,
        ax=ax
    )

    for case_type, case_type_df in spca_df.groupby("case_type"):
        points = case_type_df[["PC1", "PC2"]].values
        # A covariance matrix is undefined for fewer than two samples.
        if len(points) < 2:
            continue

        color = palette[case_type]
        centroid = points.mean(axis=0)
        cov = np.cov(points, rowvar=False)
        width, height, angle = get_cov_ellipse(cov, n_std=2)

        # Shaded ellipse drawn behind the scatter points.
        ax.add_patch(Ellipse(
            xy=centroid,
            width=width,
            height=height,
            angle=angle,
            edgecolor=color,
            facecolor=color,
            alpha=0.3,
            zorder=0
        ))

    handles, labels = ax.get_legend_handles_labels()
    ax.legend(
        handles=handles,
        labels=[label.capitalize() for label in labels],
        frameon=False,
        fontsize=7
    )

    # Label the axes with the first two proportions only, so the function works
    # for any number of estimated components (not just exactly three).
    explained = list(ordination_result.proportion_explained)[:2]
    pc1_pct, pc2_pct = (
        f"PC{i+1} ({x*100:.2f}%)" for i, x in enumerate(explained)
    )

    ax.set_xlabel(pc1_pct, fontsize=7)
    ax.set_ylabel(pc2_pct, fontsize=7)

    yticklabels = [-0.2, -0.1, 0.0, 0.1, 0.2, 0.3]
    yticklocations = yticklabels
    ax.set_yticks(yticklocations)
    ax.set_yticklabels(yticklabels, fontsize=7)

    xticklabels = [-0.2, -0.1, 0.0, 0.1, 0.2, 0.3]
    xticklocations = xticklabels
    ax.set_xticks(xticklocations)
    ax.set_xticklabels(xticklabels, fontsize=7)

    ax.spines["right"].set_visible(False)
    ax.spines["top"].set_visible(False)

    # PERMANOVA on the samples and grouping actually being plotted. The distance
    # matrix is restricted to those samples first, because permanova requires a
    # group label for every ID in the matrix.
    plotted_distances = distance_matrix.filter(list(spca_df.index), strict=False)
    pnova_res = permanova(plotted_distances, spca_df, "case_type")
    print(pnova_res)

    ax.text(-0.18, 0.22, 'PERMANOVA', fontsize=7)
    ax.text(-0.18, 0.2, format_p_value(pnova_res["p-value"]), fontsize=7)

    fig.tight_layout()
    fig.savefig(filename, dpi=600)
    plt.show()

# Example usage
# spca_df = pd.DataFrame({"PC1": ..., "PC2": ..., "case_type": ...})
# palette = {"type1": "red", "type2": "blue"}
# generate_pca_plot_with_ellipses(spca_df, palette, "pca_plot_with_ellipses.png")


In [17]:
# Collapse the detailed case types into two body sites: any label ending in
# 'skin' becomes 'skin', everything else becomes 'nares'.
spca_df_sn = spca_df.assign(
    case_type=spca_df['case_type'].map(
        lambda site: 'skin' if site.endswith('skin') else 'nares'
    )
)

# Colour assigned to each body-site group.
palette_sn = dict(
    skin='#3333B3',   # Dark Blue
    nares='#FFC0CB',  # Pink
)

# Draw, save and show the skin-vs-nares ellipse plot.
generate_pca_plot_with_ellipses(spca_df_sn, palette_sn, "../plots/Beta_plots/16S_Beta_Diversity_skin_vs_nares_ellipse.png")


method name               PERMANOVA
test statistic name        pseudo-F
sample size                     254
number of groups                  2
test statistic             99.63559
p-value                       0.001
number of permutations          999
Name: PERMANOVA results, dtype: object


  plt.show()
