In [75]:
# import Python packages
import pandas as pd
import numpy as np
from numpy import array
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.gridspec import GridSpec
import scipy
import scipy.stats as ss
from skbio.stats.distance import permanova
import biom
from biom import load_table
from biom.util import biom_open
from gemelli.rpca import rpca

### See RPCA standalone python tutorial: 
# https://github.com/biocore/gemelli/blob/master/ipynb/tutorials/RPCA-moving-pictures-standalone-cli-and-api.ipynb

# Load metadata/BIOM table and subset for nares only samples


In [76]:
# # read in biom table
# biom_tbl = biom.load_table("/Users/annanguyen/16S_AD_South-Africa/Data/Tables/Relative_Abundance_Tables/df_16S_filtered_feature_table_rare_Genus_relative_abundance.biom")

# print(biom_tbl.ids(axis= 'sample'))
# len(biom_tbl.ids(axis= 'sample'))
# type(biom_tbl.ids(axis = 'sample'))

# Function to load a BIOM table as a DataFrame, sort features by total abundance,
# strip the study prefix from sample IDs, and drop samples of the excluded case types
def load_biom_table(biom_path, metadata_path, sample_prefix='15564.',
                    exclude_case_types=('control-nonlesional skin',
                                        'case-nonlesional skin',
                                        'case-lesional skin')):
    """
    Load a BIOM feature table plus its metadata and remove unwanted sample types.

    Parameters:
    - biom_path: path to the BIOM file (read with biom.load_table)
    - metadata_path: path to a tab-separated metadata file with a '#sample-id'
      column and a 'case_type' column
    - sample_prefix: prefix removed from the start of each BIOM sample ID
      (only when the ID actually starts with it); pass '' or None to skip
    - exclude_case_types: 'case_type' values whose samples are dropped. The
      default drops the three skin case types, leaving the remaining samples.

    Returns:
    - (df, metadata_sub): the features x samples DataFrame with rows sorted by
      row sum (descending) and excluded samples removed, and the metadata rows
      for exactly the remaining samples, in the same order as df.columns

    Raises:
    - KeyError if a sample in the BIOM table has no row in the metadata
    """
    # Load BIOM table and convert to a dense features x samples DataFrame
    table = biom.load_table(biom_path)
    df = pd.DataFrame(table.matrix_data.toarray(),
                      index=table.ids(axis='observation'),
                      columns=table.ids(axis='sample'))

    # Sort rows by row sum in descending order. Done positionally so that no
    # helper column is added (a sample named 'row_sum' would otherwise collide).
    row_sums = df.sum(axis=1)
    df = df.iloc[np.argsort(-row_sums.to_numpy(), kind='stable')]

    # Remove the study prefix only where it is a true prefix. A plain
    # str.replace would also strip it from the middle of an ID, and its regex
    # default differs between pandas versions ('.' could match any character).
    if sample_prefix:
        df.columns = [sid[len(sample_prefix):] if sid.startswith(sample_prefix) else sid
                      for sid in df.columns]

    # Load metadata; force sample IDs to strings so purely numeric IDs are not
    # parsed as integers (which would break the .str accessor and the lookup)
    metadata = pd.read_csv(metadata_path, sep='\t', dtype={'#sample-id': str})
    metadata['#sample-id'] = metadata['#sample-id'].str.replace('_', '', regex=False)

    # Set Sample-ID as the index for the metadata dataframe
    metadata = metadata.set_index('#sample-id')

    # Fail early, with the offending IDs, if the table and metadata disagree
    missing = [sid for sid in df.columns if sid not in metadata.index]
    if missing:
        raise KeyError(f"Samples in the BIOM table without metadata: {missing}")

    # Subset metadata to only contain the same samples as in the BIOM df
    metadata_sub = metadata.loc[df.columns]

    # Sample IDs whose case_type is in the excluded set
    excluded_mask = metadata_sub['case_type'].isin(list(exclude_case_types))
    excluded_ids = metadata_sub[excluded_mask].index.tolist()

    # Drop the excluded samples from the BIOM table DataFrame
    df = df.drop(columns=excluded_ids)

    # Metadata restricted to the samples that remain in the table
    metadata_sub = metadata.loc[df.columns]

    return df, metadata_sub


# Turn a features x samples DataFrame back into a BIOM table and save it, so it can be fed to RPCA
def convert_df_to_biom(table, biom_output_file):
    """
    Build a BIOM Table from a DataFrame and write it to disk in HDF5 format.

    Parameters:
    - table: DataFrame whose index holds observation IDs and whose columns
      hold sample IDs
    - biom_output_file: path the BIOM file is written to (opened in 'w' mode)

    Returns:
    - the BIOM Table object that was written
    """
    converted = biom.table.Table(
        table.values,
        observation_ids=table.index,
        sample_ids=table.columns,
    )

    # Writing through biom_open is what actually persists the table
    with biom_open(biom_output_file, 'w') as out_handle:
        converted.to_hdf5(out_handle, generated_by="subsetted tables")

    return converted



In [77]:
# Load the unfiltered feature table from the Absolute_Abundance_Tables folder and preview it.
# NOTE: `biom_tbl` is rebound further down by the cell that writes the nares-only BIOM file,
# so this cell only serves as a look at the raw input.
biom_path = "../Data/Tables/Absolute_Abundance_Tables/209766_filtered_feature_table.biom"
biom_tbl = load_table(biom_path)
# Transposed for display: one row per sample ID, one column per feature sequence
df = pd.DataFrame(biom_tbl.to_dataframe().T)
df

Unnamed: 0,GTGCCAGCAGCCGCGGTAATACGTAGGTCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGCAGGCGGTTAGATAAGTCTGAAGTTAAAGGCTG,GTGCCAGCCGCCGCGGTAATACGTAGGTCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGCAGGCGGTTAGATAAGTCTGAAGTTAAAGGCTG,GTGCCAGCAGCCGCGGTAATACGTAGGGTGCAAGCGTTGTCCGGAATTACTGGGCGTAAAGAGCTCGTAGGTGGTTTGTCACGTCGTCTGTGAAATTCCA,GTGCCAGCCGCCGCGGTAATACGTAGGGTGCAAGCGTTGTCCGGAATTACTGGGCGTAAAGAGCTCGTAGGTGGTTTGTCACGTCGTCTGTGAAATTCCA,GTGCCAGCAGCCGCGGTAATACGTAGGGTGCAAGCGTTAATCGGAATTATTGGGCGTAAAGCGAGTGCAGACGGTTACTTAAGCCAGATGTGAAATCCCC,GTGCCAGCAGCCGCGGTAATACGTAGGTGGCAAGCGTTGTCCGGAATTATTGGGCGTAAAGCGCGCGCAGGCGGTTTCTTAAGTCTGATGTGAAAGCCCC,GTGCCAGCAGCCGCGGTGATACGTAGGGTGCGAGCGTTGTCCGGATTTATTGGGCGTAAAGGGCTCGTAGGTGGTTGATCGCGTCGGAAGTGTAATCTTG,GTGCCAGCAGCCGCGGTAATACGTAGGGTCCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGCAGGCGGTTGTGCAAGACCGATGTGAAATCCCC,GTGCCAGCCGCCGCGGTAATACGTAGGTGGCAAGCGTTGTCCGGATTTATTGGGCGTAAAGGGAGCGCAGGTGGTTTCTTAAGTCTGATGTGAAAGCCCA,GTGCCAGCCGCCGCGGTAATACGGAAGGTCCAGGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGCGGATTATTAAGTCAGTGGTGAAAGACGG,...,GTGCCAGCCGCCGCGGTAATACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGCGCAGCAAGTCTGATGTGAAAGGCAG,GTGCCAGCAGCCGCGGTAAGACAGAGGGTGCAAACGTTGCTCGGAATCACTGGGCGTAAAGGGCGTGTAGGCGGGAGAGAAAGTCGGGCGTGAAATCCCT,GTGCCAGCCGCGGTAATACGTAGGGGGCTAGCGTTGTCCGGAATCACTGGGCGTAAAGGGTTCGCAGGCGGAAATGCAAGTCAGGTGTAAAAGGCAGTAG,GTGCCAGCAGCCGCGGTAATACGTAGGGCGCGAGCGTTGTCCGGAATTATTGGGCGTAAAGAGCTTGTAGGCGGTTTGTTGCGTCTGCTGTGAAAGACCG,GTGCCAGCCGCCGCGGTAATACGTAGGGCGCGAGCGTTGTCCGGAATTATTGGGCGTAAAGAGCTTGTAGGCGGTTTGTTGCGTCTGCTGTGAAAGACCG,GTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTATCCGGAATCATTGGGTTTAAAGGGTCCGCAGGCGGATTTATAAGTCAGTGGTGAAAGCCTA,GTGCCAGCAGCCGCGGTAATACGTAGGTGGCGAGCGTTGTCCGGAATTACTGGGTGTAAAGGGCGTGTAGGCGGGAAGGTAAGTCAGATGTGAAATACCG,GTGCCAGCCGCCGCGGTAATACGGAGGATGCGAGCGTTATTCGGAATCATTGGGTTTAAAGGGTCTGTAGGCGGGCTATTAAGTCAGAGGTGAAAGGTTT,GTGCCAGCCGCCGCGGTAAGACGAAGGGGGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCGTGCAGGCGGTTATCCAAGTCGGGTGTGAAAGCCTT,GTCCAGCAGCCGCGGTAATACGTAGGTCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCG
CAGGCGGTTAGATAAGTCTGAAGTTAAAGGCTGT
15564.900344,984.0,611.0,114.0,82.0,22.0,15.0,8.0,8.0,6.0,3.0,...,0,0,0,0,0,0,0,0,0,0
15564.900459,118.0,106.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15564.900221,22.0,0,0,0,0,0,16.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15564.900570,389.0,0,0,0,8.0,0,11.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15564.900092,3106.0,1707.0,59.0,32.0,3.0,0,0,0,0,7.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15564.9003972,1168.0,593.0,16.0,0,28.0,0,736.0,0,0,36.0,...,0,0,0,0,0,0,0,0,0,0
15564.900097,24.0,0,0,0,0,0,33.0,0,0,0,...,8.0,5.0,1.0,0,0,0,0,0,0,0
15564.900498,15.0,17.0,0,0,0,0,34.0,0,14.0,0,...,0,0,0,15.0,10.0,8.0,0,0,0,0
15564.900276,0,0,30.0,0,0,0,151.0,0,0,0,...,0,0,0,0,0,0,11.0,3.0,2.0,1.0


In [78]:
# specify the path to the BIOM table and Metadata
# (alternative input, currently unused: the genus-level relative-abundance table)
# biom_path = "../Data/Tables/Relative_Abundance_Tables/df_16S_filtered_feature_table_rare_Genus_relative_abundance.biom"
biom_path = "../Data/Tables/Absolute_Abundance_Tables/209766_filtered_feature_table.biom"
metadata_path = '../Data/Metadata/updated_clean_ant_skin_metadata.tab'
# load_biom_table drops the three skin case types, so both return values
# cover only the remaining samples
nares_only_table, metadata_sub = load_biom_table(biom_path, metadata_path)

In [79]:
# Preview the filtered table: feature sequences as rows, prefix-stripped sample IDs as columns
nares_only_table

Unnamed: 0,900459,900092,900391,900466,9003932,900556,900612,900301,900563,900237,...,900069,900142,900616,Co004LNPN,900328a,900547,900086,900304,900580,900484
GTGCCAGCAGCCGCGGTAATACGTAGGTCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGCAGGCGGTTAGATAAGTCTGAAGTTAAAGGCTG,118.0,3106.0,688.0,9133.0,2107.0,3417.0,6.0,82.0,611.0,6.0,...,5110.0,13048.0,6.0,1101.0,372.0,1825.0,37.0,3406.0,7314.0,2575.0
GTGCCAGCAGCCGCGGTAATACGTAGGTGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGCGCGCGTAGGCGGTTTTTTAAGTCTGATGTGAAAGCCCA,14.0,0.0,0.0,18.0,0.0,16.0,0.0,33673.0,0.0,0.0,...,0.0,0.0,0.0,29.0,7.0,33.0,35.0,22.0,354.0,0.0
GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATAACTGGGCGTAAAGGGCACGCAGGCGGTTATTTAAGTGAGGTGTGAAAGCCCC,432.0,0.0,0.0,0.0,1209.0,1308.0,719.0,2.0,30.0,76.0,...,0.0,58.0,0.0,0.0,279.0,6122.0,0.0,79.0,7.0,458.0
GTGCCAGCCGCCGCGGTAATACGTAGGTGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGCGCGCGTAGGCGGTTTTTTAAGTCTGATGTGAAAGCCCA,19.0,0.0,0.0,0.0,0.0,0.0,0.0,19375.0,0.0,12.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,280.0,0.0
GTGCCAGCCGCCGCGGTAATACGTAGGTCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGCAGGCGGTTAGATAAGTCTGAAGTTAAAGGCTG,106.0,1707.0,0.0,5406.0,857.0,1204.0,0.0,0.0,377.0,0.0,...,0.0,4334.0,4.0,0.0,0.0,0.0,0.0,0.0,5114.0,2150.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTGCCAGCAGCCGCGGTAATACGGAGGGTCCGAGCGTTATCCGGAATCATTGGGTTTAAAGGGTCCGTAGGCGGCTTTATAAGTCAGTGGTGAAATCCGG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
GTGCCAGCAGCCGCGGTAATACGTAGGGTGCAAGCGTTGTCCGGAATTATTGGGCGTAAAGAGCTCGTAGGCGGTTTGTCGCGTCTGCTGTGAAATTCCG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
GTGCCAGCCGCCGCGGTAATACGTAGGGCGCAAGCGTTGTCCGGAATTATTGGGCGTAAAGAGCTCGTAGGCGGTTTGTCGCGTCTGGTGTGAAAACTCG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
GTGCCAGAGCCGCGGTAATACGTAGGTCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGCAGGCGGTTAGATAAGTCTGAAGTTAAAGGCTGT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [80]:
# Preview the metadata rows for the samples kept in nares_only_table.
# NOTE(review): the rendered output includes absolute local paths (FWD_filepath / REV_filepath);
# consider clearing this output before sharing the notebook.
metadata_sub

Unnamed: 0,PlateNumber,PlateLocation,i5,i5Sequence,i7,i7Sequence,identifier,Sequence,Plate ID,Well location,...,age_months,sex,enrolment_date,enrolment_season,hiv_exposure,hiv_status,household_size,o_scorad,FWD_filepath,REV_filepath
900459,6,G3,SA507,GGATATCT,SB703,TAGCAGCT,SB703SA507,TAGCAGCT-GGATATCT,1.010000e+21,G3,...,20,male,9/23/2015,Spring,Unexposed,negative,5.0,44,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...
900092,2,D9,SB504,TACGAGAC,SB709,GCGTATAC,SB709SB504,GCGTATAC-TACGAGAC,1.010000e+21,D9,...,23,female,6/4/2015,Winter,Unexposed,negative,4.0,53,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...
900391,4,B3,SA502,ACTATCTG,SA703,ACGCTACT,SA703SA502,ACGCTACT-ACTATCTG,1.010000e+21,B3,...,26,male,9/14/2015,Spring,,,7.0,,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...
900466,6,H4,SA508,GACACCGT,SB704,TCTCTATG,SB704SA508,TCTCTATG-GACACCGT,1.010000e+21,H4,...,12,male,8/18/2015,Winter,Exposed,,6.0,43,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...
9003932,5,G5,SB507,GATCGTGT,SB705,GATCTACG,SB705SB507,GATCTACG-GATCGTGT,1.010000e+21,G5,...,16,female,9/14/2015,Spring,,,9.0,,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
900547,2,E5,SB505,ACGTCTCG,SB705,GATCTACG,SB705SB505,GATCTACG-ACGTCTCG,1.010000e+21,E5,...,30,male,2/25/2016,Summer,,negative,10.0,,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...
900086,2,G3,SB507,GATCGTGT,SB703,TAGCAGCT,SB703SB507,TAGCAGCT-GATCGTGT,1.010000e+21,G3,...,12,male,6/3/2015,Winter,Exposed,negative,3.0,32,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...
900304,2,E9,SB505,ACGTCTCG,SB709,GCGTATAC,SB709SB505,GCGTATAC-ACGTCTCG,1.010000e+21,E9,...,23,female,8/26/2015,Winter,Exposed,,7.0,33,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...
900580,4,D9,SA504,CTGCGTGT,SA709,ACTACGAC,SA709SA504,ACTACGAC-CTGCGTGT,1.010000e+21,D9,...,28,male,4/14/2016,Autumn,Unexposed,negative,6.0,43,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...


In [82]:
# Colour for each plotted group; the insertion order (H first, then AD) is the group order.
# An "AD-NL" entry ("#FFDAB9", peach) is not part of this two-group palette.
palette = dict(
    H="#ADD8E6",   # baby blue
    AD="#E31A1C",  # red
)


In [83]:
# Create the path to the nares-only BIOM file by reusing the original BIOM path
# (its '.biom' suffix is replaced with '_nares_only.biom')
nares_only_biom_path = biom_path.removesuffix('.biom') + '_nares_only.biom'

# Print the path that was created
print(f'{nares_only_biom_path = }')

# Convert the nares-only DataFrame to a BIOM table and save it to the path specified above.
# NOTE: this rebinds `biom_tbl` from the raw table loaded earlier to the nares-only table;
# the RPCA cell below relies on this cell having run.
biom_tbl = convert_df_to_biom(nares_only_table, nares_only_biom_path)

nares_only_biom_path = '../Data/Tables/Absolute_Abundance_Tables/209766_filtered_feature_table_nares_only.biom'


## Run RPCA

In [84]:
# perform RPCA on the nares-only BIOM table
# NOTE(review): the original comment said "auto rank estimation", but the result below has
# exactly three PCs — confirm whether rpca() uses a fixed default rank here.
# Divide warnings are silenced only for this call; np.errstate restores the previous
# setting afterwards instead of changing it for the rest of the notebook.
with np.errstate(divide='ignore'):
    ordination, distance = rpca(biom_tbl)

# extract and view sample ordinations from RPCA result
spca_df = ordination.samples

# Add a case type column into the spca_df using the metadata; join matches rows on the sample-ID index
spca_df = spca_df.join(metadata_sub['case_type'])

spca_df

Unnamed: 0,PC1,PC2,PC3,case_type
900459,-0.010591,-0.088512,0.049942,case-anterior nares
900092,0.045115,0.070730,-0.080180,case-anterior nares
900391,-0.039030,0.000598,-0.045942,control-anterior nares
900466,0.000768,0.004813,-0.166650,case-anterior nares
9003932,0.087608,-0.028640,-0.023018,control-anterior nares
...,...,...,...,...
900547,0.086773,-0.084143,0.028597,control-anterior nares
900086,-0.059003,0.072043,0.070186,case-anterior nares
900304,0.045750,-0.024506,-0.042610,case-anterior nares
900580,0.018258,0.175107,-0.096730,case-anterior nares


In [85]:
# extract and view feature ordinations from RPCA result
# (one row per feature sequence, one column per principal component)
fpca_df = ordination.features
fpca_df.head()

Unnamed: 0,PC1,PC2,PC3
GTGCCAGCAGCCGCGGTAATACGTAGGTCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGCAGGCGGTTAGATAAGTCTGAAGTTAAAGGCTG,0.098794,-0.397388,-0.528729
GTGCCAGCAGCCGCGGTAATACGTAGGTGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGCGCGCGTAGGCGGTTTTTTAAGTCTGATGTGAAAGCCCA,-0.171917,-0.118308,0.028445
GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATAACTGGGCGTAAAGGGCACGCAGGCGGTTATTTAAGTGAGGTGTGAAAGCCCC,0.488259,-0.48781,0.358997
GTGCCAGCCGCCGCGGTAATACGTAGGTGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGCGCGCGTAGGCGGTTTTTTAAGTCTGATGTGAAAGCCCA,-0.128159,-0.070724,0.010761
GTGCCAGCCGCCGCGGTAATACGTAGGTCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGCAGGCGGTTAGATAAGTCTGAAGTTAAAGGCTG,0.186306,-0.185556,-0.648762


In [86]:
def permanova_on_case_type_subset(df, dist_matrix, case_type_subset, permutations=999):
    """
    Perform PERMANOVA on a subset of the data.

    Parameters:
    - df: DataFrame indexed by sample ID, must include a 'case_type' column
    - dist_matrix: DistanceMatrix object from scikit-bio
    - case_type_subset: list of case_type groups to include in the test
    - permutations: number of permutations used for the p-value (default 999)

    Returns:
    - PERMANOVA result (dict-like with p-value, test statistic, etc.)

    Raises:
    - ValueError if no row of df has a case_type in case_type_subset
    """
    # Keep only the samples that belong to the requested groups
    subset_df = df[df['case_type'].isin(case_type_subset)]
    if subset_df.empty:
        raise ValueError(
            f"No samples with case_type in {list(case_type_subset)}; cannot run PERMANOVA"
        )

    # Restrict the distance matrix to those samples; strict=False ignores IDs
    # that are not present in the matrix
    sub_dm = dist_matrix.filter(subset_df.index, strict=False)

    # Pass the DataFrame plus the column name so the grouping is matched to the
    # distance matrix by sample ID rather than by row position. A bare Series
    # is read positionally and goes wrong if strict=False dropped any ID.
    return permanova(sub_dm, grouping=subset_df, column='case_type',
                     permutations=permutations)

# Calculate P-values for all combinations of nares case_types

In [87]:
# Calculate the PERMANOVA F-statistic and p-value for each pair of nares case types.
# NOTE(review): p-values come from random permutations and no seed is set here, so they
# may differ slightly between runs — confirm how the installed scikit-bio accepts a seed.

# Short display names used to build each result label (e.g. "H vs. AD")
case_type_short_names = {
    "control-anterior nares": "H",
    "case-anterior nares": "AD",
}

case_type_subsets = [
    ["control-anterior nares", "case-anterior nares"]
]

perma_res = {}

for case_type_subset in case_type_subsets:
    result = permanova_on_case_type_subset(spca_df, distance, case_type_subset)

    # Derive the label from the groups actually compared, so adding another
    # subset creates its own entry instead of overwriting "H vs. AD"
    group_label = " vs. ".join(case_type_short_names[ct] for ct in case_type_subset)

    f_val = result["test statistic"]
    p_val = result["p-value"]

    perma_res[group_label] = {
        "p": f"{p_val:.2e}",
        "f": f"{f_val:.2f}"
    }

perma_res



              PC1       PC2       PC3               case_type
900459  -0.010591 -0.088512  0.049942     case-anterior nares
900092   0.045115  0.070730 -0.080180     case-anterior nares
900391  -0.039030  0.000598 -0.045942  control-anterior nares
900466   0.000768  0.004813 -0.166650     case-anterior nares
9003932  0.087608 -0.028640 -0.023018  control-anterior nares
...           ...       ...       ...                     ...
900547   0.086773 -0.084143  0.028597  control-anterior nares
900086  -0.059003  0.072043  0.070186     case-anterior nares
900304   0.045750 -0.024506 -0.042610     case-anterior nares
900580   0.018258  0.175107 -0.096730     case-anterior nares
900484   0.031543 -0.029679 -0.023330     case-anterior nares

[197 rows x 4 columns]


{'H vs. AD': {'p': '8.10e-02', 'f': '2.41'}}

In [88]:
# # Replace the long names with short names in perma_res
# for i in range(len(perma_res)):
#     perma_res[i] = perma_res[i].replace('control-nonlesional skin', 'H')
#     perma_res[i] = perma_res[i].replace('case-nonlesional skin', 'AD-NL') 
#     perma_res[i] = perma_res[i].replace('case-lesional skin', 'AD')

# # View the updated list
# perma_res


In [89]:
# Shorten the case_type labels to the keys used by `palette` (H = control, AD = case).
# NOTE: this overwrites the column in place; the PERMANOVA cell above selects rows by the
# long case_type names, so it must be run before this cell.
spca_df['case_type'] = spca_df['case_type'].replace({
    'control-anterior nares': 'H',
    'case-anterior nares': 'AD'})

spca_df

Unnamed: 0,PC1,PC2,PC3,case_type
900459,-0.010591,-0.088512,0.049942,AD
900092,0.045115,0.070730,-0.080180,AD
900391,-0.039030,0.000598,-0.045942,H
900466,0.000768,0.004813,-0.166650,AD
9003932,0.087608,-0.028640,-0.023018,H
...,...,...,...,...
900547,0.086773,-0.084143,0.028597,H
900086,-0.059003,0.072043,0.070186,AD
900304,0.045750,-0.024506,-0.042610,AD
900580,0.018258,0.175107,-0.096730,AD


# Plot the convex hull PCA for nares samples only

In [91]:
# create beta diversity plot: RPCA PC1 vs. PC2, one point per sample and one convex hull per group
fig, ax = plt.subplots(1, 1, figsize=(4.5,6))

fpca_df.columns = [f"PC{i+1}" for i in range(fpca_df.shape[1])]

# Take the group order from the palette so colours and legend entries follow a known order
hue_order = list(palette)

sns.scatterplot(
    data=spca_df,
    x="PC1",
    y="PC2",
    hue="case_type",
    hue_order=hue_order,
    s=50,
    edgecolor="black",      # Thin black outline
    linewidth=0.5,          # Line thickness
    palette=palette,
    ax=ax
)

for case_type, case_type_df in spca_df.groupby("case_type"):
    # Groups without a palette entry are not drawn by the scatterplot either
    if case_type not in palette:
        continue
    color = palette[case_type]

    points = case_type_df[["PC1", "PC2"]].values
    # A 2-D convex hull needs at least three points
    if len(points) < 3:
        continue
    hull = scipy.spatial.ConvexHull(points)

    # Repeat the first vertex at the end so the outline is a closed polygon
    closed = np.append(hull.vertices, hull.vertices[0])

    ax.plot(
        points[closed, 0],
        points[closed, 1],
        color=color,
        zorder=0
    )
    ax.fill(
        points[hull.vertices, 0],
        points[hull.vertices, 1],
        color=color,
        alpha=0.3
    )

# Build legend labels from the data so each handle gets its own group's name and
# sample count (previously three hard-coded labels were attached to two handles)
group_sizes = spca_df["case_type"].value_counts()
handles, labels = ax.get_legend_handles_labels()
custom_labels = [f"{label} (n={group_sizes.get(label, 0)})" for label in labels]

ax.legend(
    handles=handles,
    labels=custom_labels,
    frameon=False,
    fontsize=10,
    loc='lower right'
)

# Only the first two components are plotted, so only those are read here;
# this no longer assumes the ordination has exactly three components
explained = list(ordination.proportion_explained)
pc1_pct, pc2_pct = [f"RPCA PC{i+1} ({x*100:.2f}%)" for i, x in enumerate(explained[:2])]

ax.set_xlabel(pc1_pct, fontsize = 14)
ax.set_ylabel(pc2_pct, fontsize = 14)

# Same fixed tick positions on both axes; the labels are the tick values themselves
tick_values = [-0.2, -0.1, 0.0, 0.1, 0.2, 0.3]
ax.set_yticks(tick_values)
ax.set_yticklabels(tick_values, fontsize = 12)
ax.set_xticks(tick_values)
ax.set_xticklabels(tick_values, fontsize = 12)

# Writes out the PERMANOVA result (positioned in data coordinates, top-left of the tick range)
ax.text(
    -0.18, 0.28,
    f"H vs. AD: p={perma_res['H vs. AD']['p']},  F={perma_res['H vs. AD']['f']}",
    fontsize=10
)

ax.set_title('Beta Diversity', fontsize = 17)

plt.tight_layout()
plt.savefig("../Plots/Analysis_figures/Diversity/16S_Beta_Diversity_RPCA_nares_only.png", dpi = 600)
plt.show()

  plt.show()
