## Alpha Diversity by Individual AD Status and Region

In [1]:
# Import Python packages
import pandas as pd
import numpy as np
import biom
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import cycle
import os
from matplotlib.colors import ListedColormap
from matplotlib.colors import to_rgba
from scipy.stats import mannwhitneyu
from skbio.diversity import alpha_diversity
from matplotlib.patches import Ellipse
import matplotlib.transforms as transforms
from matplotlib.patches import Circle
from matplotlib.colors import to_hex
import statsmodels.api as sm

In [2]:
# Load the metadata
metadata_path = '../Data/Metadata/updated_clean_ant_skin_metadata.tab'
metadata = pd.read_csv(metadata_path, sep='\t')
metadata['case_type'].value_counts()

case_type
case-nonlesional skin       111
case-anterior nares         108
case-lesional skin          107
control-anterior nares       89
control-nonlesional skin     87
Name: count, dtype: int64

In [3]:
metadata['o_scorad_adj'] = metadata['o_scorad'].fillna(0)
metadata

Unnamed: 0,#sample-id,PlateNumber,PlateLocation,i5,i5Sequence,i7,i7Sequence,identifier,Sequence,Plate ID,...,sex,enrolment_date,enrolment_season,hiv_exposure,hiv_status,household_size,o_scorad,FWD_filepath,REV_filepath,o_scorad_adj
0,Ca009ST_L,1,A1,SA501,ATCGTACG,SA701,CGAGAGTT,SA701SA501,CGAGAGTT-ATCGTACG,1.010000e+21,...,male,4/16/2015,Autumn,Unexposed,negative,4.0,40,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,40
1,900221,1,B1,SA502,ACTATCTG,SA701,CGAGAGTT,SA701SA502,CGAGAGTT-ACTATCTG,1.010000e+21,...,female,8/11/2015,Winter,Unexposed,negative,7.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,34
2,Ca010EB_L,1,C1,SA503,TAGCGAGT,SA701,CGAGAGTT,SA701SA503,CGAGAGTT-TAGCGAGT,1.010000e+21,...,female,11/20/2014,Spring,Unexposed,negative,7.0,21,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,21
3,900460,1,D1,SA504,CTGCGTGT,SA701,CGAGAGTT,SA701SA504,CGAGAGTT-CTGCGTGT,1.010000e+21,...,female,9/23/2015,Spring,Unexposed,,4.0,40,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,40
4,900051,1,E1,SA505,TCATCGAG,SA701,CGAGAGTT,SA701SA505,CGAGAGTT-TCATCGAG,1.010000e+21,...,male,4/21/2015,Autumn,Unexposed,negative,7.0,41,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,41
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
497,Ca006ON_L_2,6,H1,SA508,GACACCGT,SB701,CTCGACTT,SB701SA508,CTCGACTT-GACACCGT,1.010000e+21,...,female,3/25/2015,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,34
498,Ca006ON_NL,6,F2,SA506,CGTGAGTG,SB702,CGAAGTAT,SB702SA506,CGAAGTAT-CGTGAGTG,1.010000e+21,...,female,3/25/2015,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,34
499,Ca006ON_NL_2,6,H2,SA508,GACACCGT,SB702,CGAAGTAT,SB702SA508,CGAAGTAT-GACACCGT,1.010000e+21,...,female,3/25/2015,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,34
500,Ca006ON_PN,6,F3,SA506,CGTGAGTG,SB703,TAGCAGCT,SB703SA506,TAGCAGCT-CGTGAGTG,1.010000e+21,...,female,3/25/2015,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,34


In [4]:
# Create mapping dictionary for case types to individual cases
case_type_mapping = {
    'case-nonlesional skin': 'AD skin',
    'case-anterior nares': 'AD nares', 
    'case-lesional skin': 'AD skin',
    'control-anterior nares': 'H nares',
    'control-nonlesional skin': 'H skin'
}

# Create new column using the mapping
metadata['individual_case'] = metadata['case_type'].map(case_type_mapping)

metadata['individual_case_location'] = metadata['individual_case'] + ' ' + metadata['area']
metadata['individual_case_location'].value_counts()

individual_case_location
AD skin Umtata        122
AD skin Cape Town      96
AD nares Umtata        61
H nares Umtata         55
H skin Umtata          52
AD nares Cape Town     47
H skin Cape Town       35
H nares Cape Town      34
Name: count, dtype: int64

In [5]:
metadata.to_csv('../Data/Metadata/updated_clean_ant_skin_metadata_microbiome_type-location.tab', sep='\t')

In [6]:
def darken_color(color, amount=0.3):
    """
    Darken a given color by a certain amount.
    
    Parameters:
    - color: The base color (as a hex string or color name).
    - amount: The amount to darken the color by (default: 0.3).
    
    Returns:
    - A darkened color as a hex string.
    """
    c = to_rgba(color)
    return (c[0] * (1 - amount), c[1] * (1 - amount), c[2] * (1 - amount), c[3])

In [7]:
def brighten_color(color, amount=0.3):
    """
    Brighten a color by increasing its RGB intensity without blending with white.
    
    Parameters:
    - color: color name or hex code
    - amount: brightness boost factor (0 = no change, 1 = full brightness)

    Returns:
    - Hex string of the brightened color.
    """
    rgba = to_rgba(color)
    r = min(rgba[0] * (1 + amount), 1.0)
    g = min(rgba[1] * (1 + amount), 1.0)
    b = min(rgba[2] * (1 + amount), 1.0)
    return to_hex((r, g, b, rgba[3]))

In [8]:
def plot_faith_pd_histo(metadata, group_col):
    metadata = metadata.set_index('#sample-id')

    # Load Faith PD data
    feature_table = pd.read_csv(
        "../Analyses/Xebec/Output/results/alpha_div/phylo/faith_pd/vector.tsv", sep="\t"
    )
    feature_table = feature_table.set_index(feature_table.columns[0])


    # Keep only overlapping samples
    common_samples = metadata.index.intersection(feature_table.index)
    metadata = metadata.loc[common_samples].copy()
    metadata['Faith_PD'] = feature_table.loc[common_samples]

    # Desired group order
    desired_order = ['H skin Cape Town', 'AD skin Cape Town', 'H skin Umtata', 'AD skin Umtata']

    # Color palette
    # palette = {
    #     'H skin Cape Town': '#A7C7E7',
    #     'H skin Umtata': '#ADD8E6',
    #     'AD skin Cape Town': '#d2b48c',
    #     'AD skin Umtata': '#fa8072'
    # }

    palette = {
        'H skin Cape Town': '#7FBCEB',   # Soft but clearer blue
        'H skin Umtata': '#66C2EE',      # Light turquoise-blue with more saturation
        'AD skin Cape Town': '#C9A34F',  # Warm golden tan, more vibrant than tan
        'AD skin Umtata': '#F0806B'      # Coral-salmon tone, richer than original salmon
    }



    # Add group counts for x-axis labels
    group_counts = metadata[group_col].value_counts().to_dict()
    new_labels = [
        f"H child\n(n={group_counts.get('H skin Cape Town', 0)})",
        f"AD child\n(n={group_counts.get('AD skin Cape Town', 0)})",
        f"H child\n(n={group_counts.get('H skin Umtata', 0)})",
        f"AD child\n(n={group_counts.get('AD skin Umtata', 0)})"
    ]

    # Start figure
    plt.figure(figsize=(8, 4.5))
    ax = sns.boxplot(x=group_col, y='Faith_PD', data=metadata, palette=palette, order=desired_order)

    # Darker dots on top
    # darker_palette = {k: darken_color(v) for k, v in palette.items()}
    brighter_palette = {k: brighten_color(v, amount=0.3) for k, v in palette.items()}

    sns.stripplot(x=group_col, y='Faith_PD', data=metadata, palette=brighter_palette,
                  jitter=True, dodge=False, ax=ax, linewidth=0.6, order=desired_order)

    # Customize axes and labels
    plt.suptitle(f'Alpha Diversity by Region', fontsize=20, y = 1.03)
    plt.title(f'Cape Town                      Umtata', fontsize=16, y = 1)

    plt.xlabel(' ')
    plt.ylabel('Faith PD', fontsize=16)
    plt.xticks(ticks=range(len(new_labels)), labels=new_labels, ha='center', fontsize=16)

    # Pairwise significance testing (only p <= 0.05 shown)
    groups = desired_order
    p_values = {}
    y_max = metadata['Faith_PD'].max()
    height_step = 1.2

    for i, group1 in enumerate(groups):
        for j, group2 in enumerate(groups):
            if i < j:
                vals1 = metadata[metadata[group_col] == group1]['Faith_PD']
                vals2 = metadata[metadata[group_col] == group2]['Faith_PD']
                stat, p = mannwhitneyu(vals1, vals2, alternative='two-sided')
                p_values[f'{group1} vs {group2}'] = p

                # Define special comparison to always annotate
                always_annotate = ('H skin Cape Town', 'AD skin Cape Town')

                # Annotate significant comparisons or the specified non-significant one
                if p <= 0.05 or (group1, group2) == always_annotate or (group2, group1) == always_annotate:
                    if p < 0.001:
                        label = '***  ' + f"{p:.2}"
                    elif p < 0.01:
                        label = '**  ' + f"{p:.2}"
                    elif p < 0.05:
                        label = '*  ' + f"{p:.2}"
                    else:
                        label = f"{p:.2}"

                    x1, x2 = i, j
                    y = y_max + height_step
                    plt.plot([x1, x1, x2, x2], [y, y + 0.1, y + 0.1, y], lw=1, color='black')
                    plt.text((x1 + x2) * 0.5, y + 0.1, label, ha='center', va='bottom', fontsize=10)
                    y_max += height_step + 0.8


    # Save the figure
    plt.savefig('../Plots/Analysis_figures/Diversity/CapeTown_Umtata_FaithPD.png', dpi=600, bbox_inches='tight', pad_inches=0.1)
    plt.savefig('../Plots/Analysis_figures/Diversity/CapeTown_Umtata_FaithPD.svg')

    # Print p-values
    print("Pairwise Mann-Whitney U test p-values:")
    for comparison, p_value in p_values.items():
        print(f"{comparison}: p-value = {p_value:.2e}")

In [9]:
def plot_faith_pd_histo(metadata, group_col):
    import statsmodels.api as sm
    from scipy.stats import mannwhitneyu

    metadata = metadata.set_index('#sample-id')

    # Load Faith PD data
    feature_table = pd.read_csv(
        "../Analyses/Xebec/Output/results/alpha_div/phylo/faith_pd/vector.tsv", sep="\t"
    )
    feature_table = feature_table.set_index(feature_table.columns[0])

    # Keep only overlapping samples
    common_samples = metadata.index.intersection(feature_table.index)
    metadata = metadata.loc[common_samples].copy()
    metadata['Faith_PD'] = feature_table.loc[common_samples]

    # 🔧 SCORAD adjustment via linear regression

    metadata['o_scorad_adj'] = pd.to_numeric(metadata['o_scorad_adj'], errors='coerce')

    # Drop rows with missing values in o_scorad_adj or Faith_PD
    metadata = metadata.dropna(subset=['Faith_PD', 'o_scorad_adj'])
    
    model = sm.OLS(metadata['Faith_PD'], sm.add_constant(metadata['o_scorad_adj'])).fit()
    metadata['Faith_PD_resid'] = model.resid  # <-- SCORAD-adjusted Faith PD

    # Desired group order
    desired_order = ['H skin Cape Town', 'AD skin Cape Town', 'H skin Umtata', 'AD skin Umtata']
    # desired_order = ['AD skin Cape Town', 'AD skin Umtata']

    # Color palette
    palette = {
        'H skin Cape Town': '#7FBCEB',
        'H skin Umtata': '#66C2EE',
        'AD skin Cape Town': '#C9A34F',
        'AD skin Umtata': '#F0806B'
    }

    # Add group counts for x-axis labels
    group_counts = metadata[group_col].value_counts().to_dict()
    new_labels = [
        f"H child\n(n={group_counts.get('H skin Cape Town', 0)})",
        f"AD child\n(n={group_counts.get('AD skin Cape Town', 0)})",
        f"H child\n(n={group_counts.get('H skin Umtata', 0)})",
        f"AD child\n(n={group_counts.get('AD skin Umtata', 0)})"
    ]

    # Start figure
    plt.figure(figsize=(4.5, 4.5))
    ax = sns.boxplot(x=group_col, y='Faith_PD_resid', data=metadata, palette=palette, order=desired_order)

    # Brighter dots for scatter
    brighter_palette = {k: brighten_color(v, amount=0.3) for k, v in palette.items()}
    sns.stripplot(x=group_col, y='Faith_PD_resid', data=metadata, palette=brighter_palette,
                  jitter=True, dodge=False, ax=ax, linewidth=0.6, order=desired_order)

    ax.grid(False)

    # Customize axes and labels
    plt.suptitle(f'SCORAD-Adjusted Alpha Diversity', fontsize=18, y=1.03)
    plt.title(f'Cape Town              Umtata', fontsize=16, y=1)

    plt.xlabel(' ')
    plt.ylabel('Faith PD (residual)', fontsize=16)
    plt.xticks(ticks=range(len(new_labels)), labels=new_labels, ha='center', fontsize=16)

    # Pairwise significance testing on adjusted values
    groups = desired_order
    p_values = {}
    y_max = metadata['Faith_PD_resid'].max()
    height_step = 1.2

    for i, group1 in enumerate(groups):
        for j, group2 in enumerate(groups):
            if i < j:
                vals1 = metadata[metadata[group_col] == group1]['Faith_PD_resid']
                vals2 = metadata[metadata[group_col] == group2]['Faith_PD_resid']

                # ✅ Skip test if either group has no data
                if len(vals1) == 0 or len(vals2) == 0:
                    print(f"Skipping comparison {group1} vs {group2} (one group is empty)")
                    continue

                stat, p = mannwhitneyu(vals1, vals2, alternative='two-sided')
                p_values[f'{group1} vs {group2}'] = p

                # Define special comparison to always annotate
                always_annotate = ('H skin Cape Town', 'AD skin Cape Town')

                if p <= 0.05 or (group1, group2) == always_annotate or (group2, group1) == always_annotate:
                    if p < 0.001:
                        label = '***  ' + f"{p:.2}"
                    elif p < 0.01:
                        label = '**  ' + f"{p:.2}"
                    elif p < 0.05:
                        label = '*  ' + f"{p:.2}"
                    else:
                        label = f"{p:.2}"

                    x1, x2 = i, j
                    y = y_max + height_step
                    plt.plot([x1, x1, x2, x2], [y, y + 0.1, y + 0.1, y], lw=1, color='black')
                    plt.text((x1 + x2) * 0.5, y + 0.1, label, ha='center', va='bottom', fontsize=10)
                    y_max += height_step + 0.8

    # Optional: display R² from regression
    # r_squared = model.rsquared
    # ax.text(0.98, 0.98, f"R² = {r_squared:.2f}", transform=ax.transAxes,
    #         ha='right', va='top', fontsize=10, color='black')

    # Save the figure
    plt.savefig('../Plots/Analysis_figures/Diversity/CapeTown_Umtata_FaithPD_SCORADadjusted.png', dpi=600, bbox_inches='tight', pad_inches=0.1)
    plt.savefig('../Plots/Analysis_figures/Diversity/CapeTown_Umtata_FaithPD_SCORADadjusted.svg')

    # Print p-values
    print("Pairwise Mann-Whitney U test p-values (on SCORAD-adjusted Faith PD):")
    for comparison, p_value in p_values.items():
        print(f"{comparison}: p-value = {p_value:.2e}")


In [10]:
# Plot Alpha Diversity plots for both V1-V3 and V4
plot_faith_pd_histo(
    metadata=metadata,
    group_col='individual_case_location'
)


  sns.stripplot(x=group_col, y='Faith_PD_resid', data=metadata, palette=brighter_palette,
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  data_subset = grouped_data.get_group(pd_key)


FileNotFoundError: [Errno 2] No such file or directory: '/Users/yangchen/PhD/Gallo_lab/16S_AD_South-Africa/Plots/Analysis_figures/Diversity/CapeTown_Umtata_FaithPD_SCORADadjusted.png'

In [1571]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load Faith PD
faith_df = pd.read_csv(
    "../Analyses/Xebec/Output/results/alpha_div/phylo/faith_pd/vector.tsv", sep="\t"
)
faith_df = faith_df.set_index(faith_df.columns[0])  # Set sample ID as index
faith_df.columns = ['Faith_PD']  # Rename value column

# Make sure metadata is indexed by sample ID
metadata = metadata.set_index('#sample-id') if '#sample-id' in metadata.columns else metadata

# Join Faith PD values to metadata
metadata = metadata.join(faith_df, how='inner')

# Ensure SCORAD and Faith PD are numeric
metadata['o_scorad'] = pd.to_numeric(metadata['o_scorad'], errors='coerce')
metadata['Faith_PD'] = pd.to_numeric(metadata['Faith_PD'], errors='coerce')

# Drop samples with missing data
plot_df = metadata.dropna(subset=['o_scorad', 'Faith_PD', 'area'])  # 'area' should be 'Cape Town' or 'Umtata'

# Plot regression: SCORAD vs Faith PD, colored by region
sns.set(style="whitegrid")
plot = sns.lmplot(
    data=plot_df,
    x='o_scorad',
    y='Faith_PD',
    hue='area',
    palette={'Cape Town': '#C9A34F', 'Umtata': '#F0806B'},
    height=5,
    aspect=1.2,
    scatter_kws={'alpha': 0.7, 's': 40},
    line_kws={'linewidth': 2}
)

# Title and labels
plt.title("Faith PD vs SCORAD by Region", fontsize=16)
plt.xlabel("SCORAD", fontsize=14)
plt.ylabel("Faith Phylogenetic Diversity", fontsize=14)
plt.tight_layout()

# Save plot
plt.savefig("../Plots/Analysis_figures/Diversity/FaithPD_vs_SCORAD_by_region.png", dpi=600)


## Beta Diversity by Individual AD Status and Region

In [1572]:
# import Python packages
import pandas as pd
import numpy as np
from numpy import array
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.gridspec import GridSpec
import scipy
import scipy.stats as ss
from skbio.stats.distance import permanova
import biom
from biom import load_table
from biom.util import biom_open
from gemelli.rpca import rpca

### See RPCA standalone python tutorial: 
# https://github.com/biocore/gemelli/blob/master/ipynb/tutorials/RPCA-moving-pictures-standalone-cli-and-api.ipynb

In [1573]:
# # read in biom table
# biom_tbl = biom.load_table("/Users/annanguyen/16S_AD_South-Africa/Data/Tables/Relative_Abundance_Tables/df_16S_filtered_feature_table_rare_Genus_relative_abundance.biom")

# print(biom_tbl.ids(axis= 'sample'))
# len(biom_tbl.ids(axis= 'sample'))
# type(biom_tbl.ids(axis = 'sample'))

# Function to load BIOM table, collapse by taxa, sort rows by row sum, remove specified samples, and convert to relative abundance
def load_biom_table(biom_path, metadata_path):
    # Load BIOM table and convert to a DataFrame
    table = biom.load_table(biom_path)
    df = pd.DataFrame(table.matrix_data.toarray(),
                      index=table.ids(axis='observation'),
                      columns=table.ids(axis='sample'))
    
    # df = df.T
    # Sort rows by row sum in descending order
    df['row_sum'] = df.sum(axis=1)
    df = df.sort_values(by='row_sum', ascending=False)
    
    # Drop the 'row_sum' column before proceeding
    df = df.drop(columns=['row_sum'])

    # Replace ' g__' rows with ' g__Unknown'
    df.index = df.index.map(lambda x: ' g__Unknown' if x == ' g__' else x)

    # Remove '15564.' prefix from columns
    df.columns = df.columns.str.replace('15564.', '')

     # Load metadata as a DataFrame from the file path
    metadata = pd.read_csv(metadata_path, sep='\t')
    metadata['#sample-id'] = metadata['#sample-id'].str.replace('_', '')

    # Set Sample-ID as the index for the metadata dataframe 
    metadata = metadata.set_index('#sample-id')

    # Subset metadata to only contain the same samples as in the BIOM df
    metadata_sub = metadata.loc[df.columns]

    # Get a list of the sample ids that are just skin
    metadataindex = metadata_sub[metadata_sub['case_type'].str.endswith('nares')].index.tolist()
    
    
    # Drop all sample-ids that correspond to nares in the BIOM table DataFrame
    df = df.drop(columns= metadataindex)

    # Subsets the Metadata to only contain the sample ids that are in the newly subsetted biom table DataFrame with only Skin
    metadata_sub = metadata.loc[df.columns]

    
    #returns BIOM table DataFrame and the metadata that has been subsetted to only contain skin
    return df, metadata_sub


# converts the subsetted skin only BIOM table DataFrame back into a BIOM to run RPCA
def convert_df_to_biom(table, biom_output_file):
    obs_ids = table.index
    samp_ids = table.columns
    biom_table = biom.table.Table(table.values, observation_ids=obs_ids, sample_ids=samp_ids) # Convert df back to a biom table
    #biom_output_file = f"../tables/{name}_subset.biom" # Path to output file

    with biom_open(biom_output_file, 'w') as f: # This does the actual saving!
        biom_table.to_hdf5(f, generated_by="subsetted tables")

    return biom_table



In [1574]:
# specify the path to the BIOM table and Metadata
# biom_path = "../Data/Tables/Relative_Abundance_Tables/df_16S_filtered_feature_table_rare_Genus_relative_abundance.biom"
biom_path = "../Data/Tables/Absolute_Abundance_Tables/209766_filtered_feature_table.biom"
metadata_path = '../Data/Metadata/updated_clean_ant_skin_metadata_microbiome_type-location.tab'
skin_only_table, metadata_sub = load_biom_table(biom_path, metadata_path)

skin_only_table

Unnamed: 0,900344,900221,900570,900129,900321,900091,900245,900423,900581,900145,...,900094,900287,900225,900057,900294,9003972,900097,900498,900276,900406
GTGCCAGCAGCCGCGGTAATACGTAGGTCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGCAGGCGGTTAGATAAGTCTGAAGTTAAAGGCTG,984.0,22.0,389.0,0.0,26.0,27.0,40.0,0.0,198.0,0.0,...,0.0,57.0,44.0,111.0,12.0,1168.0,24.0,15.0,0.0,0.0
GTGCCAGCAGCCGCGGTAATACGTAGGTGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGCGCGCGTAGGCGGTTTTTTAAGTCTGATGTGAAAGCCCA,0.0,0.0,1294.0,182.0,9.0,296.0,159.0,49.0,1366.0,27.0,...,0.0,108.0,30.0,93.0,13.0,794.0,171.0,28.0,0.0,12.0
GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATAACTGGGCGTAAAGGGCACGCAGGCGGTTATTTAAGTGAGGTGTGAAAGCCCC,0.0,2.0,9.0,0.0,0.0,0.0,0.0,0.0,22.0,0.0,...,0.0,2.0,0.0,0.0,0.0,74.0,0.0,0.0,0.0,0.0
GTGCCAGCCGCCGCGGTAATACGTAGGTGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGCGCGCGTAGGCGGTTTTTTAAGTCTGATGTGAAAGCCCA,0.0,0.0,236.0,0.0,0.0,0.0,0.0,57.0,799.0,37.0,...,0.0,0.0,4.0,116.0,0.0,477.0,94.0,11.0,0.0,7.0
GTGCCAGCCGCCGCGGTAATACGTAGGTCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGCAGGCGGTTAGATAAGTCTGAAGTTAAAGGCTG,611.0,0.0,0.0,0.0,0.0,16.0,0.0,0.0,0.0,6.0,...,0.0,0.0,0.0,0.0,7.0,593.0,0.0,17.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTGCCAGCAGCCGCGGTAATACGGAGGGTCCGAGCGTTATCCGGAATCATTGGGTTTAAAGGGTCCGTAGGCGGCTTTATAAGTCAGTGGTGAAATCCGG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
GTGCCAGCAGCCGCGGTAATACGTAGGGTGCAAGCGTTGTCCGGAATTATTGGGCGTAAAGAGCTCGTAGGCGGTTTGTCGCGTCTGCTGTGAAATTCCG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
GTGCCAGCCGCCGCGGTAATACGTAGGGCGCAAGCGTTGTCCGGAATTATTGGGCGTAAAGAGCTCGTAGGCGGTTTGTCGCGTCTGGTGTGAAAACTCG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
GTGCCAGAGCCGCGGTAATACGTAGGTCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGCAGGCGGTTAGATAAGTCTGAAGTTAAAGGCTGT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [1575]:
# Create the path to the skin BIOM file by using the prefix of the original BIOM file
skin_only_biom_path = biom_path.removesuffix('.biom') + '_skin_only.biom'

# Print the path that was created
print(f'{skin_only_biom_path = }')

# Converting df to BIOM table and saving the biom table to the path specified above
biom_tbl = convert_df_to_biom(skin_only_table, skin_only_biom_path)

skin_only_biom_path = '../Data/Tables/Absolute_Abundance_Tables/209766_filtered_feature_table_skin_only.biom'


In [1576]:
# perform RPCA with auto rank estimation
np.seterr(divide = 'ignore')
ordination, distance = rpca(biom_tbl)

# extract and view sample ordinations from RPCA result
spca_df = ordination.samples

# Add a case type column into the spca_df using the meta_data and matching by indices by the join function
spca_df = spca_df.join(metadata_sub['individual_case_location'])

# Map o_scorad column from metadata to spca_df
spca_df = spca_df.join(metadata_sub['o_scorad'])


spca_df

Unnamed: 0,PC1,PC2,PC3,individual_case_location,o_scorad
900344,-0.031259,-0.030585,0.045341,H skin Umtata,
900221,0.034560,0.014311,0.101067,AD skin Umtata,34
900570,0.051177,-0.101247,-0.020742,AD skin Cape Town,36
900129,0.061599,-0.018120,-0.002876,AD skin Cape Town,44
900321,0.002420,0.003792,0.040013,H skin Umtata,
...,...,...,...,...,...
9003972,-0.107095,-0.089850,-0.019500,AD skin Umtata,54
900097,0.025972,0.031322,-0.000608,AD skin Cape Town,44
900498,-0.033348,0.048009,0.030825,AD skin Umtata,
900276,-0.108307,-0.102284,0.005354,AD skin Umtata,52


In [1577]:
# extract and view feature ordinations from RPCA result
fpca_df = ordination.features
fpca_df.head()

Unnamed: 0,PC1,PC2,PC3
GTGCCAGCAGCCGCGGTAATACGTAGGTCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGCAGGCGGTTAGATAAGTCTGAAGTTAAAGGCTG,-0.176233,-0.363698,0.052031
GTGCCAGCAGCCGCGGTAATACGTAGGTGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGCGCGCGTAGGCGGTTTTTTAAGTCTGATGTGAAAGCCCA,0.031293,-0.18116,-0.635457
GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATAACTGGGCGTAAAGGGCACGCAGGCGGTTATTTAAGTGAGGTGTGAAAGCCCC,-0.137862,-0.125219,0.075513
GTGCCAGCCGCCGCGGTAATACGTAGGTGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGCGCGCGTAGGCGGTTTTTTAAGTCTGATGTGAAAGCCCA,0.018601,-0.079648,-0.607929
GTGCCAGCCGCCGCGGTAATACGTAGGTCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGCAGGCGGTTAGATAAGTCTGAAGTTAAAGGCTG,-0.200034,-0.269834,0.054126


In [1578]:
def permanova_on_case_type_subset(df, dist_matrix, case_type_subset):
    """
    Perform PERMANOVA on a subset of the data.
    
    Parameters:
    - df: DataFrame with metadata, must include the grouping variable
    - dist_matrix: DistanceMatrix object from scikit-bio
    - case_type_subset: list of case_type groups to include in the test
    
    Returns:
    - PERMANOVA result (dict-like with p-value, test statistic, etc.)
    """
    # Subset the DataFrame
    
    subset_df = df[df['individual_case_location'].isin(case_type_subset)]
    print(subset_df)
    
    # Get the matching IDs and subset distance matrix
    ids = subset_df.index
    print(ids)
    sub_dm = dist_matrix.filter(ids, strict=False)
    
    # Run PERMANOVA
    result = permanova(sub_dm, grouping=subset_df['individual_case_location'], permutations=999)
    
    return result

In [1579]:
# calculate permanova F-statistic for all combinations between Healthy, case lesional and case non lesional

case_type_subsets = [
    ["H skin Cape Town", "AD skin Cape Town"],
    ["H skin Umtata", "AD skin Umtata"]]

perma_res = {}

for i, case_type_subset in enumerate(case_type_subsets):
    # print("Subset case_type:", case_type_subset)
    # print("Available sample IDs:", spca_df.index.tolist())

    result = permanova_on_case_type_subset(spca_df, distance, case_type_subset)
    group_label = (
        "Skin of Healthy vs. AD Children in Cape Town" if i == 0 else
        "Skin of Healthy vs. AD Children in Umtata"
    )
    f_val = result["test statistic"]
    p_val = result["p-value"]
    
    perma_res[group_label] = {
        "p": f"{p_val:.2e}",
        "f": f"{f_val:.2f}"
    }

perma_res



              PC1       PC2       PC3 individual_case_location o_scorad
900344  -0.031259 -0.030585  0.045341            H skin Umtata      NaN
900221   0.034560  0.014311  0.101067           AD skin Umtata       34
900570   0.051177 -0.101247 -0.020742        AD skin Cape Town       36
900129   0.061599 -0.018120 -0.002876        AD skin Cape Town       44
900321   0.002420  0.003792  0.040013            H skin Umtata      NaN
...           ...       ...       ...                      ...      ...
9003972 -0.107095 -0.089850 -0.019500           AD skin Umtata       54
900097   0.025972  0.031322 -0.000608        AD skin Cape Town       44
900498  -0.033348  0.048009  0.030825           AD skin Umtata      NaN
900276  -0.108307 -0.102284  0.005354           AD skin Umtata       52
900406  -0.041063  0.053825  0.033240           AD skin Umtata       28

[305 rows x 5 columns]
Index(['900570', '900129', '900091', '900581', '900145', '900130', '900544',
       '900110', '900105', '900600'

{'Skin of Healthy vs. AD Children in Cape Town': {'p': '2.00e-02',
  'f': '3.50'},
 'Skin of Healthy vs. AD Children in Umtata': {'p': '1.00e-03', 'f': '25.50'}}

In [1580]:
spca_df['individual_case_location'] = spca_df['individual_case_location'].replace({
    'H skin Cape Town': 'H Child CT',
    'AD skin Cape Town': 'AD Child CT',
    'H skin Umtata': 'H Child UM',
    'AD skin Umtata': 'AD Child UM'
})

spca_df['o_scorad'] = pd.to_numeric(spca_df['o_scorad'], errors='coerce').astype('Int64')

spca_df['o_scorad_adj'] = spca_df['o_scorad'].fillna(0)

spca_df['severity_group'] = spca_df['o_scorad'].apply(lambda x: 'severe' if x > 50 else 'mild/moderate')

spca_df

Unnamed: 0,PC1,PC2,PC3,individual_case_location,o_scorad,o_scorad_adj,severity_group
900344,-0.031259,-0.030585,0.045341,H Child UM,,0,mild/moderate
900221,0.034560,0.014311,0.101067,AD Child UM,34,34,mild/moderate
900570,0.051177,-0.101247,-0.020742,AD Child CT,36,36,mild/moderate
900129,0.061599,-0.018120,-0.002876,AD Child CT,44,44,mild/moderate
900321,0.002420,0.003792,0.040013,H Child UM,,0,mild/moderate
...,...,...,...,...,...,...,...
9003972,-0.107095,-0.089850,-0.019500,AD Child UM,54,54,severe
900097,0.025972,0.031322,-0.000608,AD Child CT,44,44,mild/moderate
900498,-0.033348,0.048009,0.030825,AD Child UM,,0,mild/moderate
900276,-0.108307,-0.102284,0.005354,AD Child UM,52,52,severe


In [1581]:
print("Columns in spca_df:", spca_df.columns)

Columns in spca_df: Index(['PC1', 'PC2', 'PC3', 'individual_case_location', 'o_scorad',
       'o_scorad_adj', 'severity_group'],
      dtype='object')


In [1582]:
spca_df["simplified_label"] = spca_df["individual_case_location"].replace({
    "H Child CT": "H Child",
    "AD Child CT": "AD Child",
    "H Child UM": "H Child",
    "AD Child UM": "AD Child"
})


In [1583]:
# Set the color palette for the groups in the correct order
palette = {
    'H Child CT': '#A7C7E7',
    'H Child UM': '#ADD8E6',
    'AD Child CT': '#d2b48c',
    'AD Child UM': '#fa8072'
}

In [1584]:
# import numpy as np
# import pandas as pd
# import seaborn as sns
# import matplotlib.pyplot as plt
# from matplotlib.patches import Circle

# # Create label map with (n=#)
# label_map = spca_df["individual_case_location"].value_counts().to_dict()
# label_map = {k: f"{k} (n={v})" for k, v in label_map.items()}

# # Add labeled group column
# spca_df["group_label"] = spca_df["individual_case_location"].map(label_map)

# # Create figure
# fig, axes = plt.subplots(1, 2, figsize=(9, 5), sharex=True, sharey=True)

# # Rename ordination axes
# fpca_df.columns = [f"PC{i+1}" for i in range(fpca_df.shape[1])]

# split_map = {
#     "Cape Town": ["H Child CT", "AD Child CT"],
#     "Umtata": ["H Child UM", "AD Child UM"]
# }

# for ax, (title, groups) in zip(axes, split_map.items()):
#     subset_df = spca_df[spca_df["individual_case_location"].isin(groups)].copy()
#     subset_df["group_label"] = subset_df["individual_case_location"].map(label_map)
    
#     sns.scatterplot(
#         data=subset_df,
#         x="PC1",
#         y="PC2",
#         hue="group_label",
#         hue_order=[label_map[g] for g in groups],
#         s=50,
#         edgecolor="black",
#         linewidth=0.5,
#         palette={label_map[g]: palette[g] for g in groups},
#         ax=ax
#     )
    
#     for case_type, case_type_df in subset_df.groupby("individual_case_location"):
#         color = palette[case_type]
#         points = case_type_df[["PC1", "PC2"]].values

#         centroid = points.mean(axis=0)
#         dists = np.linalg.norm(points - centroid, axis=1)
#         radius = np.percentile(dists, 90)

#         circle = Circle(centroid, radius, edgecolor=color, facecolor=color, alpha=0.2, lw=1, zorder=0)
#         ax.add_patch(circle)

#     ax.set_title(title, fontsize=16)
#     ax.set_xlabel("")
#     ax.set_ylabel("")
#     ax.legend(frameon=False, fontsize=9, loc='upper right')

# # Shared axis labels and ticks
# pc1_pct, pc2_pct, _ = [f"RPCA PC{i+1} ({x*100:.2f}%)" for i, x in enumerate(ordination.proportion_explained)]
# axes[0].set_ylabel(pc2_pct, fontsize=14)
# for ax in axes:
#     ax.set_xlabel(pc1_pct, fontsize=14)
#     ax.set_xticks([-0.2, -0.1, 0.0, 0.1, 0.2, 0.3])
#     ax.set_xticklabels([-0.2, -0.1, 0.0, 0.1, 0.2, 0.3], fontsize=12)
#     ax.set_yticks([-0.2, -0.1, 0.0, 0.1, 0.2, 0.3])
#     ax.set_yticklabels([-0.2, -0.1, 0.0, 0.1, 0.2, 0.3], fontsize=12)

# # PERMANOVA stats
# axes[0].text(
#     -0.2, 0.27,
#     f"p={float(perma_res['Skin of Healthy vs. AD Children in Cape Town']['p']):.3f}, "
#     f"F={float(perma_res['Skin of Healthy vs. AD Children in Cape Town']['f']):.2f}",
#     fontsize=12
# )
# axes[1].text(
#     -0.2, 0.27,
#     f"p={float(perma_res['Skin of Healthy vs. AD Children in Umtata']['p']):.3f}, "
#     f"F={float(perma_res['Skin of Healthy vs. AD Children in Umtata']['f']):.2f}",
#     fontsize=12
# )

# # Final touches
# fig.suptitle("Beta Diversity by Region", fontsize=20, y=0.92)
# plt.tight_layout(rect=[0, 0, 1, 0.95])
# plt.savefig("../Plots/Analysis_figures/Diversity/16S_Beta_Diversity_RPCA_skin_by-location_split.png", dpi=600)


In [1585]:
spca_df

Unnamed: 0,PC1,PC2,PC3,individual_case_location,o_scorad,o_scorad_adj,severity_group,simplified_label
900344,-0.031259,-0.030585,0.045341,H Child UM,,0,mild/moderate,H Child
900221,0.034560,0.014311,0.101067,AD Child UM,34,34,mild/moderate,AD Child
900570,0.051177,-0.101247,-0.020742,AD Child CT,36,36,mild/moderate,AD Child
900129,0.061599,-0.018120,-0.002876,AD Child CT,44,44,mild/moderate,AD Child
900321,0.002420,0.003792,0.040013,H Child UM,,0,mild/moderate,H Child
...,...,...,...,...,...,...,...,...
9003972,-0.107095,-0.089850,-0.019500,AD Child UM,54,54,severe,AD Child
900097,0.025972,0.031322,-0.000608,AD Child CT,44,44,mild/moderate,AD Child
900498,-0.033348,0.048009,0.030825,AD Child UM,,0,mild/moderate,AD Child
900276,-0.108307,-0.102284,0.005354,AD Child UM,52,52,severe,AD Child


In [1586]:
spca_df['o_scorad_adj'] = spca_df['o_scorad_adj'].astype(float)


In [1587]:
spca_df['PC1'].dtype


dtype('float64')

In [1588]:
spca_df['PC2'].dtype


dtype('float64')

In [1589]:
spca_df['o_scorad_adj'].dtype


dtype('float64')

In [1590]:
spca_df

Unnamed: 0,PC1,PC2,PC3,individual_case_location,o_scorad,o_scorad_adj,severity_group,simplified_label
900344,-0.031259,-0.030585,0.045341,H Child UM,,0.0,mild/moderate,H Child
900221,0.034560,0.014311,0.101067,AD Child UM,34,34.0,mild/moderate,AD Child
900570,0.051177,-0.101247,-0.020742,AD Child CT,36,36.0,mild/moderate,AD Child
900129,0.061599,-0.018120,-0.002876,AD Child CT,44,44.0,mild/moderate,AD Child
900321,0.002420,0.003792,0.040013,H Child UM,,0.0,mild/moderate,H Child
...,...,...,...,...,...,...,...,...
9003972,-0.107095,-0.089850,-0.019500,AD Child UM,54,54.0,severe,AD Child
900097,0.025972,0.031322,-0.000608,AD Child CT,44,44.0,mild/moderate,AD Child
900498,-0.033348,0.048009,0.030825,AD Child UM,,0.0,mild/moderate,AD Child
900276,-0.108307,-0.102284,0.005354,AD Child UM,52,52.0,severe,AD Child


In [1591]:
# # Rename ordination axes
# fpca_df.columns = [f"PC{i+1}" for i in range(fpca_df.shape[1])]

# # Merge ordination PCs into spca_df
# spca_df[['PC1', 'PC2']] = fpca_df[['PC1', 'PC2']]
# # spca_df = spca_df.dropna(subset=['PC1', 'PC2', 'o_scorad_adj']).copy()
# spca_df



In [1592]:
# Residualize PC1 and PC2
X = sm.add_constant(spca_df['o_scorad_adj'])
model_pc1 = sm.OLS(spca_df['PC1'], X).fit()
model_pc2 = sm.OLS(spca_df['PC2'], X).fit()
spca_df['PC1_resid'] = model_pc1.resid
spca_df['PC2_resid'] = model_pc2.resid

# Create label map with (n=#)
label_map = spca_df["individual_case_location"].value_counts().to_dict()
label_map = {k: f"{k} (n={v})" for k, v in label_map.items()}

# Add labeled group column
spca_df["group_label"] = spca_df["individual_case_location"].map(label_map)

# Create figure
fig, axes = plt.subplots(1, 2, figsize=(8, 5.5), sharex=True, sharey=True)

# Define groups per panel
split_map = {
    "Cape Town": ["H Child CT", "AD Child CT"],
    "Umtata": ["H Child UM", "AD Child UM"]
}


# Plot loop
for ax, (title, groups) in zip(axes, split_map.items()):
    subset_df = spca_df[spca_df["individual_case_location"].isin(groups)].copy()
    subset_df["group_label"] = subset_df["individual_case_location"].map(label_map)

    ax.grid(False)

    sns.scatterplot(
        data=subset_df,
        x="PC1_resid",
        y="PC2_resid",
        hue="group_label",
        hue_order=[label_map[g] for g in groups],
        s=50,
        edgecolor="black",
        linewidth=0.5,
        palette={label_map[g]: palette[g] for g in groups},
        ax=ax
    )

    # Draw 90% confidence circles
    for case_type, case_type_df in subset_df.groupby("individual_case_location"):
        color = palette[case_type]
        points = case_type_df[["PC1_resid", "PC2_resid"]].values
        centroid = points.mean(axis=0)
        dists = np.linalg.norm(points - centroid, axis=1)
        radius = np.percentile(dists, 90)
        circle = Circle(centroid, radius, edgecolor=color, facecolor=color, alpha=0.2, lw=1, zorder=0)
        ax.add_patch(circle)

    ax.set_title(title, fontsize=16)
    ax.set_xlabel("")
    ax.set_ylabel("")
    ax.legend(frameon=False, fontsize=9, loc='upper right')

# Axis labels (now residualized)
axes[0].set_ylabel("Residual PC2 (adj. for SCORAD)", fontsize=14)
for ax in axes:
    ax.set_xlabel("Residual PC1 (adj. for SCORAD)", fontsize=14)
    ax.set_xticks([-0.2, -0.1, 0.0, 0.1, 0.2, 0.3])
    ax.set_xticklabels([-0.2, -0.1, 0.0, 0.1, 0.2, 0.3], fontsize=12)
    ax.set_yticks([-0.2, -0.1, 0.0, 0.1, 0.2, 0.3])
    ax.set_yticklabels([-0.2, -0.1, 0.0, 0.1, 0.2, 0.3], fontsize=12)

# PERMANOVA stats
axes[0].text(
    -0.2, 0.27,
    f"p={float(perma_res['Skin of Healthy vs. AD Children in Cape Town']['p']):.3f}, "
    f"F={float(perma_res['Skin of Healthy vs. AD Children in Cape Town']['f']):.2f}",
    fontsize=12
)
axes[1].text(
    -0.2, 0.27,
    f"p={float(perma_res['Skin of Healthy vs. AD Children in Umtata']['p']):.3f}, "
    f"F={float(perma_res['Skin of Healthy vs. AD Children in Umtata']['f']):.2f}",
    fontsize=12
)

# Final touches
fig.suptitle("SCORAD-Adjusted Beta Diversity", fontsize=18, y=0.93)
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.savefig("../Plots/Analysis_figures/Diversity/16S_Beta_Diversity_RPCA_skin_by-location_split_residualized.png", dpi=600)