# RCLR severity correlation boxplots (non-asv)

In [195]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import biom
from biom import load_table
import numpy as np
from scipy.stats import spearmanr
from scipy.stats import mannwhitneyu
from statsmodels.stats.multitest import multipletests
import itertools
import matplotlib.patches
from scipy.stats import pearsonr



In [196]:
# Read the tab-separated sample metadata, index it by a cleaned sample ID and
# derive a short `group` label for every sample.
metadata_path = '../Data/Metadata/updated_clean_ant_skin_metadata.tab'

metadata = (
    pd.read_csv(metadata_path, sep='\t')
    # Strip underscores from the sample IDs before they become the index
    .assign(**{'#sample-id': lambda frame: frame['#sample-id'].str.replace('_', '')})
    .set_index('#sample-id')
    # Collapse the verbose case_type values into short group names;
    # any case_type not listed in the mapping ends up as NaN
    .assign(group=lambda frame: frame['case_type'].map({
        'case-lesional skin': 'skin-ADL',
        'case-nonlesional skin': 'skin-ADNL',
        'control-nonlesional skin': 'skin-H',
        'case-anterior nares': 'nares-AD',
        'control-anterior nares': 'nares-H'
    }))
)

metadata

Unnamed: 0_level_0,PlateNumber,PlateLocation,i5,i5Sequence,i7,i7Sequence,identifier,Sequence,Plate ID,Well location,...,sex,enrolment_date,enrolment_season,hiv_exposure,hiv_status,household_size,o_scorad,FWD_filepath,REV_filepath,group
#sample-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ca009STL,1,A1,SA501,ATCGTACG,SA701,CGAGAGTT,SA701SA501,CGAGAGTT-ATCGTACG,1.010000e+21,A1,...,male,4/16/2015,Autumn,Unexposed,negative,4.0,40,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL
900221,1,B1,SA502,ACTATCTG,SA701,CGAGAGTT,SA701SA502,CGAGAGTT-ACTATCTG,1.010000e+21,B1,...,female,8/11/2015,Winter,Unexposed,negative,7.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL
Ca010EBL,1,C1,SA503,TAGCGAGT,SA701,CGAGAGTT,SA701SA503,CGAGAGTT-TAGCGAGT,1.010000e+21,C1,...,female,11/20/2014,Spring,Unexposed,negative,7.0,21,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL
900460,1,D1,SA504,CTGCGTGT,SA701,CGAGAGTT,SA701SA504,CGAGAGTT-CTGCGTGT,1.010000e+21,D1,...,female,9/23/2015,Spring,Unexposed,,4.0,40,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL
900051,1,E1,SA505,TCATCGAG,SA701,CGAGAGTT,SA701SA505,CGAGAGTT-TCATCGAG,1.010000e+21,E1,...,male,4/21/2015,Autumn,Unexposed,negative,7.0,41,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Ca006ONL2,6,H1,SA508,GACACCGT,SB701,CTCGACTT,SB701SA508,CTCGACTT-GACACCGT,1.010000e+21,H1,...,female,3/25/2015,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL
Ca006ONNL,6,F2,SA506,CGTGAGTG,SB702,CGAAGTAT,SB702SA506,CGAAGTAT-CGTGAGTG,1.010000e+21,F2,...,female,3/25/2015,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADNL
Ca006ONNL2,6,H2,SA508,GACACCGT,SB702,CGAAGTAT,SB702SA508,CGAAGTAT-GACACCGT,1.010000e+21,H2,...,female,3/25/2015,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADNL
Ca006ONPN,6,F3,SA506,CGTGAGTG,SB703,TAGCAGCT,SB703SA506,TAGCAGCT-CGTGAGTG,1.010000e+21,F3,...,female,3/25/2015,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,nares-AD


In [197]:
# Set microbiome type.
# This string is matched as a prefix against metadata['group'] when the feature
# table is subset to samples, so it should be one of the prefixes used by the
# group labels ('skin' or 'nares').
microbiome_type = 'nares'

In [198]:
# Read in the genus-level feature table and transpose it so that sample IDs
# form the index and genera form the columns.
biom_path = '../Data/Tables/Absolute_Abundance_Tables/209766_filtered_by_prevalence_1pct_rare_Genus.biom'
biom_tbl = load_table(biom_path)
df = biom_tbl.to_dataframe().T

# Delete the '15564.' prefix from the sample IDs. The pattern is anchored and
# the dot escaped so that only a literal leading '15564.' is removed; an
# unanchored '15564.' treated as a regex would match '15564' followed by any
# character, anywhere in the ID.
df.index = df.index.str.replace(r'^15564\.', '', regex=True)

# Sample IDs whose group label starts with the selected microbiome type
# (the variable keeps its historical name even when microbiome_type is not
# 'skin'). na=False treats samples without a group label as non-matching
# instead of propagating NaN into the boolean mask.
skin_samples = metadata[metadata['group'].str.startswith(microbiome_type, na=False)].index

# Keep only the selected samples (raises KeyError if any selected sample ID is
# absent from the table)
df = df.loc[skin_samples]
df

Unnamed: 0_level_0,g__Streptococcus,g__Staphylococcus,g__,g__Haemophilus_D_734546,g__Corynebacterium,g__Acinetobacter,g__Prevotella,g__Micrococcus,g__Dolosigranulum,g__Cutibacterium,...,g__Eubacterium_M,g__Lachnospira,g__Helcococcus,g__Enterenecus,g__Evtepia,g__Fimenecus,g__CAG-41,g__Selenomonas_B_42762,g__Facklamia_A_322620,g__Eubacterium_G
#sample-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ca009STPN,51.0,0,32.0,0,193.0,0,0,0,48.0,0,...,0,0,0,0,0,0,0,0,0,0
900223,3.0,0,0,0,23.0,0,0,0,2.0,0,...,0,0,0,0,0,0,0,0,0,0
Ca010EBPN,83.0,4.0,100.0,0,55.0,0,8.0,0,23.0,29.0,...,0,0,0,0,0,0,0,0,0,0
900462,169.0,2.0,36.0,36.0,73.0,0,7.0,4.0,3.0,1.0,...,0,0,0,0,0,0,0,0,0,0
900053,188.0,0,0,144.0,0,0,0,0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9003992,31.0,5.0,10.0,0,148.0,0,5.0,0,60.0,4.0,...,0,0,0,0,0,0,0,0,0,0
900402,21.0,74.0,20.0,0,119.0,21.0,0,14.0,0,7.0,...,0,0,0,0,0,0,0,0,0,0
9004022,9.0,0,14.0,301.0,11.0,0,4.0,0,6.0,0,...,0,0,0,0,0,0,0,0,0,0
Ca006ONPN,1.0,6.0,2.0,69.0,45.0,0,0,0,227.0,0,...,0,0,0,0,0,0,0,0,0,0


In [199]:
def rclr_transform(df, pseudocount=1e-6):
    """
    Apply a Robust Centered Log-Ratio (RCLR) transformation to a DataFrame.

    Zeros are treated as missing: they are masked before the log is taken and
    are ignored when the per-row mean of the log values (the log of the row's
    geometric mean) is computed. Centering is done within each row.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric table. Columns may be dense, pandas-sparse, or a mix of both.
    pseudocount : float, default 1e-6
        Value added to every non-zero entry before taking the log. Zeros are
        masked first, so they never receive the pseudocount.

    Returns
    -------
    pandas.DataFrame
        Dense frame with the same index and columns as ``df``; entries that
        were zero (or missing) in the input are NaN.
    """
    # Densify only the columns that are actually sparse. The DataFrame-level
    # `.sparse` accessor raises AttributeError unless every column is sparse,
    # so dense and mixed inputs need their own paths.
    sparse_flags = [isinstance(dtype, pd.SparseDtype) for dtype in df.dtypes]
    if sparse_flags and all(sparse_flags):
        df = df.sparse.to_dense()
    elif any(sparse_flags):
        df = df.apply(
            lambda col: col.sparse.to_dense() if isinstance(col.dtype, pd.SparseDtype) else col
        )

    # Replace 0 with NaN so zeros are ignored by both the log and the mean
    df_masked = df.replace(0, np.nan)

    # Log of the non-zero entries (NaN stays NaN)
    log_df = np.log(df_masked + pseudocount)

    # Center each row on the mean of its own non-missing log values
    rclr_df = log_df.sub(log_df.mean(axis=1, skipna=True), axis=0)

    return rclr_df



# Apply the RCLR transform to the sample-filtered table `df`.
# NOTE(review): the next cell recomputes the same transform into `df_rclr`,
# and no visible cell reads `rclr_df` -- confirm whether this assignment is
# still needed.
rclr_df = rclr_transform(df)

In [200]:
# RCLR-transform the filtered table; entries that were zero in `df` come back
# as NaN because rclr_transform masks zeros before taking the log.
df_rclr = rclr_transform(df)
df_rclr

Unnamed: 0_level_0,g__Streptococcus,g__Staphylococcus,g__,g__Haemophilus_D_734546,g__Corynebacterium,g__Acinetobacter,g__Prevotella,g__Micrococcus,g__Dolosigranulum,g__Cutibacterium,...,g__Eubacterium_M,g__Lachnospira,g__Helcococcus,g__Enterenecus,g__Evtepia,g__Fimenecus,g__CAG-41,g__Selenomonas_B_42762,g__Facklamia_A_322620,g__Eubacterium_G
#sample-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ca009STPN,1.302853,,0.836764,,2.633718,,,,1.242229,,...,,,,,,,,,,
900223,-1.194695,,,,0.842187,,,,-1.600160,,...,,,,,,,,,,
Ca010EBPN,1.786005,-1.246541,1.972335,,1.374498,,-0.553394,,0.502659,0.734460,...,,,,,,,,,,
900462,3.839613,-0.597138,2.293233,2.293233,3.000174,,0.655625,0.096009,-0.191673,-1.290285,...,,,,,,,,,,
900053,4.044725,,,3.778096,,,,,-1.191716,-1.191716,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9003992,2.011916,0.187367,0.880514,,3.575142,,0.187367,,2.672274,-0.035776,...,,,,,,,,,,
900402,1.352545,2.612087,1.303754,,3.087146,1.352545,,0.947079,,0.253932,...,,,,,,,,,,
9004022,0.252154,,0.693986,3.762039,0.452824,,-0.558776,,-0.153311,,...,,,,,,,,,,
Ca006ONPN,-2.658437,-0.866678,-1.965290,1.575669,1.148225,,,,2.766512,,...,,,,,,,,,,


In [201]:
# Attach the sample-level metadata used downstream with a single
# index-on-index (inner) join instead of three separate merges.
meta_cols = ['o_scorad', 'group', 'area']
df_rclr = (
    df_rclr
    # Drop copies left over from a previous run of this cell so re-running it
    # does not produce suffixed '_x' / '_y' duplicates
    .drop(columns=meta_cols, errors='ignore')
    .merge(metadata[meta_cols], left_index=True, right_index=True)
)

# Convert 'o_scorad' to numeric (coerce errors to NaN). The merged column is
# converted directly rather than re-reading metadata['o_scorad'], which relied
# on implicit index alignment and fails when index labels are duplicated.
df_rclr['o_scorad'] = pd.to_numeric(df_rclr['o_scorad'], errors='coerce')

# Drop rows with missing SCORAD values
df_rclr = df_rclr.dropna(subset=['o_scorad'])
df_rclr

Unnamed: 0_level_0,g__Streptococcus,g__Staphylococcus,g__,g__Haemophilus_D_734546,g__Corynebacterium,g__Acinetobacter,g__Prevotella,g__Micrococcus,g__Dolosigranulum,g__Cutibacterium,...,g__Enterenecus,g__Evtepia,g__Fimenecus,g__CAG-41,g__Selenomonas_B_42762,g__Facklamia_A_322620,g__Eubacterium_G,o_scorad,group,area
#sample-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ca009STPN,1.302853,,0.836764,,2.633718,,,,1.242229,,...,,,,,,,,40.0,nares-AD,Cape Town
900223,-1.194695,,,,0.842187,,,,-1.600160,,...,,,,,,,,34.0,nares-AD,Umtata
Ca010EBPN,1.786005,-1.246541,1.972335,,1.374498,,-0.553394,,0.502659,0.734460,...,,,,,,,,21.0,nares-AD,Cape Town
900462,3.839613,-0.597138,2.293233,2.293233,3.000174,,0.655625,0.096009,-0.191673,-1.290285,...,,,,,,,,40.0,nares-AD,Umtata
900053,4.044725,,,3.778096,,,,,-1.191716,-1.191716,...,,,,,,,,41.0,nares-AD,Cape Town
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
900399,-1.158252,3.368956,1.582588,,0.546496,0.345825,,-1.158252,,2.425267,...,,,,,,,,54.0,nares-AD,Umtata
9003992,2.011916,0.187367,0.880514,,3.575142,,0.187367,,2.672274,-0.035776,...,,,,,,,,54.0,nares-AD,Umtata
9004022,0.252154,,0.693986,3.762039,0.452824,,-0.558776,,-0.153311,,...,,,,,,,,38.0,nares-AD,Umtata
Ca006ONPN,-2.658437,-0.866678,-1.965290,1.575669,1.148225,,,,2.766512,,...,,,,,,,,34.0,nares-AD,Cape Town


In [202]:
# Keep only the samples labelled 'nares-AD'.
# (Swap the label for 'skin-ADL' to run the same analysis on lesional skin.)
is_selected_group = df_rclr['group'].eq('nares-AD')
df_rclr = df_rclr.loc[is_selected_group]

df_rclr

Unnamed: 0_level_0,g__Streptococcus,g__Staphylococcus,g__,g__Haemophilus_D_734546,g__Corynebacterium,g__Acinetobacter,g__Prevotella,g__Micrococcus,g__Dolosigranulum,g__Cutibacterium,...,g__Enterenecus,g__Evtepia,g__Fimenecus,g__CAG-41,g__Selenomonas_B_42762,g__Facklamia_A_322620,g__Eubacterium_G,o_scorad,group,area
#sample-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ca009STPN,1.302853,,0.836764,,2.633718,,,,1.242229,,...,,,,,,,,40.0,nares-AD,Cape Town
900223,-1.194695,,,,0.842187,,,,-1.600160,,...,,,,,,,,34.0,nares-AD,Umtata
Ca010EBPN,1.786005,-1.246541,1.972335,,1.374498,,-0.553394,,0.502659,0.734460,...,,,,,,,,21.0,nares-AD,Cape Town
900462,3.839613,-0.597138,2.293233,2.293233,3.000174,,0.655625,0.096009,-0.191673,-1.290285,...,,,,,,,,40.0,nares-AD,Umtata
900053,4.044725,,,3.778096,,,,,-1.191716,-1.191716,...,,,,,,,,41.0,nares-AD,Cape Town
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
900399,-1.158252,3.368956,1.582588,,0.546496,0.345825,,-1.158252,,2.425267,...,,,,,,,,54.0,nares-AD,Umtata
9003992,2.011916,0.187367,0.880514,,3.575142,,0.187367,,2.672274,-0.035776,...,,,,,,,,54.0,nares-AD,Umtata
9004022,0.252154,,0.693986,3.762039,0.452824,,-0.558776,,-0.153311,,...,,,,,,,,38.0,nares-AD,Umtata
Ca006ONPN,-2.658437,-0.866678,-1.965290,1.575669,1.148225,,,,2.766512,,...,,,,,,,,34.0,nares-AD,Cape Town


In [203]:
# Example input: list of bacterial taxa column names
taxa_list = [' g__Staphylococcus', ' g__Streptococcus', ' g__Micrococcus', ' g__Veillonella_A', ' g__Haemophilus_D_734546']

# Set up color map
area_colors = {'Cape Town': '#1f77b4', 'Umtata': '#ff7f0e'}  # Blue and orange

# Set up the figure with multiple subplots
fig, axes = plt.subplots(1, len(taxa_list), figsize=(len(taxa_list) * 2.5, 4), sharey=True)

for i, taxon in enumerate(taxa_list):
    ax = axes[i] if len(taxa_list) > 1 else axes  # handle single-subplot case

    # Drop missing values
    df_plot = df_rclr[['o_scorad', taxon, 'area']].dropna()

    # Plot each area with a different color
    for area, color in area_colors.items():
        subset = df_plot[df_plot['area'] == area]
        ax.scatter(subset['o_scorad'], subset[taxon], label=area, color=color, alpha=0.5, s=20)

    # Fit and plot regression line across all points
    sns.regplot(
        data=df_plot,
        x='o_scorad',
        y=taxon,
        scatter=False,
        line_kws={'color': 'black'},
        ax=ax
    )

    # Compute Pearson correlation
    r, pval = pearsonr(df_plot['o_scorad'], df_plot[taxon])
    ax.set_title(taxon.strip(), fontsize=14)
    ax.set_xlabel('SCORAD')
    if i == 0:
        ax.set_ylabel('RCLR Relative Abundance')
    else:
        ax.set_ylabel('')
    ax.text(0.05, 0.95, f"Pearson r = {r:.2f}\np = {pval:.1e}", transform=ax.transAxes,
            fontsize=12, va='top', ha='left', bbox=dict(facecolor='white', alpha=0.7, edgecolor='none'))

# Shared title and legend
fig.suptitle('Correlation Between Skin Bacteria and AD Severity', y=0.96, fontsize=18)
axes[0].legend(title='Area', loc='lower right')
plt.tight_layout()
plt.savefig('bacteria_vs_scorad_by_area.png', dpi=300)
