## Scatterplot correlations of differential taxa from the Umtata cohort against severity

In [29]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import biom
from biom import load_table
import numpy as np
from scipy.stats import spearmanr
from scipy.stats import mannwhitneyu
from statsmodels.stats.multitest import multipletests
import itertools
import matplotlib.patches
from scipy.stats import pearsonr



In [30]:
def rclr_transform(df, pseudocount=1e-6):
    """
    Apply a Robust Centered Log-Ratio (RCLR) transformation row by row.

    Each row is treated as one composition. Zeros are masked to NaN, the
    remaining values are log-transformed, and the mean of the row's non-NaN
    log values (the log of the geometric mean of its non-zero entries) is
    subtracted from every entry in that row.

    Parameters
    ----------
    df : pandas.DataFrame
        Abundance table. May be dense, or have every column stored with a
        pandas sparse dtype (in which case it is densified first).
    pseudocount : float, default 1e-6
        Added to every non-zero entry before taking the log. Zeros are masked
        beforehand, so they remain NaN regardless of this value.

    Returns
    -------
    pandas.DataFrame
        Same index and columns as ``df``; entries that were zero are NaN.
    """
    # The `.sparse` accessor raises AttributeError on a dense frame, so only
    # densify when every column really is sparse.
    column_is_sparse = [isinstance(dtype, pd.SparseDtype) for dtype in df.dtypes]
    if column_is_sparse and all(column_is_sparse):
        df = df.sparse.to_dense()

    # Mask zeros so they are excluded from both the log and the row mean
    df_masked = df.replace(0, np.nan)

    # Log of the non-zero values (the pseudocount is added to all of them)
    log_df = np.log(df_masked + pseudocount)

    # Center each row on the mean of its own non-NaN log values
    rclr_df = log_df.sub(log_df.mean(axis=1, skipna=True), axis=0)

    return rclr_df

In [31]:
# Location of the tab-separated sample metadata
metadata_path = '../Data/Metadata/updated_clean_ant_skin_metadata.tab'

# Short group label for each case_type value; values not listed here map to NaN
group_by_case_type = {
    'case-lesional skin': 'skin-ADL',
    'case-nonlesional skin': 'skin-ADNL',
    'control-nonlesional skin': 'skin-H',
    'case-anterior nares': 'nares-AD',
    'control-anterior nares': 'nares-H',
}

metadata = (
    pd.read_csv(metadata_path, sep='\t')
    # Strip underscores from the sample IDs before they become the index
    .assign(**{'#sample-id': lambda tbl: tbl['#sample-id'].str.replace('_', '')})
    .set_index('#sample-id')
)

# Simplified group name derived from case_type
metadata['group'] = metadata['case_type'].map(group_by_case_type)

metadata

Unnamed: 0_level_0,PlateNumber,PlateLocation,i5,i5Sequence,i7,i7Sequence,identifier,Sequence,Plate ID,Well location,...,sex,enrolment_date,enrolment_season,hiv_exposure,hiv_status,household_size,o_scorad,FWD_filepath,REV_filepath,group
#sample-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ca009STL,1,A1,SA501,ATCGTACG,SA701,CGAGAGTT,SA701SA501,CGAGAGTT-ATCGTACG,1.010000e+21,A1,...,male,4/16/2015,Autumn,Unexposed,negative,4.0,40,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL
900221,1,B1,SA502,ACTATCTG,SA701,CGAGAGTT,SA701SA502,CGAGAGTT-ACTATCTG,1.010000e+21,B1,...,female,8/11/2015,Winter,Unexposed,negative,7.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL
Ca010EBL,1,C1,SA503,TAGCGAGT,SA701,CGAGAGTT,SA701SA503,CGAGAGTT-TAGCGAGT,1.010000e+21,C1,...,female,11/20/2014,Spring,Unexposed,negative,7.0,21,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL
900460,1,D1,SA504,CTGCGTGT,SA701,CGAGAGTT,SA701SA504,CGAGAGTT-CTGCGTGT,1.010000e+21,D1,...,female,9/23/2015,Spring,Unexposed,,4.0,40,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL
900051,1,E1,SA505,TCATCGAG,SA701,CGAGAGTT,SA701SA505,CGAGAGTT-TCATCGAG,1.010000e+21,E1,...,male,4/21/2015,Autumn,Unexposed,negative,7.0,41,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Ca006ONL2,6,H1,SA508,GACACCGT,SB701,CTCGACTT,SB701SA508,CTCGACTT-GACACCGT,1.010000e+21,H1,...,female,3/25/2015,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL
Ca006ONNL,6,F2,SA506,CGTGAGTG,SB702,CGAAGTAT,SB702SA506,CGAAGTAT-CGTGAGTG,1.010000e+21,F2,...,female,3/25/2015,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADNL
Ca006ONNL2,6,H2,SA508,GACACCGT,SB702,CGAAGTAT,SB702SA508,CGAAGTAT-GACACCGT,1.010000e+21,H2,...,female,3/25/2015,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADNL
Ca006ONPN,6,F3,SA506,CGTGAGTG,SB703,TAGCAGCT,SB703SA506,TAGCAGCT-CGTGAGTG,1.010000e+21,F3,...,female,3/25/2015,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,nares-AD


In [32]:
# Read in the genus-level table (the file name ends in "_Genus.biom")
biom_path = '../Data/Tables/Absolute_Abundance_Tables/209766_filtered_by_prevalence_1pct_rare_Genus.biom'

biom_tbl = load_table(biom_path)
# Transpose so that rows are samples and columns are taxa
df = biom_tbl.to_dataframe().T

# Delete the '15564.' prefix from the sample IDs. regex=False keeps the '.'
# literal; the default of `regex` differs between pandas versions.
df.index = df.index.str.replace('15564.', '', regex=False)

# Keep samples that are BOTH skin samples AND from an area starting with
# 'Umtata'. The two conditions are combined into one mask so that the area
# filter does not replace the skin filter. na=False treats rows with a
# missing group/area as non-matching instead of breaking the boolean mask.
is_skin = metadata['group'].str.startswith('skin', na=False)
is_umtata = metadata['area'].str.startswith('Umtata', na=False)
skin_samples = metadata[is_skin & is_umtata].index

# Filter df to keep only the selected skin samples
df = df.loc[skin_samples]

# RCLR-transform the abundances (zeros become NaN)
df = rclr_transform(df)

df

Unnamed: 0_level_0,g__Streptococcus,g__Staphylococcus,g__,g__Haemophilus_D_734546,g__Corynebacterium,g__Acinetobacter,g__Prevotella,g__Micrococcus,g__Dolosigranulum,g__Cutibacterium,...,g__Eubacterium_M,g__Lachnospira,g__Helcococcus,g__Enterenecus,g__Evtepia,g__Fimenecus,g__CAG-41,g__Selenomonas_B_42762,g__Facklamia_A_322620,g__Eubacterium_G
#sample-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
900221,,1.126537,,,-1.476152,-1.070687,,,,,...,,,,,,,,,,
900460,3.510481,1.965581,2.411868,-0.401542,,-0.807007,1.832050,-0.113860,,1.064795,...,,,,,,,,,,
900226,-0.019947,-0.425412,4.293086,0.267735,-0.019947,-0.019947,-1.118559,-1.118559,,0.267735,...,,,,,,,,,,
900229,1.789438,,2.284759,,-0.487829,2.750849,0.610783,,,-0.082364,...,,,,,,,,,,
900222,-1.827729,1.504474,3.182905,,1.216792,,,1.468107,,-0.441436,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9004012,0.517561,4.022689,1.210708,,2.730534,,-0.329737,-0.735202,,0.874236,...,,,,,,,,,,
900402,1.352545,2.612087,1.303754,,3.087146,1.352545,,0.947079,,0.253932,...,,,,,,,,,,
9004022,0.252154,,0.693986,3.762039,0.452824,,-0.558776,,-0.153311,,...,,,,,,,,,,
900403,2.789983,3.905690,1.987636,-0.171848,-0.171848,0.338978,0.926765,1.368597,,0.808982,...,,,,,,,,,,


In [33]:
# Attach the SCORAD score from the metadata, aligned on the sample-ID index.
# Entries that cannot be parsed as numbers are coerced to NaN.
scorad_numeric = pd.to_numeric(metadata['o_scorad'], errors='coerce')

# Add the score as a column and keep only samples that have one
df = df.assign(o_scorad=scorad_numeric).dropna(subset=['o_scorad'])

In [34]:
# Set up figure
taxa_list = [' g__Streptococcus', ' g__Staphylococcus']


missing = [t for t in taxa_list if t not in df.columns]
print("Missing taxa:", missing)

Missing taxa: []


In [36]:
# One panel per taxon with a shared y-axis so panels are directly comparable.
# With sharey=True, plt.subplots already hides the y tick labels on every
# panel except the first column.
fig, axes = plt.subplots(1, len(taxa_list), figsize=(len(taxa_list) * 2, 3.5), sharey=True)

# plt.subplots returns a bare Axes (not an array) when there is a single panel
if len(taxa_list) == 1:
    axes = [axes]

for i, taxon in enumerate(taxa_list):
    print(f"Processing: {taxon}")

    if taxon not in df.columns:
        print(f"⚠️ Taxon not found: {taxon}")
        continue  # Skip this taxon to avoid a KeyError; its panel stays empty

    # Samples with both a SCORAD value and a (non-NaN) transformed abundance
    df_sub = df[['o_scorad', taxon]].dropna()

    # pearsonr raises when given fewer than two observations
    if len(df_sub) < 2:
        print(f"⚠️ Not enough data to correlate: {taxon}")
        continue

    ax = axes[i]

    # Scatter plus fitted regression line
    sns.regplot(
        data=df_sub,
        x='o_scorad',
        y=taxon,
        scatter_kws={'alpha': 0.5, 's': 20},
        line_kws={'color': 'black'},
        ax=ax
    )

    # Pearson correlation, annotated in the top-left corner of the panel
    r, pval = pearsonr(df_sub['o_scorad'], df_sub[taxon])
    ax.text(
        0.05, 0.95,
        f"r = {r:.2f}\np = {pval:.2e}",
        transform=ax.transAxes,
        ha='left', va='top',
        bbox=dict(facecolor='white', alpha=0.7, edgecolor='none')
    )

    # Axis formatting; strip() removes the leading space in the taxon name
    ax.set_title(taxon.replace('g__', '').strip(), fontsize=12)
    ax.set_xlabel("SCORAD Severity", fontsize=12)
    if i == 0:
        ax.set_ylabel("RCLR-transformed abundance", fontsize=12)
    else:
        # Only clear the axis label here. Calling set_yticklabels([]) on a
        # shared y-axis replaces the formatter used by every panel, which
        # would also blank the tick labels of the first panel.
        ax.set_ylabel("")

    ax.set_ylim(-2, 6)
    # Stop value 7 so that the tick at the upper limit (6) is included
    ax.set_yticks(np.arange(-2, 7, 1))

# Final layout adjustments: leave room at the top for the figure title
fig.tight_layout(rect=[0, 0, 1, 0.92])
fig.suptitle("Correlation Between Skin Taxa of Umtata Samples and AD Severity", fontsize=14, y=0.98)

# Save figure
fig.savefig('../Plots/Analysis_figures/Severity_Correlations/rclr_abundance_vs_severity_skin_x.png', dpi=600)


Processing:  g__Streptococcus
Processing:  g__Staphylococcus
