In [458]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import biom
from biom import load_table
import numpy as np
from scipy.stats import spearmanr
from scipy.stats import mannwhitneyu
from statsmodels.stats.multitest import multipletests
import itertools
import matplotlib.patches
from scipy import stats
from scipy.stats import pearsonr
import os
import statsmodels.formula.api as smf
from statsmodels.formula.api import ols


In [459]:
# Load the metadata
metadata_path = '../Data/Metadata/updated_clean_ant_skin_metadata.tab'
metadata = pd.read_csv(metadata_path, sep='\t')

metadata['#sample-id'] = metadata['#sample-id'].str.replace('_', '')
# Set Sample-ID as the index for the metadata dataframe 
metadata = metadata.set_index('#sample-id')


# Create group column based on case_type to simplify group names
metadata['group'] = metadata['case_type'].map({
    'case-lesional skin': 'skin-ADL',
    'case-nonlesional skin': 'skin-ADNL', 
    'control-nonlesional skin': 'skin-H',
    'case-anterior nares': 'nares-AD',
    'control-anterior nares': 'nares-H'
})

metadata

Unnamed: 0_level_0,PlateNumber,PlateLocation,i5,i5Sequence,i7,i7Sequence,identifier,Sequence,Plate ID,Well location,...,sex,enrolment_date,enrolment_season,hiv_exposure,hiv_status,household_size,o_scorad,FWD_filepath,REV_filepath,group
#sample-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ca009STL,1,A1,SA501,ATCGTACG,SA701,CGAGAGTT,SA701SA501,CGAGAGTT-ATCGTACG,1.010000e+21,A1,...,male,4/16/2015,Autumn,Unexposed,negative,4.0,40,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL
900221,1,B1,SA502,ACTATCTG,SA701,CGAGAGTT,SA701SA502,CGAGAGTT-ACTATCTG,1.010000e+21,B1,...,female,8/11/2015,Winter,Unexposed,negative,7.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL
Ca010EBL,1,C1,SA503,TAGCGAGT,SA701,CGAGAGTT,SA701SA503,CGAGAGTT-TAGCGAGT,1.010000e+21,C1,...,female,11/20/2014,Spring,Unexposed,negative,7.0,21,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL
900460,1,D1,SA504,CTGCGTGT,SA701,CGAGAGTT,SA701SA504,CGAGAGTT-CTGCGTGT,1.010000e+21,D1,...,female,9/23/2015,Spring,Unexposed,,4.0,40,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL
900051,1,E1,SA505,TCATCGAG,SA701,CGAGAGTT,SA701SA505,CGAGAGTT-TCATCGAG,1.010000e+21,E1,...,male,4/21/2015,Autumn,Unexposed,negative,7.0,41,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Ca006ONL2,6,H1,SA508,GACACCGT,SB701,CTCGACTT,SB701SA508,CTCGACTT-GACACCGT,1.010000e+21,H1,...,female,3/25/2015,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL
Ca006ONNL,6,F2,SA506,CGTGAGTG,SB702,CGAAGTAT,SB702SA506,CGAAGTAT-CGTGAGTG,1.010000e+21,F2,...,female,3/25/2015,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADNL
Ca006ONNL2,6,H2,SA508,GACACCGT,SB702,CGAAGTAT,SB702SA508,CGAAGTAT-GACACCGT,1.010000e+21,H2,...,female,3/25/2015,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADNL
Ca006ONPN,6,F3,SA506,CGTGAGTG,SB703,TAGCAGCT,SB703SA506,TAGCAGCT-CGTGAGTG,1.010000e+21,F3,...,female,3/25/2015,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,nares-AD


In [460]:
# Read in table at ASV level
biom_path = '../Data/Tables/Absolute_Abundance_Tables/209766_filtered_by_prevalence_1pct_rare_Genus.biom'
# biom_path = '../Data/Tables/Absolute_Abundance_Tables/209766_filtered_by_prevalence_1pct_rare_Genus-ASV-non-collapse.biom'
biom_tbl = load_table(biom_path)
df = pd.DataFrame(biom_tbl.to_dataframe().T)

# delete the prefix from the index
df.index = df.index.str.replace('15564.', '')

df

Unnamed: 0,g__Streptococcus,g__Staphylococcus,g__,g__Haemophilus_D_734546,g__Corynebacterium,g__Acinetobacter,g__Prevotella,g__Micrococcus,g__Dolosigranulum,g__Cutibacterium,...,g__Eubacterium_M,g__Lachnospira,g__Helcococcus,g__Enterenecus,g__Evtepia,g__Fimenecus,g__CAG-41,g__Selenomonas_B_42762,g__Facklamia_A_322620,g__Eubacterium_G
900344,303.0,0,3.0,0,42.0,0,0,0,0,2.0,...,0,0,0,0,0,0,0,0,0,0
900459,57.0,9.0,2.0,131.0,0,0,5.0,1.0,133.0,0,...,0,0,0,0,0,0,0,0,0,0
900221,0,27.0,0,0,2.0,3.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
900570,39.0,86.0,53.0,1.0,9.0,12.0,65.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
900092,337.0,0,3.0,0,4.0,0,0,0,1.0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9003972,78.0,52.0,16.0,4.0,7.0,4.0,12.0,3.0,0,39.0,...,0,0,0,0,0,0,0,0,0,0
900097,3.0,52.0,86.0,0,3.0,38.0,38.0,18.0,0,7.0,...,0,0,0,0,0,0,0,0,0,0
900498,9.0,22.0,43.0,0,6.0,19.0,0,15.0,0,24.0,...,0,0,0,0,0,0,0,0,0,0
900276,0,3.0,22.0,0,13.0,6.0,22.0,0,0,114.0,...,0,0,0,0,0,0,0,0,0,0


In [461]:
def rclr_transform(df, pseudocount=1e-6):
    """
    Applies Robust Centered Log-Ratio (RCLR) transformation to a DataFrame.
    Zeros are ignored in the geometric mean calculation per sample.
    """
    # Convert sparse to dense if needed
    df = df.sparse.to_dense()

    # Replace 0 with np.nan to ignore in log and mean
    df_masked = df.replace(0, np.nan)

    # Apply log (with pseudocount only where needed)
    log_df = np.log(df_masked + pseudocount)

    # Subtract mean of each row (feature)
    rclr_df = log_df.sub(log_df.mean(axis=1, skipna=True), axis=0)

    return rclr_df

In [462]:
rclr_df = rclr_transform(df)
rclr_df

Unnamed: 0,g__Streptococcus,g__Staphylococcus,g__,g__Haemophilus_D_734546,g__Corynebacterium,g__Acinetobacter,g__Prevotella,g__Micrococcus,g__Dolosigranulum,g__Cutibacterium,...,g__Eubacterium_M,g__Lachnospira,g__Helcococcus,g__Enterenecus,g__Evtepia,g__Fimenecus,g__CAG-41,g__Selenomonas_B_42762,g__Facklamia_A_322620,g__Eubacterium_G
900344,2.902942,,-1.712178,,0.926879,,,,,-2.117643,...,,,,,,,,,,
900459,2.557814,0.711987,-0.792090,3.389960,,,0.124201,-1.485236,3.405112,,...,,,,,,,,,,
900221,,1.126537,,,-1.476152,-1.070687,,,,,...,,,,,,,,,,
900570,2.535851,3.326637,2.842581,-1.127710,1.069514,1.357196,3.046676,,,,...,,,,,,,,,,
900092,4.204868,,-0.516602,,-0.228920,,,,-1.615213,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9003972,3.143951,2.738485,1.559830,0.173536,0.733152,0.173536,1.272148,-0.114146,,2.450803,...,,,,,,,,,,
900097,-0.696746,2.155885,2.658989,,-0.696746,1.842228,1.842228,1.095013,,0.150552,...,,,,,,,,,,
900498,0.164330,1.058148,1.728305,,-0.241135,0.911544,,0.675155,,1.145159,...,,,,,,,,,,
900276,,-0.574105,1.418325,,0.892232,0.119042,1.418325,,,3.063481,...,,,,,,,,,,


In [463]:
# add o_scorad column
rclr_df = rclr_df.merge(metadata[['o_scorad']], left_index=True, right_index=True)

# add pid column
rclr_df = rclr_df.merge(metadata[['pid']], left_index=True, right_index=True)

# add area column
rclr_df = rclr_df.merge(metadata[['area']], left_index=True, right_index=True)

# Convert 'o_scorad' to numeric (coerce errors to NaN)
rclr_df['o_scorad'] = pd.to_numeric(metadata['o_scorad'], errors='coerce')

# Drop rows with missing SCORAD values
rclr_df = rclr_df.dropna(subset=['o_scorad'])

# Filter for only Umtata samples (those with 'UM' in pid)
rclr_df = rclr_df[rclr_df['area'].str.contains('Umtata', na=False)]

rclr_df

Unnamed: 0,g__Streptococcus,g__Staphylococcus,g__,g__Haemophilus_D_734546,g__Corynebacterium,g__Acinetobacter,g__Prevotella,g__Micrococcus,g__Dolosigranulum,g__Cutibacterium,...,g__Enterenecus,g__Evtepia,g__Fimenecus,g__CAG-41,g__Selenomonas_B_42762,g__Facklamia_A_322620,g__Eubacterium_G,o_scorad,pid,area
900459,2.557814,0.711987,-0.792090,3.389960,,,0.124201,-1.485236,3.405112,,...,,,,,,,,44.0,Ca-145-LM,Umtata
900221,,1.126537,,,-1.476152,-1.070687,,,,,...,,,,,,,,34.0,Ca-101-ID,Umtata
900466,2.883453,,,,-2.246445,,,,-0.637008,,...,,,,,,,,43.0,Ca-147-LJ,Umtata
900301,-1.458702,3.682961,-1.458702,,-0.765556,,,,,,...,,,,,,,,42.0,Ca-112-AM,Umtata
900423,,1.932297,1.599857,,-1.969675,0.863538,,-0.097874,,1.050749,...,,,,,,,,32.0,Ca-133-LT,Umtata
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
900484,2.550841,,,0.547112,-0.551500,,,,0.489953,,...,,,,,,,,42.0,Ca-153-MN,Umtata
900294,-0.083496,-1.113116,2.352620,,-1.336259,-1.623941,,,,1.852157,...,,,,,,,,23.0,Ca-113-MN,Umtata
9003972,3.143951,2.738485,1.559830,0.173536,0.733152,0.173536,1.272148,-0.114146,,2.450803,...,,,,,,,,54.0,Ca-125-IM,Umtata
900276,,-0.574105,1.418325,,0.892232,0.119042,1.418325,,,3.063481,...,,,,,,,,52.0,Ca-111-IN,Umtata


In [464]:
# Model: does Staph predict Strep after adjusting for severity?
# Subset and clean data
model_df = rclr_df[[' g__Streptococcus', ' g__Staphylococcus', 'o_scorad', 'pid']].dropna()
# Remove leading spaces in column names
model_df.columns = [col.strip() for col in model_df.columns]

model_df

Unnamed: 0,g__Streptococcus,g__Staphylococcus,o_scorad,pid
900459,2.557814,0.711987,44.0,Ca-145-LM
900301,-1.458702,3.682961,42.0,Ca-112-AM
900422,4.338956,-0.091860,43.0,Ca-132-LD
900329,0.917941,4.577833,39.0,Ca-119-AM
900456,3.122239,1.634162,53.0,Ca-144-HN
...,...,...,...,...
900434,3.298513,1.610432,39.0,Ca-138-SM
900501,3.786917,1.453241,54.0,Ca-158-LC
900304,4.736879,-1.034561,33.0,Ca-115-OM
900294,-0.083496,-1.113116,23.0,Ca-113-MN


In [465]:
model = smf.mixedlm("Q('g__Streptococcus') ~ Q('g__Staphylococcus') + o_scorad + pid",  # DependentVariable ~ Predictor1 + Predictor2
                    data=model_df, 
                    groups=model_df["pid"])
result = model.fit()
print(result.summary())

               Mixed Linear Model Regression Results
Model:            MixedLM Dependent Variable: Q('g__Streptococcus')
No. Observations: 118     Method:             REML                 
No. Groups:       55      Scale:              2.3604               
Min. group size:  1       Log-Likelihood:     -120.3721            
Max. group size:  6       Converged:          No                   
Mean group size:  2.1                                              
-------------------------------------------------------------------
                         Coef.  Std.Err.   z    P>|z| [0.025 0.975]
-------------------------------------------------------------------
Intercept               -16.769                                    
pid[T.Ca-103-BT]          1.211    3.080  0.393 0.694 -4.825  7.248
pid[T.Ca-104-SB]          9.410                                    
pid[T.Ca-105-LX]         10.298                                    
pid[T.Ca-106-NM]         -1.572                                

  sdf[0:self.k_fe, 1] = np.sqrt(np.diag(self.cov_params()[0:self.k_fe]))


In [466]:
# Fit mixed models to get residuals adjusted for SCORAD and pid
resid_model_strep = smf.mixedlm("Q('g__Streptococcus') ~ o_scorad", data=model_df, groups=model_df['pid']).fit()
resid_model_staph = smf.mixedlm("Q('g__Staphylococcus') ~ o_scorad", data=model_df, groups=model_df['pid']).fit()

# Store residuals
model_df['strep_resid'] = resid_model_strep.resid
model_df['staph_resid'] = resid_model_staph.resid

# Plot residuals
plt.figure(figsize=(6, 4.5))

sns.regplot(
    data=model_df,
    x='staph_resid',
    y='strep_resid',
    scatter_kws={"alpha": 0.6},
    line_kws={"color": "black"}
)

# Correlation for annotation
r, p = pearsonr(model_df['staph_resid'], model_df['strep_resid'])
plt.text(0.97, 0.97, f'Pearson r = {r:.2f}\np = {p:.3e}',
         transform=plt.gca().transAxes,
         verticalalignment='top',
         horizontalalignment='right',
         fontsize=13)

# Labeling
plt.xlabel('Staph (RCLR residuals, adj. SCORAD + subject)', fontsize=13)
plt.ylabel('Strep (RCLR residuals, adj. SCORAD + subject)', fontsize=13)
plt.suptitle('Linear Regression', fontsize=18, x=0.55, y=0.96)
plt.title('Strep ~ Staph | adjusted for SCORAD and subject', fontsize=15)
plt.tight_layout()
plt.xticks(fontsize=13)
plt.yticks(fontsize=13)
plt.xlabel('Staph (RCLR residuals)', fontsize=13)
plt.ylabel('Strep (RCLR residuals)', fontsize=13)

# Save figure
plt.savefig("../Plots/Analysis_figures/Severity_Correlations/Staph_Strep_effect_model_adjusted-Umtata.png", dpi=600)


