## Scatterplot correlations of differential taxa from the Umtata cohort against severity

In [22]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import biom
from biom import load_table
import numpy as np
from scipy.stats import spearmanr
from scipy.stats import mannwhitneyu
from statsmodels.stats.multitest import multipletests
import itertools
import matplotlib.patches
from scipy.stats import pearsonr



In [23]:
# Read the tab-separated sample metadata and prepare it for joining with the feature table
metadata_path = '../Data/Metadata/updated_clean_ant_skin_metadata.tab'
metadata = (
    pd.read_csv(metadata_path, sep='\t')
    # Remove underscores from the sample IDs ...
    .assign(**{'#sample-id': lambda tbl: tbl['#sample-id'].str.replace('_', '')})
    # ... and use the cleaned IDs as the row index
    .set_index('#sample-id')
    # Add a 'group' column holding a short label for each case_type value
    .assign(
        group=lambda tbl: tbl['case_type'].map({
            'case-lesional skin': 'skin-ADL',
            'case-nonlesional skin': 'skin-ADNL',
            'control-nonlesional skin': 'skin-H',
            'case-anterior nares': 'nares-AD',
            'control-anterior nares': 'nares-H'
        })
    )
)

metadata

Unnamed: 0_level_0,PlateNumber,PlateLocation,i5,i5Sequence,i7,i7Sequence,identifier,Sequence,Plate ID,Well location,...,sex,enrolment_date,enrolment_season,hiv_exposure,hiv_status,household_size,o_scorad,FWD_filepath,REV_filepath,group
#sample-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ca009STL,1,A1,SA501,ATCGTACG,SA701,CGAGAGTT,SA701SA501,CGAGAGTT-ATCGTACG,1.010000e+21,A1,...,male,4/16/2015,Autumn,Unexposed,negative,4.0,40,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL
900221,1,B1,SA502,ACTATCTG,SA701,CGAGAGTT,SA701SA502,CGAGAGTT-ACTATCTG,1.010000e+21,B1,...,female,8/11/2015,Winter,Unexposed,negative,7.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL
Ca010EBL,1,C1,SA503,TAGCGAGT,SA701,CGAGAGTT,SA701SA503,CGAGAGTT-TAGCGAGT,1.010000e+21,C1,...,female,11/20/2014,Spring,Unexposed,negative,7.0,21,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL
900460,1,D1,SA504,CTGCGTGT,SA701,CGAGAGTT,SA701SA504,CGAGAGTT-CTGCGTGT,1.010000e+21,D1,...,female,9/23/2015,Spring,Unexposed,,4.0,40,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL
900051,1,E1,SA505,TCATCGAG,SA701,CGAGAGTT,SA701SA505,CGAGAGTT-TCATCGAG,1.010000e+21,E1,...,male,4/21/2015,Autumn,Unexposed,negative,7.0,41,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Ca006ONL2,6,H1,SA508,GACACCGT,SB701,CTCGACTT,SB701SA508,CTCGACTT-GACACCGT,1.010000e+21,H1,...,female,3/25/2015,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL
Ca006ONNL,6,F2,SA506,CGTGAGTG,SB702,CGAAGTAT,SB702SA506,CGAAGTAT-CGTGAGTG,1.010000e+21,F2,...,female,3/25/2015,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADNL
Ca006ONNL2,6,H2,SA508,GACACCGT,SB702,CGAAGTAT,SB702SA508,CGAAGTAT-GACACCGT,1.010000e+21,H2,...,female,3/25/2015,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADNL
Ca006ONPN,6,F3,SA506,CGTGAGTG,SB703,TAGCAGCT,SB703SA506,TAGCAGCT-CGTGAGTG,1.010000e+21,F3,...,female,3/25/2015,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,nares-AD


In [24]:
# Read in table at ASV level
# biom_path = '../Data/Tables/Absolute_Abundance_Tables/feature_table_with_tax_labels_Genus.biom'
biom_path = '../Data/Tables/Absolute_Abundance_Tables/209766_filtered_by_prevalence_1pct_rare_Genus-ASV-non-collapse.biom'

biom_tbl = load_table(biom_path)
# Transpose so that rows are samples and columns are ASVs
df = pd.DataFrame(biom_tbl.to_dataframe().T)

# Delete the '15564.' prefix from the sample IDs so they match the metadata index.
# regex=False keeps the '.' literal regardless of the pandas version's default.
df.index = df.index.str.replace('15564.', '', regex=False)

# Select samples that are BOTH skin samples AND from the Umtata area.
# (Previously the skin filter was overwritten by the area filter, so nares
# samples from Umtata were kept as well.)
# na=False treats rows with a missing group/area as non-matching.
is_skin = metadata['group'].str.startswith('skin', na=False)
is_umtata = metadata['area'].str.startswith('Umtata', na=False)
skin_samples = metadata[is_skin & is_umtata].index

# Filter df to keep only the selected skin samples
# (raises KeyError if a selected sample is absent from the feature table)
df = df.loc[skin_samples]

# add o_scorad column
# df = df.merge(metadata[['o_scorad']], left_index=True, right_index=True)

df

Unnamed: 0_level_0,g__Streptococcus_ASV-1,g__Streptococcus_ASV-2,g__Corynebacterium_ASV-1,g__Corynebacterium_ASV-4,g___ASV-5,g__Cutibacterium_ASV-1,g___ASV-33,g___ASV-30,g___ASV-22,g__Cutibacterium_ASV-2,...,g__Copromonas_ASV-3,g__Microvirga_ASV-1,g__Leptotrichia_A_993758_ASV-17,g___ASV-103,g__Leptotrichia_A_993758_ASV-13,g__Capnocytophaga_820688_ASV-8,g__UBA6175_ASV-2,g__Comamonas_F_589250_ASV-3,g__Bosea_ASV-2,g___ASV-145
#sample-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
900221,22.0,0,0,0,0,16.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
900460,722.0,493.0,0,0,200.0,40.0,0,0,50.0,28.0,...,0,0,0,0,0,0,0,0,0,0
900226,98.0,0,0,0,2.0,226.0,0,0,0,31.0,...,0,0,0,0,0,0,0,0,0,0
900229,153.0,0,0,0,4.0,4.0,16.0,3.0,0,0,...,0,0,0,0,0,0,0,0,0,0
900222,7.0,4.0,0,0,0,11.0,0,64.0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9004012,0,0,0,0,0,131.0,0,0,0,14.0,...,0,0,0,0,0,0,0,0,0,0
900402,156.0,0,0,0,0,32.0,0,0,0,19.0,...,0,0,0,0,0,0,0,0,0,0
9004022,126.0,0,241.0,0,235.0,14.0,0,0,4.0,7.0,...,0,0,0,0,0,0,0,0,0,0
900403,187.0,0,0,0,4.0,33.0,0,0,0,5.0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
# If this cell is re-run, df already carries 'o_scorad' from the previous run.
# Drop it first so it is not included in the row sums and the merge below does
# not produce 'o_scorad_x' / 'o_scorad_y' columns.
df = df.drop(columns=['o_scorad'], errors='ignore')

# Convert to relative abundance by dividing each value by the row sum
df = df.div(df.sum(axis=1), axis=0)

# Attach each sample's SCORAD score (inner join on the sample-id index)
df = df.merge(metadata[['o_scorad']], left_index=True, right_index=True)

# Convert 'o_scorad' to numeric (coerce errors to NaN).
# Use the column that was just merged into df rather than re-reading it from
# metadata, which would depend on implicit index alignment.
df['o_scorad'] = pd.to_numeric(df['o_scorad'], errors='coerce')

# Drop rows with missing SCORAD values
df = df.dropna(subset=['o_scorad'])

df

Unnamed: 0_level_0,g__Streptococcus_ASV-1,g__Streptococcus_ASV-2,g__Corynebacterium_ASV-1,g__Corynebacterium_ASV-4,g___ASV-5,g__Cutibacterium_ASV-1,g___ASV-33,g___ASV-30,g___ASV-22,g__Cutibacterium_ASV-2,...,g__Microvirga_ASV-1,g__Leptotrichia_A_993758_ASV-17,g___ASV-103,g__Leptotrichia_A_993758_ASV-13,g__Capnocytophaga_820688_ASV-8,g__UBA6175_ASV-2,g__Comamonas_F_589250_ASV-3,g__Bosea_ASV-2,g___ASV-145,o_scorad
#sample-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
900221,0.000745,0.0,0.0,0.0,0.0,0.000542,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,34.0
900460,0.18861,0.128788,0.0,0.0,0.052247,0.010449,0.0,0.0,0.013062,0.007315,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0
900226,0.004087,0.0,0.0,0.0,0.000083,0.009424,0.0,0.0,0.0,0.001293,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,34.0
900229,0.087982,0.0,0.0,0.0,0.0023,0.0023,0.009201,0.001725,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23.0
900222,0.00406,0.00232,0.0,0.0,0.0,0.006381,0.0,0.037123,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,34.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
900401,0.202952,0.128383,0.00738,0.006458,0.0,0.034748,0.0,0.0,0.0,0.018758,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,38.0
9004012,0.0,0.0,0.0,0.0,0.0,0.027354,0.0,0.0,0.0,0.002923,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,38.0
9004022,0.01787,0.0,0.03418,0.0,0.033329,0.001986,0.0,0.0,0.000567,0.000993,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,38.0
900403,0.102242,0.0,0.0,0.0,0.002187,0.018043,0.0,0.0,0.0,0.002734,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,78.0


In [26]:
# List every column of df so taxon names can be copied from the output
print(list(df.columns))


['g__Streptococcus_ASV-1', 'g__Streptococcus_ASV-2', 'g__Corynebacterium_ASV-1', 'g__Corynebacterium_ASV-4', 'g___ASV-5', 'g__Cutibacterium_ASV-1', 'g___ASV-33', 'g___ASV-30', 'g___ASV-22', 'g__Cutibacterium_ASV-2', 'g__Bifidobacterium_388775_ASV-4', 'g__Haemophilus_D_734546_ASV-1', 'g__Dolosigranulum_ASV-1', 'g__Dolosigranulum_ASV-2', 'g__Haemophilus_D_734546_ASV-2', 'g__Staphylococcus_ASV-2', 'g__Staphylococcus_ASV-1', 'g__Micrococcus_ASV-2', 'g__Lactococcus_A_346120_ASV-1', 'g__Limosilactobacillus_ASV-1', 'g__Prevotella_ASV-1', 'g__Corynebacterium_ASV-12', 'g__Prevotella_ASV-2', 'g__Fusobacterium_C_ASV-2', 'g__Neisseria_563205_ASV-3', 'g__Arthrobacter_D_ASV-1', 'g__Gemella_ASV-2', 'g___ASV-59', 'g__Leptotrichia_A_993758_ASV-3', 'g__Porphyromonas_A_859423_ASV-2', 'g__Fusobacterium_C_ASV-4', 'g___ASV-3', 'g___ASV-68', 'g__Brachybacterium_ASV-2', 'g__Neisseria_563205_ASV-4', 'g__Veillonella_A_ASV-2', 'g__Sphingobacterium_ASV-5', 'g__Jeotgalicoccus_A_310962_ASV-2', 'g__Rothia_ASV-6', 'g

In [27]:
# Taxa to plot against SCORAD severity.
# Longer alternative selection, kept for reference:
# taxa_list = ['g__Streptococcus_ASV-1', 'g__Streptococcus_ASV-2', 'g__Staphylococcus_ASV-1', 'g__Micrococcus_ASV-1', 'g__Veillonella_A_ASV-1', 'g__Acinetobacter_ASV-1']
taxa_list = [
    'g__Staphylococcus_ASV-1',
    'g__Micrococcus_ASV-1',
    'g__Veillonella_A_ASV-1',
]

# Report any requested taxon that has no matching column in df
missing = list(filter(lambda name: name not in df.columns, taxa_list))
print("Missing taxa:", missing)

Missing taxa: []


In [28]:
# One regression panel per taxon. sharey=True puts all panels on a common
# y-scale and already hides the y tick labels of the inner panels.
# squeeze=False always returns a 2-D array of Axes, even for a single taxon.
fig, axes = plt.subplots(
    1, len(taxa_list),
    figsize=(len(taxa_list) * 2, 3.5),
    sharey=True,
    squeeze=False,
)
axes = axes.ravel()

for ax, taxon in zip(axes, taxa_list):
    print(f"Processing: {taxon}")

    if taxon not in df.columns:
        print(f"⚠️ Taxon not found: {taxon}")
        continue  # Skip this taxon to avoid crash (its panel stays empty)

    df_sub = df[['o_scorad', taxon]].dropna()

    # Scatter of abundance vs. severity with a fitted regression line
    sns.regplot(
        data=df_sub,
        x='o_scorad',
        y=taxon,
        scatter_kws={'alpha': 0.5, 's': 20},
        line_kws={'color': 'black'},
        ax=ax
    )

    # Pearson r calculation (needs at least two observations)
    if len(df_sub) >= 2:
        r, pval = pearsonr(df_sub['o_scorad'], df_sub[taxon])
        ax.text(
            0.05, 0.95,
            f"r = {r:.2f}\np = {pval:.2e}",
            transform=ax.transAxes,
            ha='left', va='top',
            bbox=dict(facecolor='white', alpha=0.7, edgecolor='none')
        )

    # Axis formatting
    ax.set_title(taxon.replace('g__', ''), fontsize=12)
    ax.set_xlabel("SCORAD Severity", fontsize=12)
    # regplot labels the y-axis with the column name; clear it on every panel.
    # Tick labels are NOT cleared with set_yticklabels([]): with sharey=True the
    # tick formatter is shared, so that call would affect the first panel too.
    ax.set_ylabel("")

# The taxa columns of df hold relative abundances (each value divided by its
# sample's row sum), so label the axis accordingly and let matplotlib choose
# the limits instead of a fixed [-2, 6] range.
axes[0].set_ylabel("Relative abundance", fontsize=12)

# Final layout adjustments (leave room at the top for the suptitle)
fig.tight_layout(rect=[0, 0, 1, 0.92])
fig.suptitle("Correlation Between Skin Taxa of Umtata Samples and AD Severity", fontsize=14, y=0.98)

# Save figure
# NOTE(review): the file name still says 'rclr' although relative abundance is
# plotted; path kept unchanged so existing references to the figure keep working.
fig.savefig('../Plots/Analysis_figures/Severity_Correlations/rclr_abundance_vs_severity_ASV_skin_Umtata.png', dpi=600)


Processing: g__Staphylococcus_ASV-1
Processing: g__Micrococcus_ASV-1
Processing: g__Veillonella_A_ASV-1
