In [61]:
# Import Python packages
import pandas as pd
import numpy as np
import biom
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import cycle
import os
from matplotlib.colors import ListedColormap
from matplotlib.colors import to_rgba
from biom import load_table


In [62]:
# Load the metadata
metadata_path = '../Data/Metadata/updated_clean_ant_skin_metadata.tab'
metadata = pd.read_csv(metadata_path, sep='\t')

metadata['#sample-id'] = metadata['#sample-id'].str.replace('_', '')
# Set Sample-ID as the index for the metadata dataframe 
metadata = metadata.set_index('#sample-id')


# Create group column based on case_type to simplify group names
metadata['group'] = metadata['case_type'].map({
    'case-lesional skin': 'skin-ADL',
    'case-nonlesional skin': 'skin-ADNL', 
    'control-nonlesional skin': 'skin-H',
    'case-anterior nares': 'nares-AD',
    'control-anterior nares': 'nares-H'
})

metadata

Unnamed: 0_level_0,PlateNumber,PlateLocation,i5,i5Sequence,i7,i7Sequence,identifier,Sequence,Plate ID,Well location,...,sex,enrolment_date,enrolment_season,hiv_exposure,hiv_status,household_size,o_scorad,FWD_filepath,REV_filepath,group
#sample-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ca009STL,1,A1,SA501,ATCGTACG,SA701,CGAGAGTT,SA701SA501,CGAGAGTT-ATCGTACG,1.010000e+21,A1,...,male,4/16/2015,Autumn,Unexposed,negative,4.0,40,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL
900221,1,B1,SA502,ACTATCTG,SA701,CGAGAGTT,SA701SA502,CGAGAGTT-ACTATCTG,1.010000e+21,B1,...,female,8/11/2015,Winter,Unexposed,negative,7.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL
Ca010EBL,1,C1,SA503,TAGCGAGT,SA701,CGAGAGTT,SA701SA503,CGAGAGTT-TAGCGAGT,1.010000e+21,C1,...,female,11/20/2014,Spring,Unexposed,negative,7.0,21,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL
900460,1,D1,SA504,CTGCGTGT,SA701,CGAGAGTT,SA701SA504,CGAGAGTT-CTGCGTGT,1.010000e+21,D1,...,female,9/23/2015,Spring,Unexposed,,4.0,40,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL
900051,1,E1,SA505,TCATCGAG,SA701,CGAGAGTT,SA701SA505,CGAGAGTT-TCATCGAG,1.010000e+21,E1,...,male,4/21/2015,Autumn,Unexposed,negative,7.0,41,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Ca006ONL2,6,H1,SA508,GACACCGT,SB701,CTCGACTT,SB701SA508,CTCGACTT-GACACCGT,1.010000e+21,H1,...,female,3/25/2015,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL
Ca006ONNL,6,F2,SA506,CGTGAGTG,SB702,CGAAGTAT,SB702SA506,CGAAGTAT-CGTGAGTG,1.010000e+21,F2,...,female,3/25/2015,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADNL
Ca006ONNL2,6,H2,SA508,GACACCGT,SB702,CGAAGTAT,SB702SA508,CGAAGTAT-GACACCGT,1.010000e+21,H2,...,female,3/25/2015,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADNL
Ca006ONPN,6,F3,SA506,CGTGAGTG,SB703,TAGCAGCT,SB703SA506,TAGCAGCT-CGTGAGTG,1.010000e+21,F3,...,female,3/25/2015,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,nares-AD


In [63]:
# Read in table at ASV level
biom_path = '../Data/Tables/Absolute_Abundance_Tables/feature_table_with_tax_labels_Genus.biom'
# biom_path = '../Data/Tables/Absolute_Abundance_Tables/209766_filtered_by_prevalence_1pct_rare_Genus-ASV-non-collapse.biom'
biom_tbl = load_table(biom_path)
df = pd.DataFrame(biom_tbl.to_dataframe().T)

# delete the prefix from the index
df.index = df.index.str.replace('15564.', '')

# Get samples that don't start with 'skin' in metadata
skin_samples = metadata[metadata['group'].str.startswith('skin')].index

# Filter df to keep only skin samples
df = df.loc[skin_samples]

df.index.name = None
df

Unnamed: 0,g__Streptococcus_ASV-1,g__Streptococcus_ASV-2,g__Corynebacterium_ASV-1,g__Corynebacterium_ASV-3,g___ASV-3,g__Bacillus_P_294101_ASV-2,g__Cutibacterium_ASV-1,g___ASV-18,g___ASV-28,g___ASV-25,...,g__Blautia_A_141781_ASV-7,g___ASV-358,g__Peptoniphilus_A_ASV-7,g___ASV-154,g___ASV-169,g__UBA952_ASV-1,g__Petroclostridium_ASV-1,g__Capnocytophaga_820690_ASV-4,g___ASV-202,g__Streptococcus_ASV-37
Ca009STL,15.0,0,0,0,0,0,13.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
900221,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Ca010EBL,8.0,0,0,0,0,0,3.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
900460,66.0,44.0,0,0,15.0,0,4.0,0,0,7.0,...,0,0,0,0,0,0,0,0,0,0
900051,30.0,0,0,0,0,0,10.0,5.0,0,2.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9004032,66.0,0,99.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Ca006ONL,12.0,0,0,0,0,0,27.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Ca006ONL2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Ca006ONNL,9.0,0,0,0,0,0,3.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [64]:
diversity_results = pd.read_csv("../Analyses/Xebec/Output/results/alpha_div/phylo/faith_pd/vector.tsv", sep="\t")
diversity_results = diversity_results.set_index('Unnamed: 0')
diversity_results.index.name = None
diversity_results.columns = ['faith_pd']

# Filter diversity_results to only include samples present in df
diversity_results = diversity_results[diversity_results.index.isin(df.index)]

diversity_results

Unnamed: 0,faith_pd
900317,4.827391
900263,36.001057
900584,17.706857
900270,23.484170
900558,18.015693
...,...
900063,9.252135
900276,16.753817
900293,8.935723
900445,13.433266


In [65]:
# Map the faith_pd column to df
df['faith_pd'] = diversity_results['faith_pd'].reindex(df.index)
# Drop rows where faith_pd is NaN
df = df.dropna(subset=['faith_pd'])

df

Unnamed: 0,g__Streptococcus_ASV-1,g__Streptococcus_ASV-2,g__Corynebacterium_ASV-1,g__Corynebacterium_ASV-3,g___ASV-3,g__Bacillus_P_294101_ASV-2,g__Cutibacterium_ASV-1,g___ASV-18,g___ASV-28,g___ASV-25,...,g___ASV-358,g__Peptoniphilus_A_ASV-7,g___ASV-154,g___ASV-169,g__UBA952_ASV-1,g__Petroclostridium_ASV-1,g__Capnocytophaga_820690_ASV-4,g___ASV-202,g__Streptococcus_ASV-37,faith_pd
900221,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4.911482
900460,66.0,44.0,0,0,15.0,0,4.0,0,0,7.0,...,0,0,0,0,0,0,0,0,0,11.102273
900051,30.0,0,0,0,0,0,10.0,5.0,0,2.0,...,0,0,0,0,0,0,0,0,0,9.615713
900226,3.0,0,0,0,0,0,4.0,0,0,0,...,0,0,0,0,0,0,0,0,0,10.430502
900057,39.0,0,0,0,0,0,0,3.0,0,0,...,0,0,0,0,0,0,0,0,0,8.615012
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
900395,6.0,0,0,0,1.0,0,3.0,0,0,0,...,0,0,0,0,0,0,0,0,0,19.200163
900397,92.0,56.0,4.0,0,0,0,5.0,0,0,0,...,0,0,0,0,0,0,0,0,0,9.031552
900400,144.0,87.0,1.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4.908046
900401,63.0,51.0,5.0,1.0,0,0,11.0,0,0,0,...,0,0,0,0,0,0,0,0,0,17.014607


In [66]:
# Map the 'group' column from metadata to df based on matching index
df['o_scorad'] = metadata.loc[df.index, 'o_scorad']
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['o_scorad'] = metadata.loc[df.index, 'o_scorad']


Unnamed: 0,g__Streptococcus_ASV-1,g__Streptococcus_ASV-2,g__Corynebacterium_ASV-1,g__Corynebacterium_ASV-3,g___ASV-3,g__Bacillus_P_294101_ASV-2,g__Cutibacterium_ASV-1,g___ASV-18,g___ASV-28,g___ASV-25,...,g__Peptoniphilus_A_ASV-7,g___ASV-154,g___ASV-169,g__UBA952_ASV-1,g__Petroclostridium_ASV-1,g__Capnocytophaga_820690_ASV-4,g___ASV-202,g__Streptococcus_ASV-37,faith_pd,o_scorad
900221,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,4.911482,34
900460,66.0,44.0,0,0,15.0,0,4.0,0,0,7.0,...,0,0,0,0,0,0,0,0,11.102273,40
900051,30.0,0,0,0,0,0,10.0,5.0,0,2.0,...,0,0,0,0,0,0,0,0,9.615713,41
900226,3.0,0,0,0,0,0,4.0,0,0,0,...,0,0,0,0,0,0,0,0,10.430502,34
900057,39.0,0,0,0,0,0,0,3.0,0,0,...,0,0,0,0,0,0,0,0,8.615012,33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
900395,6.0,0,0,0,1.0,0,3.0,0,0,0,...,0,0,0,0,0,0,0,0,19.200163,28
900397,92.0,56.0,4.0,0,0,0,5.0,0,0,0,...,0,0,0,0,0,0,0,0,9.031552,54
900400,144.0,87.0,1.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,4.908046,38
900401,63.0,51.0,5.0,1.0,0,0,11.0,0,0,0,...,0,0,0,0,0,0,0,0,17.014607,38


In [67]:
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import pearsonr, spearmanr

# Clean data
df_clean = df[['o_scorad', 'faith_pd']].copy()
df_clean = df_clean.apply(pd.to_numeric, errors='coerce').dropna()

# Compute correlations
x = df_clean['o_scorad']
y = df_clean['faith_pd']
pearson_r, _ = pearsonr(x, y)
spearman_r, _ = spearmanr(x, y)

# Plot and save
plt.figure(figsize=(8, 6))
ax = sns.regplot(data=df_clean, x='o_scorad', y='faith_pd', scatter_kws={'alpha': 0.6})
plt.xlabel('SCORAD (Severity Score)')
plt.ylabel('Faith’s Phylogenetic Diversity')
plt.title('Correlation between SCORAD and Faith PD')
plt.text(0.05, 0.95,
         f"Pearson r = {pearson_r:.2f}\nSpearman ρ = {spearman_r:.2f}",
         transform=ax.transAxes,
         fontsize=12, verticalalignment='top',
         bbox=dict(facecolor='white', alpha=0.7))

plt.grid(True)
plt.tight_layout()
plt.savefig('faith_pd_vs_scorad.png', dpi=300)  # Save the figure
plt.show()


  plt.show()
