In [46]:
# Build the feature table and metadata tables used as input for Qadabra

In [47]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import biom
from biom import load_table
import numpy as np
from scipy.stats import spearmanr
from scipy.stats import mannwhitneyu
from statsmodels.stats.multitest import multipletests
import itertools
import matplotlib.patches


In [48]:
# Read in the genus-level feature table (file name and g__* columns indicate genus, not ASV)
biom_path = '../Data/Tables/Absolute_Abundance_Tables/df_16S_filtered_feature_table_rare_Genus_absolute.biom'
biom_tbl = load_table(biom_path)
# Transpose so that rows are samples and columns are genera
df = pd.DataFrame(biom_tbl.to_dataframe().T)

# Strip the '15564.' prefix from the sample IDs.
# The pattern is anchored (^) and the dot is escaped so only a literal leading
# prefix is removed; regex=True is passed explicitly because the default of
# str.replace differs between pandas 1.x (regex) and pandas 2.x (literal).
df.index = df.index.str.replace(r'^15564\.', '', regex=True)
df

Unnamed: 0,g__Streptococcus,g__Staphylococcus,g__Haemophilus_D_734546,g__Corynebacterium,g__,g__Acinetobacter,g__Prevotella,g__Micrococcus,g__Dolosigranulum,g__Moraxella_A,...,g__Aerococcus,g__Tepidiphilus,g__Pseudomonas_K,g__CADDYX01,g__Pseudomonas_S,g__Tetragenococcus,g__Thermicanus,g__Thermoanaerobacterium,g__Thermus_A,g__Nakamurella
900344,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
900459,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
900221,2.0,175.0,0,7.0,7.0,82.0,3.0,4.0,0,0,...,0,0,0,0,0,0,0,0,0,0
900570,361.0,710.0,2.0,37.0,510.0,55.0,645.0,0,0,43.0,...,0,0,0,0,0,0,0,0,0,0
900092,2920.0,0,0,46.0,20.0,0,0,2.0,2.0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9003972,674.0,380.0,47.0,45.0,113.0,24.0,148.0,29.0,0,12.0,...,0,0,0,0,0,0,0,0,0,0
900097,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
900498,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
900276,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [49]:
# Presence/absence matrix: 1 where a feature has a count above zero, else 0
df_pa = df.gt(0).astype(int)

# Percentage of samples (rows) in which each feature (column) is present
n_samples = df_pa.shape[0]
presence_percent = df_pa.sum().div(n_samples).mul(100)

# Keep only the features present in at least 10% of samples
is_prevalent = presence_percent.ge(10)
df = df.loc[:, is_prevalent]
df

Unnamed: 0,g__Streptococcus,g__Staphylococcus,g__Haemophilus_D_734546,g__Corynebacterium,g__,g__Acinetobacter,g__Prevotella,g__Micrococcus,g__Dolosigranulum,g__Moraxella_A,...,g__Rothia,g__Capnocytophaga,g__Anaerococcus,g__Finegoldia,g__Sphingomonas_L_486704,g__Brevundimonas,g__Alloprevotella,g__Nocardioides_A_392796,g__Telluria_573210,g__Pauljensenia
900344,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
900459,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
900221,2.0,175.0,0,7.0,7.0,82.0,3.0,4.0,0,0,...,0,0,0,0,0,0,2.0,0,0,0
900570,361.0,710.0,2.0,37.0,510.0,55.0,645.0,0,0,43.0,...,0,0,4.0,0,0,3.0,4.0,12.0,0,4.0
900092,2920.0,0,0,46.0,20.0,0,0,2.0,2.0,0,...,0,0,0,0,0,0,0,0,0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9003972,674.0,380.0,47.0,45.0,113.0,24.0,148.0,29.0,0,12.0,...,13.0,12.0,11.0,1.0,1.0,0,48.0,0,7.0,4.0
900097,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
900498,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
900276,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [53]:
# Write the current feature table `df` to a tab-separated file
filtered_table_path = '../Data/Tables/Absolute_Abundance_Tables/df_16S_filtered_feature_table_rare_Genus_absolute_10filtered_ancombc.tsv'
df.to_csv(filtered_table_path, sep='\t')

In [50]:
# Read the tab-separated sample metadata
metadata_path = '../Data/Metadata/updated_clean_ant_skin_metadata.tab'
metadata = pd.read_csv(metadata_path, sep='\t')

# Remove underscores from the sample IDs, then use them as the index
sample_ids = metadata['#sample-id'].str.replace('_', '')
metadata['#sample-id'] = sample_ids
metadata = metadata.set_index('#sample-id')

# Short group names keyed by the original case_type values.
# Any case_type not listed here ends up as NaN in the 'group' column.
group_labels = {
    'case-lesional skin': 'skin-ADL',
    'case-nonlesional skin': 'skin-ADNL',
    'control-nonlesional skin': 'skin-H',
    'case-anterior nares': 'nares-AD',
    'control-anterior nares': 'nares-H',
}
metadata['group'] = metadata['case_type'].map(group_labels)

metadata

Unnamed: 0_level_0,PlateNumber,PlateLocation,i5,i5Sequence,i7,i7Sequence,identifier,Sequence,Plate ID,Well location,...,sex,enrolment_date,enrolment_season,hiv_exposure,hiv_status,household_size,o_scorad,FWD_filepath,REV_filepath,group
#sample-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ca009STL,1,A1,SA501,ATCGTACG,SA701,CGAGAGTT,SA701SA501,CGAGAGTT-ATCGTACG,1.010000e+21,A1,...,male,4/16/2015,Autumn,Unexposed,negative,4.0,40,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL
900221,1,B1,SA502,ACTATCTG,SA701,CGAGAGTT,SA701SA502,CGAGAGTT-ACTATCTG,1.010000e+21,B1,...,female,8/11/2015,Winter,Unexposed,negative,7.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL
Ca010EBL,1,C1,SA503,TAGCGAGT,SA701,CGAGAGTT,SA701SA503,CGAGAGTT-TAGCGAGT,1.010000e+21,C1,...,female,11/20/2014,Spring,Unexposed,negative,7.0,21,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL
900460,1,D1,SA504,CTGCGTGT,SA701,CGAGAGTT,SA701SA504,CGAGAGTT-CTGCGTGT,1.010000e+21,D1,...,female,9/23/2015,Spring,Unexposed,,4.0,40,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL
900051,1,E1,SA505,TCATCGAG,SA701,CGAGAGTT,SA701SA505,CGAGAGTT-TCATCGAG,1.010000e+21,E1,...,male,4/21/2015,Autumn,Unexposed,negative,7.0,41,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Ca006ONL2,6,H1,SA508,GACACCGT,SB701,CTCGACTT,SB701SA508,CTCGACTT-GACACCGT,1.010000e+21,H1,...,female,3/25/2015,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADL
Ca006ONNL,6,F2,SA506,CGTGAGTG,SB702,CGAAGTAT,SB702SA506,CGAAGTAT-CGTGAGTG,1.010000e+21,F2,...,female,3/25/2015,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADNL
Ca006ONNL2,6,H2,SA508,GACACCGT,SB702,CGAAGTAT,SB702SA508,CGAAGTAT-GACACCGT,1.010000e+21,H2,...,female,3/25/2015,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,skin-ADNL
Ca006ONPN,6,F3,SA506,CGTGAGTG,SB703,TAGCAGCT,SB703SA506,TAGCAGCT-CGTGAGTG,1.010000e+21,F3,...,female,3/25/2015,Autumn,Unexposed,negative,3.0,34,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,/Users/yac027/Gallo_lab/16S_AD_Dube_Dupont/ato...,nares-AD


In [51]:
# Create microbiome_type column based on whether group starts with 'skin' or 'nares'
def _microbiome_type(group):
    """Return 'skin' if the group label starts with 'skin', otherwise 'nares'.

    Missing labels (NaN/None) are returned unchanged instead of raising
    AttributeError on `.startswith`, so samples without a group stay missing
    rather than crashing the cell or being mislabelled.
    """
    if pd.isna(group):
        return group
    return 'skin' if group.startswith('skin') else 'nares'


metadata['microbiome_type'] = metadata['group'].apply(_microbiome_type)
# Save the full metadata table, now including the microbiome_type column
metadata.to_csv('../Data/Metadata/updated_clean_ant_skin_metadata_microbiome_type.tab', sep='\t')

In [52]:
# Export only the microbiome_type column (plus the index) as a tab-separated file
groups_path = '../Data/Metadata/differential_abundance_groups.tsv'
metadata.loc[:, ['microbiome_type']].to_csv(groups_path, sep='\t')