# *Endozoicomonas*-specific analysis



Questions
1. How many Endozoicomonas ASVs?
2. Different by region?
3. Different by compartment?
4. Correlated with disease?



In [15]:
# Sanity check: list the parent directory so we can confirm that the relative
# "../input" and "../output" paths used throughout this notebook will resolve.
from os import listdir
listdir("../")

['.Rhistory', '.DS_Store', 'input', 'output', 'procedure']

## Import Libraries

Note that a QIIME 2 virtual environment must be activated before the QIIME 2 functions below can be imported.
This notebook was run in the `qiime2-amplicon-2024.5` virtual environment.

In [24]:
from os import mkdir
from os.path import join
from qiime2 import Artifact,Metadata
import qiime2.plugins.feature_table.actions as feature_table_actions
from qiime2.plugins.feature_table.methods import filter_features,filter_samples
from qiime2.plugins.taxa.methods import filter_table,collapse

# Beta diversity for Endos only.
# Try `diversity.methods` first and fall back to `diversity.pipelines` when
# `beta` cannot be imported from there (presumably its location differs
# between QIIME 2 releases -- TODO confirm).
# Only ImportError is caught, so unrelated failures (e.g. KeyboardInterrupt,
# or an error raised while a plugin loads) are not silently swallowed.
try:
    from qiime2.plugins.diversity.methods import beta
except ImportError:
    from qiime2.plugins.diversity.pipelines import beta

from qiime2.plugins.diversity.pipelines import beta_phylogenetic
import qiime2.plugins.diversity.actions as diversity_actions
import qiime2.plugins.emperor.actions as emperor_actions

# Output directory for the Endozoicomonas-specific artifacts, visualizations
# and CSV files saved by the cells below.
output_dir = "../output/Endozoicomonas_analysis"
try:
    # mkdir is not recursive: the parent "../output" directory must already exist.
    mkdir(output_dir)
except FileExistsError:
    # Re-running the notebook is fine; the existing directory is reused.
    print(f"{output_dir} exists ... OK")
    

import pandas as pd

../output/Endozoicomonas_analysis exists ... OK


## Load GCMP feature table, metadata, taxonomy and tree

In [17]:

# Input locations, relative to the notebook's working directory.
taxonomy_file = "../input/silva_metaxa2_reference_taxonomy.qza"
tree_file = "../input/physeq_rooted_tree.qza"

# QIIME 2 artifacts: the feature table, the reference taxonomy and the tree.
table = Artifact.load("../output/feature_tables/feature_table_all_1000.qza")
taxonomy = Artifact.load(taxonomy_file)
tree = Artifact.load(tree_file)

# The sample mapping file is read as Metadata rather than as an Artifact.
metadata = Metadata.load("../input/GCMP_EMP_map_r29.txt")

## Summarize the overall feature table

First, summarize the overall feature table (before restricting it to just *Endozoicomonas*).


In [18]:
# Summarize the full (not yet taxon-filtered) feature table alongside the
# sample metadata, and save the resulting visualization.
table_viz = feature_table_actions.summarize(table=table, sample_metadata=metadata).visualization

summary_path = join(output_dir, "feature_table_all_1000_summary.qzv")
table_viz.save(summary_path)

'../output/Endozoicomonas_analysis/feature_table_all_1000_summary.qzv'

## Filter feature table to Endozoicomonas ASVs only 

Include only ASVs annotated as *Endozoicomonas* at the genus level. Summarize the resulting feature table to extract ASV counts.

In [19]:
# Keep only features whose taxonomy annotation contains "endozoicomonas".
filter_table_results = filter_table(
    table=table,
    taxonomy=taxonomy,
    include="endozoicomonas",
    mode="contains",
)
endo_table, = filter_table_results
endo_table.save(join(output_dir, "feature_table_all_1000_endos.qza"))

# Summarize the Endozoicomonas-only table (this re-binds `table_viz`).
table_viz = feature_table_actions.summarize(table=endo_table, sample_metadata=metadata).visualization
table_viz.save(join(output_dir, "feature_table_all_1000_endos_summary.qzv"))



'../output/Endozoicomonas_analysis/feature_table_all_1000_endos_summary.qzv'

## Remove ASVs with < 10 reads. 

Rare ASVs are often artifactual, and comparing extremely rare *Endozoicomonas* may dilute statistical power to detect biologically plausible trends during multiple comparisons correction.

In [20]:
# Apply a minimum-frequency filter of 10 to the Endozoicomonas features, then
# save the filtered table.
filtered_endo_table, = filter_features(table=endo_table, min_frequency=10)

abund_filtered_path = join(output_dir, "feature_table_all_1000_endos_abund_min_10.qza")
filtered_endo_table.save(abund_filtered_path)

'../output/Endozoicomonas_analysis/feature_table_all_1000_endos_abund_min_10.qza'

## Build a heatmap of Endozoicomonas by tissue compartment

In [21]:
compartment_mdc = metadata.get_column("tissue_compartment")

# Sample filters, applied in order:
#   1. keep only mucus (M), tissue (T) or skeleton (S) samples
#   2. remove outgroups
filtered_compartment_table = filtered_endo_table
for where_clause in (
    'tissue_compartment == "M" OR tissue_compartment == "T" OR tissue_compartment == "S"',
    'outgroup == "n"',
):
    filtered_compartment_table, = filter_samples(
        table=filtered_compartment_table,
        metadata=metadata,
        where=where_clause,
    )

# Collapse samples by compartment (summing counts) to allow easier visualization.
filtered_compartment_table, = feature_table_actions.group(
    table=filtered_compartment_table,
    axis="sample",
    metadata=compartment_mdc,
    mode="sum",
)

# Draw and save the heatmap of the collapsed table.
heatmap_viz = feature_table_actions.heatmap(
    table=filtered_compartment_table,
    sample_metadata=None,
    color_scheme="viridis",
).visualization
heatmap_viz.save(join(output_dir, "heatmap_by_compartment_endos_1000.qzv"))

'../output/Endozoicomonas_analysis/heatmap_by_compartment_endos_1000.qzv'

## Calculate a measure of mucus vs. tissue specificity for Endozoicomonas

In [25]:
# View the compartment-collapsed table as a DataFrame and transpose it so that
# each row is one ASV and the columns are the compartments (M, S, T).
compartment_df = filtered_compartment_table.view(pd.DataFrame).T

specificity_csv = join(output_dir, "Endozoicomonas_strain_tissue_specificity.csv")
compartment_df.to_csv(specificity_csv)
compartment_df

Unnamed: 0,M,S,T
TACGGAGGGTGCGAGCATTAATCGGAATTACTGGGCGTAAAGCGTGCGTAGGCGGCTTTTTAAGTTGGATGTGAAAGCCCCGGGCTCAACCTGGGAACTG,2.0,0.0,28.0
TACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGTAGGCGGCCTTTTAAGTTGGATGTGAAAGCCTCGGGCTCAACCTGAGAACTG,0.0,0.0,189.0
TACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGTAGGCGGCCTTTTAAGTTGGATGTGAAAGCCCCGGGCTCAACCCGGGAACTG,38.0,53.0,1485.0
TACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGTAGGCGGCTAATTAAGTGGGATGTGAAAGCCCCGGGCTCAACTCGGGAACTG,19.0,31.0,182.0
TACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGTAGGCGGCTTTTTAAGTTGGATGTGAAAGCCCCGGGCTTAACCTGAGAACTG,0.0,0.0,14.0
TACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGTAGGCGGCTTGTTAAGTTGGATGTGAAAGCCCCGGGCTCAACCTGGGAACTG,8.0,14.0,13.0
TACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGTAGGCGGCTTGTTAAGTTGGATGTGAAAGCCCCGGGCTTAACCTGGGAACGG,1.0,4.0,13.0
TACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGTAGGCGGCCTTTTAAGTTGGATGTGAAAGCCCCGGGCTCAACCTGGGAACTG,1606.0,184.0,4763.0
TACGTAGGGCGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGTAGGCGGCCGCGCAAGTCGGATGTGAAAGCCCCGGGCTCAACCTGGGAACGG,0.0,0.0,14.0
TACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGTAGGCGGCTACTTAAGTTGGATGTGAAAGCCCCGGGCTCAACCTGGGAACTG,25.0,2.0,28.0


In [26]:
from numpy import log
def calc_log_ratio(df: pd.DataFrame, target_category: str, all_categories: list):
    """Return, per row, the target category's share of the summed log counts.

    For every row of ``df`` this evaluates::

        log(df[target_category] + 1) / sum(log(df[c] + 1) for c in all_categories)

    Note that this is a ratio of logs, not the log of a ratio. The +1
    pseudocount keeps zero counts finite (log(1) == 0).

    Parameters
    ----------
    df : pd.DataFrame
        Count table with one column per category. It is not modified.
    target_category : str
        Name of the column whose share is computed.
    all_categories : list
        Names of the columns whose log counts are summed in the denominator
        (a list of column names, e.g. ``["M", "T", "S"]``).

    Returns
    -------
    pd.Series
        One value per row of ``df``. Rows whose counts are zero in every
        listed category evaluate to 0/0 and are therefore NaN.

    Raises
    ------
    ValueError
        If ``all_categories`` is empty, since the denominator is then undefined.
    """
    all_categories = list(all_categories)
    if not all_categories:
        raise ValueError("all_categories must contain at least one column name")

    numerator = log(df[target_category] + 1)
    # Summed in the order given, starting from 0, one column at a time.
    denominator = sum(log(df[c] + 1) for c in all_categories)
    return numerator / denominator

# Add one "<compartment>_specificity" column per compartment to compartment_df
# (in place), then display the table ordered by mucus specificity.
compartments = ["M", "T", "S"]
for compartment in compartments:
    specificity_column = f"{compartment}_specificity"
    compartment_df[specificity_column] = calc_log_ratio(compartment_df, compartment, compartments)

compartment_df.sort_values(by="M_specificity", ascending=False)

Unnamed: 0,M,S,T,M_specificity,T_specificity,S_specificity
TACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGTAGGCGGCCTTTTAAGTTGGATGTGAAAGCCCCGGGCTCAACCTGGGAACTG,10.0,0.0,0.0,1.0,0.0,0.0
TACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGTAGGCGGCTTTTTAAGTTGGATGTGAAAGCCCCGGGCTCAACCTGAGAACTG,10.0,0.0,0.0,1.0,0.0,0.0
TACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGTAGGCGGCTGCCTAAGTTGGATGTGAAAGCCCCGGGCTTAACCTGGGAACTG,106.0,0.0,9.0,0.6699,0.3301,0.0
TACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGTAGGCGGCCTTTTAAGTTGGATGTGAAAGCCCCGGGCTTAACCCGGGAACTG,50.0,3.0,4.0,0.567563,0.232324,0.200113
GACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGTAGGCGGCCTTTTAAGTTGGATGTGAAAGCCCCGGGCTTAACCTGGGAACGG,10.0,0.0,6.0,0.552026,0.447974,0.0
TACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGTAGGCGGCTGTCTAAGTTGGGTGTGAAAGCCCCGGGCTCAACCTGGGAACTG,98.0,0.0,70.0,0.518765,0.481235,0.0
TACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGTAGGCGGCTACCTAAGTTGGATGTGAAAGCCCCGGGCTCAACCTGGGAACTG,141.0,0.0,172.0,0.490233,0.509767,0.0
TACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGTAGGCGGCCTTTTAAGTTGGATGTGAAAGCCCCGGGCTTAACCTGGGAACTG,5.0,0.0,7.0,0.462843,0.537157,0.0
TACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGTAGGCGGCTAGTTAAGTTGGATGTGAAAGCCCCGGGCTCAACCTGGGAACTG,359.0,2.0,384.0,0.454948,0.460138,0.084914
TACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGTAGGCGGCTGCCTAAGTTGGATGTGAAAGCCCCGGGCTCAACCTGGGAACTG,310.0,9.0,117.0,0.447964,0.37233,0.179706


## Identify Endozoicomonas strains specific to Mucus, Tissue or Skeleton

In [27]:
# Show which columns compartment_df currently holds before filtering on them.
print(compartment_df.columns)

def identify_endos_by_compartment(df: pd.DataFrame, target_compartment:str, other_compartments:list):
    """Select the rows whose count is highest in the target compartment.

    A row is kept only when its value in ``target_compartment`` is strictly
    greater than its value in each of the two ``other_compartments``; ties
    are therefore excluded.

    Parameters
    ----------
    df : pd.DataFrame
        Table with one column per compartment. The filter is evaluated on
        this frame itself (not on any notebook-level variable).
    target_compartment : str
        Column that must hold the largest value.
    other_compartments : list
        Exactly two column names to compare against.

    Returns
    -------
    pd.DataFrame
        The subset of rows of ``df`` that pass the comparison.

    Raises
    ------
    ValueError
        If ``other_compartments`` does not contain exactly two names.
    """
    if len(other_compartments) != 2:
        raise ValueError("Only set up to analyze exactly 3 compartments")
    ref1, ref2 = other_compartments
    target_counts = df[target_compartment]
    is_enriched = (target_counts > df[ref1]) & (target_counts > df[ref2])
    return df[is_enriched]

# ASVs whose count is highest in mucus, tissue or skeleton respectively.
mucus_endos = identify_endos_by_compartment(compartment_df, "M", ["T", "S"])
tissue_endos = identify_endos_by_compartment(compartment_df, "T", ["M", "S"])
skeleton_endos = identify_endos_by_compartment(compartment_df, "S", ["M", "T"])

# Number of ASVs (rows) in each enriched set.
n_mucus_endo_ASVs = len(mucus_endos)
n_tissue_endo_ASVs = len(tissue_endos)
n_skeleton_endo_ASVs = len(skeleton_endos)

# Report each count and write the matching table to CSV, one compartment at a time.
for count_label, asv_count, enriched_df, csv_name in (
    ("Mucus Endozoicomonas ASVs:", n_mucus_endo_ASVs, mucus_endos, "Mucus_Enriched_Endozoicomonas.csv"),
    ("Tissue Endozoicomonas ASVs:", n_tissue_endo_ASVs, tissue_endos, "Tissue_Enriched_Endozoicomonas.csv"),
    ("Skeleton Endozoicomonas ASVs:", n_skeleton_endo_ASVs, skeleton_endos, "Skeleton_Enriched_Endozoicomonas.csv"),
):
    print(count_label, asv_count)
    enriched_df.to_csv(join(output_dir, csv_name))

Index(['M', 'S', 'T', 'M_specificity', 'T_specificity', 'S_specificity'], dtype='object')
Mucus Endozoicomonas ASVs: 11
Tissue Endozoicomonas ASVs: 39
Skeleton Endozoicomonas ASVs: 5


## Add genus-level trait values for M,T, or S-specific Endos

Here we filter the overall *Endozoicomonas* feature table down to just the M-, T- or S-enriched ASVs, one compartment at a time in a loop.
We then collapse the *Endozoicomonas* table by coral genus and, finally, extract the summed counts of all remaining ASVs for each genus. These counts are joined to the trait table in the same step.



In [28]:
compartment_mdc = metadata.get_column("tissue_compartment")

# Rebuild the per-sample table (not collapsed by compartment) from the
# abundance-filtered Endozoicomonas table.
# Step 1: keep only mucus, tissue or skeleton samples.
mts_where = 'tissue_compartment == "M" OR tissue_compartment == "T" OR tissue_compartment == "S"'
filtered_compartment_table, = filter_samples(
    table=filtered_endo_table,
    metadata=metadata,
    where=mts_where,
)

# Step 2: remove outgroups.
filtered_compartment_table, = filter_samples(
    table=filtered_compartment_table,
    metadata=metadata,
    where='outgroup == "n"',
)

In [31]:
# Collapse samples by host genus, summing counts across the samples of each genus.
host_genus_mdc = metadata.get_column("host_genus_id")
genus_table, = feature_table_actions.group(
    table=filtered_compartment_table,
    axis="sample",
    metadata=host_genus_mdc,
    mode="sum",
)

# Transpose the DataFrame view so that ASVs are the rows and host genera the columns.
genus_df = genus_table.view(pd.DataFrame).transpose()

results_dir = join("..", "output")

# Both trait tables are tab-separated and are indexed by their 'host_genus' column.
trait_table = join(results_dir, "GCMP_trait_table_with_abundances_and_adiv_and_metadata_zeros.tsv")
trait_table_df = pd.read_csv(trait_table, sep="\t").set_index('host_genus')

growth_trait_table = join(results_dir, "GCMP_trait_table_with_abundances_and_adiv_and_metadata_and_growth_data_depth.tsv")
growth_trait_table_df = pd.read_csv(growth_trait_table, sep="\t").set_index('host_genus')

labels = ["Mucus","Tissue","Skeleton"]
for label, compartment_endo_df in zip(labels, [mucus_endos, tissue_endos, skeleton_endos]):
    # NOTE: despite the "_ASV_count" suffix, this column holds the summed
    # counts of the enriched ASVs per host genus, not a number of ASVs.
    count_column = f"{label}_enriched_Endozoicomonas_ASV_count"

    # Rows of genus_df are ASVs: keep those enriched in this compartment and
    # sum them down each host-genus column. This gives the per-genus Series
    # directly, without adding a column to a transposed intermediate frame.
    is_enriched_asv = genus_df.index.isin(list(compartment_endo_df.index))
    genus_totals = genus_df[is_enriched_asv].sum(axis=0).rename(count_column)

    # Outer join keeps genera that appear on only one side.
    trait_table_df = trait_table_df.join(genus_totals, how="outer")
    growth_trait_table_df = growth_trait_table_df.join(genus_totals, how="outer")

# Write each table once, after all three columns have been added (the files
# previously were rewritten on every loop iteration with the same final content).
trait_table_df.to_csv(join(results_dir,"GCMP_trait_table_with_abundances_and_adiv_and_metadata_zeros_endos_by_compartment.tsv"),sep="\t")
growth_trait_table_df.to_csv(join(results_dir,"GCMP_trait_table_growth_data_depth_endos_by_compartment.tsv"),sep="\t")

## *Endozoicomonas* -specific phylogenetic beta diversity analysis

Next, we ran an *Endozoicomonas*-specific beta diversity analysis to look for shifts *within* the pool of *Endozoicomonas*.

In [None]:
# Weighted UniFrac distances among samples, computed on the Endozoicomonas-only
# table with the loaded tree.
# NOTE(review): this uses `endo_table`, not the min-frequency-filtered
# `filtered_endo_table` -- confirm that is intended.
metric = 'weighted_unifrac'
beta_results_wu = beta_phylogenetic(table=endo_table, phylogeny=tree, metric=metric)
beta_dm_wu = beta_results_wu.distance_matrix

# Pairwise PERMANOVA for each metadata column; one .qzv is saved per column.
columns_of_interest = ['ocean_area','tissue_compartment','complex_robust']
for column in columns_of_interest:
    grouping_column = metadata.get_column(column)
    pairwise_beta_diversity = diversity_actions.beta_group_significance(
        distance_matrix=beta_dm_wu,
        metadata=grouping_column,
        method='permanova',
        pairwise=True,
    )

    output_filename = join(output_dir,f"bdiv_{metric}_permanova_all_{column}.qzv")
    beta_pairwise_visualization = pairwise_beta_diversity.visualization
    beta_pairwise_visualization.save(output_filename)

## Non-phylogenetic but compositional analysis of *Endozoicomonas* beta diversity

In [None]:
# Aitchison distances among samples, computed on the Endozoicomonas-only table
# (no tree is used for this metric).
# NOTE(review): this uses `endo_table`, not the min-frequency-filtered
# `filtered_endo_table` -- confirm that is intended.
metric = 'aitchison'
beta_results_aitchison = beta(table=endo_table, metric=metric)
beta_dm_aitchison = beta_results_aitchison.distance_matrix

# Pairwise PERMANOVA for each metadata column; one .qzv is saved per column.
columns_of_interest = ['ocean_area','tissue_compartment','complex_robust']
for column in columns_of_interest:
    grouping_column = metadata.get_column(column)
    pairwise_beta_diversity = diversity_actions.beta_group_significance(
        distance_matrix=beta_dm_aitchison,
        metadata=grouping_column,
        method='permanova',
        pairwise=True,
    )

    output_filename = join(output_dir,f"bdiv_{metric}_permanova_all_{column}.qzv")
    beta_pairwise_visualization = pairwise_beta_diversity.visualization
    beta_pairwise_visualization.save(output_filename)

## Visualize Endozoicomonas Beta Diversity

This step uses the *Endozoicomonas*-specific beta diversity distance matrices (`beta_dm_wu` and `beta_dm_aitchison`) calculated above as input to a Principal Coordinates Analysis (PCoA) visualization.

In [None]:

# Weighted UniFrac: ordinate the distance matrix, then render an Emperor plot
# coloured by the sample metadata.
pcoa = diversity_actions.pcoa(distance_matrix=beta_dm_wu).pcoa
emperor_viz = emperor_actions.plot(pcoa=pcoa, metadata=metadata).visualization
emperor_viz.save(join(output_dir, "endos_weighted_unifrac_emperor_plot.qzv"))

# Aitchison: same two steps (this re-binds `pcoa` and `emperor_viz`).
pcoa = diversity_actions.pcoa(distance_matrix=beta_dm_aitchison).pcoa
emperor_viz = emperor_actions.plot(pcoa=pcoa, metadata=metadata).visualization
emperor_viz.save(join(output_dir, "endos_aitchison_emperor_plot.qzv"))



In [None]:
import qiime2.plugins.composition.actions as composition_actions


from qiime2.plugins.feature_table.methods import filter_samples
# Restrict the abundance-filtered Endozoicomonas table to mucus (M), tissue (T)
# and skeleton (S) samples. This re-binds `filtered_endo_table` in place, so
# every later use of that name sees the sample-filtered table.
# NOTE(review): unlike the earlier compartment cells, no 'outgroup == "n"'
# filter is applied here -- confirm that outgroups should be kept.
filtered_endo_table, = filter_samples(table=filtered_endo_table,metadata=metadata,
where = 'tissue_compartment == "M" OR tissue_compartment == "T" OR tissue_compartment == "S"')

import pandas as pd
# Display the filtered table as a DataFrame (this renders the whole frame).
filtered_endo_table.view(pd.DataFrame)

In [None]:
# ANCOM-BC on the sample-filtered Endozoicomonas table, modelling
# tissue_compartment with "T" as the reference level.
ancombc_results = composition_actions.ancombc(
    table=filtered_endo_table,
    metadata=metadata,
    formula='tissue_compartment',
    reference_levels=["tissue_compartment::T"],
)
ancombc_compartment = ancombc_results.differentials
ancombc_compartment.save(join(output_dir, "ancom_bc_endos_tissue_stats.qza"))

In [None]:
# Bar plot of the ANCOM-BC results, using a significance threshold of 0.001.
da_barplot_compartment_viz = composition_actions.da_barplot(
    data=ancombc_compartment,
    significance_threshold=0.001,
).visualization

barplot_path = join(output_dir, "ancom_bc_barplot_endos_tissue_compartment_MTS_only.qzv")
da_barplot_compartment_viz.save(barplot_path)