In [None]:
#Species specific analysis by genus

In [None]:
#find unique genus in metadata


In [2]:
from qiime2 import Artifact
from qiime2.plugins.feature_table.methods import filter_samples
from qiime2.plugins.taxa.methods import filter_table,collapse

#The below try/except block is unsightly but the alpha function got moved between recent versions of QIIME2
#and it's nice if the notebook is compatible with either
try:
    from qiime2.plugins.diversity.methods import alpha
    from qiime2.plugins.diversity.methods import alpha_phylogenetic
except:
    from qiime2.plugins.diversity.pipelines import alpha
    from qiime2.plugins.diversity.pipelines import alpha_phylogenetic
    
from qiime2.plugins.diversity.visualizers import alpha_group_significance

from qiime2.plugins.feature_table.methods import rarefy
from qiime2.plugins.feature_table.visualizers import summarize

from qiime2.metadata import Metadata
from os.path import abspath,exists,join
from os import mkdir
import shutil

import pandas as pd

In [3]:
input_directory = abspath("../input")
feature_table_file = join(input_directory, "feature_tables/feature_table_decon_all_1000.qza")
mapping_file = join(input_directory,"GCMP_EMP_map_r28_no_empty_samples.txt")
taxonomy_file = join(input_directory,"silva_metaxa2_reference_taxonomy.qza") 
tree_file = join(input_directory,"physeq_rooted_tree.qza")

#load the feature tables
feature_table_decon_all_1000 = Artifact.load("../input/feature_tables/feature_table_decon_all_1000.qza")
feature_table_decon_mucus_1000 = Artifact.load("../input/feature_tables/feature_table_decon_mucus_1000.qza")
feature_table_decon_skeleton_1000 = Artifact.load("../input/feature_tables/feature_table_decon_skeleton_1000.qza")
feature_table_decon_tissue_1000 = Artifact.load("../input/feature_tables/feature_table_decon_tissue_1000.qza")
#merge them
feature_tables_decon_1000 = {"mucus":feature_table_decon_mucus_1000, "tissue":feature_table_decon_tissue_1000, "skeleton":feature_table_decon_skeleton_1000, "all":feature_table_decon_all_1000}

output_dir = abspath("../output/")

ft_output_dir = join(output_dir,"feature_tables")

if not exists(ft_output_dir):
    print(f"Output directory {ft_output_dir} does not yet exist, creating it...")
    mkdir(ft_output_dir)
    print("Done.")

required_files = [feature_tables_decon_1000,mapping_file,taxonomy_file,tree_file]



In [4]:
from qiime2.plugins.feature_table.methods import filter_features
feature_table = Artifact.load(feature_table_file)
tree = Artifact.load(tree_file)
taxonomy = Artifact.load(taxonomy_file)
metadata = Metadata.load(mapping_file)



In [5]:
# Find unique genus names
df = metadata.to_dataframe()
unique_species_names = list(set(list(df['host_genus'])))
print(f"There are {len(unique_species_names)} unique species names in the dataset:")
print(sorted(unique_species_names))

There are 74 unique species names in the dataset:
['Acanthastrea', 'Acropora', 'Aiptasia', 'Alveopora', 'Astrea', 'Aurelia', 'Caulastraea', 'Coscinaraea', 'Ctenactis', 'Cyphastrea', 'Danafungia', 'Dendrophyllia', 'Diploastrea', 'Diploria', 'Dipsastraea', 'Distichopora', 'Echinophyllia', 'Echinopora', 'Eguchipsammia', 'Entacmaea', 'Favites', 'Fungid', 'Galaxea', 'Gardineroseris', 'Goniastrea', 'Goniopora', 'Heliopora', 'Herpolitha', 'Heteractis', 'Homophyllia', 'Hydnophora', 'Isopora', 'Leptastrea', 'Leptoria', 'Lithophyllon', 'Lobophyllia', 'Lobophytum', 'Macrorhynchia', 'Merulina', 'Millepora', 'Missing: Not collected', 'Mnemiopsis', 'Montastraea', 'Montipora', 'Mycedium', 'Not applicable', 'Orbicella', 'Oxypora', 'Pachyseris', 'Palythoa', 'Pavona', 'Pectinia', 'Physogyra', 'Platygyra', 'Plerogyra', 'Pocillopora', 'Podabacia', 'Porites', 'Psammocora', 'Pseudosiderastrea', 'Rhodactis', 'Sarcophyton', 'Scolymia', 'Seriatopora', 'Siderastrea', 'Sinularia', 'Stephanocoenia', 'Stichodactyl

In [6]:
#split into mucus, tissue, and skeleton compartments
feature_tables = {"all":feature_table_decon_all_1000}
compartments = ['M', 'T', 'S']
compartment_names = {"M":"mucus","S":"skeleton","T":"tissue"}
for compartment in compartments:
    compartment_name = compartment_names[compartment]
    print(f"Separating samples for compartment {compartment}")
    where = f"tissue_compartment = '{compartment}'"
    filtered_table, = filter_samples(feature_table, metadata = metadata,where = where)
    feature_tables[compartment_name]=filtered_table
    
print("Done filtering")
print(f"Resulting feature tables:",feature_tables)
    
    


Separating samples for compartment M
Separating samples for compartment T
Separating samples for compartment S
Done filtering
Resulting feature tables: {'all': <artifact: FeatureTable[Frequency] uuid: fc0b47de-3caf-4133-8a6b-c3bd6aba1e3d>, 'mucus': <artifact: FeatureTable[Frequency] uuid: c0483baa-982d-4974-b55e-c6592922b169>, 'tissue': <artifact: FeatureTable[Frequency] uuid: 33559ba7-6513-498d-a542-b149e3d133a9>, 'skeleton': <artifact: FeatureTable[Frequency] uuid: d344500a-95b4-4b5a-a316-9e05155eb795>}


In [7]:
from IPython.core.display import HTML
def calculate_per_species_diversities(feature_table,\
                                      metadata,\
                                      species_column = "host_genus",\
                                      compartment_name = 'all',\
                                      taxonomy = None,\
                                      metrics = ['faith_pd','observed_features','gini_index','dominance','simpson_e'],\
                                      to_skip = ['none','','Not applicable','Missing: Not collected']\
                                      ):
    #Set up a DataFrame to hold results
    results_columns = [species_column,f"n_samples_{compartment_name}"]
    results_columns.extend([f"{metric}_{compartment_name}" for metric in metrics])
    taxonomy_levels = ('domain','phylum','class','order','family','genus')
    taxonomy_labels = [f"most_abundant_{level}_{compartment_name}" for level in taxonomy_levels]
    results_columns.extend(taxonomy_labels)
    print("Result columns:",results_columns)
    results_df = pd.DataFrame(columns = results_columns)
    results_df = results_df.set_index(species_column)
    metadata_df = metadata.to_dataframe()
    unique_species_names = list(set(list(metadata_df[species_column])))
    for species in unique_species_names:
        if species in to_skip:
            continue
            
        #Filter the feature table to just our current species
        where = f"[{species_column}] = '{species}'"
        filter_results = filter_samples(feature_table, metadata = metadata,where = where)
        species_table = filter_results.filtered_table
        #We'll use the species table, not the overall feature table from here on down!
        
        
        #If taxonomy is provided, summarize the type of microbe with highest average
        #abundance at each taxonomic level
        if taxonomy:
            for i, taxon_label in enumerate(taxonomy_levels):
                level = i + 1 #domain is level 1 in QIIME2, not level 0
                most_abundant_taxon = get_dominant_taxon(species_table,taxonomy,level=level)
                column_label = f"most_abundant_{taxon_label}_{compartment_name}"
                results_df.loc[species,column_label] = most_abundant_taxon
            
        
        for metric in metrics:
            #print(f"Calculating {metric} for {species}")
            try:
                if metric == "faith_pd":
                    alpha_results = alpha_phylogenetic(species_table,phylogeny=tree,metric=metric)
                else:
                    alpha_results = alpha(species_table, metric = metric)
            except ValueError:
                print(f"Can't calculate {metric} for {species} {compartment_name}...skipping")
                continue
            
            alpha_diversity = alpha_results.alpha_diversity
            species_adiv = alpha_diversity.view(pd.Series)
            
            species_mean = species_adiv.mean()
            results_df.loc[species,f"{metric}_{compartment_name}"] = species_mean
            print(f"{species}\t{metric}\t{compartment_name}\t{round(species_mean,4)}")
            
            
            
        #For last metric only we'll grab n (should be the same for all)
        species_n = len(species_adiv)     
        results_df.loc[species,f"n_samples_{compartment_name}"]=species_n
    return results_df
 
#calculate the dominant taxon family
def get_dominant_taxon(feature_table,taxonomy,level=5):
    """Collapse the feature table at the specified level, then find which feature is most abundant"""
    try:
        collapse_results = collapse(feature_table,taxonomy,level)
        taxon_table = collapse_results.collapsed_table
        taxon_df = taxon_table.view(pd.DataFrame)
        #Calculate average abundance of each taxon in this species
    except TypeError:
        return None
    taxon_abundance_dict = {taxon_df[col].mean():col for col in list(taxon_df.columns)}
    greatest_mean_abundance = sorted(taxon_abundance_dict.keys())[-1]
    most_abundant_taxon = taxon_abundance_dict[greatest_mean_abundance]
    print("Most abundant taxon:",most_abundant_taxon)
    return most_abundant_taxon
    
    
results_dfs = {} 
for compartment,table in sorted(feature_tables_decon_1000.items(),reverse=True):
    result_df = calculate_per_species_diversities(table,metadata,compartment_name = compartment,taxonomy = taxonomy)
    species_adiv_path = join(output_dir,f"adiv_trait_table_{compartment}.tsv")
    results_dfs[compartment]=result_df
    result_df.to_csv(species_adiv_path,sep="\t")

Result columns: ['host_genus', 'n_samples_tissue', 'faith_pd_tissue', 'observed_features_tissue', 'gini_index_tissue', 'dominance_tissue', 'simpson_e_tissue', 'most_abundant_domain_tissue', 'most_abundant_phylum_tissue', 'most_abundant_class_tissue', 'most_abundant_order_tissue', 'most_abundant_family_tissue', 'most_abundant_genus_tissue']
Most abundant taxon: D_0__Bacteria
Most abundant taxon: D_0__Bacteria;D_1__Proteobacteria
Most abundant taxon: D_0__Bacteria;D_1__Proteobacteria;D_2__Alphaproteobacteria
Most abundant taxon: D_0__Bacteria;D_1__Proteobacteria;D_2__Alphaproteobacteria;D_3__Rhizobiales
Most abundant taxon: D_0__Bacteria;D_1__Proteobacteria;D_2__Alphaproteobacteria;D_3__Rhizobiales;__
Most abundant taxon: D_0__Bacteria;D_1__Proteobacteria;D_2__Alphaproteobacteria;D_3__Rhizobiales;__;__
Echinophyllia	faith_pd	tissue	64.7863
Echinophyllia	observed_features	tissue	139.4
Echinophyllia	gini_index	tissue	0.9437
Echinophyllia	dominance	tissue	0.1824
Echinophyllia	simpson_e	tiss

In [9]:
#Integrate data into a per species level table
from IPython.core.display import HTML,display

disease_df = pd.read_csv("../input/disease_by_genus.csv")
disease_df = disease_df.set_index('host_genus')
for compartment,results_df in results_dfs.items():
    results_df.index.rename('host_genus')    
    disease_df = pd.merge(disease_df, results_df, how="outer", left_index = True, right_index = True, indicator=False)
disease_df.to_csv("../output/GCMP_trait_table_genus.tsv",sep="\t")
