# Machine Learning on GCMP data
The goal is to try predicting interesting features, such as functional group, from microbial data.
Inputs include the original OTU tables, taxon-summarized OTU tables, and PICRUSt predictions.

In [1]:
#Import relevant libraries
from os.path import join,abspath

#Set up useful paths
# Results are written under output_folder; the mapping file holds the
# per-sample metadata handed to every command-line script via -m.
output_folder = "../output"
mapping_file = "../input/gcmp16S_map_r23.txt"

# Feature tables read from ../input: two OTU tables plus the KO and COG
# prediction tables.
otu_table_all = "../input/otu_table_mc2_wtax_no_pynast_failures_no_organelles_even1000.biom"
otu_table_coral_only = "../input/otu_table_mc2_wtax_no_pynast_failures_no_organelles_coral_tissue_mucus_skeleton_only.biom"
ko_table = "../input/ko_predictions.biom"
cog_table = "../input/cog_predictions.biom"

# Taxonomy-summarized versions of the coral-only table (levels L2, L4, L6),
# expected inside the taxa summary folder.
taxa_summary_folder_coral_only = join(output_folder, "taxa_summary_coral_only")
otu_table_L2_coral_only = join(
    taxa_summary_folder_coral_only,
    "otu_table_mc2_wtax_no_pynast_failures_no_organelles_coral_tissue_mucus_skeleton_only_L2.biom")
otu_table_L4_coral_only = join(
    taxa_summary_folder_coral_only,
    "otu_table_mc2_wtax_no_pynast_failures_no_organelles_coral_tissue_mucus_skeleton_only_L4.biom")
otu_table_L6_coral_only = join(
    taxa_summary_folder_coral_only,
    "otu_table_mc2_wtax_no_pynast_failures_no_organelles_coral_tissue_mucus_skeleton_only_L6.biom")

# Lookup of every feature table by a short key; later cells pick their
# inputs from this dict (and add more entries to it).
input_data = {
    "coral_only_L2": otu_table_L2_coral_only,
    "coral_only_L4": otu_table_L4_coral_only,
    "coral_only_L6": otu_table_L6_coral_only,
    "ko": ko_table,
    "cog": cog_table,
    "otu_table_all": otu_table_all,
    "otu_table_coral_only": otu_table_coral_only,
}

In [2]:
#Make results summarized by taxonomy
# Shell call: runs the summarize_taxa.py command-line script with the
# coral-only OTU table as input (-i) and taxa_summary_folder_coral_only as the
# output location (-o). The path-setup cell expects the *_L2.biom, *_L4.biom
# and *_L6.biom summaries to be found in that folder.
!summarize_taxa.py -i $otu_table_coral_only -o $taxa_summary_folder_coral_only

In [9]:
for otu_table_key in ["coral_only_L4","coral_only_L2","coral_only_L6","otu_table_coral_only"]:
    otu_table = input_data[otu_table_key]
    output_location = join(output_folder,"%s_per_compartment" % otu_table_key)
    !split_otu_table.py -i $otu_table -m $mapping_file -f "BiologicalMatter" -o $output_location
        

In [2]:
#Just adding these to the input_data dict in a hardcoded way
#
#One entry per (taxonomy level, compartment) pair, keyed
#"coral_only_<level>_<compartment>". Directory and file names are still
#hardcoded, but come from a single template instead of nine pasted lines.
for tax_level in ["L2", "L4", "L6"]:
    for compartment in ["Mucus", "Tissue", "Skeleton"]:
        split_dir = "../output/coral_only_%s_per_compartment/" % tax_level
        split_file = ("otu_table_mc2_wtax_no_pynast_failures_no_organelles_"
                      "coral_tissue_mucus_skeleton_only_%s__BiologicalMatter_Coral_%s__.biom"
                      % (tax_level, compartment))
        # Dict keys use the lower-case compartment name; file names are capitalised.
        input_data["coral_only_%s_%s" % (tax_level, compartment.lower())] = join(split_dir, split_file)

In [6]:
from os.path import splitext
#Run supervised classifier, en masse
target_categories = ["host_genus_id","field_host_genus_id","host_clade_sensu_fukami_numeric","complex_robust"]
subdir = "RF_phylo_results_by_compartment"
filtered_by_cat = {}
for cat in target_categories:
    print "Running supervised classifiers for category:", cat
    for otu_table_key in ["coral_only_L4_tissue","coral_only_L4_skeleton","coral_only_L4_mucus",\
                          "coral_only_L2_tissue","coral_only_L2_skeleton","coral_only_L2_mucus",\
                          "coral_only_L6_tissue","coral_only_L6_skeleton","coral_only_L6_mucus",\
                         ]:
        
        otu_table = input_data[otu_table_key]
        print "Raw feature table:",otu_table
        
        #NOTE: new in r2: filter to exclude 'Unknown'.  Save these to the 'filtered_by_cat' directory
        #NOTE: new in r3: filter to exclude 'None'
        filtered_otu_table_path = otu_table.rsplit(".",1)[0]+".filtered.biom"
        new_key = "_".join([otu_table_key,cat])
        !filter_samples_from_otu_table.py -i $otu_table -o $filtered_otu_table_path -m $mapping_file -s "$cat:*,!Unknown;$cat:*,!None"
        filtered_by_cat[new_key]=filtered_otu_table_path
        print "Filtered feature table:",filtered_otu_table_path                    
                            
        random_forest_dir = join(output_folder,subdir,"random_forest_%s_%s" %(filtered_otu_table_path,cat))
        !supervised_learning.py -i $filtered_otu_table_path -o $random_forest_dir -c $cat -m $mapping_file -f

Running supervised classifiers for category: host_genus_id
Raw feature table: ../output/coral_only_L4_per_compartment/otu_table_mc2_wtax_no_pynast_failures_no_organelles_coral_tissue_mucus_skeleton_only_L4__BiologicalMatter_Coral_Tissue__.biom
Filtered feature table: ../output/coral_only_L4_per_compartment/otu_table_mc2_wtax_no_pynast_failures_no_organelles_coral_tissue_mucus_skeleton_only_L4__BiologicalMatter_Coral_Tissue__.filtered.biom
Raw feature table: ../output/coral_only_L4_per_compartment/otu_table_mc2_wtax_no_pynast_failures_no_organelles_coral_tissue_mucus_skeleton_only_L4__BiologicalMatter_Coral_Skeleton__.biom
Filtered feature table: ../output/coral_only_L4_per_compartment/otu_table_mc2_wtax_no_pynast_failures_no_organelles_coral_tissue_mucus_skeleton_only_L4__BiologicalMatter_Coral_Skeleton__.filtered.biom
Raw feature table: ../output/coral_only_L4_per_compartment/otu_table_mc2_wtax_no_pynast_failures_no_organelles_coral_tissue_mucus_skeleton_only_L4__BiologicalMatter_Cora

In [5]:
target_categories = ["functional_group_sensu_darling","Mode_of_larval_development","Growth_form_typical","Symbiodinium_sp._in_propagules","full_location","binary_macroalgal_contact","binary_turf_contact","IUCN_Red_List_category","geographic_area","Mode_of_larval_development"]
subdir = "RF_physiology_and_location_results_by_compartment"
filtered_by_cat = {}
for cat in target_categories:
    print "Running supervised classifiers for category:", cat
    for otu_table_key in ["coral_only_L4_tissue","coral_only_L4_skeleton","coral_only_L4_mucus",\
                          "coral_only_L2_tissue","coral_only_L2_skeleton","coral_only_L2_mucus",\
                          "coral_only_L6_tissue","coral_only_L6_skeleton","coral_only_L6_mucus"]:
        otu_table = input_data[otu_table_key]
        print "Raw feature table:",otu_table
        
        #NOTE: new in r2: filter to exclude 'Unknown'.  Save these to the 'filtered_by_cat' directory
        #NOTE: new in r3: filter to exclude 'None'
        filtered_otu_table_path = otu_table.rsplit(".",1)[0]+".filtered.biom"
        new_key = "_".join([otu_table_key,cat])
        !filter_samples_from_otu_table.py -i $otu_table -o $filtered_otu_table_path -m $mapping_file -s "$cat:*,!Unknown"
        filtered_by_cat[new_key]=filtered_otu_table_path
        print "Current feature table:",filtered_otu_table_path
        
        random_forest_dir = join(output_folder,subdir,"random_forest_%s_%s" %(otu_table_key,cat))
        !supervised_learning.py -i $filtered_otu_table_path -o $random_forest_dir -c $cat -m $mapping_file -f

Running supervised classifiers for category: functional_group_sensu_darling
Raw feature table: ../output/coral_only_L4_per_compartment/otu_table_mc2_wtax_no_pynast_failures_no_organelles_coral_tissue_mucus_skeleton_only_L4__BiologicalMatter_Coral_Tissue__.biom
Current feature table: ../output/coral_only_L4_per_compartment/otu_table_mc2_wtax_no_pynast_failures_no_organelles_coral_tissue_mucus_skeleton_only_L4__BiologicalMatter_Coral_Tissue__.filtered.biom
Raw feature table: ../output/coral_only_L4_per_compartment/otu_table_mc2_wtax_no_pynast_failures_no_organelles_coral_tissue_mucus_skeleton_only_L4__BiologicalMatter_Coral_Skeleton__.biom
Current feature table: ../output/coral_only_L4_per_compartment/otu_table_mc2_wtax_no_pynast_failures_no_organelles_coral_tissue_mucus_skeleton_only_L4__BiologicalMatter_Coral_Skeleton__.filtered.biom
Raw feature table: ../output/coral_only_L4_per_compartment/otu_table_mc2_wtax_no_pynast_failures_no_organelles_coral_tissue_mucus_skeleton_only_L4__Biolog

In [7]:
#Show that we can distinguish compartments with overall dataset

target_categories = ["BiologicalMatter"]
subdir = "all_compartments"
for cat in target_categories:
    print "Running supervised classifiers for category:", cat
    for otu_table_key in ["coral_only_L2","coral_only_L4","coral_only_L6","ko","cog","otu_table_coral_only"]:
        
        otu_table = input_data[otu_table_key]
        print "Current feature table:",otu_table
        filtered_otu_table_path = otu_table.rsplit(".",1)[0]+".filtered.biom"
        #Remove outgroups: we want to compare coral mucus, skeleton, tissue only
        !filter_samples_from_otu_table.py -i $otu_table -o $filtered_otu_table_path -m $mapping_file -s "outgroup:n"
        
        random_forest_dir = join(output_folder,subdir,"random_forest_%s_%s" %(otu_table_key,cat))
        !supervised_learning.py -i $filtered_otu_table_path -o $random_forest_dir -c $cat -m $mapping_file -f

Running supervised classifiers for category: BiologicalMatter
Current feature table: ../output/taxa_summary_coral_only/otu_table_mc2_wtax_no_pynast_failures_no_organelles_coral_tissue_mucus_skeleton_only_L2.biom
Current feature table: ../output/taxa_summary_coral_only/otu_table_mc2_wtax_no_pynast_failures_no_organelles_coral_tissue_mucus_skeleton_only_L4.biom
Current feature table: ../output/taxa_summary_coral_only/otu_table_mc2_wtax_no_pynast_failures_no_organelles_coral_tissue_mucus_skeleton_only_L6.biom
Current feature table: ../input/ko_predictions.biom
Traceback (most recent call last):
  File "/macqiime/anaconda/bin/filter_samples_from_otu_table.py", line 162, in <module>
    main()
  File "/macqiime/anaconda/bin/filter_samples_from_otu_table.py", line 138, in main
    write_biom_table(filtered_otu_table, output_fp)
  File "/macqiime/anaconda/lib/python2.7/site-packages/qiime/util.py", line 577, in write_biom_table
    biom_table.to_hdf5(biom_file, generated_by, compress)
  File 

In [8]:
target_categories = ["Corallite_width_minimum","prop_Colony_maximum_diameter_universal","prop_Colony_maximum_GCMP_recorded","colony_width_maximum",'temperature','depth','turf_contact_percent','latitude']
for cat in target_categories:
    print "Running regressions for category:", cat
    for otu_table_key in ["coral_only_L6_tissue","coral_only_L6_skeleton","coral_only_L6_mucus"]:
        
        otu_table = input_data[otu_table_key]
        print "Current OTU table:",otu_table
        
        random_forest_dir = join(output_folder,"correlation_%s_%s.txt" %(otu_table_key,cat))
        
        !observation_metadata_correlation.py -i $otu_table -o $random_forest_dir -c $cat -m $mapping_file --metadata_key None

Running regressions for category: Corallite_width_minimum
Current OTU table: ../output/coral_only_L6_per_compartment/otu_table_mc2_wtax_no_pynast_failures_no_organelles_coral_tissue_mucus_skeleton_only_L6__BiologicalMatter_Coral_Tissue__.biom
Current OTU table: ../output/coral_only_L6_per_compartment/otu_table_mc2_wtax_no_pynast_failures_no_organelles_coral_tissue_mucus_skeleton_only_L6__BiologicalMatter_Coral_Skeleton__.biom
Current OTU table: ../output/coral_only_L6_per_compartment/otu_table_mc2_wtax_no_pynast_failures_no_organelles_coral_tissue_mucus_skeleton_only_L6__BiologicalMatter_Coral_Mucus__.biom
Running regressions for category: prop_Colony_maximum_diameter_universal
Current OTU table: ../output/coral_only_L6_per_compartment/otu_table_mc2_wtax_no_pynast_failures_no_organelles_coral_tissue_mucus_skeleton_only_L6__BiologicalMatter_Coral_Tissue__.biom
Error in observation_metadata_correlation.py: The category (prop_Colony_maximum_diameter_universal) was not found in the mapping