# [Part 2] Create one file for each compartment containing all OTUs present in any core OTU table (at specific levels) with information on presence/absence in each coral genus

# For first part, see [PART 1]: 3c_core_microbiomes_by_compartment_python_v2.2.ipynb

In [8]:
# Move the working directory up one level; the cells below use relative
# paths such as "input/..." and "output/...", which presumably live in that
# parent folder (NOTE(review): confirm the notebook is launched from a
# sub-folder of the project root). Re-running this cell moves up again.
setwd('..')
# Print the resulting working directory as a sanity check
getwd()

In [105]:
# Load packages
library("dplyr")
library("tidyr")


Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



In [35]:
# Create one file for each compartment containing all OTUs present at each specific core OTU level, with information on presence/absence in each coral genus
# Write itol parameter files for each coral genus at each level to show presence/absence on the itol tree

# Split a file path into its components, last component first.
#
# The path is split on "/" and "\" and empty components (from leading,
# trailing or doubled separators) are dropped. Repeated components are kept,
# so positional look-ups stay correct even when a folder name occurs twice.
#
# path: a single character string.
# Returns a character vector: element 1 is the file name, element 2 its
# parent folder, and so on (see split_path(core_file_path)[2] -> genus).
split_path <- function(path) {
    components <- strsplit(path, "/|\\\\")[[1]]
    rev(components[nzchar(components)])
}

# Define the "core" levels you are interested in
levels_to_test <- c(50, 70, 90)
compartments_to_test <- c("mucus", "skeleton", "tissue")

# For every compartment and core level:
#   * find every core_otus_<level>.txt file below the compartment folder
#     (the folder that directly contains a file is used as the genus name),
#   * write one iTOL colour-strip parameter file per genus,
#   * write the de-duplicated list of all OTUs seen ("to keep" file),
#   * write one merged table with a column per genus (missing values: -1).
for (compartment_of_interest in compartments_to_test) {

    path <- paste0("output/compartments/", compartment_of_interest, "_only")

    for (level in levels_to_test) {
        level_file_name <- paste0("core_otus_", level, ".txt")
        file.names <- dir(path, pattern = level_file_name, recursive = TRUE)

        # Folder receiving the per-genus iTOL parameter files
        out_folder <- paste(compartment_of_interest, "core", level,
                            "itol_parameters", sep = "_")
        dir.create(file.path(path, out_folder), showWarnings = FALSE)

        # Name "to keep" parameter out file
        to_keep_file <- paste(compartment_of_interest, "core", level,
                              "otus_to_keep.txt", sep = "_")
        to_keep_file_path <- file.path(path, to_keep_file)
        print(to_keep_file_path)

        # Clear "to keep" file. `blank` is deliberately left in the global
        # environment because later cells of this notebook also read it.
        blank <- NULL
        write(blank, file = to_keep_file_path)

        combined <- NULL
        keep_otus <- NULL

        # One colour per input file. 35 colours are kept as the minimum so
        # existing colour assignments do not change; the palette only grows
        # when there are more files than that (palette[i] would be NA).
        palette <- rainbow(max(35, length(file.names)))
        i <- 0

        for (file in file.names) {
            core_file_path <- file.path(path, file)
            genus <- split_path(core_file_path)[2]

            # Name itol parameter out file
            out_file <- paste(compartment_of_interest, "core", level,
                              "itol_parameters", genus, ".txt", sep = "_")
            itol_out_path <- file.path(path, out_folder, out_file)

            new <- read.table(core_file_path, sep = "\t", header = FALSE,
                              col.names = c("OTU", genus),
                              stringsAsFactors = FALSE)

            # i counts every file (even ones without rows) so that a genus
            # keeps the same colour index regardless of the other files
            i <- i + 1
            if (nrow(new) == 0) {
                next
            }

            # Seed the combined table with the first non-empty file (this
            # used to depend on i == 1 and broke when file 1 had no rows),
            # then merge exactly as before.
            if (is.null(combined)) {
                combined <- new
            }
            combined <- merge(combined, new, all = TRUE)

            # Collect OTUs to keep for tree building, without duplicates
            keep_otus <- unique(c(keep_otus, new$OTU))

            # Export iTOL parameters
            genus_color <- substr(palette[i], 1, 7)
            newer <- paste(new$OTU, genus_color, sep = "\t")
            top <- c("DATASET_COLORSTRIP",
                     "SEPARATOR TAB",
                     paste("DATASET_LABEL", genus, sep = "\t"),
                     paste("COLOR", genus_color, sep = "\t"),
                     "COLOR_BRANCHES\t0",
                     paste("LEGEND_TITLE", genus, sep = "\t"),
                     "LEGEND_SHAPES\t1",
                     paste("LEGEND_COLORS", genus_color, sep = "\t"),
                     paste("LEGEND_LABELS", genus, sep = "\t"),
                     "DATA")
            writeLines(c(top, newer), itol_out_path)
        }

        # Write "to keep" file: always one OTU id per line. write() would
        # put five values on a line whenever the ids were read as numbers.
        writeLines(as.character(keep_otus), to_keep_file_path)

        # Write combined OTU table as tab-separated text
        combined_file_name <- paste0(path, "/", compartment_of_interest,
                                     "_core_", level, "_otus_combined.txt")
        write.table(combined, file = combined_file_name, row.names = FALSE,
                    na = "-1", col.names = TRUE, sep = "\t")
    }
}

[1] "output/compartments/mucus_only/mucus_core_50_otus_to_keep.txt"
[1] "output/compartments/mucus_only/mucus_core_70_otus_to_keep.txt"
[1] "output/compartments/mucus_only/mucus_core_90_otus_to_keep.txt"
[1] "output/compartments/skeleton_only/skeleton_core_50_otus_to_keep.txt"
[1] "output/compartments/skeleton_only/skeleton_core_70_otus_to_keep.txt"
[1] "output/compartments/skeleton_only/skeleton_core_90_otus_to_keep.txt"
[1] "output/compartments/tissue_only/tissue_core_50_otus_to_keep.txt"
[1] "output/compartments/tissue_only/tissue_core_70_otus_to_keep.txt"
[1] "output/compartments/tissue_only/tissue_core_90_otus_to_keep.txt"


# Now, move to [Part 3] in the Jupyter notebook 3c_core_microbiomes_by_compartment_python_v2.2.ipynb to create trees

# [Part 4]

In [47]:
# Get only taxonomy information from TSV version of biom file
#
# Reads the OTU table, keeps the OTU name (first column) and the taxonomy
# string (last column), splits the taxonomy string on "__" into one column
# per rank, fills blank/missing ranks with "<next higher rank>_<rank letter>"
# and writes the result to otu_taxonomy_only.txt.

# Define TSV file location
otu_tsv <- "input/otu_table_mc2_wtax_no_pynast_failures_no_organelles_even1000_TSV.txt"

# Define output file location
output_folder <- "output/itol_parameters_shared/"
dir.create(file.path(output_folder), showWarnings = FALSE)
output_file <- paste0(output_folder, "otu_taxonomy_only.txt")

# Read in the TSV file
full_table <- read.table(otu_tsv, sep = "\t", header = FALSE)

# Copy only first (OTU name) and last (taxonomy) columns
otu_name_tax <- select(full_table, 1, ncol(full_table))

# Add meaningful column names
colnames(otu_name_tax) <- c("OTU", "taxa")

# Rank columns, the letter used for each rank, and the column each rank
# falls back to (Kingdom falls back to the full taxonomy string)
tax_ranks <- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species")
tax_letters <- c("k", "p", "c", "o", "f", "g", "s")
fallback_column <- c("taxa", tax_ranks[-length(tax_ranks)])

# Separate taxa into multiple columns for each taxonomic level
# (the piece in front of the first "__" is only the rank letter: "trash")
otu_name_tax <- separate(otu_name_tax, taxa, into = c("trash", tax_ranks),
                         sep = "__", remove = FALSE, convert = FALSE,
                         extra = "warn", fill = "warn")

# Remove uninformative column
otu_name_tax <- otu_name_tax[, !(names(otu_name_tax) %in% "trash")]

# Remove the "; <rank letter>" left-overs from every column. lapply() keeps
# one list element per column; sapply() returned a plain vector for a
# one-row table, which as.data.frame() then turned into a single column.
for (tax_letter in tax_letters) {
    to_remove <- paste0("; ", tax_letter)
    otu_name_tax[] <- lapply(otu_name_tax, gsub,
                             pattern = to_remove, replacement = "")
}

# Replace empty cells with lowest known taxonomic level
# (Phylum to Species, top-down so that a filled-in parent is reused)
for (r in seq_along(tax_ranks)[-1]) {
    current <- as.character(otu_name_tax[[tax_ranks[r]]])
    parent <- as.character(otu_name_tax[[fallback_column[r]]])
    otu_name_tax[[tax_ranks[r]]] <- ifelse(current == "",
                                           paste(parent, tax_letters[r], sep = "_"),
                                           current)
}

# Replace NA cells with lowest known taxonomic level
# (Kingdom to Species, again top-down)
for (r in seq_along(tax_ranks)) {
    current <- as.character(otu_name_tax[[tax_ranks[r]]])
    parent <- as.character(otu_name_tax[[fallback_column[r]]])
    otu_name_tax[[tax_ranks[r]]] <- ifelse(is.na(current),
                                           paste(parent, tax_letters[r], sep = "_"),
                                           current)
}

# Write to TSV file
write.table(otu_name_tax, file = output_file, row.names = FALSE,
            na = "Unknown", col.names = TRUE, sep = "\t", quote = FALSE)


: Too few values at 7659 locations: 20, 49, 74, 177, 185, 188, 205, 214, 217, 222, 242, 259, 264, 275, 306, 310, 325, 363, 381, 385, ...

In [276]:
# Build taxa colorstrip files to create colored rings around tree showing taxonomy at various taxonomic levels
#
# For each taxonomic level one iTOL DATASET_COLORSTRIP file is written:
# every distinct taxon at that level gets its own rainbow colour, each OTU
# is listed with the colour of its taxon, and the legend lists all taxa.

# Read in taxonomy file
output_folder <- "output/itol_parameters_shared/"
output_file <- paste0(output_folder, "otu_taxonomy_only.txt")
taxa_data <- read.table(output_file, sep = "\t", header = TRUE)

# Define taxonomic levels
taxa_levels <- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species")

# Create output folder
output_taxa_strip_folder <- "output/itol_parameters_shared/taxa_colorstrip"
dir.create(file.path(output_taxa_strip_folder), showWarnings = FALSE)

for (level in taxa_levels) {
    output_taxa_strip_file <- paste(level, "taxa_colorstrip.txt", sep = "_")
    strip_out_path <- file.path(output_taxa_strip_folder, output_taxa_strip_file)

    # Taxon of every OTU and the distinct taxa in order of first appearance
    taxon_of_otu <- as.character(taxa_data[[level]])
    unique_taxa <- unique(taxon_of_otu)
    taxon_colors <- substr(rainbow(length(unique_taxa)), 1, 7)

    # Index of each OTU's taxon; rows are emitted last taxon first (ties
    # keep file order), the same order the old prepend-with-rbind loop gave
    taxon_index <- match(taxon_of_otu, unique_taxa)
    row_order <- order(-taxon_index)
    bottom_out <- paste(taxa_data$OTU[row_order],
                        taxon_colors[taxon_index[row_order]], sep = "\t")

    dataset_name <- paste(level, "taxa_colorstrip", sep = "_")
    top <- c("DATASET_COLORSTRIP",
             "SEPARATOR TAB",
             paste("DATASET_LABEL", dataset_name, sep = "\t"),
             "COLOR\t#ff0000",
             "COLOR_BRANCHES\t1",
             paste("LEGEND_TITLE", dataset_name, sep = "\t"),
             paste(c("LEGEND_SHAPES", rep(1, length(unique_taxa))), collapse = "\t"),
             paste(c("LEGEND_COLORS", taxon_colors), collapse = "\t"),
             paste(c("LEGEND_LABELS", unique_taxa), collapse = "\t"),
             "DATA")

    # Header and data are written in one call, which also replaces any old
    # file; no `blank` object from another cell is needed any more
    writeLines(c(top, bottom_out), strip_out_path)
}

In [277]:
# Build taxa tree_colors files to color OTU tip labels by taxonomy at various taxonomic levels
#
# For each taxonomic level one iTOL TREE_COLORS file is written with one
# "<OTU> label <colour>" line per OTU; every distinct taxon at that level
# gets its own rainbow colour.

# Read in taxonomy file
output_folder <- "output/itol_parameters_shared/"
output_file <- paste0(output_folder, "otu_taxonomy_only.txt")
taxa_data <- read.table(output_file, sep = "\t", header = TRUE)

# Define taxonomic levels
taxa_levels <- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species")

# Create output folder
output_taxa_strip_folder <- "output/itol_parameters_shared/taxa_tip_colors"
dir.create(file.path(output_taxa_strip_folder), showWarnings = FALSE)

for (level in taxa_levels) {
    output_taxa_strip_file <- paste(level, "taxa_tip_colors.txt", sep = "_")
    strip_out_path <- file.path(output_taxa_strip_folder, output_taxa_strip_file)

    # Taxon of every OTU and the distinct taxa in order of first appearance
    taxon_of_otu <- as.character(taxa_data[[level]])
    unique_taxa <- unique(taxon_of_otu)
    taxon_colors <- substr(rainbow(length(unique_taxa)), 1, 7)

    # Rows are emitted last taxon first (ties keep file order), the same
    # order the old prepend-with-rbind loop produced
    taxon_index <- match(taxon_of_otu, unique_taxa)
    row_order <- order(-taxon_index)
    bottom_out <- paste(taxa_data$OTU[row_order], "label",
                        taxon_colors[taxon_index[row_order]], sep = "\t")

    top <- c("TREE_COLORS", "SEPARATOR TAB", "DATA")

    # Header and data are written in one call, which also replaces any old
    # file; no `blank` object from another cell is needed any more
    writeLines(c(top, bottom_out), strip_out_path)
}

In [278]:
# Build taxa label files to rename OTU tip labels by taxonomy at various taxonomic levels
#
# For each taxonomic level one iTOL LABELS file is written with one
# "<OTU> <taxon at that level>" line per OTU.

# Read in taxonomy file
output_folder <- "output/itol_parameters_shared/"
output_file <- paste0(output_folder, "otu_taxonomy_only.txt")
taxa_data <- read.table(output_file, sep = "\t", header = TRUE)

# Define taxonomic levels
taxa_levels <- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species")

# Create output folder
output_taxa_strip_folder <- "output/itol_parameters_shared/taxa_tip_name"
dir.create(file.path(output_taxa_strip_folder), showWarnings = FALSE)

for (level in taxa_levels) {
    output_taxa_strip_file <- paste(level, "taxa_tip_name.txt", sep = "_")
    strip_out_path <- file.path(output_taxa_strip_folder, output_taxa_strip_file)

    # Taxon of every OTU and the distinct taxa in order of first appearance
    taxon_of_otu <- as.character(taxa_data[[level]])
    unique_taxa <- unique(taxon_of_otu)

    # Rows are emitted last taxon first (ties keep file order), the same
    # order the old prepend-with-rbind loop produced
    row_order <- order(-match(taxon_of_otu, unique_taxa))
    bottom_out <- paste(taxa_data$OTU[row_order], taxon_of_otu[row_order],
                        sep = "\t")

    top <- c("LABELS", "SEPARATOR TAB", "DATA")

    # Header and data are written in one call, which also replaces any old
    # file; no `blank` object from another cell is needed any more
    writeLines(c(top, bottom_out), strip_out_path)
}

In [279]:
# Build taxa label files to rename OTU tip labels by taxonomy at various taxonomic levels and include OTU name
#
# For each taxonomic level one iTOL LABELS file is written with one
# "<OTU> <taxon>_<OTU>" line per OTU.

# Read in taxonomy file
output_folder <- "output/itol_parameters_shared/"
output_file <- paste0(output_folder, "otu_taxonomy_only.txt")
taxa_data <- read.table(output_file, sep = "\t", header = TRUE)

# Define taxonomic levels
taxa_levels <- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species")

# Create output folder
output_taxa_strip_folder <- "output/itol_parameters_shared/taxa_tip_name_w_otu_name"
dir.create(file.path(output_taxa_strip_folder), showWarnings = FALSE)

for (level in taxa_levels) {
    output_taxa_strip_file <- paste(level, "taxa_tip_name_w_otu_name.txt", sep = "_")
    strip_out_path <- file.path(output_taxa_strip_folder, output_taxa_strip_file)

    # Taxon of every OTU and the distinct taxa in order of first appearance
    taxon_of_otu <- as.character(taxa_data[[level]])
    unique_taxa <- unique(taxon_of_otu)

    # Rows are emitted last taxon first (ties keep file order), the same
    # order the old prepend-with-rbind loop produced
    row_order <- order(-match(taxon_of_otu, unique_taxa))
    otu_ids <- taxa_data$OTU[row_order]
    new_labels <- paste(taxon_of_otu[row_order], otu_ids, sep = "_")
    bottom_out <- paste(otu_ids, new_labels, sep = "\t")

    top <- c("LABELS", "SEPARATOR TAB", "DATA")

    # Header and data are written in one call, which also replaces any old
    # file; no `blank` object from another cell is needed any more
    writeLines(c(top, bottom_out), strip_out_path)
}

# Compute core vs. variable microbiome

In [330]:
# Python/QIIME was used to convert level-specific biom files to TSV
# Now, we will use these TSV's to calculate proportion core

# Split a file path into its components, last component first.
#
# The path is split on "/" and "\" and empty components (from leading,
# trailing or doubled separators) are dropped. Repeated components are kept,
# so positional look-ups stay correct even when a folder name occurs twice.
#
# path: a single character string.
# Returns a character vector: element 1 is the file name, element 2 its
# parent folder, and so on (see split_path(core_file_path)[2] -> genus).
split_path <- function(path) {
    components <- strsplit(path, "/|\\\\")[[1]]
    rev(components[nzchar(components)])
}

# Define the "core" levels you are interested in
levels_to_test <- c(50, 70, 90)

# Set rarefaction level (the per-sample column totals are divided by this;
# NOTE(review): assumes every sample was rarefied to this many reads)
rarefaction <- 1000

# Define compartments of interest
compartments_to_test <- c("mucus", "skeleton", "tissue")

# Define genera of interest
current_genus <- c('Acanthastrea','Acropora','Alveopora','Astrea','Cyphastrea','Diploastrea','Echinopora','Favia','Favites','Fungia','Galaxea','Goniastrea','Hydnophora','Isopora','Lobophyllia','Merulina','Montipora','Pachyseris','Pavona','Physogyra','Platygyra','Pocillopora','Porites','Psammocora','Seriatopora','Stylophora','Symphyllia','Turbinaria')

# For every compartment and core level, write a table with one row per
# genus: mean and standard deviation (across samples) of the proportion of
# reads that belong to the core table of that genus. A genus without a core
# table file is reported with a proportion of 0.
for (compartment_of_interest in compartments_to_test) {

    path <- paste0("output/compartments/", compartment_of_interest, "_only")

    for (level in levels_to_test) {
        tsv_file_name <- paste0("core_table_", level, "_TSV.txt")

        # One single-row data frame per genus, bound together after the loop
        per_genus <- vector("list", length(current_genus))

        for (g in seq_along(current_genus)) {
            gen <- current_genus[g]
            curr_tsv <- file.path(path, gen, tsv_file_name)

            if (file.exists(curr_tsv)) {
                # First line is skipped and the second is used as header;
                # comment.char = "" keeps a header line that starts with "#"
                core_data_w_tax <- read.table(curr_tsv, sep = "\t", skip = 1,
                                              comment.char = "", header = TRUE,
                                              row.names = 1)
                # Drop the last column (presumably the taxonomy string -
                # confirm). drop = FALSE keeps a data frame even when only
                # one sample column is left.
                core_data <- core_data_w_tax[, seq_len(ncol(core_data_w_tax) - 1),
                                             drop = FALSE]
                # Per-sample total of core reads as a share of all reads
                prop_core <- colSums(core_data) / rarefaction
            } else {
                # Include taxa where no core table exists
                prop_core <- 0
            }

            per_genus[[g]] <- data.frame(
                "genus" = gen,
                "proportion_core_mean" = mean(prop_core),
                # sd() of a single value is NA; written as "0" below
                "proportion_core_standard_deviation" = sd(prop_core),
                "level" = level
            )
        }

        all_summary <- do.call(rbind, per_genus)

        output_file_name <- paste0(compartment_of_interest, "_proportion_core_",
                                   level, ".txt")
        output_file <- file.path(path, output_file_name)
        write.table(all_summary, file = output_file, row.names = FALSE,
                    na = "0", col.names = TRUE, sep = "\t", quote = FALSE)
    }
}
