# Gunc simulated genomes type 3b: determine appropriate taxonomic level for each contiguous sequence in synthetic genomes

The GUNC synthetic genomes were simulated from real genomes, using the NCBI taxonomy to determine the taxonomic level at which the contamination deviated from the dominant lineage. 
Since we're using the GTDB taxonomy, this notebook determines the taxonomic level at which the contaminant genomes differ from the dominant genome within the GTDB taxonomy.

The output is saved as a tsv file to `sandbox/gunc_synthetic_genomes3b/type3b.genomes/genome_contamination_level.tsv`.

In [1]:
# move up one directory so the relative paths used below ("inputs/...",
# "sandbox/...") resolve
# NOTE(review): presumably the notebook sits one level below the project root -- confirm
setwd("..")

In [3]:
library(dplyr)
library(readr)
library(tidyr)
library(purrr)
library(rentrez)
library(ggplot2)

## obtain and read in GTDB rs207 metadata

In [4]:
# download the GTDB release 207 metadata tarballs (bac120 and ar53) and unpack
# them into inputs/gtdb-rs207; each download and each extraction is skipped
# when its target file is already on disk

# make sure the target directory exists, otherwise download.file() fails
dir.create("inputs/gtdb-rs207", recursive = TRUE, showWarnings = FALSE)

destfile <- "inputs/gtdb-rs207/bac120_metadata_rs207.tar.gz"
url <- "https://data.gtdb.ecogenomic.org/releases/release207/207.0/bac120_metadata_r207.tar.gz"
if (!file.exists(destfile)) {
  download.file(url, destfile, method = "auto")
}
outfile <- "inputs/gtdb-rs207/bac120_metadata_r207.tsv"
if (!file.exists(outfile)) {
  untar(destfile, exdir = "inputs/gtdb-rs207")
}

destfile <- "inputs/gtdb-rs207/ar53_metadata_rs207.tar.gz"
url <- "https://data.gtdb.ecogenomic.org/releases/release207/207.0/ar53_metadata_r207.tar.gz"
if (!file.exists(destfile)) {
  download.file(url, destfile, method = "auto")
}
# the extracted table lives in the same directory the archive is unpacked to
# (inputs/gtdb-rs207); checking any other path re-extracts on every run
outfile <- "inputs/gtdb-rs207/ar53_metadata_r207.tsv"
if (!file.exists(outfile)) {
  untar(destfile, exdir = "inputs/gtdb-rs207")
}

In [5]:
# read the ar53 and bac120 GTDB metadata tables with one shared recipe and
# stack them (ar53 rows first, then bac120 rows)
# for each table:
#   - ncbi_database: "RefSeq" if the accession contains "RS", else "GenBank"
#     if it contains "GB", else "none"
#   - accession: leading "RS_" / "GB_" removed
#   - accession_pruned: accession without the "GCF_" / "GCA_" part and with
#     every "dot + one character" stretch removed
# gtdb_taxonomy is kept as a single ";"-delimited string (not split into ranks)
gtdb_metadata_rs207 <- c("inputs/gtdb-rs207/ar53_metadata_r207.tsv",
                         "inputs/gtdb-rs207/bac120_metadata_r207.tsv") %>%
  map(function(metadata_path) {
    read_tsv(metadata_path, show_col_types = FALSE) %>%
      mutate(ncbi_database = case_when(grepl(pattern = "RS", accession) ~ "RefSeq",
                                       grepl(pattern = "GB", accession) ~ "GenBank",
                                       TRUE ~ "none")) %>%
      mutate(accession = gsub("^GB_", "", gsub("^RS_", "", accession)),
             accession_pruned = gsub("\\..", "", gsub("GC[FA]_", "", accession)))
  }) %>%
  bind_rows()

“One or more parsing issues, see `problems()` for details”


## read in and parse faidx files

In [6]:
# columns of a .fai index, in file order:
#   contig_name  name of this reference sequence
#   length       total length of this reference sequence, in bases
#   offset       offset in the FASTA/FASTQ file of this sequence's first base
#   linebases    the number of bases on each line
#   linewidth    the number of bytes in each line, including the newline
#   qualoffset   offset of sequence's first quality within the FASTQ file

# read every .fai file, remember which file each row came from, and derive
# the genome name and clean/dirty label from that file name
faidx_rs207_reps <- Sys.glob("sandbox/gunc_synthetic_genomes3b/type3b.genomes/outputs/genomes_to_charcoal3_vs_gtdb_rs207_reps/*fai") %>%
  set_names() %>%
  map_dfr(function(fai_path) {
            read_tsv(fai_path,
                     show_col_types = FALSE,
                     col_names = c("contig_name", "length", "offset", "linebases", "linewidth", "qualoffset"))
          },
          .id = "filename") %>%
  mutate(genome = gsub("_\\.fa.*", "", basename(filename)),
         type = if_else(grepl("clean", x = filename), "clean", "dirty")) %>%
  select(genome, type, contig_name, length) %>%
  # contig names are split on "." into three pieces
  separate(contig_name, into = c("num", "genome_accession", "contig"), sep = "\\.")

## convert biosample ID to genbank/refseq accession

In [7]:
# get conversion between biosample accession and assembly accession

# look up one biosample accession in the NCBI "assembly" database and return a
# one-row data frame (biosample, assembly); returns NULL, with a warning, when
# the search finds nothing so that one missing record does not abort the run
lookup_assembly_accession <- function(biosample_accession) {
  search_hits <- entrez_search(db = "assembly", term = biosample_accession)
  if (length(search_hits$ids) == 0) {
    warning("no assembly record found for ", biosample_accession, call. = FALSE)
    return(NULL)
  }
  # always_return_list = TRUE yields a list of records whether the search hit
  # one id or several, so no guessing from the length of the result is needed
  summaries <- entrez_summary(db = "assembly", id = search_hits$ids,
                              always_return_list = TRUE)
  # when a biosample maps to several assemblies, keep the first record
  data.frame(biosample = biosample_accession,
             assembly = summaries[[1]]$assemblyaccession)
}

# contigs whose name could not be split have no accession to look up
biosample_accessions <- unique(faidx_rs207_reps$genome_accession)
biosample_accessions <- biosample_accessions[!is.na(biosample_accessions)]

# start from a typed empty frame so both columns exist even with zero hits
biosample_to_assembly_df <- bind_rows(
  data.frame(biosample = character(), assembly = character()),
  map(biosample_accessions, lookup_assembly_accession)
)

In [8]:
# join to gtdb lineage information

# reduce an accession to its numeric core: drop an optional leading RS_/GB_,
# the GCA_/GCF_ prefix, and the whole trailing ".<version>" (any number of
# digits), so GenBank/RefSeq twins and different versions share one key
prune_accession <- function(accession) {
  gsub("\\.[0-9]+$", "", gsub("^(RS_|GB_)?GC[AF]_", "", accession))
}

faidx_rs207_reps_annotated <- faidx_rs207_reps %>%
  left_join(biosample_to_assembly_df, by = c("genome_accession" = "biosample")) %>%
  mutate(assembly_pruned = prune_accession(assembly)) %>%
  # derive the GTDB-side key here with the same rule, so both sides of the
  # join are guaranteed to be pruned identically
  left_join(mutate(gtdb_metadata_rs207, accession_pruned = prune_accession(accession)),
            by = c("assembly_pruned" = "accession_pruned")) %>%
  # the token after "type3b_" in the genome name is the contamination level
  # the genome was simulated at
  mutate(contamination_level = gsub("type3b_", "", genome),
         contamination_level = gsub("_.*", "", contamination_level)) %>%
  rename(contig_lineage = gtdb_taxonomy)

In [9]:
# attach the dominant lineage of each genome, taken from the rs207 lineages
# csv, so every contig row carries both its own lineage (contig_lineage) and
# the lineage assigned to the genome as a whole (dom_lineage)

# dom_lineage is the seven ranks pasted into one ";"-delimited string; ident
# has "_.fa" removed so it lines up with the genome column
lineages_rs207 <- read_csv("sandbox/gunc_synthetic_genomes3b/type3b.genomes/genomes_to_charcoal3_rs207_lineages.csv", show_col_types = FALSE) %>%
  mutate(ident = gsub("_\\.fa", "", ident),
         dom_lineage = paste(superkingdom, phylum, class, order, family, genus, species, sep = ";")) %>%
  select(ident, dom_lineage)

faidx_rs207_reps_annotated <- left_join(faidx_rs207_reps_annotated,
                                        lineages_rs207,
                                        by = c("genome" = "ident"))

## Re-infer the taxonomic level at which contamination diverges, based on GTDB 

I think the contamination levels were previously set based on the NCBI taxonomy. Re-derive them from the GTDB taxonomy so they are consistent with the other analyses.

In [10]:
# for every distinct (genome, dominant lineage, contig lineage) combination,
# find the highest rank -- phylum down to species -- at which the two lineages
# disagree; pairs that agree at every compared rank get "dom_lineage"
# (domain is split out but not compared)
reset_contam_level1 <- local({
  compared_ranks <- c("phylum", "class", "order", "family", "genus", "species")
  all_ranks <- c("domain", compared_ranks)

  lineage_pairs <- faidx_rs207_reps_annotated %>%
    select(genome, contamination_level, dom_lineage, contig_lineage) %>%
    distinct() %>%
    separate(dom_lineage, into = paste0("dom_", all_ranks), sep = ";") %>%
    separate(contig_lineage, into = paste0("contig_", all_ranks), sep = ";")

  # walk from species up to phylum so a mismatch at a higher rank overwrites
  # whatever was decided at the ranks below it
  first_mismatch <- "dom_lineage"
  for (rank_name in rev(compared_ranks)) {
    differs <- lineage_pairs[[paste0("dom_", rank_name)]] !=
      lineage_pairs[[paste0("contig_", rank_name)]]
    first_mismatch <- ifelse(differs, rank_name, first_mismatch)
  }

  lineage_pairs %>%
    mutate(gtdb_contamination_level = first_mismatch) %>%
    group_by(genome)
})

In [11]:
# keep one row per genome: the highest-ranking GTDB level at which any of its
# contig lineages departs from the dominant lineage
reset_contam_level2 <- reset_contam_level1 %>%
  select(genome, contamination_level, gtdb_contamination_level) %>%
  # drop the rows where the contig lineage equals the dominant lineage
  filter(gtdb_contamination_level != "dom_lineage") %>%
  group_by(genome) %>%
  # sort each genome's rows from phylum (highest) to species (lowest)
  arrange(genome,
          match(gtdb_contamination_level,
                c("phylum", "class", "order", "family", "genus", "species"))) %>%
  # the first row per genome is then its highest-ranking difference
  slice(1)

reset_contam_level2

genome,contamination_level,gtdb_contamination_level
<chr>,<chr>,<chr>
type3b_class_3_0.33_0000_1121877.SAMN02745886,class,class
type3b_class_3_0.33_0000_1122947.SAMN01057329,class,phylum
type3b_class_3_0.33_0000_1123249.SAMN02440643,class,phylum
type3b_class_3_0.33_0000_1149862.SAMN01057330,class,phylum
type3b_class_3_0.33_0000_1160201.SAMN00811184,class,phylum
type3b_class_3_0.33_0000_1231054.SAMN01730125,class,class
type3b_class_3_0.33_0000_1231376.SAMN02469699,class,class
type3b_class_3_0.33_0000_1287313.SAMN02359689,class,phylum
type3b_class_3_0.33_0000_1298595.SAMD00000528,class,phylum
type3b_class_3_0.33_0000_1505.SAMEA1572114,class,phylum


In [12]:
# do we get the 11 genomes back where we found a mismatch before?
# genus- and specI-level genomes are left out for now (we don't expect to do
# well there anyway); what remains are the genomes whose simulated level and
# GTDB-derived level disagree
reset_contam_level2 %>%
  filter(!(contamination_level %in% c("genus", "specI")),
         contamination_level != gtdb_contamination_level)

genome,contamination_level,gtdb_contamination_level
<chr>,<chr>,<chr>
type3b_class_3_0.33_0000_1122947.SAMN01057329,class,phylum
type3b_class_3_0.33_0000_1123249.SAMN02440643,class,phylum
type3b_class_3_0.33_0000_1149862.SAMN01057330,class,phylum
type3b_class_3_0.33_0000_1160201.SAMN00811184,class,phylum
type3b_class_3_0.33_0000_1287313.SAMN02359689,class,phylum
type3b_class_3_0.33_0000_1298595.SAMD00000528,class,phylum
type3b_class_3_0.33_0000_1505.SAMEA1572114,class,phylum
type3b_family_3_0.33_0000_1278077.SAMN02469985,family,genus
type3b_family_3_0.33_0000_1428.SAMN04278951,family,order
type3b_family_3_0.33_0000_1433.SAMN05934573,family,order


In [14]:
# save the per-genome table (genome, contamination_level,
# gtdb_contamination_level) as a tab-separated file
write_tsv(reset_contam_level2, "sandbox/gunc_synthetic_genomes3b/type3b.genomes/genome_contamination_level.tsv")