# Gunc simulated genomes type 3b: analyze stage1 hitlist results

In [1]:
# Move the working directory up one level so that the relative "sandbox/..."
# paths used below resolve (presumably to the project root -- confirm).
# NOTE(review): the target is relative to the current directory, so re-running
# this cell moves up one more level each time.
setwd("..")

In [31]:
library(dplyr)
library(readr)
library(tidyr)
library(ggplot2)

## Read in stage1_hitlist results

In [5]:
# Load the charcoal stage1 hitlists for the type 3b simulated genomes, one per
# GTDB representative database named in the path (rs202 and rs207).
# show_col_types = FALSE silences readr's column-specification message; it is
# spelled out in full because `F` is an ordinary variable that can be reassigned.
stage1_rs202_reps <- read_csv(
  "sandbox/gunc_synthetic_genomes3b/type3b.genomes/outputs/genomes_to_charcoal3_vs_gtdb_rs202_reps/stage1_hitlist.csv",
  show_col_types = FALSE
)
stage1_rs207_reps <- read_csv(
  "sandbox/gunc_synthetic_genomes3b/type3b.genomes/outputs/genomes_to_charcoal3_vs_gtdb_rs207_reps/stage1_hitlist.csv",
  show_col_types = FALSE
)

## Determine the taxonomic level at which contamination is detected

In [13]:
# Annotate the rs207 stage1 hitlist with two derived columns.
# contamination_level: the "control" level of contamination, read from the
#   genome file name (the token that follows the "type3b_" prefix) -- e.g. the
#   lowest common ancestor of the combined genomes.
# level_first_bad_bp: the first taxonomic level, going from phylum down to
#   family, that reports any bad base pairs; "genus" is the fallback label.
first_bad_level <- function(phylum_bp, class_bp, order_bp, family_bp) {
  # "clean" flags: no bad base pairs at this level or at any level above it.
  phylum_clean <- phylum_bp == 0
  class_clean <- phylum_clean & class_bp == 0
  order_clean <- class_clean & order_bp == 0

  # Build the answer from the deepest level outwards; each step only overrides
  # the deeper answer when its own level is the first one with bad base pairs.
  from_family <- ifelse(order_clean & family_bp > 0, "family", "genus")
  from_order <- ifelse(class_clean & order_bp > 0, "order", from_family)
  from_class <- ifelse(phylum_clean & class_bp > 0, "class", from_order)
  ifelse(phylum_bp > 0, "phylum", from_class)
}

stage1_rs207_reps <- mutate(
  stage1_rs207_reps,
  contamination_level = gsub("_.*", "", gsub("type3b_", "", genome)),
  level_first_bad_bp = first_bad_level(
    phylum_bad_bp, class_bad_bp, order_bad_bp, family_bad_bp
  )
)

In [24]:
# Parse the stage1 hitlist.
# level_majority_bad_bp: the highest-ranking taxonomic level at which more than
# `min_bad_bp` bad base pairs were detected, i.e. the level at which the bulk
# of the contamination first shows up.

# Taxonomic levels from highest to lowest rank; each has a "<level>_bad_bp"
# column in the hitlist.
tax_levels <- c("superkingdom", "phylum", "class", "order", "family", "genus")

# Ignore tiny contamination detections of 50k bp or fewer; per the original
# analysis notes, the real contamination should be ~2/3 of the sequence.
min_bad_bp <- 50000

stage1_rs207_reps_majority_bad_bp <- stage1_rs207_reps %>%
  # Name the per-level columns explicitly: this fixes their order (highest rank
  # first) and keeps derived character columns that also end in "bad_bp"
  # (e.g. level_majority_bad_bp from a previous run) out of the pivot.
  select(genome, all_of(paste0(tax_levels, "_bad_bp"))) %>%
  pivot_longer(
    cols = -genome,
    names_to = "level_bad_bp",
    values_to = "base_pairs"
  ) %>%
  filter(base_pairs > min_bad_bp) %>% # keep levels above the noise threshold
  group_by(genome) %>%
  slice_head(n = 1) %>%               # first row = highest-ranking level kept
  ungroup() %>%                       # do not leave the result grouped
  mutate(level_bad_bp = sub("_bad_bp$", "", level_bad_bp)) %>%
  select(genome, level_majority_bad_bp = level_bad_bp)

# Attach the new column; genomes with no level above the threshold get NA.
# Any level_majority_bad_bp column left by a previous run of this cell is
# dropped first so that re-running does not create .x/.y duplicates.
stage1_rs207_reps <- stage1_rs207_reps %>%
  select(-any_of("level_majority_bad_bp")) %>%
  left_join(stage1_rs207_reps_majority_bad_bp, by = "genome")

In [25]:
# Display the annotated rs207 hitlist, including the derived
# contamination_level, level_first_bad_bp and level_majority_bad_bp columns.
stage1_rs207_reps

genome,filter_at,override_filter_at,total_bad_bp,superkingdom_bad_bp,phylum_bad_bp,class_bad_bp,order_bad_bp,family_bad_bp,genus_bad_bp,f_ident,f_major,lineage,comment,contamination_level,level_first_bad_bp,level_majority_bad_bp
<chr>,<chr>,<lgl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<lgl>,<chr>,<chr>,<chr>
type3b_class_3_0.33_0000_1121877.SAMN02745886_.fa,order,,3889773,0,0,3889773,3889773,5875708,5875708,0.398,0.81,d__Bacteria;p__Actinobacteriota;c__Acidimicrobiia;o__Acidimicrobiales,,class,class,class
type3b_phylum_3_0.33_0000_1280.SAMEA862559_.fa,order,,3165896,0,3160237,3160237,3165896,5026635,5026635,0.394,0.747,d__Bacteria;p__Firmicutes;c__Bacilli;o__Staphylococcales,,phylum,phylum,phylum
type3b_phylum_3_0.33_0000_1280.SAMN04018804_.fa,order,,2670457,0,2608461,2608461,2670457,3897576,3897576,0.375,0.871,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales,,phylum,phylum,phylum
type3b_phylum_3_0.33_0000_1313.SAMEA1025997_.fa,order,,2652241,0,2652241,2652241,2652241,3882166,3882166,0.356,0.9,d__Bacteria;p__Actinobacteriota;c__Actinomycetia;o__Mycobacteriales,,phylum,phylum,phylum
type3b_class_3_0.33_0000_1231054.SAMN01730125_.fa,order,,2604069,0,0,2604069,2604069,3925994,3925994,0.372,0.911,d__Bacteria;p__Actinobacteriota;c__Actinomycetia;o__Mycobacteriales,,class,class,class
type3b_order_3_0.33_0000_1050199.SAMN02261318_.fa,order,,2597740,0,0,0,2597740,3740115,3740115,0.653,0.99,d__Bacteria;p__Actinobacteriota;c__Actinomycetia;o__Streptomycetales,,order,order,order
type3b_order_3_0.33_0000_119857.SAMN03773898_.fa,order,,2437998,0,6673,6673,2437998,4049236,4049236,0.396,0.757,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Francisellales,,order,phylum,order
type3b_class_3_0.33_0000_1287313.SAMN02359689_.fa,order,,2405605,0,1347104,2405605,2405605,3517455,3517455,0.403,0.702,d__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhizobiales,,class,phylum,phylum
type3b_order_3_0.33_0000_1648182.SAMN03571527_.fa,order,,2371206,0,0,0,2371206,3535811,3535811,0.346,0.993,d__Bacteria;p__Actinobacteriota;c__Actinomycetia;o__Streptomycetales,,order,order,order
type3b_order_3_0.33_0000_1158154.SAMN02441053_.fa,order,,2329797,0,23485,26009,2329797,3752446,3752446,0.42,0.712,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Pseudomonadales,,order,phylum,order


In [26]:
# Tally genomes by whether the first level reporting any bad base pairs equals
# the designed contamination level.
with(stage1_rs207_reps, table(contamination_level == level_first_bad_bp))


FALSE  TRUE 
   42    18 

In [27]:
# Tally genomes by whether the level holding the bulk of the bad base pairs
# equals the designed contamination level.
with(stage1_rs207_reps, table(contamination_level == level_majority_bad_bp))


FALSE  TRUE 
   31    29 

Charcoal doesn't really work for contamination detection below the family/order level. 
This might be because there are too many shared k-mers? ASK TESSA

In [28]:
# Set aside the genomes whose designed contamination level is genus or specI,
# keeping only the higher-ranking levels for the comparisons that follow.
stage1_rs207_reps_filt <- filter(
  stage1_rs207_reps,
  !(contamination_level %in% c("genus", "specI"))
)

In [29]:
# Repeat the designed-vs-majority level comparison on the filtered genomes.
with(
  stage1_rs207_reps_filt,
  table(contamination_level == level_majority_bad_bp)
)


FALSE  TRUE 
   11    29 

## Dig into the 11 that don't match

**Hypotheses as to what is happening**

1. GTDB taxonomy is different than the NCBI taxonomy, so the majority contamination level is accurate but doesn't match the declared level. 
   - CHECK: the taxonomy of the combined genomes in GTDB. Makes the most sense for contams that are only one level off from where they are supposed to be
2. The progenomes2.1 genomes used to build these simulated genomes are contaminated. 
   - not really sure what to check here

In [33]:
# Show the filtered genomes whose majority level differs from the designed
# contamination level.
stage1_rs207_reps_filt[
  with(stage1_rs207_reps_filt, contamination_level != level_majority_bad_bp),
]

genome,filter_at,override_filter_at,total_bad_bp,superkingdom_bad_bp,phylum_bad_bp,class_bad_bp,order_bad_bp,family_bad_bp,genus_bad_bp,f_ident,f_major,lineage,comment,contamination_level,level_first_bad_bp,level_majority_bad_bp
<chr>,<chr>,<lgl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<lgl>,<chr>,<chr>,<chr>
type3b_class_3_0.33_0000_1287313.SAMN02359689_.fa,order,,2405605,0,1347104,2405605,2405605,3517455,3517455,0.403,0.702,d__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhizobiales,,class,phylum,phylum
type3b_class_3_0.33_0000_1160201.SAMN00811184_.fa,order,,2150493,0,1150803,2142891,2150493,3381158,3381158,0.406,0.759,d__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhizobiales_A,,class,phylum,phylum
type3b_class_3_0.33_0000_1123249.SAMN02440643_.fa,order,,2115170,0,2110752,2110752,2115170,3335498,3335498,0.377,0.892,d__Bacteria;p__Firmicutes_C;c__Negativicutes;o__Selenomonadales,,class,phylum,phylum
type3b_family_3_0.33_0000_1502.SAMN05323905_.fa,order,,1745042,0,9598,9598,1745042,2948860,2948860,0.436,0.673,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Peptostreptococcales,,family,phylum,order
type3b_family_3_0.33_0000_1428.SAMN04278951_.fa,order,,1608222,0,3232,3232,1608222,2487568,2487568,0.364,0.931,d__Bacteria;p__Firmicutes;c__Bacilli;o__Staphylococcales,,family,phylum,order
type3b_class_3_0.33_0000_1149862.SAMN01057330_.fa,order,,1416265,0,1416265,1416265,1416265,2084567,2084567,0.367,0.845,d__Bacteria;p__Firmicutes_C;c__Negativicutes;o__Propionisporales,,class,phylum,phylum
type3b_family_3_0.33_0000_1433.SAMN05934573_.fa,order,,1374471,0,0,0,1374471,2135877,2135877,0.377,0.863,d__Bacteria;p__Firmicutes;c__Bacilli;o__Staphylococcales,,family,order,order
type3b_class_3_0.33_0000_1298595.SAMD00000528_.fa,order,,934477,0,934477,934477,934477,1526760,1526760,0.414,0.822,d__Bacteria;p__Firmicutes_C;c__Negativicutes;o__Veillonellales,,class,phylum,phylum
type3b_class_3_0.33_0000_1122947.SAMN01057329_.fa,order,,892586,0,892586,892586,892586,1465097,1465097,0.425,0.778,d__Bacteria;p__Firmicutes_C;c__Negativicutes;o__Propionisporales,,class,phylum,phylum
type3b_class_3_0.33_0000_1505.SAMEA1572114_.fa,order,,888160,0,528434,528434,888160,1527073,1527073,0.432,0.777,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Tissierellales,,class,phylum,phylum


TL;DR: this was followed up in other notebooks; when the contigs are re-assigned taxonomy via GTDB, everything meshes the way it's supposed to.