# Parsing charcoal output files into useful dataframe formats

In [58]:
library(jsonlite)
library(dplyr)
library(tidyr)
library(tibble)
library(stringr)

In [4]:
setwd("..")

## Parsing input file types

### Try with `stage2/*matches.json` at species level

In [33]:
read_matches_json <- function(matches_json_path){
  read_json(matches_json_path, simplifyVector = F) %>%
    as.data.frame() %>%
    mutate(across(everything(), as.character)) %>%
    pivot_longer(cols = !starts_with("query_info"), names_to = "name", values_to = "value") %>%
    separate(col = name, into = c("match", "match_accession", "version", "name"), sep = "\\.", remove = T) %>%
    mutate(accession = paste(match_accession, version, sep = ".")) %>%
    select(-version, -match) %>%
    pivot_wider(id_cols = c("query_info.genome", "query_info.genome_lineage", "query_info.match_rank", 
                            "query_info.scaled", "match_accession"), names_from = name, values_from = value) %>%
    mutate(counts = as.numeric(counts),
           query_info.scaled = as.numeric(query_info.scaled)) %>%
    arrange(match_type, desc(counts)) %>%
    separate(col = lineage, into = c("domain", "phylum", "class", "order", "family", "genus", "species"), sep = ';')
}

In [85]:
read_matches_json("~/Downloads/output.ibd2/stage2/SRS104400_110.fna.gz.matches.json")

query_info.genome,query_info.genome_lineage,query_info.match_rank,query_info.scaled,match_accession,domain,phylum,class,order,family,genus,species,match_type,counts
<chr>,<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>
SRS104400_110.fna.gz,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Lachnospirales,order,1000,GCA_900316325,d__Bacteria,p__Firmicutes_A,c__Clostridia,o__Lachnospirales,f__Lachnospiraceae,g__Lachnospira,s__Lachnospira sp900316325,clean,1350
SRS104400_110.fna.gz,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Lachnospirales,order,1000,GCF_000020605,d__Bacteria,p__Firmicutes_A,c__Clostridia,o__Lachnospirales,f__Lachnospiraceae,g__Agathobacter,s__Agathobacter rectalis,clean,96
SRS104400_110.fna.gz,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Lachnospirales,order,1000,GCF_003480145,d__Bacteria,p__Firmicutes_A,c__Clostridia,o__Lachnospirales,f__Lachnospiraceae,g__Blautia_A,s__Blautia_A sp900066165,clean,84
SRS104400_110.fna.gz,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Lachnospirales,order,1000,GCF_000156035,d__Bacteria,p__Firmicutes_A,c__Clostridia,o__Lachnospirales,f__Lachnospiraceae,g__Faecalimonas,s__Faecalimonas nexilis,clean,40
SRS104400_110.fna.gz,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Lachnospirales,order,1000,GCF_001405555,d__Bacteria,p__Firmicutes_A,c__Clostridia,o__Lachnospirales,f__Lachnospiraceae,g__Fusicatenibacter,s__Fusicatenibacter saccharivorans,clean,34
SRS104400_110.fna.gz,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Lachnospirales,order,1000,GCA_000210015,d__Bacteria,p__Firmicutes_A,c__Clostridia,o__Lachnospirales,f__Lachnospiraceae,g__Blautia_A,s__Blautia_A obeum_B,clean,29
SRS104400_110.fna.gz,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Lachnospirales,order,1000,GCA_900317505,d__Bacteria,p__Firmicutes_A,c__Clostridia,o__Lachnospirales,f__Lachnospiraceae,g__UBA2882,s__UBA2882 sp900317505,clean,29
SRS104400_110.fna.gz,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Lachnospirales,order,1000,GCA_900543865,d__Bacteria,p__Firmicutes_A,c__Clostridia,o__Lachnospirales,f__Lachnospiraceae,g__Butyrivibrio_A,s__Butyrivibrio_A sp900543865,clean,24
SRS104400_110.fna.gz,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Lachnospirales,order,1000,GCF_000153905,d__Bacteria,p__Firmicutes_A,c__Clostridia,o__Lachnospirales,f__Lachnospiraceae,g__Blautia_A,s__Blautia_A obeum,clean,22
SRS104400_110.fna.gz,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Lachnospirales,order,1000,GCA_900557055,d__Bacteria,p__Firmicutes_A,c__Clostridia,o__Lachnospirales,f__Lachnospiraceae,g__Agathobacter,s__Agathobacter sp900557055,clean,21


In [31]:
# when run on GTDB, need to join with GTDB metadata to get full query lineage not truncated to order

In [86]:
# combine contaminants to species level instead of genome level
matches <- read_matches_json("~/Downloads/output.ibd2/stage2/SRS104400_110.fna.gz.matches.json")

In [88]:
matches %>%
  group_by(query_info.genome, query_info.genome_lineage, query_info.match_rank, query_info.scaled, 
           domain, phylum, class, order, family, genus, species, match_type, counts) %>%
  summarize(species_counts = sum(counts)) %>%
  arrange(match_type)

`summarise()` has grouped output by 'query_info.genome', 'query_info.genome_lineage', 'query_info.match_rank', 'query_info.scaled', 'domain', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'match_type'. You can override using the `.groups` argument.



query_info.genome,query_info.genome_lineage,query_info.match_rank,query_info.scaled,domain,phylum,class,order,family,genus,species,match_type,counts,species_counts
<chr>,<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>
SRS104400_110.fna.gz,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Lachnospirales,order,1000,d__Bacteria,p__Firmicutes_A,c__Clostridia,o__Lachnospirales,f__Anaerotignaceae,g__Anaerotignum,s__Anaerotignum lactatifermentans,clean,6,6
SRS104400_110.fna.gz,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Lachnospirales,order,1000,d__Bacteria,p__Firmicutes_A,c__Clostridia,o__Lachnospirales,f__Lachnospiraceae,g__Acetatifactor,s__Acetatifactor sp900066565,clean,8,8
SRS104400_110.fna.gz,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Lachnospirales,order,1000,d__Bacteria,p__Firmicutes_A,c__Clostridia,o__Lachnospirales,f__Lachnospiraceae,g__AF33-28,s__AF33-28 sp003477885,clean,7,7
SRS104400_110.fna.gz,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Lachnospirales,order,1000,d__Bacteria,p__Firmicutes_A,c__Clostridia,o__Lachnospirales,f__Lachnospiraceae,g__Agathobacter,s__Agathobacter faecis,clean,3,3
SRS104400_110.fna.gz,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Lachnospirales,order,1000,d__Bacteria,p__Firmicutes_A,c__Clostridia,o__Lachnospirales,f__Lachnospiraceae,g__Agathobacter,s__Agathobacter rectalis,clean,96,96
SRS104400_110.fna.gz,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Lachnospirales,order,1000,d__Bacteria,p__Firmicutes_A,c__Clostridia,o__Lachnospirales,f__Lachnospiraceae,g__Agathobacter,s__Agathobacter sp900317585,clean,12,12
SRS104400_110.fna.gz,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Lachnospirales,order,1000,d__Bacteria,p__Firmicutes_A,c__Clostridia,o__Lachnospirales,f__Lachnospiraceae,g__Agathobacter,s__Agathobacter sp900546625,clean,14,14
SRS104400_110.fna.gz,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Lachnospirales,order,1000,d__Bacteria,p__Firmicutes_A,c__Clostridia,o__Lachnospirales,f__Lachnospiraceae,g__Agathobacter,s__Agathobacter sp900547695,clean,4,4
SRS104400_110.fna.gz,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Lachnospirales,order,1000,d__Bacteria,p__Firmicutes_A,c__Clostridia,o__Lachnospirales,f__Lachnospiraceae,g__Agathobacter,s__Agathobacter sp900550545,clean,3,3
SRS104400_110.fna.gz,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Lachnospirales,order,1000,d__Bacteria,p__Firmicutes_A,c__Clostridia,o__Lachnospirales,f__Lachnospiraceae,g__Agathobacter,s__Agathobacter sp900557055,clean,21,21


In [90]:
matches %>%
  group_by(order, match_type) %>%
  summarize(order_counts = sum(counts)) %>%
  arrange(desc(order_counts))

`summarise()` has grouped output by 'order'. You can override using the `.groups` argument.



order,match_type,order_counts
<chr>,<chr>,<dbl>
o__Lachnospirales,clean,2135
o__Bacteroidales,dirty,838
o__Oscillospirales,dirty,402
o__Monoglobales,dirty,79
o__Peptostreptococcales,dirty,31
o__Christensenellales,dirty,29
o__RFN20,dirty,18
o__Saccharofermentanales,dirty,8
o__UMGS1840,dirty,3


### Try with `stage1/*contam_summary.json` at order level

In [56]:
read_contam_summary <- function(contam_summary_path){
  json <- fromJSON(contam_summary_path)
  all_contam_summary <- data.frame()
  for(i in 1:length(json[[1]])){
    genome_taxonomy <- unlist(json[[1]][[i]][1])
    genome_taxonomy <- genome_taxonomy[5:8]
    genome_taxonomy <- paste(genome_taxonomy, collapse = ";")
    contig_taxonomy <- unlist(json[[1]][[i]][2])
    contig_taxonomy <- contig_taxonomy[5:8]
    contig_taxonomy <- paste(contig_taxonomy, collapse = ";")
    counts <- unlist(json[[1]][[i]][3])
    contam_summary <- data.frame(genome_taxonomy, contig_taxonomy, counts)
    all_contam_summary <- bind_rows(all_contam_summary, contam_summary)
  }
  return(all_contam_summary)
}

In [71]:
read_contam_summary('~/Downloads/output.ibd2/stage1/SRS104400_110.fna.gz.contam_summary.json')

genome_taxonomy,contig_taxonomy,counts
<chr>,<chr>,<int>
d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Lachnospirales,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Lachnospirales,7
d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Lachnospirales,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Lachnospirales,4
d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Lachnospirales,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Lachnospirales,3
d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Lachnospirales,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Lachnospirales,10
d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Lachnospirales,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales,6
d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Lachnospirales,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Lachnospirales,12
d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Lachnospirales,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Lachnospirales,12
d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Lachnospirales,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Lachnospirales,3
d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Lachnospirales,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Lachnospirales,6
d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Lachnospirales,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Oscillospirales,6


### Try with `stage1/*contigs-tax.json` at order level

In [99]:
read_contigs_tax <- function(contigs_tax_path){  
  json <- fromJSON(contigs_tax_path)
  contig_tax_all <- data.frame()
  for(i in 1:length(json)){
    contig_name <- names(json)[i]
    basepairs <- json[[i]][[1]]
    hashes <- json[[i]][[2]]
    if(length(json[[i]][[3]]) > 0){
      lineage <- json[[i]][[3]][[1]][[1]][,2]
      lineage = paste(lineage, collapse = ";", sep = ";")
      matched_hashes <- json[[i]][[3]][[1]][[2]]
    } else {
      lineage = NA
      matched_hashes = NA
    }
    contig_tax <- data.frame(contig_name, basepairs, hashes, lineage, matched_hashes)
    contig_tax_all <- bind_rows(contig_tax_all, contig_tax)
  }
   contig_tax_all$genome <- gsub(".contigs-tax.json", "", basename(contigs_tax_path))
   return(contig_tax_all)
}

In [100]:
tmp <- read_contigs_tax('~/Downloads/output.ibd2/stage1/SRS104400_110.fna.gz.contigs-tax.json')

In [101]:
head(tmp)

Unnamed: 0_level_0,contig_name,basepairs,hashes,lineage,matched_hashes,genome
Unnamed: 0_level_1,<chr>,<int>,<int>,<chr>,<int>,<chr>
1,SRS104400|110|k99_206,20198,12,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Lachnospirales,7,SRS104400_110.fna.gz
2,SRS104400|110|k99_217,6176,5,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Lachnospirales,4,SRS104400_110.fna.gz
3,SRS104400|110|k99_443,5932,5,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Lachnospirales,3,SRS104400_110.fna.gz
4,SRS104400|110|k99_515,18607,15,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Lachnospirales,10,SRS104400_110.fna.gz
5,SRS104400|110|k99_563,4232,6,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales,6,SRS104400_110.fna.gz
6,SRS104400|110|k99_694,2678,2,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Oscillospirales,1,SRS104400_110.fna.gz


In [92]:
tmp %>%
  group_by(lineage) %>%
  summarize(order_counts = sum(as.numeric(num3))) %>%
  arrange(desc(order_counts))

lineage,order_counts
<chr>,<dbl>
d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Lachnospirales,2317.0
d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales,976.0
d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Oscillospirales,504.0
d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Monoglobales,79.0
d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Peptostreptococcales,32.0
d__Bacteria;p__Firmicutes_A;c__Clostridia_A;o__Christensenellales,31.0
d__Bacteria;p__Firmicutes;c__Bacilli;o__RFN20,18.0
d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Saccharofermentanales,11.0
d__Bacteria;p__Firmicutes_C;c__Negativicutes;o__Veillonellales,5.0
d__Bacteria;p__Firmicutes_A;c__Clostridia;o__UMGS1840,3.0


### Parsing summary
`stage1/*contigs-tax.json` contains counts at the order level, but `stage2/*matches.json` is the only file that holds species-lineage information