Permalink
Please sign in to comment.
Showing
with
72 additions
and 874 deletions.
- +46 −0 .gitignore
- +0 −93 new-testing/analyze.R
- +0 −240 new-testing/lib/compare.py
- +0 −74 new-testing/lib/read-sim.R
- +11 −0 {new-testing → testing}/README.md
- 0 {new-testing → testing}/analysis.R
- 0 {new-testing → testing}/btrim_adapters.fa
- +0 −147 testing/compare.py
- +0 −16 testing/cutadapt/parse_results.R
- +0 −20 testing/cutadapt/run-all.sh
- +0 −7 testing/cutadapt/run-diff-error.sh
- +1 −1 {new-testing → testing}/generate-reads.R
- +0 −19 testing/process_results.R
- +0 −21 testing/read-sampler.py
- +6 −4 testing/read-sim.R
- 0 {new-testing → testing}/reads.fastq
- +0 −125 testing/results_parse.py
- +8 −8 {new-testing → testing}/run.sh
- +0 −70 testing/run_tests.R
- +0 −16 testing/scythe/parse_results.R
- +0 −13 testing/scythe/run-diff-contam.sh
46
.gitignore
| @@ -0,0 +1,46 @@ | ||
| +# History files | ||
| +.Rhistory | ||
| + | ||
| +# Example code in package build process | ||
| +*-Ex.R | ||
| + | ||
| +*~ | ||
| +\#*\# | ||
| +/.emacs.desktop | ||
| +/.emacs.desktop.lock | ||
| +.elc | ||
| +auto-save-list | ||
| +tramp | ||
| +.\#* | ||
| + | ||
| +# Org-mode | ||
| +.org-id-locations | ||
| +*_archive | ||
| + | ||
| +.DS_Store | ||
| + | ||
| +# Thumbnails | ||
| +._* | ||
| + | ||
| +# Files that might appear on external disk | ||
| +.Spotlight-V100 | ||
| +.Trashes | ||
| + | ||
| + | ||
| +# Object files | ||
| +*.o | ||
| + | ||
| +# Libraries | ||
| +*.lib | ||
| + | ||
| +# Shared objects (inc. Windows DLLs) | ||
| +*.dll | ||
| +*.so | ||
| + | ||
| +# Executables | ||
| +*.exe | ||
| +*.out | ||
| + | ||
| +scythe | ||
| + | ||
| +*.gch |
| @@ -1,93 +0,0 @@ | ||
| -## analyze.R -- analyze the results from trimming of simulated reads. | ||
| - | ||
| - | ||
| -## In the cross-tool comparison, we are using 0.4 contamination | ||
| -scythe.dir <- "scythe/results/0.4/" | ||
| -cutadapt.dir <- "cutadapt/results/0.4/" | ||
| -btrim.dir <- "btrim/results/0.4/" | ||
| - | ||
| -scythe.result.files <- list.files(scythe.dir, pattern="results-.*\\.txt") | ||
| -cutadapt.result.files <- list.files(cutadapt.dir, pattern="results-.*\\.txt") | ||
| -btrim.result.files <- list.files(btrim.dir, pattern="results-.*\\.txt") | ||
| - | ||
| -tmp.d <- lapply(scythe.result.files, function(f) { | ||
| - stopifnot(file.exists(file.path(cutadapt.dir, f))) ## should have corresponding files | ||
| - tmp <- gsub("results-(\\d+)-(0.\\d+).txt", "\\1;;;\\2", f) | ||
| - tmp <- unlist(strsplit(tmp, ";;;")) | ||
| - rep <- as.numeric(tmp[1]) | ||
| - contam <- as.numeric(tmp[2]) | ||
| - d.scythe <- read.table(file.path(scythe.dir, f), sep="\t", col.names=c("name", "value"), stringsAsFactors=FALSE) | ||
| - d.cutadapt <- read.table(file.path(cutadapt.dir, f), sep="\t", col.names=c("name", "value"), stringsAsFactors=FALSE) | ||
| - tmp <- rbind(cbind(d.cutadapt, rep, contam, trimmer="cutadapt"), cbind(d.scythe, rep, contam, trimmer="scythe")) | ||
| - tmp | ||
| -}) | ||
| - | ||
| -# btrim has diff input parameter - num mismatches, so it needs to be handled seperately | ||
| -tmp.btrim.d <- lapply(btrim.result.files, function(f) { | ||
| - tmp <- gsub("results-(\\d+)-(\\d+).txt", "\\1;;;\\2", f) | ||
| - tmp <- unlist(strsplit(tmp, ";;;")) | ||
| - rep <- as.numeric(tmp[1]) | ||
| - num.mm <- as.numeric(tmp[2]) | ||
| - d.btrim <- read.table(file.path(btrim.dir, f), sep="\t", col.names=c("name", "value"), stringsAsFactors=FALSE) | ||
| - if (nrow(d.btrim)) | ||
| - tmp <- rbind(cbind(d.btrim, rep, contam=num.mm, trimmer="btrim")) | ||
| - else | ||
| - tmp <- NULL | ||
| - tmp | ||
| -}) | ||
| - | ||
| -d <- rbind(do.call(rbind, tmp.d), do.call(rbind, tmp.btrim.d)) | ||
| - | ||
| - | ||
| -## just get TPR/FPR info | ||
| -roc.d <- local({ | ||
| - tmp <- d[d$name == "sensitivity", c("trimmer", "value", "rep", "contam")] | ||
| - tmp <- cbind(d[d$name == "sensitivity", c("trimmer", "value", "rep", "contam")], d[d$name == "specificity", c("trimmer", "value", "rep", "contam")]) | ||
| - colnames(tmp) <- c("trimmer", "tpr", "rep", "contam", "trimmer", "specificity", "rep", "contam") | ||
| - tmp$fpr <- 1 - tmp$specificity | ||
| - tmp | ||
| -}) | ||
| - | ||
| - | ||
| -# with all sim data with contam = 0.4 | ||
| -trimmer.colors <- c(scythe="blue", cutadapt="green", btrim="purple") | ||
| -plot(tpr ~ fpr, data=roc.d, xlim=c(0, 1), ylim=c(0, 1), type="n") | ||
| -for (t in levels(roc.d$trimmer)) { | ||
| - for (r in unique(roc.d$rep)) { | ||
| - # remove useless 0, 0 points | ||
| - with(roc.d[-which(roc.d$tpr == 0 & roc.d$fpr == 0), ], { | ||
| - lines(fpr[trimmer==t & rep==r], tpr[trimmer==t & rep==r], col=trimmer.colors[t]) | ||
| - ## text(fpr[trimmer==t & rep==r], tpr[trimmer==t & rep==r], col=trimmer.colors[t], labels=contam[trimmer==t & rep==r]) | ||
| - }) | ||
| - } | ||
| -} | ||
| - | ||
| -# with just one rep, and priors/errors | ||
| -trimmer.colors <- c(scythe="blue", cutadapt="green") | ||
| -plot(tpr ~ fpr, data=roc.d, xlim=c(0, 1), ylim=c(0, 1), type="n") | ||
| -for (t in levels(roc.d$trimmer)) { | ||
| - with(roc.d[roc.d$trimmer==t & roc.d$rep==1 & roc.d$fpr != 0 & roc.d$tpr != 0, ], { | ||
| - text(fpr[trimmer==t], tpr[trimmer==t], col=trimmer.colors[t], labels=contam[trimmer==t]) | ||
| - }) | ||
| -} | ||
| -stop() | ||
| - | ||
| -## analyze offset data | ||
| -scythe.offset.files <- list.files(scythe.dir, pattern="compare-offsets-") | ||
| -cutadapt.offset.files <- list.files(cutadapt.dir, pattern="compare-offsets-") | ||
| - | ||
| -offset.list <- lapply(scythe.offset.files, function(f) { | ||
| - stopifnot(file.exists(file.path(cutadapt.dir, f))) ## should have corresponding files | ||
| - tmp <- gsub("compare-offsets-(\\d+)-(0.\\d+).txt", "\\1;;;\\2", f) | ||
| - tmp <- unlist(strsplit(tmp, ";;;")) | ||
| - rep <- as.numeric(tmp[1]) | ||
| - contam <- as.numeric(tmp[2]) | ||
| - d.scythe <- read.table(file.path(scythe.dir, f), sep="\t", col.names=c("offset", "count"), stringsAsFactors=FALSE) | ||
| - d.cutadapt <- read.table(file.path(cutadapt.dir, f), sep="\t", col.names=c("offset", "count"), stringsAsFactors=FALSE) | ||
| - if (nrow(d.scythe) & nrow(d.cutadapt)) | ||
| - rbind(cbind(d.scythe, rep, contam, total=sum(d.scythe$count), trimmer="scythe"), cbind(d.cutadapt, rep, contam, total=sum(d.cutadapt$count), trimmer="cutadapt")) | ||
| - else | ||
| - NULL | ||
| -}) | ||
| - | ||
| -d.offsets <- do.call(rbind, offset.list) |
| @@ -1,240 +0,0 @@ | ||
| -## compare.py -- compare the output of a contaminant trimmer with the | ||
| -## original simulated file to gauge accuracy. | ||
| - | ||
| -from optparse import OptionParser | ||
| -import re | ||
| -import sys | ||
| -import pdb | ||
| - | ||
| -parser = OptionParser() | ||
| -parser.add_option("-r", "--readsfile", dest="reads_file", | ||
| - help="original reads", metavar="FILE") | ||
| -parser.add_option("-t", "--trimmedfile", dest="trimmed_file", | ||
| - help="trimmed file output from scythe", metavar="FILE") | ||
| - | ||
| -(options, args) = parser.parse_args() | ||
| - | ||
| -def mean(ll): | ||
| - if len(ll) > 0: | ||
| - return sum(ll)/float(len(ll)) | ||
| - return None | ||
| - | ||
| -def FASTQIter(filename): | ||
| - """ | ||
| - Return a dict with the elements of a FASTQ block. | ||
| - """ | ||
| - fastq_file = open(filename, 'r') | ||
| - for line in fastq_file: | ||
| - header = line.strip()[1:] | ||
| - seq = fastq_file.next().strip() | ||
| - fastq_file.next() | ||
| - qual = fastq_file.next().strip() | ||
| - yield {'header':header, 'seq':seq, 'qual':qual} | ||
| - | ||
| -def FASTQPairIter(filename1, filename2): | ||
| - """ | ||
| - Iterates over a pair of FASTQ Files, returning them in a list of | ||
| - dicts. | ||
| - """ | ||
| - fastq_files = (open(filename1, 'r'), open(filename2, 'r')) | ||
| - complete = False | ||
| - while not complete: | ||
| - tmp = list() | ||
| - | ||
| - for fastq_file in fastq_files: | ||
| - line = fastq_file.next() | ||
| - | ||
| - if line is None: | ||
| - # check whether other file is done; if not error. | ||
| - if fastq_files.next().next() is not None: | ||
| - raise Exception, "cannot continue: different lengthed files!" | ||
| - else: | ||
| - # both are done; set breakout condition | ||
| - compelete = True | ||
| - break | ||
| - | ||
| - header = line.strip()[1:] | ||
| - seq = fastq_file.next().strip() | ||
| - fastq_file.next() | ||
| - qual = fastq_file.next().strip() | ||
| - | ||
| - tmp.append({'seq': seq, 'header':header, 'qual':qual}) | ||
| - yield tmp | ||
| - | ||
| -def FASTQStore(reads_file, trimmed_file): | ||
| - """ | ||
| - Iterate over two FASTQ entries | ||
| - """ | ||
| - fastq_files = {'reads': open(reads_file, 'r'), 'trimmed': open(trimmed_file, 'r')} | ||
| - store = dict() | ||
| - for key in fastq_files.keys(): | ||
| - tmp_file = fastq_files[key] | ||
| - for line in tmp_file: | ||
| - header = line.strip()[1:] | ||
| - seq = tmp_file.next().strip() | ||
| - tmp_file.next() | ||
| - qual = tmp_file.next().strip() | ||
| - | ||
| - found = store.get(header) | ||
| - if not found: | ||
| - store[header] = {'reads': dict(), 'trimmed': dict()} | ||
| - store[header][key] = {'seq': seq, 'qual':qual} | ||
| - return store | ||
| - | ||
| -contaminated = 0 | ||
| -uncontaminated = 0 | ||
| -trimmed = 0 | ||
| -untrimmed = 0 | ||
| -total = 0 | ||
| -left_out = 0 # the trimmer removed a read entirely from the dataset | ||
| - | ||
| -total_positives = 0 | ||
| -total_negatives = 0 | ||
| -false_positives = 0 | ||
| -true_positives = 0 | ||
| -true_negatives = 0 | ||
| -false_negatives = 0 | ||
| -contam_diff = dict() | ||
| - | ||
| - | ||
| -## New method in which pre-trimmed and trimmed reads are matched via | ||
| -## dict keys. This allows for cases in which some trimmers remove | ||
| -## entire FASTQ entries. | ||
| -store = FASTQStore(options.reads_file, options.trimmed_file) | ||
| - | ||
| -## We need to verify nothing funky is going on with the data. | ||
| -## First, check that all headers (keys) have reads entries: | ||
| - | ||
| -if not all([len(data['reads']) == 2 for header, data in store.items()]): | ||
| - raise Exception, "Simulated reads do not all have entries - are you using different reads file from trimmed file?" | ||
| - | ||
| -for header in store: | ||
| - total += 1 | ||
| - | ||
| - block = store[header] | ||
| - | ||
| - # Check if this is a contaminated or uncontaminated entry using | ||
| - # headers from simulated reads. | ||
| - if re.search("-uncontaminated", header) is not None: | ||
| - uncontaminated += 1 | ||
| - is_contaminated = False | ||
| - if re.search("-contaminated", header) is not None: | ||
| - # note we explicitly check both to check sums are correct | ||
| - is_contaminated = True | ||
| - contaminated += 1 | ||
| - | ||
| - ## Check if the trimmer removed this read entirely | ||
| - if len(block['trimmed']) == 0: | ||
| - left_out += 1 | ||
| - is_trimmed = True # we consider this trimming the entire read | ||
| - | ||
| - offset = len(block['reads']['seq']) | ||
| - contam_diff[offset] = contam_diff.get(offset, 0) + 1 | ||
| - elif len(block['trimmed']) == 2: | ||
| - # For simple accuracy | ||
| - is_trimmed = len(block['reads']['seq']) > len(block['trimmed']['seq']) | ||
| - | ||
| - # More complex (not only trimmed, but trimmed correctly) | ||
| - m = re.match(r".*-contaminated-([0-9]+)", header) | ||
| - if m is not None and is_trimmed and is_contaminated: | ||
| - m = int(m.group(1)) | ||
| - offset = (len(block['reads']['seq']) - len(block['trimmed']['seq'])) - m | ||
| - contam_diff[offset] = contam_diff.get(offset, 0) + 1 | ||
| - else: | ||
| - is_trimmed = False | ||
| - | ||
| - trimmed += int(is_trimmed) | ||
| - untrimmed += int(not is_trimmed) | ||
| - | ||
| - if is_trimmed: | ||
| - total_positives += 1 | ||
| - if is_trimmed and is_contaminated: | ||
| - true_positives += 1 | ||
| - if is_trimmed and not is_contaminated: | ||
| - false_positives += 1 | ||
| - if not is_trimmed and is_contaminated: | ||
| - false_negatives += 1 | ||
| - if not is_trimmed and not is_contaminated: | ||
| - true_negatives += 1 | ||
| - | ||
| -# for reads_block, trimmed_block in FASTQPairIter(options.reads_file, options.trimmed_file): | ||
| -# total += 1 | ||
| -# # Check if this is a contaminated or uncontaminated entry using | ||
| -# # headers from simulated reads. | ||
| -# if re.search("-uncontaminated", reads_block['header']) is not None: | ||
| -# uncontaminated += 1 | ||
| -# is_contaminated = False | ||
| -# if re.search("-contaminated", reads_block['header']) is not None: | ||
| -# # note we explicitly check both to check sums are correct | ||
| -# is_contaminated = True | ||
| -# contaminated += 1 | ||
| - | ||
| -# # For simple accuracy | ||
| -# is_trimmed = len(reads_block['seq']) > len(trimmed_block['seq']) | ||
| - | ||
| -# # More complex (not only trimmed, but trimmed correctly) | ||
| -# m = re.match(r".*-contaminated-([0-9]+)", reads_block['header']) | ||
| -# if m is not None and is_trimmed and is_contaminated: | ||
| -# m = int(m.group(1)) | ||
| -# offset = (len(reads_block['seq']) - len(trimmed_block['seq'])) - m | ||
| -# contam_diff[offset] = contam_diff.get(offset, 0) + 1 | ||
| - | ||
| -# trimmed += int(is_trimmed) | ||
| -# untrimmed += int(not is_trimmed) | ||
| - | ||
| -# if is_trimmed: | ||
| -# total_positives += 1 | ||
| -# if is_trimmed and is_contaminated: | ||
| -# true_positives += 1 | ||
| -# if is_trimmed and not is_contaminated: | ||
| -# false_positives += 1 | ||
| -# if not is_trimmed and is_contaminated: | ||
| -# false_negatives += 1 | ||
| -# if not is_trimmed and not is_contaminated: | ||
| -# true_negatives += 1 | ||
| - | ||
| -## check that confusion matrix values add up as they should | ||
| -assert(contaminated == true_positives + false_negatives) | ||
| -assert(uncontaminated == false_positives + true_negatives) | ||
| - | ||
| -# make string of offsets | ||
| -offset_string = "".join(["%s\t%s\n" % (k, v) for k, v in contam_diff.items()]) | ||
| -sys.stderr.write(offset_string) | ||
| - | ||
| - | ||
| -print "contaminated\t", contaminated | ||
| -print "uncontaminated\t", uncontaminated | ||
| -print "contamination rate\t", contaminated/float(total) if total > 0 else 0 | ||
| -print "total\t", total | ||
| -print "trimmed\t", trimmed | ||
| -print "untrimmed\t", untrimmed | ||
| -print "left_out\t", left_out | ||
| - | ||
| -true_positives = total_positives - false_positives | ||
| -total_negatives = true_negatives + false_negatives | ||
| - | ||
| -if total_positives > 0: | ||
| - ## check that confusion matrix values add up as they should | ||
| - assert(contaminated == true_positives + false_negatives) | ||
| - assert(uncontaminated == false_positives + true_negatives) | ||
| - | ||
| -print "false positives\t", false_positives | ||
| -print "true positives\t", true_positives | ||
| -print "total positives\t", total_positives | ||
| - | ||
| -if total_positives > 0: | ||
| - print "false positive rate\t", false_positives/float(total) | ||
| -else: | ||
| - print "false positive rate\tNA" | ||
| - | ||
| -print "true negatives\t", true_negatives | ||
| -print "false negatives\t", false_negatives | ||
| -print "total negatives\t", total_negatives | ||
| -print "false negative rate\t", (false_negatives)/float(total) | ||
| - | ||
| -if true_negatives > 0: | ||
| - print "sensitivity\t", true_positives/float(contaminated) if contaminated > 0 else 0 | ||
| - print "specificity\t", true_negatives/float(false_positives + true_negatives) | ||
| -else: | ||
| - print "sensitivity\tNA" | ||
| - print "specificity\tNA" |
Oops, something went wrong.
0 comments on commit
e7b2195