diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..dda806e --- /dev/null +++ b/.gitignore @@ -0,0 +1,46 @@ +# History files +.Rhistory + +# Example code in package build process +*-Ex.R + +*~ +\#*\# +/.emacs.desktop +/.emacs.desktop.lock +.elc +auto-save-list +tramp +.\#* + +# Org-mode +.org-id-locations +*_archive + +.DS_Store + +# Thumbnails +._* + +# Files that might appear on external disk +.Spotlight-V100 +.Trashes + + +# Object files +*.o + +# Libraries +*.lib + +# Shared objects (inc. Windows DLLs) +*.dll +*.so + +# Executables +*.exe +*.out + +scythe + +*.gch diff --git a/new-testing/analyze.R b/new-testing/analyze.R deleted file mode 100644 index 158a2cf..0000000 --- a/new-testing/analyze.R +++ /dev/null @@ -1,93 +0,0 @@ -## analyze.R -- analyze the results from trimming of simulated reads. - - -## In the cross-tool comparison, we are using 0.4 contamination -scythe.dir <- "scythe/results/0.4/" -cutadapt.dir <- "cutadapt/results/0.4/" -btrim.dir <- "btrim/results/0.4/" - -scythe.result.files <- list.files(scythe.dir, pattern="results-.*\\.txt") -cutadapt.result.files <- list.files(cutadapt.dir, pattern="results-.*\\.txt") -btrim.result.files <- list.files(btrim.dir, pattern="results-.*\\.txt") - -tmp.d <- lapply(scythe.result.files, function(f) { - stopifnot(file.exists(file.path(cutadapt.dir, f))) ## should have corresponding files - tmp <- gsub("results-(\\d+)-(0.\\d+).txt", "\\1;;;\\2", f) - tmp <- unlist(strsplit(tmp, ";;;")) - rep <- as.numeric(tmp[1]) - contam <- as.numeric(tmp[2]) - d.scythe <- read.table(file.path(scythe.dir, f), sep="\t", col.names=c("name", "value"), stringsAsFactors=FALSE) - d.cutadapt <- read.table(file.path(cutadapt.dir, f), sep="\t", col.names=c("name", "value"), stringsAsFactors=FALSE) - tmp <- rbind(cbind(d.cutadapt, rep, contam, trimmer="cutadapt"), cbind(d.scythe, rep, contam, trimmer="scythe")) - tmp -}) - -# btrim has diff input parameter - num mismatches, so it needs to be handled seperately -tmp.btrim.d <- lapply(btrim.result.files, function(f) { - tmp <- gsub("results-(\\d+)-(\\d+).txt", "\\1;;;\\2", f) - tmp <- unlist(strsplit(tmp, ";;;")) - rep <- as.numeric(tmp[1]) - num.mm <- as.numeric(tmp[2]) - d.btrim <- read.table(file.path(btrim.dir, f), sep="\t", col.names=c("name", "value"), stringsAsFactors=FALSE) - if (nrow(d.btrim)) - tmp <- rbind(cbind(d.btrim, rep, contam=num.mm, trimmer="btrim")) - else - tmp <- NULL - tmp -}) - -d <- rbind(do.call(rbind, tmp.d), do.call(rbind, tmp.btrim.d)) - - -## just get TPR/FPR info -roc.d <- local({ - tmp <- d[d$name == "sensitivity", c("trimmer", "value", "rep", "contam")] - tmp <- cbind(d[d$name == "sensitivity", c("trimmer", "value", "rep", "contam")], d[d$name == "specificity", c("trimmer", "value", "rep", "contam")]) - colnames(tmp) <- c("trimmer", "tpr", "rep", "contam", "trimmer", "specificity", "rep", "contam") - tmp$fpr <- 1 - tmp$specificity - tmp -}) - - -# with all sim data with contam = 0.4 -trimmer.colors <- c(scythe="blue", cutadapt="green", btrim="purple") -plot(tpr ~ fpr, data=roc.d, xlim=c(0, 1), ylim=c(0, 1), type="n") -for (t in levels(roc.d$trimmer)) { - for (r in unique(roc.d$rep)) { - # remove useless 0, 0 points - with(roc.d[-which(roc.d$tpr == 0 & roc.d$fpr == 0), ], { - lines(fpr[trimmer==t & rep==r], tpr[trimmer==t & rep==r], col=trimmer.colors[t]) - ## text(fpr[trimmer==t & rep==r], tpr[trimmer==t & rep==r], col=trimmer.colors[t], labels=contam[trimmer==t & rep==r]) - }) - } -} - -# with just one rep, and priors/errors -trimmer.colors <- c(scythe="blue", cutadapt="green") -plot(tpr ~ fpr, data=roc.d, xlim=c(0, 1), ylim=c(0, 1), type="n") -for (t in levels(roc.d$trimmer)) { - with(roc.d[roc.d$trimmer==t & roc.d$rep==1 & roc.d$fpr != 0 & roc.d$tpr != 0, ], { - text(fpr[trimmer==t], tpr[trimmer==t], col=trimmer.colors[t], labels=contam[trimmer==t]) - }) -} -stop() - -## analyze offset data -scythe.offset.files <- list.files(scythe.dir, pattern="compare-offsets-") -cutadapt.offset.files <- list.files(cutadapt.dir, pattern="compare-offsets-") - -offset.list <- lapply(scythe.offset.files, function(f) { - stopifnot(file.exists(file.path(cutadapt.dir, f))) ## should have corresponding files - tmp <- gsub("compare-offsets-(\\d+)-(0.\\d+).txt", "\\1;;;\\2", f) - tmp <- unlist(strsplit(tmp, ";;;")) - rep <- as.numeric(tmp[1]) - contam <- as.numeric(tmp[2]) - d.scythe <- read.table(file.path(scythe.dir, f), sep="\t", col.names=c("offset", "count"), stringsAsFactors=FALSE) - d.cutadapt <- read.table(file.path(cutadapt.dir, f), sep="\t", col.names=c("offset", "count"), stringsAsFactors=FALSE) - if (nrow(d.scythe) & nrow(d.cutadapt)) - rbind(cbind(d.scythe, rep, contam, total=sum(d.scythe$count), trimmer="scythe"), cbind(d.cutadapt, rep, contam, total=sum(d.cutadapt$count), trimmer="cutadapt")) - else - NULL -}) - -d.offsets <- do.call(rbind, offset.list) diff --git a/new-testing/lib/compare.py b/new-testing/lib/compare.py deleted file mode 100644 index 4ec7fc8..0000000 --- a/new-testing/lib/compare.py +++ /dev/null @@ -1,240 +0,0 @@ -## compare.py -- compare the output of a contaminant trimmer with the -## original simulated file to gauge accuracy. - -from optparse import OptionParser -import re -import sys -import pdb - -parser = OptionParser() -parser.add_option("-r", "--readsfile", dest="reads_file", - help="original reads", metavar="FILE") -parser.add_option("-t", "--trimmedfile", dest="trimmed_file", - help="trimmed file output from scythe", metavar="FILE") - -(options, args) = parser.parse_args() - -def mean(ll): - if len(ll) > 0: - return sum(ll)/float(len(ll)) - return None - -def FASTQIter(filename): - """ - Return a dict with the elements of a FASTQ block. - """ - fastq_file = open(filename, 'r') - for line in fastq_file: - header = line.strip()[1:] - seq = fastq_file.next().strip() - fastq_file.next() - qual = fastq_file.next().strip() - yield {'header':header, 'seq':seq, 'qual':qual} - -def FASTQPairIter(filename1, filename2): - """ - Iterates over a pair of FASTQ Files, returning them in a list of - dicts. - """ - fastq_files = (open(filename1, 'r'), open(filename2, 'r')) - complete = False - while not complete: - tmp = list() - - for fastq_file in fastq_files: - line = fastq_file.next() - - if line is None: - # check whether other file is done; if not error. - if fastq_files.next().next() is not None: - raise Exception, "cannot continue: different lengthed files!" - else: - # both are done; set breakout condition - compelete = True - break - - header = line.strip()[1:] - seq = fastq_file.next().strip() - fastq_file.next() - qual = fastq_file.next().strip() - - tmp.append({'seq': seq, 'header':header, 'qual':qual}) - yield tmp - -def FASTQStore(reads_file, trimmed_file): - """ - Iterate over two FASTQ entries - """ - fastq_files = {'reads': open(reads_file, 'r'), 'trimmed': open(trimmed_file, 'r')} - store = dict() - for key in fastq_files.keys(): - tmp_file = fastq_files[key] - for line in tmp_file: - header = line.strip()[1:] - seq = tmp_file.next().strip() - tmp_file.next() - qual = tmp_file.next().strip() - - found = store.get(header) - if not found: - store[header] = {'reads': dict(), 'trimmed': dict()} - store[header][key] = {'seq': seq, 'qual':qual} - return store - -contaminated = 0 -uncontaminated = 0 -trimmed = 0 -untrimmed = 0 -total = 0 -left_out = 0 # the trimmer removed a read entirely from the dataset - -total_positives = 0 -total_negatives = 0 -false_positives = 0 -true_positives = 0 -true_negatives = 0 -false_negatives = 0 -contam_diff = dict() - - -## New method in which pre-trimmed and trimmed reads are matched via -## dict keys. This allows for cases in which some trimmers remove -## entire FASTQ entries. -store = FASTQStore(options.reads_file, options.trimmed_file) - -## We need to verify nothing funky is going on with the data. -## First, check that all headers (keys) have reads entries: - -if not all([len(data['reads']) == 2 for header, data in store.items()]): - raise Exception, "Simulated reads do not all have entries - are you using different reads file from trimmed file?" - -for header in store: - total += 1 - - block = store[header] - - # Check if this is a contaminated or uncontaminated entry using - # headers from simulated reads. - if re.search("-uncontaminated", header) is not None: - uncontaminated += 1 - is_contaminated = False - if re.search("-contaminated", header) is not None: - # note we explicitly check both to check sums are correct - is_contaminated = True - contaminated += 1 - - ## Check if the trimmer removed this read entirely - if len(block['trimmed']) == 0: - left_out += 1 - is_trimmed = True # we consider this trimming the entire read - - offset = len(block['reads']['seq']) - contam_diff[offset] = contam_diff.get(offset, 0) + 1 - elif len(block['trimmed']) == 2: - # For simple accuracy - is_trimmed = len(block['reads']['seq']) > len(block['trimmed']['seq']) - - # More complex (not only trimmed, but trimmed correctly) - m = re.match(r".*-contaminated-([0-9]+)", header) - if m is not None and is_trimmed and is_contaminated: - m = int(m.group(1)) - offset = (len(block['reads']['seq']) - len(block['trimmed']['seq'])) - m - contam_diff[offset] = contam_diff.get(offset, 0) + 1 - else: - is_trimmed = False - - trimmed += int(is_trimmed) - untrimmed += int(not is_trimmed) - - if is_trimmed: - total_positives += 1 - if is_trimmed and is_contaminated: - true_positives += 1 - if is_trimmed and not is_contaminated: - false_positives += 1 - if not is_trimmed and is_contaminated: - false_negatives += 1 - if not is_trimmed and not is_contaminated: - true_negatives += 1 - -# for reads_block, trimmed_block in FASTQPairIter(options.reads_file, options.trimmed_file): -# total += 1 -# # Check if this is a contaminated or uncontaminated entry using -# # headers from simulated reads. -# if re.search("-uncontaminated", reads_block['header']) is not None: -# uncontaminated += 1 -# is_contaminated = False -# if re.search("-contaminated", reads_block['header']) is not None: -# # note we explicitly check both to check sums are correct -# is_contaminated = True -# contaminated += 1 - -# # For simple accuracy -# is_trimmed = len(reads_block['seq']) > len(trimmed_block['seq']) - -# # More complex (not only trimmed, but trimmed correctly) -# m = re.match(r".*-contaminated-([0-9]+)", reads_block['header']) -# if m is not None and is_trimmed and is_contaminated: -# m = int(m.group(1)) -# offset = (len(reads_block['seq']) - len(trimmed_block['seq'])) - m -# contam_diff[offset] = contam_diff.get(offset, 0) + 1 - -# trimmed += int(is_trimmed) -# untrimmed += int(not is_trimmed) - -# if is_trimmed: -# total_positives += 1 -# if is_trimmed and is_contaminated: -# true_positives += 1 -# if is_trimmed and not is_contaminated: -# false_positives += 1 -# if not is_trimmed and is_contaminated: -# false_negatives += 1 -# if not is_trimmed and not is_contaminated: -# true_negatives += 1 - -## check that confusion matrix values add up as they should -assert(contaminated == true_positives + false_negatives) -assert(uncontaminated == false_positives + true_negatives) - -# make string of offsets -offset_string = "".join(["%s\t%s\n" % (k, v) for k, v in contam_diff.items()]) -sys.stderr.write(offset_string) - - -print "contaminated\t", contaminated -print "uncontaminated\t", uncontaminated -print "contamination rate\t", contaminated/float(total) if total > 0 else 0 -print "total\t", total -print "trimmed\t", trimmed -print "untrimmed\t", untrimmed -print "left_out\t", left_out - -true_positives = total_positives - false_positives -total_negatives = true_negatives + false_negatives - -if total_positives > 0: - ## check that confusion matrix values add up as they should - assert(contaminated == true_positives + false_negatives) - assert(uncontaminated == false_positives + true_negatives) - -print "false positives\t", false_positives -print "true positives\t", true_positives -print "total positives\t", total_positives - -if total_positives > 0: - print "false positive rate\t", false_positives/float(total) -else: - print "false positive rate\tNA" - -print "true negatives\t", true_negatives -print "false negatives\t", false_negatives -print "total negatives\t", total_negatives -print "false negative rate\t", (false_negatives)/float(total) - -if true_negatives > 0: - print "sensitivity\t", true_positives/float(contaminated) if contaminated > 0 else 0 - print "specificity\t", true_negatives/float(false_positives + true_negatives) -else: - print "sensitivity\tNA" - print "specificity\tNA" diff --git a/new-testing/lib/read-sim.R b/new-testing/lib/read-sim.R deleted file mode 100644 index 2949dfb..0000000 --- a/new-testing/lib/read-sim.R +++ /dev/null @@ -1,74 +0,0 @@ -## Contaminated read simulation - -NUCLEOTIDES <- c('A', 'T', 'C', 'G') -#adapters <- list("solexa_adapter_1"="AGATCGGAAGAGCTCGTATGCCGTCTTCTGCTTG") -adapter <- "AGATCGGAAGAGCTCGTATGCCGTCTTCTGCTTG" -#adapter <- "TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT" - -set.seed(0) - -addErrors <- -# given a sequence, add errors based on an Illumina-encoded quality line -function(seq, quality) { - # Note: I have checked this against - # http://seqanswers.com/forums/showthread.php?t=1523 and it appears - # correct - VB - stopifnot(nchar(seq) == nchar(quality)) - - probs.wrong <- sapply(charToRaw(quality), function(x) 1/(10^((as.integer(x) - 64)/10))) - paste(mapply(function(base, p.wrong) { - if (rbinom(1, 1, p.wrong)) - return(sample(setdiff(NUCLEOTIDES, base), 1)) - else - return(base) - }, unlist(strsplit(seq, '')), probs.wrong), collapse="") -} - -generateRandomSeq <- -# generate a random sequence, possibly with contamination of an adapter, it's length uniform -function(length, adapter=NULL, quality=NULL, is.contam=FALSE, min.contam=3) { - if (!is.contam) - paste(sample(NUCLEOTIDES, length, replace=TRUE), collapse="") - else { - ll <- sample(min.contam:nchar(adapter), 1) - contam <- substr(adapter, 1, ll) - pre.error.seq <- paste(paste(sample(NUCLEOTIDES, length-nchar(contam), replace=TRUE), collapse=""), contam, sep="") - return(list(contam.n=ll, seq=addErrors(pre.error.seq, quality))) - } -} - -contaminateFASTQEntry <- function(con, outfile, rate, min.contam=3, verbose=FALSE) { - blocks.processed <- 0 - reads <- readLines(con) - outlist <- vector('character', length(reads)) - while (blocks.processed*4 < length(reads)) { - quality <- reads[4*blocks.processed+4] - seq <- reads[4*blocks.processed+2] - header <- reads[4*blocks.processed+1] - - if (runif(1) <= rate) { - # contaminate - tmp <- generateRandomSeq(nchar(seq), adapter=adapter, quality=quality, is.contam=TRUE, min.contam=min.contam) - seq <- tmp$seq - ll <- tmp$contam.n - header <- sprintf("%s-contaminated-%d", header, ll) - } else { - header <- sprintf("%s-uncontaminated", header) - seq <- generateRandomSeq(nchar(seq), is.contam=FALSE, min.contam=min.contam) - } - - # output results to vector - outlist[4*blocks.processed+1] <- header - outlist[4*blocks.processed+2] <- seq - outlist[4*blocks.processed+3] <- sprintf("+%s", substr(header, 2, nchar(header))) - outlist[4*blocks.processed+4] <- quality - - blocks.processed <- blocks.processed + 1 - if (blocks.processed %% 100 == 0 && verbose) - message(sprintf("%d blocks processed.", blocks.processed)) - } - of <- file(outfile) - writeLines(outlist, con=of) - close(of) -} - diff --git a/new-testing/README.md b/testing/README.md similarity index 64% rename from new-testing/README.md rename to testing/README.md index 9259b6a..909b8d4 100644 --- a/new-testing/README.md +++ b/testing/README.md @@ -14,3 +14,14 @@ `run.sh` is designed to use the most recent Scythe version; it references it in the directory above the testing directory. Other trimmers must be in the `$PATH` environmental variable. + +### Trimmer Versions + + - btrim, not versioned but the 2011-09-16 version of + `btrim_mac` was downloaded from http://graphics.med.yale.edu/trim/. + - cutadapt, version 1.0 was tested. This version is dated 2011-11-04 + and downloaded from + http://code.google.com/p/cutadapt/downloads/list. + - scythe version 0.981 was used. + + diff --git a/new-testing/analysis.R b/testing/analysis.R similarity index 100% rename from new-testing/analysis.R rename to testing/analysis.R diff --git a/new-testing/btrim_adapters.fa b/testing/btrim_adapters.fa similarity index 100% rename from new-testing/btrim_adapters.fa rename to testing/btrim_adapters.fa diff --git a/testing/compare.py b/testing/compare.py deleted file mode 100644 index 9e8ebe3..0000000 --- a/testing/compare.py +++ /dev/null @@ -1,147 +0,0 @@ -## compare.py -- compare the output of a contaminant trimmer with the -## original simulated file to gauge accuracy. - -from optparse import OptionParser -import re -import sys - -parser = OptionParser() -parser.add_option("-r", "--readsfile", dest="reads_file", - help="original reads", metavar="FILE") -parser.add_option("-t", "--trimmedfile", dest="trimmed_file", - help="trimmed file output from scythe", metavar="FILE") - -(options, args) = parser.parse_args() - -def mean(ll): - if len(ll) > 0: - return sum(ll)/float(len(ll)) - return None - -def FASTQIter(filename): - """ - Return a dict with the elements of a FASTQ block. - """ - fastq_file = open(filename, 'r') - for line in fastq_file: - header = line.strip()[1:] - seq = fastq_file.next().strip() - fastq_file.next() - qual = fastq_file.next().strip() - yield {'header':header, 'seq':seq, 'qual':qual} - -def FASTQPairIter(filename1, filename2): - """ - Iterates over a pair of FASTQ Files, returning them in tuples of - dicts. - """ - fastq_files = (open(filename1, 'r'), open(filename2, 'r')) - complete = False - while not complete: - tmp = list() - - for fastq_file in fastq_files: - line = fastq_file.next() - - if line is None: - # check whether other file is done; if not error. - if fastq_files.next().next() is not None: - raise Exception, "cannot continue: different lengthed files!" - else: - # both are done; set breakout condition - compelete = True - break - - header = line.strip()[1:] - seq = fastq_file.next().strip() - fastq_file.next() - qual = fastq_file.next().strip() - - tmp.append({'seq': seq, 'header':header, 'qual':qual}) - yield tmp - -contaminated = 0 -uncontaminated = 0 -trimmed = 0 -untrimmed = 0 -total = 0 - -total_positives = 0 -total_negatives = 0 -false_positives = 0 -true_positives = 0 -true_negatives = 0 -false_negatives = 0 -contam_diff = dict() - -for reads_block, trimmed_block in FASTQPairIter(options.reads_file, options.trimmed_file): - total += 1 - # Check if this is a contaminated or uncontaminated entry using - # headers from simulated reads. - if re.search("-uncontaminated", reads_block['header']) is not None: - uncontaminated += 1 - is_contaminated = False - if re.search("-contaminated", reads_block['header']) is not None: - # note we explicitly check both to check sums are correct - is_contaminated = True - contaminated += 1 - - # For simple accuracy - is_trimmed = len(reads_block['seq']) > len(trimmed_block['seq']) - - # More complex (not only trimmed, but trimmed correctly) - m = re.match(r".*-contaminated-([0-9]+)", reads_block['header']) - if m is not None: - m = int(m.group(1)) - offset = (len(reads_block['seq']) - len(trimmed_block['seq'])) - m - contam_diff[offset] = contam_diff.get(offset, 0) + 1 - - trimmed += int(is_trimmed) - untrimmed += int(not is_trimmed) - - if is_trimmed: - total_positives += 1 - if is_trimmed and is_contaminated: - true_positives += 1 - if is_trimmed and not is_contaminated: - false_positives += 1 - if not is_trimmed and is_contaminated: - false_negatives += 1 - if not is_trimmed and not is_contaminated: - true_negatives += 1 - -## check that confusion matrix values add up as they should -assert(contaminated == true_positives + false_negatives) -assert(uncontaminated == false_positives + true_negatives) - -# make string of offsets -offset_string = "".join(["%s\t%s\n" % (k, v) for k, v in contam_diff.items()]) -sys.stderr.write(offset_string) - - -print "contaminated\t", contaminated -print "uncontaminated\t", uncontaminated -print "contamination rate\t", contaminated/float(total) if total > 0 else 0 -print "total\t", total -print "trimmed\t", trimmed -print "untrimmed\t", untrimmed - -if total_positives > 0: - true_positives = total_positives - false_positives - total_negatives = true_negatives + false_negatives - - ## check that confusion matrix values add up as they should - assert(contaminated == true_positives + false_negatives) - assert(uncontaminated == false_positives + true_negatives) - - print "false positives\t", false_positives - print "true positives\t", true_positives - print "total positives\t", total_positives - print "false positive rate\t", false_positives/float(total) - - print "true negatives\t", true_negatives - print "false negatives\t", false_negatives - print "total negatives\t", total_negatives - print "false negative rate\t", (false_negatives)/float(total) - print "sensitivity\t", true_positives/float(contaminated) if contaminated > 0 else 0 - print "specificity\t", true_negatives/float(false_positives + true_negatives) diff --git a/testing/cutadapt/parse_results.R b/testing/cutadapt/parse_results.R deleted file mode 100644 index d621ea0..0000000 --- a/testing/cutadapt/parse_results.R +++ /dev/null @@ -1,16 +0,0 @@ -## results_parse.R - parse the multiple files from compare.py for cutadapt -dir <- "cutadapt/results/diff-error/" -compare.files <- list.files(dir, pattern="compare-0") -errors <- as.numeric(sub("compare-(0\\.[0-9]+)\\.txt", "\\1", compare.files)) - -d <- mapply(function(f, e) { - tmp <- read.table(file.path(dir, f), header=FALSE, sep="\t", col.names=c("field", "value")) - cbind(tmp, error=e) -}, compare.files, errors, SIMPLIFY=FALSE) - -d.cutadapt <- do.call(rbind, d) - -## sens <- d[d$field == "sensitivity", 'value'] -## spec <- d[d$field == "specificity", 'value'] - -## plot(1-spec, sens, xlim=c(0, 1), ylim=c(0, 1)) diff --git a/testing/cutadapt/run-all.sh b/testing/cutadapt/run-all.sh deleted file mode 100644 index 042487f..0000000 --- a/testing/cutadapt/run-all.sh +++ /dev/null @@ -1,20 +0,0 @@ -simdir=./sim-reads/ -readsfiles=$(ls ./sim-reads/) - -if [ ! -d cutadapt/results ]; -then - echo "creating directory for cutadapt results" - mkdir -p cutadapt/results/all -fi - -for rfile in $readsfiles -do - contam=$(echo $rfile | sed 's/sim-reads-\(.*\)\.fastq/\1/') - echo $contam - cutadapt -e 0.2 -a AGATCGGAAGAGCTCGTATGCCGTCTTCTGCTTG -a AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT $simdir/$rfile > cutadapt/results/trimmed-$contam.fastq 2> /dev/null - python compare.py -r $simdir/$rfile -t cutadapt/results/trimmed-$contam.fastq > cutadapt/results/compare-$contam.txt -done - - - - diff --git a/testing/cutadapt/run-diff-error.sh b/testing/cutadapt/run-diff-error.sh deleted file mode 100644 index b05c7b1..0000000 --- a/testing/cutadapt/run-diff-error.sh +++ /dev/null @@ -1,7 +0,0 @@ -rfile=sim-reads/sim-reads-0.500000.fastq - -for error in 0.1 0.15 0.2 0.25 0.3 0.35 0.4 0.45 0.5 0.55 0.6 0.65 0.7 0.75 0.8 0.85 0.9 0.95 -do - cutadapt -e $error -a AGATCGGAAGAGCTCGTATGCCGTCTTCTGCTTG -a AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT $rfile > cutadapt/results/diff-error/trimmed-$error.fastq 2> /dev/null - python compare.py -r $rfile -t cutadapt/results/diff-error/trimmed-$error.fastq > cutadapt/results/diff-error/compare-$error.txt 2> cutadapt/results/diff-error/compare-offsets-$error.txt -done \ No newline at end of file diff --git a/new-testing/generate-reads.R b/testing/generate-reads.R similarity index 96% rename from new-testing/generate-reads.R rename to testing/generate-reads.R index a25e263..e1d978f 100644 --- a/new-testing/generate-reads.R +++ b/testing/generate-reads.R @@ -1,7 +1,7 @@ ## generate-reads.R -- generate reads with different contamination ## rates. -source("lib/read-sim.R") +source("read-sim.R") contams <- seq(0, 0.95, 0.05) n.reps <- 10 diff --git a/testing/process_results.R b/testing/process_results.R deleted file mode 100644 index 4fe5021..0000000 --- a/testing/process_results.R +++ /dev/null @@ -1,19 +0,0 @@ -## process_results.R - process all results - -## For the sim-reads file at the 0.5 contamination level -source("cutadapt/parse_results.R") -source("scythe/parse_results.R") - -sens.scythe <- d.scythe[d.scythe$field == "sensitivity", 'value'] -spec.scythe <- d.scythe[d.scythe$field == "specificity", 'value'] -priors.scythe <- d.scythe[d.scythe$field == "specificity", 'error'] - -plot(1-spec.scythe, sens.scythe, xlim=c(0, 1), ylim=c(0, 1), type="n") -text(1-spec.scythe, sens.scythe, labels=priors.scythe, col="blue") - -sens.cutadapt <- d.cutadapt[d.cutadapt$field == "sensitivity", 'value'] -spec.cutadapt <- d.cutadapt[d.cutadapt$field == "specificity", 'value'] -error.cutadapt <- d.cutadapt[d.cutadapt$field == "specificity", 'error'] -text(1-spec.cutadapt, sens.cutadapt, labels=error.cutadapt, col="green") - -abline(a=0, b=1, col="red") diff --git a/testing/read-sampler.py b/testing/read-sampler.py deleted file mode 100644 index e683b6c..0000000 --- a/testing/read-sampler.py +++ /dev/null @@ -1,21 +0,0 @@ -import random -import sys - -random.seed(0) - -percent = float(sys.argv[1]) -file = sys.argv[2] - -fastqfile = open(file, 'r') -for line in fastqfile: - if random.random() <= percent: - sys.stdout.write(line) - sys.stdout.write(fastqfile.next()) - sys.stdout.write(fastqfile.next()) - sys.stdout.write(fastqfile.next()) - else: - fastqfile.next() - fastqfile.next() - fastqfile.next() - - diff --git a/testing/read-sim.R b/testing/read-sim.R index d6eb98c..4a8c7b7 100644 --- a/testing/read-sim.R +++ b/testing/read-sim.R @@ -1,4 +1,4 @@ -## Contaminated read simulation +## read-sim.R -- contaminated read simulation using real Illumina qualities NUCLEOTIDES <- c('A', 'T', 'C', 'G') #adapters <- list("solexa_adapter_1"="AGATCGGAAGAGCTCGTATGCCGTCTTCTGCTTG") @@ -37,7 +37,7 @@ function(length, adapter=NULL, quality=NULL, is.contam=FALSE, min.contam=3) { } } -contaminateFASTQEntry <- function(con, outfile, rate, adapters, min.contam=3) { +contaminateFASTQEntry <- function(con, outfile, rate, min.contam=3, verbose=FALSE) { blocks.processed <- 0 reads <- readLines(con) outlist <- vector('character', length(reads)) @@ -64,9 +64,11 @@ contaminateFASTQEntry <- function(con, outfile, rate, adapters, min.contam=3) { outlist[4*blocks.processed+4] <- quality blocks.processed <- blocks.processed + 1 - if (blocks.processed %% 100 == 0) + if (blocks.processed %% 100 == 0 && verbose) message(sprintf("%d blocks processed.", blocks.processed)) } - writeLines(outlist, con=file(outfile)) + of <- file(outfile) + writeLines(outlist, con=of) + close(of) } diff --git a/new-testing/reads.fastq b/testing/reads.fastq similarity index 100% rename from new-testing/reads.fastq rename to testing/reads.fastq diff --git a/testing/results_parse.py b/testing/results_parse.py deleted file mode 100644 index 6385e0f..0000000 --- a/testing/results_parse.py +++ /dev/null @@ -1,125 +0,0 @@ -## results-parse.py -- take output from scythe run on simulated reads -## and generate statistics. - -from optparse import OptionParser -import re -import sys - -parser = OptionParser() -parser.add_option("-r", "--readsfile", dest="reads_file", - help="original reads", metavar="FILE") -parser.add_option("-m", "--matchesfile", dest="matches_file", - help="matches file output from scythe", metavar="FILE") -parser.add_option("-t", "--trimmedfile", dest="trimmed_file", - help="trimmed file output from scythe", metavar="FILE") - -(options, args) = parser.parse_args() - -def mean(ll): - if len(ll) > 0: - return sum(ll)/float(len(ll)) - return None - -def FASTQIter(filename): - """ - Return a dict with the elements of a FASTQ block. - """ - fastq_file = open(filename, 'r') - for line in fastq_file: - header = line.strip()[1:] - seq = fastq_file.next().strip() - fastq_file.next() - qual = fastq_file.next().strip() - yield {'header':header, 'seq':seq, 'qual':qual} - -def matchesIter(filename): - """ - Return a dict with the elements of the matches file block. - """ - matches_file = open(filename, 'r') - for line in matches_file: - p_contam, p_ncontam, adapter = [k.split(': ')[1] for k in line.split('; ')] - header = matches_file.next().strip() - read = matches_file.next().strip() - matches = matches_file.next().strip() - seq = matches_file.next().strip() - qual = matches_file.next().strip() - base_probs = [float(k) for k in matches_file.next().strip()[1:-1].split(', ')] # remove [, ], & split - matches_file.next().strip() - - false_pos = re.search("-uncontaminated", header) is not None - yield {'false_pos':false_pos, 'header':header, 'p_contam':p_contam, 'p_ncontam':p_ncontam, 'base_probs':base_probs} - - -## Gather initial statistics on the simulated reads actual contamination rate -uncontaminated = 0 -contaminated = 0 -total = 0 -for block in FASTQIter(options.reads_file): - if re.search("-uncontaminated", block['header']) is not None: - uncontaminated += 1 - if re.search("-contaminated", block['header']) is not None: - contaminated += 1 - total += 1 -print "contaminated\t", contaminated -print "uncontaminated\t", uncontaminated -print "contamination rate\t", contaminated/float(total) if total > 0 else 0 -print "total\t", total - -## Gather statistics on the number of false positives via the matches file -false_positives = 0 -total_positives = 0 -false_positives_qual = list() -true_positives_qual = list() -for block in matchesIter(options.matches_file): - is_false_positive = block['false_pos'] - false_positives += 1 if is_false_positive else 0 - base_probs = mean(block['base_probs']) - if is_false_positive: - false_positives_qual.append(base_probs) - else: - true_positives_qual.append(base_probs) - total_positives += 1 - -## Gather statistics on the number of false negatives from clipped file -false_negatives = 0 -true_negatives = 0 -contam_diff = dict() - -for block in FASTQIter(options.trimmed_file): - if re.search(r"-contaminated-[0-9]+$", block['header']) is not None: - false_negatives += 1 - if re.search(r"-uncontaminated$", block['header']) is not None: - true_negatives += 1 - m = re.match(r".*-contaminated-([0-9]+);;cut_scythe-([0-9]+)", block['header']) - if m is not None: - n_contam, n_cut = m.group(1, 2) - d = int(n_contam) - int(n_cut) - contam_diff[d] = contam_diff.get(d, 0) + 1 - -# make string of offsets -offset_string = "".join(["%s\t%s\n" % (k, v) for k, v in contam_diff.items()]) -#sys.stderr.write(offset_string) -print offset_string - -if total_positives > 0: - true_positives = total_positives - false_positives - total_negatives = true_negatives + false_negatives - - ## check that confusion matrix values add up as they should - assert(contaminated == true_positives + false_negatives) - assert(uncontaminated == false_positives + true_negatives) - - print "false positives\t", false_positives - print "true positives\t", true_positives - print "total positives\t", total_positives - print "false positive rate\t", false_positives/float(total) - print "false positive mean qual\t", mean(false_positives_qual) - print "true positive mean qual\t", mean(true_positives_qual) - - print "true negatives\t", true_negatives - print "false negatives\t", false_negatives - print "total negatives\t", total_negatives - print "false negative rate\t", (false_negatives)/float(total) - print "sensitivity\t", true_positives/float(contaminated) if contaminated > 0 else 0 - print "specificity\t", true_negatives/(false_positives + true_negatives) diff --git a/new-testing/run.sh b/testing/run.sh similarity index 73% rename from new-testing/run.sh rename to testing/run.sh index adb66e3..4bac899 100644 --- a/new-testing/run.sh +++ b/testing/run.sh @@ -1,24 +1,24 @@ ## run.sh -- run tests -contam_rates=$(ls simulated-reads/) +contam_rate=0.4 -mkdir -p scythe/results/0.4/ +mkdir -p scythe/results/$contam_rates ## scythe trimming - multiple priors for 40% contamination -cfile=simulated-reads/0.4 +cfile=simulated-reads/$contam_rates for prior in 0.00 0.05 0.10 0.15 0.20 0.25 0.30 0.35 0.40 0.45 0.50 0.55 0.60 0.65 0.70 0.75 0.80 0.85 0.90 0.95 do for rep in 01 02 03 04 05 06 07 08 09 10 do - ../scythe -n 0 -p $prior -a ../illumina_adapters.fa $cfile/$rep.fastq > scythe/results/0.4/trimmed-$rep-$prior.fastq 2> /dev/null + ../scythe -n 0 -p $prior -a ../illumina_adapters.fa $cfile/$rep.fastq > scythe/results/$contam_rates/trimmed-$rep-$prior.fastq 2> /dev/null done done ## cutadapt trimming - multiple error rates for 40% contamination -mkdir -p cutadapt/results/0.4/ +mkdir -p cutadapt/results/$contam_rates/ for error in 0.00 0.05 0.10 0.15 0.20 0.25 0.30 0.35 0.40 0.45 0.50 0.55 0.60 0.65 0.70 0.75 0.80 0.85 0.90 0.95 do for rep in 01 02 03 04 05 06 07 08 09 10 do - cutadapt -e $error -a AGATCGGAAGAGCTCGTATGCCGTCTTCTGCTTG -a AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT $cfile/$rep.fastq > cutadapt/results/0.4/trimmed-$rep-$error.fastq 2> /dev/null + cutadapt -e $error -a AGATCGGAAGAGCTCGTATGCCGTCTTCTGCTTG -a AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT $cfile/$rep.fastq > cutadapt/results/$contam_rates/trimmed-$rep-$error.fastq 2> /dev/null done done @@ -28,12 +28,12 @@ done ## - no quality trimming ## - 3'-end only, and the varying error is the ## number of mismatches -mkdir -p btrim/results/0.4/ +mkdir -p btrim/results/$contam_rates/ for error in 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 ## Up to max error do for rep in 01 02 03 04 05 06 07 08 09 10 do - btrim_mac -v $error -3 -a 0 -P -p btrim_adapters.fa -t $cfile/$rep.fastq -o btrim/results/0.4/trimmed-$rep-$error.fastq > /dev/null + btrim_mac -v $error -3 -a 0 -P -p btrim_adapters.fa -t $cfile/$rep.fastq -o btrim/results/$contam_rates/trimmed-$rep-$error.fastq > /dev/null done done diff --git a/testing/run_tests.R b/testing/run_tests.R deleted file mode 100644 index 915d7cf..0000000 --- a/testing/run_tests.R +++ /dev/null @@ -1,70 +0,0 @@ -## run tests and get results -library(lattice) -## source read simulation functions -source('read-sim.R') - -pythonDictConvert <- function(x) { - # convert python dict to dataframe - tmp <- substr(x, 2, nchar(x)-1) # move braces - tmp <- unlist(strsplit(tmp, ", ")) - as.data.frame(do.call(rbind, strsplit(tmp, ": "))) -} - -if (!file.exists("sim-reads")) - dir.create("sim-reads") - -contam.rates <- seq(0, 0.9, 0.1) - - -prior.data <- lapply(contam.rates, function(contam.rate) { - priors <- seq(0, 1, 0.1) - - ## simulate reads - ff <- file("test-sample.fastq", open="r") - contaminateFASTQEntry(ff, outfile=sprintf("sim-reads/sim-reads-%f.fastq", contam.rate), contam.rate, adapter) - - out <- lapply(priors, function(prior) { - system(sprintf("../scythe -t -n 0 -a ../solexa_adapters.fa -p %f -m results/matches-prior-%f.txt -o results/out-prior-%f.fastq sim-reads/sim-reads-%f.fastq", prior, prior, prior, contam.rate), intern=TRUE) - - cmd <- sprintf("python results_parse.py -m results/matches-prior-%f.txt -r sim-reads/sim-reads-%f.fastq -t results/out-prior-%f.fastq > results/results-prior-%f.txt", prior, contam.rate, prior, prior) - ok <- system(cmd) - stopifnot(ok == 0) - tmp <- t(read.table(sprintf("results/results-prior-%f.txt", prior), header=FALSE, sep="\t")) - - if (ncol(tmp) < 6) - return(NULL) - - ## process contam diff - contam.diff <- tmp[, tmp[1, ] == "contam diff"][2] - contam.diff <- pythonDictConvert(contam.diff) - - d <- as.numeric(t(tmp[2, ])) - dim(d) <- c(1, length(d)) - colnames(d) <- tmp[1, ] - - # write contam diffs to be processed later - if (length(contam.diff) >= 2) { - colnames(contam.diff) <- c("diff", "count") - write.table(contam.diff, file.path("results", sprintf("prior-%f-contam-%f-diffs.txt", prior, contam.rate)), sep="\t", quote=FALSE) - } - as.data.frame(d) - }) - include <- !unlist(lapply(out, is.null)) - priors <- seq(0, 1, 0.1)[include] - d <- cbind(contam.rate=contam.rate, prior=priors, do.call(rbind, out[include])) - - return(d) -}) - -## Munge data together for all contamination rates and plot with lattice -prior.data <- do.call(rbind, prior.data) -prior.data$y <- prior.data[, 'sensitivity'] -prior.data$x <- 1-prior.data[, 'specificity'] -png("roc-curves.png", width=1000, height=1000, res=100) -p <- xyplot(y ~ x | factor(contam.rate), data=prior.data, ylim=c(0, 1), xlim=c(0, 1), - panel=function(x, y, subscript, ...) { - panel.text(x, y, label=prior.data[subscript, 'prior'], cex=0.6) - panel.abline(a=0, b=1, col="purple") - }, xlab="1 - specificity (FPR)", ylab="sensitivity (TPR)") -print(p) -dev.off() diff --git a/testing/scythe/parse_results.R b/testing/scythe/parse_results.R deleted file mode 100644 index b95511b..0000000 --- a/testing/scythe/parse_results.R +++ /dev/null @@ -1,16 +0,0 @@ -## results_parse.R - parse the multiple files from compare.py for scythe -dir <- "scythe/results/diff-contam/" -compare.files <- list.files(dir, pattern="compare-0") -priors <- as.numeric(sub("compare-(0\\.[0-9]+)\\.txt", "\\1", compare.files)) - -d <- mapply(function(f, e) { - tmp <- read.table(file.path(dir, f), header=FALSE, sep="\t", col.names=c("field", "value")) - cbind(tmp, error=e) -}, compare.files, priors, SIMPLIFY=FALSE) - -d.scythe <- do.call(rbind, d) - -## sens <- d[d$field == "sensitivity", 'value'] -## spec <- d[d$field == "specificity", 'value'] - -## plot(1-spec, sens, xlim=c(0, 1), ylim=c(0, 1)) diff --git a/testing/scythe/run-diff-contam.sh b/testing/scythe/run-diff-contam.sh deleted file mode 100644 index d39e18f..0000000 --- a/testing/scythe/run-diff-contam.sh +++ /dev/null @@ -1,13 +0,0 @@ -rfile=sim-reads/sim-reads-0.500000.fastq - -if [ ! -d scythe/results/diff-contam ]; -then - echo "creating directory for scythe results" - mkdir -p scythe/results/diff-contam -fi - -for contam in 0.1 0.15 0.2 0.25 0.3 0.35 0.4 0.45 0.5 0.55 0.6 0.65 0.7 0.75 0.8 0.85 0.9 0.95 -do - ../scythe -n 0 -p $contam -a ../solexa_adapters.fa $rfile > scythe/results/diff-contam/trimmed-$contam.fastq 2> /dev/null - python compare.py -r $rfile -t scythe/results/diff-contam/trimmed-$contam.fastq > scythe/results/diff-contam/compare-$contam.txt 2> scythe/results/diff-contam/compare-offsets-$contam.txt -done \ No newline at end of file