Skip to content
Browse files

updated scythe's output of headers so that it writes full header string

  • Loading branch information...
1 parent 54af8fe commit e89d1802f9051499faedaca63e6228156be1cdd2 @vsbuffalo committed Feb 18, 2012
Showing with 100 additions and 24 deletions.
  1. +1 −1 Makefile
  2. 0 solexa_adapters.fa → illumina_adapters.fa
  3. +15 −6 paper/scythe-paper.org
  4. +12 −15 src/scythe.c
  5. +21 −1 src/scythe.h
  6. +51 −1 src/util.c
View
2 Makefile
@@ -45,4 +45,4 @@ build: match.o scythe.o util.o prob.o
$(CC) $(CFLAGS) $(LDFLAGS) $? -o scythe
debug:
- $(MAKE) build "CFLAGS=-Wall -pedantic -g -DDEBUG"
+ $(CC) $(LDFLAGS) $(DEBUG) -o scythe src/*.c
View
0 solexa_adapters.fa → illumina_adapters.fa
File renamed without changes.
View
21 paper/scythe-paper.org
@@ -25,10 +25,10 @@ these low-quality 3'-end sequences from reads. This is thought to
increase mapping rates and improve assembly quality. However doing
quality-based 3'-end trimming before contaminant removal would remove
sequence that could be used (despite being unreliable) to identify the
-contaminants more positively. Scythe supposes that it's better to use
-full information, even if it's unreliable. How unreliable a base is is
-indicated by the FASTQ quality score, which can be incorporated into
-classification procedures.
+contaminants more positively. Scythe takes the approach that it is
+better to use full information, even if it's unreliable. How
+unreliable a base is is indicated by the FASTQ quality score, which
+can be incorporated into classification procedures.
Fixed-number of mismatch approaches have the disadvantage that they
don't differentially weight a mismatch on a low-quality base from a
@@ -46,7 +46,7 @@ and fixed-number of mismatch techniques are accurate and
fastest. Scythe only addresses the challenging problem of contaminants
in low-quality 3-ends. In doing so, it makes some assumptions:
-1. All contaminants sequences are known /a priori/.
+1. All contaminants sequences are known /a priori/ and are reliable.
2. A contaminant sequence with length $l_c$ in a read of length $l_r$
will only be found between $l_r - l_c$ and $l_r$.
3. If the read is contaminated, the number of bases contaminating the
@@ -70,13 +70,16 @@ read. For each adapter in the file, Scythe looks for the best match in
terms of scores. A nucleotide match is scored as a 1, and a mismatch
is scored as a -1. Because Scythe doesn't address contaminants with
insertions or deletions, it doesn't use a standard alignment strategy
-(e.g. dynamic programming). Instead, it considers every possible
+(i.e. dynamic programming). Instead, it considers every possible
substring of contamination: for a contaminant of length $l_c$, it
scores all possible substrings from the 5'-end of the contaminant. The
top scoring match is then used in the classification procedure. For
efficiency, other matches are not used in the classification
procedure.[fn:: This option may be added to further Scythe versions.]
+The time complexity of Scythe's match algorithm for a single adapter
+of length is $l_a$ $O(l_a^2 * R)$ for a FASTQ file with $R$ entries.
+
** Bayesian classification of top-scoring matches
There are two mutually exclusive and exhaustive events: a top scoring
@@ -127,3 +130,9 @@ page works well as an initial guess for the prior.
* Results
+Scythe was tested against two similar program: Btrim (Kong, 2011) and
+Cutadapt (Martin, 2011). Btrim
+
+Cutadapt has an advantage over Scythe in that it does gapped
+alignments (originally it was developed to trim 454 sequences which
+have homopolymer repeats).
View
27 src/scythe.c
@@ -14,7 +14,10 @@
#include "scythe.h"
#include "kseq.h"
-KSEQ_INIT(gzFile, gzread)
+__KS_GETC(gzread, BUFFER_SIZE)
+__KS_GETUNTIL(gzread, BUFFER_SIZE)
+__KSEQ_READ
+
static const float default_prior = 0.05;
#ifndef PROGRAM_NAME
@@ -240,17 +243,10 @@ int main(int argc, char *argv[]) {
/* Best match was classified as contaminated, print trimmed
results and match entry (if specified). */
contaminated++;
- if (add_tag) {
- fprintf(output_fp,
- "@%s%s-%d\n%.*s\n+%s%s-%d\n%.*s\n", seq->name.s, tag, best_match->n,
- (int) seq->seq.l-best_match->n, seq->seq.s, seq->name.s, tag, best_match->n,
- (int) seq->seq.l-best_match->n, seq->qual.s);
- } else {
- fprintf(output_fp,
- "@%s\n%.*s\n+%s\n%.*s\n", seq->name.s,
- (int) seq->seq.l-best_match->n, seq->seq.s, seq->name.s,
- (int) seq->seq.l-best_match->n, seq->qual.s);
- }
+
+ /* Write out the trimmed sequence, with optional tag added */
+ write_fastq(output_fp, seq, add_tag, tag, best_match->n);
+
/* print match for 5'-end; seq->seq.s is point to the
approperiate place by taking the sequence length and
subtracting the match length. */
@@ -275,9 +271,10 @@ int main(int argc, char *argv[]) {
aa->adapters[best_match->adapter_index].occurrences[best_match->n-1]++;
} else {
- /* No trimming done; print sequence as is. */
- fprintf(output_fp,
- "@%s\n%s\n+%s\n%s\n", seq->name.s, seq->seq.s, seq->name.s, seq->qual.s);
+ /* No trimming done; print sequence as is; no tagging because
+ it's not trimmed.
+ */
+ write_fastq(output_fp, seq, 0, NULL, -1);
}
if (best_match)
View
22 src/scythe.h
@@ -2,7 +2,26 @@
#define SCYTHE_H
#include <zlib.h>
-
+#include "kseq.h"
+
+/* KSEQ_INIT() cannot be called here, because we only need the types
+ defined. Calling KSEQ_INIT() would also define functions, leading
+ to an unused function warning with GCC. So, the basic typedefs
+ kseq.h has are included here, and each file that reads needs:
+
+ This was a fix also used in Nik Joshi's Sickle (which I also helped
+ with) and is the only way I know of dealing with this.
+
+ __KS_GETC(gzread, BUFFER_SIZE)
+ __KS_GETUNTIL(gzread, BUFFER_SIZE)
+ __KSEQ_READ
+*/
+
+#define BUFFER_SIZE 4096
+__KS_TYPE(gzFile)
+__KS_BASIC(gzFile, BUFFER_SIZE)
+__KSEQ_TYPE(gzFile)
+__KSEQ_BASIC(gzFile)
#define MAX_ADAPTERS 20
#define MATCH_SCORE 1
@@ -87,6 +106,7 @@ void print_int_array(const int *array, int n);
void print_uint_array(const unsigned int *array, int n);
void fprint_uint_array(FILE *fp, const unsigned int *array, int n);
int sum(const int *x, int n);
+void write_fastq(gzFile output_fp, kseq_t *seq, int add_tag, char *tag, int match_n);
/* match.c prototypes */
int *score_sequence(const char *seqa, const char *seqb, int n);
View
52 src/util.c
@@ -14,7 +14,10 @@
#define MIN(a,b) ((a)>(b)?(b):(a))
-KSEQ_INIT(gzFile, gzread)
+__KS_GETC(gzread, BUFFER_SIZE)
+__KS_GETUNTIL(gzread, BUFFER_SIZE)
+__KSEQ_READ
+
void *xmalloc(size_t size) {
void *ret = malloc(size);
@@ -169,3 +172,50 @@ int sum(const int *x, int n) {
s += x[i];
return s;
}
+
+void write_fastq(gzFile output_fp, kseq_t *seq, int add_tag, char *tag, int match_n) {
+ /* Heng Li's kseq.h handles FASTQ headers as such: anything after
+ the first space is put in the field "comment" of the kseq_t
+ struct. This function writes a single FASTQ block and wraps
+ simple fprintf such that we don't have to worry about whether
+ there's a comment or not. This has a variety of arguments for
+ different output options,
+ */
+ if (match_n > 0) {
+ /* If match_n is the number of matches we have to trim by, so
+ output trimmed FASTQ seq (and possible header if add_tag is
+ true. */
+ if (add_tag) {
+ if (seq->comment.s)
+ fprintf(output_fp,
+ "@%s %s%s-%d\n%.*s\n+%s %s%s-%d\n%.*s\n", seq->name.s, seq->comment.s, tag, match_n,
+ (int) seq->seq.l-match_n, seq->seq.s, seq->name.s, seq->comment.s, tag, match_n,
+ (int) seq->seq.l-match_n, seq->qual.s);
+ else
+ fprintf(output_fp,
+ "@%s%s-%d\n%.*s\n+%s%s-%d\n%.*s\n", seq->name.s, tag, match_n,
+ (int) seq->seq.l-match_n, seq->seq.s, seq->name.s, tag, match_n,
+ (int) seq->seq.l-match_n, seq->qual.s);
+
+ } else {
+ if (seq->comment.s)
+ fprintf(output_fp,
+ "@%s %s\n%.*s\n+%s %s\n%.*s\n", seq->name.s, seq->comment.s,
+ (int) seq->seq.l-match_n, seq->seq.s, seq->name.s, seq->comment.s,
+ (int) seq->seq.l-match_n, seq->qual.s);
+ else
+ fprintf(output_fp,
+ "@%s\n%.*s\n+%s\n%.*s\n", seq->name.s,
+ (int) seq->seq.l-match_n, seq->seq.s, seq->name.s,
+ (int) seq->seq.l-match_n, seq->qual.s);
+
+ }
+ } else {
+ if (seq->comment.s)
+ fprintf(output_fp,
+ "@%s %s\n%s\n+%s %s\n%s\n", seq->name.s, seq->comment.s, seq->seq.s, seq->name.s, seq->comment.s, seq->qual.s);
+ else
+ fprintf(output_fp,
+ "@%s\n%s\n+%s\n%s\n", seq->name.s, seq->seq.s, seq->name.s, seq->qual.s);
+ }
+}

0 comments on commit e89d180

Please sign in to comment.
Something went wrong with that request. Please try again.