Skip to content
Browse files

initial new matching prototype

  • Loading branch information...
1 parent 872a54c commit b24583cfb1257903384c0db7b2f7e9c13cfba9a4 @vsbuffalo committed Dec 18, 2012
Showing with 471 additions and 521 deletions.
  1. +8 −11 Makefile
  2. +1 −3 illumina_adapters.fa
  3. +141 −194 paper/scythe-paper.tex
  4. +175 −163 src/kseq.h
  5. +61 −60 src/match.c
  6. +8 −5 src/prob.c
  7. +13 −49 src/scythe.c
  8. +22 −25 src/scythe.h
  9. +42 −11 src/util.c
View
19 Makefile
@@ -1,14 +1,14 @@
PROGRAM_NAME = scythe
-VERSION = 0.981
-CC = gcc
-CFLAGS = -Wall -pedantic -DVERSION=$(VERSION) -std=gnu99
-DEBUG = -g
+VERSION = 0.99
+CC = clang
+CFLAGS = -Wall -pedantic -DVERSION=$(VERSION) -std=gnu99 -g
OPT = -O3
ARCHIVE = $(PROGRAM_NAME)_$(VERSION)
LDFLAGS = -lz -lm
+LDTESTFLAGS = -lcheck
SDIR = src
-.PHONY: clean default build distclean dist debug
+.PHONY: clean default build distclean dist
default: build
@@ -20,14 +20,14 @@ util.o: $(SDIR)/util.c $(SDIR)/kseq.h $(SDIR)/scythe.h
$(CC) $(CFLAGS) -c $(SDIR)/$*.c
prob.o: $(SDIR)/prob.c $(SDIR)/scythe.h
$(CC) $(CFLAGS) -c $(SDIR)/$*.c
-tests.o: $(SDIR)/tests/tests.c $(SDIR)/scythe.h
+test.o: $(SDIR)/match.c $(SDIR)/scythe.h $(SDIR)/prob.c
$(CC) $(CFLAGS) -c $(SDIR)/$*.c
valgrind: build
valgrind --leak-check=full --show-reachable=yes ./scythe -a solexa_adapters.fa test.fastq
-test: match.o util.o prob.o tests.o
- $(CC) $(CFLAGS) $(LDFLAGS) $? -o tests && ./tests
+test: clean match.o util.o prob.o test.o
+ $(CC) $(CFLAGS) $(LDFLAGS) $(LDTESTFLAGS) $? -o test && ./test
testclean:
rm -rf ./tests
@@ -43,6 +43,3 @@ dist:
build: match.o scythe.o util.o prob.o
$(CC) $(CFLAGS) $(LDFLAGS) $? -o scythe
-
-debug:
- $(CC) $(LDFLAGS) $(DEBUG) -o scythe src/*.c
View
4 illumina_adapters.fa
@@ -1,4 +1,2 @@
>Solexa_forward_contam
-[MMMMMM]AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATTAAAAA
->Solexa_reverse_contam
-[NNNNNN]AGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCGATCTCGTATGCCGTCTTCTGCTTGAAAAA
+AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC
View
335 paper/scythe-paper.tex
@@ -1,93 +1,121 @@
-\documentclass{bioinfo}
-\copyrightyear{2012}
-\pubyear{2012}
+% Created 2012-11-30 Fri 14:43
+\documentclass[11pt]{article}
+\usepackage[utf8]{inputenc}
+\usepackage[T1]{fontenc}
+\usepackage{fixltx2e}
+\usepackage{graphicx}
+\usepackage{longtable}
+\usepackage{float}
+\usepackage{wrapfig}
+\usepackage{soul}
+\usepackage{textcomp}
+\usepackage{marvosym}
+\usepackage{wasysym}
+\usepackage{latexsym}
+\usepackage{amssymb}
+\usepackage{hyperref}
+\tolerance=1000
+\providecommand{\alert}[1]{\textbf{#1}}
+
+\title{Scythe: A tool for removing 3'-end adapter contaminants using Bayesian classification}
+\author{Vince Buffalo}
+\date{}
+\hypersetup{
+ pdfkeywords={},
+ pdfsubject={},
+ pdfcreator={Emacs Org-mode version 7.8.11}}
\begin{document}
-\firstpage{1}
-
-\title[Scythe]{Scythe: A tool for removing 3'-end adapter contaminants using Bayesian classification}
-\author[Buffalo \textit{et~al}]{Vince Buffalo\,\footnote{to whom correspondence should be addressed}, Joseph Fass\, and Dawei Lin}
-\address{Bioinformatics Core, UC Davis Genome Center}
-
-\history{Received on XXXXX; revised on XXXXX; accepted on XXXXX}
-
-\editor{Associate Editor: XXXXXXX}
\maketitle
-\begin{abstract}
+\setcounter{tocdepth}{3}
+\tableofcontents
+\vspace*{1cm}
-\section{Motivation:}
-Modern sequencing technologies can leave artifactual contaminant
-sequences at the 3'-end of reads. 3'-end regions also have the lowest
-quality bases that are likely to be called incorrectly, which makes
-identifying and removing 3'-end contaminants difficult.
-\section{Results:}
-Scythe is a program designed specifically to remove 3'-end
-contaminants. It searches for 3'-end contaminants and uses a Bayesian
-model that considers individual base qualities to decide whether a
-given match is a contaminant or background sequence. With a variety of
-contamination rates, Scythe outperforms other adapter removal software
-tools.
+\textbf{Motivation:} Modern sequencing technologies can leave artifactual
+ contaminant sequences at the 3'-end of reads. 3'-end regions also
+ have the lowest quality bases that are likely to be called
+ incorrectly, which makes identifying and removing 3'-end contaminants
+ difficult. Fixed-number mismatch approaches to remove contaminants
+ perform poorly in these low quality regions. Failing to remove such
+ contaminants can seriously confound downstream analyses like assembly
+ and mapping.
-\section{Availability:}
-Scythe is freely available under the MIT license at
-\href{http://github.com/vsbuffalo/scythe}{http://github.com/vsbuffalo/scythe}.
-\section{Contact:} \href{mailto:vsbuffalo@gmail.com}{vsbuffalo@gmail.com}
-\end{abstract}
+\textbf{Results:} Scythe is a program designed specifically to remove 3'-end
+ contaminants. It searches for 3'-end contaminants and uses a Bayesian
+ model that considers individual base qualities to decide whether a
+ given match is a contaminant or background sequence. Even for a
+ variety of prior contamination rates, Scythe outperforms other
+ adapter removal software tools.
+
+\textbf{Availability:} Scythe is freely available under the MIT license at
+ \href{https://github.com/vsbuffalo/scythe}{https://github.com/vsbuffalo/scythe}.
\section{Introduction}
-Scythe is focused on trimming 3'-end contaminants, specifically those
-due to adapters or barcodes. It embraces the Unix Philosophy of
-``programs that do one thing well'' (\citealp{raymond2003}). Many
-second-generation sequencing technologies such as Illumina's Genome
-Analyzer II and HiSeq have lower-quality 3'-end bases. These
-low-quality bases are more likely to have nucleotides called
-incorrectly, making contaminant identification more
-difficult. Futhermore, 3'-end quality deterioration is not uniform
-across all reads (see Fig. 1 in Supplementary Materials).
-
-By using a Bayesian classification procedure, Scythe accurately trims
-3'-end contaminants off reads. The underlying model differentially
-weights mismatching bases according to their FASTQ quality.
-
-% A common step in read quality improvement procedures is to remove
-% these low-quality 3'-end sequences from reads. This is thought to
-% increase mapping rates and improve assembly quality. However doing
-% quality-based 3'-end trimming before contaminant removal would remove
-% sequence that could be used (despite being unreliable) to identify the
-% contaminants more accurately. Scythe takes the approach that it is
-% better to use full information, even if it's unreliable. How
-% unreliable a base is is indicated by the FASTQ quality score, which
-% can be incorporated into Scythe's classification procedure.
-
-% Fixed-number of mismatch approaches have the disadvantage that they
-% do not differentially weight a mismatch on a low-quality base from a
-% mismatch on a high-quality base. Futhermore, the fixed-number could
-% easily be exhausted in a run of bad bases (which are quite common in
-% the 3'-end), even though every good-quality base perfectly matches the
-% contaminant sequence.
-
-
-\begin{methods}
-\section{String Matching in Scythe}
-
-Scythe employs a simple string matching heuristic to find a best
-match. Scythe scores each alignment of the contaminant sequence
-against the read sequence, starting from the entire contaminant in the
-3'-end to incrementally fewer bases. A minimum match length can be
-specified via a parameter (by default, 5). Each of these alignments is
-scored using a $1$ for match, $-1$ for mismatch approach. The top
-scoring alignment is then passed to the probabilistic classification
-procedure, which decides whether the match is background sequence or a
-contaminant. The time complexity of Scythe's matching algorithm for a
-single adapter of length $l_a$ is $O(l_a^2 R)$ for a FASTQ file with
-$R$ entries.
-
-
-\section{Bayesian Classification of Top-Scoring Matches}
+\label{sec-1}
+
+
+Scythe focuses on 3'-end contaminants, specifically those due to
+adapters or barcodes. It embraces the Unix Philosophy of ``programs
+that do one thing well''
+(\href{http://www.faqs.org/docs/artu/ch01s06.html}{http://www.faqs.org/docs/artu/ch01s06.html}). Many second-generation
+sequencing technologies such as Illumina's Genome Analyzer II and
+HiSeq have lower-quality 3'-end bases. These low-quality bases are
+more likely to have nucleotides called incorrectly, making contaminant
+identification more difficult. Futhermore, 3'-end quality
+deterioration is not uniform across all reads (see figure 1 in
+Supplementary Materials), there is variation in the quality per base.
+
+A common step in read quality improvement procedures is to remove
+these low-quality 3'-end sequences from reads. This is thought to
+increase mapping rates and improve assembly quality. However doing
+quality-based 3'-end trimming before contaminant removal would remove
+sequence that could be used (despite being unreliable) to identify the
+contaminants more positively. Scythe takes the approach that it is
+better to use full information, even if it's unreliable. How
+unreliable a base is is indicated by the FASTQ quality score, which
+can be incorporated into classification procedures.
+
+Fixed-number of mismatch approaches have the disadvantage that they
+don't differentially weight a mismatch on a low-quality base from a
+mismatch on a high-quality base. Futhermore, the fixed-number could
+easily be exhausted in a run of bad bases (which are quite common in
+the 3'-end), even though every good-quality base perfectly matches the
+contaminant sequence.
+\section{Scythe's Methods}
+\label{sec-2}
+
+
+Scythe uses Bayesian methods to identify and remove a given set of
+3'-end contaminants (most often sequencing adapters). Scythe only
+checks for the adapters in the 3'-end; contaminants further towards
+the middle and 5'-ends often have high quality bases, so identifying
+and removing them is much simpler. Scythe makes some assumptions: see
+the Supplementary Materials. TODOREF
+\subsection{String matching in Scythe}
+\label{sec-2-1}
+
+
+For each adapter in the file, Scythe looks for the best match in
+terms of scores. A nucleotide match is scored as a 1, and a mismatch
+is scored as a -1. Because Scythe doesn't address contaminants with
+insertions or deletions, it doesn't use a standard alignment strategy
+(i.e. dynamic programming). Instead, it considers every possible
+substring of contamination: for a contaminant of length $l_c$, it
+scores all possible substrings from the 5'-end of the contaminant. The
+top scoring match is then used in the classification procedure. For
+efficiency, other matches are not used in the classification
+procedure.\footnote{This option may be added to further Scythe versions.
+ }
+
+The time complexity of Scythe's match algorithm for a single adapter
+of length is $l_a$ \$O(l\_{}a\^{}2 * R)\$ for a FASTQ file with $R$ entries.
+\subsection{Bayesian classification of top-scoring matches}
+\label{sec-2-2}
+
There are two mutually exclusive and exhaustive events: a top scoring
match is a contaminant or it is random sequence (that happens to be
@@ -99,135 +127,54 @@ \section{Bayesian Classification of Top-Scoring Matches}
the top-scoring match is contaminant sequence (event $C$), the
likelihood $P(S | C)$ (where $S$ is the sequence match data) is:
-$$ P(S | C) = \prod_{i=1}^{l_t} q_i^{m_i} \cdot (1-q_i)^{1 - m_i} $$
+$$ P(S | C) = \prod_i^{l_t} q_i^{m_i} \cdot (1-q_i)^{1 - m_i} $$
-Where $m_i \in \{0, 1\}$ indicating whether the $i$ base is a mismatch
-or match (respectively) and $q_i$ is the probability the $i$ base is
-called correctly (from the ASCII-encoded quality). $l_t$ is the length
-of the top-scoring match.
+Where $m_i \in \{0, 1\}$ indicating whether the $i$ base is a
+mismatch or match (respectively) and $q_i$ is the probability the $i$
+base is called correctly (from the ASCII-encoded quality). $l_t$ is
+the length of the top-scoring match.
If the top-scoring sequence is not a contaminant sequence (event
$C'$), it is assumed that the matches are just by chance. Thus, the
likelihood function is:
-$$ P(S | C') = \prod_{i=1}^{l_t} \left(\frac{1}{4}\right)^{m_i} \cdot \left(\frac{3}{4}\right)^{1 - m_i} $$
+$$ P(S | C') = \prod_i^{l_t} \left(\frac{1}{4}\right)^{m_i} \cdot \left(\frac{3}{4}\right)^{1 - m_i} $$
These likelihoods can then be combined using Bayes' Thereom to give
-the probability of contamination given the top-scoring match:
+the probability of contamination given then top-scoring match:
$$ P(C|S) = \frac{P(C) P(S|C)}{P(S)} $$
-Since these are mutually exclusive and exhaustive events, the
-\emph{maximum a posteriori} rule can be used to classify a top-scoring
-sequence as either contaminated or non-contaminated (e.g. if $P(C'|S)
-> P(C|S)$, the sequence is not contaminated and visa-versa).
+Where the denominator can be replaced with $P(S) = P(S | C)P(C) +
+P(S | C') P(C')$ by the Law of Total Probability. $P(C'|S)$ is
+calculated similarly. Since these are mutually exclusive and
+exhaustive events, the \emph{maximum a posteriori} rule can be used to
+classify a top-scoring sequence as either contaminated or
+non-contaminated (e.g. if $P(C'|S) > P(C|S)$, the sequence is not
+contaminated and visa-versa).
Required in this Bayesian formulation is the prior of contamination,
-$P(C)$. Specifying the prior may seem like a nuisance, but it is a
-useful parameter that can be adjusted for a variety of different
-contamination scenarios. More information on specifying a parameter is
-in Supplementary Materials.
-
+$P(C)$. Specifying the prior may seem like a nuisance, but it allows
+ultimately allows for more accurate classification in a variety of
+different contamination scenarios. The prior doesn't need to be
+estimated increadibly well; one technique that works well is to view
+the FASTQ file in the Unix command line tool less and search for the
+5'-end bases of the adapter contaminant. The number of results on a
+page of less output divided by the number of FASTQ entries on that
+page works well as an initial guess for the prior.
\section{Results}
-\begin{centering}
-\begin{figure*}[!tpb]
-\includegraphics[angle=-90]{graphics/roc-and-incorrect-trimmed.eps}
-\caption{ROC curve showing Scythe's higher rate of true positives for
- a given false positive rate and a bar chart indicating Scythe's
- fewer incorrectly trimmed reads.}\label{fig:02}
-\end{figure*}
-\end{centering}
-
-Scythe was tested against two similar program: Btrim
-(\citealp{pmid21651976}) and Cutadapt (\citealp{EJ200}). Both
-Btrim and Cutadapt can handle insertions/deletions, but in Illumina
-data we've analzed, no indels in adapter contaminants were
-witnessed. This problem is more associated with 454 reads, which can
-have homopolymer repeats. Both Btrim and Cutadapt also include quality
-trimmers, which Scythe intentionally does not.
-
-To test Scythe, Btrim, and Cutadapt at 3'-end adapter contaminant
-removal, random reads were generated and contaminated at fixed
-contamination rates of 40\% and 70\%. FASTQ qualities from an Illumina
-HiSeq run were added to these read sequences. Real base qualities were
-used because properly modeling and simulating 3'-end quality
-degradation is an arduous task. For the two fixed contamination rates,
-ten replicate FASTQ files were generated randomly to ensure
-contaminated and non-contaminated reads were well dispersed across
-different read qualities. Each simulated FASTQ reads file contains
-10,000 entries.
-
-Each program was run with varying parameters to see how true positive
-and false positive rates change. Btrim uses a fixed number of
-mismatches, so integer values from 0 to 10 were used. Cutadapt uses an
-error rate for a matched sequence, which was varied from 0 to 0.95 in
-0.05 increments. Likewise, Scythe uses the same values for its prior
-parameter. Each piece of software was run with all other options off
-to ensure a fair comparison of 3'-end contaminant removal.
-
-While Cutadapt and Scythe only trim reads, Btrim occasionally removes
-a read entirely from the sample. For comparison purposes, we count
-this as trimming the entire length of a read. We compared the length
-of the original simulated read to the trimmed read for all programs,
-parameters, replicates, and contamination rates. This, combined with
-information about whether a read was contaminated allows the
-calculation of true positive and false positive rates. Also, we
-investigate how many times Btrim, Cutadapt, and Scythe incorrectly
-trim a read, e.g. whether the trimmed length is not equal to the
-contaminate length for reads that were contaminated.
-
-In these tests, we find that Scythe outperforms Btrim and Cutadapt in
-terms of true positive rates for a variety of false positive rates
-across a variety of parameters (Fig. \ref{fig:02}). Furthermore,
-Scythe also has fewer incorrectly trimmed reads across a variety of
-parameters compared to Btrim and Cutadapt. Differences in
-contamination rates (40\% and 70\%) do not adversely affect Scythe's
-performance. Note that Cutadapt performs well too, but is very
-sensitive to its error parameter. Changing Cutadapt's error parameter
-from 35\% to 40\% slightly increases the true positive rate, but
-drastically increases the false positive rate. By contrast, Scythe's
-probabilistic approach is much less sensitive to different prior
-parameters, illustrated by the overplotting of the 35\% to 95\% prior
-parameters in the ROC curve (Fig. \ref{fig:02}). Furthermore, we argue
-that Scythe's parameter is easier to intuitively choose than a
-sequence error rate.
-
-\end{methods}
-
-\section{Conclusion}
-
-When compared to Btrim and Cutadapt, Scythe is a very accurate 3'-end
-contaminant trimmer appropriate for a variety of pre-analysis quality
-pipelines. It is a modular and orthogonal program, allowing use with
-other read processing tools and diagnostics between each step. We
-suggest that Scythe be used for all Illumina read processing
-pipelines.
-
-We suspect Scythe's superior performance is due to its (1)
-consideration of base-level qualities and (2) Scythe's use of Bayesian
-model rather than a simple fixed-number of mismatches or fixed-error
-approach.
-
-
-\section*{Acknowledgement}
-Monica Britton and Nikhil Joshi for helpful feedback about bugs in
-Scythe. Xiran Dong was very helpful in early testing of Scythe against
-other trimmers.
-
-%\paragraph{Funding\textcolon} None.
-
-
-\bibliographystyle{natbib}
-% \bibliographystyle{achemnat}
-% \bibliographystyle{plainnat}
-% \bibliographystyle{abbrv}
-% \bibliographystyle{bioinformatics}
-\nocite{qrqc}
-\nocite{ShortRead}
-\nocite{Biostrings}
-\nocite{ggplot2}
+\label{sec-3}
-\bibliography{references}
+Scythe was tested against two similar program: Btrim (Kong, 2011) and
+Cutadapt (Martin, 2011). Btrim
+
+Cutadapt has an advantage over Scythe in that it does gapped
+alignments (originally it was developed to trim 454 sequences which
+have homopolymer repeats).
+
+
+\bibliographystyle{plain}
+\bibliography{references}
-\end{document}
+\end{document}
View
338 src/kseq.h
@@ -1,6 +1,6 @@
/* The MIT License
- Copyright (c) 2008 Genome Research Ltd (GRL).
+ Copyright (c) 2008, 2009, 2011 Attractive Chaos <attractor@live.co.uk>
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
@@ -23,9 +23,7 @@
SOFTWARE.
*/
-/* Contact: Heng Li <lh3@sanger.ac.uk> */
-
-/* Last Modified: 12APR2009 */
+/* Last Modified: 05MAR2012 */
#ifndef AC_KSEQ_H
#define AC_KSEQ_H
@@ -34,47 +32,48 @@
#include <string.h>
#include <stdlib.h>
-#define KS_SEP_SPACE 0 /* isspace(): \t, \n, \v, \f, \r */
-#define KS_SEP_TAB 1 /* isspace() && !' ' */
-#define KS_SEP_MAX 1
+#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r
+#define KS_SEP_TAB 1 // isspace() && !' '
+#define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows)
+#define KS_SEP_MAX 2
-#define __KS_TYPE(type_t) \
- typedef struct __kstream_t { \
- char *buf; \
- int begin, end, is_eof; \
- type_t f; \
- } kstream_t;
+#define __KS_TYPE(type_t) \
+ typedef struct __kstream_t { \
+ unsigned char *buf; \
+ int begin, end, is_eof; \
+ type_t f; \
+ } kstream_t;
#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end)
#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0)
-#define __KS_BASIC(type_t, __bufsize) \
- static inline kstream_t *ks_init(type_t f) \
- { \
- kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \
- ks->f = f; \
- ks->buf = (char*)malloc(__bufsize); \
- return ks; \
- } \
- static inline void ks_destroy(kstream_t *ks) \
- { \
- if (ks) { \
- free(ks->buf); \
- free(ks); \
- } \
- }
-
-#define __KS_GETC(__read, __bufsize) \
- static inline int ks_getc(kstream_t *ks) \
- { \
- if (ks->is_eof && ks->begin >= ks->end) return -1; \
- if (ks->begin >= ks->end) { \
- ks->begin = 0; \
- ks->end = __read(ks->f, ks->buf, __bufsize); \
- if (ks->end < __bufsize) ks->is_eof = 1; \
- if (ks->end == 0) return -1; \
- } \
- return (int)ks->buf[ks->begin++]; \
+#define __KS_BASIC(type_t, __bufsize) \
+ static inline kstream_t *ks_init(type_t f) \
+ { \
+ kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \
+ ks->f = f; \
+ ks->buf = (unsigned char*)malloc(__bufsize); \
+ return ks; \
+ } \
+ static inline void ks_destroy(kstream_t *ks) \
+ { \
+ if (ks) { \
+ free(ks->buf); \
+ free(ks); \
+ } \
+ }
+
+#define __KS_GETC(__read, __bufsize) \
+ static inline int ks_getc(kstream_t *ks) \
+ { \
+ if (ks->is_eof && ks->begin >= ks->end) return -1; \
+ if (ks->begin >= ks->end) { \
+ ks->begin = 0; \
+ ks->end = __read(ks->f, ks->buf, __bufsize); \
+ if (ks->end < __bufsize) ks->is_eof = 1; \
+ if (ks->end == 0) return -1; \
+ } \
+ return (int)ks->buf[ks->begin++]; \
}
#ifndef KSTRING_T
@@ -89,135 +88,148 @@ typedef struct __kstring_t {
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#endif
-#define __KS_GETUNTIL(__read, __bufsize) \
- static int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
- { \
- if (dret) *dret = 0; \
- str->l = 0; \
- if (ks->begin >= ks->end && ks->is_eof) return -1; \
- for (;;) { \
- int i; \
- if (ks->begin >= ks->end) { \
- if (!ks->is_eof) { \
- ks->begin = 0; \
- ks->end = __read(ks->f, ks->buf, __bufsize); \
- if (ks->end < __bufsize) ks->is_eof = 1; \
- if (ks->end == 0) break; \
- } else break; \
- } \
- if (delimiter > KS_SEP_MAX) { \
- for (i = ks->begin; i < ks->end; ++i) \
- if (ks->buf[i] == delimiter) break; \
- } else if (delimiter == KS_SEP_SPACE) { \
- for (i = ks->begin; i < ks->end; ++i) \
- if (isspace(ks->buf[i])) break; \
- } else if (delimiter == KS_SEP_TAB) { \
- for (i = ks->begin; i < ks->end; ++i) \
- if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
- } else i = 0; /* never come to here! */ \
- if (str->m - str->l < i - ks->begin + 1) { \
- str->m = str->l + (i - ks->begin) + 1; \
- kroundup32(str->m); \
- str->s = (char*)realloc(str->s, str->m); \
- } \
- memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
- str->l = str->l + (i - ks->begin); \
- ks->begin = i + 1; \
- if (i < ks->end) { \
- if (dret) *dret = ks->buf[i]; \
- break; \
- } \
- } \
- if (str->l == 0) { \
- str->m = 1; \
- str->s = (char*)calloc(1, 1); \
- } \
- str->s[str->l] = '\0'; \
- return str->l; \
- }
-
-#define KSTREAM_INIT(type_t, __read, __bufsize) \
- __KS_TYPE(type_t) \
- __KS_BASIC(type_t, __bufsize) \
- __KS_GETC(__read, __bufsize) \
- __KS_GETUNTIL(__read, __bufsize)
-
-#define __KSEQ_BASIC(type_t) \
- static inline kseq_t *kseq_init(type_t fd) \
- { \
- kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \
- s->f = ks_init(fd); \
- return s; \
- } \
- static inline void kseq_rewind(kseq_t *ks) \
- { \
- ks->last_char = 0; \
- ks->f->is_eof = ks->f->begin = ks->f->end = 0; \
- } \
- static inline void kseq_destroy(kseq_t *ks) \
- { \
- if (!ks) return; \
- free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \
- ks_destroy(ks->f); \
- free(ks); \
- }
+#define __KS_GETUNTIL(__read, __bufsize) \
+ static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
+ { \
+ if (dret) *dret = 0; \
+ str->l = append? str->l : 0; \
+ if (ks->begin >= ks->end && ks->is_eof) return -1; \
+ for (;;) { \
+ int i; \
+ if (ks->begin >= ks->end) { \
+ if (!ks->is_eof) { \
+ ks->begin = 0; \
+ ks->end = __read(ks->f, ks->buf, __bufsize); \
+ if (ks->end < __bufsize) ks->is_eof = 1; \
+ if (ks->end == 0) break; \
+ } else break; \
+ } \
+ if (delimiter == KS_SEP_LINE) { \
+ for (i = ks->begin; i < ks->end; ++i) \
+ if (ks->buf[i] == '\n') break; \
+ } else if (delimiter > KS_SEP_MAX) { \
+ for (i = ks->begin; i < ks->end; ++i) \
+ if (ks->buf[i] == delimiter) break; \
+ } else if (delimiter == KS_SEP_SPACE) { \
+ for (i = ks->begin; i < ks->end; ++i) \
+ if (isspace(ks->buf[i])) break; \
+ } else if (delimiter == KS_SEP_TAB) { \
+ for (i = ks->begin; i < ks->end; ++i) \
+ if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
+ } else i = 0; /* never come to here! */ \
+ if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \
+ str->m = str->l + (i - ks->begin) + 1; \
+ kroundup32(str->m); \
+ str->s = (char*)realloc(str->s, str->m); \
+ } \
+ memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
+ str->l = str->l + (i - ks->begin); \
+ ks->begin = i + 1; \
+ if (i < ks->end) { \
+ if (dret) *dret = ks->buf[i]; \
+ break; \
+ } \
+ } \
+ if (str->s == 0) { \
+ str->m = 1; \
+ str->s = (char*)calloc(1, 1); \
+ } else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \
+ str->s[str->l] = '\0'; \
+ return str->l; \
+ } \
+ static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
+ { return ks_getuntil2(ks, delimiter, str, dret, 0); }
+
+#define KSTREAM_INIT(type_t, __read, __bufsize) \
+ __KS_TYPE(type_t) \
+ __KS_BASIC(type_t, __bufsize) \
+ __KS_GETC(__read, __bufsize) \
+ __KS_GETUNTIL(__read, __bufsize)
+
+#define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0)
+
+#define __KSEQ_BASIC(SCOPE, type_t) \
+ SCOPE kseq_t *kseq_init(type_t fd) \
+ { \
+ kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \
+ s->f = ks_init(fd); \
+ return s; \
+ } \
+ SCOPE void kseq_destroy(kseq_t *ks) \
+ { \
+ if (!ks) return; \
+ free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \
+ ks_destroy(ks->f); \
+ free(ks); \
+ }
/* Return value:
>=0 length of the sequence (normal)
-1 end-of-file
-2 truncated quality string
-*/
-#define __KSEQ_READ \
- static int kseq_read(kseq_t *seq) \
- { \
- int c; \
- kstream_t *ks = seq->f; \
- if (seq->last_char == 0) { /* then jump to the next header line */ \
- while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
- if (c == -1) return -1; /* end of file */ \
- seq->last_char = c; \
- } /* the first header char has been read */ \
- seq->comment.l = seq->seq.l = seq->qual.l = 0; \
- if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; \
- if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0); \
- while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
- if (isgraph(c)) { /* printable non-space character */ \
- if (seq->seq.l + 1 >= seq->seq.m) { /* double the memory */ \
- seq->seq.m = seq->seq.l + 2; \
- kroundup32(seq->seq.m); /* rounded to next closest 2^k */ \
- seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
- } \
- seq->seq.s[seq->seq.l++] = (char)c; \
- } \
- } \
- if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
- seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \
- if (c != '+') return seq->seq.l; /* FASTA */ \
- if (seq->qual.m < seq->seq.m) { /* allocate enough memory */ \
- seq->qual.m = seq->seq.m; \
- seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
- } \
- while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
- if (c == -1) return -2; /* we should not stop here */ \
- while ((c = ks_getc(ks)) != -1 && seq->qual.l < seq->seq.l) \
- if (c >= 33 && c <= 127) seq->qual.s[seq->qual.l++] = (unsigned char)c; \
- seq->qual.s[seq->qual.l] = 0; /* null terminated string */ \
- seq->last_char = 0; /* we have not come to the next header line */ \
- if (seq->seq.l != seq->qual.l) return -2; /* qual string is shorter than seq string */ \
- return seq->seq.l; \
- }
+ */
+#define __KSEQ_READ(SCOPE) \
+ SCOPE int kseq_read(kseq_t *seq) \
+ { \
+ int c; \
+ kstream_t *ks = seq->f; \
+ if (seq->last_char == 0) { /* then jump to the next header line */ \
+ while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
+ if (c == -1) return -1; /* end of file */ \
+ seq->last_char = c; \
+ } /* else: the first header char has been read in the previous call */ \
+ seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \
+ if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \
+ if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \
+ if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \
+ seq->seq.m = 256; \
+ seq->seq.s = (char*)malloc(seq->seq.m); \
+ } \
+ while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
+ if (c == '\n') continue; /* skip empty lines */ \
+ seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \
+ ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \
+ } \
+ if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
+ if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \
+ seq->seq.m = seq->seq.l + 2; \
+ kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \
+ seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
+ } \
+ seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \
+ if (c != '+') return seq->seq.l; /* FASTA */ \
+ if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \
+ seq->qual.m = seq->seq.m; \
+ seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
+ } \
+ while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
+ if (c == -1) return -2; /* error: no quality string */ \
+ while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \
+ seq->last_char = 0; /* we have not come to the next header line */ \
+ if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \
+ return seq->seq.l; \
+ }
#define __KSEQ_TYPE(type_t) \
- typedef struct { \
- kstring_t name, comment, seq, qual; \
- int last_char; \
- kstream_t *f; \
- } kseq_t;
-
-#define KSEQ_INIT(type_t, __read) \
- KSTREAM_INIT(type_t, __read, 4096) \
- __KSEQ_TYPE(type_t) \
- __KSEQ_BASIC(type_t) \
- __KSEQ_READ
+ typedef struct { \
+ kstring_t name, comment, seq, qual; \
+ int last_char; \
+ kstream_t *f; \
+ } kseq_t;
+
+#define KSEQ_INIT2(SCOPE, type_t, __read) \
+ KSTREAM_INIT(type_t, __read, 16384) \
+ __KSEQ_TYPE(type_t) \
+ __KSEQ_BASIC(SCOPE, type_t) \
+ __KSEQ_READ(SCOPE)
+
+#define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read)
+
+#define KSEQ_DECLARE(type_t) \
+ __KS_TYPE(type_t) \
+ __KSEQ_TYPE(type_t) \
+ extern kseq_t *kseq_init(type_t fd); \
+ void kseq_destroy(kseq_t *ks); \
+ int kseq_read(kseq_t *seq);
#endif
View
121 src/match.c
@@ -1,102 +1,102 @@
/*
matches.c - sequence match scoring for scythe.
*/
-
+#include <assert.h>
+#include <limits.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
-
#include "scythe.h"
+int *score_sequence(const char *seq, const char *pattern, int n) {
+ /*
+ Score a string using constant match and mismatch scores. Assumes
+ seq is longer than or of equal length to pattern. Only the first
+ n characters of pattern and seq are scored. We explicitly error
+ out of in the case of a null byte.
+
-int *score_sequence(const char *seqa, const char *seqb, int n) {
+ [----------seq-------------------]
+ |----pattern-----|
+
+ */
int i;
int *matches;
- /* printf("find_matches(): %s\n%s\n", seqa, seqb); */
- /* if (strlen(seqa) != strlen(seqb)) */
- /* die("matches() comparing strings of different lengths; unexpected error."); */
+ assert(strlen(seq) >= strlen(pattern));
+ assert(strlen(pattern) <= n);
matches = xmalloc(sizeof(int) * n);
for (i = 0; i < n; i++) {
- if (seqa[i] == seqb[i])
+ assert(seq[i] && pattern[i]); /* no string termination */
+ if (seq[i] == pattern[i])
matches[i] = MATCH_SCORE;
else
matches[i] = MISMATCH_SCORE;
}
return matches;
}
-match *find_best_match(const adapter_array *aa, const char *read,
- float *p_quals, float prior, float p_match, int min) {
- /*
- Take an adapter_array, and check the read against all adapters,
- minding 3' and 5' ends.
+match *find_best_match(const adapter_array *aa, const char *read,
+ float *p_quals, float prior, float p_match, int min_l) {
+ /*
+ Take an adapter array, and check the read against all
+ adapters. Brute force string matching is used. This is to avoid
+ approximate matching algorithms which required an a priori
+ specified number mismatches.
- Returns NULL on no match with score > 0.
*/
- match *best_match;
- int i, l, first=1, rl=strlen(read), *max=NULL, *m=NULL, max_score=0, best_n=0, current_score, best_adapter;
- float *best_p_quals; /* for the subset of read qualities of the matching sequence */
+
+ match *best_match=NULL;
+ int i, shift, max_shift;
+ int *best_arr, best_adapter=0, best_n=0, best_score=INT_MIN;
+ int al, curr_score, *curr_arr;
+ int rl = strlen(read);
+ posterior_set *ps=NULL;
+ float *best_p_quals;
+
for (i = 0; i < aa->n; i++) {
- if (min >= aa->adapters[i].length) {
+ if (min_l >= aa->adapters[i].length) {
fprintf(stderr, "Minimum match length (option -n) greater than or equal to length of adapter.\n");
exit(EXIT_FAILURE);
}
- for (l = (aa->adapters[i]).length; l > min; l--) {
- m = score_sequence(&(read)[rl-l], (aa->adapters[i]).seq, l);
- current_score = sum(m, l);
-
- /* the first sequence comparison is always the max_score */
- if (first) {
- max = m;
- max_score = current_score;
- best_p_quals = &(p_quals)[rl-l];
- best_n = l;
- first = 0;
- best_adapter = i;
- continue;
+ max_shift = rl - min_l;
+ for (shift = 0; shift < max_shift; shift++) {
+ al = min(aa->adapters[i].length, strlen(&(read)[shift]));
+ curr_arr = score_sequence(&(read)[shift], (aa->adapters[i]).seq, al);
+ curr_score = sum(curr_arr, al);
+ if (curr_score > best_score) {
+ best_score = curr_score;
+ best_arr = curr_arr;
+ best_n = shift;
+ best_p_quals = &(p_quals)[shift];
+ free(ps);
+ ps = posterior(best_arr, best_p_quals, prior, 0.25, best_n);
+ if (ps && ps->is_contam) {
+ break;
+ }
}
-
- if (current_score > max_score) {
- if (max)
- free(max); /* free last max array */
- else
- max = xmalloc(l*sizeof(int));
- max = m;
- max_score = current_score;
- best_p_quals = &(p_quals)[rl-l];
- best_n = l;
- best_adapter = i;
- } else {
- free(m);
- }
-
- /* early exit when it's no longer possible to score higher */
- if (max_score >= l-1)
- break;
}
+ if (ps && ps->is_contam)
+ break;
}
-
- if (max_score <= 0) {
- free(max);
+
+ if (best_score <= 0)
return NULL;
- }
- /* save this match */
+
+ /* save this match */
best_match = xmalloc(sizeof(match));
- best_match->match = max;
+ best_match->match = best_arr;
best_match->n = best_n;
- best_match->score = max_score;
+ best_match->ps = ps;
+ best_match->score = best_score;
best_match->adapter_index = best_adapter;
best_match->p_quals = best_p_quals;
best_match->match_pos = xmalloc(best_n*sizeof(int));
for (i = 0; i < best_n; i++) {
if (best_match->match[i] == MATCH_SCORE)
best_match->match_pos[i] = 1;
- else
- best_match->match_pos[i] = 0;
+ else
+ best_match->match_pos[i] = 0;
}
-
- /* calculate posterior qualities */
- best_match->ps = posterior(best_match->match_pos, best_p_quals, prior, 0.25, best_match->n);
return best_match;
}
@@ -110,3 +110,4 @@ void destroy_match(match *m) {
free(m->ps);
free(m);
}
+
View
13 src/prob.c
@@ -59,18 +59,21 @@ double p_data(const int *matches, float *p_quals, float p_prior_contam, float p_
posterior_set *posterior(const int *matches, float *p_quals, float p_prior, float p_match, int n) {
/* The posterior for both the random model and the non-random
model.
+
+ `matches`: a int array of 0s and 1s indicating matching and
+ mismatch positions.
+ `p_quals`: quality values convered to probabilities.
+ `p_prior`: prior probability
+ `p_match`: probability of a match (for DNA: default is 0.25)
+ `n`: match length.
*/
posterior_set *ps = xmalloc(sizeof(posterior_set));
likelihood_set *ls = likelihood(matches, p_quals, p_match, n);
double p_denom = p_data(matches, p_quals, p_prior, p_match, n);
ps->contam = (ls->contam * p_prior)/p_denom;
ps->random = (ls->random * (1-p_prior))/p_denom;
-
- if (ps->contam > ps->random)
- ps->is_contam = CONTAM;
- else
- ps->is_contam = NOT_CONTAM;
+ ps->is_contam = ps->contam > ps->random;
free(ls);
return ps;
}
View
62 src/scythe.c
@@ -14,9 +14,10 @@
#include "scythe.h"
#include "kseq.h"
+__KSEQ_BASIC(static, gzFile)
__KS_GETC(gzread, BUFFER_SIZE)
__KS_GETUNTIL(gzread, BUFFER_SIZE)
-__KSEQ_READ
+__KSEQ_READ(static)
static const float default_prior = 0.05;
@@ -63,6 +64,7 @@ static struct option long_options[] = {
/* Options with an argument */
{"adapter-file", required_argument, 0, 'a'},
{"prior", required_argument, 0, 'p'},
+ {"partial", required_argument, 0, 'P'},
{"quality-type", required_argument, 0, 'q'},
{"matches-file", required_argument, 0, 'm'},
{"output-file", required_argument, 0, 'c'},
@@ -108,7 +110,7 @@ Options:\n", stdout);
int main(int argc, char *argv[]) {
kseq_t *seq;
- int i, l, index, min=5;
+ int l, index, min=5;
int debug=0, verbose=1;
int contaminated=0, total=0;
quality_type qual_type=ILLUMINA;
@@ -117,14 +119,14 @@ int main(int argc, char *argv[]) {
adapter_array *aa;
gzFile adapter_fp=NULL, output_fp=stdout, matches_fp=NULL, fp;
int optc;
- char *match_string;
int add_tag = 0;
- char tag[14] = "";
+ char tag[] = ";;cut_scythe";
+
extern char *optarg;
while (1) {
int option_index = 0;
- optc = getopt_long(argc, argv, "dtp:a:o:q:m:o:n:", long_options, &option_index);
+ optc = getopt_long(argc, argv, "dtfp:a:o:q:m:o:n:", long_options, &option_index);
if (optc == -1)
break;
@@ -143,8 +145,6 @@ int main(int argc, char *argv[]) {
break;
case 't':
add_tag = 1;
- strcpy(tag, ";;cut_scythe");
- printf("%s\n", tag);
break;
case 'Q':
verbose = 0;
@@ -205,7 +205,6 @@ int main(int argc, char *argv[]) {
fprintf(stderr, "No FASTQ file specified.\n");
usage(EXIT_FAILURE);
}
-
/* load all adapter sequences into memory */
if (!adapter_fp) {
@@ -222,13 +221,10 @@ int main(int argc, char *argv[]) {
return EXIT_FAILURE;
}
-
seq = kseq_init(fp);
-
- /* Loop through entire file, find best match against adapters, and
- score these best matches. Write trimmed sequences to file (or
- stdout), and record matches in a match file if specifed.
+ /* Loop through entire sequence file. Write trimmed sequences to
+ file (or stdout), and record matches in a match file if specifed.
*/
while ((l = kseq_read(seq)) >= 0) {
if (!seq->qual.s) {
@@ -240,32 +236,10 @@ int main(int argc, char *argv[]) {
best_match = find_best_match(aa, seq->seq.s, qprobs, prior, 0.25, min);
if (best_match && best_match->ps->is_contam) {
- /* Best match was classified as contaminated, print trimmed
- results and match entry (if specified). */
contaminated++;
-
- /* Write out the trimmed sequence, with optional tag added */
+ /* write the FASTQ entry, and if we have a match file, write that */
write_fastq(output_fp, seq, add_tag, tag, best_match->n);
-
- /* print match for 5'-end; seq->seq.s is point to the
- approperiate place by taking the sequence length and
- subtracting the match length. */
-
- if (matches_fp) {
- /* Make a string that indicates the position of the matches with "|"s. */
- match_string = fmt_matches((aa->adapters[best_match->adapter_index]).seq,
- &(seq->seq.s)[seq->seq.l-best_match->n],
- best_match->match, best_match->n);
-
- fprintf(matches_fp, "p(c|s): %f; p(!c|s): %f; adapter: %s\n%s\n%s\n%s\n",
- best_match->ps->contam, best_match->ps->random,
- aa->adapters[best_match->adapter_index].name,
- seq->name.s, match_string,
- &(seq->qual.s)[seq->qual.l-best_match->n]);
- fprint_float_array(matches_fp, qual_to_probs(&(seq->qual.s)[seq->qual.l-best_match->n], qual_type), best_match->n);
- fprintf(matches_fp, "\n\n");
- free(match_string);
- }
+ if (matches_fp) print_match(seq, best_match, matches_fp, aa, qual_type);
/* record occurence in adapter position */
aa->adapters[best_match->adapter_index].occurrences[best_match->n-1]++;
@@ -283,18 +257,8 @@ int main(int argc, char *argv[]) {
total++;
}
- /* If --quiet not specified, print out a summary at the end. */
- if (verbose) {
- fprintf(stderr, "prior: %0.3f\n", prior);
- fprintf(stderr, "\nAdapter Trimming Complete\ncontaminated: %d, uncontaminated: %d, total: %d\n",
- contaminated, total-contaminated, total);
- fprintf(stderr, "contamination rate: %f", contaminated/(float) total);
- for (i = 0; i < aa->n; i++) {
- fprintf(stderr, "\nAdapter %d '%s' contamination occurences:\n", i+1, aa->adapters[i].name);
- fprint_uint_array(stderr, aa->adapters[i].occurrences, aa->adapters[i].length);
- fprintf(stderr, "\n");
- }
- }
+ if (verbose)
+ print_summary(aa, prior, total-contaminated, contaminated, total);
kseq_destroy(seq);
destroy_adapters(aa, MAX_ADAPTERS);
View
47 src/scythe.h
@@ -21,13 +21,13 @@
__KS_TYPE(gzFile)
__KS_BASIC(gzFile, BUFFER_SIZE)
__KSEQ_TYPE(gzFile)
-__KSEQ_BASIC(gzFile)
#define MAX_ADAPTERS 1000
#define MATCH_SCORE 1
#define MISMATCH_SCORE -1
#define IS_FASTQ(quality_type) INTEGER(quality_type)[0] >= 0
+#define min(a,b) ((a)>(b)?(b):(a))
typedef enum {
PHRED,
@@ -63,18 +63,13 @@ typedef struct adapter_array {
int n;
} adapter_array;
-enum contam {
- NOT_CONTAM,
- CONTAM
-};
-
typedef struct likelihood_set {
double random;
double contam;
} likelihood_set;
typedef struct posterior_set {
- enum contam is_contam;
+ int is_contam;
double random;
double contam;
} posterior_set;
@@ -90,28 +85,30 @@ typedef struct match {
} match;
/* prob.c prototypes */
-float *qual_to_probs(const char *qual, quality_type q_type);
-double p_data(const int *matches, float *p_quals, float p_prior_contam, float p_match, int n);
-posterior_set *posterior(const int *matches, float *p_quals, float p_prior, float p_match, int n);
-likelihood_set *likelihood(const int *matches, float *p_quals, float p_match, int n);
+float *qual_to_probs(const char *, quality_type);
+double p_data(const int *, float *, float, float, int);
+posterior_set *posterior(const int *, float *, float, float, int);
+likelihood_set *likelihood(const int *, float *, float, int);
/* util.c prototypes */
-void *xmalloc(size_t size);
-adapter_array *load_adapters(gzFile fp);
-void destroy_adapters(adapter_array *aa, int n);
-char *fmt_matches(const char *seqa, const char *seqb, const int *matches, const int n);
-void print_float_array(const float *array, int n);
-void fprint_float_array(FILE *fp, const float *array, int n);
-void print_int_array(const int *array, int n);
-void print_uint_array(const unsigned int *array, int n);
-void fprint_uint_array(FILE *fp, const unsigned int *array, int n);
-int sum(const int *x, int n);
-void write_fastq(gzFile output_fp, kseq_t *seq, int add_tag, char *tag, int match_n);
+void *xmalloc(size_t);
+adapter_array *load_adapters(gzFile);
+void destroy_adapters(adapter_array *, int);
+char *fmt_matches(const char *, const char *, const int *, const int);
+void print_float_array(const float *, int);
+void fprint_float_array(FILE *, const float *, int);
+void print_int_array(const int *, int);
+void print_uint_array(const unsigned int *, int);
+void fprint_uint_array(FILE *, const unsigned int *, int);
+int sum(const int *, int);
+void write_fastq(gzFile, kseq_t *, int, char *, int);
+void print_summary(adapter_array *, float, int, int, int);
/* match.c prototypes */
-int *score_sequence(const char *seqa, const char *seqb, int n);
-match *find_best_match(const adapter_array *aa, const char *read, float *p_quals, float prior, float p_match, int min);
-void destroy_match(match *m);
+int *score_sequence(const char *, const char *, int);
+match *find_best_match(const adapter_array *, const char *, float *, float, float, int);
+void print_match(kseq_t *, match *, gzFile, const adapter_array *, quality_type);
+void destroy_match(match *);
#endif /* SCYTHE_H */
View
53 src/util.c
@@ -12,11 +12,10 @@
#include "kseq.h"
#include "scythe.h"
-#define MIN(a,b) ((a)>(b)?(b):(a))
-
+__KSEQ_BASIC(static, gzFile)
__KS_GETC(gzread, BUFFER_SIZE)
__KS_GETUNTIL(gzread, BUFFER_SIZE)
-__KSEQ_READ
+__KSEQ_READ(static)
void *xmalloc(size_t size) {
@@ -189,25 +188,25 @@ void write_fastq(gzFile output_fp, kseq_t *seq, int add_tag, char *tag, int matc
if (seq->comment.s)
fprintf(output_fp,
"@%s %s%s-%d\n%.*s\n+%s %s%s-%d\n%.*s\n", seq->name.s, seq->comment.s, tag, match_n,
- (int) seq->seq.l-match_n, seq->seq.s, seq->name.s, seq->comment.s, tag, match_n,
- (int) seq->seq.l-match_n, seq->qual.s);
+ (int) match_n, seq->seq.s, seq->name.s, seq->comment.s, tag, match_n,
+ (int) match_n, seq->qual.s);
else
fprintf(output_fp,
"@%s%s-%d\n%.*s\n+%s%s-%d\n%.*s\n", seq->name.s, tag, match_n,
- (int) seq->seq.l-match_n, seq->seq.s, seq->name.s, tag, match_n,
- (int) seq->seq.l-match_n, seq->qual.s);
+ (int) match_n, seq->seq.s, seq->name.s, tag, match_n,
+ (int) match_n, seq->qual.s);
} else {
if (seq->comment.s)
fprintf(output_fp,
"@%s %s\n%.*s\n+%s %s\n%.*s\n", seq->name.s, seq->comment.s,
- (int) seq->seq.l-match_n, seq->seq.s, seq->name.s, seq->comment.s,
- (int) seq->seq.l-match_n, seq->qual.s);
+ (int) match_n, seq->seq.s, seq->name.s, seq->comment.s,
+ (int) match_n, seq->qual.s);
else
fprintf(output_fp,
"@%s\n%.*s\n+%s\n%.*s\n", seq->name.s,
- (int) seq->seq.l-match_n, seq->seq.s, seq->name.s,
- (int) seq->seq.l-match_n, seq->qual.s);
+ (int) match_n, seq->seq.s, seq->name.s,
+ (int) match_n, seq->qual.s);
}
} else {
@@ -219,3 +218,35 @@ void write_fastq(gzFile output_fp, kseq_t *seq, int add_tag, char *tag, int matc
"@%s\n%s\n+%s\n%s\n", seq->name.s, seq->seq.s, seq->name.s, seq->qual.s);
}
}
+
+void print_summary(adapter_array *aa, float prior, int uncontaminated,
+ int contaminated, int total) {
+ int i;
+ fprintf(stderr, "prior: %0.3f\n", prior);
+ fprintf(stderr, "\nAdapter Trimming Complete\ncontaminated: %d, uncontaminated: %d, total: %d\n",
+ contaminated, total-contaminated, total);
+ fprintf(stderr, "contamination rate: %f", contaminated/(float) total);
+ for (i = 0; i < aa->n; i++) {
+ fprintf(stderr, "\nAdapter %d '%s' contamination occurences:\n", i+1, aa->adapters[i].name);
+ fprint_uint_array(stderr, aa->adapters[i].occurrences, aa->adapters[i].length);
+ fprintf(stderr, "\n");
+ }
+}
+
+void print_match(kseq_t *seq, match *match, gzFile matches_fp,
+ const adapter_array *aa, quality_type qual_type) {
+ /* Make a string that indicates the position of the matches with "|"s. */
+ char *match_string;
+ match_string = fmt_matches((aa->adapters[match->adapter_index]).seq,
+ &(seq->seq.s)[match->n],
+ match->match, match->n);
+
+ fprintf(matches_fp, "p(c|s): %f; p(!c|s): %f; adapter: %s\n%s\n%s\n%s\n",
+ match->ps->contam, match->ps->random,
+ aa->adapters[match->adapter_index].name,
+ seq->name.s, match_string,
+ &(seq->qual.s)[match->n]);
+ fprint_float_array(matches_fp, qual_to_probs(&(seq->qual.s)[match->n], qual_type), match->n);
+ fprintf(matches_fp, "\n\n");
+ free(match_string);
+}

0 comments on commit b24583c

Please sign in to comment.
Something went wrong with that request. Please try again.