Skip to content

Commit

Permalink
Merge branch 'master' of https://github.com/walaj/SeqLib
Browse files Browse the repository at this point in the history
  • Loading branch information
walaj committed Apr 19, 2017
2 parents fdcdf4c + 770cd10 commit c645c4a
Show file tree
Hide file tree
Showing 18 changed files with 2,413 additions and 4,020 deletions.
2 changes: 1 addition & 1 deletion .travis.scripts/coveralls.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ then
## download the test data
mkdir test_data
cd test_data
wget -r -nH -nd -np -R index.html* https://data.broadinstitute.org/snowman/SeqLibTest/
wget -r -nH -nd -np -R index.html* https://data.broadinstitute.org/snowman/SeqLib/
cd ..

export LD_LIBRARY_PATH=${BOOST_ROOT}/lib:${LD_LIBRARY_PATH}
Expand Down
1 change: 0 additions & 1 deletion Makefile.in
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,6 @@ pdfdir = @pdfdir@
prefix = @prefix@
program_transform_name = @program_transform_name@
psdir = @psdir@
runstatedir = @runstatedir@
sbindir = @sbindir@
sharedstatedir = @sharedstatedir@
srcdir = @srcdir@
Expand Down
21 changes: 11 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -301,21 +301,22 @@ w.Close(); // Optional. Will close on destruction
using namespace SeqLib;
// brv is some set of reads to train the error corrector
b.TrainCorrection(brv);
for (BamRecordVector::const_iterator r = brv.begin(); r != brv.end(); ++r)
b.AddSequence(r->Sequence().c_str(), r->Qualities().c_str(), r->Qname().c_str());
b.Train();
b.clear(); // clear the training sequences. Training parameters saved in BFC object
// brv2 is some set to correct
b.ErrorCorrect(brv2);
for (BamRecordVector::const_iterator r = brv2.begin(); r != brv2.end(); ++r)
b.AddSequence(r->Sequence().c_str(), r->Qualities().c_str(), r->Qname().c_str());
b.ErrorCorrect();
// retrieve the sequences
UnalignedSequenceVector v;
b.GetSequences(v);
// alternatively, to train and correct the same set of reads
b.TrainAndCorrect(brv);
b.GetSequences(v);
std::string name, seq;
while (b.GetSequences(seq, name))
v.push_back({name, seq});
// alternatively, train and correct, and modify the sequence in-place
b.TrainCorrection(brv);
b.ErrorCorrectInPlace(brv);
```

Support
Expand Down
64 changes: 27 additions & 37 deletions SeqLib/BFC.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ namespace SeqLib {
public:
/** Construct a new BFC engine */
BFC() {
m_idx = 0;
bfc_opt_init(&bfc_opt);
ch = NULL;
kmer = 0;
Expand All @@ -43,57 +44,31 @@ namespace SeqLib {
bfc_ch_destroy(ch);
}

/** Allocate a block of memory for the reads if the amount to enter is known
* @note This is not necessary, as reads will dynamically reallocate
*/
bool AllocateMemory(size_t n);

/** Peform BFC error correction on the sequences stored in this object */
bool ErrorCorrect();

/** Train the error corrector using the reads stored in this object */
bool Train();

/** Add a sequence for either training or correction */
bool AddSequence(const BamRecord& r);

/** Add a sequence for either training or correction */
/** Add a sequence for either training or correction
* @param seq A sequence to be copied into this object (A, T, C, G)
*/
bool AddSequence(const char* seq, const char* qual, const char* name);

/** Set the k-mer size */
/** Set the k-mer size for training
* @note zero is auto
*/
void SetKmer(int k) { kmer = k; }

/** Train error correction using sequences from aligned reads */
void TrainCorrection(const BamRecordVector& brv);

/** Train error correction from raw character strings */
void TrainCorrection(const std::vector<char*>& v);

/** Train and error correction on same reads */
void TrainAndCorrect(const BamRecordVector& brv);

/** Error correct a collection of reads */
void ErrorCorrect(const BamRecordVector& brv);

/** Error correct in place, modify sequence, and the clear memory from this object */
void ErrorCorrectInPlace(BamRecordVector& brv);
/** Correct a single new sequence not stored in object
* @param str Sequence of string to correct (ACTG)
* @param q Quality score of sequence to correct
* @value Returns true if corrected */
bool CorrectSequence(std::string& str, const std::string& q);

/** Error correct and add tag with the corrected sequence data, and the clear memory from this object
* @param brv Aligned reads to error correct
* @param tag Tag to assign error corrected sequence to (eg KC)
* @exception Throws an invalid_argument if tag is not length 2
*/
void ErrorCorrectToTag(BamRecordVector& brv, const std::string& tag);

/** Return the reads (error corrected if ran ErrorCorrect) */
void GetSequences(UnalignedSequenceVector& v) const;

/** Clear the stored reads */
void clear();

/** Filter reads with unique k-mers. Do after error correction */
void FilterUnique();

/** Return the calculated kcov */
float GetKCov() const { return kcov; }

Expand All @@ -103,8 +78,21 @@ namespace SeqLib {
/** Return the number of sequences controlled by this */
int NumSequences() const { return n_seqs; }

/** Return the next sequence stored in object
* @param s Empty string to be filled.
* @param q Empty string name to be filled.
* @value True if string was filled with sequence. False if no more sequences.
*/
bool GetSequence(std::string& s, std::string& q);

/** Reset the sequence iterator inside GetSequence
*/
void ResetGetSequence() { m_idx = 0; };

private:

size_t m_idx;

// the amount of memory allocated
size_t m_seqs_size;

Expand Down Expand Up @@ -146,6 +134,8 @@ namespace SeqLib {
// assign names, qualities and seq to m_seqs
void allocate_sequences_from_char(const std::vector<char*>& v);

void allocate_sequences_from_strings(const std::vector<std::string>& v);

// do the actual read correction
void correct_reads();

Expand Down
23 changes: 17 additions & 6 deletions SeqLib/BamRecord.h
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,14 @@ class CigarField {

public:

/** Construct an empty CIGAR */
Cigar() {}

/** Construct from a CIGAR string
* @param cig CIGAR string, e.g. 54M46S
*/
Cigar(const std::string& cig);

typedef std::vector<CigarField>::iterator iterator; ///< Iterator for move between CigarField ops
typedef std::vector<CigarField>::const_iterator const_iterator; ///< Iterator (const) for move between CigarField ops
iterator begin() { return m_data.begin(); } ///< Iterator (aka std::vector<CigarField>.begin()
Expand Down Expand Up @@ -185,11 +193,8 @@ class CigarField {

};

//typedef std::vector<CigarField> Cigar;
typedef SeqHashMap<std::string, size_t> CigarMap;

Cigar cigarFromString(const std::string& cig);

/** Class to store and interact with a SAM alignment record
*
* HTSLibrary reads are stored in the bam1_t struct. Memory allocation
Expand Down Expand Up @@ -345,7 +350,10 @@ class BamRecord {
int32_t CountBWAChimericAlignments() const;

/** Get the end of the alignment */
inline int32_t PositionEnd() const { return b ? bam_endpos(b.get()) : -1; }
int32_t PositionEnd() const;

/** Get the end of the aligment mate pair */
int32_t PositionEndMate() const;

/** Get the chromosome ID of the read */
inline int32_t ChrID() const { return b ? b->core.tid : -1; }
Expand Down Expand Up @@ -414,7 +422,7 @@ class BamRecord {
if (b->core.tid != b->core.mtid || !PairMappedFlag())
return 0;

return std::abs(b->core.pos - b->core.mpos) + Length();
return std::abs(b->core.pos - b->core.mpos) + GetCigar().NumQueryConsumed();

}

Expand Down Expand Up @@ -848,7 +856,10 @@ class BamRecord {
*/
int OverlappingCoverage(const BamRecord& r) const;

private:
/** Return the shared pointer */
SeqPointer<bam1_t> shared_pointer() const { return b; }

protected:

SeqPointer<bam1_t> b; // bam1_t shared pointer

Expand Down
4 changes: 2 additions & 2 deletions SeqLib/SeqLibUtils.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#ifndef SNOWUTILS_H
#define SNOWUTILS_H
#ifndef SEQLIB_UTILS_H
#define SEQLIB_UTILS_H

#include <string>
#include <time.h>
Expand Down
14 changes: 1 addition & 13 deletions configure
Original file line number Diff line number Diff line change
Expand Up @@ -703,7 +703,6 @@ infodir
docdir
oldincludedir
includedir
runstatedir
localstatedir
sharedstatedir
sysconfdir
Expand Down Expand Up @@ -781,7 +780,6 @@ datadir='${datarootdir}'
sysconfdir='${prefix}/etc'
sharedstatedir='${prefix}/com'
localstatedir='${prefix}/var'
runstatedir='${localstatedir}/run'
includedir='${prefix}/include'
oldincludedir='/usr/include'
docdir='${datarootdir}/doc/${PACKAGE_TARNAME}'
Expand Down Expand Up @@ -1034,15 +1032,6 @@ do
| -silent | --silent | --silen | --sile | --sil)
silent=yes ;;

-runstatedir | --runstatedir | --runstatedi | --runstated \
| --runstate | --runstat | --runsta | --runst | --runs \
| --run | --ru | --r)
ac_prev=runstatedir ;;
-runstatedir=* | --runstatedir=* | --runstatedi=* | --runstated=* \
| --runstate=* | --runstat=* | --runsta=* | --runst=* | --runs=* \
| --run=* | --ru=* | --r=*)
runstatedir=$ac_optarg ;;

-sbindir | --sbindir | --sbindi | --sbind | --sbin | --sbi | --sb)
ac_prev=sbindir ;;
-sbindir=* | --sbindir=* | --sbindi=* | --sbind=* | --sbin=* \
Expand Down Expand Up @@ -1180,7 +1169,7 @@ fi
for ac_var in exec_prefix prefix bindir sbindir libexecdir datarootdir \
datadir sysconfdir sharedstatedir localstatedir includedir \
oldincludedir docdir infodir htmldir dvidir pdfdir psdir \
libdir localedir mandir runstatedir
libdir localedir mandir
do
eval ac_val=\$$ac_var
# Remove trailing slashes.
Expand Down Expand Up @@ -1333,7 +1322,6 @@ Fine tuning of the installation directories:
--sysconfdir=DIR read-only single-machine data [PREFIX/etc]
--sharedstatedir=DIR modifiable architecture-independent data [PREFIX/com]
--localstatedir=DIR modifiable single-machine data [PREFIX/var]
--runstatedir=DIR modifiable per-process data [LOCALSTATEDIR/run]
--libdir=DIR object code libraries [EPREFIX/lib]
--includedir=DIR C header files [PREFIX/include]
--oldincludedir=DIR C header files for non-gcc [/usr/include]
Expand Down
2 changes: 1 addition & 1 deletion seq_test/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ seq_test_LDADD = \
../htslib/libhts.a \
-lboost_unit_test_framework -lboost_system -lboost_timer -lboost_chrono

seq_test_LDFLAGS = --coverage
seq_test_LDFLAGS = --coverage -pthread

seq_test_SOURCES = seq_test.cpp \
../src/BFC.cpp ../src/GenomicRegion.cpp \
Expand Down
Loading

0 comments on commit c645c4a

Please sign in to comment.