Skip to content

Commit

Permalink
Merge branch 'master' of https://github.com/walaj/SeqLib into HEAD
Browse files Browse the repository at this point in the history
  • Loading branch information
walaj committed Jan 22, 2019
2 parents 282400e + 648417d commit af5dbb1
Show file tree
Hide file tree
Showing 19 changed files with 482 additions and 107 deletions.
6 changes: 2 additions & 4 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,7 @@
url = https://github.com/jwalabroad/fermi-lite
[submodule "htslib"]
path = htslib
url = https://github.com/walaj/htslib
branch = develop
url = https://github.com/samtools/htslib
[submodule "bwa"]
path = bwa
url = https://github.com/jwalabroad/bwa
branch = Apache2
url = https://github.com/walaj/bwa
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ provide excellent and high quality APIs. SeqLib provides further performance enh
bioinformatics problems.

Some differences:
* SeqLib has ~2-4x faster read/write speed over BamTools and SeqAn, and lower memory footprint.
* SeqLib has ~2-4x faster read/write speed over BamTools and lower memory footprint.
* SeqLib has support for CRAM file
* SeqLib provides in memory access to BWA-MEM, BLAT, chromosome aware interval tree, read correction, and sequence assembly with Fermi.
* SeqAn provide a substantial amount of additional capabilites not in SeqLib, including graph operations and an expanded suite of multi-sequence alignments.
Expand Down
27 changes: 24 additions & 3 deletions SeqLib/BamReader.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,18 @@
#include <cassert>
#include "SeqLib/ReadFilter.h"
#include "SeqLib/BamWalker.h"
#include "SeqLib/ThreadPool.h"

// forward declare this from hts.c
extern "C" {
int hts_useek(htsFile *file, long uoffset, int where);
}

class BamReader;

namespace SeqLib {

class BamReader;

typedef SeqPointer<hts_idx_t> SharedIndex; ///< Shared pointer to the HTSlib index struct

typedef SeqPointer<htsFile> SharedHTSFile; ///< Shared pointer to the HTSlib file pointer
Expand Down Expand Up @@ -47,7 +49,13 @@ namespace SeqLib {
private:

// do the read loading
bool load_read(BamRecord& r);
// the return value here is just passed along from sam_read1
int32_t load_read(BamRecord& r);

void set_pool(ThreadPool t) {
if (t.IsOpen() && fp) // probably dont need this, it can handle null
hts_set_opt(fp.get(), HTS_OPT_THREAD_POOL, &t.p); //t.p is htsThreadPool
}

void reset() {
empty = true;
Expand Down Expand Up @@ -101,7 +109,7 @@ namespace SeqLib {
bool mark_for_closure;

// open the file pointer
bool open_BAM_for_reading();
bool open_BAM_for_reading(SeqLib::ThreadPool t);

// hold the reference for CRAM reading
std::string m_cram_reference;
Expand Down Expand Up @@ -145,6 +153,16 @@ class BamReader {
*/
bool SetRegion(const GenomicRegion& gp);

/** Assign this BamReader a thread pool
*
* The thread pool with stay with this object, but
* will not be created or destroyed. This must be done
* separately, which allows for multiple readers/writers
* to be connected to one thread pool
* @return false if the thread pool has not been opened
*/
bool SetThreadPool(ThreadPool p);

/** Set up multiple regions. Overwrites current regions.
*
* This will set the BAM pointer to the first element of the
Expand Down Expand Up @@ -275,6 +293,9 @@ class BamReader {
// hold the reference for CRAM reading
std::string m_cram_reference;

// for multicore reading/writing
ThreadPool pool;

};


Expand Down
78 changes: 66 additions & 12 deletions SeqLib/BamRecord.h
Original file line number Diff line number Diff line change
Expand Up @@ -332,6 +332,9 @@ class BamRecord {

/** Get the alignment position */
inline int32_t Position() const { return b ? b->core.pos : -1; }

/** Get the alignment position, including soft clips */
int32_t PositionWithSClips() const;

/** Get the alignment position of mate */
inline int32_t MatePosition() const { return b ? b->core.mpos: -1; }
Expand All @@ -352,6 +355,9 @@ class BamRecord {
/** Get the end of the alignment */
int32_t PositionEnd() const;

/** Get the end of the alignment, including soft clips */
int32_t PositionEndWithSClips() const;

/** Get the end of the aligment mate pair */
int32_t PositionEndMate() const;

Expand All @@ -364,6 +370,14 @@ class BamRecord {
/** Get the mapping quality */
inline int32_t MapQuality() const { return b ? b->core.qual : -1; }

/** Set the qc fail flag on/off (true -> on) */
inline void SetQCFail(bool f) {
if (f)
b->core.flag |= BAM_FQCFAIL;
else
b->core.flag &= ~BAM_FQCFAIL;
}

/** Set the mapping quality */
inline void SetMapQuality(int32_t m) { if (b) b->core.qual = m; }

Expand All @@ -376,11 +390,21 @@ class BamRecord {
/** Set the position of the mate read */
inline void SetPositionMate(int32_t i) { b->core.mpos = i; }

/** Set the pair mapped flag on */
inline void SetPairMappedFlag() { b->core.flag |= BAM_FPAIRED; }
/** Set the pair mapped flag on/off (true -> on) */
inline void SetPairMappedFlag(bool f) {
if (f)
b->core.flag |= BAM_FPAIRED;
else
b->core.flag &= ~BAM_FPAIRED;
}

/** Set the mate reverse flag on */
inline void SetMateReverseFlag() { b->core.flag |= BAM_FMREVERSE; }
/** Set the mate reverse flag on/off (true -> on) */
inline void SetMateReverseFlag(bool f) {
if (f)
b->core.flag |= BAM_FMREVERSE;
else
b->core.flag &= ~BAM_FMREVERSE;
}

/** Get the number of cigar fields */
inline int32_t CigarSize() const { return b ? b->core.n_cigar : -1; }
Expand Down Expand Up @@ -523,7 +547,7 @@ class BamRecord {
Cigar GetCigar() const {
uint32_t* c = bam_get_cigar(b);
Cigar cig;
for (int k = 0; k < b->core.n_cigar; ++k) {
for (size_t k = 0; k < b->core.n_cigar; ++k) {
cig.add(CigarField(c[k]));
}
return cig;
Expand Down Expand Up @@ -596,7 +620,7 @@ class BamRecord {
inline int32_t AlignmentEndPositionReverse() const {
uint32_t* c = bam_get_cigar(b);
int32_t p = 0;
for (int32_t i = 0; i < b->core.n_cigar; ++i) { // loop from the end
for (size_t i = 0; i < b->core.n_cigar; ++i) { // loop from the end
if ( (bam_cigar_opchr(c[i]) == 'S') || (bam_cigar_opchr(c[i]) == 'H'))
p += bam_cigar_oplen(c[i]);
else // not a clip, so stop counting
Expand All @@ -611,7 +635,7 @@ class BamRecord {
inline int32_t AlignmentPosition() const {
uint32_t* c = bam_get_cigar(b);
int32_t p = 0;
for (int32_t i = 0; i < b->core.n_cigar; ++i) {
for (size_t i = 0; i < b->core.n_cigar; ++i) {
if ( (bam_cigar_opchr(c[i]) == 'S') || (bam_cigar_opchr(c[i]) == 'H'))
p += bam_cigar_oplen(c[i]);
else // not a clip, so stop counting
Expand All @@ -638,7 +662,7 @@ class BamRecord {
inline int32_t NumSoftClip() const {
int32_t p = 0;
uint32_t* c = bam_get_cigar(b);
for (int32_t i = 0; i < b->core.n_cigar; ++i)
for (size_t i = 0; i < b->core.n_cigar; ++i)
if (bam_cigar_opchr(c[i]) == 'S')
p += bam_cigar_oplen(c[i]);
return p;
Expand All @@ -648,7 +672,7 @@ class BamRecord {
inline int32_t NumHardClip() const {
int32_t p = 0;
uint32_t* c = bam_get_cigar(b);
for (int32_t i = 0; i < b->core.n_cigar; ++i)
for (size_t i = 0; i < b->core.n_cigar; ++i)
if (bam_cigar_opchr(c[i]) == 'H')
p += bam_cigar_oplen(c[i]);
return p;
Expand All @@ -659,7 +683,7 @@ class BamRecord {
inline int32_t NumClip() const {
int32_t p = 0;
uint32_t* c = bam_get_cigar(b);
for (int32_t i = 0; i < b->core.n_cigar; ++i)
for (size_t i = 0; i < b->core.n_cigar; ++i)
if ( (bam_cigar_opchr(c[i]) == 'S') || (bam_cigar_opchr(c[i]) == 'H') )
p += bam_cigar_oplen(c[i]);
return p;
Expand All @@ -672,6 +696,13 @@ class BamRecord {
*/
bool GetZTag(const std::string& tag, std::string& s) const;

/** Get a string of either Z, f or i type. Useful if tag type not known at compile time.
* @param tag Name of the tag. eg "XP"
* @param s The string to be filled in with the tag information
* @return Returns true if the tag is present and is either Z or i, even if empty. Return false if no tag or not Z or i.
*/
bool GetTag(const std::string& tag, std::string& s) const;

/** Get a vector of type int from a Z tag delimited by "^"
* Smart-tags allow one to store vectors of strings, ints or doubles in the alignment tags, and
* do not require an additional data structure on top of bseq1_t.
Expand Down Expand Up @@ -708,6 +739,29 @@ class BamRecord {
if (!p)
return false;
t = bam_aux2i(p);
int type = *p++;
if (!(type == 'i' || type == 'C' || type=='S' || type=='s' || type =='I' || type=='c'))
return false;

return true;
}

/** Get a float (f) tag
* @param tag Name of the tag. eg "AS"
* @param t Value to be filled in with the tag value.
* @return Return true if the tag exists.
*/
inline bool GetFloatTag(const std::string& tag, float& t) const {
uint8_t* p = bam_aux_get(b.get(),tag.c_str());
if (!p)
return false;

t = bam_aux2f(p);
int type = *p;
type = *p++;
if (!(type == 'f' || type == 'd'))
return false;

return true;
}

Expand Down Expand Up @@ -744,7 +798,7 @@ class BamRecord {
inline std::string CigarString() const {
std::stringstream cig;
uint32_t* c = bam_get_cigar(b);
for (int k = 0; k < b->core.n_cigar; ++k)
for (size_t k = 0; k < b->core.n_cigar; ++k)
cig << bam_cigar_oplen(c[k]) << "MIDNSHP=XB"[c[k]&BAM_CIGAR_MASK];
return cig.str();
}
Expand Down Expand Up @@ -772,7 +826,7 @@ class BamRecord {
if (b->core.tid < 0)
return std::string();

if (h.isEmpty())
if (!h.isEmpty())
return h.IDtoName(b->core.tid);

// c++98
Expand Down
17 changes: 14 additions & 3 deletions SeqLib/BamWriter.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

#include <cassert>
#include "SeqLib/BamRecord.h"
#include "SeqLib/ThreadPool.h"

namespace SeqLib {

Expand Down Expand Up @@ -37,6 +38,16 @@ class BamWriter {
*/
bool WriteHeader() const;

/** Assign this BamWriter a thread pool
*
* The thread pool with stay with this object, but
* will not be created or destroyed. This must be done
* separately, which allows for multiple readers/writers
* to be connected to one thread pool
* @return false if the thread pool has not been opened
*/
bool SetThreadPool(ThreadPool p);

/** Provide a header to this writer
* @param h Header for this writer. Copies contents
*/
Expand Down Expand Up @@ -93,9 +104,6 @@ class BamWriter {
// path to output file
std::string m_out;

// open m_out, true if success
void open_BAM_for_writing();

// output format
std::string output_format;

Expand All @@ -104,6 +112,9 @@ class BamWriter {

// header
SeqLib::BamHeader hdr;

// for multicore reading/writing
ThreadPool pool;

};

Expand Down
5 changes: 4 additions & 1 deletion SeqLib/FastqReader.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,10 @@ class FastqReader {
public:

/** Construct an empty FASTQ/FASTA reader */
FastqReader() {}
FastqReader() {
seq = NULL;
fp = NULL;
}

/** Construct a reader and open a FASTQ/FASTA reader
* @param file Path to a FASTQ or FASTA file
Expand Down
17 changes: 12 additions & 5 deletions SeqLib/GenomicRegionCollection.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
#include <algorithm>
#include <zlib.h>

#define GZBUFFER 4096
#define GZBUFFER 65472

//#define DEBUG_OVERLAPS 1

Expand Down Expand Up @@ -124,7 +124,7 @@ bool GenomicRegionCollection<T>::ReadBED(const std::string & file, const BamHead

gzFile fp = NULL;
fp = strcmp(file.c_str(), "-")? gzopen(file.c_str(), "r") : gzdopen(fileno(stdin), "r");

if (file.empty() || !fp) {
std::cerr << "BED file not readable: " << file << std::endl;
return false;
Expand Down Expand Up @@ -215,12 +215,19 @@ bool GenomicRegionCollection<T>::ReadVCF(const std::string & file, const BamHead
std::string val;
if (line.empty() || line.at(0) == '#')
continue;

// read first two columnes
iss_line >> chr >> pos;

// construct the GenomicRegion
T gr(chr, pos, pos, hdr);
T gr;
try {
gr = T(chr, pos, pos, hdr);
} catch (...) {
std::cerr << "...Could not parse pos: " << pos << std::endl << std::endl
<< "...on line " << line << std::endl;

}
if (gr.chr >= 0)
m_grv->push_back(gr);
}
Expand Down
5 changes: 5 additions & 0 deletions SeqLib/ReadFilter.h
Original file line number Diff line number Diff line change
Expand Up @@ -549,6 +549,11 @@ class ReadFilterCollection {
return num;
}

/** Check that the filter collection includes something.
* If not, then give it a universal includer
*/
void CheckHasIncluder();

// Return the a tab-delimited tally of which filters were satisfied.
// Includes the header:
// total_seen_count total_passed_count region region_passed_count rule rule_passed_count
Expand Down
Loading

0 comments on commit af5dbb1

Please sign in to comment.