Skip to content

Commit

Permalink
white-spaced delimited BED/VCF file ok now (not just tab)
Browse files Browse the repository at this point in the history
  • Loading branch information
walaj committed Oct 28, 2016
1 parent 8f755b1 commit dd5804b
Show file tree
Hide file tree
Showing 4 changed files with 225 additions and 410 deletions.
87 changes: 6 additions & 81 deletions SeqLib/GenomicRegionCollection.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -116,53 +116,6 @@ namespace SeqLib {

}

/*template<class T>
bool GenomicRegionCollection<T>::ReadMuTect(const std::string &file, const BamHeader& hdr) {
std::string curr_chr = "dum";
std::ifstream iss(file.c_str());
if (!iss || file.length() == 0) {
std::cerr << "MuTect call-stats file does not exist: " << file << std::endl;
return false;
}
std::string line;
while (std::getline(iss, line, '\n')) {
size_t counter = 0;
std::string chr, pos, judge;
std::istringstream iss_line(line);
std::string val;
if (line.find("KEEP") != std::string::npos) {
while(std::getline(iss_line, val, '\t')) {
switch (counter) {
case 0 : chr = val; break;
case 1 : pos = val; break;
}
if (counter >= 1)
break;
++counter;
if (curr_chr != chr) {
std::cerr << "...reading MuTect call-stats -- chr" << chr << std::endl;
curr_chr = chr;
}
}
// parse the strings and send to genomci region
T gr(chr, pos, pos, hdr);
if (gr.chr >= 0) {
m_grv->push_back(gr);
}
} // end "keep" conditional
} // end main while
return true;
}
*/

template<class T>
bool GenomicRegionCollection<T>::ReadBED(const std::string & file, const BamHeader& hdr) {

Expand Down Expand Up @@ -199,26 +152,17 @@ bool GenomicRegionCollection<T>::ReadBED(const std::string & file, const BamHead
}

// prepare to loop through each field of BED line
size_t counter = 0;
//size_t counter = 0;
std::string chr, pos1, pos2;
std::string line(buffer);
std::istringstream iss_line(line);
std::string val;
if (line.find("#") != std::string::npos)
continue;

// loop BED lines
while(std::getline(iss_line, val, '\t')) {
switch (counter) {
case 0 : chr = val; break;
case 1 : pos1 = val; break;
case 2 : pos2 = val; break;
}
if (counter >= 2)
break;
++counter;
}

// read first three BED columns
iss_line >> chr >> pos1 >> pos2;

// construct the GenomicRegion
T gr(chr, pos1, pos2, hdr);

Expand Down Expand Up @@ -265,24 +209,15 @@ bool GenomicRegionCollection<T>::ReadVCF(const std::string & file, const BamHead
}

// prepare to loop through each field of BED line
size_t counter = 0;
std::string chr, pos;
std::string line(buffer);
std::istringstream iss_line(line);
std::string val;
if (line.empty() || line.at(0) == '#')
continue;

// loop VCF lines
while(std::getline(iss_line, val, '\t')) {
switch (counter) {
case 0 : chr = val; break;
case 1 : pos = val; break;
}
if (counter >= 2)
break;
++counter;
}
// read first two columnes
iss_line >> chr >> pos;

// construct the GenomicRegion
T gr(chr, pos, pos, hdr);
Expand All @@ -300,16 +235,6 @@ GenomicRegionCollection<T>::GenomicRegionCollection(const std::string &file, con

idx = 0;

/*
// get the header line to check format
std::string header;
if (!std::getline(iss, header, '\n')) {
std::cerr << "Region file is empty: " << file << std::endl;
return;
}
iss.close();
*/

// check if it's samtools-style file
if (file.find(":") != std::string::npos) {
m_sorted = true; // only one, so sorted
Expand Down
12 changes: 5 additions & 7 deletions SeqLib/ReadFilter.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ namespace SeqLib {
typedef SeqHashSet<std::string> StringSet;

namespace Filter {

#ifdef HAVE_C11
/** Tool for using the Aho-Corasick method for substring queries of
* using large dictionaries
* @note Trie construction / searching implemented by https://github.com/blockchaindev/aho_corasick
Expand All @@ -35,9 +37,7 @@ namespace SeqLib {

/** Allocate a new empty trie */
AhoCorasick() {
#ifdef HAVE_C11
aho_trie = SeqPointer<aho_corasick::trie>(new aho_corasick::trie());
#endif
inv = false;
count = 0;
}
Expand All @@ -51,9 +51,7 @@ namespace SeqLib {
* O(n) where (n) is length of query string.
*/
void AddMotif(const std::string& m) {
#ifdef HAVE_C11
aho_trie->insert(m);
#endif
}

/** Add a set of motifs to the trie from a file
Expand All @@ -68,9 +66,7 @@ namespace SeqLib {
*/
int QueryText(const std::string& t) const;

#ifdef HAVE_C11
SeqPointer<aho_corasick::trie> aho_trie; ///< The trie for the Aho-Corasick search
#endif

std::string file; ///< Name of the file holding the motifs

Expand All @@ -79,7 +75,7 @@ namespace SeqLib {
int count; ///< Number of motifs in dictionary

};

#endif

/** Stores a rule for a single alignment flag.
*
Expand Down Expand Up @@ -374,7 +370,9 @@ class AbstractRule {
size_t m_count;

// the aho-corasick trie
#ifdef HAVE_C11
AhoCorasick aho;
#endif

// id for this rule
std::string id;
Expand Down
Loading

0 comments on commit dd5804b

Please sign in to comment.