Skip to content

Commit

Permalink
Fix for 2 bugs in the fastq reader:
Browse files Browse the repository at this point in the history
- \r\n handling may read a byte past the end of the buffer, causing the parser to fail
- checking end-of-file condition can only be reliably done
  after a call to getLine() returns NULL. One particular case
  is that some gzip files contain empty gzip blocks at the end
  of the file, which can't be predicted by the current eof() code
  Tested with files provided in issue OpenGene#491.

This reverts commit 0ee1b3b, "fix a regression bug of FASTQ reader"
  • Loading branch information
wdu committed Jun 15, 2023
1 parent 7784d04 commit d940c2e
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 14 deletions.
49 changes: 35 additions & 14 deletions src/fastqreader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ SOFTWARE.

FastqReader::FastqReader(string filename, bool hasQuality, bool phred64){
mFilename = filename;
mSkipNewline = false;
mZipped = false;
mFile = NULL;
mStdinMode = false;
Expand Down Expand Up @@ -223,6 +224,15 @@ void FastqReader::getLine(string* line){
int end = start;

while(end < mBufDataLen) {
// May still need to skip \n from a \r\n pair
if (mSkipNewline) {
mSkipNewline = false;
if (mFastqBuf[end] == '\n') {
start++;
end++;
continue;
}
}
if(mFastqBuf[end] != '\r' && mFastqBuf[end] != '\n')
end++;
else
Expand All @@ -236,9 +246,9 @@ void FastqReader::getLine(string* line){

// skip \n or \r
end++;
// handle \r\n
if(end < mBufDataLen-1 && mFastqBuf[end-1]=='\r' && mFastqBuf[end] == '\n')
end++;
// handle \r\n - not now because we may be at end of buffer
if(mFastqBuf[end-1]=='\r')
mSkipNewline = true;

mBufUsedLen = end;

Expand All @@ -252,7 +262,18 @@ void FastqReader::getLine(string* line){
readToBuf();
start = 0;
end = 0;


while(end < mBufDataLen) {
// May still need to skip \n from a \r\n pair
if (mSkipNewline) {
mSkipNewline = false;
if (mFastqBuf[end] == '\n') {
start++;
end++;
continue;
}
}
if(mFastqBuf[end] != '\r' && mFastqBuf[end] != '\n')
end++;
else
Expand All @@ -265,9 +286,9 @@ void FastqReader::getLine(string* line){

// skip \n or \r
end++;
// handle \r\n
if(end < mBufDataLen-1 && mFastqBuf[end] == '\n')
end++;
// handle \r\n - not now because we may be at end of buffer
if(mFastqBuf[end-1] == '\r')
mSkipNewline = true;

mBufUsedLen = end;
return;
Expand All @@ -280,10 +301,6 @@ void FastqReader::getLine(string* line){
}

Read* FastqReader::read(){
if(mBufUsedLen >= mBufDataLen && bufferFinished()) {
return NULL;
}

string* name;
string* sequence;
string* strand;
Expand All @@ -306,12 +323,16 @@ Read* FastqReader::read(){
}

getLine(name);
if(name->empty() && mBufUsedLen >= mBufDataLen && bufferFinished()) {
// EOF is triggered only after reading past end of file; that
// can happen at the start of a new read.
return NULL;
}
// name should start with @
while((name->empty() && !(mBufUsedLen >= mBufDataLen && bufferFinished())) || (!name->empty() && (*name)[0]!='@')){
getLine(name);
if (name->empty() || (*name)[0]!='@') {
cerr << *name << endl;
error_exit("Read name line should start with '@'");
}
if(name->empty())
return NULL;

getLine(sequence);
getLine(strand);
Expand Down
1 change: 1 addition & 0 deletions src/fastqreader.h
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ class FastqReader{

private:
string mFilename;
bool mSkipNewline;
struct isal_gzip_header mGzipHeader;
struct inflate_state mGzipState;
unsigned char *mGzipInputBuffer;
Expand Down

0 comments on commit d940c2e

Please sign in to comment.