Skip to content

fastq_syncpairs: implementation prototype

Frédéric Mahé edited this page Mar 1, 2023 · 2 revisions

Here is a C++11 draft-implementation of a potential memory-friendly fastq_syncpairs command (re-synchronize R1-R2 fastq files). This is not a priority, so I use this wiki page as an archival system.

#include <fstream>
#include <iostream>
#include <limits>
#include <unordered_map>
#include <utility>  // std::pair
#include <string>

// Summary:
// - assume entries on 4 lines, with headers ending with '/1' or '/2', no empty entries
// - for each R2 entry, record header string, begin and end,
// - for each R1 entry, if present in R2, then write to 'synced' files,
// - if not, write to 'orphan' files

// Properties:
// - C+11, no warning
// - fast enough (75 seconds for 9 million paired reads),
// - memory footprint is reduced (roughly 1/4 of the size of the R2 file),
// - works with files with more than 2 billion chars (int64),
// - R1 is read only once,
// - reads are sorted in R1's order,
// - R1 orphans are in R1 file's order, but R2 orphans are in no particular order

// const std::string R1_file {"input_R1.fastq"};
const std::string R1_file {"/tmp/Neotropical_soils_plastidial/data/2016-10-13/161007_SN234_A_L001_GYJ-7_R1.fastq"};
// const std::string R2_file {"input_R2.fastq"};
const std::string R2_file {"/tmp/Neotropical_soils_plastidial/data/2016-10-13/161007_SN234_A_L001_GYJ-7_R2.fastq"};

using Map = std::unordered_map<std::string, std::pair<long int, long int>>;


auto parse_R2 () -> Map {
  std::cout << "parse R2" << std::endl;
  std::ifstream R2_entries {R2_file};
  std::string header;
  constexpr auto unlimited_length { std::numeric_limits<std::streamsize>::max() };

  Map R2_map = {};
  auto start {R2_entries.tellg()};
  while (std::getline(R2_entries, header))
    {
      R2_entries.ignore(unlimited_length, '\n'); // sequence string
      R2_entries.ignore(unlimited_length, '\n'); // quality header
      R2_entries.ignore(unlimited_length, '\n'); // quality string
      const auto end {R2_entries.tellg()};
      R2_map[header] = {start, end - start};
      start = end;
    }

  return R2_map;
}


auto longest_R2 (const Map& R2_map) -> long unsigned int {
  std::cout << "search longest R2 entry" << std::endl;
  auto max_length {0L};

  for (auto const& entry : R2_map) {
    const auto offset { entry.second.second };
    if (offset > max_length) {
      max_length = offset;
    }
  }

  return static_cast<long unsigned int>(max_length);
}


auto fetch_R2 (const long int start,
               const long int offset,
               std::string & buffer,
               std::ifstream& R2_entries,
               std::ofstream& synced_R2) -> void {

  R2_entries.seekg(start);
  R2_entries.read(&buffer[0], offset);
  synced_R2.write(&buffer[0], offset);
  buffer.clear();
}


auto read_R1 (Map& R2_map, std::string & buffer) -> void {
  std::cout << "read R1" << std::endl;
  std::ifstream R1_entries {R1_file};
  std::ifstream R2_entries {R2_file};
  std::ofstream synced_R1 {"synced_R1.fastq"};
  std::ofstream orphan_R1 {"orphan_R1.fastq"};
  std::ofstream synced_R2 {"synced_R2.fastq"};
  std::string line;
  std::string header;

  while (std::getline(R1_entries, line)) {
    header = line;
    header.replace(header.length() - 1, 1, "2");
    const auto match { R2_map.find(header) }; // C++20: map.contains()
    if (match != R2_map.end()) {
      const auto start {match->second.first};
      const auto offset {match->second.second};
      fetch_R2(start, offset, buffer, R2_entries, synced_R2);
      R2_map.erase(match);
      synced_R1 << line << '\n';
      std::getline(R1_entries, line);
      synced_R1 << line << '\n';
      std::getline(R1_entries, line);
      synced_R1 << line << '\n';
      std::getline(R1_entries, line);
      synced_R1 << line << '\n';
    } else {
      orphan_R1 << line << '\n';
      std::getline(R1_entries, line);
      orphan_R1 << line << '\n';
      std::getline(R1_entries, line);
      orphan_R1 << line << '\n';
      std::getline(R1_entries, line);
      orphan_R1 << line << '\n';
    }
  }
}


auto flush_R2 (const Map& R2_map, std::string & buffer) -> void {
  std::cout << "ouput orphan R2 entries" << std::endl;
  std::ifstream R2_entries {R2_file};
  std::ofstream orphan_R2 {"orphan_R2.fastq"};

  for (auto const& entry : R2_map) {
    const auto start {entry.second.first};
    const auto offset {entry.second.second};
    R2_entries.seekg(start);
    R2_entries.read(&buffer[0], offset);
    orphan_R2.write(&buffer[0], offset);
  }
}


auto main() -> int {
  // printf is not used
  std::ios_base::sync_with_stdio(false);

  // map R2 entries
  Map R2_map = parse_R2();

  // find max entry length
  const auto max_length { longest_R2(R2_map) };
  std::string buffer (max_length, '\0');

  // read R1
  read_R1(R2_map, buffer);

  // deal with remaining R2 entries
  flush_R2(R2_map, buffer);

  return 0;
}