Skip to content

IOs API

Teo Lemane edited this page Jul 29, 2021 · 1 revision

This page describes how to use the kmtricks API to read kmtricks files.

I. Count files

A. Read a counted k-mer file

1. Open a KmerReader

KmerReader<BUF_SIZE> reader("./km_dir/counts/partition_0/D1.kmer");

1.a Stream it

uint32_t kmer_size = reader.infos().kmer_size;
Kmer<MAX_K> kmer; 
kmer.set_k(kmer_size);
count_type count;
while (reader.read<MAX_K, MAX_C>(kmer, count))
{
  std::cout << kmer.to_string() << " " << std::to_string(count) << "\n";
}

1.b Or dump it as text

reader.write_as_text(std::cout);

B. Aggregate counted k-mer files

1. Open a KmerFileAggregator

uint32_t kmer_size = 32;
std::vector<std::string> paths {"./km_dir/counts/partition_0/D1.kmer", "./km_dir/counts/partition_1/D1.kmer"};
KmerFileAggregator<MAX_K, MAX_C> agg(paths, kmer_size);

1.a Dump files in a single text file

agg.write_as_text(std::cout);

1.b Or dump files in a single binary file

bool lz4_compress = true;
agg.write_as_bin("./D1.agg.kmer", lz4_compress); 

C. Merge counted k-mer files (produces a single sorted k-mer file)

1. Open a KmerFileMerger

uint32_t kmer_size = 32;
std::vector<std::string> paths {"./km_dir/counts/partition_0/D1.kmer", "./km_dir/counts/partition_1/D1.kmer"};
KmerFileMerger<MAX_K, MAX_C> merger(paths, kmer_size);

1.a Iterate on sorted k-mers

while (merger.next())
{
   const Kmer<MAX_K>& kmer = merger.current();
   count_type count = merger.count();
   std::cout << kmer.to_string() << " " << std::to_string(count) << "\n";
}

1.b Or dump them in a single sorted text file

merger.write_as_text(std::cout);

1.c Or dump them in a single sorted binary file

bool lz4_compress = true;
merger.write_as_bin("./D1.agg.kmer", lz4_compress); 

II. Count Matrix files

A. Read count matrix file

1. Open a MatrixReader

MatrixReader<BUF_SIZE> reader("./km_dir/counts/matrices/matrix_0.count");

1.a Stream it

uint32_t kmer_size = reader.infos().kmer_size;
Kmer<MAX_K> kmer; 
kmer.set_k(kmer_size);
std::vector<count_type> counts(reader.infos().nb_counts); // count order follows the sample order in the input fof
while (reader.read<MAX_K, MAX_C>(kmer, counts))
{
  std::cout << kmer.to_string() << " ";
  for (auto& c : counts)
    std::cout << std::to_string(c) << " ";
  std::cout << "\n";
}

1.b Or dump it as text

reader.write_as_text(std::cout);

B. Aggregate count matrix files

1. Open a MatrixFileAggregator

std::vector<std::string> paths {"./km_dir/counts/matrices/matrix_0.count", "./km_dir/counts/matrices/matrix_1.count"};
MatrixFileAggregator<MAX_K, MAX_C> mfa(paths, kmer_size);

1.a Dump files in a single text file

mfa.write_as_text(std::cout);

1.b Dump files in a single binary file

bool lz4_compress = true;
mfa.write_as_bin("./matrix.count", lz4_compress); 

C. Merge count matrix files (produces a single sorted count matrix file)

1. Open a MatrixFileMerger

std::vector<std::string> paths {"./km_dir/counts/matrices/matrix_0.count", "./km_dir/counts/matrices/matrix_1.count"};
MatrixFileMerger<MAX_K, MAX_C> mfm(paths, kmer_size);

1.a Stream sorted k-mers and counts

while (mfm.next()) 
{
   const Kmer<MAX_K>& kmer = mfm.current();
   const std::vector<count_type>& counts = mfm.counts();
   std::cout << kmer.to_string() << " ";
   for (auto& c : counts)
     std::cout << std::to_string(c) << " ";
   std::cout << "\n";
}

1.b Or dump them in a single sorted file

mfm.write_as_text(std::cout);

1.c Or dump them in a single binary file

bool lz4_compress = true;
mfm.write_as_bin("./matrix.sorted.count", lz4_compress); 

III. Presence/absence Matrix files

A. Read a presence/absence matrix file

1. Open a PAMatrixReader

PAMatrixReader<BUF_SIZE> reader("./km_dir/counts/matrices/matrix_0.pa");

1.a Stream it

uint32_t kmer_size = reader.infos().kmer_size;
Kmer<MAX_K> kmer; 
kmer.set_k(kmer_size);
std::vector<uint8_t> bits(NBYTES(reader.infos().bits));
int sample_id = 10; // sample at line 10 in the input fof
while (reader.read<MAX_K>(kmer, bits))
{
  std::cout << kmer.to_string() << " ";
  if (BITCHECK(bits, sample_id))
    std::cout << "Found in sample ";
  else
    std::cout << "Not found in sample ";
  std::cout << sample_id << "\n";
}

1.b Or dump it as text

reader.write_as_text(std::cout);

B. Aggregate presence/absence matrix file

1. Open a PAMatrixFileAggregator

std::vector<std::string> paths {"./km_dir/counts/matrices/matrix_0.pa", "./km_dir/counts/matrices/matrix_1.pa"};
PAMatrixFileAggregator<MAX_K, MAX_C> pmfa(paths, kmer_size);

1.a Dump files in a single text file

pmfa.write_as_text(std::cout);

1.b Dump files in a single binary file

bool lz4_compress = true;
pmfa.write_as_bin("./matrix.pa", lz4_compress); 

C. Merge presence/absence matrix files (produces a single sorted presence/absence matrix file)

1. Open a PAMatrixFileMerger

std::vector<std::string> paths {"./km_dir/counts/matrices/matrix_0.pa", "./km_dir/counts/matrices/matrix_1.pa"};
PAMatrixFileMerger<MAX_K> pmfm(paths, kmer_size);

1.a Stream k-mers and p/a bit-vectors

int sample_id = 10; // sample at line 10 in the input fof
while (pmfm.next()) 
{
   const Kmer<MAX_K>& kmer = pmfm.current();
   const std::vector<uint8_t>& bits = pmfm.bits();
   std::cout << kmer.to_string() << " ";
   if (BITCHECK(bits, sample_id))
     std::cout << "Found in sample ";
   else
     std::cout << "Not found in sample ";
   std::cout << sample_id << "\n";
}

1.b Or dump them in a single sorted text file

pmfm.write_as_text(std::cout);

1.c Or dump them in a single sorted binary file

bool lz4_compress = true;
pmfm.write_as_bin("./matrix.sorted.pa", lz4_compress); 

Work with hashes

The API for hash files is the same, but it uses uint64_t instead of Kmer<MAX_K>. Mergers are not provided for hashes because each partition has its own hash space, and these hash spaces are consecutive across partitions. To obtain truly sorted output, partitions just need to be aggregated in the right order.

HashReader<BUF_SIZE> reader(const std::string& path);
HashFileAggregator<MAX_C> hfa(const std::vector<std::string>& paths);
HashMatrixReader<BUF_SIZE> reader(const std::string& path);
HashMatrixFileAggregator<MAX_C> hmfa(const std::vector<std::string>& paths);
PAHashMatrixReader<BUF_SIZE> reader(const std::string& path);
PAHashMatrixFileAggregator<MAX_C> phmfa(const std::vector<std::string>& paths);
Clone this wiki locally