Skip to content

Commit

Permalink
Merge pull request #3208 from vgteam/autoindex-giraffe
Browse files Browse the repository at this point in the history
Autoindexing for giraffe
  • Loading branch information
jeizenga committed Feb 23, 2021
2 parents 03c207b + d53c5f9 commit 8b04aee
Show file tree
Hide file tree
Showing 4 changed files with 244 additions and 9 deletions.
220 changes: 214 additions & 6 deletions src/index_registry.cpp
Expand Up @@ -15,6 +15,9 @@
#include <bdsg/odgi.hpp>
#include <xg.hpp>
#include <gbwt/variants.h>
#include <gbwtgraph/index.h>
#include <gbwtgraph/gbwtgraph.h>
#include <gbwtgraph/path_cover.h>
#include <vg/io/vpkg.hpp>
#include <gcsa/gcsa.h>
#include <gcsa/algorithms.h>
Expand Down Expand Up @@ -77,6 +80,13 @@ int IndexingParameters::gbwt_sampling_interval = gbwt::DynamicGBWT::SAMPLE_INTER
bool IndexingParameters::bidirectional_haplo_tx_gbwt = false;
string IndexingParameters::gff_feature_name = "exon";
string IndexingParameters::gff_transcript_tag = "transcript_id";
bool IndexingParameters::use_bounded_syncmers = false;
int IndexingParameters::minimizer_k = 29;
int IndexingParameters::minimizer_w = 11;
int IndexingParameters::minimizer_s = 18;
int IndexingParameters::path_cover_depth = gbwtgraph::PATH_COVER_DEFAULT_N;
int IndexingParameters::giraffe_gbwt_downsample = gbwtgraph::LOCAL_HAPLOTYPES_DEFAULT_N;
int IndexingParameters::downsample_context_length = gbwtgraph::PATH_COVER_DEFAULT_K;
bool IndexingParameters::verbose = false;

// return file size in bytes
Expand Down Expand Up @@ -209,6 +219,7 @@ vector<string> vcf_contigs(const string& filename) {

contigs.emplace(chrom);
}

bcf_destroy(bcf_record);
bcf_hdr_destroy(header);
hts_close(vcf);
Expand Down Expand Up @@ -265,13 +276,18 @@ IndexRegistry VGIndexes::get_vg_index_registry() {
registry.register_index("GBWT", "gbwt");
registry.register_index("Spliced GBWT", "spliced.gbwt");
registry.register_index("Haplotype-Transcript GBWT", "haplotx.gbwt");
registry.register_index("Giraffe GBWT", "giraffe.gbwt");

registry.register_index("Snarls", "snarls");
registry.register_index("Spliced Snarls", "spliced.snarls");

registry.register_index("Distance Index", "dist");
registry.register_index("Spliced Distance Index", "spliced.dist");

registry.register_index("GBWTGraph", "gg");

registry.register_index("Minimizers", "min");


/*********************
* A few handy lambda functions
Expand Down Expand Up @@ -322,9 +338,6 @@ IndexRegistry VGIndexes::get_vg_index_registry() {
return graph;
};




/*********************
* Register all recipes
***********************/
Expand Down Expand Up @@ -1143,6 +1156,96 @@ IndexRegistry VGIndexes::get_vg_index_registry() {
return make_gbwt(inputs, plan, constructing);
});

// Giraffe will prefer to use a downsampled haplotype GBWT if possible
registry.register_recipe({"Giraffe GBWT"}, {"GBWT", "XG"},
[&](const vector<const IndexFile*>& inputs,
const IndexingPlan* plan,
AliasGraph& alias_graph,
const IndexGroup& constructing) {
if (IndexingParameters::verbose) {
cerr << "[IndexRegistry]: Downsampling full GBWT." << endl;
}

assert(inputs.size() == 2);
auto gbwt_filenames = inputs[0]->get_filenames();
auto xg_filenames = inputs[1]->get_filenames();
assert(gbwt_filenames.size() == 1);
assert(xg_filenames.size() == 1);
auto gbwt_filename = gbwt_filenames.front();
auto xg_filename = xg_filenames.front();

assert(constructing.size() == 1);
vector<vector<string>> all_outputs(constructing.size());
auto& output_names = all_outputs[0];
auto sampled_gbwt_output = *constructing.begin();

ifstream infile_xg, infile_gbwt;
init_in(infile_xg, xg_filename);
init_in(infile_gbwt, gbwt_filename);

auto output_name = plan->output_filepath(sampled_gbwt_output);
ofstream outfile_sampled_gbwt;
init_out(outfile_sampled_gbwt, output_name);

auto xg_index = vg::io::VPKG::load_one<xg::XG>(infile_xg);
auto gbwt_index = vg::io::VPKG::load_one<gbwt::GBWT>(infile_gbwt);

// compute a downsampled local haplotype cover (with path cover over missing components)
gbwt::GBWT cover = gbwtgraph::local_haplotypes(*xg_index, *gbwt_index,
IndexingParameters::giraffe_gbwt_downsample,
IndexingParameters::downsample_context_length,
200 * gbwt::MILLION, // buffer size
IndexingParameters::gbwt_sampling_interval,
IndexingParameters::verbose);

vg::io::VPKG::save(cover, output_name);
output_names.push_back(output_name);
return all_outputs;
});

// do a greedy haplotype cover if we don't have haplotypes
registry.register_recipe({"Giraffe GBWT"}, {"XG"},
[&](const vector<const IndexFile*>& inputs,
const IndexingPlan* plan,
AliasGraph& alias_graph,
const IndexGroup& constructing) {

if (IndexingParameters::verbose) {
cerr << "[IndexRegistry]: Constructing a greedy path cover GBWT" << endl;
}

assert(inputs.size() == 1);
auto xg_filenames = inputs[0]->get_filenames();
assert(xg_filenames.size() == 1);
auto xg_filename = xg_filenames.front();

assert(constructing.size() == 1);
vector<vector<string>> all_outputs(constructing.size());
auto& output_names = all_outputs[0];
auto path_cover_output = *constructing.begin();

ifstream infile_xg;
init_in(infile_xg, xg_filename);

auto output_name = plan->output_filepath(path_cover_output);
ofstream outfile_path_cover_gbwt;
init_out(outfile_path_cover_gbwt, output_name);

auto xg_index = vg::io::VPKG::load_one<xg::XG>(infile_xg);

// make a GBWT from a greedy path cover
gbwt::GBWT cover = gbwtgraph::path_cover_gbwt(*xg_index,
IndexingParameters::path_cover_depth,
IndexingParameters::downsample_context_length,
200 * gbwt::MILLION, // buffer size
IndexingParameters::gbwt_sampling_interval,
IndexingParameters::verbose);

vg::io::VPKG::save(cover, output_name);
output_names.push_back(output_name);
return all_outputs;
});

registry.register_recipe({"Haplotype-Transcript GBWT", "Spliced VG w/ Transcript Paths", "Unjoined Transcript Origin Table", },
{"GTF/GFF", "Spliced GBWT", "Spliced VG", },
[&](const vector<const IndexFile*>& inputs,
Expand Down Expand Up @@ -1810,6 +1913,111 @@ IndexRegistry VGIndexes::get_vg_index_registry() {
return make_distance_index(inputs, plan, constructing);
});

////////////////////////////////////
// GBWTGraph Recipes
////////////////////////////////////

registry.register_recipe({"GBWTGraph"}, {"Giraffe GBWT", "XG"},
[&](const vector<const IndexFile*>& inputs,
const IndexingPlan* plan,
AliasGraph& alias_graph,
const IndexGroup& constructing) {
if (IndexingParameters::verbose) {
cerr << "[IndexRegistry]: Constructing GBWTGraph." << endl;
}

assert(inputs.size() == 2);
auto gbwt_filenames = inputs[0]->get_filenames();
auto xg_filenames = inputs[1]->get_filenames();
assert(gbwt_filenames.size() == 1);
assert(xg_filenames.size() == 1);
auto gbwt_filename = gbwt_filenames.front();
auto xg_filename = xg_filenames.front();

assert(constructing.size() == 1);
vector<vector<string>> all_outputs(constructing.size());
auto gbwtgraph_output = *constructing.begin();
auto& output_names = all_outputs[0];

ifstream infile_gbwt, infile_xg;
init_in(infile_gbwt, gbwt_filename);
init_in(infile_xg, xg_filename);
string output_name = plan->output_filepath(gbwtgraph_output);
ofstream outfile;
init_out(outfile, output_name);

auto gbwt_index = vg::io::VPKG::load_one<gbwt::GBWT>(infile_gbwt);
auto xg_index = vg::io::VPKG::load_one<xg::XG>(infile_xg);

// TODO: could add simplification to replace XG index with a gbwt::SequenceSource here
gbwtgraph::GBWTGraph ggraph(*gbwt_index, *xg_index);

vg::io::VPKG::save(ggraph, output_name);

output_names.push_back(output_name);
return all_outputs;
});

////////////////////////////////////
// Minimizers Recipes
////////////////////////////////////

registry.register_recipe({"Minimizers"}, {"Distance Index", "GBWTGraph", "Giraffe GBWT"},
[&](const vector<const IndexFile*>& inputs,
const IndexingPlan* plan,
AliasGraph& alias_graph,
const IndexGroup& constructing) {
if (IndexingParameters::verbose) {
cerr << "[IndexRegistry]: Constructing minimizer index." << endl;
}

// TODO: should the distance index input be a joint simplification

assert(inputs.size() == 3);
auto dist_filenames = inputs[0]->get_filenames();
auto ggraph_filenames = inputs[1]->get_filenames();
auto gbwt_filenames = inputs[2]->get_filenames();
assert(dist_filenames.size() == 1);
assert(gbwt_filenames.size() == 1);
assert(ggraph_filenames.size() == 1);
auto dist_filename = dist_filenames.front();
auto gbwt_filename = gbwt_filenames.front();
auto ggraph_filename = ggraph_filenames.front();

assert(constructing.size() == 1);
vector<vector<string>> all_outputs(constructing.size());
auto minimizer_output = *constructing.begin();
auto& output_names = all_outputs[0];

ifstream infile_gbwt, infile_ggraph, infile_dist;
init_in(infile_dist, dist_filename);
init_in(infile_gbwt, gbwt_filename);
init_in(infile_ggraph, ggraph_filename);
string output_name = plan->output_filepath(minimizer_output);
ofstream outfile;
init_out(outfile, output_name);

auto gbwt_index = vg::io::VPKG::load_one<gbwt::GBWT>(infile_gbwt);
auto ggraph_index = vg::io::VPKG::load_one<gbwtgraph::GBWTGraph>(infile_ggraph);
ggraph_index->set_gbwt(*gbwt_index);
auto dist_index = vg::io::VPKG::load_one<MinimumDistanceIndex>(infile_dist);

gbwtgraph::DefaultMinimizerIndex minimizers(IndexingParameters::minimizer_k,
IndexingParameters::use_bounded_syncmers ?
IndexingParameters::minimizer_s :
IndexingParameters::minimizer_w,
IndexingParameters::use_bounded_syncmers);

gbwtgraph::index_haplotypes(*ggraph_index, minimizers, [&](const pos_t& pos) -> gbwtgraph::payload_type {
return MIPayload::encode(dist_index->get_minimizer_distances(pos));
});

vg::io::VPKG::save(minimizers, output_name);

output_names.push_back(output_name);
return all_outputs;
});

return registry;
}

Expand All @@ -1836,10 +2044,10 @@ vector<IndexName> VGIndexes::get_default_mpmap_indexes() {

vector<IndexName> VGIndexes::get_default_giraffe_indexes() {
vector<IndexName> indexes{
"GBWT",
"Giraffe GBWT",
"GBWTGraph",
"Distance",
"Minimizer"
"Distance Index",
"Minimizers"
};
return indexes;
}
Expand Down
14 changes: 14 additions & 0 deletions src/index_registry.hpp
Expand Up @@ -91,6 +91,20 @@ struct IndexingParameters {
static string gff_feature_name;
// transcript tag in GTF/GFF ["transcript_id"]
static string gff_transcript_tag;
// if true, minimizer index uses bounded syncmers, otherwise uses minimizers [false]
static bool use_bounded_syncmers;
// length of k-mer used in minimizer index [29]
static int minimizer_k;
// length of window if using minimizers [11]
static int minimizer_w;
// length of internal s-mer if using bounded syncmers [18]
static int minimizer_s;
// the number of paths that will make up the path cover GBWT [16]
static int path_cover_depth;
// the number of haplotypes to downsample to in giraffe's GBWT [64]
static int giraffe_gbwt_downsample;
// Jordan actually doesn't know what this one does [4]
static int downsample_context_length;
// whether indexing algorithms will log progress (if available) [false]
static bool verbose;
};
Expand Down
3 changes: 1 addition & 2 deletions src/subcommand/autoindex_main.cpp
Expand Up @@ -207,8 +207,7 @@ int main_autoindex(int argc, char** argv) {
else if (optarg == string("giraffe")) {
for (auto& target : VGIndexes::get_default_giraffe_indexes()) {
targets.emplace_back(move(target));
} cerr << "giraffe indexing not yet implemented" << endl;
return 1;
}
}
else {
cerr << "error: Unrecognized workflow (-w): " << optarg << endl;
Expand Down
16 changes: 15 additions & 1 deletion test/t/52_vg_autoindex.t
Expand Up @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap

PATH=../bin:$PATH # for vg

plan tests 18
plan tests 21

vg autoindex -p auto -w map -r tiny/tiny.fa -v tiny/tiny.vcf.gz -t 1 --force-unphased
is $(echo $?) 0 "autoindexing successfully completes indexing for vg map with basic input"
Expand Down Expand Up @@ -58,3 +58,17 @@ vg autoindex -p auto -w mpmap -r small/xy.fa -v small/x.vcf.gz -v small/y.vcf.gz
is $(echo $?) 0 "autoindexing successfully completes indexing for vg mpmap with another partially chunked input"

rm auto.*

vg autoindex -p auto -w giraffe -r tiny/tiny.fa -v tiny/tiny.vcf.gz -t 1
is $(echo $?) 0 "autoindexing successfully completes indexing for vg giraffe with unchunked input"
is $(ls auto.* | wc -l) 4 "autoindexing creates 4 inputs for vg giraffe"
vg construct -r tiny/tiny.fa -v tiny/tiny.vcf.gz > t.vg
vg index -x t.xg t.vg
vg sim -x t.xg -n 20 -a -l 10 | vg giraffe -g auto.gg -H auto.giraffe.gbwt -m auto.min -d auto.dist -G - > /dev/null
is $(echo $?) 0 "basic autoindexing results can be used by vg giraffe"

rm auto.*
rm t.*



1 comment on commit 8b04aee

@adamnovak
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

vg CI tests complete for merge to master. View the full report here.

16 tests passed, 0 tests failed and 0 tests skipped in 11671 seconds

Please sign in to comment.