diff --git a/src/index_registry.cpp b/src/index_registry.cpp index e2ef3499a8..a09d34c452 100644 --- a/src/index_registry.cpp +++ b/src/index_registry.cpp @@ -15,6 +15,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -77,6 +80,13 @@ int IndexingParameters::gbwt_sampling_interval = gbwt::DynamicGBWT::SAMPLE_INTER bool IndexingParameters::bidirectional_haplo_tx_gbwt = false; string IndexingParameters::gff_feature_name = "exon"; string IndexingParameters::gff_transcript_tag = "transcript_id"; +bool IndexingParameters::use_bounded_syncmers = false; +int IndexingParameters::minimizer_k = 29; +int IndexingParameters::minimizer_w = 11; +int IndexingParameters::minimizer_s = 18; +int IndexingParameters::path_cover_depth = gbwtgraph::PATH_COVER_DEFAULT_N; +int IndexingParameters::giraffe_gbwt_downsample = gbwtgraph::LOCAL_HAPLOTYPES_DEFAULT_N; +int IndexingParameters::downsample_context_length = gbwtgraph::PATH_COVER_DEFAULT_K; bool IndexingParameters::verbose = false; // return file size in bytes @@ -209,6 +219,7 @@ vector vcf_contigs(const string& filename) { contigs.emplace(chrom); } + bcf_destroy(bcf_record); bcf_hdr_destroy(header); hts_close(vcf); @@ -265,6 +276,7 @@ IndexRegistry VGIndexes::get_vg_index_registry() { registry.register_index("GBWT", "gbwt"); registry.register_index("Spliced GBWT", "spliced.gbwt"); registry.register_index("Haplotype-Transcript GBWT", "haplotx.gbwt"); + registry.register_index("Giraffe GBWT", "giraffe.gbwt"); registry.register_index("Snarls", "snarls"); registry.register_index("Spliced Snarls", "spliced.snarls"); @@ -272,6 +284,10 @@ IndexRegistry VGIndexes::get_vg_index_registry() { registry.register_index("Distance Index", "dist"); registry.register_index("Spliced Distance Index", "spliced.dist"); + registry.register_index("GBWTGraph", "gg"); + + registry.register_index("Minimizers", "min"); + /********************* * A few handy lambda functions @@ -322,9 +338,6 @@ IndexRegistry VGIndexes::get_vg_index_registry() { return graph; }; - - - /********************* * Register all recipes ***********************/ @@ -1143,6 +1156,96 @@ IndexRegistry VGIndexes::get_vg_index_registry() { return make_gbwt(inputs, plan, constructing); }); + // Giraffe will prefer to use a downsampled haplotype GBWT if possible + registry.register_recipe({"Giraffe GBWT"}, {"GBWT", "XG"}, + [&](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + if (IndexingParameters::verbose) { + cerr << "[IndexRegistry]: Downsampling full GBWT." << endl; + } + + assert(inputs.size() == 2); + auto gbwt_filenames = inputs[0]->get_filenames(); + auto xg_filenames = inputs[1]->get_filenames(); + assert(gbwt_filenames.size() == 1); + assert(xg_filenames.size() == 1); + auto gbwt_filename = gbwt_filenames.front(); + auto xg_filename = xg_filenames.front(); + + assert(constructing.size() == 1); + vector> all_outputs(constructing.size()); + auto& output_names = all_outputs[0]; + auto sampled_gbwt_output = *constructing.begin(); + + ifstream infile_xg, infile_gbwt; + init_in(infile_xg, xg_filename); + init_in(infile_gbwt, gbwt_filename); + + auto output_name = plan->output_filepath(sampled_gbwt_output); + ofstream outfile_sampled_gbwt; + init_out(outfile_sampled_gbwt, output_name); + + auto xg_index = vg::io::VPKG::load_one(infile_xg); + auto gbwt_index = vg::io::VPKG::load_one(infile_gbwt); + + // compute a downsampled local haplotype cover (with path cover over missing components) + gbwt::GBWT cover = gbwtgraph::local_haplotypes(*xg_index, *gbwt_index, + IndexingParameters::giraffe_gbwt_downsample, + IndexingParameters::downsample_context_length, + 200 * gbwt::MILLION, // buffer size + IndexingParameters::gbwt_sampling_interval, + IndexingParameters::verbose); + + vg::io::VPKG::save(cover, output_name); + output_names.push_back(output_name); + return all_outputs; + }); + + // do a greedy haplotype cover if we don't have haplotypes + registry.register_recipe({"Giraffe GBWT"}, {"XG"}, + [&](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + + if (IndexingParameters::verbose) { + cerr << "[IndexRegistry]: Constructing a greedy path cover GBWT" << endl; + } + + assert(inputs.size() == 1); + auto xg_filenames = inputs[0]->get_filenames(); + assert(xg_filenames.size() == 1); + auto xg_filename = xg_filenames.front(); + + assert(constructing.size() == 1); + vector> all_outputs(constructing.size()); + auto& output_names = all_outputs[0]; + auto path_cover_output = *constructing.begin(); + + ifstream infile_xg; + init_in(infile_xg, xg_filename); + + auto output_name = plan->output_filepath(path_cover_output); + ofstream outfile_path_cover_gbwt; + init_out(outfile_path_cover_gbwt, output_name); + + auto xg_index = vg::io::VPKG::load_one(infile_xg); + + // make a GBWT from a greedy path cover + gbwt::GBWT cover = gbwtgraph::path_cover_gbwt(*xg_index, + IndexingParameters::path_cover_depth, + IndexingParameters::downsample_context_length, + 200 * gbwt::MILLION, // buffer size + IndexingParameters::gbwt_sampling_interval, + IndexingParameters::verbose); + + vg::io::VPKG::save(cover, output_name); + output_names.push_back(output_name); + return all_outputs; + }); + registry.register_recipe({"Haplotype-Transcript GBWT", "Spliced VG w/ Transcript Paths", "Unjoined Transcript Origin Table", }, {"GTF/GFF", "Spliced GBWT", "Spliced VG", }, [&](const vector& inputs, @@ -1810,6 +1913,111 @@ IndexRegistry VGIndexes::get_vg_index_registry() { return make_distance_index(inputs, plan, constructing); }); + //////////////////////////////////// + // GBWTGraph Recipes + //////////////////////////////////// + + registry.register_recipe({"GBWTGraph"}, {"Giraffe GBWT", "XG"}, + [&](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + if (IndexingParameters::verbose) { + cerr << "[IndexRegistry]: Constructing GBWTGraph." << endl; + } + + assert(inputs.size() == 2); + auto gbwt_filenames = inputs[0]->get_filenames(); + auto xg_filenames = inputs[1]->get_filenames(); + assert(gbwt_filenames.size() == 1); + assert(xg_filenames.size() == 1); + auto gbwt_filename = gbwt_filenames.front(); + auto xg_filename = xg_filenames.front(); + + assert(constructing.size() == 1); + vector> all_outputs(constructing.size()); + auto gbwtgraph_output = *constructing.begin(); + auto& output_names = all_outputs[0]; + + ifstream infile_gbwt, infile_xg; + init_in(infile_gbwt, gbwt_filename); + init_in(infile_xg, xg_filename); + string output_name = plan->output_filepath(gbwtgraph_output); + ofstream outfile; + init_out(outfile, output_name); + + auto gbwt_index = vg::io::VPKG::load_one(infile_gbwt); + auto xg_index = vg::io::VPKG::load_one(infile_xg); + + // TODO: could add simplification to replace XG index with a gbwt::SequenceSource here + gbwtgraph::GBWTGraph ggraph(*gbwt_index, *xg_index); + + vg::io::VPKG::save(ggraph, output_name); + + output_names.push_back(output_name); + return all_outputs; + }); + + //////////////////////////////////// + // Minimizers Recipes + //////////////////////////////////// + + registry.register_recipe({"Minimizers"}, {"Distance Index", "GBWTGraph", "Giraffe GBWT"}, + [&](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + if (IndexingParameters::verbose) { + cerr << "[IndexRegistry]: Constructing minimizer index." << endl; + } + + // TODO: should the distance index input be a joint simplification + + assert(inputs.size() == 3); + auto dist_filenames = inputs[0]->get_filenames(); + auto ggraph_filenames = inputs[1]->get_filenames(); + auto gbwt_filenames = inputs[2]->get_filenames(); + assert(dist_filenames.size() == 1); + assert(gbwt_filenames.size() == 1); + assert(ggraph_filenames.size() == 1); + auto dist_filename = dist_filenames.front(); + auto gbwt_filename = gbwt_filenames.front(); + auto ggraph_filename = ggraph_filenames.front(); + + assert(constructing.size() == 1); + vector> all_outputs(constructing.size()); + auto minimizer_output = *constructing.begin(); + auto& output_names = all_outputs[0]; + + ifstream infile_gbwt, infile_ggraph, infile_dist; + init_in(infile_dist, dist_filename); + init_in(infile_gbwt, gbwt_filename); + init_in(infile_ggraph, ggraph_filename); + string output_name = plan->output_filepath(minimizer_output); + ofstream outfile; + init_out(outfile, output_name); + + auto gbwt_index = vg::io::VPKG::load_one(infile_gbwt); + auto ggraph_index = vg::io::VPKG::load_one(infile_ggraph); + ggraph_index->set_gbwt(*gbwt_index); + auto dist_index = vg::io::VPKG::load_one(infile_dist); + + gbwtgraph::DefaultMinimizerIndex minimizers(IndexingParameters::minimizer_k, + IndexingParameters::use_bounded_syncmers ? + IndexingParameters::minimizer_s : + IndexingParameters::minimizer_w, + IndexingParameters::use_bounded_syncmers); + + gbwtgraph::index_haplotypes(*ggraph_index, minimizers, [&](const pos_t& pos) -> gbwtgraph::payload_type { + return MIPayload::encode(dist_index->get_minimizer_distances(pos)); + }); + + vg::io::VPKG::save(minimizers, output_name); + + output_names.push_back(output_name); + return all_outputs; + }); + return registry; } @@ -1836,10 +2044,10 @@ vector VGIndexes::get_default_mpmap_indexes() { vector VGIndexes::get_default_giraffe_indexes() { vector indexes{ - "GBWT", + "Giraffe GBWT", "GBWTGraph", - "Distance", - "Minimizer" + "Distance Index", + "Minimizers" }; return indexes; } diff --git a/src/index_registry.hpp b/src/index_registry.hpp index 439081001f..d847d369e0 100644 --- a/src/index_registry.hpp +++ b/src/index_registry.hpp @@ -91,6 +91,20 @@ struct IndexingParameters { static string gff_feature_name; // transcript tag in GTF/GFF ["transcript_id"] static string gff_transcript_tag; + // if true, minimizer index uses bounded syncmers, otherwise uses minimizers [false] + static bool use_bounded_syncmers; + // length of k-mer used in minimizer index [29] + static int minimizer_k; + // length of window if using minimizers [11] + static int minimizer_w; + // length of internal s-mer if using bounded syncmers [18] + static int minimizer_s; + // the number of paths that will make up the path cover GBWT [16] + static int path_cover_depth; + // the number of haplotypes to downsample to in giraffe's GBWT [64] + static int giraffe_gbwt_downsample; + // Jordan actually doesn't know what this one does [4] + static int downsample_context_length; // whether indexing algorithms will log progress (if available) [false] static bool verbose; }; diff --git a/src/subcommand/autoindex_main.cpp b/src/subcommand/autoindex_main.cpp index f22ab61a97..707178e805 100644 --- a/src/subcommand/autoindex_main.cpp +++ b/src/subcommand/autoindex_main.cpp @@ -207,8 +207,7 @@ int main_autoindex(int argc, char** argv) { else if (optarg == string("giraffe")) { for (auto& target : VGIndexes::get_default_giraffe_indexes()) { targets.emplace_back(move(target)); - } cerr << "giraffe indexing not yet implemented" << endl; - return 1; + } } else { cerr << "error: Unrecognized workflow (-w): " << optarg << endl; diff --git a/test/t/52_vg_autoindex.t b/test/t/52_vg_autoindex.t index f4eb7f44a6..7a936c1377 100644 --- a/test/t/52_vg_autoindex.t +++ b/test/t/52_vg_autoindex.t @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 18 +plan tests 21 vg autoindex -p auto -w map -r tiny/tiny.fa -v tiny/tiny.vcf.gz -t 1 --force-unphased is $(echo $?) 0 "autoindexing successfully completes indexing for vg map with basic input" @@ -58,3 +58,17 @@ vg autoindex -p auto -w mpmap -r small/xy.fa -v small/x.vcf.gz -v small/y.vcf.gz is $(echo $?) 0 "autoindexing successfully completes indexing for vg mpmap with another partially chunked input" rm auto.* + +vg autoindex -p auto -w giraffe -r tiny/tiny.fa -v tiny/tiny.vcf.gz -t 1 +is $(echo $?) 0 "autoindexing successfully completes indexing for vg giraffe with unchunked input" +is $(ls auto.* | wc -l) 4 "autoindexing creates 4 inputs for vg giraffe" +vg construct -r tiny/tiny.fa -v tiny/tiny.vcf.gz > t.vg +vg index -x t.xg t.vg +vg sim -x t.xg -n 20 -a -l 10 | vg giraffe -g auto.gg -H auto.giraffe.gbwt -m auto.min -d auto.dist -G - > /dev/null +is $(echo $?) 0 "basic autoindexing results can be used by vg giraffe" + +rm auto.* +rm t.* + + +