From 63f1b51d9b4935a0db92ec7a3cd5b41625f7143d Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Wed, 28 Oct 2020 10:45:47 -0700 Subject: [PATCH] Pre-index non-alt paths to fix #3054 --- src/graph_synchronizer.cpp | 19 ++++++++++++++++--- src/variant_adder.cpp | 12 +++++++----- src/vg.cpp | 4 ++-- test/add/multi.json | 24 ++++++++++++++++++++++++ test/add/multi.vcf | 15 +++++++++++++++ test/t/31_vg_add.t | 4 +++- 6 files changed, 67 insertions(+), 11 deletions(-) create mode 100644 test/add/multi.json create mode 100644 test/add/multi.vcf diff --git a/src/graph_synchronizer.cpp b/src/graph_synchronizer.cpp index 605d03b2a4f..e427aecc91c 100644 --- a/src/graph_synchronizer.cpp +++ b/src/graph_synchronizer.cpp @@ -8,7 +8,17 @@ namespace vg { using namespace std; GraphSynchronizer::GraphSynchronizer(VG& graph) : graph(graph) { - // Nothing to do! + // Because in general paths can overlap each other, and because we can't + // build a path index after a path has been modified (since we don't keep + // the ranks up to date internally), we need to build all the indexes up + // front, even if we're just working on a single path. + graph.for_each_path_handle([&](const path_handle_t& path) { + string name = graph.get_path_name(path); + if (!Paths::is_alt(name)) { + // We only care about reference paths. + get_path_index(name); + } + }); } void GraphSynchronizer::with_path_index(const string& path_name, const function& to_run) { @@ -29,7 +39,10 @@ const string& GraphSynchronizer::get_path_sequence(const string& path_name) { // We need a function to grab the index for a path PathIndex& GraphSynchronizer::get_path_index(const string& path_name) { - + + // We don't work on alt paths; there could be too many to pre-index. + assert(!Paths::is_alt(path_name)); + if (!indexes.count(path_name)) { // Not already made. Generate it. indexes.emplace(piecewise_construct, @@ -115,7 +128,7 @@ void GraphSynchronizer::Lock::lock() { cerr << endl; } #endif - + // Make them into pos_ts that point left to right, the way Jordan thinks. pos_t left_pos = make_pos_t(start_left.node, start_left.is_end, 0); pos_t right_pos = make_pos_t(end_right.node, !end_right.is_end, diff --git a/src/variant_adder.cpp b/src/variant_adder.cpp index e3175d565c3..e6cb84b75e2 100644 --- a/src/variant_adder.cpp +++ b/src/variant_adder.cpp @@ -8,7 +8,13 @@ namespace vg { using namespace std; using namespace vg::io; -VariantAdder::VariantAdder(VG& graph) : graph(graph), sync(graph) { +VariantAdder::VariantAdder(VG& graph) : graph(graph), sync([&](VG& g) -> VG& { + // Dice nodes in the graph for GCSA indexing *before* constructing the synchronizer. + g.dice_nodes(max_node_size); + return g; + }(this->graph)) { + + graph.paths.for_each_name([&](const string& name) { // Save the names of all the graph paths, so we don't need to lock the // graph to check them. @@ -18,10 +24,6 @@ VariantAdder::VariantAdder(VG& graph) : graph(graph), sync(graph) { // Show progress if the graph does. show_progress = graph.show_progress; - // Make sure to dice nodes to 1024 or smaller, the max size that GCSA2 - // supports, in case we need to GCSA-index part of the graph. - graph.dice_nodes(max_node_size); - // Configure the aligner to use a full length bonus aligner.full_length_bonus = 5; } diff --git a/src/vg.cpp b/src/vg.cpp index fb13b519d3e..f03f630b0be 100644 --- a/src/vg.cpp +++ b/src/vg.cpp @@ -4004,7 +4004,7 @@ void VG::divide_node(Node* node, vector& positions, vector& parts) { #ifdef debug_divide #pragma omp critical (cerr) - cerr << omp_get_thread_num() << ": dividing mapping " << pb2json(*m) << endl; + cerr << omp_get_thread_num() << ": dividing mapping " << *m << endl; #endif string path_name = paths.mapping_path_name(m); @@ -4077,7 +4077,7 @@ void VG::divide_node(Node* node, vector& positions, vector& parts) { #pragma omp critical (cerr) cerr << omp_get_thread_num() << ": produced mappings:" << endl; for(auto mapping : mapping_parts) { - cerr << "\t" << pb2json(mapping) << endl; + cerr << "\t" << mapping << endl; } #endif } diff --git a/test/add/multi.json b/test/add/multi.json new file mode 100644 index 00000000000..c46d98a0568 --- /dev/null +++ b/test/add/multi.json @@ -0,0 +1,24 @@ +{ + "node": [ + {"id": 1, "sequence": "CTTAAAATGATCGGGACTTTTCAAATCTTATTT"} + ], + "edge": [ + ], + "path": [ + {"name": "ref", "mapping": [ + {"rank": 1, "edit": [ + {"from_length": 33, "to_length": 33} + ], "position": {"node_id": 1, "offset": 0, "is_reverse": true}} + ]}, + {"name": "ref2", "mapping": [ + {"rank": 1, "edit": [ + {"from_length": 33, "to_length": 33} + ], "position": {"node_id": 1, "offset": 0}} + ]}, + {"name": "ref3", "mapping": [ + {"rank": 1, "edit": [ + {"from_length": 33, "to_length": 33} + ], "position": {"node_id": 1, "offset": 0, "is_reverse": true}} + ]} + ] +} diff --git a/test/add/multi.vcf b/test/add/multi.vcf new file mode 100644 index 00000000000..803a9c138dc --- /dev/null +++ b/test/add/multi.vcf @@ -0,0 +1,15 @@ +##fileformat=VCFv4.0 +##fileDate=20090805 +##source=myImputationProgramV3.1 +##reference=1000GenomesPilot-NCBI36 +##phasing=partial +##FILTER= +##FILTER= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE1 SAMPLE2 SAMPLE3 SAMPLE4 +ref 18 . TC T 100 PASS . GT 1/0 0/0 0|0 ././1 +ref 21 . CGA GAC 100 PASS . GT 0/1 0/0 ./1 ./1/. +ref 23 . A AC 100 PASS . GT 0/0 1/0 . ./0 +ref3 18 . TC T 100 PASS . GT 1/0 0/0 0|0 ././1 +ref3 21 . CGA GAC 100 PASS . GT 0/1 0/0 ./1 ./1/. +ref3 23 . A AC 100 PASS . GT 0/0 1/0 . ./0 diff --git a/test/t/31_vg_add.t b/test/t/31_vg_add.t index 40a1bb66c05..e0bc23ae1d5 100644 --- a/test/t/31_vg_add.t +++ b/test/t/31_vg_add.t @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 11 +plan tests 12 vg construct -r add/ref.fa > ref.vg vg add -v add/benedict.vcf ref.vg > benedict.vg @@ -44,6 +44,8 @@ is "$(vg view -c x.vg | jq -c '.path[].mapping[] | select(.rank | not)' | wc -l) is "$(vg view -Jv add/backward.json | vg add -v add/benedict.vcf - | vg mod --unchop - | vg stats -N -)" "5" "graphs with backward nodes can be added to" +is "$(vg view -Jv add/multi.json | vg add -v add/multi.vcf - | vg mod --unchop - | vg stats -N -)" "5" "graphs with multiple overlapping paths nodes can be added to" + is "$(vg view -Jv add/backward_and_forward.json | vg add -v add/benedict.vcf - | vg mod --unchop - | vg stats -N -)" "5" "graphs with backward and forward nodes can be added to" rm -rf ref.vg ref.pg benedict.vg benedict2.vg benedict3.vg x-ref.vg x.vg refN.vg no-n.vg with-n.vg ngap.vg ngap-add.vg