Merge remote-tracking branch 'upstream/master' into rna-gbwt-anno

vgteam · Sep 20, 2021 · 2db683d · 2db683d · adamnovak · Sep 20, 2021
2 parents 2929d90 + 9e02cca
commit 2db683d
Show file tree

Hide file tree

Showing 46 changed files with 2,053 additions and 627 deletions.
diff --git a/Makefile b/Makefile
@@ -674,7 +674,7 @@ $(LIB_DIR)/libfml.a: $(FERMI_DIR)/*.h $(FERMI_DIR)/*.c
 $(LIB_DIR)/libsublinearLS.a: $(LINLS_DIR)/src/*.cpp $(LINLS_DIR)/src/*.hpp $(LIB_DIR)/libhts.a
 	. ./source_me.sh && cd $(LINLS_DIR) && $(MAKE) clean && INCLUDE_FLAGS="-I$(CWD)/$(INC_DIR)" $(MAKE) libs $(FILTER) && cp lib/libsublinearLS.a $(CWD)/$(LIB_DIR)/ && mkdir -p $(CWD)/$(INC_DIR)/sublinearLS && cp src/*.hpp $(CWD)/$(INC_DIR)/sublinearLS/
 
-$(LIB_DIR)/libbdsg.a: $(INC_DIR)/BooPHF.h $(LIBBDSG_DIR)/Makefile $(LIBBDSG_DIR)/bdsg/src/*.cpp $(LIBBDSG_DIR)/bdsg/include/bdsg/*.hpp $(LIBBDSG_DIR)/bdsg/include/bdsg/internal/*.hpp $(LIBBDSG_DIR)/bdsg/include/bdsg/overlays/*.hpp $(LIB_DIR)/libhandlegraph.a $(LIB_DIR)/libsdsl.a $(LIB_DIR)/libdivsufsort.a $(LIB_DIR)/libdivsufsort64.a $(INC_DIR)/sparsepp/spp.h $(INC_DIR)/dynamic/dynamic.hpp
+$(LIB_DIR)/libbdsg.a: $(INC_DIR)/BooPHF.h $(LIBBDSG_DIR)/Makefile $(LIBBDSG_DIR)/bdsg/src/*.cpp $(LIBBDSG_DIR)/bdsg/include/bdsg/*.hpp $(LIBBDSG_DIR)/bdsg/include/bdsg/internal/*.hpp $(LIBBDSG_DIR)/bdsg/include/bdsg/overlays/*.hpp $(LIB_DIR)/libhandlegraph.a $(LIB_DIR)/libsdsl.a $(LIB_DIR)/libdivsufsort.a $(LIB_DIR)/libdivsufsort64.a $(INC_DIR)/sparsepp/spp.h $(INC_DIR)/dynamic/dynamic.hpp $(INC_DIR)/mio/mmap.hpp
 	+. ./source_me.sh && rm -Rf $(CWD)/$(INC_DIR)/bdsg && cd $(LIBBDSG_DIR) && $(MAKE) clean && CPLUS_INCLUDE_PATH=$(CWD)/$(INC_DIR):$(CWD)/$(INC_DIR)/dynamic:$(CPLUS_INCLUDE_PATH) CXXFLAGS="$(INCLUDE_FLAGS) $(CXXFLAGS)" $(MAKE) $(FILTER) && cp lib/libbdsg.a $(CWD)/$(LIB_DIR) && cp -r bdsg/include/* $(CWD)/$(INC_DIR)
 
 $(INC_DIR)/mio/mmap.hpp: $(MIO_DIR)/include/mio/*

diff --git a/README.md b/README.md
@@ -241,8 +241,10 @@ Call only variants that are present in the graph (use `-g`):
 
 <!-- !test check Pack and call -->
 ```sh
-# Compute the read support from the gam (ignoring mapping and base qualitiy < 5)
-vg pack -x x.xg -g aln.gam -Q 5 -o aln.pack
+# Compute the read support from the gam
+# -Q 5: ignore mapping and base qualitiy < 5
+# -s 5: ignore first and last 5bp from each read
+vg pack -x x.xg -g aln.gam -Q 5 -s 5 -o aln.pack
 
 # Generate a VCF from the support.  
 vg call x.xg -k aln.pack > graph_calls.vcf
@@ -261,8 +263,8 @@ In order to also consider *novel* variants from the reads, use the augmented gra
 # Index our augmented graph
 vg index aug.vg -x aug.xg
 
-# Compute the read support from the augmented gam (with ignoring qualitiy < 5)
-vg pack -x aug.xg -g aug.gam -Q 5 -o aln_aug.pack
+# Compute the read support from the augmented gam (ignoring qualitiy < 5, and 1st and last 5bp of each read)
+vg pack -x aug.xg -g aug.gam -Q 5 -s 5 -o aln_aug.pack
 
 # Generate a VCF from the support
 vg call aug.xg -k aln_aug.pack > calls.vcf

diff --git a/deps/libvgio b/deps/libvgio
diff --git a/src/aligner.cpp b/src/aligner.cpp
@@ -443,21 +443,34 @@ int32_t GSSWAligner::score_gap(size_t gap_length) const {
     return gap_length ? -gap_open - (gap_length - 1) * gap_extension : 0;
 }
 
+double GSSWAligner::first_mapping_quality_exact(const vector<double>& scaled_scores,
+                                                const vector<double>* multiplicities) {
+    return maximum_mapping_quality_exact(scaled_scores, nullptr, multiplicities);
+}
+
+double GSSWAligner::first_mapping_quality_approx(const vector<double>& scaled_scores,
+                                                 const vector<double>* multiplicities) {
+    return maximum_mapping_quality_approx(scaled_scores, nullptr, multiplicities);
+}
+
 double GSSWAligner::maximum_mapping_quality_exact(const vector<double>& scaled_scores, size_t* max_idx_out,
                                                   const vector<double>* multiplicities) {
+
+    // TODO: this isn't very well-named now that it also supports computing non-maximum
+    // mapping qualities
 
     // work in log transformed values to avoid risk of overflow
     double log_sum_exp = numeric_limits<double>::lowest();
-    double max_score = numeric_limits<double>::lowest();
+    double to_score = numeric_limits<double>::lowest();
 
     // go in reverse order because this has fewer numerical problems when the scores are sorted (as usual)
     for (int64_t i = scaled_scores.size() - 1; i >= 0; i--) {
         // get the value of one copy of the score and check if it's the max
         double score = scaled_scores.at(i);
-        if (score >= max_score) {
+        if (max_idx_out && score >= to_score) {
             // Since we are going in reverse order, make sure to break ties in favor of the earlier item.
             *max_idx_out = i;
-            max_score = score;
+            to_score = score;
         }
 
         // add all copies of the score
@@ -479,7 +492,11 @@ double GSSWAligner::maximum_mapping_quality_exact(const vector<double>& scaled_s
         }
     }
 
-    double direct_mapq = -quality_scale_factor * subtract_log(0.0, max_score - log_sum_exp);
+    if (!max_idx_out) {
+        to_score = scaled_scores.empty() ? 0.0 : scaled_scores.front();
+    }
+
+    double direct_mapq = -quality_scale_factor * subtract_log(0.0, to_score - log_sum_exp);
     return std::isinf(direct_mapq) ? (double) numeric_limits<int32_t>::max() : direct_mapq;
 }
 
@@ -522,6 +539,9 @@ double GSSWAligner::maximum_mapping_quality_approx(const vector<double>& scaled_
                                                    const vector<double>* multiplicities) {
     assert(!scaled_scores.empty());
 
+    // TODO: this isn't very well-named now that it also supports computing non-maximum
+    // mapping qualities
+
     // determine the maximum score and the count of the next highest score
     double max_score = scaled_scores.at(0);
     size_t max_idx = 0;
@@ -571,9 +591,19 @@ double GSSWAligner::maximum_mapping_quality_approx(const vector<double>& scaled_
     }
 
     // record the index of the highest score
-    *max_idx_out = max_idx;
-
-    return max(0.0, quality_scale_factor * (max_score - next_score - (next_count > 1.0 ? log(next_count) : 0.0)));
+    if (max_idx_out) {
+        *max_idx_out = max_idx;
+    }
+    if (max_idx_out || max_idx == 0) {
+        // we're either returning the mapping quality of whichever was the best, or we're
+        // returning the mapping quality of the first, which also is the best
+        return max(0.0, quality_scale_factor * (max_score - next_score - (next_count > 1.0 ? log(next_count) : 0.0)));
+    }
+    else {
+        // we're returning the mapping quality of the first, which is not the best. the approximation
+        // gets complicated here, so lets just fall back on the exact computation
+        return maximum_mapping_quality_exact(scaled_scores, nullptr, multiplicities);
+    }
 }
 
 double GSSWAligner::group_mapping_quality_exact(const vector<double>& scaled_scores, const vector<size_t>& group,
@@ -696,8 +726,8 @@ void GSSWAligner::compute_mapping_quality(vector<Alignment>& alignments,
     }
 }
 
-int32_t GSSWAligner::compute_mapping_quality(const vector<double>& scores, bool fast_approximation,
-                                             const vector<double>* multiplicities) const {
+int32_t GSSWAligner::compute_max_mapping_quality(const vector<double>& scores, bool fast_approximation,
+                                                 const vector<double>* multiplicities) const {
 
     vector<double> scaled_scores(scores.size());
     for (size_t i = 0; i < scores.size(); i++) {
@@ -708,6 +738,16 @@ int32_t GSSWAligner::compute_mapping_quality(const vector<double>& scores, bool
                                          : maximum_mapping_quality_exact(scaled_scores, &idx, multiplicities));
 }
 
+int32_t GSSWAligner::compute_first_mapping_quality(const vector<double>& scores, bool fast_approximation,
+                                                   const vector<double>* multiplicities) const {
+    vector<double> scaled_scores(scores.size());
+    for (size_t i = 0; i < scores.size(); i++) {
+        scaled_scores[i] = log_base * scores[i];
+    }
+    return (int32_t) (fast_approximation ? first_mapping_quality_approx(scaled_scores, multiplicities)
+                                         : first_mapping_quality_exact(scaled_scores, multiplicities));
+}
+
 int32_t GSSWAligner::compute_group_mapping_quality(const vector<double>& scores, const vector<size_t>& group,
                                                    const vector<double>* multiplicities) const {
 

diff --git a/src/aligner.hpp b/src/aligner.hpp
@@ -98,6 +98,15 @@ namespace vg {
         /// the mapping quality is always calculated as if its multiplicity is 1.
         static double maximum_mapping_quality_approx(const vector<double>& scaled_scores, size_t* max_idx_out,
                                                      const vector<double>* multiplicities = nullptr);
+
+        /// Same as maximum_mapping_quality_exact except alway s computes mapping
+        /// quality for the first score
+        static double first_mapping_quality_exact(const vector<double>& scaled_scores,
+                                                  const vector<double>* multiplicities = nullptr);
+        /// Same as maximum_mapping_quality_approx except alway s computes mapping
+        /// quality for the first score
+        static double first_mapping_quality_approx(const vector<double>& scaled_scores,
+                                                   const vector<double>* multiplicities = nullptr);
     protected:
         double group_mapping_quality_exact(const vector<double>& scaled_scores, const vector<size_t>& group,
                                            const vector<double>* multiplicities = nullptr) const;
@@ -217,11 +226,17 @@ namespace vg {
                                             double maybe_mq_threshold,
                                             double identity_weight) const;
 
+        /// Computes mapping quality for the first score in a vector of scores.
+        /// Optionally includes a vector of implicit counts >= 1 for the scores, but
+        /// the first mapping quality is always calculated as if it multiplicity is 1.
+        int32_t compute_first_mapping_quality(const vector<double>& scores, bool fast_approximation,
+                                              const vector<double>* multiplicities = nullptr) const;
+
         /// Computes mapping quality for the optimal score in a vector of scores.
         /// Optionally includes a vector of implicit counts >= 1 for the scores, but
         /// the mapping quality is always calculated as if it multiplicity is 1.
-        int32_t compute_mapping_quality(const vector<double>& scores, bool fast_approximation,
-                                        const vector<double>* multiplicities = nullptr) const;
+        int32_t compute_max_mapping_quality(const vector<double>& scores, bool fast_approximation,
+                                            const vector<double>* multiplicities = nullptr) const;
 
         /// Computes mapping quality for a group of scores in a vector of scores (group given by indexes).
         /// Optionally includes a vector of implicit counts >= 1 for the score, but the mapping quality is always