From e6d9f491c0c3bf48a85b735005011a13a1da1bf7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Torbj=C3=B8rn=20Rognes?= <torognes@ifi.uio.no>
Date: Thu, 25 Apr 2024 19:29:56 +0200
Subject: [PATCH] Add sintax_random option, fix bug, and improve performance
 #535

---
 man/vsearch.1  |  29 +++++--
 src/sintax.cc  | 224 ++++++++++++++++++++++++++++++++++++++++---------
 src/vsearch.cc |   9 ++
 src/vsearch.h  |   1 +
 4 files changed, 214 insertions(+), 49 deletions(-)

diff --git a/man/vsearch.1 b/man/vsearch.1
index e9e18074..cbede33b 100644
--- a/man/vsearch.1
+++ b/man/vsearch.1
@@ -3575,13 +3575,13 @@ reference sequence database is specified with the \-\-db option. The
 results are written in a tab delimited text file whose name is
 specified with the \-\-tabbedout option. The \-\-sintax_cutoff option
 may be used to set a minimum level of bootstrap support for the
-taxonomic ranks to be reported. The `--randseed` option may be
+taxonomic ranks to be reported. The \-\-randseed option may be
 included to specify a seed for initialisation of the random number
 generator used by the algorithm. Please note that when using multiple
-threads, the `--randseed` option may not work as intended, because
+threads, the \-\-randseed option may not work as intended, because
 sequences may be processed in a random order by different threads. To
-ensure the same results each time, use a single thread (`--threads 1`)
-in combination with a fixed random seed specified with `--randseed`.
+ensure the same results each time, use a single thread \-\-threads 1)
+in combination with a fixed random seed specified with \-\-randseed.
 .PP
 Multithreading is supported. Databases in UDB files are supported.
 The strand option may be specified.
@@ -3600,6 +3600,13 @@ Example: ">X80725_S000004313;\:tax=d:Bacteria,\:p:Proteobacteria,\:c:Gammaproteo
 The option \-\-notrunclabels is turned on by default for this command,
 allowing spaces in the taxonomic identifiers.
 .PP
+If two sequences in the reference database has equally many kmer
+matches with the query, the shortest sequence will be chosen by
+default. If they are equally long, the sequence appearing first in the
+database will be chosen. If the recommended option \-\-sintax_random
+is specified, sequences with an equal number of kmer matches will be
+choosen by a random draw.
+.PP
 .TAG db
 .TP 9
 .BI \-\-db \0filename
@@ -3612,17 +3619,23 @@ Use \fIinteger\fR as seed for the random number generator used in the
 Sintax algorithm. A given seed always produces the same output order
 (useful for replicability). Set to 0 to use a pseudo-random seed
 (default behaviour). Does not work correctly with multiple threads;
-please use `--threads 1` to ensure correct behaviour.
+please use \-\-threads 1 to ensure correct behaviour.
+.TAG sintax
+.TP
+.BI \-\-sintax \0filename
+Read the input sequences from \fIfilename\fR, in FASTA or FASTQ format.
 .TAG sintax_cutoff
 .TP
 .BI \-\-sintax_cutoff\~ "real"
 Specify a minimum level of bootstrap support for the taxonomic ranks
 that will be included in column 4 of the output file. For instance
 0.9, corresponding to 90%.
-.TAG sintax
+.TAG sintax_random
 .TP
-.BI \-\-sintax \0filename
-Read the input sequences from \fIfilename\fR, in FASTA or FASTQ format.
+.B \-\-sintax_random
+Break ties between sequences with equally many kmer matches by a
+random draw. This option is recommended and may be made the default in
+the future.
 .TAG tabbedout
 .TP
 .BI \-\-tabbedout \0filename
diff --git a/src/sintax.cc b/src/sintax.cc
index 3c88328a..74d5d7d0 100644
--- a/src/sintax.cc
+++ b/src/sintax.cc
@@ -60,7 +60,7 @@
 
 /*
 
-  Implements the Sintax algorithm as desribed in Robert Edgar's preprint:
+  Implements the Sintax algorithm as described in Robert Edgar's preprint:
 
   Robert Edgar (2016)
   SINTAX: a simple non-Bayesian taxonomy classifier for 16S and ITS sequences
@@ -71,6 +71,10 @@
 
   https://www.drive5.com/usearch/manual/cmd_sintax.html
 
+
+  Note that due to the lack of details in the description, this implementation
+  in vsearch is surely somewhat different from the one in usearch.
+
 */
 
 #include "vsearch.h"
@@ -98,48 +102,87 @@ static int classified = 0;
 
 void sintax_analyse(char * query_head,
                     int strand,
-                    int best_seqno,
                     int * all_seqno,
                     int count)
 {
-  int best_level_start[tax_levels];
-  int best_level_len[tax_levels];
   int level_match[tax_levels];
+  int level_best[tax_levels];
 
-  /* check number of successful bootstraps */
-  if (count >= (bootstrap_count+1) / 2)
-    {
-      char * best_h = db_getheader(best_seqno);
+  char * cand_level_start[bootstrap_count][tax_levels];
+  int cand_level_len[bootstrap_count][tax_levels];
+  int cand_level_matches[bootstrap_count][tax_levels];
 
-      tax_split(best_seqno, best_level_start, best_level_len);
+  /* Check number of successful bootstraps, must be at least half */
 
-      for (int & j :
-             level_match)
-        {
-          j = 0;
-        }
+  bool enough = count >= (bootstrap_count + 1) / 2;
 
-      for (int i = 0; i < count; i++)
+  if (enough)
+    {
+      /* Find the most common name at each taxonomic rank,
+         but with the same names at higher ranks. */
+
+      for (int i = 0; i < count ; i++)
         {
-          /* For each bootstrap experiment */
+          /* Split headers of all candidates by taxonomy ranks */
 
-          int level_start[tax_levels];
-          int level_len[tax_levels];
-          tax_split(all_seqno[i], level_start, level_len);
+          int seqno = all_seqno[i];
+          int new_level_start[tax_levels];
+          int new_level_len[tax_levels];
+          tax_split(seqno, new_level_start, new_level_len);
+          for (int k = 0; k < tax_levels; k++)
+            {
+              cand_level_start[i][k] = db_getheader(seqno)+new_level_start[k];
+              cand_level_len[i][k] = new_level_len[k];
+            }
+        }
 
-          char * h = db_getheader(all_seqno[i]);
+      /* Count matching names among candidates */
 
-          for (int j = 0; j < tax_levels; j++)
+      for (int i = 0; i < count ; i++)
+        for (int k = 0; k < tax_levels; k++)
+          cand_level_matches[i][k] = 0;
+
+      for (int k = 0; k < tax_levels; k++)
+        for (int i = 0; i < count ; i++)
+          for (int c = 0; c <= i ; c++)
             {
-              /* For each taxonomic level */
+              /* check match at current and all higher levels */
+              bool match = true;
+              for (int j = 0; j <= k; j++)
+                {
+                  if ((cand_level_len[i][j] != cand_level_len[c][j]) ||
+                      (strncmp(cand_level_start[i][j],
+                               cand_level_start[c][j],
+                               cand_level_len[i][j]) != 0))
+                    {
+                      match = false;
+                      break;
+                    }
+                }
+              if (match)
+                {
+                  cand_level_matches[c][k]++;
+                  break; /* stop at first match */
+                }
+            }
+
+      /* Find most common name at each taxonomic level */
 
-              if ((level_len[j] == best_level_len[j]) &&
-                  (strncmp(best_h + best_level_start[j],
-                           h + level_start[j],
-                           level_len[j]) == 0))
+      for (int k = 0; k < tax_levels; k++)
+        {
+          level_best[k] = -1;
+          level_match[k] = 0;
+          int m = 0;
+          for (int i = 0; i < count ; i++)
+            {
+              m += cand_level_matches[i][k];
+              if (cand_level_matches[i][k] > level_match[k])
                 {
-                  level_match[j]++;
+                  level_best[k] = i;
+                  level_match[k] = cand_level_matches[i][k];
                 }
+              if (m >= count)
+                break;
             }
         }
     }
@@ -150,23 +193,22 @@ void sintax_analyse(char * query_head,
 
   queries++;
 
-  if (count >= bootstrap_count / 2)
+  if (enough)
     {
-      char * best_h = db_getheader(best_seqno);
-
       classified++;
 
       bool comma = false;
       for (int j = 0; j < tax_levels; j++)
         {
-          if (best_level_len[j] > 0)
+          int best = level_best[j];
+          if (cand_level_len[best][j] > 0)
             {
               fprintf(fp_tabbedout,
                       "%s%c:%.*s(%.2f)",
                       (comma ? "," : ""),
                       tax_letters[j],
-                      best_level_len[j],
-                      best_h + best_level_start[j],
+                      cand_level_len[best][j],
+                      cand_level_start[best][j],
                       1.0 * level_match[j] / count);
               comma = true;
             }
@@ -180,15 +222,16 @@ void sintax_analyse(char * query_head,
           bool comma = false;
           for (int j = 0; j < tax_levels; j++)
             {
-              if ((best_level_len[j] > 0) &&
+              int best = level_best[j];
+              if ((cand_level_len[best][j] > 0) &&
                   (1.0 * level_match[j] / count >= opt_sintax_cutoff))
                 {
                   fprintf(fp_tabbedout,
                           "%s%c:%.*s",
                           (comma ? "," : ""),
                           tax_letters[j],
-                          best_level_len[j],
-                          best_h + best_level_start[j]);
+                          cand_level_len[best][j],
+                          cand_level_start[best][j]);
                   comma = true;
                 }
             }
@@ -210,10 +253,111 @@ void sintax_analyse(char * query_head,
   xpthread_mutex_unlock(&mutex_output);
 }
 
+void sintax_search_topscores(struct searchinfo_s * si)
+{
+  /*
+    Count the kmer hits in each database sequence and
+    make a sorted list of a given number (th)
+    of the database sequences with the highest number of matching kmers.
+    These are stored in the min heap array.
+  */
+
+  /* count kmer hits in the database sequences */
+  const int indexed_count = dbindex_getcount();
+
+  /* zero counts */
+  memset(si->kmers, 0, indexed_count * sizeof(count_t));
+
+  for(unsigned int i = 0; i < si->kmersamplecount; i++)
+    {
+      unsigned int kmer = si->kmersample[i];
+      unsigned char * bitmap = dbindex_getbitmap(kmer);
+
+      if (bitmap)
+        {
+#ifdef __x86_64__
+          if (ssse3_present)
+            {
+              increment_counters_from_bitmap_ssse3(si->kmers,
+                                                   bitmap, indexed_count);
+            }
+          else
+            {
+              increment_counters_from_bitmap_sse2(si->kmers,
+                                                  bitmap, indexed_count);
+            }
+#else
+          increment_counters_from_bitmap(si->kmers, bitmap, indexed_count);
+#endif
+        }
+      else
+        {
+          unsigned int * list = dbindex_getmatchlist(kmer);
+          unsigned int count = dbindex_getmatchcount(kmer);
+          for(unsigned int j = 0; j < count; j++)
+            {
+              si->kmers[list[j]]++;
+            }
+        }
+    }
+
+  unsigned int tophits = 0;
+
+  elem_t best;
+  best.count = 0;
+  best.seqno = 0;
+  best.length = 0;
+
+  for(int i = 0; i < indexed_count; i++)
+    {
+      count_t count = si->kmers[i];
+      unsigned int seqno = dbindex_getmapping(i);
+      unsigned int length = db_getsequencelen(best.seqno);
+
+      if (count > best.count)
+        {
+          best.count = count;
+          best.seqno = seqno;
+          best.length = length;
+          tophits = 1;
+        }
+      else if (count == best.count)
+        {
+          if (opt_sintax_random)
+            {
+              tophits++;
+              if (random_int(tophits) == 0)
+                {
+                  best.seqno = seqno;
+                  best.length = length;
+                }
+            }
+          else
+            {
+              if (length < best.length)
+                {
+                  best.seqno = seqno;
+                  best.length = length;
+                }
+              else if (length == best.length)
+                {
+                  if (seqno < best.seqno)
+                    {
+                      best.seqno = seqno;
+                    }
+                }
+            }
+        }
+    }
+
+  minheap_empty(si->m);
+  if (best.count > 1)
+    minheap_add(si->m, & best);
+}
+
 void sintax_query(int64_t t)
 {
   int all_seqno[2][bootstrap_count];
-  int best_seqno[2] = {0, 0};
   int boot_count[2] = {0, 0};
   unsigned int best_count[2] = {0, 0};
   int qseqlen = si_plus[t].qseqlen;
@@ -258,9 +402,9 @@ void sintax_query(int64_t t)
               si->kmersamplecount = subsamples;
               si->kmersample = kmersample_subset;
 
-              search_topscores(si);
+              sintax_search_topscores(si);
 
-              while(!minheap_isempty(si->m))
+              if (! minheap_isempty(si->m))
                 {
                   elem_t e = minheap_poplast(si->m);
 
@@ -269,7 +413,6 @@ void sintax_query(int64_t t)
                   if (e.count > best_count[s])
                     {
                       best_count[s] = e.count;
-                      best_seqno[s] = e.seqno;
                     }
                 }
             }
@@ -307,7 +450,6 @@ void sintax_query(int64_t t)
 
   sintax_analyse(query_head,
                  best_strand,
-                 best_seqno[best_strand],
                  all_seqno[best_strand],
                  boot_count[best_strand]);
 
diff --git a/src/vsearch.cc b/src/vsearch.cc
index 4af30bbb..51d82d63 100644
--- a/src/vsearch.cc
+++ b/src/vsearch.cc
@@ -82,6 +82,7 @@ bool opt_relabel_self;
 bool opt_relabel_sha1;
 bool opt_samheader;
 bool opt_sff_clip;
+bool opt_sintax_random;
 bool opt_sizein;
 bool opt_sizeorder;
 bool opt_sizeout;
@@ -959,6 +960,7 @@ void args_init(int argc, char **argv)
   opt_shuffle = nullptr;
   opt_sintax = nullptr;
   opt_sintax_cutoff = 0.0;
+  opt_sintax_random = false;
   opt_sizein = false;
   opt_sizeorder = false;
   opt_sizeout = false;
@@ -1203,6 +1205,7 @@ void args_init(int argc, char **argv)
       option_shuffle,
       option_sintax,
       option_sintax_cutoff,
+      option_sintax_random,
       option_sizein,
       option_sizeorder,
       option_sizeout,
@@ -1449,6 +1452,7 @@ void args_init(int argc, char **argv)
       {"shuffle",               required_argument, nullptr, 0 },
       {"sintax",                required_argument, nullptr, 0 },
       {"sintax_cutoff",         required_argument, nullptr, 0 },
+      {"sintax_random",         no_argument,       nullptr, 0 },
       {"sizein",                no_argument,       nullptr, 0 },
       {"sizeorder",             no_argument,       nullptr, 0 },
       {"sizeout",               no_argument,       nullptr, 0 },
@@ -2547,6 +2551,10 @@ void args_init(int argc, char **argv)
           opt_chimeras_diff_pct = args_getdouble(optarg);
           break;
 
+        case option_sintax_random:
+          opt_sintax_random = true;
+          break;
+
         default:
           fatal("Internal error in option parsing");
         }
@@ -4082,6 +4090,7 @@ void args_init(int argc, char **argv)
         option_quiet,
         option_randseed,
         option_sintax_cutoff,
+        option_sintax_random,
         option_strand,
         option_tabbedout,
         option_threads,
diff --git a/src/vsearch.h b/src/vsearch.h
index d9590da5..a0bd406b 100644
--- a/src/vsearch.h
+++ b/src/vsearch.h
@@ -283,6 +283,7 @@ extern bool opt_relabel_self;
 extern bool opt_relabel_sha1;
 extern bool opt_samheader;
 extern bool opt_sff_clip;
+extern bool opt_sintax_random;
 extern bool opt_sizein;
 extern bool opt_sizeorder;
 extern bool opt_sizeout;