From e6d9f491c0c3bf48a85b735005011a13a1da1bf7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torbj=C3=B8rn=20Rognes?= Date: Thu, 25 Apr 2024 19:29:56 +0200 Subject: [PATCH] Add sintax_random option, fix bug, and improve performance #535 --- man/vsearch.1 | 29 +++++-- src/sintax.cc | 224 ++++++++++++++++++++++++++++++++++++++++--------- src/vsearch.cc | 9 ++ src/vsearch.h | 1 + 4 files changed, 214 insertions(+), 49 deletions(-) diff --git a/man/vsearch.1 b/man/vsearch.1 index e9e18074..cbede33b 100644 --- a/man/vsearch.1 +++ b/man/vsearch.1 @@ -3575,13 +3575,13 @@ reference sequence database is specified with the \-\-db option. The results are written in a tab delimited text file whose name is specified with the \-\-tabbedout option. The \-\-sintax_cutoff option may be used to set a minimum level of bootstrap support for the -taxonomic ranks to be reported. The `--randseed` option may be +taxonomic ranks to be reported. The \-\-randseed option may be included to specify a seed for initialisation of the random number generator used by the algorithm. Please note that when using multiple -threads, the `--randseed` option may not work as intended, because +threads, the \-\-randseed option may not work as intended, because sequences may be processed in a random order by different threads. To -ensure the same results each time, use a single thread (`--threads 1`) -in combination with a fixed random seed specified with `--randseed`. +ensure the same results each time, use a single thread \-\-threads 1) +in combination with a fixed random seed specified with \-\-randseed. .PP Multithreading is supported. Databases in UDB files are supported. The strand option may be specified. @@ -3600,6 +3600,13 @@ Example: ">X80725_S000004313;\:tax=d:Bacteria,\:p:Proteobacteria,\:c:Gammaproteo The option \-\-notrunclabels is turned on by default for this command, allowing spaces in the taxonomic identifiers. .PP +If two sequences in the reference database has equally many kmer +matches with the query, the shortest sequence will be chosen by +default. If they are equally long, the sequence appearing first in the +database will be chosen. If the recommended option \-\-sintax_random +is specified, sequences with an equal number of kmer matches will be +choosen by a random draw. +.PP .TAG db .TP 9 .BI \-\-db \0filename @@ -3612,17 +3619,23 @@ Use \fIinteger\fR as seed for the random number generator used in the Sintax algorithm. A given seed always produces the same output order (useful for replicability). Set to 0 to use a pseudo-random seed (default behaviour). Does not work correctly with multiple threads; -please use `--threads 1` to ensure correct behaviour. +please use \-\-threads 1 to ensure correct behaviour. +.TAG sintax +.TP +.BI \-\-sintax \0filename +Read the input sequences from \fIfilename\fR, in FASTA or FASTQ format. .TAG sintax_cutoff .TP .BI \-\-sintax_cutoff\~ "real" Specify a minimum level of bootstrap support for the taxonomic ranks that will be included in column 4 of the output file. For instance 0.9, corresponding to 90%. -.TAG sintax +.TAG sintax_random .TP -.BI \-\-sintax \0filename -Read the input sequences from \fIfilename\fR, in FASTA or FASTQ format. +.B \-\-sintax_random +Break ties between sequences with equally many kmer matches by a +random draw. This option is recommended and may be made the default in +the future. .TAG tabbedout .TP .BI \-\-tabbedout \0filename diff --git a/src/sintax.cc b/src/sintax.cc index 3c88328a..74d5d7d0 100644 --- a/src/sintax.cc +++ b/src/sintax.cc @@ -60,7 +60,7 @@ /* - Implements the Sintax algorithm as desribed in Robert Edgar's preprint: + Implements the Sintax algorithm as described in Robert Edgar's preprint: Robert Edgar (2016) SINTAX: a simple non-Bayesian taxonomy classifier for 16S and ITS sequences @@ -71,6 +71,10 @@ https://www.drive5.com/usearch/manual/cmd_sintax.html + + Note that due to the lack of details in the description, this implementation + in vsearch is surely somewhat different from the one in usearch. + */ #include "vsearch.h" @@ -98,48 +102,87 @@ static int classified = 0; void sintax_analyse(char * query_head, int strand, - int best_seqno, int * all_seqno, int count) { - int best_level_start[tax_levels]; - int best_level_len[tax_levels]; int level_match[tax_levels]; + int level_best[tax_levels]; - /* check number of successful bootstraps */ - if (count >= (bootstrap_count+1) / 2) - { - char * best_h = db_getheader(best_seqno); + char * cand_level_start[bootstrap_count][tax_levels]; + int cand_level_len[bootstrap_count][tax_levels]; + int cand_level_matches[bootstrap_count][tax_levels]; - tax_split(best_seqno, best_level_start, best_level_len); + /* Check number of successful bootstraps, must be at least half */ - for (int & j : - level_match) - { - j = 0; - } + bool enough = count >= (bootstrap_count + 1) / 2; - for (int i = 0; i < count; i++) + if (enough) + { + /* Find the most common name at each taxonomic rank, + but with the same names at higher ranks. */ + + for (int i = 0; i < count ; i++) { - /* For each bootstrap experiment */ + /* Split headers of all candidates by taxonomy ranks */ - int level_start[tax_levels]; - int level_len[tax_levels]; - tax_split(all_seqno[i], level_start, level_len); + int seqno = all_seqno[i]; + int new_level_start[tax_levels]; + int new_level_len[tax_levels]; + tax_split(seqno, new_level_start, new_level_len); + for (int k = 0; k < tax_levels; k++) + { + cand_level_start[i][k] = db_getheader(seqno)+new_level_start[k]; + cand_level_len[i][k] = new_level_len[k]; + } + } - char * h = db_getheader(all_seqno[i]); + /* Count matching names among candidates */ - for (int j = 0; j < tax_levels; j++) + for (int i = 0; i < count ; i++) + for (int k = 0; k < tax_levels; k++) + cand_level_matches[i][k] = 0; + + for (int k = 0; k < tax_levels; k++) + for (int i = 0; i < count ; i++) + for (int c = 0; c <= i ; c++) { - /* For each taxonomic level */ + /* check match at current and all higher levels */ + bool match = true; + for (int j = 0; j <= k; j++) + { + if ((cand_level_len[i][j] != cand_level_len[c][j]) || + (strncmp(cand_level_start[i][j], + cand_level_start[c][j], + cand_level_len[i][j]) != 0)) + { + match = false; + break; + } + } + if (match) + { + cand_level_matches[c][k]++; + break; /* stop at first match */ + } + } + + /* Find most common name at each taxonomic level */ - if ((level_len[j] == best_level_len[j]) && - (strncmp(best_h + best_level_start[j], - h + level_start[j], - level_len[j]) == 0)) + for (int k = 0; k < tax_levels; k++) + { + level_best[k] = -1; + level_match[k] = 0; + int m = 0; + for (int i = 0; i < count ; i++) + { + m += cand_level_matches[i][k]; + if (cand_level_matches[i][k] > level_match[k]) { - level_match[j]++; + level_best[k] = i; + level_match[k] = cand_level_matches[i][k]; } + if (m >= count) + break; } } } @@ -150,23 +193,22 @@ void sintax_analyse(char * query_head, queries++; - if (count >= bootstrap_count / 2) + if (enough) { - char * best_h = db_getheader(best_seqno); - classified++; bool comma = false; for (int j = 0; j < tax_levels; j++) { - if (best_level_len[j] > 0) + int best = level_best[j]; + if (cand_level_len[best][j] > 0) { fprintf(fp_tabbedout, "%s%c:%.*s(%.2f)", (comma ? "," : ""), tax_letters[j], - best_level_len[j], - best_h + best_level_start[j], + cand_level_len[best][j], + cand_level_start[best][j], 1.0 * level_match[j] / count); comma = true; } @@ -180,15 +222,16 @@ void sintax_analyse(char * query_head, bool comma = false; for (int j = 0; j < tax_levels; j++) { - if ((best_level_len[j] > 0) && + int best = level_best[j]; + if ((cand_level_len[best][j] > 0) && (1.0 * level_match[j] / count >= opt_sintax_cutoff)) { fprintf(fp_tabbedout, "%s%c:%.*s", (comma ? "," : ""), tax_letters[j], - best_level_len[j], - best_h + best_level_start[j]); + cand_level_len[best][j], + cand_level_start[best][j]); comma = true; } } @@ -210,10 +253,111 @@ void sintax_analyse(char * query_head, xpthread_mutex_unlock(&mutex_output); } +void sintax_search_topscores(struct searchinfo_s * si) +{ + /* + Count the kmer hits in each database sequence and + make a sorted list of a given number (th) + of the database sequences with the highest number of matching kmers. + These are stored in the min heap array. + */ + + /* count kmer hits in the database sequences */ + const int indexed_count = dbindex_getcount(); + + /* zero counts */ + memset(si->kmers, 0, indexed_count * sizeof(count_t)); + + for(unsigned int i = 0; i < si->kmersamplecount; i++) + { + unsigned int kmer = si->kmersample[i]; + unsigned char * bitmap = dbindex_getbitmap(kmer); + + if (bitmap) + { +#ifdef __x86_64__ + if (ssse3_present) + { + increment_counters_from_bitmap_ssse3(si->kmers, + bitmap, indexed_count); + } + else + { + increment_counters_from_bitmap_sse2(si->kmers, + bitmap, indexed_count); + } +#else + increment_counters_from_bitmap(si->kmers, bitmap, indexed_count); +#endif + } + else + { + unsigned int * list = dbindex_getmatchlist(kmer); + unsigned int count = dbindex_getmatchcount(kmer); + for(unsigned int j = 0; j < count; j++) + { + si->kmers[list[j]]++; + } + } + } + + unsigned int tophits = 0; + + elem_t best; + best.count = 0; + best.seqno = 0; + best.length = 0; + + for(int i = 0; i < indexed_count; i++) + { + count_t count = si->kmers[i]; + unsigned int seqno = dbindex_getmapping(i); + unsigned int length = db_getsequencelen(best.seqno); + + if (count > best.count) + { + best.count = count; + best.seqno = seqno; + best.length = length; + tophits = 1; + } + else if (count == best.count) + { + if (opt_sintax_random) + { + tophits++; + if (random_int(tophits) == 0) + { + best.seqno = seqno; + best.length = length; + } + } + else + { + if (length < best.length) + { + best.seqno = seqno; + best.length = length; + } + else if (length == best.length) + { + if (seqno < best.seqno) + { + best.seqno = seqno; + } + } + } + } + } + + minheap_empty(si->m); + if (best.count > 1) + minheap_add(si->m, & best); +} + void sintax_query(int64_t t) { int all_seqno[2][bootstrap_count]; - int best_seqno[2] = {0, 0}; int boot_count[2] = {0, 0}; unsigned int best_count[2] = {0, 0}; int qseqlen = si_plus[t].qseqlen; @@ -258,9 +402,9 @@ void sintax_query(int64_t t) si->kmersamplecount = subsamples; si->kmersample = kmersample_subset; - search_topscores(si); + sintax_search_topscores(si); - while(!minheap_isempty(si->m)) + if (! minheap_isempty(si->m)) { elem_t e = minheap_poplast(si->m); @@ -269,7 +413,6 @@ void sintax_query(int64_t t) if (e.count > best_count[s]) { best_count[s] = e.count; - best_seqno[s] = e.seqno; } } } @@ -307,7 +450,6 @@ void sintax_query(int64_t t) sintax_analyse(query_head, best_strand, - best_seqno[best_strand], all_seqno[best_strand], boot_count[best_strand]); diff --git a/src/vsearch.cc b/src/vsearch.cc index 4af30bbb..51d82d63 100644 --- a/src/vsearch.cc +++ b/src/vsearch.cc @@ -82,6 +82,7 @@ bool opt_relabel_self; bool opt_relabel_sha1; bool opt_samheader; bool opt_sff_clip; +bool opt_sintax_random; bool opt_sizein; bool opt_sizeorder; bool opt_sizeout; @@ -959,6 +960,7 @@ void args_init(int argc, char **argv) opt_shuffle = nullptr; opt_sintax = nullptr; opt_sintax_cutoff = 0.0; + opt_sintax_random = false; opt_sizein = false; opt_sizeorder = false; opt_sizeout = false; @@ -1203,6 +1205,7 @@ void args_init(int argc, char **argv) option_shuffle, option_sintax, option_sintax_cutoff, + option_sintax_random, option_sizein, option_sizeorder, option_sizeout, @@ -1449,6 +1452,7 @@ void args_init(int argc, char **argv) {"shuffle", required_argument, nullptr, 0 }, {"sintax", required_argument, nullptr, 0 }, {"sintax_cutoff", required_argument, nullptr, 0 }, + {"sintax_random", no_argument, nullptr, 0 }, {"sizein", no_argument, nullptr, 0 }, {"sizeorder", no_argument, nullptr, 0 }, {"sizeout", no_argument, nullptr, 0 }, @@ -2547,6 +2551,10 @@ void args_init(int argc, char **argv) opt_chimeras_diff_pct = args_getdouble(optarg); break; + case option_sintax_random: + opt_sintax_random = true; + break; + default: fatal("Internal error in option parsing"); } @@ -4082,6 +4090,7 @@ void args_init(int argc, char **argv) option_quiet, option_randseed, option_sintax_cutoff, + option_sintax_random, option_strand, option_tabbedout, option_threads, diff --git a/src/vsearch.h b/src/vsearch.h index d9590da5..a0bd406b 100644 --- a/src/vsearch.h +++ b/src/vsearch.h @@ -283,6 +283,7 @@ extern bool opt_relabel_self; extern bool opt_relabel_sha1; extern bool opt_samheader; extern bool opt_sff_clip; +extern bool opt_sintax_random; extern bool opt_sizein; extern bool opt_sizeorder; extern bool opt_sizeout;