Skip to content

Commit

Permalink
Add sintax_random option, fix bug, and improve performance #535
Browse files Browse the repository at this point in the history
  • Loading branch information
torognes committed Apr 25, 2024
1 parent c37ccba commit e6d9f49
Show file tree
Hide file tree
Showing 4 changed files with 214 additions and 49 deletions.
29 changes: 21 additions & 8 deletions man/vsearch.1
Original file line number Diff line number Diff line change
Expand Up @@ -3575,13 +3575,13 @@ reference sequence database is specified with the \-\-db option. The
results are written in a tab delimited text file whose name is
specified with the \-\-tabbedout option. The \-\-sintax_cutoff option
may be used to set a minimum level of bootstrap support for the
taxonomic ranks to be reported. The `--randseed` option may be
taxonomic ranks to be reported. The \-\-randseed option may be
included to specify a seed for initialisation of the random number
generator used by the algorithm. Please note that when using multiple
threads, the `--randseed` option may not work as intended, because
threads, the \-\-randseed option may not work as intended, because
sequences may be processed in a random order by different threads. To
ensure the same results each time, use a single thread (`--threads 1`)
in combination with a fixed random seed specified with `--randseed`.
ensure the same results each time, use a single thread \-\-threads 1)
in combination with a fixed random seed specified with \-\-randseed.
.PP
Multithreading is supported. Databases in UDB files are supported.
The strand option may be specified.
Expand All @@ -3600,6 +3600,13 @@ Example: ">X80725_S000004313;\:tax=d:Bacteria,\:p:Proteobacteria,\:c:Gammaproteo
The option \-\-notrunclabels is turned on by default for this command,
allowing spaces in the taxonomic identifiers.
.PP
If two sequences in the reference database has equally many kmer
matches with the query, the shortest sequence will be chosen by
default. If they are equally long, the sequence appearing first in the
database will be chosen. If the recommended option \-\-sintax_random
is specified, sequences with an equal number of kmer matches will be
choosen by a random draw.
.PP
.TAG db
.TP 9
.BI \-\-db \0filename
Expand All @@ -3612,17 +3619,23 @@ Use \fIinteger\fR as seed for the random number generator used in the
Sintax algorithm. A given seed always produces the same output order
(useful for replicability). Set to 0 to use a pseudo-random seed
(default behaviour). Does not work correctly with multiple threads;
please use `--threads 1` to ensure correct behaviour.
please use \-\-threads 1 to ensure correct behaviour.
.TAG sintax
.TP
.BI \-\-sintax \0filename
Read the input sequences from \fIfilename\fR, in FASTA or FASTQ format.
.TAG sintax_cutoff
.TP
.BI \-\-sintax_cutoff\~ "real"
Specify a minimum level of bootstrap support for the taxonomic ranks
that will be included in column 4 of the output file. For instance
0.9, corresponding to 90%.
.TAG sintax
.TAG sintax_random
.TP
.BI \-\-sintax \0filename
Read the input sequences from \fIfilename\fR, in FASTA or FASTQ format.
.B \-\-sintax_random
Break ties between sequences with equally many kmer matches by a
random draw. This option is recommended and may be made the default in
the future.
.TAG tabbedout
.TP
.BI \-\-tabbedout \0filename
Expand Down
224 changes: 183 additions & 41 deletions src/sintax.cc
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@

/*
Implements the Sintax algorithm as desribed in Robert Edgar's preprint:
Implements the Sintax algorithm as described in Robert Edgar's preprint:
Robert Edgar (2016)
SINTAX: a simple non-Bayesian taxonomy classifier for 16S and ITS sequences
Expand All @@ -71,6 +71,10 @@
https://www.drive5.com/usearch/manual/cmd_sintax.html
Note that due to the lack of details in the description, this implementation
in vsearch is surely somewhat different from the one in usearch.
*/

#include "vsearch.h"
Expand Down Expand Up @@ -98,48 +102,87 @@ static int classified = 0;

void sintax_analyse(char * query_head,
int strand,
int best_seqno,
int * all_seqno,
int count)
{
int best_level_start[tax_levels];
int best_level_len[tax_levels];
int level_match[tax_levels];
int level_best[tax_levels];

/* check number of successful bootstraps */
if (count >= (bootstrap_count+1) / 2)
{
char * best_h = db_getheader(best_seqno);
char * cand_level_start[bootstrap_count][tax_levels];
int cand_level_len[bootstrap_count][tax_levels];
int cand_level_matches[bootstrap_count][tax_levels];

tax_split(best_seqno, best_level_start, best_level_len);
/* Check number of successful bootstraps, must be at least half */

for (int & j :
level_match)
{
j = 0;
}
bool enough = count >= (bootstrap_count + 1) / 2;

for (int i = 0; i < count; i++)
if (enough)
{
/* Find the most common name at each taxonomic rank,
but with the same names at higher ranks. */

for (int i = 0; i < count ; i++)
{
/* For each bootstrap experiment */
/* Split headers of all candidates by taxonomy ranks */

int level_start[tax_levels];
int level_len[tax_levels];
tax_split(all_seqno[i], level_start, level_len);
int seqno = all_seqno[i];
int new_level_start[tax_levels];
int new_level_len[tax_levels];
tax_split(seqno, new_level_start, new_level_len);
for (int k = 0; k < tax_levels; k++)
{
cand_level_start[i][k] = db_getheader(seqno)+new_level_start[k];
cand_level_len[i][k] = new_level_len[k];
}
}

char * h = db_getheader(all_seqno[i]);
/* Count matching names among candidates */

for (int j = 0; j < tax_levels; j++)
for (int i = 0; i < count ; i++)
for (int k = 0; k < tax_levels; k++)
cand_level_matches[i][k] = 0;

for (int k = 0; k < tax_levels; k++)
for (int i = 0; i < count ; i++)
for (int c = 0; c <= i ; c++)
{
/* For each taxonomic level */
/* check match at current and all higher levels */
bool match = true;
for (int j = 0; j <= k; j++)
{
if ((cand_level_len[i][j] != cand_level_len[c][j]) ||
(strncmp(cand_level_start[i][j],
cand_level_start[c][j],
cand_level_len[i][j]) != 0))
{
match = false;
break;
}
}
if (match)
{
cand_level_matches[c][k]++;
break; /* stop at first match */
}
}

/* Find most common name at each taxonomic level */

if ((level_len[j] == best_level_len[j]) &&
(strncmp(best_h + best_level_start[j],
h + level_start[j],
level_len[j]) == 0))
for (int k = 0; k < tax_levels; k++)
{
level_best[k] = -1;
level_match[k] = 0;
int m = 0;
for (int i = 0; i < count ; i++)
{
m += cand_level_matches[i][k];
if (cand_level_matches[i][k] > level_match[k])
{
level_match[j]++;
level_best[k] = i;
level_match[k] = cand_level_matches[i][k];
}
if (m >= count)
break;
}
}
}
Expand All @@ -150,23 +193,22 @@ void sintax_analyse(char * query_head,

queries++;

if (count >= bootstrap_count / 2)
if (enough)
{
char * best_h = db_getheader(best_seqno);

classified++;

bool comma = false;
for (int j = 0; j < tax_levels; j++)
{
if (best_level_len[j] > 0)
int best = level_best[j];
if (cand_level_len[best][j] > 0)
{
fprintf(fp_tabbedout,
"%s%c:%.*s(%.2f)",
(comma ? "," : ""),
tax_letters[j],
best_level_len[j],
best_h + best_level_start[j],
cand_level_len[best][j],
cand_level_start[best][j],
1.0 * level_match[j] / count);
comma = true;
}
Expand All @@ -180,15 +222,16 @@ void sintax_analyse(char * query_head,
bool comma = false;
for (int j = 0; j < tax_levels; j++)
{
if ((best_level_len[j] > 0) &&
int best = level_best[j];
if ((cand_level_len[best][j] > 0) &&
(1.0 * level_match[j] / count >= opt_sintax_cutoff))
{
fprintf(fp_tabbedout,
"%s%c:%.*s",
(comma ? "," : ""),
tax_letters[j],
best_level_len[j],
best_h + best_level_start[j]);
cand_level_len[best][j],
cand_level_start[best][j]);
comma = true;
}
}
Expand All @@ -210,10 +253,111 @@ void sintax_analyse(char * query_head,
xpthread_mutex_unlock(&mutex_output);
}

void sintax_search_topscores(struct searchinfo_s * si)
{
/*
Count the kmer hits in each database sequence and
make a sorted list of a given number (th)
of the database sequences with the highest number of matching kmers.
These are stored in the min heap array.
*/

/* count kmer hits in the database sequences */
const int indexed_count = dbindex_getcount();

/* zero counts */
memset(si->kmers, 0, indexed_count * sizeof(count_t));

for(unsigned int i = 0; i < si->kmersamplecount; i++)
{
unsigned int kmer = si->kmersample[i];
unsigned char * bitmap = dbindex_getbitmap(kmer);

if (bitmap)
{
#ifdef __x86_64__
if (ssse3_present)
{
increment_counters_from_bitmap_ssse3(si->kmers,
bitmap, indexed_count);
}
else
{
increment_counters_from_bitmap_sse2(si->kmers,
bitmap, indexed_count);
}
#else
increment_counters_from_bitmap(si->kmers, bitmap, indexed_count);
#endif
}
else
{
unsigned int * list = dbindex_getmatchlist(kmer);
unsigned int count = dbindex_getmatchcount(kmer);
for(unsigned int j = 0; j < count; j++)
{
si->kmers[list[j]]++;
}
}
}

unsigned int tophits = 0;

elem_t best;
best.count = 0;
best.seqno = 0;
best.length = 0;

for(int i = 0; i < indexed_count; i++)
{
count_t count = si->kmers[i];
unsigned int seqno = dbindex_getmapping(i);
unsigned int length = db_getsequencelen(best.seqno);

if (count > best.count)
{
best.count = count;
best.seqno = seqno;
best.length = length;
tophits = 1;
}
else if (count == best.count)
{
if (opt_sintax_random)
{
tophits++;
if (random_int(tophits) == 0)
{
best.seqno = seqno;
best.length = length;
}
}
else
{
if (length < best.length)
{
best.seqno = seqno;
best.length = length;
}
else if (length == best.length)
{
if (seqno < best.seqno)
{
best.seqno = seqno;
}
}
}
}
}

minheap_empty(si->m);
if (best.count > 1)
minheap_add(si->m, & best);
}

void sintax_query(int64_t t)
{
int all_seqno[2][bootstrap_count];
int best_seqno[2] = {0, 0};
int boot_count[2] = {0, 0};
unsigned int best_count[2] = {0, 0};
int qseqlen = si_plus[t].qseqlen;
Expand Down Expand Up @@ -258,9 +402,9 @@ void sintax_query(int64_t t)
si->kmersamplecount = subsamples;
si->kmersample = kmersample_subset;

search_topscores(si);
sintax_search_topscores(si);

while(!minheap_isempty(si->m))
if (! minheap_isempty(si->m))
{
elem_t e = minheap_poplast(si->m);

Expand All @@ -269,7 +413,6 @@ void sintax_query(int64_t t)
if (e.count > best_count[s])
{
best_count[s] = e.count;
best_seqno[s] = e.seqno;
}
}
}
Expand Down Expand Up @@ -307,7 +450,6 @@ void sintax_query(int64_t t)

sintax_analyse(query_head,
best_strand,
best_seqno[best_strand],
all_seqno[best_strand],
boot_count[best_strand]);

Expand Down

0 comments on commit e6d9f49

Please sign in to comment.