It seems that when the site errors out, `self.options.skipDuplicates` is set to `false` but never set back to `true` in the current version. This allows duplicate URLs to end up being crawled by the system.
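For reference, the current retry handling appears to look roughly like this (a paraphrased sketch based on the behaviour described above, not a verbatim copy of the source):

```js
// Sketch of the current behaviour: skipDuplicates is forced to false
// before the retry and is never restored afterwards.
if (options.retries) {
    self.options.skipDuplicates = false; // never set back to its previous value
    setTimeout(function() {
        options.retries--;
        self.queue(options);
        options.release();
    }, options.retryTimeout);
} else {
    options.callback(error, {options: options}, options.release);
}
```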
A better solution could be something like this:
```js
if (options.retries) {
    setTimeout(function() {
        options.retries--;
        // Remember the configured value, disable duplicate checking just long
        // enough to re-queue the failed request, then restore it.
        const skipDuplicates = self.options.skipDuplicates;
        self.options.skipDuplicates = false;
        self.queue(options);
        self.options.skipDuplicates = skipDuplicates;
        options.release();
    }, options.retryTimeout);
} else {
    options.callback(error, {options: options}, options.release);
}
```
If the `.queue` method can throw, then something like this:
```js
if (options.retries) {
    setTimeout(function() {
        options.retries--;
        const skipDuplicates = self.options.skipDuplicates;
        try {
            self.options.skipDuplicates = false;
            self.queue(options);
        } finally {
            // Restore the original setting even if queue() throws.
            self.options.skipDuplicates = skipDuplicates;
        }
        options.release();
    }, options.retryTimeout);
} else {
    options.callback(error, {options: options}, options.release);
}
```
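As a standalone illustration of why the `try`/`finally` variant is safer (using a hypothetical minimal stand-in for the crawler, not the actual node-crawler API), the saved value is restored even when `queue()` throws:

```js
// Hypothetical minimal stand-in for the crawler, just to show the pattern.
const self = {
    options: { skipDuplicates: true },
    queue(options) {
        throw new Error('queue failed for ' + options.uri);
    },
};

const options = { uri: 'http://example.com/', retries: 1 };

const skipDuplicates = self.options.skipDuplicates;
try {
    self.options.skipDuplicates = false;
    self.queue(options);
} catch (err) {
    console.error(err.message); // queue failed for http://example.com/
} finally {
    // Restored even though queue() threw.
    self.options.skipDuplicates = skipDuplicates;
}

console.log(self.options.skipDuplicates); // true
```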