Skip to content

Commit

Permalink
fix: better cancelling, error logging, fix rescan for remaining urls
Browse files Browse the repository at this point in the history
  • Loading branch information
popstas committed Apr 13, 2024
1 parent 856244a commit 273ae86
Showing 1 changed file with 21 additions and 5 deletions.
26 changes: 21 additions & 5 deletions src/scrap-site.js
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ let SKIP_JS = true;
// NOTE(review): presumably the number of attempts made when finishing a scan — usage is outside this view, confirm.
const finishTries = 5;
const saveAfterEvery = 100; // saveProgress after each N requests

// Module-level flag: set to true by the crawler's 'maxrequestreached' handler.
// The crawl error handler checks it (together with isCancalling) and returns
// early, so errors raised after the request limit is hit are not logged.
let isMaxRequested = false;

// NOTE(review): presumably collects log messages produced while disconnected so they can be resent — confirm against the resend logic.
let disconnectedLog = [];

// resend messages while disconnected
Expand Down Expand Up @@ -682,7 +684,11 @@ async function scrapSite ({baseUrl, options = {}}) {
}
catch (e) {
const err = e.message?.substring(0, 512);
console.log('Error while customCrawl:');
if (isCancalling || isMaxRequested) {
// Scan is cancelled, suppress errors
return;
}
log(`Error while scan ${crawler._options.url}`);
// console.log("result:", result);
if (!result.result) {
result = {
Expand All @@ -703,6 +709,7 @@ async function scrapSite ({baseUrl, options = {}}) {
// if (!result.response) result.response = { ok: true, url: crawler._options.url };
// console.log("err:", err);
const errText = `${err}`;
let isRetry = false;
if (errText.includes('net::ERR_CERT')) { // ERR_CERT_DATE_INVALID, net::ERR_CERT_AUTHORITY_INVALID, net::ERR_CERT_COMMON_NAME_INVALID
result.result.error = 'ssl_err';
result.response.status = -1;
Expand All @@ -713,7 +720,12 @@ async function scrapSite ({baseUrl, options = {}}) {
}
else if (errText.includes('Navigation Timeout Exceeded')) {
result.result.error = 'timeout';
throw e; // for retry
crawler._options.timeout = 30000;
isRetry = true;
}
else if (errText.includes('browser has disconnected')) {
result.result.error = 'browser_err';
isRetry = true;
}
else if (errText.includes('net::ERR_INVALID_RESPONSE')) {
result.result.error = 'invalid response';
Expand All @@ -731,10 +743,13 @@ async function scrapSite ({baseUrl, options = {}}) {
else if (errText.includes('URI malformed')) result.result.error = 'bad_url'; // not used?
else {
// console.log(err);
console.log("Unknown error, errText:", errText);
log("Unknown error, errText:", errText);
console.log(e.stack);
}
if (result.result.error) console.log(result.result.error);
// console.log("result.result.error:", result.result.error);
// console.log("errText:", errText);
if (result.result.error) log(`${result.result.error}${isRetry ? ', retry' : ''}`);
if (isRetry) throw e; // for retry
}


Expand Down Expand Up @@ -876,7 +891,7 @@ async function scrapSite ({baseUrl, options = {}}) {
};

// console.log("newOptions:", newOptions);
module.exports({baseUrl: baseUrl, options: newOptions});
scrapSite({baseUrl: baseUrl, options: newOptions});
}, seconds * 1000);
} catch (e) {
console.log(e);
Expand Down Expand Up @@ -960,6 +975,7 @@ async function scrapSite ({baseUrl, options = {}}) {
// Handle the crawler's 'maxrequestreached' event: print a notice and record
// that the request limit was hit in the isMaxRequested flag.
crawler.on('maxrequestreached', () => {
// Do nothing if the browser connection is already closed.
if (crawler._browser._connection._closed) return; // catch error after scan
console.log(`\n${color.yellow}Max requests reached${color.reset}`);
// The crawl error handler checks this flag and suppresses errors once it is set.
isMaxRequested = true;
// console.log(`${color.yellow}Please, ignore this error:${color.reset}`);
});

Expand Down

0 comments on commit 273ae86

Please sign in to comment.