feat: --url-list, able to scan list of urls
popstas committed Dec 4, 2020
1 parent fda9d6e commit d0985d8
Showing 3 changed files with 47 additions and 3 deletions.
2 changes: 2 additions & 0 deletions README.md
@@ -84,6 +84,7 @@ Options:
- Does not load images, css, js (configurable)
- Some URLs are ignored ([`preRequest` in `src/scrap-site.js`](src/scrap-site.js#L98))
- Analyse each page with Lighthouse (see below)
- Scan a list of URLs, `--url-list`

### XLSX features
- The first row and the first column are fixed
@@ -222,6 +223,7 @@ You can copy [.site-audit-seo.conf.js](.site-audit-seo.conf.js) to your home dir
- Does not load images, css, js (configurable)
- Some URLs are ignored ([`preRequest` in `src/scrap-site.js`](src/scrap-site.js#L112))
- Each page can be analysed with Lighthouse (see below)
- Scan an arbitrary list of URLs, `--url-list`

### XLSX features:
- The first row and the first column are fixed
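The new `--url-list` bullets above correspond to the `urlList` option this commit threads through to `src/scrap-site.js`. A minimal programmatic sketch, assuming the module is called directly with the option names visible in this diff (the list URL and `outDir` value are placeholders, and `maxDepth` is assumed to be accepted the same way as the other options):

```js
const scrapSite = require('./src/scrap-site');

// Scan the URLs listed on a page, mirroring what --url-list sets up in
// src/index.js: depth 1, no domain limit (option names taken from this diff).
scrapSite('https://example.com/urls.html', {
  urlList: true,      // treat the page as a list of URLs to scan
  maxDepth: 1,        // assumption: passed through like the options shown below
  limitDomain: false,
  outDir: './out',    // placeholder: where the CSV/XLSX reports are written
}).catch(console.error);
```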
10 changes: 10 additions & 0 deletions src/index.js
@@ -86,6 +86,8 @@ program.option('-u --urls <urls>', 'Comma separated url list for scan', list).
getConfigVal('followXmlSitemap', false)).
option('--ignore-robots-txt', `Ignore disallowed in robots.txt`,
getConfigVal('ignoreRobotsTxt', false)).
option('--url-list', `Treat the URL as a page containing a list of URLs to scan; implies -d 1 --no-limit-domain --ignore-robots-txt`,
getConfigVal('urlList', false)).
option('-m, --max-requests <num>', `Limit max pages scan`,
getConfigVal('maxRequests', 0)).
option('--no-headless', `Show browser GUI while scan`,
@@ -209,6 +211,13 @@ async function start() {
os.cpus().length);
}

if (program.urlList) {
program.maxDepth = 1;
program.limitDomain = false;
program.ignoreRobotsTxt = true;
// program.defaultFilter = 'depth>1';
}

program.outDir = expandHomedir(program.outDir);
createDirIfNotExists(program.outDir);

@@ -226,6 +235,7 @@ async function start() {
skipStatic: program.skipStatic, // do not let the browser download static assets (images, css, js)
followSitemapXml: program.followXmlSitemap, // to discover more pages
limitDomain: program.limitDomain, // restrict the scan to the start domain
urlList: program.urlList, // flag: the passed page contains a list of URLs
maxRequest: program.maxRequests, // for tests
headless: program.headless, // when disabled, shows the browser window on desktop
docsExtensions: program.docsExtensions, // extensions that will be added to the table
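The flag handling added above can also be read as a small pure function; a sketch of the same assignments, assuming the option names used in this diff (`applyUrlListDefaults` is a hypothetical helper, not part of the project):

```js
// What --url-list implies for the parsed CLI options (same assignments as the
// added block in start()).
function applyUrlListDefaults(program) {
  if (!program.urlList) return program;
  return {
    ...program,
    maxDepth: 1,           // only scan the listed pages, do not follow links
    limitDomain: false,    // the listed URLs may live on several domains
    ignoreRobotsTxt: true, // the list is explicit, so robots.txt is skipped
  };
}

// Example: applyUrlListDefaults({urlList: true, maxDepth: 10, limitDomain: true})
// -> {urlList: true, maxDepth: 1, limitDomain: false, ignoreRobotsTxt: true}
```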
38 changes: 35 additions & 3 deletions src/scrap-site.js
@@ -3,6 +3,7 @@ const fs = require('fs');
const path = require('path');
const {saveAsXlsx, saveAsJson, uploadJson, publishGoogleDrive, startViewer} = require(
'./actions');
const axios = require('axios');
const HCCrawler = require('@popstas/headless-chrome-crawler');
const CSVExporter = require('@popstas/headless-chrome-crawler/exporter/csv');
const url = require('url');
@@ -29,6 +30,29 @@ module.exports = async (baseUrl, options = {}) => {
const domain = url.parse(baseUrl).hostname;
const protocol = url.parse(baseUrl).protocol;

let urls = [];
if (options.urlList) {
const regex = /(?:(?:https?|ftp|file):\/\/|www\.|ftp\.)(?:\([-A-Z0-9+&#\/%=~_|$?!:,.]*\)|[-A-Z0-9+&#\/%=~_|$?!:,.])*(?:\([-A-Z0-9+&#\/%=~_|$?!:,.]*\)|[A-Z0-9+&#\/%=~_|$])/ig;

// the URL list can be a local file or a page fetched over HTTP
let content;
if (fs.existsSync(baseUrl)) {
content = fs.readFileSync(baseUrl, 'utf8');
} else {
const res = await axios.get(baseUrl);
content = res.data;
}

// collect every URL found in the content, skipping obvious static assets
let pageUrl;
while ((pageUrl = regex.exec(content)) !== null) {
if (pageUrl[0].match(/\.(png|jpg|js|css)$/)) continue;
urls.push(pageUrl[0]);
}

// keep each URL only once
const onlyUnique = (value, index, self) => self.indexOf(value) === index;
urls = urls.filter(onlyUnique);

// console.log('urls: ', urls);
}

const baseName = options.outName || domain;
const csvPath = path.normalize(`${options.outDir}/${baseName}.csv`);
const xlsxPath = path.normalize(`${options.outDir}/${baseName}.xlsx`);
@@ -433,14 +457,22 @@ module.exports = async (baseUrl, options = {}) => {
console.error(`${color.yellow}Disallowed in robots.txt: ${decodeURI(
options.url)}${color.reset}`);
});
crawler.on('maxdepthreached', opts => {
if (options.maxDepth > 1) console.log(`${color.yellow}Max depth reached${color.reset}`);
});
crawler.on('maxrequestreached', options => {
console.log(
`\n\n${color.yellow}Max requests reached\nPlease, ignore this error:${color.reset}`);
});
if (options.urlList) {
for (let url of urls) {
await crawler.queue(url);
}
} else {
await crawler.queue(baseUrl);
}

await crawler.onIdle();
await crawler.close();

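To see the new extraction step in isolation: a minimal sketch of the same axios-plus-regex approach used above, with a simplified pattern (`extractUrls` is a hypothetical helper, not part of the project):

```js
const axios = require('axios');

// Pull every absolute URL out of a page, skip obvious static assets,
// and de-duplicate — the same idea as the block added to scrap-site.js.
async function extractUrls(listUrl) {
  const regex = /(?:https?|ftp|file):\/\/[-A-Z0-9+&#\/%=~_|$?!:,.]*[A-Z0-9+&#\/%=~_|$]/ig;
  const res = await axios.get(listUrl);
  const found = [];
  let match;
  while ((match = regex.exec(res.data)) !== null) {
    if (match[0].match(/\.(png|jpg|js|css)$/)) continue; // skip images, scripts, styles
    found.push(match[0]);
  }
  return [...new Set(found)]; // each URL once, like the onlyUnique filter above
}

// Usage (hypothetical list page); each result would then be passed to crawler.queue():
// extractUrls('https://example.com/urls.html').then(urls => console.log(urls));
```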
