Skip to content

Commit

Permalink
feat: now default output dir is ~/site-audit-seo/, not current dir
Browse files Browse the repository at this point in the history
  • Loading branch information
popstas committed Aug 22, 2020
1 parent 4d4930e commit e38af7d
Show file tree
Hide file tree
Showing 6 changed files with 47 additions and 20 deletions.
2 changes: 1 addition & 1 deletion .site-audit-seo.conf.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ module.exports = {
// headless: true,
// removeCsv: true,
// removeJson: true,
// outDir: '~/site-audit-seo-reports/.',
// outDir: '~/site-audit-seo/.',
// gdrive: false,
// json: false,
// xlsx: false,
Expand Down
17 changes: 9 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,10 +72,10 @@ Options:
- Validation summary after scan
- Documents with the extensions `doc`, `docx`, `xls`, `xlsx`, `ppt`, `pptx`, `pdf`, `rar`, `zip` are added to the list with a depth == 0
- Search pages with SSL mixed content
- Each site is saved to a file with a domain name
- Each site is saved to a file with a domain name in `~/site-audit-seo/`
- Does not follow links outside the scanned domain (configurable)
- Does not load images, css, js (configurable)
- Some URLs are ignored ([`preRequest` in `src/scrap-site.js`](src/scrap-site.js#L112))
- Some URLs are ignored ([`preRequest` in `src/scrap-site.js`](src/scrap-site.js#L98))
- Analyse each page with Lighthouse (see below)

### XLSX features
Expand Down Expand Up @@ -139,6 +139,7 @@ Options:
- lighthouse.interactive
- lighthouse.total-blocking-time
- lighthouse.cumulative-layout-shift
- and 150 more lighthouse tests!


## Custom fields
Expand Down Expand Up @@ -174,9 +175,9 @@ site-audit-seo -u https://example.com --lighthouse
You can copy [.site-audit-seo.conf.js](.site-audit-seo.conf.js) to your home directory and tune options.

## Bugs
1. Sometimes it writes identical pages to csv. This happens in 2 cases:
1.1. Redirect from another page to this (solved by setting `skipRequestedRedirect: true`, hardcoded).
1.2. Simultaneous request of the same page in parallel threads.
1. Sometimes it writes identical pages to csv. This happens in 2 cases:
1.1. Redirect from another page to this (solved by setting `skipRequestedRedirect: true`, hardcoded).
1.2. Simultaneous request of the same page in parallel threads.
2. Sometimes a number appears instead of the URL, it occurs at the stage of converting csv to xlsx, don't know why.


Expand Down Expand Up @@ -269,9 +270,9 @@ site-audit-seo -u https://example.com --lighthouse


## Баги
1. Иногда пишет в csv одинаковые страницы. Это бывает в 2 случаях:
1.1. Редирект с другой страницы на эту (решается установкой `skipRequestedRedirect: true`, сделано).
1.2. Одновременный запрос одной и той же страницы в параллельных потоках.
1. Иногда пишет в csv одинаковые страницы. Это бывает в 2 случаях:
1.1. Редирект с другой страницы на эту (решается установкой `skipRequestedRedirect: true`, сделано).
1.2. Одновременный запрос одной и той же страницы в параллельных потоках.
2. Иногда вместо URL появляется цифра, происходит на этапе конвертации csv в xlsx, не знаю почему.


Expand Down
5 changes: 5 additions & 0 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
"chrome-launcher": "^0.13.4",
"commander": "^5.0.0",
"csvtojson": "^2.0.10",
"expand-home-dir": "0.0.3",
"express": "^4.17.1",
"googleapis": "^59.0.0",
"lighthouse": "^6.2.0",
Expand Down
12 changes: 8 additions & 4 deletions src/index.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#!/usr/bin/env node
const fs = require('fs');
const path = require('path');
const process = require('process');
const {program} = require('commander');
const packageJson = require('../package.json');
Expand All @@ -9,6 +10,7 @@ const {saveAsXlsx, saveAsJson, uploadJson, publishGoogleSheets, startViewer} = r
'./actions');
const {exec} = require('child_process');
const os = require('os');
const expandHomedir = require('expand-home-dir');
const color = require('./color');

const defaultDocs = [
Expand Down Expand Up @@ -93,7 +95,7 @@ program.option('-u --urls <urls>', 'Comma separated url list for scan', list).
getConfigVal('removeJson', true)).
option('--no-remove-csv', `No delete csv after xlsx generate`).
option('--no-remove-json', `No delete json after serve`).
option('--out-dir <dir>', `Output directory`, getConfigVal('outDir', '.')).
option('--out-dir <dir>', `Output directory`, getConfigVal('outDir', '~/site-audit-seo/')).
option('--csv <path>', `Skip scan, only convert csv to xlsx`).
option('--xlsx', `Save as XLSX`, getConfigVal('xlsx', false)).
option('--gdrive', `Publish sheet to google docs`,
Expand Down Expand Up @@ -146,9 +148,9 @@ async function start() {

if (program.csv) {
program.removeCsv = false;
const csvPath = program.csv;
const xlsxPath = csvPath.replace(/\.csv$/, '.xlsx');
let jsonPath = csvPath.replace(/\.csv$/, '.json');
const csvPath = expandHomedir(program.csv);
const xlsxPath = path.normalize(csvPath.replace(/\.csv$/, '.xlsx'));
let jsonPath = path.normalize(csvPath.replace(/\.csv$/, '.json'));
let webPath;
try {
if (program.xlsx) {
Expand Down Expand Up @@ -199,6 +201,8 @@ async function start() {
// c = 2, when lighthouse c = 1
if (!program.concurrency) program.concurrency = program.lighthouse ? 1 : 2;

program.outDir = expandHomedir(program.outDir);

outBrief(program);

for (let site of sites) {
Expand Down
30 changes: 23 additions & 7 deletions src/scrap-site.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
// see API - https://github.com/yujiosaka/headless-chrome-crawler/blob/master/docs/API.md#event-requeststarted
const fs = require('fs');
const path = require('path');
const {saveAsXlsx, saveAsJson, uploadJson, publishGoogleSheets, startViewer} = require(
'./actions');
const HCCrawler = require('@popstas/headless-chrome-crawler');
Expand All @@ -9,26 +10,29 @@ const {validateResults, getValidationSum} = require('./validate');
const {exec} = require('child_process');
const lighthouse = require('lighthouse');
const chromeLauncher = require('chrome-launcher');
// поля описаны в API по ссылке выше
const fieldsPresets = require('./presets/scraperFields');
const color = require('./color');

const DEBUG = true; // выключить, если не нужны console.log на каждый запрос (не будет видно прогресс)

const color = require('./color');

// запреты браузеру на подгрузку статики, ускоряет
let SKIP_IMAGES = true;
let SKIP_CSS = true;
let SKIP_JS = true;
const finishTries = 5;

// поля описаны в API по ссылке выше
const fieldsPresets = require('./presets/scraperFields');
// кол-во попыток выполнить actions
const finishTries = 5;

module.exports = async (baseUrl, options = {}) => {
createDirIfNotExists(options.outDir);
const domain = url.parse(baseUrl).hostname;
const protocol = url.parse(baseUrl).protocol;
const csvPath = `${options.outDir}/${domain}.csv`;
const xlsxPath = `${options.outDir}/${domain}.xlsx`;
const jsonPath = `${options.outDir}/${domain}.json`;

const csvPath = path.normalize(`${options.outDir}/${domain}.csv`);
const xlsxPath = path.normalize(`${options.outDir}/${domain}.xlsx`);
const jsonPath = path.normalize(`${options.outDir}/${domain}.json`);
let webPath;

if (!options.color) color.white = color.red = color.reset = color.yellow = '';
Expand Down Expand Up @@ -502,3 +506,15 @@ module.exports = async (baseUrl, options = {}) => {

await tryFinish(finishTries);
};

/**
 * Ensure the output directory exists, creating it (with parents) when missing.
 * Parameter renamed from `path` to avoid shadowing the `path` module required
 * at the top of this file.
 * @param {string} dirPath - Target directory path.
 * @returns {string} The same path, once the directory is known to exist.
 * @throws {Error} If a regular file already occupies the path.
 */
function createDirIfNotExists(dirPath) {
  const exists = fs.existsSync(dirPath);
  if (exists && fs.statSync(dirPath).isFile()) {
    // A file blocks the output location; fail loudly instead of writing into it.
    // (The original had an unreachable `return false` after this throw — removed.)
    throw new Error(`File exists, cannot save here: ${dirPath}`);
  }

  if (!exists) fs.mkdirSync(dirPath, { recursive: true });

  return dirPath;
}

0 comments on commit e38af7d

Please sign in to comment.