Skip to content

Commit

Permalink
feat: validation results in console, --no-console-validate
Browse files Browse the repository at this point in the history
  • Loading branch information
popstas committed Apr 20, 2020
1 parent 4e4df1a commit ee63716
Show file tree
Hide file tree
Showing 4 changed files with 74 additions and 35 deletions.
4 changes: 3 additions & 1 deletion src/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ program
.option('--out-dir <dir>', `Output directory`, '.')
.option('--csv <path>', `Skip scan, only convert csv to xlsx`)
.option('--no-color', `No console colors`)
.option('--no-console-validate', `Don't output validate messages in console`)
.name("sites-scraper")
.version(packageJson.version)
.usage("-u https://example.com")
Expand Down Expand Up @@ -71,7 +72,8 @@ async function start() {
outDir: program.outDir, // папка, куда сохраняются csv
color: program.color, // раскрашивать консоль
fields: program.fields, // дополнительные поля
removeCsv: program.removeCsv // удалять csv после генерации xlsx
removeCsv: program.removeCsv, // удалять csv после генерации xlsx
consoleValidate: program.consoleValidate, // выводить данные валидации в консоль
});
}
}
Expand Down
31 changes: 1 addition & 30 deletions src/save-as-xlsx.js
Original file line number Diff line number Diff line change
@@ -1,38 +1,9 @@
const fs = require('fs');
const xlsx = require('@popstas/xlsx-style');
const xlsxOrig = require('xlsx');

const {colsValidate} = require('./validate')
module.exports = (csvPath, xlsxPath) => {
// validation functions for fields for xlsx
const colsValidate = {
mixed_content: {
error: (v) => !!v
},
is_canonical: {
error: (v) => v == 0
},
request_time: {
warning: (v) => v > 500,
error: (v) => v > 1000
},
status: {
error: (v) => v != 200
},
description: {
warning: (v) => v.length > 256
},
h1_count: {
warning: (v) => v == 0,
error: (v) => v > 1
},
dom_size: {
warning: (v) => v > 1500,
error: (v) => v > 3000
},
html_size: {
warning: (v) => v > 1000000
}
}

// limit max column width
const colWidths = {
Expand Down
17 changes: 13 additions & 4 deletions src/scrap-site.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ const saveAsXlsx = require('./save-as-xlsx');
const HCCrawler = require('headless-chrome-crawler');
const CSVExporter = require('headless-chrome-crawler/exporter/csv');
const url = require('url');
const {validateResults} = require('./validate');

const DEBUG = true; // disable if a console.log for every request is not needed (progress will then not be visible)

Expand Down Expand Up @@ -192,6 +193,17 @@ module.exports = async (baseUrl, options = {}) => {

onSuccess: result => {
if (!result.result) return;

// console validate output
const msgs = [];
const validate = validateResults(result, fields); // TODO: fields declared implicitly
for(let name in validate) {
const res = validate[name];
const msgColor = { warning: color.yellow, error: color.red }[res.type];
msgs.push(`${name}: ${msgColor}${res.msg}${color.reset}`);
}
if(msgs.length > 0) console.log(msgs.join(', '));

if (result.result.error) console.error(`${color.red}Error collect page data: result.result.error${color.reset}`);
// console.log(`html_size: ${result.result.html_size}`);
},
Expand Down Expand Up @@ -259,11 +271,8 @@ module.exports = async (baseUrl, options = {}) => {
// The result contains options, links, cookies and etc.
const result = await crawl();

if(result.response.status != 200) {
console.error(`${color.red}Code: ${result.response.status}${color.reset}`);
}

result.result.mixed_content_url = mixedContentUrl;

// You can access the page object after requests
result.content = await page.content();
// You need to extend and return the crawled result
Expand Down
57 changes: 57 additions & 0 deletions src/validate.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
const {at} = require('lodash');

/**
 * Validation rules keyed by column name.
 *
 * Each rule may define:
 *  - `warning(v)` / `error(v)`: predicates that return true when the value
 *    should be flagged at that level;
 *  - `warningMsg(v)` / `errorMsg(v)`: optional formatters for the reported
 *    message; when absent the raw value is reported.
 *
 * Several predicates use loose comparison (`!=`, `==`) on purpose so that
 * numeric strings behave like numbers.
 */
const colsValidate = {
  mixed_content: {
    error: (v) => !!v,
  },
  is_canonical: {
    // strict checks: only a real zero (number or string) is an error
    error: (v) => v === 0 || v === '0',
  },
  request_time: {
    warning: (v) => v > 500,
    error: (v) => v > 1000,
  },
  status: {
    error: (v) => v != 200,
  },
  description: {
    // A missing description arrives as undefined/null; treat it as
    // "not too long" instead of throwing on `.length`.
    warning: (v) => v != null && v.length > 256,
    // Report the length rather than the whole text; 0 for a missing value.
    warningMsg: (v) => (v == null ? 0 : v.length),
  },
  h1_count: {
    warning: (v) => v == 0,
    error: (v) => v > 1,
  },
  dom_size: {
    warning: (v) => v > 1500,
    error: (v) => v > 3000,
  },
  html_size: {
    warning: (v) => v > 1000000,
  },
};

exports.colsValidate = colsValidate;

exports.validateResults = (results, fields) => {
const validate = {};
for(let fName of fields) {
// get value
const colVal = at(results, fName)[0];
const colName = fName.replace('result.', '').replace('response.', '');
let msg;

// validate
if(colsValidate[colName]){
if(colsValidate[colName].warning && colsValidate[colName].warning(colVal)){
msg = colsValidate[colName].warningMsg ? colsValidate[colName].warningMsg(colVal) : colVal;
validate[colName] = { type: 'warning', msg: msg };
}
if(colsValidate[colName].error && colsValidate[colName].error(colVal)){
msg = colsValidate[colName].errorMsg ? colsValidate[colName].errorMsg(colVal) : colVal;
validate[colName] = { type: 'error', msg: msg };
}
}
}
return validate;
};

0 comments on commit ee63716

Please sign in to comment.