feat: style and validation of xlsx
popstas committed Apr 18, 2020
1 parent 31d0271 commit eb99928
Showing 4 changed files with 221 additions and 17 deletions.
4 changes: 3 additions & 1 deletion README.md
@@ -1,5 +1,7 @@
Scans one or several sites into csv and xlsx files.

The XLSX file is formatted for convenient viewing; some errors found during scanning are highlighted.

## Features:
- Crawls the whole site, collects links to pages and documents
- Exports the url field and the depth at which the url was found
@@ -81,7 +83,7 @@ sites-scraper -d 1 -u https://example -f '{ "title": "$(`title`).text()" }'
Sometimes duplicate pages get written to the csv. This happens in the following cases:
1. A redirect from another page to this one (solved by setting `skipRequestedRedirect: true`, already done; see the sketch below).
2. The same page being requested simultaneously in parallel threads.
3. Occasionally a number appears instead of the URL; this happens during the csv-to-xlsx conversion.
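
A minimal sketch of where that option plugs in, assuming the crawler is launched directly through headless-chrome-crawler (the project's actual launch options may differ):

```js
// Sketch only: skipRequestedRedirect is the option referenced above;
// the other options and the URL are illustrative.
const HCCrawler = require('headless-chrome-crawler');

(async () => {
  const crawler = await HCCrawler.launch({
    maxDepth: 1,
    skipRequestedRedirect: true, // do not re-queue pages reached through a redirect
    onSuccess: result => console.log(result.response.url),
  });
  await crawler.queue('https://example.com');
  await crawler.onIdle();
  await crawler.close();
})();
```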

## TODO:
- Do not count ?page= pages, but still crawl them
- Unique links
102 changes: 102 additions & 0 deletions package-lock.json

Some generated files are not rendered by default.

7 changes: 6 additions & 1 deletion package.json
@@ -3,6 +3,10 @@
"version": "2.1.0",
"description": "Generate CSV with pages and docs for site list with puppeteer",
"main": "src/index.js",
"repository": {
"type": "git",
"url": "https://github.com/viasite/sites-scraper.git"
},
"scripts": {
"start": "node src/index.js",
"version": "npm run changelog && git add CHANGELOG.md",
@@ -19,6 +23,7 @@
"dependencies": {
"commander": "^5.0.0",
"headless-chrome-crawler": "^1.8.0",
"xlsx": "^0.15.6"
"xlsx": "^0.15.6",
"xlsx-style": "github:protobi/js-xlsx"
}
}
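
The new `xlsx-style` dependency (the protobi fork of js-xlsx) keeps the SheetJS workbook shape but also writes a cell's `.s` style property. A minimal sketch, assuming the fork's `writeFile` behaves the way this commit calls it:

```js
// Sketch: one styled cell written through xlsx-style; the file name is illustrative.
const xlsxStyle = require('xlsx-style');

const ws = {
  '!ref': 'A1',
  A1: {
    t: 's',                                      // string cell
    v: 'mixed content found',                    // value
    s: { font: { color: { rgb: 'FFFF0000' } } }, // red font, same as the error preset below
  },
};
const wb = { SheetNames: ['report'], Sheets: { report: ws } };
xlsxStyle.writeFile(wb, 'styled-example.xlsx');
```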
125 changes: 110 additions & 15 deletions src/scrap-site.js
@@ -1,6 +1,7 @@
// see API - https://github.com/yujiosaka/headless-chrome-crawler/blob/master/docs/API.md#event-requeststarted
const fs = require('fs');
const xlsx = require('xlsx');
const xlsx = require('xlsx-style');
const xlsxOrig = require('xlsx');
const HCCrawler = require('headless-chrome-crawler');
const CSVExporter = require('headless-chrome-crawler/exporter/csv');
const url = require('url');
@@ -73,6 +74,36 @@ const fields_presets = {
]
};

// validation functions for fields for xlsx
const fields_validate = {
mixed_content: {
error: (v) => !!v
},
is_canonical: {
error: (v) => v == 0
},
request_time: {
warning: (v) => v > 500,
error: (v) => v > 1000
},
status: {
error: (v) => v != 200
},
description: {
warning: (v) => v.length > 256
},
h1_count: {
error: (v) => v > 1
},
dom_size: {
warning: (v) => v > 1500,
error: (v) => v > 3000
},
html_size: {
warning: (v) => v > 1000000
}
}

module.exports = async (baseUrl, options = {}) => {
const domain = url.parse(baseUrl).hostname;
const protocol = url.parse(baseUrl).protocol;
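
The `fields_validate` map above pairs each column with `warning` and `error` predicates; the loop further down applies the warning style first and lets the error style override it. A hypothetical helper (not part of the commit) showing the same precedence on single values:

```js
// classify() is illustrative; the commit applies styles inline instead.
function classify(colName, value) {
  const rules = fields_validate[colName];
  if (!rules) return null;
  if (rules.error && rules.error(value)) return 'error';     // error wins over warning
  if (rules.warning && rules.warning(value)) return 'warning';
  return null;
}

// classify('request_time', 1200) === 'error'   (> 1000)
// classify('request_time', 700)  === 'warning' (> 500)
// classify('status', 200)        === null
```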
@@ -301,51 +332,115 @@ module.exports = async (baseUrl, options = {}) => {

const saveAsXlsx = () => {
// limit max column width
const widths = {
const colWidths = {
url: 60,
h1: 100,
title: 100,
description: 100,
keywords: 100,
keywords: 60,
og_title: 100,
}

// styles presets for validation
const styles = {
warning: {
font: {
color: { rgb: "FFA09600" }
}
},
error: {
font: {
color: { rgb: "FFFF0000" }
}
}
}

// styles presets for columns
const colStyles = {
title: {
alignment: {
horizontal: 'right'
}
},
description: {
alignment: {
wrapText: true,
indent: true
}
},
keywords: {
alignment: {
wrapText: true,
indent: true
}
}
}

const colNames = {};

// read csv to workbook
const csvRaw = fs.readFileSync(csvPath, 'utf-8');
const wb = xlsx.read(csvRaw, {type: 'string'});
// xlsx-style cannot read csv
const wb = xlsxOrig.read(csvRaw, {type: 'string'});
const ws = wb.Sheets[wb.SheetNames[0]];

const range = xlsx.utils.decode_range(ws['!ref']);
const cols = [];
const colsFixed = {};

// iterate rows
for(let r = 0; r <= range.e.r; r++){

// iterate cols
for(let c = 0; c <= range.e.c; c++) {
const addr = xlsx.utils.encode_cell({r:r, c:c});
if(!ws[addr]) continue;
const colVal = ws[addr].v

// header
if(r == 0) {
const val = ws[addr].v.replace('result.', '').replace('response.', '');
ws[addr].v = val
if(val) {
cols[c] = val;
colsFixed[c] = widths[val];
const colName = colVal.replace('result.', '').replace('response.', '');
ws[addr].v = colName
if(colName) {
cols[c] = colName.length;
colNames[c] = colName;
}
}

// columns width
const length = Object.values(ws[addr].v).length;
const length = Object.values(colVal).length;
if(!cols[c]) cols[c] = length;
else cols[c] = Math.max(cols[c], length);
if(colsFixed[c]) cols[c] = Math.min(colsFixed[c], cols[c]);

// not applicable to first row
if(r == 0) continue;

const colName = colNames[c];

// limit width
if(colWidths[colName]) cols[c] = Math.min(colWidths[colName], cols[c]);

// cell style
if(colStyles[colName]) ws[addr].s = colStyles[colName];

// url
// if(colName == 'url') ws[addr].l = colVal;

// validation
if(r > 0){
if(fields_validate[colName]){
if(fields_validate[colName].warning && fields_validate[colName].warning(colVal)) ws[addr].s = styles.warning;
if(fields_validate[colName].error && fields_validate[colName].error(colVal)) ws[addr].s = styles.error;
}
}
}
}
const colsObj = cols.map(length => { return {width: length} });
const colsObj = cols.map(length => { return {wch: length} });
ws['!cols'] = colsObj;

ws['!autofilter'] = { ref: ws['!ref'] };
// freeze first row and first column
ws['!freeze'] = { xSplit: "1", ySplit: "1", topLeftCell: "B2", activePane: "bottomRight", state: "frozen" };

// works only with official sheetjs (without styles) and only in MS Office
// ws['!autofilter'] = { ref: ws['!ref'] };

xlsx.writeFile(wb, xlsxPath);
if(options.removeCsv) {
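
Taken together, `saveAsXlsx` reads the CSV with the stock `xlsx` package (the comment in the diff notes that xlsx-style cannot parse csv), decorates cells with widths, column styles and validation colors, then writes the workbook through `xlsx-style`. A condensed sketch of that flow, with illustrative paths and a single hard-coded cell in place of the loop:

```js
// Condensed sketch of the save flow introduced in this commit (paths are illustrative).
const fs = require('fs');
const xlsx = require('xlsx-style'); // writer that honors cell styles
const xlsxOrig = require('xlsx');   // stock SheetJS, used only to parse the csv

const csvRaw = fs.readFileSync('report.csv', 'utf-8');
const wb = xlsxOrig.read(csvRaw, { type: 'string' }); // xlsx-style cannot read csv
const ws = wb.Sheets[wb.SheetNames[0]];

// mark one cell as an error, the way the validation loop does for failing values
const addr = xlsx.utils.encode_cell({ r: 1, c: 0 });
if (ws[addr]) ws[addr].s = { font: { color: { rgb: 'FFFF0000' } } };

// freeze the header row and first column, then write the styled file
ws['!freeze'] = { xSplit: '1', ySplit: '1', topLeftCell: 'B2', activePane: 'bottomRight', state: 'frozen' };
xlsx.writeFile(wb, 'report.xlsx');
```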
