Skip to content

Commit

Permalink
feat: export to xlsx
Browse files Browse the repository at this point in the history
  • Loading branch information
popstas committed Apr 17, 2020
1 parent bdde095 commit 21406fb
Show file tree
Hide file tree
Showing 6 changed files with 174 additions and 21 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@ node_modules/
data/*
!/**/.gitkeep
*.csv
.~*
*.xlsx
.~*
7 changes: 4 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
Сканирование одного или несколько сайтов в файлы csv.
Сканирование одного или несколько сайтов в файлы csv и xlsx.

## Особенности:
- Обходит весь сайт, собирает ссылки на страницы и документы
Expand Down Expand Up @@ -46,14 +46,15 @@ Usage: sites-scraper -u https://example.com
Options:
-u --urls <urls> Comma separated url list for scan
-p, --preset <preset> Table preset (default: "seo")
-p, --preset <preset> Table preset (minimal, seo, headers, parse) (default: "seo")
-d, --max-depth <depth> Max scan depth (default: 10)
-c, --concurrenty Threads number
-f, --fields <json> JSON with custom fields
--no-skip-static Scan static files
--follow-xml-sitemap Follow sitemap.xml
--max-requests <num> Limit max pages scan (default: 0)
--no-headless Show browser GUI while scan
--encoding <enc> Result csv encoding (default: "win1251")
--no-remove-csv No delete csv after xlsx generate
--out-dir <dir> Output directory (default: ".")
--no-color No console colors
-V, --version output the version number
Expand Down
110 changes: 102 additions & 8 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,6 @@
"dependencies": {
"commander": "^5.0.0",
"headless-chrome-crawler": "^1.8.0",
"iconv-lite": "^0.5.1"
"xlsx": "^0.15.6"
}
}
5 changes: 3 additions & 2 deletions src/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ program
.option('--follow-xml-sitemap', `Follow sitemap.xml`)
.option('--max-requests <num>', `Limit max pages scan`, 0)
.option('--no-headless', `Show browser GUI while scan`)
.option('--encoding <enc>', `Result csv encoding`, 'win1251')
.option('--no-remove-csv', `No delete csv after xlsx generate`)
.option('--out-dir <dir>', `Output directory`, '.')
.option('--no-color', `No console colors`)
.name("sites-scraper")
Expand Down Expand Up @@ -49,7 +49,8 @@ async function start() {
encoding: program.encoding, // для Excel
outDir: program.outDir, // папка, куда сохраняются csv
color: program.color, // раскрашивать консоль
fields: program.fields // дополнительные поля
fields: program.fields, // дополнительные поля
removeCsv: program.removeCsv // удалять csv после генерации xlsx
});
}
}
Expand Down
68 changes: 62 additions & 6 deletions src/scrap-site.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
// see API - https://github.com/yujiosaka/headless-chrome-crawler/blob/master/docs/API.md#event-requeststarted
const fs = require('fs');
const iconv = require('iconv-lite');
const xlsx = require('xlsx');
const HCCrawler = require('headless-chrome-crawler');
const CSVExporter = require('headless-chrome-crawler/exporter/csv');
const url = require('url');
Expand All @@ -26,10 +26,10 @@ const fields_presets = {
minimal: ['response.url'],
seo: [
'response.url',
'result.mixed_content_url',
'result.canonical',
'result.is_canonical',
'previousUrl',
'result.mixed_content_url',
'depth',
'response.status',
'result.request_time',
Expand Down Expand Up @@ -77,6 +77,7 @@ module.exports = async (baseUrl, options = {}) => {
const domain = url.parse(baseUrl).hostname;
const protocol = url.parse(baseUrl).protocol;
const csvPath = `${options.outDir}/${domain}.csv`; // файл вывода
const xlsxPath = `${options.outDir}/${domain}.xlsx`; // файл вывода
let currentUrl = ''; // для хака с документами

if(!options.color) color.white = color.red = color.reset = color.yellow = '';
Expand Down Expand Up @@ -124,6 +125,7 @@ module.exports = async (baseUrl, options = {}) => {
if (options.url.includes('/?catalog_view=')) return false; // bitrix display
if (options.url.includes('/?SORT=')) return false; // bitrix sort
if (options.url.includes('/filter/clear/apply/')) return false; // bitrix filter
// if (options.url.match(/\?(category|age|usage|madein|season|brand)=/)) return false; // bitrix filter

// http scan while first page was https
if(url.parse(options.url).protocol != protocol) return false;
Expand Down Expand Up @@ -254,6 +256,7 @@ module.exports = async (baseUrl, options = {}) => {
links: []
};
}

// The result contains options, links, cookies and etc.
const result = await crawl();

Expand All @@ -264,6 +267,7 @@ module.exports = async (baseUrl, options = {}) => {
return result;
}
};

const crawlerOptions = { ...defaultOptions, ...options };

const start = Date.now();
Expand Down Expand Up @@ -292,12 +296,64 @@ module.exports = async (baseUrl, options = {}) => {

const t = Math.round((Date.now() - start) / 1000);
const perPage = Math.round((t / requestedCount) * 100) / 100;
console.log(`${color.yellow}Saved to ${csvPath}${color.reset}`);
console.log(`Finish: ${t} sec (${perPage} per page)`);
await crawler.close();

if (crawlerOptions.encoding.toLowerCase() != 'utf-8') {
// Convert the crawler's csv output into an xlsx workbook with sane column
// widths and an autofilter, then optionally delete the source csv.
// Uses file-scope: fs, xlsx, csvPath, xlsxPath, options.
const saveAsXlsx = () => {
  // Cap (in characters) for known long columns; other columns simply
  // grow to their longest cell.
  const maxWidths = {
    url: 60,
    h1: 100,
    title: 100,
    description: 100,
    keywords: 100,
    og_title: 100,
  };

  // Read the csv back into a workbook (csv was written as utf-8).
  const csvRaw = fs.readFileSync(csvPath, 'utf-8');
  const wb = xlsx.read(csvRaw, { type: 'string' });
  const ws = wb.Sheets[wb.SheetNames[0]];

  const range = xlsx.utils.decode_range(ws['!ref']);
  const colWidths = []; // widest content length seen per column (number)
  const colCaps = {};   // per-column width cap taken from maxWidths

  for (let r = 0; r <= range.e.r; r++) {
    for (let c = 0; c <= range.e.c; c++) {
      const addr = xlsx.utils.encode_cell({ r: r, c: c });
      const cell = ws[addr];
      if (!cell) continue;

      // Header row: strip exporter prefixes ("result.", "response.") for
      // readability, and record the width cap for recognized columns.
      // BUGFIX: the original stored the header *name* into the same array
      // used for numeric widths, so Math.max(name, length) produced NaN
      // widths for every capped column.
      if (r === 0) {
        const val = String(cell.v).replace('result.', '').replace('response.', '');
        cell.v = val;
        if (maxWidths[val] !== undefined) colCaps[c] = maxWidths[val];
      }

      // Track the widest cell. BUGFIX: String(v).length measures numeric
      // cells too; the original Object.values(v).length is 0 for numbers.
      const length = String(cell.v).length;
      colWidths[c] = Math.max(colWidths[c] || 0, length);
      if (colCaps[c]) colWidths[c] = Math.min(colCaps[c], colWidths[c]);
    }
  }
  ws['!cols'] = colWidths.map(width => ({ width: width }));

  // Let Excel filter/sort over the whole data range.
  ws['!autofilter'] = { ref: ws['!ref'] };

  xlsx.writeFile(wb, xlsxPath);
  if (options.removeCsv) {
    fs.unlinkSync(csvPath); // csv was only an intermediate artifact
  }
};

saveAsXlsx();

console.log(`${color.yellow}Saved to ${xlsxPath}${color.reset}`);
console.log(`Finish: ${t} sec (${perPage} per page)`);
};

0 comments on commit 21406fb

Please sign in to comment.