Skip to content

Commit

Permalink
feat: move sites list to data/sites.conf
Browse files Browse the repository at this point in the history
  • Loading branch information
popstas committed Mar 11, 2020
1 parent e6b0377 commit 68f16ca
Show file tree
Hide file tree
Showing 4 changed files with 26 additions and 6 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ npm start
```

## Использование:
1. Открыть `src/index.js`
2. Вписать нужные сайты
1. Открыть `data/sites.conf` (расширение выбрано исключительно ради подсветки и закомменчивания в vscode)
2. Вписать нужные сайты, по одному на строку (можно комментить ненужные сейчас через `#`, `//` или `;`)
3. Запустить: `npm start`

## Как посчитать контент по csv
Expand Down
Empty file removed data/.gitkeep
Empty file.
2 changes: 2 additions & 0 deletions data/sites.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
https://example.com
# https://www.example.com
26 changes: 22 additions & 4 deletions src/index.js
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
const fs = require('fs');
const scrap_site = require('./scrap-site');

const sites = [
'http://example.com/'
];

async function start() {
const sites = parseSitesFile('./data/sites.conf');

for (site of sites) {
await scrap_site(site, {
fields_preset: 'seo', // варианты: default, seo, headers, minimal
Expand All @@ -17,4 +16,23 @@ async function start() {
}
}

/**
 * Read a sites list file and return the URLs it contains.
 *
 * The file is expected to hold one site per line. Lines whose first
 * non-whitespace character is `#`, `/` or `;` are treated as comments and
 * skipped. For every other line the first URL-looking substring is taken;
 * lines without a URL are ignored. URLs that point to static assets
 * (`.png`, `.jpg`, `.js`, `.css`, in any letter case) are dropped.
 *
 * @param {string} file - Path to the sites list file.
 * @returns {string[]} URLs in file order; an empty array when the file does
 *   not exist (an error is also printed to stderr in that case).
 */
function parseSitesFile(file) {
  // No `g` flag: only the first URL on each line is used, so a plain match is enough.
  const urlRegex = /(?:(?:https?|ftp|file):\/\/|www\.|ftp\.)(?:\([-A-Z0-9+&#\/%=~_|$?!:,.]*\)|[-A-Z0-9+&#\/%=~_|$?!:,.])*(?:\([-A-Z0-9+&#\/%=~_|$?!:,.]*\)|[A-Z0-9+&#\/%=~_|$])/i;
  const commentRegex = /^\s*[#\/;]+/;
  const skippedExtensions = ['.png', '.jpg', '.js', '.css'];

  if (!fs.existsSync(file)) {
    console.error(`${file} not found, please create sites list file!`);
    return [];
  }

  const urls = [];
  // Accept both LF and CRLF line endings.
  const lines = fs.readFileSync(file, 'utf8').split(/\r?\n/);

  for (const line of lines) {
    if (commentRegex.test(line)) continue; // commented line

    const found = line.match(urlRegex);
    if (!found) continue;

    const url = found[0];
    // Compare extensions case-insensitively so that e.g. `.PNG` is filtered too.
    const lowered = url.toLowerCase();
    if (skippedExtensions.some((ext) => lowered.endsWith(ext))) continue;

    urls.push(url);
  }

  return urls;
}

start();

0 comments on commit 68f16ca

Please sign in to comment.