-
Notifications
You must be signed in to change notification settings - Fork 0
/
index.js
105 lines (86 loc) · 3.36 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
(async () => {
const fs = require('fs')
const { checkDirExistThenCreate, checkFileExistThenCreate } = require('./utils/file')
const { onlyDigitChar,removeDomainFromURL } = require('./utils/string')
const csv = require('./utils/csv')
const htmlAnalyzer = require('./utils/htmlAnalyzer')
const webdriver = require('./utils/webdriver')
// Audit output is grouped per day. toISOString() reports UTC, so the date in
// the directory name is the UTC date and may differ from the local date.
const TODAY_DATE_STRING = new Date().toISOString().slice(0, 10)
// ./urls.txt holds one URL per line. Accept both LF and CRLF line endings and
// drop blank lines (such as the one produced by a trailing newline) so that no
// empty URL is ever fetched or analysed.
const urlInTxt = fs
  .readFileSync('./urls.txt', 'utf8')
  .split(/\r?\n/)
  .map((line) => line.trim())
  .filter((line) => line.length > 0)
const dirName = `./${TODAY_DATE_STRING}_audit`
// Helper from ./utils/file; presumably creates the directory when it is
// missing (implementation not visible here).
checkDirExistThenCreate(dirName)
// 1 -=-=-=-= download url to html
// Each URL's HTML is stored as <dirName>/<onlyDigitChar(url)>.html. URLs whose
// file already exists are skipped, so an interrupted run can be restarted
// without fetching those pages again.
const totalUrls = urlInTxt.length
for (const [index, url] of urlInTxt.entries()) {
  // Build the target path once so the existence check and the write always
  // refer to exactly the same file.
  const htmlPath = `${dirName}/${onlyDigitChar(url)}.html`
  // Same "index/total" progress format as the other log lines in this script.
  const progress = `${index}/${totalUrls}`
  if (fs.existsSync(htmlPath)) {
    console.log(`[DOWNLOAD SKIP] ${progress}, ${url}`)
    continue
  }
  console.log(`[START][${progress}] ${url}`)
  // Pages are fetched one at a time; a rejected fetch aborts the whole run.
  const rawHtml = await webdriver.fetchHtml(url, { isHeadless: false })
  // const rawHtml = await webdriver.fetchSsrHtml(url, {isHeadless: false})
  fs.writeFileSync(htmlPath, rawHtml, 'utf8')
}
// 2 -=-=-=-= analyze downloaded html and write the csv report
/**
 * Count how often each value occurs in `arr` and report the repeated ones.
 *
 * Values are keyed by their string form, so `1` and `'1'` are treated as the
 * same value.
 *
 * @param {Array<*>} arr - Values to inspect (e.g. page titles).
 * @returns {Object<string, number>} Plain object mapping every value that
 *   occurs more than once to its number of occurrences; empty when nothing
 *   repeats.
 */
function findDuplication(arr) {
  // Count in a Map rather than a plain object: on a plain object, values such
  // as "constructor" or "toString" would pick up the inherited member as their
  // initial "count" and their duplicates would never be reported.
  const counts = new Map();
  for (const item of arr) {
    const key = String(item);
    counts.set(key, (counts.get(key) || 0) + 1);
  }

  // Object.fromEntries defines own data properties, so even a "__proto__" key
  // ends up as a regular entry in the result.
  return Object.fromEntries(
    [...counts].filter(([, count]) => count > 1)
  );
}
// One row per audited URL, plus the titles and descriptions collected for the
// duplicate report printed after the loop.
const auditRows = []
const collectedTitles = []
const collectedDescriptions = []
const totalPages = urlInTxt.length

for (const [index, pageUrl] of urlInTxt.entries()) {
  console.log(`[ANALYSIS][${index}/${totalPages}] ${pageUrl}`)

  // Read back the HTML file that the download step stored for this URL.
  const savedHtmlPath = `${dirName}/${onlyDigitChar(pageUrl)}.html`
  const html = fs.readFileSync(savedHtmlPath, 'utf8')

  // Title and description are wrapped in literal double quotes
  // (presumably so embedded commas survive the CSV export; confirm in utils/csv).
  const title = `"${htmlAnalyzer.title(html)}"`
  const description = `"${htmlAnalyzer.description(html)}"`
  const canonical = htmlAnalyzer.canonical(html)
  const h1Fields = htmlAnalyzer.h1(html)
  const h2Fields = htmlAnalyzer.h2(html)
  const h3Fields = htmlAnalyzer.h3(html)

  // Flagged when the URL and its canonical differ once both have been passed
  // through removeDomainFromURL.
  const isRedirect = removeDomainFromURL(pageUrl) !== removeDomainFromURL(canonical)
  const size = htmlAnalyzer.size(html)
  const { countStructureData, structureDataList } = htmlAnalyzer.structureDataAnalyzer(html)

  // Property order is kept as-is; it presumably determines the CSV column order.
  const baseRow = {
    url: pageUrl,
    title,
    description,
    canonical,
    isRedirect,
    ...h1Fields,
    ...h2Fields,
    ...h3Fields,
    size,
    ...htmlAnalyzer.gtag(html),
    ...htmlAnalyzer.structureData(html),
    countStructureData,
  }

  // Fold every structured-data item into the row; later items win on key clashes.
  const row = structureDataList.reduce(
    (merged, item) => ({ ...merged, ...item }),
    baseRow
  )

  auditRows.push(row)
  collectedTitles.push(title)
  collectedDescriptions.push(description)
}

console.log('Duplicate Title: ', findDuplication(collectedTitles))
console.log('Duplicate Description: ', findDuplication(collectedDescriptions))

const normalizedJson = csv.normalizeJsonForCsv(auditRows)
csv.writeJsonToCsv(`${dirName}/final.csv`, normalizedJson, 'utf8')
})()