-
Notifications
You must be signed in to change notification settings - Fork 43
/
extract_urls.go
108 lines (101 loc) · 2.52 KB
/
extract_urls.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
package core
import (
"github.com/yaklang/yaklang/common/log"
"github.com/yaklang/yaklang/common/rpa/character"
"github.com/yaklang/yaklang/common/utils"
"github.com/yaklang/yaklang/common/yak/yaklib/codec"
"strings"
)
// findHref is the source of a JavaScript arrow function that extractUrls
// evaluates inside the browser page. It walks every node yielded by a
// NodeIterator created over the document root, reads each node's href and src
// properties, and appends every truthy value to an array, which it returns.
// The text between the backticks is sent to the page verbatim.
const findHref = `() => {
let nodes = document.createNodeIterator(document.getRootNode())
let hrefs = [];
let node;
while ((node = nodes.nextNode())) {
let {href, src} = node;
if (href) {
hrefs.push(href)
}
if (src) {
hrefs.push(src)
}
}
return hrefs
}`
// extractUrls evaluates findHref in the page held by page_block and processes
// every non-empty URL string the script returns.
//
// For each URL it:
//   - computes a hash from the URL after m.RemoveParamValue and skips the URL
//     if that hash is already in m.visited, otherwise records the hash;
//   - skips the URL if checkFileSuffixValid or checkHostIsValid rejects it;
//   - marks the URL as danger ("1") only when m.rfmodel is set, the last
//     sub-URL segment is non-empty, and both m.rfmodel.PredictX and
//     m.PredictX return "1" for that segment;
//   - if the URL is not marked danger and page_block.depth is below m.depth,
//     starts a goroutine that calls m.page one level deeper;
//   - otherwise sends the URL on m.channel once per distinct URL (tracked in
//     m.hijacked), and calls m.rootCancel when m.urlCount is non-zero and the
//     number of hijacked URLs has reached it.
//
// It returns an error only when evaluating findHref fails.
func (m *Manager) extractUrls(page_block *PageBlock) error {
	page := page_block.page
	res, err := page.Eval(findHref)
	if err != nil {
		return utils.Errorf("eval failed: %s", err)
	}

	for _, item := range res.Value.Arr() {
		urlStr := item.Str()
		if urlStr == "" {
			continue
		}

		// Remove URL parameter values and calculate the dedup hash.
		hashStr := m.RemoveParamValue(urlStr)
		visitedHash := requestToUniqueHash(hashStr, "GET", "", nil)
		if m.visited.Exist(visitedHash) {
			continue
		}
		m.visited.Insert(visitedHash)

		if !m.checkFileSuffixValid(urlStr) {
			continue
		}
		if !m.checkHostIsValid(urlStr) {
			continue
		}

		ifDanger := "0"
		if m.rfmodel != nil {
			if subString := character.CutLastSubUrl(urlStr); subString != "" {
				// The model's verdict is kept only when the keyword check
				// agrees with it; any disagreement falls back to "0".
				if verdict := m.rfmodel.PredictX(subString); verdict == m.PredictX(subString) {
					ifDanger = verdict
				}
				if ifDanger == "1" {
					log.Infof("danger url: %s : %s", urlStr, subString)
				}
			}
		}

		if page_block.depth < m.depth && ifDanger == "0" {
			// Go one level deeper.
			// NOTE(review): the result of AddWithContext is not checked here and
			// no Done call is visible in this function — presumably m.page
			// releases the wait group slot; confirm.
			m.pageSizedWaitGroup.AddWithContext(m.rootContext)
			go func() {
				// err must be local to this goroutine: assigning to the
				// enclosing function's err would make every spawned goroutine
				// write the same variable concurrently (a data race).
				if err := m.page(urlStr, page_block.depth+1); err != nil && !strings.Contains(err.Error(), "context canceled") {
					log.Errorf("page error: %s", err)
				}
			}()
			continue
		}

		// Either the depth limit is reached or the URL is sensitive and must
		// not be visited: send the URL data to the channel directly.
		urlHash := codec.Sha256(urlStr)
		if m.hijacked.Exist(urlHash) {
			continue
		}
		m.hijacked.Insert(urlHash)

		req := &MakeReq{}
		req.url = urlStr
		m.channel <- req

		if m.urlCount != 0 && m.hijacked.Count() >= int64(m.urlCount) {
			m.rootCancel()
		}
	}
	return nil
}
// PredictX detects sensitive URLs by keyword alone: it returns "1" when s
// contains at least one entry of sensitiveWords and "0" otherwise. It
// complements the random forest model used for sensitive URL detection.
func (m *Manager) PredictX(s string) string {
	verdict := "0"
	for _, keyword := range sensitiveWords {
		if !strings.Contains(s, keyword) {
			continue
		}
		verdict = "1"
		break
	}
	return verdict
}