forked from andeya/pholcus
/
parsejs.go
145 lines (134 loc) · 3.57 KB
/
parsejs.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
package spider
import (
"encoding/xml"
"io/ioutil"
"log"
"path"
"path/filepath"
"github.com/robertkrimen/otto"
"github.com/henrylee2cn/pholcus/config"
"github.com/henrylee2cn/pholcus/logs"
)
// SpiderModle is the XML model of a dynamic spider rule file.
// The Namespace, SubNamespace and Root fields hold the text of the <Script>
// element nested under the element of the same name; Trunk collects every
// <Rule> element.
type SpiderModle struct {
	Name            string      `xml:"Name"`
	Description     string      `xml:"Description"`
	Pausetime       int64       `xml:"Pausetime"`
	EnableLimit     bool        `xml:"EnableLimit"`
	EnableKeyin     bool        `xml:"EnableKeyin"`
	EnableCookie    bool        `xml:"EnableCookie"`
	NotDefaultField bool        `xml:"NotDefaultField"`
	Namespace       string      `xml:"Namespace>Script"`
	SubNamespace    string      `xml:"SubNamespace>Script"`
	Root            string      `xml:"Root>Script"`
	Trunk           []RuleModle `xml:"Rule"`
}

// RuleModle is the XML model of a single <Rule> element.
// Name comes from the element's "name" attribute; ParseFunc and AidFunc hold
// the text of the <Script> element nested under <ParseFunc> and <AidFunc>.
type RuleModle struct {
	Name      string `xml:"name,attr"`
	ParseFunc string `xml:"ParseFunc>Script"`
	AidFunc   string `xml:"AidFunc>Script"`
}
// init converts every model returned by getSpiderModles into a *Spider whose
// callbacks evaluate the model's embedded scripts with a fresh otto VM on
// each call, and then registers that spider.
//
// Script evaluation errors are logged through logs.Log and do not abort the
// callback; the callback still returns whatever value otto produced.
func init() {
	for _, _m := range getSpiderModles() {
		m := _m // per-iteration copy so the closures below never share the loop variable

		sp := &Spider{
			Name:            m.Name,
			Description:     m.Description,
			Pausetime:       m.Pausetime,
			EnableCookie:    m.EnableCookie,
			NotDefaultField: m.NotDefaultField,
			RuleTree:        &RuleTree{Trunk: map[string]*Rule{}},
		}
		if m.EnableLimit {
			sp.Limit = LIMIT
		}
		if m.EnableKeyin {
			sp.Keyin = KEYIN
		}

		// Namespace and SubNamespace are only installed when the model
		// actually carries a script for them.
		if m.Namespace != "" {
			sp.Namespace = func(self *Spider) string {
				vm := otto.New()
				vm.Set("self", self)
				val, err := vm.Eval(m.Namespace)
				if err != nil {
					logs.Log.Error(" * 动态规则 [Namespace]: %v\n", err)
				}
				s, _ := val.ToString()
				return s
			}
		}
		if m.SubNamespace != "" {
			sp.SubNamespace = func(self *Spider, dataCell map[string]interface{}) string {
				vm := otto.New()
				vm.Set("self", self)
				vm.Set("dataCell", dataCell)
				val, err := vm.Eval(m.SubNamespace)
				if err != nil {
					logs.Log.Error(" * 动态规则 [SubNamespace]: %v\n", err)
				}
				s, _ := val.ToString()
				return s
			}
		}

		// The root callback is always installed; the script sees the
		// context as the global "ctx".
		sp.RuleTree.Root = func(ctx *Context) {
			vm := otto.New()
			vm.Set("ctx", ctx)
			if _, err := vm.Eval(m.Root); err != nil {
				logs.Log.Error(" * 动态规则 [Root]: %v\n", err)
			}
		}

		for _, rule := range m.Trunk {
			// Copy the scripts into per-iteration locals so each closure
			// keeps the script text of its own rule.
			parseScript := rule.ParseFunc
			// Fix: the aid callback must run the rule's AidFunc script; it
			// was previously wired to the ParseFunc script.
			aidScript := rule.AidFunc

			r := new(Rule)
			r.ParseFunc = func(ctx *Context) {
				vm := otto.New()
				vm.Set("ctx", ctx)
				if _, err := vm.Eval(parseScript); err != nil {
					logs.Log.Error(" * 动态规则 [ParseFunc]: %v\n", err)
				}
			}
			r.AidFunc = func(ctx *Context, aid map[string]interface{}) interface{} {
				vm := otto.New()
				vm.Set("ctx", ctx)
				vm.Set("aid", aid)
				val, err := vm.Eval(aidScript)
				if err != nil {
					logs.Log.Error(" * 动态规则 [AidFunc]: %v\n", err)
				}
				return val
			}
			sp.RuleTree.Trunk[rule.Name] = r
		}

		sp.Register()
	}
}
// getSpiderModles loads the dynamic spider rule files: every file matching
// config.SPIDER_DIR/*config.SPIDER_EXT is read and decoded as XML into a
// SpiderModle.
//
// A file that cannot be read or decoded is logged and skipped. A malformed
// glob pattern is logged and yields no models. If a panic occurs while
// loading, it is recovered and logged, and the models collected up to that
// point are returned.
func getSpiderModles() (ms []*SpiderModle) {
	// ms is a named result so that the recover path below still returns the
	// models appended before the panic.
	defer func() {
		if p := recover(); p != nil {
			log.Printf("[E] HTML动态规则解析: %v\n", p)
		}
	}()

	pattern := path.Join(config.SPIDER_DIR, "*"+config.SPIDER_EXT)
	files, err := filepath.Glob(pattern)
	if err != nil {
		// Previously this error was discarded, so a bad pattern silently
		// produced zero spiders.
		log.Printf("[E] HTML动态规则[%s]: %v\n", pattern, err)
		return ms
	}

	for _, filename := range files {
		b, err := ioutil.ReadFile(filename)
		if err != nil {
			log.Printf("[E] HTML动态规则[%s]: %v\n", filename, err)
			continue
		}
		// Allocate a fresh model per file so every returned pointer is
		// distinct.
		m := new(SpiderModle)
		if err := xml.Unmarshal(b, m); err != nil {
			log.Printf("[E] HTML动态规则[%s]: %v\n", filename, err)
			continue
		}
		ms = append(ms, m)
	}
	return ms
}