This repository has been archived by the owner on Mar 27, 2023. It is now read-only.
/
pagemonitor.go
167 lines (149 loc) · 4.4 KB
/
pagemonitor.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
package fetcher
import (
"bytes"
"fmt"
"io"
"net/http"
"regexp"
"strings"
"time"
"github.com/pmezard/go-difflib/difflib"
log "github.com/sirupsen/logrus"
"golang.org/x/net/html"
"github.com/zlogic/nanorss-go/data"
)
// getPreviousResult looks up the stored page for config in the database.
// A lookup error is logged but not returned to the caller.
// When the lookup yields no page, an empty PagemonitorPage is returned
// instead of nil, so callers can always dereference the result.
func (fetcher *Fetcher) getPreviousResult(config *data.UserPagemonitor) *data.PagemonitorPage {
	stored, lookupErr := fetcher.DB.GetPage(config)
	if lookupErr != nil {
		entry := log.WithField("page", config).WithError(lookupErr)
		entry.Error("Failed to fetch previous result")
	}
	if stored != nil {
		return stored
	}
	return new(data.PagemonitorPage)
}
// FetchPage downloads the page at config.URL, converts it to plain text and
// compares it with the previously stored contents.
//
// If config.Match is set, it is compiled as a regular expression and both the
// new and the previous text are passed through ReplaceAllString with
// config.Replace before being compared. When the filtered texts are equal the
// previous page is saved again as-is. Otherwise a unified diff (3 lines of
// context) of the filtered texts becomes the page delta, the unfiltered text
// becomes the page contents, the read status for the page key is reset to
// false, and the page is saved.
//
// Whatever the outcome, a fetch status carrying either a success or a failure
// timestamp is written for the page key. A failure to write that status is only
// logged; the returned error is the one from the fetch/compare/save step.
func (fetcher *Fetcher) FetchPage(config *data.UserPagemonitor) error {
	fetchAndStore := func() error {
		previous := fetcher.getPreviousResult(config)

		resp, err := fetcher.Client.Get(config.URL)
		if err != nil {
			return fmt.Errorf("cannot GET page %v: %w", config, err)
		}
		defer resp.Body.Close()
		if resp.StatusCode != http.StatusOK {
			statusErr := fmt.Errorf("cannot GET page (status code %v)", resp.StatusCode)
			return fmt.Errorf("cannot GET page %v: %w", config, statusErr)
		}

		text, err := convertHTMLtoText(resp.Body)
		if err != nil {
			return fmt.Errorf("cannot convert HTML to text %v: %w", config, err)
		}

		// Without a match pattern the texts are compared verbatim.
		currentFiltered, previousFiltered := text, previous.Contents
		if config.Match != "" {
			matcher, err := regexp.Compile(config.Match)
			if err != nil {
				return fmt.Errorf("cannot compile match regex %v: %w", config, err)
			}
			currentFiltered = matcher.ReplaceAllString(text, config.Replace)
			previousFiltered = matcher.ReplaceAllString(previous.Contents, config.Replace)
		}

		if currentFiltered == previousFiltered {
			// Nothing changed: re-save the previous page untouched (the original
			// author's note says this updates the last seen time).
			return fetcher.DB.SavePage(previous)
		}

		delta, err := difflib.GetUnifiedDiffString(difflib.UnifiedDiff{
			A:       difflib.SplitLines(previousFiltered),
			B:       difflib.SplitLines(currentFiltered),
			Context: 3,
		})
		if err != nil {
			return fmt.Errorf("cannot create diff for page %v: %w", config, err)
		}

		// The diff is computed on filtered text, but the unfiltered text is stored.
		previous.Delta = delta
		previous.Contents = text
		previous.Updated = time.Now()
		previous.Config = config

		if err := fetcher.DB.SetReadStatusForAll(config.CreateKey(), false); err != nil {
			return fmt.Errorf("cannot mark page %v as unread: %w", config, err)
		}
		log.WithField("value", previous).WithField("page", config).WithField("delta", previous.Delta).Debug("Page has changed")
		return fetcher.DB.SavePage(previous)
	}

	fetchErr := fetchAndStore()

	status := &data.FetchStatus{}
	if fetchErr == nil {
		status.LastSuccess = time.Now()
	} else {
		log.WithField("page", config).WithError(fetchErr).Error("Failed to get page")
		status.LastFailure = time.Now()
	}

	statusKey := config.CreateKey()
	if saveErr := fetcher.DB.SetFetchStatus(statusKey, status); saveErr != nil {
		log.WithField("page", config).WithError(saveErr).Error("Failed to save fetch status for page")
	}
	return fetchErr
}
// FetchAllPages runs FetchPage for every page of every user.
//
// Users are processed one after another; the pages of a single user are
// fetched concurrently, one goroutine per page, and the next user is not
// started until all of those goroutines have finished.
//
// Failing to list the users or to load a user aborts the run and returns the
// error. Failing to get the pages of a user is logged and that user is
// skipped. Errors from individual FetchPage calls are not propagated.
func (fetcher *Fetcher) FetchAllPages() error {
	usernames, err := fetcher.DB.GetUsers()
	if err != nil {
		log.WithError(err).Error("Failed to get list of users")
		return err
	}

	for _, username := range usernames {
		user, err := fetcher.DB.GetUser(username)
		if err != nil {
			log.WithField("username", username).WithError(err).Error("Failed to get user")
			return err
		}

		pages, err := user.GetPages()
		if err != nil {
			log.WithError(err).Error("Failed to get pages")
			continue
		}

		// Each worker signals once on done; the loop below collects one signal per page.
		done := make(chan struct{})
		for _, page := range pages {
			go func(config data.UserPagemonitor) {
				// TODO: skip this page if it was already fetched this round.
				_ = fetcher.FetchPage(&config) // result intentionally discarded, as before
				done <- struct{}{}
			}(page)
		}
		for range pages {
			<-done
		}
	}
	return nil
}
// convertHTMLtoText reads an HTML document from r and returns its text content.
//
// Every text token is trimmed of surrounding whitespace; tokens that are empty
// after trimming are dropped and the remaining ones are joined with "\n".
// Reaching the end of the input is the normal termination; any other tokenizer
// error is returned together with an empty string.
func convertHTMLtoText(r io.Reader) (string, error) {
	tokenizer := html.NewTokenizer(r)
	var buff bytes.Buffer
	for {
		switch tokenizer.Next() {
		case html.ErrorToken:
			if err := tokenizer.Err(); err != io.EOF {
				return "", err
			}
			return buff.String(), nil
		case html.TextToken:
			// The tokenizer already resolves character references in text
			// tokens, so the text must not be unescaped a second time:
			// doing so would turn literal text such as "&amp;lt;" (displayed
			// as "&lt;") into "<".
			text := strings.TrimSpace(string(tokenizer.Text()))
			if text == "" {
				continue
			}
			if buff.Len() > 0 {
				buff.WriteByte('\n')
			}
			buff.WriteString(text)
		}
	}
}