// Package extractor uses an altered version of go-readability and local rules to get articles
package extractor

import (
"context"
"fmt"
"io"
"net/http"
"net/url"
"regexp"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
log "github.com/go-pkgz/lgr"
"github.com/mauidude/go-readability"
"go.mongodb.org/mongo-driver/bson/primitive"
"github.com/ukeeper/ukeeper-redabilty/backend/datastore"
)
// Rules defines all methods needed to access the rules datastore
type Rules interface {
	Get(ctx context.Context, rURL string) (datastore.Rule, bool)
	GetByID(ctx context.Context, id primitive.ObjectID) (datastore.Rule, bool)
	Save(ctx context.Context, rule datastore.Rule) (datastore.Rule, error)
	Disable(ctx context.Context, id primitive.ObjectID) error
	All(ctx context.Context) []datastore.Rule
}
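
// A minimal static implementation of Rules (a sketch, useful for tests only;
// the real implementation lives in the datastore package) might look like:
//
//	type oneRule struct{ r datastore.Rule }
//
//	func (s oneRule) Get(context.Context, string) (datastore.Rule, bool)                 { return s.r, true }
//	func (s oneRule) GetByID(context.Context, primitive.ObjectID) (datastore.Rule, bool) { return s.r, true }
//	func (s oneRule) Save(_ context.Context, r datastore.Rule) (datastore.Rule, error)   { return r, nil }
//	func (s oneRule) Disable(context.Context, primitive.ObjectID) error                  { return nil }
//	func (s oneRule) All(context.Context) []datastore.Rule                               { return []datastore.Rule{s.r} }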
// UReadability implements fetcher & extractor for local readability-like functionality
type UReadability struct {
	TimeOut     time.Duration // overall timeout for fetching a page
	SnippetSize int           // desired size of the text excerpt
	Rules       Rules         // optional custom extraction rules, may be nil
}
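
// A hypothetical construction and call (all values illustrative):
//
//	ur := UReadability{TimeOut: 30 * time.Second, SnippetSize: 300, Rules: nil} // nil Rules falls back to the general parser
//	resp, err := ur.Extract(context.Background(), "https://example.com/article")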
// Response from API calls
type Response struct {
	Content     string   `json:"content"`
	Rich        string   `json:"rich_content"`
	Domain      string   `json:"domain"`
	URL         string   `json:"url"`
	Title       string   `json:"title"`
	Excerpt     string   `json:"excerpt"`
	Image       string   `json:"lead_image_url"`
	AllImages   []string `json:"images"`
	AllLinks    []string `json:"links"`
	ContentType string   `json:"type"`
	Charset     string   `json:"charset"`
}
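
// For example, a successful extraction might serialize to (abridged, all
// values hypothetical):
//
//	{
//	  "content": "Plain text of the article...",
//	  "rich_content": "<p>HTML of the article...</p>",
//	  "domain": "example.com",
//	  "url": "https://example.com/article",
//	  "title": "Article Title",
//	  "excerpt": "First few sentences of the article...",
//	  "lead_image_url": "https://example.com/lead.jpg",
//	  "images": ["https://example.com/lead.jpg"],
//	  "links": ["https://example.com/next"],
//	  "type": "text/html",
//	  "charset": "utf-8"
//	}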
var (
	reLinks  = regexp.MustCompile(`(href|src|action|background)="([^"]*)"`) // quoted values of URL-bearing attributes
	reSpaces = regexp.MustCompile(`\s+`)                                    // runs of whitespace
	reDot    = regexp.MustCompile(`\D(\.)\S`)                               // a dot between a non-digit and a non-space, i.e. a likely sentence break
)
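
// For example, applied to `<a href="/about">` reLinks yields the submatches
// "href" and "/about"; normalizeLinks below rewrites the value (the last
// submatch) into an absolute URL.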

const userAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15"

// Extract fetches the page and retrieves the article
func (f UReadability) Extract(ctx context.Context, reqURL string) (rb *Response, err error) {
	log.Printf("[INFO] extract %s", reqURL)
	rb = &Response{}
	httpClient := &http.Client{Timeout: f.TimeOut} // TimeOut is already a time.Duration
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, reqURL, nil)
	if err != nil {
		log.Printf("[WARN] failed to create request for %s, error=%v", reqURL, err)
		return nil, err
	}
	req.Close = true
	req.Header.Set("User-Agent", userAgent)

	resp, err := httpClient.Do(req)
	if err != nil {
		log.Printf("[WARN] failed to get anything from %s, error=%v", reqURL, err)
		return nil, err
	}
	defer func() {
		if err = resp.Body.Close(); err != nil {
			log.Printf("[WARN] failed to close response body, error=%v", err)
		}
	}()

	rb.URL = resp.Request.URL.String() // final URL, after any redirects
	dataBytes, e := io.ReadAll(resp.Body)
	if e != nil {
		log.Printf("[WARN] failed to read data from %s, error=%v", reqURL, e)
		return nil, e
	}

	var body string
	rb.ContentType, rb.Charset, body = f.toUtf8(dataBytes, resp.Header)
	rb.Content, rb.Rich, err = f.getContent(ctx, body, reqURL)
	if err != nil {
		log.Printf("[WARN] failed to parse %s, error=%v", reqURL, err)
		return nil, err
	}

	dbody, err := goquery.NewDocumentFromReader(strings.NewReader(body))
	if err != nil {
		return nil, err
	}
	rb.Title = dbody.Find("title").First().Text()
	if r, e := url.Parse(rb.URL); e == nil {
		rb.Domain = r.Host
	}

	rb.Content = f.getText(rb.Content, rb.Title)
	rb.Rich, rb.AllLinks = f.normalizeLinks(rb.Rich, resp.Request)
	rb.Excerpt = f.getSnippet(rb.Content)

	darticle, err := goquery.NewDocumentFromReader(strings.NewReader(rb.Rich))
	if err != nil {
		log.Printf("[WARN] failed to create document from reader, error=%v", err)
		return nil, err
	}
	if im, allImages, ok := f.extractPics(darticle.Find("img"), reqURL); ok {
		rb.Image = im
		rb.AllImages = allImages
	}

	log.Printf("[INFO] completed for %s, url=%s", rb.Title, rb.URL)
	return rb, nil
}

// getContent extracts the article from a raw body string, returning it both as plain text (content) and with html tags (rich)
func (f UReadability) getContent(ctx context.Context, body, reqURL string) (content, rich string, err error) {
	// general parser
	genParser := func(body, _ string) (content, rich string, err error) {
		doc, err := readability.NewDocument(body)
		if err != nil {
			return "", "", err
		}
		content, rich = doc.ContentWithHTML()
		return content, rich, nil
	}

	// custom rules parser
	customParser := func(body, reqURL string, rule datastore.Rule) (content, rich string, err error) {
		log.Printf("[DEBUG] custom extractor for %s", reqURL)
		dbody, err := goquery.NewDocumentFromReader(strings.NewReader(body))
		if err != nil {
			return "", "", err
		}
		var res string
		dbody.Find(rule.Content).Each(func(_ int, s *goquery.Selection) {
			if html, e := s.Html(); e == nil {
				res += html
			}
		})
		if res == "" {
			return "", "", fmt.Errorf("nothing extracted from %s, rule=%v", reqURL, rule)
		}
		log.Printf("[INFO] custom rule processed for %s", reqURL)
		return f.getText(res, ""), res, nil
	}

	if f.Rules != nil {
		if rule, found := f.Rules.Get(ctx, reqURL); found {
			if content, rich, err = customParser(body, reqURL, rule); err == nil {
				return content, rich, nil
			}
			log.Printf("[WARN] custom extractor failed for %s, error=%v", reqURL, err) // fall back to the general parser
		}
	} else {
		log.Printf("[DEBUG] no rules defined!")
	}
	return genParser(body, reqURL)
}
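
// A custom rule is driven by the goquery (CSS-style) selector stored in its
// Content field; a hypothetical rule for a site that keeps articles in
// <div class="article-body"> could be stored through any Rules implementation
// (rules below is such an implementation):
//
//	rule, err := rules.Save(ctx, datastore.Rule{Content: "div.article-body"})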

// normalizeLinks makes all links absolute and returns all found links
func (f UReadability) normalizeLinks(data string, reqContext *http.Request) (result string, links []string) {
	absoluteLink := func(link string) (absLink string, changed bool) {
		if r, err := reqContext.URL.Parse(link); err == nil {
			return r.String(), r.String() != link
		}
		return "", false
	}

	result = data
	matches := reLinks.FindAllStringSubmatch(data, -1)
	normalizedCount := 0
	for _, m := range matches {
		srcLink := m[len(m)-1] // the link is in the last element of the group
		dstLink := srcLink
		if absLink, changed := absoluteLink(srcLink); changed {
			dstLink = absLink
			srcLink = fmt.Sprintf(`"%s"`, srcLink) // quote both sides so only the attribute value is replaced
			absLink = fmt.Sprintf(`"%s"`, absLink)
			result = strings.ReplaceAll(result, srcLink, absLink)
			log.Printf("[DEBUG] normalized %s -> %s", srcLink, dstLink)
			normalizedCount++
		}
		links = append(links, dstLink)
	}
	log.Printf("[DEBUG] normalized %d links", normalizedCount)
	return result, links
}
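
// As an illustration, for a page fetched from https://example.com/posts/1 the
// loop above rewrites
//
//	<img src="/img/pic.png">  ->  <img src="https://example.com/img/pic.png">
//
// and appends the absolute form to the returned links slice.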