robots.go
package thingfulx

import (
	"context"
	"errors"
	"fmt"
	"net/http"
	"net/url"
	"sync"

	"github.com/temoto/robotstxt"
)

// permissiveString contains a robots.txt file that allows all access. Used
// when we cannot obtain a valid robots.txt from a domain.
const permissiveString = "User-Agent: *\nDisallow:"

var (
	// robotsPath is a relative URL containing the well known path for
	// robots.txt files, suitable for resolving against any host
	robotsPath, _ = url.Parse("/robots.txt")
)
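
// Illustrative sketch (the example URL is an assumption, not taken from this
// file) of how robotsPath is resolved against a resource URL:
//
//	base, _ := url.Parse("https://example.com/some/deep/path")
//	robotsURL := base.ResolveReference(robotsPath)
//	// robotsURL.String() == "https://example.com/robots.txt"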

// newRobots returns a robots instance initialized with an empty cache.
func newRobots() *robots {
	return &robots{
		cache: map[string]*robotstxt.Group{},
	}
}

// robots checks whether a request is permitted to be sent to a domain, either
// because the domain's robots.txt explicitly permits access, or because we
// are unable to obtain a valid robots.txt file from the well known path. The
// type is safe for concurrent use from multiple goroutines, and maintains a
// simple in-memory cache of parsed robots.txt groups to avoid making
// repeated calls.
type robots struct {
	sync.RWMutex
	cache map[string]*robotstxt.Group
}

// permitted reports whether the client should be permitted to access the
// given resource URL. It obtains a robots.txt group for the URL's host and
// tests the URL's path against it. Because the robots instance caches parsed
// groups in memory, it will not repeatedly request the remote robots.txt
// content for a specific host. If no valid robots.txt is available we fall
// back to a permissive group that allows all requests.
func (r *robots) permitted(ctx context.Context, resourceURL *url.URL, userAgent string, client *http.Client) bool {
	group := r.getGroup(ctx, resourceURL, userAgent, client)
	return group.Test(resourceURL.Path)
}
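
// A minimal usage sketch (hypothetical caller; the user agent string and
// target URL are assumptions, not part of this package):
//
//	r := newRobots()
//	target, _ := url.Parse("https://example.com/data/feed.json")
//	if r.permitted(context.Background(), target, "thingful-bot/1.0", http.DefaultClient) {
//		// safe to fetch target
//	}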

// getGroup attempts to obtain a *robotstxt.Group instance for the specified
// URL, caching the result to speed up subsequent requests. If any error
// happens while retrieving a robots.txt, we assume access is permitted.
func (r *robots) getGroup(ctx context.Context, resourceURL *url.URL, userAgent string, client *http.Client) *robotstxt.Group {
	// look in cache and return if present
	group := r.getGroupFromCache(resourceURL.Host)
	if group != nil {
		return group
	}

	// if not present then attempt to fetch from remote
	group, err := r.fetchGroup(ctx, resourceURL, userAgent, client)
	if err != nil {
		// on any error fall back to our fully permissive group
		group = permissiveGroup(userAgent)
	}

	// update cache
	r.Lock()
	defer r.Unlock()
	r.cache[resourceURL.Host] = group

	return group
}
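
// Cache behaviour sketch (illustrative; ctx, u, agent and client are assumed
// to exist in the caller):
//
//	g1 := r.getGroup(ctx, u, agent, client) // first call fetches and caches
//	g2 := r.getGroup(ctx, u, agent, client) // second call served from cache
//	// g1 == g2 for the same host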

// getGroupFromCache attempts to return a robotstxt.Group instance from the
// in-memory cache (a map). Returns nil if an instance for the specified host
// is not present.
func (r *robots) getGroupFromCache(host string) *robotstxt.Group {
	r.RLock()
	defer r.RUnlock()

	group, ok := r.cache[host]
	if ok {
		return group
	}

	return nil
}

// fetchGroup attempts to fetch and parse the robots.txt content from the
// remote domain. On any error we return that error to the caller, which
// falls back to a fully permissive group, meaning requests will be permitted
// until we check again.
func (r *robots) fetchGroup(ctx context.Context, resourceURL *url.URL, userAgent string, client *http.Client) (*robotstxt.Group, error) {
	robotsURL := resourceURL.ResolveReference(robotsPath)

	req, err := http.NewRequest(http.MethodGet, robotsURL.String(), nil)
	if err != nil {
		return nil, fmt.Errorf("failed to build request for robots.txt: %v", err)
	}

	// propagate our context onwards
	req = req.WithContext(ctx)

	req.Header.Add("User-Agent", userAgent)

	// make request
	resp, err := client.Do(req)
	if err != nil {
		return nil, fmt.Errorf("failed to make request for robots.txt: %v", err)
	}

	// ensure response body is closed as robotstxt doesn't do this for us
	defer resp.Body.Close()

	// attempt to parse the robots.txt data from the response
	robotsData, err := robotstxt.FromResponse(resp)
	if err != nil {
		return nil, fmt.Errorf("failed to parse robots.txt response: %v", err)
	}

	// attempt to obtain a robotstxt.Group for our user agent from the parsed data
	group := robotsData.FindGroup(userAgent)
	if group == nil {
		return nil, errors.New("failed to find matching group in robots.txt")
	}

	return group, nil
}
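
// The parsing step in isolation, for reference (sample robots.txt content,
// not fetched from anywhere):
//
//	data, _ := robotstxt.FromString("User-Agent: *\nDisallow: /private")
//	group := data.FindGroup("thingful-bot/1.0")
//	group.Test("/public/item")  // true
//	group.Test("/private/item") // false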

// permissiveGroup is a helper function that returns a fully permissive
// robotstxt.Group, which we apply if we are unable to obtain an actual
// robots.txt from the remote data provider. permissiveString is a known-good
// constant, so the parse error can safely be ignored.
func permissiveGroup(userAgent string) *robotstxt.Group {
	rd, _ := robotstxt.FromString(permissiveString)
	return rd.FindGroup(userAgent)
}