
readme update, FromResponse -> FromStatusAndBytes(), unexpected status codes - error (was disallow)
1 parent 08fdba6 commit 164bac4d1e728ec409d267d3765a75e4fbe97cee @temoto committed Oct 4, 2012
Showing with 88 additions and 56 deletions.
  1. +45 −17 README.rst
  2. +23 −19 robotstxt.go
  3. +20 −20 robotstxt_test.go
README.rst
@@ -13,49 +13,77 @@ To build and run tests run `go test` in source directory.
Usage
=====
+As usual, no special installation is required, just
+
+ import "github.com/temoto/robotstxt.go"
+
+run `go get` and you're ready.
+
1. Parse
^^^^^^^^
First of all, you need to parse robots.txt data. You can do it with
-function `FromString(body string) (*RobotsData, error)`::
+functions `FromBytes(body []byte) (*RobotsData, error)` or its `string` counterpart `FromString`::
+ robots, err := robotstxt.FromBytes([]byte("User-agent: *\nDisallow:"))
robots, err := robotstxt.FromString("User-agent: *\nDisallow:")
-There is a convenient function `FromResponse(statusCode int, body string) (*RobotsData, error)`
-to init robots data from HTTP response status code and body::
+As of 2012-10-03, `FromBytes` is the most efficient method; everything else
+is a wrapper for this core function.
+
+There are a few convenient constructors for various purposes:
+
+* `FromResponse(*http.Response) (*RobotsData, error)` to init robots data
+from HTTP response. It *does not* call `response.Body.Close()`::
- robots, err := robotstxt.FromResponse(resp.StatusCode, resp.Body)
+ robots, err := robotstxt.FromResponse(resp)
+ resp.Body.Close()
if err != nil {
- // robots.txt parse error
- return false, err
+ log.Println("Error parsing robots.txt:", err.Error())
}
-Passing status code applies following logic in line with google's interpretation of robots.txt files:
+* `FromStatusAndBytes(statusCode int, body []byte) (*RobotsData, error)` or
+`FromStatusAndString` if you prefer to read the bytes (or string) yourself.
+Passing the status code applies the following logic, in line with Google's
+interpretation of robots.txt files (a usage sketch follows the list below):
- * status code = 4xx -> allow all (even 401/403, as recommended by Google).
- * status code = 2xx -> parse body with `FromString` and apply rules listed there.
- * other statuses (5xx) -> disallow all, consider this a temporary unavailability.
+ * status 2xx -> parse body with `FromBytes` and apply rules listed there.
+ * status 4xx -> allow all (even 401/403, as recommended by Google).
+ * other (5xx) -> disallow all, consider this a temporary unavailability.
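
For illustration, a minimal sketch of feeding a status code and body you
fetched yourself to `FromStatusAndString` (the status codes and bodies below
are placeholders for the example; a status outside 2xx/4xx/5xx is reported
as an error rather than a disallow)::

    robots, err := robotstxt.FromStatusAndString(200, "User-agent: *\nDisallow: /private")
    // robots now answers queries according to the parsed rules

    _, err = robotstxt.FromStatusAndString(302, "")
    // err is non-nil: redirects and other unexpected statuses are errors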
2. Query
^^^^^^^^
Parsing robots.txt content builds a kind of logic database, which you can
query with `(r *RobotsData) TestAgent(url, agent string) (bool)`.
-Explicit passing of agent is useful if you want to query for different agents. For single agent
-users there is a convenient option: `(r *RobotsData) Test(url) (bool)` which is
-identical to `TestAgent`, but uses `r.DefaultAgent` as user agent for each query.
+Explicit passing of agent is useful if you want to query for different agents. For
+single agent users there is an efficient option: `RobotsData.FindGroup(userAgent string)`
+returns a structure with `.Test(path string)` method and `.CrawlDelay time.Duration`.
-Query parsed robots data with explicit user agent.
+Simple query with explicit user agent. Each call will scan all rules.
::
allow := robots.TestAgent("/", "FooBot")
-Or with implicit user agent.
+Or query several paths against the same user agent for performance.
::
- robots.DefaultAgent = "OtherBot"
- allow := robots.Test("/")
+ group := robots.FindGroup("BarBot")
+ group.Test("/")
+ group.Test("/download.mp3")
+ group.Test("/news/article-2012-1")
+
+
+Who
+===
+
+Honorable contributors (in undefined order):
+
+ * Ilya Grigorik (igrigorik)
+ * Martin Angers (PuerkitoBio)
+ * Micha Gorelick (mynameisfiber)
+Initial commit and other: Sergey Shepelev temotor@gmail.com
robotstxt.go
@@ -3,14 +3,16 @@
// with various extensions.
package robotstxt
-// Comments explaining the logic are taken from either the google's spec:
+// Comments explaining the logic are taken from either the Google's spec:
// https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt
import (
"bytes"
+ "errors"
"io/ioutil"
"net/http"
"regexp"
+ "strconv"
"strings"
"time"
)
@@ -56,9 +58,11 @@ func (e ParseError) Error() string {
var allowAll = &RobotsData{allowAll: true}
var disallowAll = &RobotsData{disallowAll: true}
-func FromResponseBytes(statusCode int, body []byte) (*RobotsData, error) {
+func FromStatusAndBytes(statusCode int, body []byte) (*RobotsData, error) {
switch {
- //
+ case statusCode >= 200 && statusCode < 300:
+ return FromBytes(body)
+
// From https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt
//
// Google treats all 4xx errors in the same way and assumes that no valid
@@ -67,19 +71,19 @@ func FromResponseBytes(statusCode int, body []byte) (*RobotsData, error) {
// "Unauthorized" and 403 "Forbidden" HTTP result codes.
case statusCode >= 400 && statusCode < 500:
return allowAll, nil
- case statusCode >= 200 && statusCode < 300:
- return FromBytes(body)
- }
- // Conservative disallow all default
- //
- // From google's spec:
+
+ // From Google's spec:
// Server errors (5xx) are seen as temporary errors that result in a "full
// disallow" of crawling.
- return disallowAll, nil
+ case statusCode >= 500 && statusCode < 600:
+ return disallowAll, nil
+ }
+
+ return nil, errors.New("Unexpected status: " + strconv.FormatInt(int64(statusCode), 10))
}
-func FromResponseContent(statusCode int, body string) (*RobotsData, error) {
- return FromResponseBytes(statusCode, []byte(body))
+func FromStatusAndString(statusCode int, body string) (*RobotsData, error) {
+ return FromStatusAndBytes(statusCode, []byte(body))
}
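
// A hypothetical call (not part of this file) illustrating the new behaviour:
// a redirect status now surfaces as an error instead of the former
// disallow-all default, e.g.
//
//     if _, err := FromStatusAndString(302, ""); err != nil {
//         // err reads "Unexpected status: 302"
//     }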
func FromResponse(res *http.Response) (*RobotsData, error) {
@@ -91,7 +95,7 @@ func FromResponse(res *http.Response) (*RobotsData, error) {
if e != nil {
return nil, e
}
- return FromResponseBytes(res.StatusCode, buf)
+ return FromStatusAndBytes(res.StatusCode, buf)
}
func FromBytes(body []byte) (r *RobotsData, err error) {
@@ -140,7 +144,7 @@ func (r *RobotsData) TestAgent(path, agent string) bool {
}
// Find a group of rules that applies to this agent
- // From google's spec:
+ // From Google's spec:
// The user-agent is non-case-sensitive.
if g := r.FindGroup(agent); g != nil {
// Find a rule that applies to this url
@@ -149,12 +153,12 @@ func (r *RobotsData) TestAgent(path, agent string) bool {
}
}
- // From google's spec:
- // By default, there are no restrictions for crawling for the designated crawlers.
+ // From Google's spec:
+ // By default, there are no restrictions for crawling for the designated crawlers.
return true
}
-// From google's spec:
+// From Google's spec:
// Only one group of group-member records is valid for a particular crawler.
// The crawler must determine the correct group of records by finding the group
// with the most specific user-agent that still matches. All other groups of
@@ -190,7 +194,7 @@ func (g *Group) Test(path string) bool {
return true
}
-// From google's spec:
+// From Google's spec:
// The path value is used as a basis to determine whether or not a rule applies
// to a specific URL on a site. With the exception of wildcards, the path is
// used to match the beginning of a URL (and any valid URLs that start with the
@@ -207,7 +211,7 @@ func (g *Group) findRule(path string) (ret *rule) {
if r.pattern != nil {
if r.pattern.MatchString(path) {
// Consider this a match equal to the length of the pattern.
- // From google's spec:
+ // From Google's spec:
// The order of precedence for rules with wildcards is undefined.
if l := len(r.pattern.String()); l > prefixLen {
prefixLen = len(r.pattern.String())
robotstxt_test.go
@@ -4,18 +4,18 @@ import (
"testing"
)
-func TestFromResponseBasic(t *testing.T) {
- if _, err := FromResponseContent(200, ""); err != nil {
- t.Fatal("FromResponse MUST accept 200/\"\"")
+func TestFromStatusAndStringBasic(t *testing.T) {
+ if _, err := FromStatusAndString(200, ""); err != nil {
+ t.Fatal("FromStatusAndString MUST accept 200/\"\"")
}
- if _, err := FromResponseContent(401, ""); err != nil {
- t.Fatal("FromResponse MUST accept 401/\"\"")
+ if _, err := FromStatusAndString(401, ""); err != nil {
+ t.Fatal("FromStatusAndString MUST accept 401/\"\"")
}
- if _, err := FromResponseContent(403, ""); err != nil {
- t.Fatal("FromResponse MUST accept 403/\"\"")
+ if _, err := FromStatusAndString(403, ""); err != nil {
+ t.Fatal("FromStatusAndString MUST accept 403/\"\"")
}
- if _, err := FromResponseContent(404, ""); err != nil {
- t.Fatal("FromResponse MUST accept 404/\"\"")
+ if _, err := FromStatusAndString(404, ""); err != nil {
+ t.Fatal("FromStatusAndString MUST accept 404/\"\"")
}
}
@@ -36,19 +36,19 @@ func ExpectDisallow(r *RobotsData, t *testing.T, msg string) {
}
}
-func TestResponse401(t *testing.T) {
- r, _ := FromResponseContent(401, "")
- ExpectAllow(r, t, "FromResponse(401, \"\") MUST allow everything.")
+func TestStatus401(t *testing.T) {
+ r, _ := FromStatusAndString(401, "")
+ ExpectAllow(r, t, "FromStatusAndString(401, \"\") MUST allow everything.")
}
-func TestResponse403(t *testing.T) {
- r, _ := FromResponseContent(403, "")
- ExpectAllow(r, t, "FromResponse(403, \"\") MUST allow everything.")
+func TestStatus403(t *testing.T) {
+ r, _ := FromStatusAndString(403, "")
+ ExpectAllow(r, t, "FromStatusAndString(403, \"\") MUST allow everything.")
}
-func TestResponse404(t *testing.T) {
- r, _ := FromResponseContent(404, "")
- ExpectAllow(r, t, "FromResponse(404, \"\") MUST allow everything.")
+func TestStatus404(t *testing.T) {
+ r, _ := FromStatusAndString(404, "")
+ ExpectAllow(r, t, "FromStatusAndString(404, \"\") MUST allow everything.")
}
func TestFromStringBasic(t *testing.T) {
@@ -225,8 +225,8 @@ func BenchmarkParseFromString002(b *testing.B) {
}
}
-func BenchmarkParseFromResponse401(b *testing.B) {
+func BenchmarkParseFromStatus401(b *testing.B) {
for i := 0; i < b.N; i++ {
- FromResponseContent(401, "")
+ FromStatusAndString(401, "")
}
}
