


clean-up API based on discussion with temoto

commit 64ca140e7cadc0f719c9e31cac53706f8c74e4aa (1 parent: 0e01e6e)
@PuerkitoBio authored and committed
Showing with 90 additions and 71 deletions.
  1. +19 −18 google_test.go
  2. +5 −4 parser.go
  3. +42 −25 robotstxt.go
  4. +24 −24 robotstxt_test.go
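Taken together, the diff removes the print_errors boolean from every constructor, renames the old status-code entry point to FromResponseContent, adds a FromResponse that consumes an *http.Response directly, exports Sitemaps, Group, and FindGroup, replaces RobotsData.Test and DefaultAgent with Group.Test, and changes CrawlDelay from float64 seconds to time.Duration. A minimal sketch of a call site under the new API (the import path is assumed for illustration; adjust to your checkout):

```go
package main

import (
	"fmt"
	"log"

	"github.com/temoto/robotstxt" // assumed import path
)

func main() {
	// Before this commit: robotstxt.FromString(content, printErrors bool).
	// After: the boolean is gone; failures are reported via the error value.
	r, err := robotstxt.FromString("User-agent: *\nDisallow: /private\nCrawl-delay: 3.5\n")
	if err != nil {
		log.Fatal(err)
	}

	// TestAgent is unchanged; RobotsData.Test and DefaultAgent were removed.
	fmt.Println(r.TestAgent("/private/page", "SomeBot")) // false

	// Group and FindGroup are now exported; CrawlDelay is a time.Duration.
	if g := r.FindGroup("SomeBot"); g != nil {
		fmt.Println(g.Test("/public"), g.CrawlDelay) // true 3.5s
	}
}
```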
google_test.go (37 changed lines)
@@ -3,6 +3,7 @@ package robotstxt
import (
"strings"
"testing"
+ "time"
)
const (
@@ -89,11 +90,11 @@ func TestGroupOrder(t *testing.T) {
agents := []string{"Googlebot-News (Googlebot)", "Googlebot", "Googlebot-Image (Googlebot)", "Otherbot (web)", "Otherbot (News)"}
groups := []int{1, 3, 3, 2, 2}
- if r, e := FromString(robots_case_order, false); e != nil {
+ if r, e := FromString(robots_case_order); e != nil {
t.Fatal(e)
} else {
for i, a := range agents {
- g := r.findGroup(a)
+ g := r.FindGroup(a)
gi := getIndexInSlice(r.groups, g) + 1
if gi != groups[i] {
t.Fatalf("Expected agent %s to have group number %d, got %d.", a, groups[i], gi)
@@ -103,7 +104,7 @@ func TestGroupOrder(t *testing.T) {
}
func TestGrouping(t *testing.T) {
- if r, e := FromString(robots_case_grouping, false); e != nil {
+ if r, e := FromString(robots_case_grouping); e != nil {
t.Fatal(e)
} else {
if len(r.groups) != 3 {
@@ -122,14 +123,14 @@ func TestGrouping(t *testing.T) {
}
func TestSitemaps(t *testing.T) {
- if r, e := FromString(robots_case_sitemaps, false); e != nil {
+ if r, e := FromString(robots_case_sitemaps); e != nil {
t.Fatal(e)
} else {
- if len(r.sitemaps) != 3 {
- for i, s := range r.sitemaps {
+ if len(r.Sitemaps) != 3 {
+ for i, s := range r.Sitemaps {
t.Logf("Sitemap %d: %s", i, s)
}
- t.Fatalf("Expected 3 sitemaps, got %d", len(r.sitemaps))
+ t.Fatalf("Expected 3 sitemaps, got %d", len(r.Sitemaps))
}
if len(r.groups) != 3 {
t.Fatalf("Expected 3 groups, got %d", len(r.groups))
@@ -138,26 +139,26 @@ func TestSitemaps(t *testing.T) {
}
func TestCrawlDelays(t *testing.T) {
- if r, e := FromString(robots_case_delays, false); e != nil {
+ if r, e := FromString(robots_case_delays); e != nil {
t.Fatal(e)
} else {
- if len(r.sitemaps) != 1 {
- t.Fatalf("Expected 1 sitemaps, got %d", len(r.sitemaps))
+ if len(r.Sitemaps) != 1 {
+ t.Fatalf("Expected 1 sitemaps, got %d", len(r.Sitemaps))
}
if len(r.groups) != 3 {
t.Fatalf("Expected 3 groups, got %d", len(r.groups))
}
- if r.groups[1].crawlDelay != 3.5 {
- t.Fatalf("Expected crawl delay of 3.5 for group 2, got %f", r.groups[1].crawlDelay)
+ if r.groups[1].CrawlDelay != time.Duration(3.5*float64(time.Second)) {
+ t.Fatalf("Expected crawl delay of 3.5 for group 2, got %f", r.groups[1].CrawlDelay)
}
- if r.groups[2].crawlDelay != 5 {
- t.Fatalf("Expected crawl delay of 5 for group 3, got %f", r.groups[2].crawlDelay)
+ if r.groups[2].CrawlDelay != (5 * time.Second) {
+ t.Fatalf("Expected crawl delay of 5 for group 3, got %v", r.groups[2].CrawlDelay)
}
}
}
func TestWildcards(t *testing.T) {
- if r, e := FromString(robots_case_wildcards, false); e != nil {
+ if r, e := FromString(robots_case_wildcards); e != nil {
t.Fatal(e)
} else {
if s := r.groups[0].rules[0].pattern.String(); s != "/path.*l$" {
@@ -243,7 +244,7 @@ func TestURLMatching(t *testing.T) {
"^/Fish.PHP",
},
}
- if r, e := FromString(robots_case_matching, false); e != nil {
+ if r, e := FromString(robots_case_matching); e != nil {
t.Fatal(e)
} else {
for k, ar := range cases {
@@ -286,7 +287,7 @@ func TestURLPrecedence(t *testing.T) {
"/",
},
}
- if r, e := FromString(robots_case_precedence, false); e != nil {
+ if r, e := FromString(robots_case_precedence); e != nil {
t.Fatal(e)
} else {
for k, ar := range cases {
@@ -303,7 +304,7 @@ func TestURLPrecedence(t *testing.T) {
}
}
-func getIndexInSlice(ar []*group, g *group) int {
+func getIndexInSlice(ar []*Group, g *Group) int {
for i, v := range ar {
if v == g {
return i
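One detail worth calling out in the updated crawl-delay assertions: time.Duration is just a count of nanoseconds, so time.Duration(3.5*float64(time.Second)) represents the fractional delay exactly and prints as 3.5s under %v. A standalone check of that arithmetic (not part of the repo, purely illustrative):

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	d := time.Duration(3.5 * float64(time.Second))
	fmt.Println(d)                          // 3.5s
	fmt.Println(d == 3500*time.Millisecond) // true
	fmt.Println(int64(d))                   // 3500000000 nanoseconds
}
```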
parser.go (9 changed lines)
@@ -13,6 +13,7 @@ import (
"regexp"
"strconv"
"strings"
+ "time"
)
type lineType uint
@@ -44,8 +45,8 @@ func newParser(tokens []string) *parser {
return &parser{tokens: tokens}
}
-func (p *parser) parseAll() (groups []*group, sitemaps []string, errs []error) {
- var curGroup *group
+func (p *parser) parseAll() (groups []*Group, sitemaps []string, errs []error) {
+ var curGroup *Group
var isEmptyGroup bool
// Reset internal fields, tokens are assigned at creation time, never change
@@ -74,7 +75,7 @@ func (p *parser) parseAll() (groups []*group, sitemaps []string, errs []error) {
curGroup = nil
}
if curGroup == nil {
- curGroup = new(group)
+ curGroup = new(Group)
isEmptyGroup = true
}
// Add the user agent
@@ -114,7 +115,7 @@ func (p *parser) parseAll() (groups []*group, sitemaps []string, errs []error) {
errs = append(errs, errors.New(fmt.Sprintf("Crawl-delay before User-agent at token #%d.", p.pos)))
} else {
isEmptyGroup = false
- curGroup.crawlDelay = li.vf
+ curGroup.CrawlDelay = time.Duration(li.vf * float64(time.Second))
}
}
}
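The parser already carries the Crawl-delay value as a float64 of seconds (li.vf, parsed from the token earlier); the change above only converts it to a time.Duration at assignment. The parse-and-convert step, sketched in isolation (parseCrawlDelay is an illustrative helper, not a function in this repo):

```go
package main

import (
	"fmt"
	"strconv"
	"time"
)

// parseCrawlDelay mirrors the conversion the parser now performs:
// parse the token as float64 seconds, then scale to integer nanoseconds.
func parseCrawlDelay(token string) (time.Duration, error) {
	secs, err := strconv.ParseFloat(token, 64)
	if err != nil {
		return 0, err
	}
	return time.Duration(secs * float64(time.Second)), nil
}

func main() {
	d, err := parseCrawlDelay("3.5")
	if err != nil {
		panic(err)
	}
	fmt.Println(d) // 3.5s
}
```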
robotstxt.go (67 changed lines)
@@ -10,24 +10,26 @@ import (
"bytes"
"errors"
"fmt"
+ "io/ioutil"
+ "net/http"
"os"
"regexp"
"strings"
+ "time"
)
type RobotsData struct {
- DefaultAgent string
// private
- groups []*group
+ groups []*Group
allowAll bool
disallowAll bool
- sitemaps []string
+ Sitemaps []string
}
-type group struct {
+type Group struct {
agents []string
rules []*rule
- crawlDelay float64
+ CrawlDelay time.Duration
}
type rule struct {
@@ -39,7 +41,7 @@ type rule struct {
var allowAll = &RobotsData{allowAll: true}
var disallowAll = &RobotsData{disallowAll: true}
-func FromResponseBytes(statusCode int, body []byte, print_errors bool) (*RobotsData, error) {
+func FromResponseBytes(statusCode int, body []byte) (*RobotsData, error) {
switch {
//
// From https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt
@@ -51,7 +53,7 @@ func FromResponseBytes(statusCode int, body []byte, print_errors bool) (*RobotsD
case statusCode >= 400 && statusCode < 500:
return allowAll, nil
case statusCode >= 200 && statusCode < 300:
- return FromBytes(body, print_errors)
+ return FromBytes(body)
}
// Conservative disallow all default
//
@@ -62,11 +64,23 @@ func FromResponseBytes(statusCode int, body []byte, print_errors bool) (*RobotsD
return disallowAll, nil
}
-func FromResponse(statusCode int, body string, print_errors bool) (*RobotsData, error) {
- return FromResponseBytes(statusCode, []byte(body), print_errors)
+func FromResponseContent(statusCode int, body string) (*RobotsData, error) {
+ return FromResponseBytes(statusCode, []byte(body))
}
-func FromBytes(body []byte, print_errors bool) (r *RobotsData, err error) {
+func FromResponse(res *http.Response) (*RobotsData, error) {
+ if res == nil {
+ // Edge case, if res is nil, return nil data
+ return nil, nil
+ }
+ buf, e := ioutil.ReadAll(res.Body)
+ if e != nil {
+ return nil, e
+ }
+ return FromResponseBytes(res.StatusCode, buf)
+}
+
+func FromBytes(body []byte) (r *RobotsData, err error) {
var errs []error
// special case (probably not worth optimization?)
@@ -76,7 +90,7 @@ func FromBytes(body []byte, print_errors bool) (r *RobotsData, err error) {
}
sc := newByteScanner("bytes", false)
- sc.Quiet = !print_errors
+ //sc.Quiet = !print_errors
sc.Feed(body, true)
var tokens []string
tokens, err = sc.ScanAll()
@@ -91,12 +105,11 @@ func FromBytes(body []byte, print_errors bool) (r *RobotsData, err error) {
r = &RobotsData{}
parser := newParser(tokens)
- r.groups, r.sitemaps, errs = parser.parseAll()
+ r.groups, r.Sitemaps, errs = parser.parseAll()
if len(errs) > 0 {
- if print_errors {
- for _, e := range errs {
- fmt.Fprintln(os.Stderr, e)
- }
+ // TODO : Return error messages as a slice of messages on the error?
+ for _, e := range errs {
+ fmt.Fprintln(os.Stderr, e)
}
return nil, errors.New("Parse error. Use print_errors = true to print on stderr.")
}
@@ -104,12 +117,8 @@ func FromBytes(body []byte, print_errors bool) (r *RobotsData, err error) {
return r, nil
}
-func FromString(body string, print_errors bool) (r *RobotsData, err error) {
- return FromBytes([]byte(body), print_errors)
-}
-
-func (r *RobotsData) Test(path string) bool {
- return r.TestAgent(path, r.DefaultAgent)
+func FromString(body string) (r *RobotsData, err error) {
+ return FromBytes([]byte(body))
}
func (r *RobotsData) TestAgent(path, agent string) bool {
@@ -123,7 +132,7 @@ func (r *RobotsData) TestAgent(path, agent string) bool {
// Find a group of rules that applies to this agent
// From google's spec:
// The user-agent is non-case-sensitive.
- if g := r.findGroup(agent); g != nil {
+ if g := r.FindGroup(agent); g != nil {
// Find a rule that applies to this url
if r := g.findRule(path); r != nil {
return r.allow
@@ -141,7 +150,7 @@ func (r *RobotsData) TestAgent(path, agent string) bool {
// with the most specific user-agent that still matches. All other groups of
// records are ignored by the crawler. The user-agent is non-case-sensitive.
// The order of the groups within the robots.txt file is irrelevant.
-func (r *RobotsData) findGroup(agent string) (ret *group) {
+func (r *RobotsData) FindGroup(agent string) (ret *Group) {
var prefixLen int
agent = strings.ToLower(agent)
@@ -162,6 +171,14 @@ func (r *RobotsData) findGroup(agent string) (ret *group) {
return
}
+func (g *Group) Test(path string) bool {
+ if r := g.findRule(path); r != nil {
+ return r.allow
+ }
+
+ return true
+}
+
// From google's spec:
// The path value is used as a basis to determine whether or not a rule applies
// to a specific URL on a site. With the exception of wildcards, the path is
@@ -172,7 +189,7 @@ func (r *RobotsData) findGroup(agent string) (ret *group) {
// the most specific rule based on the length of the [path] entry will trump
// the less specific (shorter) rule. The order of precedence for rules with
// wildcards is undefined.
-func (g *group) findRule(path string) (ret *rule) {
+func (g *Group) findRule(path string) (ret *rule) {
var prefixLen int
for _, r := range g.rules {
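The headline addition in robotstxt.go is FromResponse(*http.Response): it reads the body and dispatches on the status code (2xx parses the content, 4xx returns the permissive allow-all data, anything else the conservative disallow-all). Note it reads res.Body but does not close it, so the caller still owns the close. A hedged usage sketch (import path assumed, error handling abbreviated):

```go
package main

import (
	"fmt"
	"log"
	"net/http"

	"github.com/temoto/robotstxt" // assumed import path
)

func main() {
	res, err := http.Get("https://example.com/robots.txt")
	if err != nil {
		log.Fatal(err)
	}
	defer res.Body.Close() // FromResponse reads the body but leaves closing to the caller

	robots, err := robotstxt.FromResponse(res)
	if err != nil {
		log.Fatal(err)
	}

	// Resolve the agent's group once, then test many paths against it.
	if g := robots.FindGroup("MyCrawler/1.0"); g != nil {
		fmt.Println(g.Test("/some/path"), g.CrawlDelay)
	}
}
```

FromResponseContent remains available for callers that already have the status code and body in hand, so nothing forces a dependency on net/http.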
robotstxt_test.go (48 changed lines)
@@ -5,16 +5,16 @@ import (
)
func TestFromResponseBasic(t *testing.T) {
- if _, err := FromResponse(200, "", true); err != nil {
+ if _, err := FromResponseContent(200, ""); err != nil {
t.Fatal("FromResponse MUST accept 200/\"\"")
}
- if _, err := FromResponse(401, "", true); err != nil {
+ if _, err := FromResponseContent(401, ""); err != nil {
t.Fatal("FromResponse MUST accept 401/\"\"")
}
- if _, err := FromResponse(403, "", true); err != nil {
+ if _, err := FromResponseContent(403, ""); err != nil {
t.Fatal("FromResponse MUST accept 403/\"\"")
}
- if _, err := FromResponse(404, "", true); err != nil {
+ if _, err := FromResponseContent(404, ""); err != nil {
t.Fatal("FromResponse MUST accept 404/\"\"")
}
}
@@ -37,41 +37,41 @@ func ExpectDisallow(r *RobotsData, t *testing.T, msg string) {
}
func TestResponse401(t *testing.T) {
- r, _ := FromResponse(401, "", true)
+ r, _ := FromResponseContent(401, "")
ExpectAllow(r, t, "FromResponse(401, \"\") MUST allow everything.")
}
func TestResponse403(t *testing.T) {
- r, _ := FromResponse(403, "", true)
+ r, _ := FromResponseContent(403, "")
ExpectAllow(r, t, "FromResponse(403, \"\") MUST allow everything.")
}
func TestResponse404(t *testing.T) {
- r, _ := FromResponse(404, "", true)
+ r, _ := FromResponseContent(404, "")
ExpectAllow(r, t, "FromResponse(404, \"\") MUST allow everything.")
}
func TestFromStringBasic(t *testing.T) {
- if _, err := FromString("", true); err != nil {
+ if _, err := FromString(""); err != nil {
t.Fatal("FromString MUST accept \"\"")
}
}
func TestFromStringEmpty(t *testing.T) {
- r, _ := FromString("", true)
+ r, _ := FromString("")
if allow := r.TestAgent("/", "Somebot"); !allow {
t.Fatal("FromString(\"\") MUST allow everything.")
}
}
func TestFromStringComment(t *testing.T) {
- if _, err := FromString("# comment", true); err != nil {
+ if _, err := FromString("# comment"); err != nil {
t.Fatal("FromString MUST accept \"# comment\"")
}
}
func TestFromString001(t *testing.T) {
- r, err := FromString("User-Agent: *\r\nDisallow: /\r\n", true)
+ r, err := FromString("User-Agent: *\r\nDisallow: /\r\n")
if err != nil {
t.Fatal(err.Error())
}
@@ -82,7 +82,7 @@ func TestFromString001(t *testing.T) {
}
func TestFromString002(t *testing.T) {
- r, err := FromString("User-Agent: *\r\nDisallow: /account\r\n", true)
+ r, err := FromString("User-Agent: *\r\nDisallow: /account\r\n")
if err != nil {
t.Fatal(err.Error())
}
@@ -95,7 +95,7 @@ func TestFromString002(t *testing.T) {
const robots_text_001 = "User-agent: * \nDisallow: /administrator/\nDisallow: /cache/\nDisallow: /components/\nDisallow: /editor/\nDisallow: /forum/\nDisallow: /help/\nDisallow: /images/\nDisallow: /includes/\nDisallow: /language/\nDisallow: /mambots/\nDisallow: /media/\nDisallow: /modules/\nDisallow: /templates/\nDisallow: /installation/\nDisallow: /getcid/\nDisallow: /tooltip/\nDisallow: /getuser/\nDisallow: /download/\nDisallow: /index.php?option=com_phorum*,quote=1\nDisallow: /index.php?option=com_phorum*phorum_query=search\nDisallow: /index.php?option=com_phorum*,newer\nDisallow: /index.php?option=com_phorum*,older\n\nUser-agent: Yandex\nAllow: /\nSitemap: http://www.pravorulya.com/sitemap.xml\nSitemap: http://www.pravorulya.com/sitemap1.xml"
func TestFromString003(t *testing.T) {
- r, err := FromString(robots_text_001, true)
+ r, err := FromString(robots_text_001)
if err != nil {
t.Fatal(err.Error())
}
@@ -106,7 +106,7 @@ func TestFromString003(t *testing.T) {
}
func TestFromString004(t *testing.T) {
- r, err := FromString(robots_text_001, true)
+ r, err := FromString(robots_text_001)
if err != nil {
t.Fatal(err.Error())
}
@@ -118,7 +118,7 @@ func TestFromString004(t *testing.T) {
func TestInvalidEncoding(t *testing.T) {
// Invalid UTF-8 encoding should not break parser.
- _, err := FromString("User-agent: H\xef\xbf\xbdm�h�kki\nDisallow: *", true)
+ _, err := FromString("User-agent: H\xef\xbf\xbdm�h�kki\nDisallow: *")
if err != nil {
t.Fatal(err.Error())
}
@@ -129,7 +129,7 @@ const robots_text_002 = ("User-agent: *\nDisallow: /search\nDisallow: /groups\nD
"Disallow: /cl2/ical/\nDisallow: /coop/directory\nDisallow: /coop/manage\nDisallow: /trends?\nDisallow: /trends/music?\nDisallow: /trends/hottrends?\nDisallow: /trends/viz?\nDisallow: /notebook/search?\nDisallow: /musica\nDisallow: /musicad\nDisallow: /musicas\nDisallow: /musicl\nDisallow: /musics\nDisallow: /musicsearch\nDisallow: /musicsp\nDisallow: /musiclp\nDisallow: /browsersync\nDisallow: /call\nDisallow: /archivesearch?\nDisallow: /archivesearch/url\nDisallow: /archivesearch/advanced_search\nDisallow: /base/reportbadoffer\nDisallow: /urchin_test/\nDisallow: /movies?\nDisallow: /codesearch?\nDisallow: /codesearch/feeds/search?\nDisallow: /wapsearch?\nDisallow: /safebrowsing\nAllow: /safebrowsing/diagnostic\nAllow: /safebrowsing/report_error/\nAllow: /safebrowsing/report_phish/\nDisallow: /reviews/search?\nDisallow: /orkut/albums\nAllow: /jsapi\nDisallow: /views?\nDisallow: /c/\nDisallow: /cbk\nDisallow: /recharge/dashboard/car\nDisallow: /recharge/dashboard/static/\nDisallow: /translate_a/\nDisallow: /translate_c\nDisallow: /translate_f\nDisallow: /translate_static/\nDisallow: /translate_suggestion\nDisallow: /profiles/me\nAllow: /profiles\nDisallow: /s2/profiles/me\nAllow: /s2/profiles\nAllow: /s2/photos\nAllow: /s2/static\nDisallow: /s2\nDisallow: /transconsole/portal/\nDisallow: /gcc/\nDisallow: /aclk\nDisallow: /cse?\nDisallow: /cse/home\nDisallow: /cse/panel\nDisallow: /cse/manage\nDisallow: /tbproxy/\nDisallow: /imesync/\nDisallow: /shenghuo/search?\nDisallow: /support/forum/search?\nDisallow: /reviews/polls/\nDisallow: /hosted/images/\nDisallow: /ppob/?\nDisallow: /ppob?\nDisallow: /ig/add?\nDisallow: /adwordsresellers\nDisallow: /accounts/o8\nAllow: /accounts/o8/id\nDisallow: /topicsearch?q=\nDisallow: /xfx7/\nDisallow: /squared/api\nDisallow: /squared/search\nDisallow: /squared/table\nDisallow: /toolkit/\nAllow: /toolkit/*.html\nDisallow: /globalmarketfinder/\nAllow: /globalmarketfinder/*.html\nDisallow: /qnasearch?\nDisallow: /errors/\nDisallow: /app/updates\nDisallow: /sidewiki/entry/\nDisallow: /quality_form?\nDisallow: /labs/popgadget/search\nDisallow: /buzz/post\nDisallow: /compressiontest/\nDisallow: /analytics/reporting/\nDisallow: /analytics/admin/\nDisallow: /analytics/web/\nDisallow: /analytics/feeds/\nDisallow: /analytics/settings/\nDisallow: /alerts/\nDisallow: /phone/compare/?\nAllow: /alerts/manage\nSitemap: http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml\nSitemap: http://www.google.com/hostednews/sitemap_index.xml\nSitemap: http://www.google.com/ventures/sitemap_ventures.xml\nSitemap: http://www.google.com/sitemaps_webmasters.xml\nSitemap: http://www.gstatic.com/trends/websites/sitemaps/sitemapindex.xml\nSitemap: http://www.gstatic.com/dictionary/static/sitemaps/sitemap_index.xml")
func TestFromString005(t *testing.T) {
- r, err := FromString(robots_text_002, true)
+ r, err := FromString(robots_text_002)
if err != nil {
t.Fatal(err.Error())
}
@@ -137,7 +137,7 @@ func TestFromString005(t *testing.T) {
}
func TestFromString006(t *testing.T) {
- r, err := FromString(robots_text_002, true)
+ r, err := FromString(robots_text_002)
if err != nil {
t.Fatal(err.Error())
}
@@ -150,7 +150,7 @@ func TestFromString006(t *testing.T) {
const robots_text_003 = "User-Agent: * \nAllow: /"
func TestFromString007(t *testing.T) {
- r, err := FromString(robots_text_003, true)
+ r, err := FromString(robots_text_003)
if err != nil {
t.Fatal(err.Error())
}
@@ -163,7 +163,7 @@ func TestFromString007(t *testing.T) {
const robots_text_004 = "User-Agent: * \nDisallow: "
func TestFromString008(t *testing.T) {
- r, err := FromString(robots_text_004, true)
+ r, err := FromString(robots_text_004)
if err != nil {
t.Log(robots_text_004)
t.Fatal(err.Error())
@@ -180,7 +180,7 @@ User-agent: *
Disallow: /`
func TestRobotstxtOrgCase1(t *testing.T) {
- if r, err := FromString(robots_text_005, false); err != nil {
+ if r, err := FromString(robots_text_005); err != nil {
t.Fatal(err.Error())
} else if allow := r.TestAgent("/path/page1.html", "SomeBot"); allow {
t.Fatal("Must disallow.")
@@ -188,7 +188,7 @@ func TestRobotstxtOrgCase1(t *testing.T) {
}
func TestRobotstxtOrgCase2(t *testing.T) {
- if r, err := FromString(robots_text_005, false); err != nil {
+ if r, err := FromString(robots_text_005); err != nil {
t.Fatal(err.Error())
} else if allow := r.TestAgent("/path/page1.html", "Googlebot"); !allow {
t.Fatal("Must allow.")
@@ -197,20 +197,20 @@ func TestRobotstxtOrgCase2(t *testing.T) {
func BenchmarkParseFromString001(b *testing.B) {
for i := 0; i < b.N; i++ {
- FromString(robots_text_001, false)
+ FromString(robots_text_001)
b.SetBytes(int64(len(robots_text_001)))
}
}
func BenchmarkParseFromString002(b *testing.B) {
for i := 0; i < b.N; i++ {
- FromString(robots_text_002, false)
+ FromString(robots_text_002)
b.SetBytes(int64(len(robots_text_002)))
}
}
func BenchmarkParseFromResponse401(b *testing.B) {
for i := 0; i < b.N; i++ {
- FromResponse(401, "", false)
+ FromResponseContent(401, "")
}
}
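The test renames above all follow the same mechanical pattern, and the 401/403/404 cases pin down the behavior documented in robotstxt.go: a 4xx response yields allow-all, while a 2xx parses the body. A brief hedged illustration (import path assumed):

```go
package main

import (
	"fmt"
	"log"

	"github.com/temoto/robotstxt" // assumed import path
)

func main() {
	// A 4xx for robots.txt means "no restrictions": everything is allowed.
	r, err := robotstxt.FromResponseContent(404, "")
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(r.TestAgent("/anything", "AnyBot")) // true

	// A 2xx parses the body as usual.
	r, err = robotstxt.FromResponseContent(200, "User-agent: *\nDisallow: /\n")
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(r.TestAgent("/anything", "AnyBot")) // false
}
```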