Skip to content

Commit

Permalink
'Host' directive parsing added. It's often used in Yandex to specify …
Browse files Browse the repository at this point in the history
  • Loading branch information
brainm committed Jun 26, 2015
1 parent c51489e commit b8a039b
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 2 deletions.
11 changes: 10 additions & 1 deletion parser.go
Expand Up @@ -26,6 +26,7 @@ const (
lDisallow
lCrawlDelay
lSitemap
lHost
)

type parser struct {
Expand All @@ -45,7 +46,7 @@ func newParser(tokens []string) *parser {
return &parser{tokens: tokens}
}

func (p *parser) parseAll() (groups []*Group, sitemaps []string, errs []error) {
func (p *parser) parseAll() (groups []*Group, host string, sitemaps []string, errs []error) {
var curGroup *Group
var isEmptyGroup bool

Expand Down Expand Up @@ -106,6 +107,9 @@ func (p *parser) parseAll() (groups []*Group, sitemaps []string, errs []error) {
curGroup.rules = append(curGroup.rules, &rule{li.vs, true, nil})
}
}

case lHost:
host = li.vs

case lSitemap:
sitemaps = append(sitemaps, li.vs)
Expand Down Expand Up @@ -213,6 +217,11 @@ func (p *parser) parseLine() (li *lineInfo, err error) {
// When no path is specified, the directive is ignored.
return returnPathVal(lAllow)

case "host":
// Host directive to specify main site mirror
// Read more: https://help.yandex.com/webmaster/controlling-robot/robots-txt.xml#host
return returnStringVal(lHost)

case "sitemap":
// Non-group field, applies to the host as a whole, not to a specific user-agent
return returnStringVal(lSitemap)
Expand Down
3 changes: 2 additions & 1 deletion robotstxt.go
Expand Up @@ -22,6 +22,7 @@ type RobotsData struct {
groups []*Group
allowAll bool
disallowAll bool
Host string
Sitemaps []string
}

Expand Down Expand Up @@ -124,7 +125,7 @@ func FromBytes(body []byte) (r *RobotsData, err error) {

r = &RobotsData{}
parser := newParser(tokens)
r.groups, r.Sitemaps, errs = parser.parseAll()
r.groups, r.Host, r.Sitemaps, errs = parser.parseAll()
if len(errs) > 0 {
return nil, newParseError(errs)
}
Expand Down
33 changes: 33 additions & 0 deletions robotstxt_test.go
Expand Up @@ -195,6 +195,39 @@ func TestRobotstxtOrgCase2(t *testing.T) {
}
}

// robots_text_006 exercises a bare Host directive with an ASCII hostname.
const robots_text_006 = `
Host: site.ru`

// TestRobotstxtHostCase1 verifies that a plain "Host:" directive is parsed
// and exposed via RobotsData.Host.
func TestRobotstxtHostCase1(t *testing.T) {
	r, err := FromString(robots_text_006)
	if err != nil {
		t.Fatal(err.Error())
	}
	if r.Host != "site.ru" {
		t.Fatal("Incorrect host detection")
	}
}

// robots_text_007 contains only a commented-out Host directive.
const robots_text_007 = `
#Host: site.ru`

// TestRobotstxtHostCase2 verifies that a "#Host:" line is treated as a
// comment and leaves RobotsData.Host empty.
func TestRobotstxtHostCase2(t *testing.T) {
	r, err := FromString(robots_text_007)
	if err != nil {
		t.Fatal(err.Error())
	}
	if r.Host != "" {
		t.Fatal("Incorrect host detection")
	}
}

// robots_text_008 uses a non-ASCII (Cyrillic IDN) hostname in the Host
// directive, as permitted for Yandex-style mirrors.
const robots_text_008 = `
Host: яндекс.рф`

// TestRobotstxtHostCase3 verifies that a Unicode hostname survives parsing
// unchanged into RobotsData.Host.
func TestRobotstxtHostCase3(t *testing.T) {
	r, err := FromString(robots_text_008)
	if err != nil {
		t.Fatal(err.Error())
	}
	if r.Host != "яндекс.рф" {
		t.Fatal("Incorrect host detection")
	}
}

// robots_text_errs is a deliberately malformed robots.txt fixture:
// a Disallow before any User-agent line, and a non-numeric Crawl-delay
// value. Presumably used by an error-path test outside this view —
// TODO confirm against the full test file.
const robots_text_errs = `Disallow: /
User-agent: Google
Crawl-delay: fail`
Expand Down

0 comments on commit b8a039b

Please sign in to comment.