Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow testing of sites published to subfolders #215

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,7 @@ htmltest uses a YAML configuration file. Put `.htmltest.yml` in the same directo
| `DirectoryIndex` | The file to look for when linking to a directory. | `index.html` |
| `FilePath` | Single file to test within `DirectoryPath`, omit to test all. | |
| `FileExtension` | Extension of your HTML documents, includes the dot. If `FilePath` is set we use the extension from that. | `.html` |
| `BaseURL` | Publication URL of the site, including subfolder if applicable. | |
| `CheckDoctype` | Enables checking the document type declaration. | `true` |
| `CheckAnchors` | Enables checking `<a…` tags. | `true` |
| `CheckLinks` | Enables checking `<link…` tags. | `true` |
Expand All @@ -149,6 +150,7 @@ htmltest uses a YAML configuration file. Put `.htmltest.yml` in the same directo
| `CheckTel` | Enables–albeit quite basic–`tel:` link checking. | `true` |
| `CheckFavicon` | Enables favicon checking, ensures every page has a favicon set. | `false` |
| `CheckMetaRefresh` | Enables checking meta refresh tags. | `true` |
| `CheckSelfReferencesAsInternal` | Check external references starting with `BaseURL` as though they are internal references. | `false` |
| `EnforceHTML5` | Fails when the doctype isn't `<!DOCTYPE html>`. | `false` |
| `EnforceHTTPS` | Fails when encountering an `http://` link. Useful to prevent mixed content errors when serving over HTTPS. | `false` |
| `IgnoreURLs` | Array of regexes of URLs to ignore. | empty |
Expand Down
15 changes: 14 additions & 1 deletion htmldoc/document_store.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,19 @@
package htmldoc

import (
"net/url"
"os"
"path"
"regexp"
"strings"

"github.com/wjdp/htmltest/output"
)

// DocumentStore struct, store of Documents including Document discovery
type DocumentStore struct {
BasePath string // Path, relative to cwd, the site is located in
BaseURL *url.URL // Base URL of the site
IgnorePatterns []interface{} // Regexes of directories to ignore
Documents []*Document // All of the documents, used to iterate over
DocumentPathMap map[string]*Document // Maps slash separated paths to documents
Expand Down Expand Up @@ -104,7 +107,17 @@ func (dS *DocumentStore) ResolvePath(refPath string) (*Document, bool) {

if refPath[0] == '/' && len(refPath) > 1 {
// Is an absolute link, remove the leading slash for map lookup
refPath = refPath[1:]
if dS.BaseURL == nil {
// No base URL, so `/` means our root
refPath = refPath[1:]
} else {
// We have a Base URL, so we need to strip off the base path if present
refPath = strings.TrimPrefix(refPath, dS.BaseURL.Path)

// We want to end up with a relative path, so remove leading '/' if present
// (This happens if BaseURL does *not* end in '/')
refPath = strings.TrimPrefix(refPath, "/")
}
}

// Try path as-is, path.ext
Expand Down
25 changes: 23 additions & 2 deletions htmltest/check-link.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,9 @@ import (
)

// ignoredRels: List of rel values to ignore, dns-prefetch and preconnect are ignored as they are not links to be
// followed rather telling browser we want something on that host, if the root of that host is not valid,
// it's likely not a problem.
//
// followed rather telling browser we want something on that host, if the root of that host is not valid,
// it's likely not a problem.
var ignoredRels = [...]string{"dns-prefetch", "preconnect"}

func (hT *HTMLTest) checkLink(document *htmldoc.Document, node *html.Node) {
Expand Down Expand Up @@ -139,6 +140,26 @@ func (hT *HTMLTest) checkExternal(ref *htmldoc.Reference) {
return
}

// Is this an external reference to a local file?
if hT.opts.CheckSelfReferencesAsInternal && hT.documentStore.BaseURL != nil {

if ref.URL.Host == hT.documentStore.BaseURL.Host &&
hT.documentStore.BaseURL.User == nil &&
strings.HasPrefix(ref.URL.Path, hT.documentStore.BaseURL.Path) {
// Convert to internal reference
internalURL := *ref.URL
internalURL.Scheme = ""
internalURL.Host = ""

internalRef := *ref
internalRef.URL = &internalURL
internalRef.Path = internalURL.String()

hT.checkInternal(&internalRef)
return
}
}

urlStr := ref.URLString()

// Does this url match an url ignore rule?
Expand Down
47 changes: 47 additions & 0 deletions htmltest/check-link_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -767,6 +767,53 @@ func TestAnchorBlankHTML4(t *testing.T) {
tExpectIssueCount(t, hT2, 1)
}

// TestAnchorInternalAbsolute runs the absoluteLinks fixture with default
// options and expects its root-relative links to produce no issues.
func TestAnchorInternalAbsolute(t *testing.T) {
	const fixture = "fixtures/links/absoluteLinks.html"

	result := tTestFile(fixture)
	tExpectIssueCount(t, result, 0)
}

// TestAnchorInternalAbsoluteRootPublication runs the root-publication fixture
// with a BaseURL that has no path component and expects no issues from its
// root-relative links.
func TestAnchorInternalAbsoluteRootPublication(t *testing.T) {
	const fixture = "fixtures/links/absoluteLinksRootPublication.html"
	opts := map[string]interface{}{"BaseURL": "http://example.com"}

	result := tTestFileOpts(fixture, opts)
	tExpectIssueCount(t, result, 0)
}

// TestAnchorInternalAbsoluteFolderPublication runs the folder-publication
// fixture with a BaseURL whose path is "/blog" and expects no issues from
// root-relative links that carry that prefix.
func TestAnchorInternalAbsoluteFolderPublication(t *testing.T) {
	const fixture = "fixtures/links/absoluteLinksFolderPublication.html"
	opts := map[string]interface{}{"BaseURL": "http://www.example.com/blog"}

	result := tTestFileOpts(fixture, opts)
	tExpectIssueCount(t, result, 0)
}

// TestAnchorInternalBrokenAbsoluteFolderPublication runs the broken-links
// folder-publication fixture with a BaseURL whose path is "/blog" and expects
// exactly two issues to be reported.
func TestAnchorInternalBrokenAbsoluteFolderPublication(t *testing.T) {
	const fixture = "fixtures/links/absoluteBrokenLinksFolderPublication.html"
	opts := map[string]interface{}{"BaseURL": "http://www.example.com/blog"}

	result := tTestFileOpts(fixture, opts)
	tExpectIssueCount(t, result, 2)
}

// TestAnchorExternalLinksToInternalFiles enables CheckSelfReferencesAsInternal
// with a "/blog" BaseURL and expects no issues from the fixture's absolute
// links under that BaseURL.
func TestAnchorExternalLinksToInternalFiles(t *testing.T) {
	const fixture = "fixtures/links/externalLinksToInternalFiles.html"
	opts := map[string]interface{}{
		"BaseURL":                       "http://www.example.com/blog",
		"CheckSelfReferencesAsInternal": true,
	}

	result := tTestFileOpts(fixture, opts)
	tExpectIssueCount(t, result, 0)
}

// TestAnchorExternalLinksToExternalPages enables CheckSelfReferencesAsInternal
// with a "/blog" BaseURL and expects two issues from the fixture, whose links
// share the host but sit under a different subfolder.
func TestAnchorExternalLinksToExternalPages(t *testing.T) {
	const fixture = "fixtures/links/externalLinksToExternalPages.html"
	opts := map[string]interface{}{
		"BaseURL":                       "http://www.example.com/blog",
		"CheckSelfReferencesAsInternal": true,
	}

	result := tTestFileOpts(fixture, opts)
	tExpectIssueCount(t, result, 2)
}

func TestSelfSignedLink(t *testing.T) {
tSkipShortExternal(t)
hT := tTestFileOpts("fixtures/links/selfSignedLink.html",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
<a href="/blog/missing.html">Missing file in root of site</a>
<a href="/blog/nested/missing.html">Missing file in folder of site</a>
2 changes: 2 additions & 0 deletions htmltest/fixtures/links/absoluteLinks.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
<a href="/anchors_in_pre.html">Relative to root</a>
<a href="/check_just_once.html">Also relative to root</a>
2 changes: 2 additions & 0 deletions htmltest/fixtures/links/absoluteLinksFolderPublication.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
<a href="/blog/anchors_in_pre.html">Relative to root</a>
<a href="/blog/check_just_once.html">Also relative to root</a>
2 changes: 2 additions & 0 deletions htmltest/fixtures/links/absoluteLinksRootPublication.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
<a href="/anchors_in_pre.html">Relative to root</a>
<a href="/check_just_once.html">Also relative to root</a>
2 changes: 2 additions & 0 deletions htmltest/fixtures/links/externalLinksToExternalPages.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
<a href="http://www.example.com/folder/index.html">Path 'folder' does not match, should not be mapped to internal file</a>
<a href="https://www.example.com/issues/94.html">Path 'issues' does not match, should not map to known file</a>
2 changes: 2 additions & 0 deletions htmltest/fixtures/links/externalLinksToInternalFiles.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
<a href="http://www.example.com/blog/anchors_in_pre.html">External HTTP reference</a>
<a href="https://www.example.com/blog/check_just_once.html">External HTTPS reference</a>
12 changes: 12 additions & 0 deletions htmltest/htmltest.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"errors"
"fmt"
"net/http"
"net/url"
"os"
"path"
"strings"
Expand Down Expand Up @@ -149,6 +150,17 @@ func Test(optsUser map[string]interface{}) (*HTMLTest, error) {
hT.documentStore.DirectoryIndex = hT.opts.DirectoryIndex
hT.documentStore.IgnorePatterns = hT.opts.IgnoreDirs
hT.documentStore.IgnoreTagAttribute = hT.opts.IgnoreTagAttribute

if hT.opts.BaseURL != "" {
baseURL, err := url.Parse(hT.opts.BaseURL)
if err != nil {
err := fmt.Errorf("Could not parse BaseURL '%s': %w", hT.opts.BaseURL, err)
return &hT, err
}

hT.documentStore.BaseURL = baseURL
}

// Discover documents
hT.documentStore.Discover()

Expand Down
17 changes: 10 additions & 7 deletions htmltest/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ type Options struct {
FilePath string
FileExtension string

BaseURL string

CheckDoctype bool
CheckAnchors bool
CheckLinks bool
Expand All @@ -27,13 +29,14 @@ type Options struct {
CheckMeta bool
CheckGeneric bool

CheckExternal bool
CheckInternal bool
CheckInternalHash bool
CheckMailto bool
CheckTel bool
CheckFavicon bool
CheckMetaRefresh bool
CheckExternal bool
CheckInternal bool
CheckInternalHash bool
CheckMailto bool
CheckTel bool
CheckFavicon bool
CheckMetaRefresh bool
CheckSelfReferencesAsInternal bool

EnforceHTML5 bool
EnforceHTTPS bool
Expand Down