Skip to content

Commit

Permalink
Properly parse URI
Browse files Browse the repository at this point in the history
Use URI parsing code based on Go's net/url package to validate hostnames.
  • Loading branch information
erikdubbelboer committed Oct 1, 2021
1 parent 711e421 commit 542a203
Show file tree
Hide file tree
Showing 3 changed files with 245 additions and 1 deletion.
2 changes: 1 addition & 1 deletion allocation_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ func TestAllocationClient(t *testing.T) {
}

func TestAllocationURI(t *testing.T) {
uri := []byte("http://username:password@example.com/some/path?foo=bar#test")
uri := []byte("http://username:password@hello.%e4%b8%96%e7%95%8c.com/some/path?foo=bar#test")

n := testing.AllocsPerRun(100, func() {
u := AcquireURI()
Expand Down
221 changes: 221 additions & 0 deletions uri.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"errors"
"fmt"
"io"
"strconv"
"sync"
)

Expand Down Expand Up @@ -298,6 +299,10 @@ func (u *URI) parse(host, uri []byte, isTLS bool) error {
}
}

host, err := parseHost(host)
if err != nil {
return err
}
u.host = append(u.host, host...)
lowercaseBytes(u.host)

Expand Down Expand Up @@ -338,6 +343,222 @@ func (u *URI) parse(host, uri []byte, isTLS bool) error {
return nil
}

// parseHost validates and unescapes host, which is expected to be an
// authority without user information, i.e. host[:port].
//
// Two shapes are recognised:
//   - an IP-literal in square brackets, optionally followed by ":port"
//     (for example "[fe80::1]", "[fe80::1%25en0]", "[fe80::1]:80");
//   - anything else, where the text after the last ':' (if any) must be
//     a valid optional port.
//
// The unescaping is done in place by unescape, so the returned slice
// shares its backing array with host and the contents of host may be
// overwritten.
//
// Based on https://github.com/golang/go/blob/8ac5cbe05d61df0a7a7c9a38ff33305d4dcfea32/src/net/url/url.go#L619
func parseHost(host []byte) ([]byte, error) {
	bracketed := len(host) > 0 && host[0] == '['

	switch {
	case bracketed:
		// IP-literal as described by RFC 3986 and RFC 6874.
		closing := bytes.LastIndexByte(host, ']')
		if closing < 0 {
			return nil, errors.New("missing ']' in host")
		}
		if port := host[closing+1:]; !validOptionalPort(port) {
			return nil, fmt.Errorf("invalid port %q after host", port)
		}

		// RFC 6874: "%25" (a %-encoded percent sign) introduces the zone
		// identifier. The zone may use almost any %-encoding, whereas the
		// host proper may only %-encode non-ASCII bytes, so the two parts
		// are unescaped under different modes. Some restrictions are
		// still applied to the zone to reject things like newlines.
		zoneStart := bytes.Index(host[:closing], []byte("%25"))
		if zoneStart >= 0 {
			addr, err := unescape(host[:zoneStart], encodeHost)
			if err != nil {
				return nil, err
			}
			zone, err := unescape(host[zoneStart:closing], encodeZone)
			if err != nil {
				return nil, err
			}
			tail, err := unescape(host[closing:], encodeHost)
			if err != nil {
				return nil, err
			}
			// All three pieces live in host's backing array; stitch them
			// back together (zone+tail first, then onto addr) so the
			// result is contiguous.
			zone = append(zone, tail...)
			return append(addr, zone...), nil
		}

	default:
		if colon := bytes.LastIndexByte(host, ':'); colon >= 0 {
			if port := host[colon:]; !validOptionalPort(port) {
				return nil, fmt.Errorf("invalid port %q after host", port)
			}
		}
	}

	// No zone identifier: the whole authority is unescaped as a host.
	unescaped, err := unescape(host, encodeHost)
	if err != nil {
		return nil, err
	}
	return unescaped, nil
}

// encoding identifies which part of a URL is being processed, which in
// turn decides the set of bytes that are acceptable there.
type encoding int

const (
	// encodeHost selects the rules for the host part of an authority.
	encodeHost encoding = iota + 1
	// encodeZone selects the rules for an IPv6 zone identifier.
	encodeZone
)

// EscapeError is returned for a malformed or disallowed %-escape.
// Its value is the offending escape text.
type EscapeError string

// Error implements the error interface; the escape text is quoted with
// strconv.Quote so non-printable bytes are shown safely.
func (e EscapeError) Error() string {
	quoted := strconv.Quote(string(e))
	return "invalid URL escape " + quoted
}

// InvalidHostError is returned when a host contains a byte that is not
// allowed there. Its value is the offending character.
type InvalidHostError string

// Error implements the error interface; the character is quoted with
// strconv.Quote so non-printable bytes are shown safely.
func (e InvalidHostError) Error() string {
	quoted := strconv.Quote(string(e))
	return "invalid character " + quoted + " in host name"
}

// unescape validates s and decodes its %XX escapes; mode selects which
// part of the URL s belongs to and therefore which bytes and escapes
// are acceptable.
//
// The function works in two passes: the first one only validates and
// counts escapes, the second one decodes. Decoding is done in place, so
// the returned slice shares its backing array with s and the contents
// of s are overwritten. When s contains no escapes it is returned
// unchanged.
//
// It returns an EscapeError for a malformed or disallowed escape and an
// InvalidHostError for a literal ASCII byte that may not appear in the
// host.
//
// Based on https://github.com/golang/go/blob/8ac5cbe05d61df0a7a7c9a38ff33305d4dcfea32/src/net/url/url.go#L199
func unescape(s []byte, mode encoding) ([]byte, error) {
	hostLike := mode == encodeHost || mode == encodeZone

	// Pass 1: make sure every byte and every escape is acceptable.
	escapes := 0
	for pos := 0; pos < len(s); {
		c := s[pos]
		if c != '%' {
			// Only ASCII bytes are checked; bytes >= 0x80 pass through.
			if hostLike && c < 0x80 && shouldEscape(c, mode) {
				return nil, InvalidHostError(s[pos : pos+1])
			}
			pos++
			continue
		}

		escapes++
		if pos+2 >= len(s) || !ishex(s[pos+1]) || !ishex(s[pos+2]) {
			// Report at most the three bytes that should have formed
			// the escape.
			bad := s[pos:]
			if len(bad) > 3 {
				bad = bad[:3]
			}
			return nil, EscapeError(bad)
		}

		triplet := s[pos : pos+3]
		isPercent := string(triplet) == "%25"
		hi, lo := unhex(triplet[1]), unhex(triplet[2])

		switch mode {
		case encodeHost:
			// Per https://tools.ietf.org/html/rfc3986#page-21 the host
			// component may only %-encode non-ASCII bytes, while
			// https://tools.ietf.org/html/rfc6874#section-2 additionally
			// allows %25 to escape a percent sign in IPv6
			// scoped-address literals.
			if hi < 8 && !isPercent {
				return nil, EscapeError(triplet)
			}
		case encodeZone:
			// RFC 6874 allows nearly anything in a zone identifier, but
			// escapes are restricted here to bytes that would be valid
			// host bytes when written directly, so escaping cannot be
			// used to smuggle in otherwise forbidden bytes. A space is
			// tolerated because Windows puts spaces in zone names.
			decoded := hi<<4 | lo
			if !isPercent && decoded != ' ' && shouldEscape(decoded, encodeHost) {
				return nil, EscapeError(triplet)
			}
		}
		pos += 3
	}

	if escapes == 0 {
		return s, nil
	}

	// Pass 2: decode in place. The write index never overtakes the read
	// index because every escape shrinks three bytes into one.
	w := 0
	for r := 0; r < len(s); r++ {
		c := s[r]
		if c == '%' {
			c = unhex(s[r+1])<<4 | unhex(s[r+2])
			r += 2
		}
		s[w] = c
		w++
	}
	return s[:w], nil
}

// shouldEscape reports whether the byte c must be %-escaped when it
// appears in the part of a URL selected by mode, according to RFC 3986.
//
// Note that not all reserved characters are checked correctly yet.
// See golang.org/issue/5684.
//
// Based on https://github.com/golang/go/blob/8ac5cbe05d61df0a7a7c9a38ff33305d4dcfea32/src/net/url/url.go#L100
func shouldEscape(c byte, mode encoding) bool {
	// §2.3 Unreserved characters: alphanumerics and marks never need
	// escaping, whatever the mode.
	switch {
	case 'a' <= c && c <= 'z', 'A' <= c && c <= 'Z', '0' <= c && c <= '9':
		return false
	case c == '-', c == '_', c == '.', c == '~':
		return false
	}

	if mode != encodeHost && mode != encodeZone {
		// Everything else must be escaped.
		return true
	}

	// §3.2.2 Host allows
	//	sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
	// as part of reg-name.
	// ':' is accepted because :port is treated as part of the host.
	// '[' and ']' are accepted because [ipv6]:port is treated as part of
	// the host.
	// '<', '>' and '"' are accepted because they are the only characters
	// left that could possibly be allowed, and parsing would reject them
	// if they were escaped (hosts cannot use %-encoding for ASCII bytes).
	switch c {
	case '!', '$', '&', '\'', '(', ')', '*', '+', ',', ';', '=',
		':', '[', ']', '<', '>', '"':
		return false
	}

	// Everything else must be escaped.
	return true
}

// ishex reports whether c is an ASCII hexadecimal digit
// (0-9, a-f or A-F).
func ishex(c byte) bool {
	if '0' <= c && c <= '9' {
		return true
	}
	// Setting bit 0x20 folds 'A'-'F' onto 'a'-'f' and maps no other
	// byte into that range.
	folded := c | 0x20
	return 'a' <= folded && folded <= 'f'
}

// unhex returns the numeric value (0-15) of the ASCII hexadecimal digit
// c. Any byte that is not a hexadecimal digit yields 0.
func unhex(c byte) byte {
	if '0' <= c && c <= '9' {
		return c - '0'
	}
	// Setting bit 0x20 folds 'A'-'F' onto 'a'-'f' and maps no other
	// byte into that range.
	if folded := c | 0x20; 'a' <= folded && folded <= 'f' {
		return folded - 'a' + 10
	}
	return 0
}

// validOptionalPort reports whether port is acceptable as the optional
// ":port" suffix of an authority: it must either be empty or consist of
// a colon followed by zero or more ASCII digits (i.e. match /^:\d*$/).
func validOptionalPort(port []byte) bool {
	if len(port) == 0 {
		return true
	}
	if port[0] != ':' {
		return false
	}
	for i := 1; i < len(port); i++ {
		if d := port[i]; d < '0' || d > '9' {
			return false
		}
	}
	return true
}

func normalizePath(dst, src []byte) []byte {
dst = dst[:0]
dst = addLeadingSlash(dst, src)
Expand Down
23 changes: 23 additions & 0 deletions uri_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -365,6 +365,18 @@ func TestURIParse(t *testing.T) {

testURIParse(t, &u, "", "//aaa.com\r\n\r\nGET x",
"http:///", "", "/", "", "", "")

testURIParse(t, &u, "", "http://[fe80::1%25en0]/",
"http://[fe80::1%en0]/", "[fe80::1%en0]", "/", "/", "", "")

testURIParse(t, &u, "", "http://[fe80::1%25en0]:8080/",
"http://[fe80::1%en0]:8080/", "[fe80::1%en0]:8080", "/", "/", "", "")

testURIParse(t, &u, "", "http://hello.世界.com/foo",
"http://hello.世界.com/foo", "hello.世界.com", "/foo", "/foo", "", "")

testURIParse(t, &u, "", "http://hello.%e4%b8%96%e7%95%8c.com/foo",
"http://hello.世界.com/foo", "hello.世界.com", "/foo", "/foo", "", "")
}

func testURIParse(t *testing.T, u *URI, host, uri,
Expand Down Expand Up @@ -404,3 +416,14 @@ func TestURIWithQuerystringOverride(t *testing.T) {
t.Fatalf("Expected Querystring to be overridden but was %s ", uriString)
}
}

// TestInvalidUrl checks that parsing a URL whose authority is full of
// characters that are not allowed in a host reports an error.
func TestInvalidUrl(t *testing.T) {
	const rawURL = `https://.çèéà@&~!&:=\\/\"'~<>|+-*()[]{}%$;,¥&&$22|||<>< 4ly8lzjmoNx233AXELDtyaFQiiUH-fd8c-CnXUJVYnGIs4Uwr-bptom5GCnWtsGMQxeM2ZhoKE973eKgs2Sjh6RePnyaLpCi6SiNSLevcMoraARrp88L-SgtKqd-XHAtSI8hiPRiXPQmDIA4BGhSgoc0nfn1PoYuGKKmDcZ04tANRc3iz4aF4-A1UrO8bLHTH7MEJvzx.someqa.fr/A/?&QS_BEGIN<&8{b'Ob=p*f> QS_END`

	uri := AcquireURI()
	defer ReleaseURI(uri)

	// A nil error means the malformed URL was accepted.
	err := uri.Parse(nil, []byte(rawURL))
	if err == nil {
		t.Fail()
	}
}

0 comments on commit 542a203

Please sign in to comment.