Skip to content

Commit

Permalink
toml,xml: Fail fast on invalid content
Browse files Browse the repository at this point in the history
encoding/xml and github.com/BurntSushi/toml both read a lot of input before
detecting that they can't decode it. Now we instead read one UTF-8 rune first
and make sure it's valid XML or TOML.

Should speed up probing

Related to #586 bigzero-zip.zip
  • Loading branch information
wader committed Feb 22, 2023
1 parent aaf60ec commit 56edb59
Show file tree
Hide file tree
Showing 3 changed files with 80 additions and 7 deletions.
6 changes: 3 additions & 3 deletions format/toml/testdata/toml.fqtest
Original file line number Diff line number Diff line change
Expand Up @@ -63,9 +63,9 @@ true = true
toml: top-level values must be Go maps or structs
----

error at position 0x0: root object has no values
error at position 0x0: EOF
----
$ fq -n '"" | from_toml'
$ fq -n '" " | from_toml'
exitcode: 5
stderr:
error: error at position 0x0: root object has no values
error: error at position 0x1: root object has no values
35 changes: 33 additions & 2 deletions format/toml/toml.go
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
package toml

import (
"bufio"
"bytes"
"embed"
"fmt"
"io"
"unicode/utf8"

"github.com/BurntSushi/toml"
"github.com/wader/fq/format"
Expand All @@ -29,11 +33,38 @@ func init() {
interp.RegisterFunc0("to_toml", toTOML)
}

// decodeTOMLSeekFirstValidRune reads the first rune from br and then seeks br
// back to the start, so that a decoder can consume the input from the
// beginning afterwards.
//
// It returns the read error as-is if no rune could be read (br is not
// rewound in that case), the seek error if rewinding fails, and a descriptive
// error if the first rune is an invalid UTF-8 encoding or a null byte.
// Otherwise it returns nil.
func decodeTOMLSeekFirstValidRune(br io.ReadSeeker) error {
	first, width, readErr := bufio.NewReader(br).ReadRune()
	if readErr != nil {
		return readErr
	}

	// Rewind before validating so br is back at the start on every path below.
	if _, seekErr := br.Seek(0, io.SeekStart); seekErr != nil {
		return seekErr
	}

	switch {
	case first == utf8.RuneError && width == 1:
		// A one byte wide RuneError means the bytes could not be decoded; a
		// properly encoded U+FFFD is wider than one byte.
		return fmt.Errorf("invalid UTF-8")
	case first == 0:
		return fmt.Errorf("TOML can't contain null bytes")
	default:
		return nil
	}
}

func decodeTOML(d *decode.D) any {
br := d.RawLen(d.Len())
bbr := d.RawLen(d.Len())
var r any

if _, err := toml.NewDecoder(bitio.NewIOReader(br)).Decode(&r); err != nil {
br := bitio.NewIOReadSeeker(bbr)

// github.com/BurntSushi/toml currently does a ReadAll, which might be expensive,
// so try to detect invalid TOML (null bytes etc.) faster and more efficiently first
if err := decodeTOMLSeekFirstValidRune(br); err != nil {
d.Fatalf("%s", err)
}

if _, err := toml.NewDecoder(br).Decode(&r); err != nil {
d.Fatalf("%s", err)
}
var s scalar.Any
Expand Down
46 changes: 44 additions & 2 deletions format/xml/xml.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,18 @@ package xml
// TODO: rewrite ns stack

import (
"bufio"
"bytes"
"embed"
"encoding/xml"
"errors"
"fmt"
"html"
"io"
"regexp"
"strconv"
"strings"
"unicode/utf8"

"github.com/wader/fq/format"
"github.com/wader/fq/internal/gojqex"
Expand Down Expand Up @@ -247,15 +250,54 @@ func fromXMLToArray(n xmlNode) any {
return f(n, nil)
}

// from golang encoding/xml, copyright 2009 The Go Authors
//
// isInCharacterRange reports whether r is allowed by the Char production of
// https://www.xml.com/axml/testaxml.htm, Section 2.2 Characters: tab, line
// feed, carriage return, or a code point in one of the three ranges below.
func isInCharacterRange(r rune) (inrange bool) {
	switch {
	case r == 0x09, r == 0x0A, r == 0x0D:
		return true
	case 0x20 <= r && r <= 0xD7FF:
		return true
	case 0xE000 <= r && r <= 0xFFFD:
		return true
	case 0x10000 <= r && r <= 0x10FFFF:
		return true
	}
	return false
}

// decodeXMLSeekFirstValidRune reads the first rune from br and then seeks br
// back to the start, so that a decoder can consume the input from the
// beginning afterwards.
//
// It returns the read error as-is if no rune could be read (br is not
// rewound in that case), the seek error if rewinding fails, and a descriptive
// error if the first rune is an invalid UTF-8 encoding or is rejected by
// isInCharacterRange. Otherwise it returns nil.
func decodeXMLSeekFirstValidRune(br io.ReadSeeker) error {
	peek := bufio.NewReader(br)
	first, width, err := peek.ReadRune()
	if err != nil {
		return err
	}

	// Rewind before validating so br is back at the start on every path below.
	if _, err = br.Seek(0, io.SeekStart); err != nil {
		return err
	}

	// A one byte wide RuneError means the bytes could not be decoded; a
	// properly encoded U+FFFD is wider than one byte.
	badEncoding := first == utf8.RuneError && width == 1
	switch {
	case badEncoding:
		return fmt.Errorf("invalid UTF-8")
	case !isInCharacterRange(first):
		return fmt.Errorf("illegal character code %U", first)
	}

	return nil
}

func decodeXML(d *decode.D) any {
var xi format.XMLIn
d.ArgAs(&xi)

br := d.RawLen(d.Len())
bbr := d.RawLen(d.Len())
var r any
var err error

xd := xml.NewDecoder(bitio.NewIOReader(br))
br := bitio.NewIOReadSeeker(bbr)

// this reimplements the same xml rune range validation as encoding/xml but fails faster
if err := decodeXMLSeekFirstValidRune(br); err != nil {
d.Fatalf("%s", err)
}

xd := xml.NewDecoder(br)

xd.Strict = false
var n xmlNode
if err := xd.Decode(&n); err != nil {
Expand Down

0 comments on commit 56edb59

Please sign in to comment.