Skip to content

Commit

Permalink
html: Add to probe group
Browse files Browse the repository at this point in the history
As decoders can now know that they are decoding as part of probing, we can now
use some heuristics to see if we should decode as html.
The reason heuristics are needed is that the x/html parser will always succeed.

Add lazyre package to help delay compile of RE and make it concurrency safe.
  • Loading branch information
wader committed May 11, 2023
1 parent f254b16 commit e2eb667
Show file tree
Hide file tree
Showing 16 changed files with 1,058 additions and 975 deletions.
1 change: 1 addition & 0 deletions doc/TODO.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
- Optimize `Interp.Options` calls, now called per display. Cache per eval? needs to handle nested evals.
- `<array decode value>[{start: ...: end: ...}]` syntax a bit broken.
- REPL completion might have side effects. Make interp.Function type know and wrap somehow? input, inputs, open, ...
- Rework group arguments so that `{is_probe:true}` is not needed. Look up group name and see if it has an argument somehow?

### TODO and ideas

Expand Down
2 changes: 1 addition & 1 deletion doc/formats.md
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@
|`ip_packet` |Group |<sub>`icmp` `icmpv6` `tcp_segment` `udp_datagram`</sub>|
|`link_frame` |Group |<sub>`bsd_loopback_frame` `ether8023_frame` `ipv4_packet` `ipv6_packet` `sll2_packet` `sll_packet`</sub>|
|`mp3_frame_tags` |Group |<sub>`mp3_frame_vbri` `mp3_frame_xing`</sub>|
|`probe` |Group |<sub>`adts` `aiff` `apple_bookmark` `ar` `avi` `avro_ocf` `bitcoin_blkdat` `bplist` `bzip2` `elf` `flac` `gif` `gzip` `jpeg` `json` `jsonl` `macho` `macho_fat` `matroska` `mp3` `mp4` `mpeg_ts` `ogg` `pcap` `pcapng` `png` `tar` `tiff` `toml` `tzif` `wasm` `wav` `webp` `xml` `yaml` `zip`</sub>|
|`probe` |Group |<sub>`adts` `aiff` `apple_bookmark` `ar` `avi` `avro_ocf` `bitcoin_blkdat` `bplist` `bzip2` `elf` `flac` `gif` `gzip` `html` `jpeg` `json` `jsonl` `macho` `macho_fat` `matroska` `mp3` `mp4` `mpeg_ts` `ogg` `pcap` `pcapng` `png` `tar` `tiff` `toml` `tzif` `wasm` `wav` `webp` `xml` `yaml` `zip`</sub>|
|`tcp_stream` |Group |<sub>`dns_tcp` `rtmp` `tls`</sub>|
|`udp_payload` |Group |<sub>`dns`</sub>|

Expand Down
1,880 changes: 943 additions & 937 deletions doc/formats.svg
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
1 change: 1 addition & 0 deletions format/all/all.fqtest
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ $ fq -n _registry.groups.probe
"mpeg_ts",
"wav",
"json",
"html",
"jsonl",
"toml",
"xml",
Expand Down
2 changes: 1 addition & 1 deletion format/ar/ar.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ func decodeAr(d *decode.D) any {
}
size := int64(sizeStr.SymUint()) * 8
d.FieldUTF8("ending_characters", 2)
d.FieldFormatOrRawLen("data", size, &probeGroup, nil)
d.FieldFormatOrRawLen("data", size, &probeGroup, format.Probe_In{})
padding := d.AlignBits(16)
if padding > 0 {
d.FieldRawLen("padding", int64(padding))
Expand Down
2 changes: 1 addition & 1 deletion format/bzip2/bzip2.go
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ func bzip2Decode(d *decode.D) any {
compressedStart := d.Pos()

readCompressedSize, uncompressedBR, dv, _, _ :=
d.TryFieldReaderRangeFormat("uncompressed", 0, d.Len(), bzip2.NewReader, &probeGroup, nil)
d.TryFieldReaderRangeFormat("uncompressed", 0, d.Len(), bzip2.NewReader, &probeGroup, format.Probe_In{})
if uncompressedBR != nil {
if dv == nil {
d.FieldRootBitBuf("uncompressed", uncompressedBR)
Expand Down
2 changes: 1 addition & 1 deletion format/gzip/gzip.go
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ func gzDecode(d *decode.D) any {

if rFn != nil {
readCompressedSize, uncompressedBR, dv, _, _ :=
d.TryFieldReaderRangeFormat("uncompressed", d.Pos(), d.BitsLeft(), rFn, &probeGroup, nil)
d.TryFieldReaderRangeFormat("uncompressed", d.Pos(), d.BitsLeft(), rFn, &probeGroup, format.Probe_In{})
if uncompressedBR != nil {
if dv == nil {
d.FieldRootBitBuf("uncompressed", uncompressedBR)
Expand Down
2 changes: 1 addition & 1 deletion format/tar/tar.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ func tarDecode(d *decode.D) any {
d.FieldUTF8("prefix", 155, mapTrimSpaceNull)
d.FieldRawLen("header_block_padding", blockPadding(d), d.BitBufIsZero())

d.FieldFormatOrRawLen("data", int64(size), &probeGroup, nil)
d.FieldFormatOrRawLen("data", int64(size), &probeGroup, format.Probe_In{})

d.FieldRawLen("data_block_padding", blockPadding(d), d.BitBufIsZero())
})
Expand Down
22 changes: 22 additions & 0 deletions format/xml/html.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"strings"

"github.com/wader/fq/format"
"github.com/wader/fq/internal/lazyre"
"github.com/wader/fq/pkg/bitio"
"github.com/wader/fq/pkg/decode"
"github.com/wader/fq/pkg/interp"
Expand All @@ -21,6 +22,8 @@ func init() {
format.HTML,
&decode.Format{
Description: "HyperText Markup Language",
ProbeOrder: format.ProbeOrderTextFuzzy,
Groups: []*decode.Group{format.Probe},
DecodeFn: decodeHTML,
DefaultInArg: format.HTML_In{
Seq: false,
Expand Down Expand Up @@ -193,9 +196,28 @@ func fromHTMLToArray(n *html.Node) any {
return f(n)
}

// htmlMagicRe matches input that looks like HTML: after at most 64 printable
// or whitespace ASCII characters there has to be a "<html" or a
// "<!DOCTYPE html", compared case-insensitively.
// decodeHTML uses it as a heuristic when decoding as part of probing.
// The pattern is only compiled when Must is first called.
var htmlMagicRe = &lazyre.RE{S: `` +
`^` + // anchor to start
`(?i)` + // case insensitive
`[[:graph:][:space:]]{0,64}?` + // 0-64 non-control ASCII lazily to allow comment etc
`(?:` +
`<\s{0,20}html|` + // <html
// or
`<!DOCTYPE\s{1,20}html` + // <!DOCTYPE html
`)`,
}

func decodeHTML(d *decode.D) any {
var hi format.HTML_In
var pi format.Probe_In
d.ArgAs(&hi)
if d.ArgAs(&pi) {
// if probing the input has to start with "<html" or "<!DOCTYPE html" this
// is because the html parser will always succeed so we have to be careful
if d.RE(htmlMagicRe.Must()) == nil {
d.Fatalf("no <html> or <!DOCTYPE html> found")
}
}

br := d.RawLen(d.Len())
var r any
Expand Down
24 changes: 8 additions & 16 deletions format/xml/testdata/doctype.fqtest
Original file line number Diff line number Diff line change
Expand Up @@ -34,29 +34,21 @@ $ fq -o array=true -d html . doctype.xml
]
$ fq . doctype.xml
{
"html": {
"head": {
"title": "aaa"
}
"head": {
"title": "aaa"
}
}
$ fq -o array=true . doctype.xml
[
"html",
"head",
null,
[
[
"head",
null,
[
[
"title",
{
"#text": "aaa"
},
[]
]
]
"title",
{
"#text": "aaa"
},
[]
]
]
]
4 changes: 1 addition & 3 deletions format/xml/testdata/doctype.xml
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
<!DOCTYPE html>
<html>
<!DOCTYPE bla SYSTEM "" []>
<head>
<title>aaa</title>
</head>
</html>
27 changes: 27 additions & 0 deletions format/xml/testdata/html_probe.fqtest
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
/html.html:

<html><a>

$ fq . html.html
{
"html": {
"body": {
"a": ""
},
"head": ""
}
}
/doctype.html:

<!DOCTYPE html>
<a>

$ fq . doctype.html
{
"html": {
"body": {
"a": ""
},
"head": ""
}
}
4 changes: 2 additions & 2 deletions format/zip/zip.go
Original file line number Diff line number Diff line change
Expand Up @@ -360,7 +360,7 @@ func zipDecode(d *decode.D) any {
}

if compressionMethod == compressionMethodNone {
d.FieldFormatOrRawLen("uncompressed", compressedSize, &probeGroup, nil)
d.FieldFormatOrRawLen("uncompressed", compressedSize, &probeGroup, format.Probe_In{})
} else {
var rFn func(r io.Reader) io.Reader
if zi.Uncompress {
Expand All @@ -374,7 +374,7 @@ func zipDecode(d *decode.D) any {

if rFn != nil {
readCompressedSize, uncompressedBR, dv, _, _ :=
d.TryFieldReaderRangeFormat("uncompressed", d.Pos(), compressedLimit, rFn, &probeGroup, nil)
d.TryFieldReaderRangeFormat("uncompressed", d.Pos(), compressedLimit, rFn, &probeGroup, format.Probe_In{})
if dv == nil && uncompressedBR != nil {
d.FieldRootBitBuf("uncompressed", uncompressedBR)
}
Expand Down
30 changes: 30 additions & 0 deletions internal/lazyre/lazyre.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
// Package lazyre lazily compiles a *regexp.Regexp in a concurrency-safe way.
// Use &lazyre.RE{S: `...`} or call New.
package lazyre

import (
"regexp"
"sync"
)

// RE is a regular expression that is compiled lazily on first use.
// A value created as &RE{S: `...`} or via New is ready to use. As it embeds a
// lock it should be passed around by pointer and not copied.
type RE struct {
	// S is the regexp source. It is compiled by the first call to Must.
	S string

	m  sync.RWMutex   // guards re
	re *regexp.Regexp // compiled form of S, nil until Must has succeeded
}

// New creates a new *RE for the regexp source s.
// Nothing is compiled until Must is called.
func New(s string) *RE {
	return &RE{S: s}
}

// Must returns the compiled regexp, compiling S on the first call.
// It panics, via regexp.MustCompile, if S is not a valid regexp; a failed
// compile is not cached so later calls panic the same way.
// The returned *regexp.Regexp can be stored away and reused, and Must is safe
// to call from multiple goroutines.
func (lr *RE) Must() *regexp.Regexp {
	// Fast path: once compiled a shared read lock is enough, so concurrent
	// callers do not serialize on an exclusive lock for every call.
	lr.m.RLock()
	re := lr.re
	lr.m.RUnlock()
	if re != nil {
		return re
	}

	lr.m.Lock()
	defer lr.m.Unlock()
	// Re-check under the write lock, another goroutine might have compiled
	// the regexp between the read unlock and the write lock.
	if lr.re == nil {
		lr.re = regexp.MustCompile(lr.S)
	}
	return lr.re
}
13 changes: 13 additions & 0 deletions internal/lazyre/lazyre_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
package lazyre_test

import (
"testing"

"github.com/wader/fq/internal/lazyre"
)

// TestMust checks that an RE created with New compiles on first use and
// that the resulting regexp matches its own source.
func TestMust(t *testing.T) {
	compiled := lazyre.New("a").Must()
	if matched := compiled.MatchString("a"); !matched {
		t.Fatal("should compile and be non-nil and match a")
	}
}
17 changes: 5 additions & 12 deletions pkg/decode/decode.go
Original file line number Diff line number Diff line change
Expand Up @@ -1261,15 +1261,11 @@ func (d *D) FieldValue(name string, fn func() *Value) *Value {
return v
}

func (d *D) RE(reRef **regexp.Regexp, reStr string) []ranges.Range {
if *reRef == nil {
*reRef = regexp.MustCompile(reStr)
}

func (d *D) RE(re *regexp.Regexp) []ranges.Range {
startPos := d.Pos()

rr := ioex.ByteRuneReader{RS: bitio.NewIOReadSeeker(d.bitBuf)}
locs := (*reRef).FindReaderSubmatchIndex(rr)
locs := re.FindReaderSubmatchIndex(rr)
if locs == nil {
return nil
}
Expand All @@ -1292,13 +1288,10 @@ func (d *D) RE(reRef **regexp.Regexp, reStr string) []ranges.Range {
return rs
}

func (d *D) FieldRE(reRef **regexp.Regexp, reStr string, mRef *map[string]string, sms ...scalar.StrMapper) {
if *reRef == nil {
*reRef = regexp.MustCompile(reStr)
}
subexpNames := (*reRef).SubexpNames()
func (d *D) FieldRE(re *regexp.Regexp, mRef *map[string]string, sms ...scalar.StrMapper) {
subexpNames := re.SubexpNames()

rs := d.RE(reRef, reStr)
rs := d.RE(re)
for i, r := range rs {
if i == 0 || r.Start == -1 {
continue
Expand Down

0 comments on commit e2eb667

Please sign in to comment.