From 99bb6f3a68a3d347c13741d09052c5cac126390b Mon Sep 17 00:00:00 2001 From: Viacheslav Poturaev Date: Thu, 23 Mar 2023 17:47:57 +0100 Subject: [PATCH] Extend filtering to sequences of vowels/consonants (#5) --- .golangci.yml | 2 +- filter_test.go | 10 +++++----- fllter.go | 39 +++++++++++++++++++++++++++++++++++---- main.go | 2 +- 4 files changed, 42 insertions(+), 11 deletions(-) diff --git a/.golangci.yml b/.golangci.yml index fae5002..51b4602 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -7,7 +7,7 @@ linters-settings: check-type-assertions: true check-blank: true gocyclo: - min-complexity: 20 + min-complexity: 40 dupl: threshold: 100 misspell: diff --git a/filter_test.go b/filter_test.go index a42deef..9d5e8ba 100644 --- a/filter_test.go +++ b/filter_test.go @@ -5,7 +5,7 @@ import ( "testing" ) -const logLine = `foo-bar-16 i2 2022/09/02 09:48:29.199655 baz handler failed to get cox: failed to get cox type: unknown cox type string: {quux} [R method:GET path:/abcd5s8 ra:2022-09-02T09:48:29 form:'map[bar:[11_2022-09-02] lox_id:[] cucumber:[RedCat1509_2022-09-02] cucumber_id:[132072] faux_id:[afExxSDFKHgBJcwxDgIxDETR1vEAWVVHqXo6PcBjfoaDF29f_I8jYTZZVyKeiXzPlP9O9k3SrZtY3IeqA] cox_alarm:[{payout}}] cox_carrot:[OCD] cox_type:[{marks}] creative:[62_203206_123ebd32047fe640] foo_boo_99diks:[https://peebee.jeeass-foo.site/pushforw?lockid=sdd32432dUR1vEAWVVHqXo6PcBjfoaDF29f_Ik3SrZtY3FzXvq0fP1IeqA] goal:[99diks] gps_pos:[1231230D-0BE9-41EF-B146-123123123B9BC7] baz:[123123124-0BE9-41EF-B146-123123123123] ip_address:[1.2.333.4] labelle:[72_206706_125329e2047fe640] poob_id:[62_206706_550eb9e2047fe640] baz_limit:[1234]]' header:'map[User-Agent:[Mozilla/5.0 (iPhone; CPU iPhone OS 14_4_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148] X-Boo-Id:[12312380d-0be9-41ef-b146-3452352323] X-Forwarded-For:[111.332.555.333] X-Forwarded-Proto:[https] X-Forwarded-For:[123.321.123.321]]' foocksorized gees_valeed deeedre_lee:0]` +const logLine = `foo-bar-16 i2 
2022/09/02 09:48:29.199655 baz cdfgzaa aouios handler failed to get cox: failed to get cox type: unknown cox type string: {quux} [R method:GET path:/abcd5s8 ra:2022-09-02T09:48:29 form:'map[bar:[11_2022-09-02] lox_id:[] cucumber:[RedCat1509_2022-09-02] cucumber_id:[132072] faux_id:[afExxSDFKHgBJcwxDgIxDETR1vEAWVVHqXo6PcBjfoaDF29f_I8jYTZZVyKeiXzPlP9O9k3SrZtY3IeqA] cox_alarm:[{payout}}] cox_carrot:[OCD] cox_type:[{marks}] creative:[62_203206_123ebd32047fe640] foo_boo_99diks:[https://peebee.jeeass-foo.site/pushforw?lockid=sdd32432dUR1vEAWVVHqXo6PcBjfoaDF29f_Ik3SrZtY3FzXvq0fP1IeqA] goal:[99diks] gps_pos:[1231230D-0BE9-41EF-B146-123123123B9BC7] baz:[123123124-0BE9-41EF-B146-123123123123] ip_address:[1.2.333.4] labelle:[72_206706_125329e2047fe640] poob_id:[62_206706_550eb9e2047fe640] baz_limit:[1234]]' header:'map[User-Agent:[Mozilla/5.0 (iPhone; CPU iPhone OS 14_4_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148] X-Boo-Id:[12312380d-0be9-41ef-b146-3452352323] X-Forwarded-For:[111.332.555.333] X-Forwarded-Proto:[https] X-Forwarded-For:[123.321.123.321]]' foocksorized gees_valeed deeedre_lee:0]` // BenchmarkFilterAlphanumeric checks bespoke implementation. // BenchmarkFilterAlphanumeric-12 1912300 614.7 ns/op 0 B/op 0 allocs/op. 
@@ -14,14 +14,14 @@ func BenchmarkFilterAlphanumeric(b *testing.B) { for i := 0; i < b.N; i++ { s := []byte(logLine) - a := filterAlphanumeric(s, 150) + a := filterDynamic(s, 150) _ = a } } func TestAlphanum(t *testing.T) { - filtered := string(filterAlphanumeric([]byte(logLine), 150)) - expected := `X X X/X/X X:X:X.X baz handler failed to get cox: failed to get cox type: unknown cox type string: {quux} [R method:GET path:/X ra:X:X:X form:'map[bar:` + filtered := string(filterDynamic([]byte(logLine), 150)) + expected := `X X X/X/X X:X:X.X baz X X handler failed to get cox: failed to get cox type: unknown cox type string: {quux} [R method:GET path:/X ra:X:X:X form:'map[` if expected != filtered { t.Fatalf("unexpected filtered: %s", filtered) @@ -31,7 +31,7 @@ func TestAlphanum(t *testing.T) { func TestShortLine(t *testing.T) { line := "foo-bar-12 i3 2022/09/15 11:24:10.689412 baz 0-275 foo bar" expected := "X X X/X/X X:X:X.X baz X foo bar" - filtered := string(filterAlphanumeric([]byte(line), 120)) + filtered := string(filterDynamic([]byte(line), 120)) if expected != filtered { t.Fatalf("unexpected filtered: %s", filtered) diff --git a/fllter.go b/fllter.go index 5ad1f3d..e0c323d 100644 --- a/fllter.go +++ b/fllter.go @@ -1,24 +1,42 @@ package main -// filterAlphanumeric replaces a-zA-Z_-% sequences that have at least one digit with X. +// filterDynamic replaces a-zA-Z_-% sequences that have at least one digit or 5+ consecutive consonants/vowels with X. // Does not allocate, uses original slice. 
-func filterAlphanumeric(data []byte, l int) []byte { +func filterDynamic(data []byte, l int) []byte { hasDigit := false wordStart := -1 + maxConsecutive := 0 + consecutive := 0 res := data[:0] - var i int + var ( + i int + prevCharType byte + ) for i = 0; i < len(data); i++ { c := data[i] isAlpha := false + charType := byte(0) switch { case c >= 'a' && c <= 'z': + if c == 'a' || c == 'e' || c == 'i' || c == 'o' || c == 'u' || c == 'y' || c == 'w' { + charType = 'v' // Vowel. + } else { + charType = 'c' // Consonant. + } + isAlpha = true case c >= 'A' && c <= 'Z': + if c == 'A' || c == 'E' || c == 'I' || c == 'O' || c == 'U' || c == 'Y' || c == 'W' { + charType = 'v' + } else { + charType = 'c' + } + isAlpha = true case c >= '0' && c <= '9': isAlpha = true @@ -27,9 +45,20 @@ func filterAlphanumeric(data []byte, l int) []byte { isAlpha = true } + if charType == prevCharType { + consecutive++ + } else { + if consecutive > maxConsecutive { + maxConsecutive = consecutive + } + + prevCharType = charType + consecutive = 1 + } + // Finish current word. if wordStart >= 0 && !isAlpha { - if hasDigit { + if hasDigit || maxConsecutive > 4 { res = append(res, 'X') } else { res = append(res, data[wordStart:i]...) @@ -37,6 +66,8 @@ func filterAlphanumeric(data []byte, l int) []byte { wordStart = -1 hasDigit = false + maxConsecutive = 0 + consecutive = 0 } if wordStart == -1 { diff --git a/main.go b/main.go index a92e849..acc1fa3 100644 --- a/main.go +++ b/main.go @@ -134,7 +134,7 @@ func main() { } if top > 0 { - filtered := filterAlphanumeric(line, length) + filtered := filterDynamic(line, length) d.Reset()