Skip to content

Commit a544781

Browse files
committed
fixed #2
- Now recognizes all hex strings (MAC addresses, IPv6 addresses, fingerprints, ...). - A MAC address is TokenMac. - An IPv6 address is TokenIPv6. - Any other hex string is TokenLiteral. - Unfortunately, performance dropped 8-12% because we can no longer short-circuit the MAC check: IPv6 addresses are variable in length and format.
1 parent 4955f8e commit a544781

File tree

3 files changed

+257
-79
lines changed

3 files changed

+257
-79
lines changed

parser_test.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,10 @@ var (
7272
"Jan 31 21:42:59 mail postfix/anvil[14606]: statistics: max cache size 1 at Jan 31 21:39:37",
7373
"%msgtime% %apphost% %appname% [ %integer% ] : statistics : max cache size %integer% at %time%",
7474
},
75+
{
76+
"Feb 06 13:37:00 box sshd[4388]: Accepted publickey for cryptix from dead:beef:1234:5678:223:32ff:feb1:2e50 port 58251 ssh2: RSA de:ad:be:ef:74:a6:bb:45:45:52:71:de:b2:12:34:56",
77+
"%msgtime% %apphost% %appname% [ %sessionid% ] : accepted publickey for %dstuser% from %srcipv6% port %integer% ssh2 : rsa %string%",
78+
},
7579
}
7680

7781
parsetests2 = []struct {

scanner.go

Lines changed: 169 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -60,14 +60,31 @@ type Message struct {
6060
// cursor positions
6161
cur, start, end int
6262

63-
// should the next token be a value?
64-
nextisValue bool
65-
66-
// how far from the = is the value, immediate following is 0
67-
valueDistance int
63+
hexState int // Current hex string state
64+
hexStart bool // Is the first char a :?
65+
hexColons int // Total number of colons
66+
hexSuccColons int // The current number of successive colons
67+
hexMaxSuccColons int // Maximum number of successive colons
68+
hexSuccColonsSeries int // Number of successive colon series
6869
}
6970
}
7071

72+
// Hex scanner states, short-named variant: hsStart (0), hsChar1 (1),
// hsChar2 (2), hsColon (3).
// NOTE(review): in the visible code only hsStart is referenced (by
// resetHexStates); hexStep switches on the hexStart/hexChar*/hexColon
// constants instead. This block looks like a leftover from an earlier
// iteration -- confirm against the full file before relying on or removing it.
const (
73+
hsStart = iota
74+
hsChar1
75+
hsChar2
76+
hsColon
77+
)
78+
79+
// States of the hex-string state machine kept in state.hexState and advanced
// one rune at a time by hexStep.
const (
80+
hexStart = iota // no rune consumed yet for the current token
81+
hexChar1 // one hex digit seen in the current colon-separated group
82+
hexChar2 // two hex digits seen in the current group
83+
hexChar3 // three hex digits seen in the current group
84+
hexChar4 // four hex digits seen; hexStep does not accept a fifth digit from here
85+
hexColon // the previous rune was ':'
86+
)
87+
7188
func (this *Message) SetData(s string) {
7289
this.data = s
7390

@@ -191,12 +208,14 @@ func (this *Message) Scan() (Token, error) {
191208
}
192209

193210
// remove any trailing spaces
211+
s := 0 // trail space count
194212
for this.data[this.state.start+l-1] == ' ' && l > 0 {
195213
l--
214+
s++
196215
}
197216

198217
v := this.data[this.state.start : this.state.start+l]
199-
this.state.start += l
218+
this.state.start += l + s
200219

201220
token := Token{Type: t, Value: v, Field: FieldUnknown}
202221

@@ -234,21 +253,20 @@ func (this *Message) skipSpace(data string) int {
234253

235254
func (this *Message) scanToken(data string) (int, TokenType, error) {
236255
var (
237-
tnode *timeNode = timeFsmRoot
238-
timeStop, macStop, macType bool
239-
timeLen, tokenLen int
240-
l = len(data)
256+
tnode *timeNode = timeFsmRoot
257+
timeStop, hexStop, hexValid bool
258+
timeLen, hexLen, tokenLen int
259+
l = len(data)
241260
)
242261

243262
this.state.dots = 0
244263
this.state.tokenType = TokenUnknown
245264
this.state.tokenStop = false
265+
this.resetHexStates()
246266

247-
// short circuit the mac check
248-
// positions 2,5,8,11,14 must be ':'
249-
if l < 17 || data[2] != ':' || data[14] != ':' {
250-
macStop = true
251-
macType = false
267+
// short circuit the hex check
268+
if l < 3 {
269+
hexStop = true
252270
}
253271

254272
// short circuit the time check
@@ -265,11 +283,11 @@ func (this *Message) scanToken(data string) (int, TokenType, error) {
265283
}
266284
}
267285

268-
if !macStop {
269-
macType, macStop = this.macStep(i, r)
286+
if !hexStop {
287+
hexValid, hexStop = this.hexStep(i, r)
270288

271-
if macType && macStop {
272-
return i + 1, TokenMac, nil
289+
if hexValid {
290+
hexLen = i + 1
273291
}
274292
}
275293

@@ -287,7 +305,23 @@ func (this *Message) scanToken(data string) (int, TokenType, error) {
287305
}
288306
}
289307

290-
if this.state.tokenStop && timeStop && macStop {
308+
//glog.Debugf("i=%d, r=%c, tokenStop=%t, timeStop=%t, hexStop=%t", i, r, this.state.tokenStop, timeStop, hexStop)
309+
// This means either we found something, or we have exhausted the string
310+
if (this.state.tokenStop && timeStop && hexStop) || i == l-1 {
311+
if timeLen > 0 {
312+
return timeLen, TokenTime, nil
313+
} else if hexValid && this.state.hexColons > 1 {
314+
if this.state.hexColons == 5 && this.state.hexMaxSuccColons == 1 {
315+
return hexLen, TokenMac, nil
316+
} else if this.state.hexSuccColonsSeries == 1 ||
317+
(this.state.hexColons == 7 && this.state.hexSuccColonsSeries == 0) {
318+
319+
return hexLen, TokenIPv6, nil
320+
} else {
321+
return hexLen, TokenLiteral, nil
322+
}
323+
}
324+
291325
// If token length is 0, it means we didn't find time, nor did we find
292326
// a word, it cannot be space since we skipped all space. This means it
293327
// is a single character literal, so return that.
@@ -299,10 +333,6 @@ func (this *Message) scanToken(data string) (int, TokenType, error) {
299333
}
300334
}
301335

302-
if timeLen > 0 {
303-
return timeLen, TokenTime, nil
304-
}
305-
306336
return len(data), this.state.tokenType, nil
307337
}
308338

@@ -497,59 +527,119 @@ func (this *Message) tokenStep(index int, r rune) {
497527
}
498528
}
499529

500-
// Returns bool, bool, first one is true if the it's a mac type, second is whether to stop scanning
501-
func (this *Message) macStep(index int, r rune) (bool, bool) {
502-
switch {
503-
case index == 0 && (r >= 'a' && r <= 'f' || r >= 'A' && r <= 'F' || r >= '0' && r <= '9'):
504-
return true, false
530+
// hexStep steps through a string and tries to match a hex string of the format
531+
// - dead:beef:1234:5678:223:32ff:feb1:2e50 (ipv6 address)
532+
// - de:ad:be:ef:74:a6:bb:45:45:52:71:de:b2:12:34:56 (key fingerprint, literal)
533+
// - 0:09:36 (literal)
534+
// - f0f0:f::1 (ipv6)
535+
// - and a few others in the scanner_test.go/hextests list
536+
//
537+
// The ipv6 rules are:
538+
// (http://computernetworkingnotes.com/ipv6-features-concepts-and-configurations/ipv6-address-types-and-formats.html)
539+
// - Whereas IPv4 addresses use a dotted-decimal format, where each byte ranges from
540+
// 0 to 255.
541+
// - IPv6 addresses use eight sets of four hexadecimal digits (16 bits in each set),
542+
// separated by a colon (:), like this: xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:xxxx
543+
// (x would be a hexadecimal value). This notation is commonly called string notation.
544+
// - Hexadecimal values can be displayed in either lower- or upper-case for the numbers
545+
// A–F.
546+
// - A leading zero in a set of numbers can be omitted; for example, you could either
547+
// enter 0012 or 12 in one of the eight fields—both are correct.
548+
// - If you have successive fields of zeroes in an IPv6 address, you can represent
549+
// them as two colons (::). For example, 0:0:0:0:0:0:0:5 could be represented as ::5;
550+
// and ABC:567:0:0:8888:9999:1111:0 could be represented as ABC:567::8888:9999:1111:0.
551+
// However, you can only do this once in the address: ABC::567::891::00 would be
552+
// invalid since :: appears more than once in the address. The reason for this
553+
// limitation is that if you had two or more repetitions, you wouldn’t know how many
554+
// sets of zeroes were being omitted from each part. An unspecified address is
555+
// represented as ::, since it contains all zeroes.
556+
//
557+
// first return value indicates whether this is a valid hex string
558+
// second return value indicates whether to stop scanning
559+
func (this *Message) hexStep(i int, r rune) (bool, bool) {
560+
switch this.state.hexState {
561+
case hexStart:
562+
switch {
563+
case isHex(r):
564+
this.state.hexState = hexChar1
565+
566+
case r == ':':
567+
this.state.hexState = hexColon
568+
this.state.hexColons++
569+
this.state.hexSuccColons++
570+
this.state.hexStart = true
571+
this.state.hexState = hexColon
572+
573+
if this.state.hexSuccColons > this.state.hexMaxSuccColons {
574+
this.state.hexMaxSuccColons = this.state.hexSuccColons
575+
}
505576

506-
case index == 1 && (r >= 'a' && r <= 'f' || r >= 'A' && r <= 'F' || r >= '0' && r <= '9'):
507-
return true, false
577+
default:
578+
return false, true
579+
}
508580

509-
case index == 2 && r == ':':
510-
return true, false
581+
return false, false
511582

512-
case index == 3 && (r >= 'a' && r <= 'f' || r >= 'A' && r <= 'F' || r >= '0' && r <= '9'):
513-
return true, false
583+
case hexColon:
584+
switch {
585+
case isHex(r):
586+
this.state.hexState = hexChar1
587+
this.state.hexSuccColons = 0
514588

515-
case index == 4 && (r >= 'a' && r <= 'f' || r >= 'A' && r <= 'F' || r >= '0' && r <= '9'):
516-
return true, false
589+
if this.state.hexColons > 0 {
590+
return true, false
591+
}
517592

518-
case index == 5 && r == ':':
519-
return true, false
593+
case r == ':':
594+
this.state.hexSuccColons++
595+
this.state.hexColons++
520596

521-
case index == 6 && (r >= 'a' && r <= 'f' || r >= 'A' && r <= 'F' || r >= '0' && r <= '9'):
522-
return true, false
597+
if this.state.hexSuccColons == 2 {
598+
this.state.hexSuccColonsSeries++
599+
}
523600

524-
case index == 7 && (r >= 'a' && r <= 'f' || r >= 'A' && r <= 'F' || r >= '0' && r <= '9'):
525-
return true, false
601+
if this.state.hexSuccColons > this.state.hexMaxSuccColons {
602+
this.state.hexMaxSuccColons = this.state.hexSuccColons
603+
}
526604

527-
case index == 8 && r == ':':
528-
return true, false
605+
this.state.hexState = hexColon
529606

530-
case index == 9 && (r >= 'a' && r <= 'f' || r >= 'A' && r <= 'F' || r >= '0' && r <= '9'):
531-
return true, false
607+
default:
608+
if this.state.hexColons > 0 && unicode.IsSpace(r) {
609+
return true, true
610+
}
611+
return false, true
612+
}
532613

533-
case index == 10 && (r >= 'a' && r <= 'f' || r >= 'A' && r <= 'F' || r >= '0' && r <= '9'):
534-
return true, false
614+
return false, false
535615

536-
case index == 11 && r == ':':
537-
return true, false
616+
case hexChar1, hexChar2, hexChar3, hexChar4:
617+
switch {
618+
case this.state.hexState != hexChar4 && isHex(r):
619+
this.state.hexState++
620+
this.state.hexSuccColons = 0
538621

539-
case index == 12 && (r >= 'a' && r <= 'f' || r >= 'A' && r <= 'F' || r >= '0' && r <= '9'):
540-
return true, false
622+
case r == ':':
623+
this.state.hexState = hexColon
624+
this.state.hexColons++
625+
this.state.hexSuccColons++
541626

542-
case index == 13 && (r >= 'a' && r <= 'f' || r >= 'A' && r <= 'F' || r >= '0' && r <= '9'):
543-
return true, false
627+
if this.state.hexSuccColons > this.state.hexMaxSuccColons {
628+
this.state.hexMaxSuccColons = this.state.hexSuccColons
629+
}
544630

545-
case index == 14 && r == ':':
546-
return true, false
631+
default:
632+
if this.state.hexColons > 0 && unicode.IsSpace(r) {
633+
return true, true
634+
}
635+
return false, true
636+
}
547637

548-
case index == 15 && (r >= 'a' && r <= 'f' || r >= 'A' && r <= 'F' || r >= '0' && r <= '9'):
549-
return true, false
638+
if this.state.hexColons > 0 {
639+
return true, false
640+
}
550641

551-
case index == 16 && (r >= 'a' && r <= 'f' || r >= 'A' && r <= 'F' || r >= '0' && r <= '9'):
552-
return true, true
642+
return false, false
553643
}
554644

555645
return false, true
@@ -566,12 +656,27 @@ func (this *Message) reset() {
566656
this.state.start = 0
567657
this.state.end = len(this.data)
568658
this.state.cur = 0
659+
660+
this.resetHexStates()
569661
}
570662

571-
func isLetter(ch rune) bool {
572-
return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_' || ch >= 0x80 && unicode.IsLetter(ch)
663+
// resetHexStates clears every hex-string tracking field on this.state: the
// state-machine position, the leading-colon flag and all colon counters.
// In the visible code it is called at the top of scanToken (once per token)
// and from reset.
// NOTE(review): hexState is reset with hsStart, while hexStep switches on
// hexStart. Both are the first iota value (0) of their const blocks, so this
// works today, but using hexStart here would be consistent -- confirm and unify.
func (this *Message) resetHexStates() {
664+
this.state.hexState = hsStart
665+
this.state.hexStart = false
666+
this.state.hexColons = 0
667+
this.state.hexSuccColons = 0
668+
this.state.hexMaxSuccColons = 0
669+
this.state.hexSuccColonsSeries = 0
670+
}
671+
672+
// isLetter reports whether r is an ASCII letter (a-z, A-Z), an underscore,
// or a non-ASCII rune (>= 0x80) that unicode.IsLetter classifies as a letter.
// The 0x80 guard keeps the unicode table lookup off the ASCII path.
func isLetter(r rune) bool {
673+
return 'a' <= r && r <= 'z' || 'A' <= r && r <= 'Z' || r == '_' || r >= 0x80 && unicode.IsLetter(r)
573674
}
574675

575676
// isLiteral reports whether r is an ASCII letter (a-z, A-Z) or one of the
// characters + - _ # \ % * @ $ ?.
// The two return lines below are the removed and added sides of the diff;
// they are equivalent because && binds tighter than || in Go, so the commit
// only dropped redundant parentheses.
func isLiteral(r rune) bool {
576-
return (r >= 'a' && r <= 'z') || (r >= 'A' && r <= 'Z') || r == '+' || r == '-' || r == '_' || r == '#' || r == '\\' || r == '%' || r == '*' || r == '@' || r == '$' || r == '?'
677+
return r >= 'a' && r <= 'z' || r >= 'A' && r <= 'Z' || r == '+' || r == '-' || r == '_' || r == '#' || r == '\\' || r == '%' || r == '*' || r == '@' || r == '$' || r == '?'
678+
}
679+
680+
// isHex reports whether r is an ASCII hexadecimal digit: 0-9, a-f or A-F.
func isHex(r rune) bool {
680+
return r >= 'a' && r <= 'f' || r >= 'A' && r <= 'F' || r >= '0' && r <= '9'
577682
}

0 commit comments

Comments
 (0)