Permalink
Browse files

major updates

- added an experimental JSON scanner
- moved apache log check outside of Tokenize()
- added more time checks
- changed duration to integer
- merged the separate scanners into a single scanner with distinct
  functions for scanning JSON vs. general log formats
- changed the scanner back to using an internal Sequence buffer instead of
  one supplied by the caller
- separated out scanner and message files
- exported message type so others can use it to write their own scanners
  • Loading branch information...
zhenjl committed Feb 24, 2015
1 parent d5f2dab commit 713979f70d6025308e434205973249aa3138e58e
Showing with 3,179 additions and 801 deletions.
  1. +1 −1 analyzer.go
  2. +12 −18 analyzer_test.go
  3. +50 −50 cmd/sequence/sequence.go
  4. +199 −0 genmethods.go
  5. +2 −2 gentokens.go
  6. +588 −0 message.go
  7. +7 −7 parser.go
  8. +37 −12 parser_test.go
  9. +1,277 −0 reqmethods.go
  10. +307 −517 scanner.go
  11. +681 −186 scanner_test.go
  12. +3 −0 sequence.go
  13. +4 −4 sequence_test.go
  14. +7 −0 time.go
  15. +4 −4 tokens.go
@@ -1063,7 +1063,7 @@ LOOP:
fexists[FieldMsgTime] = true
}

case TokenURL:
case TokenURI:
if !fexists[FieldObject] {
seq[i].Field = FieldObject
seq[i].Type = seq[i].Field.TokenType()
@@ -56,7 +56,7 @@ var (
{
"id=firewall time=\"2005-03-18 14:01:46\" fw=TOPSEC priv=6 recorder=kernel type=conn policy=414 proto=TCP rule=accept src=61.167.71.244 sport=35223 dst=210.82.119.211 dport=25 duration=27 inpkt=37 outpkt=39 sent=1770 rcvd=20926 smac=00:04:c1:8b:d8:82 dmac=00:0b:5f:b2:1d:80",
//"id = %string% time = \" %time% \" fw = %string% priv = %integer% recorder = %string% type = %string% policy = %integer% proto = %string% rule = %string% src = %ipv4% sport = %integer% dst = %ipv4% dport = %integer% duration = %integer% inpkt = %integer% outpkt = %integer% sent = %integer% rcvd = %integer% smac = %mac% dmac = %mac%",
"id = %string% time = \" %msgtime% \" fw = %string% priv = %integer% recorder = %string% type = %string% policy = %integer% proto = %protocol% rule = %string% src = %srcipv4% sport = %srcport% dst = %dstipv4% dport = %dstport% duration = %integer% inpkt = %integer% outpkt = %integer% sent = %integer% rcvd = %integer% smac = %srcmac% dmac = %dstmac%",
"id = %string% time = \" %msgtime% \" fw = %string% priv = %integer% recorder = %string% type = %string% policy = %integer% proto = %protocol% rule = %string% src = %srcipv4% sport = %srcport% dst = %dstipv4% dport = %dstport% duration = %duration% inpkt = %integer% outpkt = %integer% sent = %integer% rcvd = %integer% smac = %srcmac% dmac = %dstmac%",
},
{
"id=firewall time=\"2005-03-18 14:01:43\" fw=TOPSEC priv=4 recorder=kernel type=conn policy=504 proto=TCP rule=deny src=210.82.121.91 sport=4958 dst=61.229.37.85 dport=23124 smac=00:0b:5f:b2:1d:80 dmac=00:04:c1:8b:d8:82",
@@ -80,11 +80,10 @@ var (

func TestAnalyzerMergeNodes(t *testing.T) {
atree := NewAnalyzer()
seq := make(Sequence, 0, 20)
scanner := NewScanner()

for _, data := range analyzerSshdSamples {
seq = seq[:0]
seq, err := DefaultScanner.Tokenize(data, seq)
seq, err := scanner.Scan(data)
require.NoError(t, err)
err = atree.Add(seq)
require.NoError(t, err)
@@ -205,11 +204,10 @@ func TestAnalyzerMergeNodes(t *testing.T) {

func TestAnalyzerKeyValuePairs(t *testing.T) {
atree := NewAnalyzer()
seq := make(Sequence, 0, 20)
scanner := NewScanner()

for _, tc := range analyzerKVTests {
seq = seq[:0]
seq, err := DefaultScanner.Tokenize(tc.msg, seq)
seq, err := scanner.Scan(tc.msg)
require.NoError(t, err)
err = atree.Add(seq)
require.NoError(t, err, tc.msg)
@@ -247,19 +245,17 @@ func TestAnalyzerKeyValuePairs(t *testing.T) {

func TestAnalyzerMatchPatterns(t *testing.T) {
atree := NewAnalyzer()
seq := make(Sequence, 0, 20)
scanner := NewScanner()

for _, tc := range analyzerSshTests {
seq = seq[:0]
seq, err := DefaultScanner.Tokenize(tc.msg, seq)
seq, err := scanner.Scan(tc.msg)
require.NoError(t, err)
err = atree.Add(seq)
require.NoError(t, err, tc.msg)
}

for _, tc := range analyzerKVTests {
seq = seq[:0]
seq, err := DefaultScanner.Tokenize(tc.msg, seq)
seq, err := scanner.Scan(tc.msg)
require.NoError(t, err)
err = atree.Add(seq)
require.NoError(t, err, tc.msg)
@@ -268,20 +264,18 @@ func TestAnalyzerMatchPatterns(t *testing.T) {
atree.Finalize()

for _, tc := range analyzerSshTests {
seq = seq[:0]
seq, err := DefaultScanner.Tokenize(tc.msg, seq)
seq, err := scanner.Scan(tc.msg)
require.NoError(t, err)
seq, err = atree.Analyze(seq)
require.NoError(t, err, tc.msg)
require.Equal(t, tc.pat, seq.String(), tc.msg, seq)
require.Equal(t, tc.pat, seq.String(), tc.msg+"\n"+seq.PrintTokens())
}

for _, tc := range analyzerKVTests {
seq = seq[:0]
seq, err := DefaultScanner.Tokenize(tc.msg, seq)
seq, err := scanner.Scan(tc.msg)
require.NoError(t, err)
seq, err = atree.Analyze(seq)
require.NoError(t, err, tc.msg)
require.Equal(t, tc.pat, seq.String(), tc.msg, seq)
require.Equal(t, tc.pat, seq.String(), tc.msg+"\n"+seq.PrintTokens())
}
}
@@ -279,6 +279,7 @@ var (
patdir string
cpuprofile string
workers int
format string

quit chan struct{}
done chan struct{}
@@ -291,14 +292,17 @@ func init() {
done = make(chan struct{})

scanCmd.Flags().StringVarP(&inmsg, "msg", "m", "", "message to tokenize")
scanCmd.Flags().StringVarP(&format, "fmt", "f", "general", "format of the message to tokenize, can be 'json' or 'general'")
scanCmd.Run = scan

analyzeCmd.Flags().StringVarP(&format, "fmt", "f", "general", "format of the message to tokenize, can be 'json' or 'general'")
analyzeCmd.Flags().StringVarP(&infile, "infile", "i", "", "input file, required")
analyzeCmd.Flags().StringVarP(&patfile, "patfile", "p", "", "initial pattern file, optional")
analyzeCmd.Flags().StringVarP(&patdir, "patdir", "d", "", "pattern directory,, all files in directory will be used, optional")
analyzeCmd.Flags().StringVarP(&outfile, "outfile", "o", "", "output file, if empty, to stdout")
analyzeCmd.Run = analyze

parseCmd.Flags().StringVarP(&format, "fmt", "f", "general", "format of the message to tokenize, can be 'json' or 'general'")
parseCmd.Flags().StringVarP(&infile, "infile", "i", "", "input file, required ")
parseCmd.Flags().StringVarP(&patfile, "patfile", "p", "", "initial pattern file, required")
parseCmd.Flags().StringVarP(&patdir, "patdir", "d", "", "pattern directory,, all files in directory will be used")
@@ -308,11 +312,13 @@ func init() {
benchCmd.AddCommand(benchScanCmd)
benchCmd.AddCommand(benchParseCmd)

benchScanCmd.Flags().StringVarP(&format, "fmt", "f", "general", "format of the message to tokenize, can be 'json' or 'general'")
benchScanCmd.Flags().StringVarP(&infile, "infile", "i", "", "input file, required ")
benchScanCmd.Flags().StringVarP(&cpuprofile, "cpuprofile", "c", "", "CPU profile filename")
benchScanCmd.Flags().IntVarP(&workers, "workers", "w", 1, "number of parsing workers")
benchScanCmd.Run = benchScan

benchParseCmd.Flags().StringVarP(&format, "fmt", "f", "general", "format of the message to tokenize, can be 'json' or 'general'")
benchParseCmd.Flags().StringVarP(&infile, "infile", "i", "", "input file, required ")
benchParseCmd.Flags().StringVarP(&patfile, "patfile", "p", "", "pattern file, required")
benchParseCmd.Flags().StringVarP(&patdir, "patdir", "d", "", "pattern directory,, all files in directory will be used")
@@ -363,12 +369,8 @@ func profile() {
}

func scan(cmd *cobra.Command, args []string) {
seq := make(sequence.Sequence, 0, 20)
seq, err := sequence.DefaultScanner.Tokenize(inmsg, seq)
if err != nil {
log.Fatal(err)
}

scanner := sequence.NewScanner()
seq := scanMessage(scanner, inmsg)
fmt.Println(seq.PrintTokens())
}

@@ -381,13 +383,12 @@ func analyze(cmd *cobra.Command, args []string) {

parser := buildParser()
analyzer := sequence.NewAnalyzer()
scanner := sequence.NewScanner()

// Open input file
iscan, ifile := openFile(infile)
defer ifile.Close()

seq := make(sequence.Sequence, 0, 20)

// For all the log messages, if we can't parse it, then let's add it to the
// analyzer for pattern analysis
for iscan.Scan() {
@@ -396,11 +397,7 @@ func analyze(cmd *cobra.Command, args []string) {
continue
}

seq = seq[:0]
seq, err := sequence.DefaultScanner.Tokenize(line, seq)
if err != nil {
log.Fatal(err)
}
seq := scanMessage(scanner, line)

if _, err := parser.Parse(seq); err != nil {
analyzer.Add(seq)
@@ -426,11 +423,7 @@ func analyze(cmd *cobra.Command, args []string) {
}
n++

seq = seq[:0]
seq, err := sequence.DefaultScanner.Tokenize(line, seq)
if err != nil {
log.Fatal(err)
}
seq := scanMessage(scanner, line)

pseq, err := parser.Parse(seq)
if err == nil {
@@ -485,7 +478,7 @@ func parse(cmd *cobra.Command, args []string) {
profile()

parser := buildParser()
seq := make(sequence.Sequence, 0, 20)
scanner := sequence.NewScanner()

iscan, ifile := openFile(infile)
defer ifile.Close()
@@ -503,17 +496,13 @@ func parse(cmd *cobra.Command, args []string) {
}
n++

seq = seq[:0]
seq, err := sequence.DefaultScanner.Tokenize(line, seq)
if err != nil {
log.Fatal(err)
}
seq := scanMessage(scanner, line)

pseq, err := parser.Parse(seq)
seq, err := parser.Parse(seq)
if err != nil {
log.Printf("Error (%s) parsing: %s", err, line)
} else {
fmt.Fprintf(ofile, "%s\n%s\n\n", line, pseq.PrintTokens())
fmt.Fprintf(ofile, "%s\n%s\n\n", line, seq.PrintTokens())
}
}

@@ -534,7 +523,6 @@ func benchScan(cmd *cobra.Command, args []string) {
var lines []string
var totalSize int
n := 0
seq := make(sequence.Sequence, 0, 20)

for iscan.Scan() {
line := iscan.Text()
@@ -552,9 +540,9 @@ func benchScan(cmd *cobra.Command, args []string) {
now := time.Now()

if workers == 1 {
scanner := sequence.NewScanner()
for _, line := range lines {
seq = seq[:0]
sequence.DefaultScanner.Tokenize(line, seq)
scanMessage(scanner, line)
}
} else {
var wg sync.WaitGroup
@@ -564,9 +552,10 @@ func benchScan(cmd *cobra.Command, args []string) {
wg.Add(1)
go func() {
defer wg.Done()
scanner := sequence.NewScanner()

for line := range msgpipe {
seq = seq[:0]
sequence.DefaultScanner.Tokenize(line, seq)
scanMessage(scanner, line)
}
}()
}
@@ -615,14 +604,10 @@ func benchParse(cmd *cobra.Command, args []string) {
now := time.Now()

if workers == 1 {
seq := make(sequence.Sequence, 0, 20)
scanner := sequence.NewScanner()

for _, line := range lines {
seq = seq[:0]
seq, err := sequence.DefaultScanner.Tokenize(line, seq)
if err != nil {
log.Fatal(err)
}
parser.Parse(seq)
parser.Parse(scanMessage(scanner, line))
}
} else {
var wg sync.WaitGroup
@@ -632,14 +617,10 @@ func benchParse(cmd *cobra.Command, args []string) {
wg.Add(1)
go func() {
defer wg.Done()
seq := make(sequence.Sequence, 0, 20)
scanner := sequence.NewScanner()

for line := range msgpipe {
seq = seq[:0]
seq, err := sequence.DefaultScanner.Tokenize(line, seq)
if err != nil {
log.Fatal(err)
}
parser.Parse(seq)
parser.Parse(scanMessage(scanner, line))
}
}()
}
@@ -658,11 +639,31 @@ func benchParse(cmd *cobra.Command, args []string) {
<-done
}

func buildParser() *sequence.Parser {
parser := sequence.NewParser()
func scanMessage(scanner *sequence.Scanner, data string) sequence.Sequence {
var (
seq sequence.Sequence
err error
)

switch format {
case "json":
seq, err = scanner.ScanJson(data)

default:
seq, err = scanner.Scan(data)
}

if err != nil {
log.Fatal(err)
}
return seq
}

func buildParser() *sequence.GeneralParser {
parser := sequence.NewGeneralParser()
scanner := sequence.NewScanner()

var files []string
seq := make(sequence.Sequence, 0, 20)

if patdir != "" {
files = getDirOfFiles(patdir)
@@ -682,8 +683,7 @@ func buildParser() *sequence.Parser {
continue
}

seq = seq[:0]
seq, err := sequence.DefaultScanner.Tokenize(line, seq)
seq, err := scanner.Scan(line)
if err != nil {
log.Fatal(err)
}
Oops, something went wrong.

0 comments on commit 713979f

Please sign in to comment.