Skip to content

Commit

Permalink
grok performance improvement (#2)
Browse files Browse the repository at this point in the history
Improve the performance of the grok matcher by:
- extracting namedGroups only once
- re-using the same matcher when extracting the values of the named groups
- using Guava's Splitter instead of String.split()
- applying Converter.convert() only if the key contains a data type.

Other changes:
- modify build.gradle to publish 0.1.6-SNAPSHOT to our S3 repo
- move patterns/* into src/main/resources so that they're bundled into the jar
  • Loading branch information
keitaf committed Mar 9, 2018
1 parent 551fadb commit 37198dd
Show file tree
Hide file tree
Showing 16 changed files with 186 additions and 154 deletions.
52 changes: 45 additions & 7 deletions build.gradle
Original file line number Diff line number Diff line change
@@ -1,5 +1,23 @@
apply plugin: 'java'
apply plugin: 'eclipse'
import com.amazonaws.auth.*
import com.amazonaws.auth.profile.*

// Classpath for the build script itself (not for the project's own code):
// it provides the com.amazonaws.auth classes imported at the top of this script.
buildscript {
repositories {
jcenter()
}
dependencies {
// Install AWS SDK so that we can use local AWS credentials for S3 access
classpath 'com.amazonaws:aws-java-sdk-core:1.11.202'
}
}

// Plugins applied to this project: Java compilation, Maven publishing
// ('maven' and 'maven-publish'), and Eclipse / IDEA project-file generation.
plugins {
id 'java'
id 'maven'
id 'maven-publish'
id 'eclipse'
id 'idea'
}

// Apply code quality checks
apply from: "$rootProject.projectDir/gradle/codeQuality.gradle"
Expand All @@ -17,16 +35,36 @@ repositories {
mavenCentral()
}

sourceCompatibility = JavaVersion.VERSION_1_7
sourceCompatibility = JavaVersion.VERSION_1_8
group = "oi.thekraken"
archivesBaseName = "grok"
version = '0.1.5'
version = '0.1.6-SNAPSHOT'

dependencies {
compile "com.github.tony19:named-regexp:0.2.4"
compile "org.apache.commons:commons-lang3:3.4"
compile "com.google.code.gson:gson:2.7"
compile "org.apache.commons:commons-lang3:3.7"
compile "com.google.code.gson:gson:2.8.2"
compile "com.google.guava:guava:24.0-jre"
compile "org.slf4j:slf4j-api:1.7.21"

testCompile group: 'junit', name: 'junit', version: '4.12'
}

// Look up AWS credentials through the SDK's default provider chain; they are
// used below to authenticate against the S3-backed Maven repository.
// NOTE(review): this is a top-level statement, so it presumably runs on every
// build invocation and may fail when no AWS credentials are available, even
// for tasks that never publish -- confirm, and consider resolving lazily.
def awsCredentials = new DefaultAWSCredentialsProviderChain().getCredentials()

// Publishing configuration: one Maven publication built from the Java
// component, pushed to the S3 repository declared under `repositories`.
publishing {
publications {
// Publication named "mavenJava", populated from the project's Java component.
mavenJava(MavenPublication) {
from components.java
}
}

repositories {
maven {
// Target repository for published artifacts, addressed with the s3:// scheme.
url "s3://repo.dashbase.io/snapshot"
// Access key and secret key come from the awsCredentials object defined above.
credentials(AwsCredentials) {
accessKey awsCredentials.AWSAccessKeyId
secretKey awsCredentials.AWSSecretKey
}
}
}
}
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@
<common.version>3.1</common.version>
<gson.version>2.2.2</gson.version>
<!-- maven -->
<java.version>1.7</java.version>
<java.version>1.8</java.version>
</properties>

<dependencies>
Expand Down
79 changes: 38 additions & 41 deletions src/main/java/io/thekraken/grok/api/Converter.java
Original file line number Diff line number Diff line change
@@ -1,59 +1,63 @@
package io.thekraken.grok.api;

import com.google.common.base.CharMatcher;
import com.google.common.base.Splitter;
import com.google.common.collect.ImmutableMap;

import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.*;

/**
* Convert String argument to the right type.
*
*
* @author anthonyc
*
*/
public class Converter {

public static Map<String, IConverter<?>> converters = new HashMap<String, IConverter<?>>();
public static Locale locale = Locale.ENGLISH;

static {
converters.put("byte", new ByteConverter());
converters.put("boolean", new BooleanConverter());
converters.put("short", new ShortConverter());
converters.put("int", new IntegerConverter());
converters.put("long", new LongConverter());
converters.put("float", new FloatConverter());
converters.put("double", new DoubleConverter());
converters.put("date", new DateConverter());
converters.put("datetime", new DateConverter());
converters.put("string", new StringConverter());
public static final CharMatcher DELIMITER = CharMatcher.anyOf(";:");

}
private static final Splitter SPLITTER = Splitter.on(DELIMITER).limit(3);

private static Map<String, IConverter<?>> CONVERTERS = ImmutableMap.<String, IConverter<?>>builder()
.put("byte", new ByteConverter())
.put("boolean", new BooleanConverter())
.put("short", new ShortConverter())
.put("int", new IntegerConverter())
.put("long", new LongConverter())
.put("float", new FloatConverter())
.put("double", new DoubleConverter())
.put("date", new DateConverter())
.put("datetime", new DateConverter())
.put("string", new StringConverter())
.build();

private static IConverter getConverter(String key) throws Exception {
IConverter converter = converters.get(key);
IConverter converter = CONVERTERS.get(key);
if (converter == null) {
throw new Exception("Invalid data type :" + key);
}
return converter;
}

public static KeyValue convert(String key, Object value) {
String[] spec = key.split(";|:",3);
public static KeyValue convert(String key, String value) {
List<String> spec = SPLITTER.splitToList(key);
try {
if (spec.length == 1) {
return new KeyValue(spec[0], value);
} else if (spec.length == 2) {
return new KeyValue(spec[0], getConverter(spec[1]).convert(String.valueOf(value)));
} else if (spec.length == 3) {
return new KeyValue(spec[0], getConverter(spec[1]).convert(String.valueOf(value), spec[2]));
} else {
return new KeyValue(spec[0], value, "Unsupported spec :" + key);
switch (spec.size()) {
case 1:
return new KeyValue(spec.get(0), value);
case 2:
return new KeyValue(spec.get(0), getConverter(spec.get(1)).convert(value));
case 3:
return new KeyValue(spec.get(0), getConverter(spec.get(1)).convert(value, spec.get(2)));
default:
return new KeyValue(spec.get(0), value, "Unsupported spec :" + key);
}
} catch (Exception e) {
return new KeyValue(spec[0], value, e.toString());
return new KeyValue(spec.get(0), value, e.toString());
}
}
}
Expand All @@ -65,13 +69,14 @@ public static KeyValue convert(String key, Object value) {

class KeyValue {

private String key = null;
private Object value = null;
private String grokFailure = null;
private final String key;
private final Object value;
private final String grokFailure;

public KeyValue(String key, Object value) {
this.key = key;
this.value = value;
grokFailure = null;
}

public KeyValue(String key, Object value, String grokFailure) {
Expand All @@ -92,17 +97,9 @@ public String getKey() {
return key;
}

public void setKey(String key) {
this.key = key;
}

public Object getValue() {
return value;
}

public void setValue(Object value) {
this.value = value;
}
}


Expand Down
31 changes: 7 additions & 24 deletions src/main/java/io/thekraken/grok/api/Garbage.java
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,7 @@
*******************************************************************************/
package io.thekraken.grok.api;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.*;

/**
* The Leon the professional of {@code Grok}.<br>
Expand All @@ -30,19 +26,8 @@
*/
public class Garbage {

private List<String> toRemove;
private Map<String, Object> toRename;

/**
* Create a new {@code Garbage} object.
*/
public Garbage() {

toRemove = new ArrayList<String>();
toRename = new TreeMap<String, Object>();
/** this is a default value to remove */
toRemove.add("UNWANTED");
}
private Set<String> toRemove = new HashSet<>();
private Map<String, Object> toRename = new HashMap<>();

/**
* Set a new name to be change when exporting the final output.
Expand Down Expand Up @@ -103,17 +88,15 @@ public int remove(Map<String, Object> map) {
return item;
}

if (map.isEmpty()) {
if (map.isEmpty() || toRemove.isEmpty()) {
return item;
}

for (Iterator<Map.Entry<String, Object>> it = map.entrySet().iterator(); it.hasNext();) {
Map.Entry<String, Object> entry = it.next();
for (int i = 0; i < toRemove.size(); i++) {
if (entry.getKey().equals(toRemove.get(i))) {
it.remove();
item++;
}
if (toRemove.contains(entry.getKey())) {
it.remove();
item++;
}
}
return item;
Expand Down
53 changes: 23 additions & 30 deletions src/main/java/io/thekraken/grok/api/Grok.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,21 +17,16 @@

import static java.lang.String.format;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.io.*;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


import com.google.common.io.Files;
import com.google.common.io.Resources;
import io.thekraken.grok.api.exception.GrokException;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
Expand All @@ -56,7 +51,7 @@
* @since 0.0.1
* @author anthonycorbacho
*/
public class Grok implements Serializable {
public class Grok {

private static final Logger LOG = LoggerFactory.getLogger(Grok.class);
/**
Expand All @@ -76,6 +71,8 @@ public class Grok implements Serializable {
* Pattern of the namedRegex.
*/
private Pattern compiledNamedRegex;

public Set<String> namedGroups;
/**
* {@code Grok} discovery.
*/
Expand Down Expand Up @@ -107,8 +104,8 @@ public Grok() {
disco = null;
namedRegex = StringUtils.EMPTY;
compiledNamedRegex = null;
grokPatternDefinition = new TreeMap<String, String>();
namedRegexCollection = new TreeMap<String, String>();
grokPatternDefinition = new HashMap<>();
namedRegexCollection = new HashMap<>();
savedPattern = StringUtils.EMPTY;
}

Expand Down Expand Up @@ -186,7 +183,7 @@ public void copyPatterns(Map<String, String> cpy) throws GrokException {
throw new GrokException("Invalid Patterns");
}
for (Map.Entry<String, String> entry : cpy.entrySet()) {
grokPatternDefinition.put(entry.getKey().toString(), entry.getValue().toString());
grokPatternDefinition.put(entry.getKey(), entry.getValue());
}
}

Expand Down Expand Up @@ -216,22 +213,16 @@ public String getNamedRegex() {
*/
public void addPatternFromFile(String file) throws GrokException {

File f = new File(file);
if (!f.exists()) {
throw new GrokException("Pattern not found");
}

if (!f.canRead()) {
throw new GrokException("Pattern cannot be read");
}

FileReader r = null;
Reader r = null;
try {
r = new FileReader(f);
try {
URL patternFile = Resources.getResource(file);
r = new InputStreamReader(patternFile.openStream(), StandardCharsets.UTF_8);
} catch (IllegalArgumentException e) {
r = Files.newReader(new File(file), StandardCharsets.UTF_8);
}
addPatternFromReader(r);
} catch (FileNotFoundException e) {
throw new GrokException(e.getMessage());
} catch (@SuppressWarnings("hiding") IOException e) {
} catch (IllegalArgumentException|IOException e) {
throw new GrokException(e.getMessage());
} finally {
try {
Expand Down Expand Up @@ -374,12 +365,13 @@ public void compile(String pattern, boolean namedOnly) throws GrokException {
}
iterationLeft--;

Set<String> namedGroups = GrokUtils.getNameGroups(GrokUtils.GROK_PATTERN.pattern());
Matcher m = GrokUtils.GROK_PATTERN.matcher(namedRegex);
// Match %{Foo:bar} -> pattern name and subname
// Match %{Foo=regex} -> add new regex definition
if (m.find()) {
continueIteration = true;
Map<String, String> group = GrokUtils.namedGroups(m, m.group());
Map<String, String> group = GrokUtils.namedGroups(m, namedGroups);
if (group.get("definition") != null) {
try {
addPattern(group.get("pattern"), group.get("definition"));
Expand Down Expand Up @@ -414,6 +406,7 @@ public void compile(String pattern, boolean namedOnly) throws GrokException {
}
// Compile the regex
compiledNamedRegex = Pattern.compile(namedRegex);
namedGroups = GrokUtils.getNameGroups(namedRegex);
}

/**
Expand Down
Loading

0 comments on commit 37198dd

Please sign in to comment.