Skip to content

Commit

Permalink
Remove round-trip between string and byte[] for LIKE
Browse files Browse the repository at this point in the history
By using NonStrictUTF8Encoding, Joni no longer suffers from the infinite-loop
problem when it encounters invalid UTF-8. As a result, the round trip between
byte[] and String can be removed.
  • Loading branch information
haozhun committed Jul 30, 2015
1 parent a17fb1a commit ea66e8c
Showing 1 changed file with 5 additions and 18 deletions.
Expand Up @@ -18,7 +18,7 @@
import com.facebook.presto.operator.scalar.ScalarOperator;
import com.facebook.presto.spi.PrestoException;
import com.facebook.presto.spi.type.StandardTypes;
import io.airlift.jcodings.specific.UTF8Encoding;
import io.airlift.jcodings.specific.NonStrictUTF8Encoding;
import io.airlift.joni.Option;
import io.airlift.joni.Regex;
import io.airlift.joni.Syntax;
Expand Down Expand Up @@ -55,13 +55,10 @@ private LikeFunctions() {}
@SqlType(StandardTypes.BOOLEAN)
public static boolean like(@SqlType(StandardTypes.VARCHAR) Slice value, @SqlType(LikePatternType.NAME) Regex pattern)
{
// Joni doesn't handle invalid UTF-8, so replace invalid characters
// Joni can infinite loop with UTF8Encoding when invalid UTF-8 is encountered.
// NonStrictUTF8Encoding must be used to avoid this issue.
byte[] bytes = value.getBytes();
if (isAscii(bytes)) {
return regexMatches(pattern, bytes);
}
// convert to a String and back to "fix" any broken UTF-8 sequences
return regexMatches(pattern, value.toStringUtf8().getBytes(UTF_8));
return regexMatches(pattern, bytes);
}

@ScalarOperator(OperatorType.CAST)
Expand Down Expand Up @@ -123,7 +120,7 @@ private static Regex likeToPattern(String patternString, char escapeChar, boolea
regex.append('$');

byte[] bytes = regex.toString().getBytes(UTF_8);
return new Regex(bytes, 0, bytes.length, Option.MULTILINE, UTF8Encoding.INSTANCE, SYNTAX);
return new Regex(bytes, 0, bytes.length, Option.MULTILINE, NonStrictUTF8Encoding.INSTANCE, SYNTAX);
}

@SuppressWarnings("NumericCastThatLosesPrecision")
Expand All @@ -139,14 +136,4 @@ private static char getEscapeChar(Slice escape)
}
throw new PrestoException(INVALID_FUNCTION_ARGUMENT, "Escape must be empty or a single character");
}

/**
 * Reports whether every byte in {@code bytes} is non-negative.
 * Java bytes are signed, so a negative value means the high bit is set,
 * i.e. the byte lies outside the 7-bit ASCII range.
 *
 * @param bytes the bytes to inspect
 * @return {@code true} if no byte is negative (including when the array is empty),
 *         {@code false} as soon as a negative byte is found
 */
private static boolean isAscii(byte[] bytes)
{
for (byte b : bytes) {
// a negative signed byte has its high bit set, so it is not 7-bit ASCII
if (b < 0) {
return false;
}
}
return true;
}
}

0 comments on commit ea66e8c

Please sign in to comment.