Skip to content

Commit

Permalink
Fix Elasticsearch LIKE operator UTF-8 character length error
Browse files: browse the repository at this point in the history
  • Loading branch information
Jin-H authored and martint committed Sep 13, 2023
1 parent 1d17262 commit 9d4c460
Show file tree
Hide file tree
Showing 3 changed files with 62 additions and 1 deletion.
Expand Up @@ -86,6 +86,7 @@
import static com.google.common.collect.ImmutableSet.toImmutableSet;
import static com.google.common.collect.Iterators.singletonIterator;
import static io.airlift.slice.SliceUtf8.getCodePointAt;
import static io.airlift.slice.SliceUtf8.lengthOfCodePoint;
import static io.trino.plugin.elasticsearch.ElasticsearchTableHandle.Type.QUERY;
import static io.trino.plugin.elasticsearch.ElasticsearchTableHandle.Type.SCAN;
import static io.trino.spi.StandardErrorCode.INVALID_FUNCTION_ARGUMENT;
Expand Down Expand Up @@ -592,7 +593,7 @@ protected static String likeToRegexp(Slice pattern, Optional<Slice> escape)
int position = 0;
while (position < pattern.length()) {
int currentChar = getCodePointAt(pattern, position);
position += 1;
position += lengthOfCodePoint(currentChar);
checkEscape(!escaped || currentChar == '%' || currentChar == '_' || currentChar == escapeChar.get());
if (!escaped && escapeChar.isPresent() && currentChar == escapeChar.get()) {
escaped = true;
Expand Down
Expand Up @@ -1053,6 +1053,30 @@ public void testLike()
.put("text_column", "soome%text")
.buildOrThrow());

// Index a document with Chinese text (multi-byte UTF-8 code points) to verify that LIKE pattern translation advances by code point length
index(indexName, ImmutableMap.<String, Object>builder()
.put("keyword_column", "中文")
.put("text_column", "中文")
.buildOrThrow());

// Index a document with Japanese text (multi-byte UTF-8 code points) to verify that LIKE pattern translation advances by code point length
index(indexName, ImmutableMap.<String, Object>builder()
.put("keyword_column", "こんにちは")
.put("text_column", "こんにちは")
.buildOrThrow());

// Index a document with Korean text (multi-byte UTF-8 code points) to verify that LIKE pattern translation advances by code point length
index(indexName, ImmutableMap.<String, Object>builder()
.put("keyword_column", "안녕하세요")
.put("text_column", "안녕하세요")
.buildOrThrow());

// Index a document with Cyrillic text (multi-byte UTF-8 code points) to verify that LIKE pattern translation advances by code point length
index(indexName, ImmutableMap.<String, Object>builder()
.put("keyword_column", "Привет")
.put("text_column", "Привет")
.buildOrThrow());

assertThat(query("" +
"SELECT " +
"keyword_column " +
Expand All @@ -1075,6 +1099,38 @@ public void testLike()
"WHERE keyword_column LIKE 'soome$%%' ESCAPE '$'"))
.matches("VALUES VARCHAR 'soome%text'")
.isFullyPushedDown();

assertThat(query("" +
"SELECT " +
"text_column " +
"FROM " + indexName + " " +
"WHERE keyword_column LIKE '中%'"))
.matches("VALUES VARCHAR '中文'")
.isFullyPushedDown();

assertThat(query("" +
"SELECT " +
"text_column " +
"FROM " + indexName + " " +
"WHERE keyword_column LIKE 'こんに%'"))
.matches("VALUES VARCHAR 'こんにちは'")
.isFullyPushedDown();

assertThat(query("" +
"SELECT " +
"text_column " +
"FROM " + indexName + " " +
"WHERE keyword_column LIKE '안녕하%'"))
.matches("VALUES VARCHAR '안녕하세요'")
.isFullyPushedDown();

assertThat(query("" +
"SELECT " +
"text_column " +
"FROM " + indexName + " " +
"WHERE keyword_column LIKE 'При%'"))
.matches("VALUES VARCHAR 'Привет'")
.isFullyPushedDown();
}

@Test
Expand Down
Expand Up @@ -34,6 +34,10 @@ public void testLikeToRegexp()
assertEquals(likeToRegexp("s_.m%ex\\t", Optional.of("$")), "s.\\.m.*ex\\\\t");
assertEquals(likeToRegexp("\000%", Optional.empty()), "\000.*");
assertEquals(likeToRegexp("\000%", Optional.of("\000")), "%");
assertEquals(likeToRegexp("中文%", Optional.empty()), "中文.*");
assertEquals(likeToRegexp("こんにちは%", Optional.empty()), "こんにちは.*");
assertEquals(likeToRegexp("안녕하세요%", Optional.empty()), "안녕하세요.*");
assertEquals(likeToRegexp("Привет%", Optional.empty()), "Привет.*");
}

private static String likeToRegexp(String pattern, Optional<String> escapeChar)
Expand Down

0 comments on commit 9d4c460

Please sign in to comment.