From aecaff643ea144f656861fff900ed0f568896382 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jo=C3=A3o=20Pedro?=
Date: Fri, 23 Apr 2021 23:26:21 -0300
Subject: [PATCH] Add base implementation and tests for LPAD function
 considering string input values

---
 cpp/src/gandiva/precompiled/string_ops.cc  | 89 +++++++++++++++++++
 .../gandiva/precompiled/string_ops_test.cc | 46 ++++++++++
 cpp/src/gandiva/precompiled/types.h        |  4 +
 3 files changed, 139 insertions(+)

diff --git a/cpp/src/gandiva/precompiled/string_ops.cc b/cpp/src/gandiva/precompiled/string_ops.cc
index fa9164bd1396c..ab5e4acc05daa 100644
--- a/cpp/src/gandiva/precompiled/string_ops.cc
+++ b/cpp/src/gandiva/precompiled/string_ops.cc
@@ -1422,6 +1422,95 @@ const char* replace_utf8_utf8_utf8(gdv_int64 context, const char* text,
                                              out_len);
 }
 
+// Left-pads `text` with `fill_text` until it holds `return_length` utf8 characters.
+//
+// Returns an empty string when `text` is empty, when `return_length` is <= 0, or
+// when `text` or `fill_text` contains an invalid utf8 char. Returns `text` itself
+// when it already holds `return_length` chars or when `fill_text` is empty, and a
+// prefix of `text` when it holds more than `return_length` chars. Otherwise returns
+// an arena-allocated buffer with the padding followed by `text`.
+//
+// `out_len` receives the length of the returned string in bytes.
+FORCE_INLINE
+const char* lpad(gdv_int64 context, const char* text, gdv_int32 text_len,
+                 gdv_int32 return_length, const char* fill_text, gdv_int32 fill_text_len,
+                 gdv_int32* out_len) {
+  // if the text length or the defined return length (number of characters to return)
+  // is <=0, then return an empty string.
+  if (text_len <= 0 || return_length <= 0) {
+    *out_len = 0;
+    return "";
+  }
+
+  // initially counts the number of utf8 characters in the defined text and fill_text
+  int32_t text_char_count = utf8_length(context, text, text_len);
+  int32_t fill_char_count = utf8_length(context, fill_text, fill_text_len);
+  // text_char_count is zero if input has invalid utf8 char
+  // fill_char_count is zero if fill_text_len is > 0 and its value has invalid utf8 char
+  if (text_char_count == 0 || (fill_text_len > 0 && fill_char_count == 0)) {
+    *out_len = 0;
+    return "";
+  }
+
+  if (return_length == text_char_count ||
+      (return_length > text_char_count && fill_text_len <= 0)) {
+    // case where the return length is same as the text's length, or if it need to
+    // fill into text but "fill_text" is empty, then return text directly.
+    *out_len = text_len;
+    return text;
+  } else if (return_length < text_char_count) {
+    // case where it truncates the result on return length.
+    *out_len = utf8_byte_pos(context, text, text_len, return_length);
+    return text;
+  } else {
+    // case (return_length > text_char_count)
+    // case where it needs to copy "fill_text" on the string left. The total number
+    // of chars to copy is given by (return_length - text_char_count)
+    const int32_t pad_char_count = return_length - text_char_count;
+    const int32_t full_fill_count = pad_char_count / fill_char_count;
+    const int32_t partial_char_count = pad_char_count % fill_char_count;
+
+    // byte length of the leading "fill_text" chars that complete the padding; it is
+    // measured char by char because utf8 chars have variable byte lengths
+    int32_t partial_byte_len = 0;
+    for (int32_t i = 0; i < partial_char_count; i++) {
+      partial_byte_len += utf8_char_length(fill_text[partial_byte_len]);
+    }
+
+    // the buffer is sized in bytes: sizing it with the char count (return_length)
+    // would overflow it whenever the result contains multi-byte chars
+    const int64_t ret_byte_len =
+        static_cast<int64_t>(full_fill_count) * fill_text_len + partial_byte_len +
+        text_len;
+    if (ret_byte_len > INT32_MAX) {
+      gdv_fn_context_set_error_msg(context, "Would overflow maximum output size");
+      *out_len = 0;
+      return "";
+    }
+    char* ret = reinterpret_cast<char*>(
+        gdv_fn_context_arena_malloc(context, static_cast<int32_t>(ret_byte_len)));
+    if (ret == nullptr) {
+      gdv_fn_context_set_error_msg(context,
+                                   "Could not allocate memory for output string");
+      *out_len = 0;
+      return "";
+    }
+
+    // fill the left side with whole copies of "fill_text", then its leading chars
+    char* cursor = ret;
+    for (int32_t i = 0; i < full_fill_count; i++) {
+      memcpy(cursor, fill_text, fill_text_len);
+      cursor += fill_text_len;
+    }
+    memcpy(cursor, fill_text, partial_byte_len);
+    cursor += partial_byte_len;
+    // after fulfilling the text, copy the main string
+    memcpy(cursor, text, text_len);
+    *out_len = static_cast<int32_t>(ret_byte_len);
+    return ret;
+  }
+}
+
 FORCE_INLINE
 const char* split_part(gdv_int64 context, const char* text, gdv_int32 text_len,
                        const char* delimiter, gdv_int32 delim_len, gdv_int32 index,
diff --git a/cpp/src/gandiva/precompiled/string_ops_test.cc b/cpp/src/gandiva/precompiled/string_ops_test.cc
index 9326aac1e0f9b..258efc6fd1db2 100644
--- a/cpp/src/gandiva/precompiled/string_ops_test.cc
+++ b/cpp/src/gandiva/precompiled/string_ops_test.cc
@@ -696,6 +696,52 @@ TEST(TestStringOps, TestLtrim) {
   EXPECT_FALSE(ctx.has_error());
 }
 
+TEST(TestStringOps, TestLpadString) {
+  gandiva::ExecutionContext ctx;
+  uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
+  gdv_int32 out_len = 0;
+  const char* out_str;
+
+  out_str = lpad(ctx_ptr, "TestString", 10, 4, "fill", 4, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "Test");
+
+  out_str = lpad(ctx_ptr, "TestString", 10, 10, "fill", 4, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "TestString");
+
+  out_str = lpad(ctx_ptr, "TestString", 0, 10, "fill", 4, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+
+  out_str = lpad(ctx_ptr, "TestString", 10, 0, "fill", 4, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+
+  out_str = lpad(ctx_ptr, "TestString", 10, -500, "fill", 4, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+
+  out_str = lpad(ctx_ptr, "TestString", 10, 500, "", 0, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "TestString");
+
+  out_str = lpad(ctx_ptr, "TestString", 10, 18, "Fill", 4, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "FillFillTestString");
+
+  out_str = lpad(ctx_ptr, "TestString", 10, 15, "Fill", 4, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "FillFTestString");
+
+  out_str = lpad(ctx_ptr, "TestString", 10, 20, "Fill", 4, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "FillFillFiTestString");
+
+  out_str = lpad(ctx_ptr, "абвгд", 10, 7, "д", 2, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "ддабвгд");
+
+  out_str = lpad(ctx_ptr, "абвгд", 10, 20, "абвгд", 10, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "абвгдабвгдабвгдабвгд");
+
+  out_str = lpad(ctx_ptr, "hello", 5, 6, "д", 2, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "дhello");
+
+  out_str = lpad(ctx_ptr, "hello", 5, 8, "дx", 3, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "дxдhello");
+}
+
 TEST(TestStringOps, TestRtrim) {
   gandiva::ExecutionContext ctx;
   uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
diff --git a/cpp/src/gandiva/precompiled/types.h b/cpp/src/gandiva/precompiled/types.h
index 1b0f96e0ab7fe..3d08417c69277 100644
--- a/cpp/src/gandiva/precompiled/types.h
+++ b/cpp/src/gandiva/precompiled/types.h
@@ -407,6 +407,10 @@ gdv_int32 locate_utf8_utf8_int32(gdv_int64 context, const char* sub_str,
                                  gdv_int32 sub_str_len, const char* str,
                                  gdv_int32 str_len, gdv_int32 start_pos);
 
+const char* lpad(gdv_int64 context, const char* text, gdv_int32 text_len,
+                 gdv_int32 return_length, const char* fill_text, gdv_int32 fill_text_len,
+                 gdv_int32* out_len);
+
 const char* replace_with_max_len_utf8_utf8_utf8(gdv_int64 context, const char* text,
                                                 gdv_int32 text_len, const char* from_str,
                                                 gdv_int32 from_str_len,