Skip to content

Commit

Permalink
Add base implementation and tests for LPAD function considering strin…
Browse files Browse the repository at this point in the history
…g input values
  • Loading branch information
jpedroantunes committed May 3, 2021
1 parent 4363fef commit aecaff6
Show file tree
Hide file tree
Showing 3 changed files with 114 additions and 0 deletions.
67 changes: 67 additions & 0 deletions cpp/src/gandiva/precompiled/string_ops.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1422,6 +1422,73 @@ const char* replace_utf8_utf8_utf8(gdv_int64 context, const char* text,
out_len);
}

// Left-pads `text` with repetitions of `fill_text` so that the result is
// `return_length` UTF-8 characters long.
//
// - `text_len` and `fill_text_len` are byte lengths; `return_length` is a
//   character count.
// - Returns an empty string when `text_len` or `return_length` is <= 0, when
//   `text` is reported as having no valid characters, or when a non-empty
//   `fill_text` is reported as having no valid characters.
// - When `text` already has `return_length` characters, or there is nothing to
//   pad with, `text` itself is returned (no copy).
// - When `text` is longer than `return_length` characters, a prefix of `text`
//   is returned (no copy).
// - Otherwise a new buffer is obtained from the context arena. On allocation
//   failure or when the result would not fit in a gdv_int32, an error is set
//   on the context and an empty string is returned.
//
// The byte length of the returned string is always written to `*out_len`.
FORCE_INLINE
const char* lpad(gdv_int64 context, const char* text, gdv_int32 text_len,
                 gdv_int32 return_length, const char* fill_text, gdv_int32 fill_text_len,
                 gdv_int32* out_len) {
  // if the text length or the defined return length (number of characters to return)
  // is <=0, then return an empty string.
  if (text_len <= 0 || return_length <= 0) {
    *out_len = 0;
    return "";
  }

  // initially counts the number of utf8 characters in the defined text and fill_text
  int32_t text_char_count = utf8_length(context, text, text_len);
  int32_t fill_char_count = utf8_length(context, fill_text, fill_text_len);
  // text_char_count is zero if input has invalid utf8 char
  // fill_char_count is zero if fill_text_len is > 0 and its value has invalid utf8 char
  if (text_char_count == 0 || (fill_text_len > 0 && fill_char_count <= 0)) {
    *out_len = 0;
    return "";
  }

  if (return_length < text_char_count) {
    // case where it truncates the result on return length.
    *out_len = utf8_byte_pos(context, text, text_len, return_length);
    return text;
  }

  if (return_length == text_char_count || fill_text_len <= 0) {
    // case where the return length is the same as the text's length, or where it
    // needs to pad the text but "fill_text" is empty: return text directly.
    // Treating a negative fill_text_len as empty also prevents the padding loop
    // below from being entered with nothing to copy.
    *out_len = text_len;
    return text;
  }

  // case (return_length > text_char_count) with a non-empty, valid fill_text.
  // The padding consists of `full_repeats` whole copies of fill_text followed by
  // the first `partial_char_count` characters of fill_text.
  const int32_t pad_char_count = return_length - text_char_count;
  const int32_t full_repeats = pad_char_count / fill_char_count;
  const int32_t partial_char_count = pad_char_count % fill_char_count;

  // Byte size of the leading `partial_char_count` characters of fill_text. Walking
  // character by character keeps the cut on a UTF-8 character boundary.
  int32_t partial_byte_count = 0;
  for (int32_t i = 0; i < partial_char_count && partial_byte_count < fill_text_len;
       ++i) {
    partial_byte_count += utf8_char_length(fill_text[partial_byte_count]);
  }

  // The buffer must be sized in BYTES, not characters: multi-byte characters in
  // either text or fill_text make the result longer than return_length bytes.
  // Computed in 64 bits so that a huge return_length cannot overflow silently.
  const int64_t total_byte_count =
      static_cast<int64_t>(full_repeats) * fill_text_len + partial_byte_count + text_len;
  if (total_byte_count > INT32_MAX) {
    gdv_fn_context_set_error_msg(context, "Would overflow maximum output size");
    *out_len = 0;
    return "";
  }

  char* ret = reinterpret_cast<gdv_binary>(
      gdv_fn_context_arena_malloc(context, static_cast<int32_t>(total_byte_count)));
  if (ret == nullptr) {
    gdv_fn_context_set_error_msg(context,
                                 "Could not allocate memory for output string");
    *out_len = 0;
    return "";
  }

  // fill the left side: whole repetitions first, then the partial prefix
  char* cursor = ret;
  for (int32_t i = 0; i < full_repeats; ++i) {
    memcpy(cursor, fill_text, fill_text_len);
    cursor += fill_text_len;
  }
  memcpy(cursor, fill_text, partial_byte_count);
  cursor += partial_byte_count;

  // after the padding, copy the main string
  memcpy(cursor, text, text_len);
  *out_len = static_cast<gdv_int32>(total_byte_count);
  return ret;
}

FORCE_INLINE
const char* split_part(gdv_int64 context, const char* text, gdv_int32 text_len,
const char* delimiter, gdv_int32 delim_len, gdv_int32 index,
Expand Down
43 changes: 43 additions & 0 deletions cpp/src/gandiva/precompiled/string_ops_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -696,6 +696,49 @@ TEST(TestStringOps, TestLtrim) {
EXPECT_FALSE(ctx.has_error());
}

// Exercises lpad with truncation, exact-length, empty/invalid length, empty fill,
// whole and partial fill repetitions, and multi-byte (Cyrillic) inputs.
TEST(TestStringOps, TestLpadString) {
  gandiva::ExecutionContext ctx;
  uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);

  // Calls lpad and materializes its (pointer, byte length) result as a std::string.
  auto padded = [ctx_ptr](const char* text, gdv_int32 text_len, gdv_int32 return_length,
                          const char* fill_text, gdv_int32 fill_text_len) {
    gdv_int32 result_len = 0;
    const char* result = lpad(ctx_ptr, text, text_len, return_length, fill_text,
                              fill_text_len, &result_len);
    return std::string(result, result_len);
  };

  // return length shorter than / equal to the text
  EXPECT_EQ(padded("TestString", 10, 4, "fill", 4), "Test");
  EXPECT_EQ(padded("TestString", 10, 10, "fill", 4), "TestString");

  // empty text or non-positive return length yields an empty string
  EXPECT_EQ(padded("TestString", 0, 10, "fill", 4), "");
  EXPECT_EQ(padded("TestString", 10, 0, "fill", 4), "");
  EXPECT_EQ(padded("TestString", 10, -500, "fill", 4), "");

  // nothing to pad with: the text comes back unchanged
  EXPECT_EQ(padded("TestString", 10, 500, "", 0), "TestString");

  // whole and partial repetitions of the fill text
  EXPECT_EQ(padded("TestString", 10, 18, "Fill", 4), "FillFillTestString");
  EXPECT_EQ(padded("TestString", 10, 15, "Fill", 4), "FillFTestString");
  EXPECT_EQ(padded("TestString", 10, 20, "Fill", 4), "FillFillFiTestString");

  // multi-byte characters in the text and/or the fill text
  EXPECT_EQ(padded("абвгд", 10, 7, "д", 2), "ддабвгд");
  EXPECT_EQ(padded("абвгд", 10, 20, "абвгд", 10), "абвгдабвгдабвгдабвгд");
  EXPECT_EQ(padded("hello", 5, 6, "д", 2), "дhello");
}

TEST(TestStringOps, TestRtrim) {
gandiva::ExecutionContext ctx;
uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
Expand Down
4 changes: 4 additions & 0 deletions cpp/src/gandiva/precompiled/types.h
Original file line number Diff line number Diff line change
Expand Up @@ -407,6 +407,10 @@ gdv_int32 locate_utf8_utf8_int32(gdv_int64 context, const char* sub_str,
gdv_int32 sub_str_len, const char* str,
gdv_int32 str_len, gdv_int32 start_pos);

// Left-pads `text` with repetitions of `fill_text` until the result is
// `return_length` UTF-8 characters long; when `text` already has more characters
// than that, it is truncated to `return_length` characters instead.
// `text_len` and `fill_text_len` are byte lengths. The byte length of the
// returned string is written to `out_len`.
const char* lpad(gdv_int64 context, const char* text, gdv_int32 text_len,
gdv_int32 return_length, const char* fill_text, gdv_int32 fill_text_len,
gdv_int32* out_len);

const char* replace_with_max_len_utf8_utf8_utf8(gdv_int64 context, const char* text,
gdv_int32 text_len, const char* from_str,
gdv_int32 from_str_len,
Expand Down

0 comments on commit aecaff6

Please sign in to comment.