Skip to content

Commit

Permalink
Move lowercasing of Iceberg partitioning to PartitionFields
Browse files Browse the repository at this point in the history
  • Loading branch information
mdesmet authored and findepi committed Aug 8, 2022
1 parent 6093129 commit da9cd47
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 22 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@

import javax.inject.Inject;

import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Optional;
Expand Down Expand Up @@ -68,9 +67,7 @@ public IcebergTableProperties(
List.class,
ImmutableList.of(),
false,
value -> ((Collection<?>) value).stream()
.map(name -> ((String) name).toLowerCase(ENGLISH))
.collect(toImmutableList()),
value -> (List<?>) value,
value -> value))
.add(stringProperty(
LOCATION_PROPERTY,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,21 +26,22 @@
import static com.google.common.collect.ImmutableList.toImmutableList;
import static java.lang.Integer.parseInt;
import static java.lang.String.format;
import static java.util.Locale.ENGLISH;

public final class PartitionFields
{
private static final String NAME = "[a-z_][a-z0-9_]*";
private static final String NAME = "[a-zA-Z_][a-zA-Z0-9_]*";
private static final String FUNCTION_ARGUMENT_NAME = "\\((" + NAME + ")\\)";
private static final String FUNCTION_ARGUMENT_NAME_AND_INT = "\\((" + NAME + "), *(\\d+)\\)";

private static final Pattern IDENTITY_PATTERN = Pattern.compile(NAME);
private static final Pattern YEAR_PATTERN = Pattern.compile("year" + FUNCTION_ARGUMENT_NAME);
private static final Pattern MONTH_PATTERN = Pattern.compile("month" + FUNCTION_ARGUMENT_NAME);
private static final Pattern DAY_PATTERN = Pattern.compile("day" + FUNCTION_ARGUMENT_NAME);
private static final Pattern HOUR_PATTERN = Pattern.compile("hour" + FUNCTION_ARGUMENT_NAME);
private static final Pattern BUCKET_PATTERN = Pattern.compile("bucket" + FUNCTION_ARGUMENT_NAME_AND_INT);
private static final Pattern TRUNCATE_PATTERN = Pattern.compile("truncate" + FUNCTION_ARGUMENT_NAME_AND_INT);
private static final Pattern VOID_PATTERN = Pattern.compile("void" + FUNCTION_ARGUMENT_NAME);
private static final Pattern YEAR_PATTERN = Pattern.compile("(?i:year)" + FUNCTION_ARGUMENT_NAME);
private static final Pattern MONTH_PATTERN = Pattern.compile("(?i:month)" + FUNCTION_ARGUMENT_NAME);
private static final Pattern DAY_PATTERN = Pattern.compile("(?i:day)" + FUNCTION_ARGUMENT_NAME);
private static final Pattern HOUR_PATTERN = Pattern.compile("(?i:hour)" + FUNCTION_ARGUMENT_NAME);
private static final Pattern BUCKET_PATTERN = Pattern.compile("(?i:bucket)" + FUNCTION_ARGUMENT_NAME_AND_INT);
private static final Pattern TRUNCATE_PATTERN = Pattern.compile("(?i:truncate)" + FUNCTION_ARGUMENT_NAME_AND_INT);
private static final Pattern VOID_PATTERN = Pattern.compile("(?i:void)" + FUNCTION_ARGUMENT_NAME);

private static final Pattern ICEBERG_BUCKET_PATTERN = Pattern.compile("bucket\\[(\\d+)]");
private static final Pattern ICEBERG_TRUNCATE_PATTERN = Pattern.compile("truncate\\[(\\d+)]");
Expand All @@ -60,20 +61,25 @@ public static void parsePartitionField(PartitionSpec.Builder builder, String fie
{
@SuppressWarnings("PointlessBooleanExpression")
boolean matched = false ||
tryMatch(field, IDENTITY_PATTERN, match -> builder.identity(match.group())) ||
tryMatch(field, YEAR_PATTERN, match -> builder.year(match.group(1))) ||
tryMatch(field, MONTH_PATTERN, match -> builder.month(match.group(1))) ||
tryMatch(field, DAY_PATTERN, match -> builder.day(match.group(1))) ||
tryMatch(field, HOUR_PATTERN, match -> builder.hour(match.group(1))) ||
tryMatch(field, BUCKET_PATTERN, match -> builder.bucket(match.group(1), parseInt(match.group(2)))) ||
tryMatch(field, TRUNCATE_PATTERN, match -> builder.truncate(match.group(1), parseInt(match.group(2)))) ||
tryMatch(field, VOID_PATTERN, match -> builder.alwaysNull(match.group(1))) ||
tryMatch(field, IDENTITY_PATTERN, match -> builder.identity(fromIdentifier(match.group()))) ||
tryMatch(field, YEAR_PATTERN, match -> builder.year(fromIdentifier(match.group(1)))) ||
tryMatch(field, MONTH_PATTERN, match -> builder.month(fromIdentifier(match.group(1)))) ||
tryMatch(field, DAY_PATTERN, match -> builder.day(fromIdentifier(match.group(1)))) ||
tryMatch(field, HOUR_PATTERN, match -> builder.hour(fromIdentifier(match.group(1)))) ||
tryMatch(field, BUCKET_PATTERN, match -> builder.bucket(fromIdentifier(match.group(1)), parseInt(match.group(2)))) ||
tryMatch(field, TRUNCATE_PATTERN, match -> builder.truncate(fromIdentifier(match.group(1)), parseInt(match.group(2)))) ||
tryMatch(field, VOID_PATTERN, match -> builder.alwaysNull(fromIdentifier(match.group(1)))) ||
false;
if (!matched) {
throw new IllegalArgumentException("Invalid partition field declaration: " + field);
}
}

/**
 * Normalizes an identifier taken from a partition field declaration by lower-casing it.
 * <p>
 * {@code parsePartitionField} applies this to the column name captured by each pattern
 * before passing it to the {@code PartitionSpec.Builder}, so declarations written with
 * upper- or mixed-case column names resolve to the lower-case name.
 *
 * @param identifier the column name as written in the declaration
 * @return {@code identifier} converted to lower case
 */
private static String fromIdentifier(String identifier)
{
// An explicit locale is passed so the result does not depend on the JVM default locale.
return identifier.toLowerCase(ENGLISH);
}

private static boolean tryMatch(CharSequence value, Pattern pattern, Consumer<MatchResult> match)
{
Matcher matcher = pattern.matcher(value);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ public void testParse()
{
assertParse("order_key", partitionSpec(builder -> builder.identity("order_key")));
assertParse("comment", partitionSpec(builder -> builder.identity("comment")));
assertParse("COMMENT", partitionSpec(builder -> builder.identity("comment")), "comment");
assertParse("year(ts)", partitionSpec(builder -> builder.year("ts")));
assertParse("month(ts)", partitionSpec(builder -> builder.month("ts")));
assertParse("day(ts)", partitionSpec(builder -> builder.day("ts")));
Expand All @@ -47,6 +48,14 @@ public void testParse()
assertParse("truncate(comment, 13)", partitionSpec(builder -> builder.truncate("comment", 13)));
assertParse("truncate(order_key, 88)", partitionSpec(builder -> builder.truncate("order_key", 88)));
assertParse("void(order_key)", partitionSpec(builder -> builder.alwaysNull("order_key")));
assertParse("YEAR(ts)", partitionSpec(builder -> builder.year("ts")), "year(ts)");
assertParse("MONtH(ts)", partitionSpec(builder -> builder.month("ts")), "month(ts)");
assertParse("DaY(ts)", partitionSpec(builder -> builder.day("ts")), "day(ts)");
assertParse("HoUR(ts)", partitionSpec(builder -> builder.hour("ts")), "hour(ts)");
assertParse("BuCKET(order_key, 42)", partitionSpec(builder -> builder.bucket("order_key", 42)), "bucket(order_key, 42)");
assertParse("TRuncate(comment, 13)", partitionSpec(builder -> builder.truncate("comment", 13)), "truncate(comment, 13)");
assertParse("TRUNCATE(order_key, 88)", partitionSpec(builder -> builder.truncate("order_key", 88)), "truncate(order_key, 88)");
assertParse("VOId(order_key)", partitionSpec(builder -> builder.alwaysNull("order_key")), "void(order_key)");

assertInvalid("bucket()", "Invalid partition field declaration: bucket()");
assertInvalid("abc", "Cannot find source column: abc");
Expand All @@ -55,13 +64,20 @@ public void testParse()
assertInvalid("bucket(notes, 88)", "Cannot bucket by type: list<string>");
assertInvalid("truncate(ts, 13)", "Cannot truncate type: timestamp");
assertInvalid("year(order_key)", "Cannot partition type long by year");
assertInvalid("ABC", "Cannot find source column: abc");
assertInvalid("year(ABC)", "Cannot find source column: abc");
}

private static void assertParse(String value, PartitionSpec expected)
private static void assertParse(String value, PartitionSpec expected, String canonicalRepresentation)
{
assertEquals(expected.fields().size(), 1);
assertEquals(parseField(value), expected);
assertEquals(getOnlyElement(toPartitionFields(expected)), value);
assertEquals(getOnlyElement(toPartitionFields(expected)), canonicalRepresentation);
}

/**
 * Convenience overload for declarations that are expected to round-trip unchanged:
 * delegates to the three-argument {@code assertParse}, passing {@code value} itself
 * as the expected canonical representation.
 *
 * @param value the partition field declaration to parse
 * @param expected the partition spec that parsing {@code value} is expected to produce
 */
private static void assertParse(String value, PartitionSpec expected)
{
assertParse(value, expected, value);
}

private static void assertInvalid(String value, String message)
Expand Down

0 comments on commit da9cd47

Please sign in to comment.