Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
First prototype of hive data importer
First prototype of the Hive data importer. Includes:
- StorageManager with metadata stored in H2; imports from this class are transactional.
- StorageManager performs a two-phase import that uses first-pass statistics to determine the optimal encoding.
- A Hive wrapper around StorageManager (HiveImportManager) to track which partitions have been loaded.
- Retry handling for import errors.
- A HACKY way of providing temporary query support until the process is formalized with the actual QueryPlanner.
There are a number of very hacky things in this commit, but it should be sufficient for running imports and providing the foundation for running queries on the stored data.
- Loading branch information
Showing
7 changed files
with
870 additions
and
27 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
68 changes: 68 additions & 0 deletions
68
presto-main/src/main/java/com/facebook/presto/RetryDriver.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
package com.facebook.presto; | ||
|
||
import io.airlift.log.Logger; | ||
|
||
import java.util.concurrent.Callable; | ||
import java.util.concurrent.TimeUnit; | ||
|
||
import static com.google.common.base.Preconditions.checkArgument; | ||
import static com.google.common.base.Preconditions.checkNotNull; | ||
|
||
public class RetryDriver | ||
{ | ||
private static final Logger log = Logger.get(RetryDriver.class); | ||
|
||
private RetryDriver() | ||
{ | ||
} | ||
|
||
public static <V> V runWithRetry(Callable<V> callable) | ||
throws Exception | ||
{ | ||
return runWithRetry(callable, "<default>"); | ||
} | ||
|
||
public static <V> V runWithRetry(Callable<V> callable, int maxRetryAttempts) | ||
throws Exception | ||
{ | ||
return runWithRetry(callable, "<default>", maxRetryAttempts); | ||
} | ||
|
||
public static <V> V runWithRetry(Callable<V> callable, String callableName) | ||
throws Exception | ||
{ | ||
return runWithRetry(callable, callableName, 10); | ||
} | ||
|
||
public static <V> V runWithRetry(Callable<V> callable, String callableName, int maxRetryAttempts) | ||
throws Exception | ||
{ | ||
return runWithRetry(callable, callableName, maxRetryAttempts, 1); | ||
} | ||
|
||
public static <V> V runWithRetry(Callable<V> callable, String callableName, int maxRetryAttempts, int sleepSecs) | ||
throws Exception | ||
{ | ||
checkNotNull(callable, "callable is null"); | ||
checkNotNull(callableName, "callableName is null"); | ||
checkArgument(maxRetryAttempts > 0, "maxRetryAttempts must be greater than zero"); | ||
checkArgument(sleepSecs >= 0, "sleepSecs must be at least than zero"); | ||
|
||
int attempt = 0; | ||
while (true) { | ||
attempt++; | ||
try { | ||
return callable.call(); | ||
} | ||
catch (Exception e) { | ||
if (attempt == maxRetryAttempts) { | ||
throw e; | ||
} | ||
else { | ||
log.warn("Failed on executing %s with attempt %d, will retry. Exception: %s", callableName, attempt, e.getMessage()); | ||
} | ||
TimeUnit.SECONDS.sleep(sleepSecs); | ||
} | ||
} | ||
} | ||
} |
176 changes: 176 additions & 0 deletions
176
presto-main/src/main/java/com/facebook/presto/metadata/HiveImportManager.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,176 @@ | ||
package com.facebook.presto.metadata; | ||
|
||
import com.facebook.presto.Tuple; | ||
import com.facebook.presto.TupleInfo; | ||
import com.facebook.presto.block.StaticTupleAppendingTupleStream; | ||
import com.facebook.presto.block.TupleStream; | ||
import com.facebook.presto.hive.HiveClient; | ||
import com.facebook.presto.hive.PartitionChunk; | ||
import com.facebook.presto.hive.RecordIterator; | ||
import com.facebook.presto.hive.SchemaField; | ||
import com.facebook.presto.ingest.HiveTupleStream; | ||
import com.facebook.presto.slice.Slices; | ||
import com.google.common.base.Charsets; | ||
import com.google.inject.Inject; | ||
import org.skife.jdbi.v2.Handle; | ||
import org.skife.jdbi.v2.IDBI; | ||
import org.skife.jdbi.v2.tweak.HandleCallback; | ||
|
||
import java.util.List; | ||
import java.util.concurrent.Callable; | ||
|
||
import static com.facebook.presto.RetryDriver.runWithRetry; | ||
import static com.google.common.base.Preconditions.checkNotNull; | ||
|
||
public class HiveImportManager | ||
{ | ||
private final HiveClient hiveClient; | ||
private final StorageManager storageManager; | ||
private final HiveImportRegistry hiveImportRegistry; | ||
|
||
@Inject | ||
public HiveImportManager(HiveClient hiveClient, StorageManager storageManager, IDBI dbi) | ||
{ | ||
this.hiveClient = checkNotNull(hiveClient, "hiveClient is null"); | ||
this.storageManager = checkNotNull(storageManager, "storageManager is null"); | ||
hiveImportRegistry = new HiveImportRegistry(checkNotNull(dbi, "dbi is null")); | ||
} | ||
|
||
public long importPartition(final String databaseName, final String tableName, final String partitionName) | ||
throws Exception | ||
{ | ||
checkNotNull(databaseName, "databaseName is null"); | ||
checkNotNull(tableName, "tableName is null"); | ||
checkNotNull(partitionName, "partitionName is null"); | ||
|
||
// TODO: prevent multiple simultaneous imports on same partition (race condition) | ||
if (hiveImportRegistry.isPartitionImported(databaseName, tableName, partitionName)) { | ||
// Already imported | ||
return 0; | ||
} | ||
|
||
final Tuple partitionTuple = TupleInfo.SINGLE_VARBINARY.builder() | ||
.append(Slices.wrappedBuffer(partitionName.getBytes(Charsets.UTF_8))) | ||
.build(); | ||
|
||
List<PartitionChunk> chunks = runWithRetry(new Callable<List<PartitionChunk>>() | ||
{ | ||
@Override | ||
public List<PartitionChunk> call() | ||
throws Exception | ||
{ | ||
return hiveClient.getPartitionChunks(databaseName, tableName, partitionName); | ||
} | ||
}); | ||
|
||
final List<SchemaField> schemaFields = runWithRetry(new Callable<List<SchemaField>>() | ||
{ | ||
@Override | ||
public List<SchemaField> call() | ||
throws Exception | ||
{ | ||
return hiveClient.getTableSchema(databaseName, tableName); | ||
} | ||
}); | ||
|
||
long rowCount = 0; | ||
// TODO: right now, failures can result in partial partitions to be loaded (smallest unit needs to be transactional) | ||
for (final PartitionChunk chunk : chunks) { | ||
rowCount += runWithRetry(new Callable<Long>() | ||
{ | ||
@Override | ||
public Long call() | ||
throws Exception | ||
{ | ||
try (RecordIterator records = hiveClient.getRecords(chunk)) { | ||
TupleStream sourceTupleStream = new StaticTupleAppendingTupleStream( | ||
new HiveTupleStream(records, schemaFields), | ||
partitionTuple | ||
); | ||
// TODO: add layer to break up incoming TupleStream based on size | ||
return storageManager.importTableShard(sourceTupleStream, databaseName, tableName); | ||
} | ||
} | ||
}); | ||
} | ||
hiveImportRegistry.markPartitionImported(databaseName, tableName, partitionName); | ||
return rowCount; | ||
} | ||
|
||
// TODO: the import registry should use the CHUNK as the smallest unit of import | ||
private static class HiveImportRegistry | ||
{ | ||
private final IDBI dbi; | ||
|
||
public HiveImportRegistry(IDBI dbi) | ||
{ | ||
this.dbi = checkNotNull(dbi, "dbi is null"); | ||
initializeDatabaseIfNecessary(); | ||
} | ||
|
||
private void initializeDatabaseIfNecessary() | ||
{ | ||
dbi.withHandle(new HandleCallback<Void>() | ||
{ | ||
@Override | ||
public Void withHandle(Handle handle) | ||
throws Exception | ||
{ | ||
// TODO: use ids for database, table, and partition | ||
handle.createStatement("CREATE TABLE IF NOT EXISTS imported_hive_partitions (database VARCHAR(256), table VARCHAR(256), partition VARCHAR(256), PRIMARY KEY(database, table, partition))") | ||
.execute(); | ||
return null; | ||
} | ||
}); | ||
} | ||
|
||
public boolean isPartitionImported(final String databaseName, final String tableName, final String partitionName) | ||
{ | ||
checkNotNull(databaseName, "databaseName is null"); | ||
checkNotNull(tableName, "tableName is null"); | ||
checkNotNull(partitionName, "partitionName is null"); | ||
|
||
return dbi.withHandle(new HandleCallback<Boolean>() | ||
{ | ||
@Override | ||
public Boolean withHandle(Handle handle) | ||
throws Exception | ||
{ | ||
return !handle.createQuery( | ||
"SELECT * " + | ||
"FROM imported_hive_partitions " + | ||
"WHERE database = :database " + | ||
"AND table = :table " + | ||
"AND partition = :partition") | ||
.bind("database", databaseName) | ||
.bind("table", tableName) | ||
.bind("partition", partitionName) | ||
.list() | ||
.isEmpty(); | ||
} | ||
}); | ||
} | ||
|
||
public void markPartitionImported(final String databaseName, final String tableName, final String partitionName) | ||
{ | ||
checkNotNull(databaseName, "databaseName is null"); | ||
checkNotNull(tableName, "tableName is null"); | ||
checkNotNull(partitionName, "partitionName is null"); | ||
|
||
dbi.withHandle(new HandleCallback<Void>() | ||
{ | ||
@Override | ||
public Void withHandle(Handle handle) | ||
throws Exception | ||
{ | ||
handle.createStatement("INSERT INTO imported_hive_partitions (database, table, partition) values (:database, :table, :partition)") | ||
.bind("database", databaseName) | ||
.bind("table", tableName) | ||
.bind("partition", partitionName) | ||
.execute(); | ||
return null; | ||
} | ||
}); | ||
} | ||
} | ||
} |
Oops, something went wrong.