-
Notifications
You must be signed in to change notification settings - Fork 191
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Auto-create tables when batch insertion fails. #248
Changes from all commits
0a21bd5
d855548
b7c851d
ed3c9b2
c45aca0
a2538ed
8709d04
0b2ffec
c612f69
5a8ce72
88f8c63
9798333
ca32148
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -32,7 +32,6 @@ | |
import java.util.ArrayList; | ||
import java.util.List; | ||
import java.util.Map; | ||
import java.util.concurrent.atomic.AtomicInteger; | ||
|
||
/** | ||
* Batch Table Writer that uploads records to GCS as a blob | ||
|
@@ -45,6 +44,7 @@ public class GCSBatchTableWriter implements Runnable { | |
|
||
private final String bucketName; | ||
private final String blobName; | ||
private final String topic; | ||
|
||
private final List<RowToInsert> rows; | ||
private final GCSToBQWriter writer; | ||
|
@@ -56,15 +56,18 @@ public class GCSBatchTableWriter implements Runnable { | |
* @param bucketName the name of the GCS bucket where the blob should be uploaded | ||
* @param baseBlobName the base name of the blob in which the serialized rows should be uploaded. | ||
* The full name is [baseBlobName]_[writerId]_ | ||
* @param topic Kafka record topic | ||
*/ | ||
private GCSBatchTableWriter(List<RowToInsert> rows, | ||
GCSToBQWriter writer, | ||
TableId tableId, | ||
String bucketName, | ||
String baseBlobName) { | ||
String baseBlobName, | ||
String topic) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: add doc to topic There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added doc~ |
||
this.tableId = tableId; | ||
this.bucketName = bucketName; | ||
this.blobName = baseBlobName; | ||
this.topic = topic; | ||
|
||
this.rows = rows; | ||
this.writer = writer; | ||
|
@@ -73,7 +76,7 @@ private GCSBatchTableWriter(List<RowToInsert> rows, | |
@Override | ||
public void run() { | ||
try { | ||
writer.writeRows(rows, tableId, bucketName, blobName); | ||
writer.writeRows(rows, tableId, bucketName, blobName, topic); | ||
} catch (ConnectException ex) { | ||
throw new ConnectException("Failed to write rows to GCS", ex); | ||
} catch (InterruptedException ex) { | ||
|
@@ -87,6 +90,7 @@ public void run() { | |
public static class Builder implements TableWriterBuilder { | ||
private final String bucketName; | ||
private String blobName; | ||
private String topic; | ||
|
||
private final TableId tableId; | ||
|
||
|
@@ -101,16 +105,19 @@ public static class Builder implements TableWriterBuilder { | |
* @param tableId The bigquery table to be written to. | ||
* @param gcsBucketName The GCS bucket to write to. | ||
* @param gcsBlobName The name of the GCS blob to write. | ||
* @param topic Kafka record topic | ||
* @param recordConverter the {@link RecordConverter} to use. | ||
*/ | ||
public Builder(GCSToBQWriter writer, | ||
TableId tableId, | ||
String gcsBucketName, | ||
String gcsBlobName, | ||
String topic, | ||
RecordConverter<Map<String, Object>> recordConverter) { | ||
|
||
this.bucketName = gcsBucketName; | ||
this.blobName = gcsBlobName; | ||
this.topic = topic; | ||
|
||
this.tableId = tableId; | ||
|
||
|
@@ -133,7 +140,7 @@ public void addRow(RowToInsert rowToInsert) { | |
} | ||
|
||
public GCSBatchTableWriter build() { | ||
return new GCSBatchTableWriter(rows, writer, tableId, bucketName, blobName); | ||
return new GCSBatchTableWriter(rows, writer, tableId, bucketName, blobName, topic); | ||
} | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,6 +19,7 @@ | |
|
||
|
||
import com.google.cloud.bigquery.BigQuery; | ||
import com.google.cloud.bigquery.TableId; | ||
import com.google.cloud.bigquery.BigQueryError; | ||
import com.google.cloud.bigquery.BigQueryException; | ||
import com.google.cloud.bigquery.InsertAllRequest; | ||
|
@@ -37,16 +38,20 @@ | |
import java.util.Map; | ||
|
||
/** | ||
* A {@link BigQueryWriter} capable of updating BigQuery table schemas. | ||
* A {@link BigQueryWriter} capable of updating BigQuery table schemas and creating non-existent tables automatically. | ||
*/ | ||
public class AdaptiveBigQueryWriter extends BigQueryWriter { | ||
private static final Logger logger = LoggerFactory.getLogger(AdaptiveBigQueryWriter.class); | ||
|
||
// The maximum number of retries we will attempt to write rows after updating a BQ table schema. | ||
private static final int AFTER_UPDATE_RETY_LIMIT = 5; | ||
// The maximum number of retries we will attempt to write rows after creating a table or updating a BQ table schema. | ||
private static final int RETRY_LIMIT = 5; | ||
// Wait for about 30s between each retry since both creating table and updating schema take up to 2~3 minutes to take effect. | ||
private static final int RETRY_WAIT_TIME = 30000; | ||
|
||
private final BigQuery bigQuery; | ||
private final SchemaManager schemaManager; | ||
private final boolean autoUpdateSchemas; | ||
private final boolean autoCreateTables; | ||
|
||
/** | ||
* @param bigQuery Used to send write requests to BigQuery. | ||
|
@@ -57,10 +62,14 @@ public class AdaptiveBigQueryWriter extends BigQueryWriter { | |
public AdaptiveBigQueryWriter(BigQuery bigQuery, | ||
SchemaManager schemaManager, | ||
int retry, | ||
long retryWait) { | ||
long retryWait, | ||
boolean autoUpdateSchemas, | ||
boolean autoCreateTables) { | ||
super(retry, retryWait); | ||
this.bigQuery = bigQuery; | ||
this.schemaManager = schemaManager; | ||
this.autoUpdateSchemas = autoUpdateSchemas; | ||
this.autoCreateTables = autoCreateTables; | ||
} | ||
|
||
private boolean isTableMissingSchema(BigQueryException exception) { | ||
|
@@ -69,6 +78,12 @@ private boolean isTableMissingSchema(BigQueryException exception) { | |
return exception.getReason() != null && exception.getReason().equalsIgnoreCase("invalid"); | ||
} | ||
|
||
private boolean isTableNotExistedException(BigQueryException exception) { | ||
// If a table does not exist, a BigQueryException is raised indicating that the input is notFound | ||
// Referring to Google Cloud Error Codes Doc: https://cloud.google.com/bigquery/docs/error-messages?hl=en | ||
return exception.getCode() == 404; | ||
} | ||
|
||
/** | ||
* Sends the request to BigQuery, then checks the response to see if any errors have occurred. If | ||
* any have, and all errors can be blamed upon invalid columns in the rows sent, attempts to | ||
|
@@ -86,21 +101,24 @@ public Map<Long, List<BigQueryError>> performWriteRequest( | |
try { | ||
request = createInsertAllRequest(tableId, rows); | ||
writeResponse = bigQuery.insertAll(request); | ||
// Should only perform one schema update attempt; may have to continue insert attempts due to | ||
// BigQuery schema updates taking up to two minutes to take effect | ||
// Should only perform one schema update attempt. | ||
if (writeResponse.hasErrors() | ||
&& onlyContainsInvalidSchemaErrors(writeResponse.getInsertErrors())) { | ||
&& onlyContainsInvalidSchemaErrors(writeResponse.getInsertErrors()) && autoUpdateSchemas) { | ||
attemptSchemaUpdate(tableId, topic); | ||
} | ||
} catch (BigQueryException exception) { | ||
if (isTableMissingSchema(exception)) { | ||
// Should only perform one table creation attempt. | ||
if (isTableNotExistedException(exception) && autoCreateTables && bigQuery.getTable(tableId.getBaseTableId()) == null) { | ||
attemptTableCreate(tableId.getBaseTableId(), topic); | ||
} else if (isTableMissingSchema(exception) && autoUpdateSchemas) { | ||
attemptSchemaUpdate(tableId, topic); | ||
} else { | ||
throw exception; | ||
} | ||
} | ||
|
||
// Schema update might be delayed, so multiple insertion attempts may be necessary | ||
// Creating tables or updating table schemas in BigQuery takes up to 2~3 minutes to take effect, | ||
// so multiple insertion attempts may be necessary. | ||
int attemptCount = 0; | ||
while (writeResponse == null || writeResponse.hasErrors()) { | ||
logger.trace("insertion failed"); | ||
|
@@ -117,10 +135,15 @@ && onlyContainsInvalidSchemaErrors(writeResponse.getInsertErrors())) { | |
return writeResponse.getInsertErrors(); | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Updated the doc accordingly~ |
||
attemptCount++; | ||
if (attemptCount >= AFTER_UPDATE_RETY_LIMIT) { | ||
if (attemptCount >= RETRY_LIMIT) { | ||
throw new BigQueryConnectException( | ||
"Failed to write rows after BQ schema update within " | ||
+ AFTER_UPDATE_RETY_LIMIT + " attempts for: " + tableId.getBaseTableId()); | ||
+ RETRY_LIMIT + " attempts for: " + tableId.getBaseTableId()); | ||
} | ||
try { | ||
Thread.sleep(RETRY_WAIT_TIME); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If retry is caused by schema update, we don't need
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. According to the comment here https://github.com/wepay/kafka-connect-bigquery/blob/master/kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/write/row/AdaptiveBigQueryWriter.java#L90-L91, updating schema in bigquery is also an async process which takes up to 2 min. I think we'll want to wait for some time (like 30s) for each retry for updating schema as well. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: I would suggest adding more comments in the function or cleaning it up a little bit. It is not very easy to read for people unfamiliar with it. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Cleaned up the existing comments to the best of my ability. |
||
} catch (InterruptedException e) { | ||
// no-op, we want to keep retrying the insert | ||
} | ||
} | ||
logger.debug("table insertion completed successfully"); | ||
|
@@ -136,6 +159,16 @@ private void attemptSchemaUpdate(PartitionedTableId tableId, String topic) { | |
} | ||
} | ||
|
||
private void attemptTableCreate(TableId tableId, String topic) { | ||
try { | ||
schemaManager.createTable(tableId, topic); | ||
logger.info("Table {} does not exist, auto-created table for topic {}", tableId, topic); | ||
} catch (BigQueryException exception) { | ||
throw new BigQueryConnectException( | ||
"Failed to create table " + tableId, exception); | ||
} | ||
} | ||
|
||
/* | ||
* Currently, the only way to determine the cause of an insert all failure is by examining the map | ||
* object returned by the insertErrors() method of an insert all response. The only way to | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@bingqinzhou @mtagle @wicknicks Seems like now there is no way to use this connector without specifying
schema registry
.getGcsWriter()
is being called at start of the connector and which expects schema registry details. I am using the connector forjson
data which doesn't require schema registry and connector is throwing exception.