In [0]:
# Yes/No toggles controlling the two expensive stages of this notebook.
# Both default to "No", so nothing is generated or resampled unless the
# corresponding widget is explicitly switched to "Yes".
dbutils.widgets.dropdown("generate_data", "No", ["Yes", "No"])
dbutils.widgets.dropdown("resample_data", "No", ["Yes", "No"])

# Convert the widget strings into booleans for use as cell guards.
generate_data = "Yes" == dbutils.widgets.get("generate_data")
resample_data = "Yes" == dbutils.widgets.get("resample_data")


# Generate Test Data for Benchmarking

This notebook generates benchmark test data by replicating the `samples.nyctaxi.trips` dataset into a set of target tables and then sampling incremental source (update) tables from them.

For reference, using the following parameters, the data generation took about 90 minutes.

* `target_data_size`: `1000` (MB)
* `source_data_size`: `50` (MB)
* `number_of_tables`: `20` 
* `number_of_updates`: `50` (50 updates per table × 20 tables = 1,000 source tables in total)

In [0]:
# Text widgets as (name, default value, label). The first four size the
# benchmark data; the last three decide where the tables are written.
# They are created in this order.
_TEXT_WIDGETS = (
  ("target_data_size", "50000", "Size of Target Table (MB)"),
  ("source_data_size", "1000", "Size of Incremental Source Data (MB)"),
  ("number_of_tables", "20", "Number of target tables to generate"),
  ("number_of_updates", "50", "Number of source tables (per target)"),
  ("catalog", "field_demos", "Catalog to use"),
  ("schema", "tauherng", "Schema to use"),
  ("target_table_prefix", "merge", "Table Prefix to use"),
)
for _widget_name, _widget_default, _widget_label in _TEXT_WIDGETS:
  dbutils.widgets.text(_widget_name, _widget_default, _widget_label)


def _int_widget(name):
  """Return the current value of the widget `name`, parsed as an int."""
  return int(dbutils.widgets.get(name))


# Numeric settings (widget values are strings, so they are parsed here).
NUM_TABLES_TO_GENERATE = _int_widget("number_of_tables")
NUM_UPDATES_TO_GENERATE = _int_widget("number_of_updates")
TARGET_DATA_SIZE = _int_widget("target_data_size")
SOURCE_DATA_SIZE = _int_widget("source_data_size")

# Naming settings used to build fully qualified table names.
CATALOG = dbutils.widgets.get("catalog")
SCHEMA = dbutils.widgets.get("schema")
TABLE_PREFIX = dbutils.widgets.get("target_table_prefix")

In [0]:
from pyspark.sql import functions as F
import json

In [0]:
%sql
-- Make sure the destination catalog and schema exist before any table is written.
-- The names come from the `catalog` and `schema` widgets via parameter markers.
CREATE CATALOG IF NOT EXISTS IDENTIFIER(:catalog);
-- The separator is a single-quoted string literal: a double-quoted "." is parsed
-- as an identifier when double-quoted identifiers are enabled.
CREATE SCHEMA IF NOT EXISTS IDENTIFIER(CONCAT(:catalog, '.', :schema))

In [0]:
# Size of the sample table as reported by DESCRIBE DETAIL, converted from
# bytes to megabytes (1024 * 1024 bytes).
_source_detail = spark.sql("describe detail samples.nyctaxi.trips").select("sizeInBytes")
_source_size_bytes = _source_detail.collect()[0][0]
df_size_mb = _source_size_bytes / 1024 / 1024

In [0]:
# The sample trips table that all generated data is derived from.
original_df = spark.table("samples.nyctaxi.trips")

In [0]:
# How many copies of the sample table are needed to reach the requested
# target size; the fractional part is dropped.
_copies_needed = TARGET_DATA_SIZE / df_size_mb
num_duplications = int(_copies_needed)

In [0]:
# Names of the target tables produced by this notebook, recorded so they can
# be duplicated later for serverless & classic.
created_tables: list = list()

In [0]:
# Build the target tables: grow a seed table to roughly TARGET_DATA_SIZE by
# repeated doubling, then copy it NUM_TABLES_TO_GENERATE times with fresh UUIDs.
# Guarded by the `generate_data` flag derived from the widget of the same name
# (the same pattern the resample cell uses with `resample_data`).
if generate_data:
  print("Generating data")

  # Start from an empty list so that re-running this cell does not accumulate
  # duplicate table names in memory or in created_tables.json.
  created_tables = []

  # Seed table. It is created without columns; the appends below carry
  # `mergeSchema`, presumably so the first write can supply the schema.
  first_table_name = f"{CATALOG}.{SCHEMA}.{TABLE_PREFIX}_target_table_0"
  spark.sql(f"DROP TABLE IF EXISTS {first_table_name}")
  spark.sql(f"CREATE TABLE {first_table_name}")
  original_df.write.mode("append").option('mergeSchema', "true").saveAsTable(first_table_name)

  # Each pass appends the table's current contents to itself, doubling the
  # number of copies. The loop stops at the first power of two that is
  # >= num_duplications, so the final size can overshoot the target.
  table_dupes = 1
  while table_dupes < num_duplications:
    df = spark.read.table(first_table_name)
    df.write.mode("append").option('mergeSchema', "true").saveAsTable(first_table_name)
    table_dupes *= 2
    print(table_dupes)

  for table_num in range(NUM_TABLES_TO_GENERATE):
    table_name = f"{CATALOG}.{SCHEMA}.{TABLE_PREFIX}_target_table_{table_num+1}"
    spark.sql(f"DROP TABLE IF EXISTS {table_name}")
    # Just re-use the same data, but re-generate all UUIDs
    df = spark.read.table(first_table_name).withColumn("uuid", F.expr("uuid()"))
    df.write.mode("append").option('mergeSchema', "true").saveAsTable(table_name)
    created_tables.append(table_name)

  spark.sql(f"DROP TABLE {first_table_name}") # We don't need this anymore. Save space

  # Persist the list of generated table names next to the notebook.
  with open("created_tables.json", "w") as final:
    json.dump(created_tables, final)

In [0]:
# Now we randomly pick some of the target data to update and save as an incremental source.
# For every target table, NUM_UPDATES_TO_GENERATE independent random samples are
# drawn, their zip codes are shifted so the rows differ from the target, and each
# sample is written (overwriting any previous run) to its own source table.

if resample_data:
  # The sampling fraction is the same for every table and update, so it is
  # computed and validated once, before any table is written.
  if TARGET_DATA_SIZE <= 0:
    raise ValueError(f"target_data_size must be positive, got {TARGET_DATA_SIZE}")
  frac_updates = SOURCE_DATA_SIZE/TARGET_DATA_SIZE
  if not 0.0 <= frac_updates <= 1.0:
    # Sampling without replacement only accepts fractions in [0, 1].
    raise ValueError(
      f"source_data_size ({SOURCE_DATA_SIZE}) must be between 0 and "
      f"target_data_size ({TARGET_DATA_SIZE})"
    )

  for table_num in range(NUM_TABLES_TO_GENERATE):
    target_table_name = f"{CATALOG}.{SCHEMA}.{TABLE_PREFIX}_target_table_{table_num+1}"
    # The read is lazy and the target table is not modified in this loop, so one
    # DataFrame per target table is enough; each sample() call below is a
    # separate unseeded draw.
    target_df = spark.read.table(target_table_name)
    for update_num in range(NUM_UPDATES_TO_GENERATE):
      source_table_name = f"{CATALOG}.{SCHEMA}.{TABLE_PREFIX}_source_table_{table_num+1}_{update_num+1}"
      source_df = target_df.sample(fraction = frac_updates)
      # Shift both zip codes so the sampled rows carry changed values.
      source_df = source_df.withColumn("pickup_zip", F.col("pickup_zip")+1).withColumn("dropoff_zip", F.col("dropoff_zip")-1)
      source_df.write.mode("overwrite").saveAsTable(source_table_name)