In [1]:
# Before running, either set the matching secrets (https://docs.azuredatabricks.net/user-guide/secrets/secrets.html)
# or edit the variables below to contain valid connection details

secrets_scope = "KEYS"

# Service principal credentials and account name for the Data Lake store
data_lake_app_id = dbutils.secrets.get(secrets_scope, "DATA_LAKE_APP_ID")
data_lake_app_key = dbutils.secrets.get(secrets_scope, "DATA_LAKE_APP_KEY")
data_lake_app_tenant = dbutils.secrets.get(secrets_scope, "DATA_LAKE_APP_TENANT")
data_lake_account = dbutils.secrets.get(secrets_scope, "DATA_LAKE_ACCOUNT")

# Account key and account name for Blob Storage
storage_account_key = dbutils.secrets.get(secrets_scope, "STORAGE_ACCOUNT_KEY")
storage_account = dbutils.secrets.get(secrets_scope, "STORAGE_ACCOUNT")

mount_folder = "test"
output_folder = "data"
data_lake_mount_point = "/mnt/lake"
storage_mount_point = "/mnt/blob"

# Data Lake connection information and credentials
data_lake_configs = {"dfs.adls.oauth2.access.token.provider.type": "ClientCredential",
           "dfs.adls.oauth2.client.id": data_lake_app_id,
           "dfs.adls.oauth2.credential": data_lake_app_key,
           "dfs.adls.oauth2.refresh.url": "https://login.microsoftonline.com/%s/oauth2/token" % data_lake_app_tenant}

# Blob Storage connection information and credentials
storage_configs = {"fs.azure.account.key.%s.blob.core.windows.net" % storage_account: storage_account_key}

# Mounting onto a mount point that already exists fails, which makes this cell
# impossible to re-run after an interrupted run that never reached the unmount
# cell. Only mount what is not mounted yet so the cell is idempotent.
existing_mount_points = set(mount.mountPoint for mount in dbutils.fs.mounts())

if data_lake_mount_point not in existing_mount_points:
  dbutils.fs.mount(
    source = "adl://%s.azuredatalakestore.net/%s" % (data_lake_account, mount_folder),
    mount_point = data_lake_mount_point,
    extra_configs = data_lake_configs)

if storage_mount_point not in existing_mount_points:
  # mount_folder is used both as the container name and as the folder inside it
  dbutils.fs.mount(
    source = "wasbs://%s@%s.blob.core.windows.net/%s" % (mount_folder, storage_account, mount_folder),
    mount_point = storage_mount_point,
    extra_configs = storage_configs)

# Show the current mounts as the cell output
dbutils.fs.mounts()

In [2]:
from random import random
from time import time

from pyspark.sql.types import *

# Row layout of the generated data set: three integer columns (used as the
# partition columns when the frame is written) plus a binary payload.
schema = StructType([
  StructField("First", IntegerType(), True),
  StructField("Second", IntegerType(), True),
  StructField("Third", IntegerType(), True),
  StructField("Body", BinaryType(), True)
])

# Number of distinct values per partition column and rows per combination
first_folder_count = 10
second_folder_count = 10
third_folder_count = 10
rows_per_file = 100

print('Amount of rows in each file: %d' % rows_per_file)
print('Total amount of files: %d' % (first_folder_count * second_folder_count * third_folder_count))

values = []
for first in range(first_folder_count):
  for second in range(second_folder_count):
    for third in range(third_folder_count):
      for i in range(rows_per_file):
        # JSON text with a sensor id, a random value and the current time in milliseconds
        body = "{\"id\":\"sensor-id-%s\",\"v\":%f,\"t\":%d}" % (i, random(), time() * 1000)
        # bytearray(str) without an encoding raises TypeError on Python 3;
        # encode explicitly (the payload is pure ASCII, so this also works on Python 2).
        values.append((first, second, third, bytearray(body.encode("utf-8"))))

print('Values list generated')

df = spark.createDataFrame(values, schema=schema)

In [3]:
# Write the generated frame as Avro under the Blob Storage mount,
# partitioned by the First/Second/Third columns.
save_folder = "%s/%s" % (storage_mount_point, output_folder)
(df.write
   .partitionBy("First", "Second", "Third")
   .format("com.databricks.spark.avro")
   .save(save_folder))

In [4]:
# Write the generated frame as Avro under the Data Lake mount,
# partitioned by the First/Second/Third columns.
save_folder = "%s/%s" % (data_lake_mount_point, output_folder)
(df.write
   .partitionBy("First", "Second", "Third")
   .format("com.databricks.spark.avro")
   .save(save_folder))

In [5]:
# Clean up: remove both mount points created in the first cell
# (Data Lake first, then Blob Storage).
dbutils.fs.unmount(data_lake_mount_point)
dbutils.fs.unmount(storage_mount_point)