In [1]:
# Before running, either set the matching secrets (https://docs.azuredatabricks.net/user-guide/secrets/secrets.html)
# or edit the variables below to contain valid connection details

secrets_scope = "KEYS"

# Application id, key and tenant plus the account name used for the Data Lake mount
data_lake_app_id = dbutils.secrets.get(secrets_scope, "DATA_LAKE_APP_ID")
data_lake_app_key = dbutils.secrets.get(secrets_scope, "DATA_LAKE_APP_KEY")
data_lake_app_tenant = dbutils.secrets.get(secrets_scope, "DATA_LAKE_APP_TENANT")
data_lake_account = dbutils.secrets.get(secrets_scope, "DATA_LAKE_ACCOUNT")

# Account name and access key used for the Blob Storage mount
storage_account_key = dbutils.secrets.get(secrets_scope, "STORAGE_ACCOUNT_KEY")
storage_account = dbutils.secrets.get(secrets_scope, "STORAGE_ACCOUNT")

# mount_folder is the folder mounted from the Data Lake account; for Blob Storage it is
# used both as the container name and as the folder inside that container
mount_folder = "dummy"
data_lake_mount_point = "/mnt/lake"
storage_mount_point = "/mnt/blob"


def _mount_unless_mounted(source, mount_point, extra_configs):
  """Mount `source` at `mount_point` unless that mount point is already in use.

  Mounting over an existing mount point raises, which made this cell fail when it
  was run a second time. An existing mount is left untouched (it is NOT remounted,
  so changed credentials or sources require an explicit unmount first).
  """
  if any(mount.mountPoint == mount_point for mount in dbutils.fs.mounts()):
    print("%s is already mounted, skipping" % mount_point)
    return
  dbutils.fs.mount(
    source = source,
    mount_point = mount_point,
    extra_configs = extra_configs)


# Data Lake connection information and credentials
data_lake_configs = {"dfs.adls.oauth2.access.token.provider.type": "ClientCredential",
           "dfs.adls.oauth2.client.id": data_lake_app_id,
           "dfs.adls.oauth2.credential": data_lake_app_key,
           "dfs.adls.oauth2.refresh.url": "https://login.microsoftonline.com/%s/oauth2/token" % data_lake_app_tenant}

_mount_unless_mounted(
  source = "adl://%s.azuredatalakestore.net/%s" % (data_lake_account, mount_folder),
  mount_point = data_lake_mount_point,
  extra_configs = data_lake_configs)

# Blob Storage connection information and credentials
storage_configs = {"fs.azure.account.key.%s.blob.core.windows.net" % storage_account: storage_account_key}

_mount_unless_mounted(
  source = "wasbs://%s@%s.blob.core.windows.net/%s" % (mount_folder, storage_account, mount_folder),
  mount_point = storage_mount_point,
  extra_configs = storage_configs)

# Show every mount so the two new mount points can be verified
dbutils.fs.mounts()

In [2]:
from random import random
from time import time
import math
import os

import avro.schema
from avro.datafile import DataFileWriter
from avro.io import DatumWriter

# Shape of the generated folder tree: one file is written into every
# <first>/<second>/<third> folder, each file holding rows_per_file rows
first_folder_count = 10
second_folder_count = 10
third_folder_count = 10
rows_per_file = 10

# Avro schema of the generated files: a record with a single nullable "Body" bytes field
schema_string = '''
{
  "type" : "record",
  "name" : "Message",
  "namespace" : "Microsoft.Azure.Devices",
  "fields" : [ {
    "name" : "Body",
    "type" : [ "null", "bytes" ]
  } ]
}
'''
schema = avro.schema.parse(schema_string)

print('Amount of rows in each file: %d' % rows_per_file)
print('Total amount of files: %d' % (first_folder_count * second_folder_count * third_folder_count))

def _makedirs_if_missing(path):
  """Create `path` (including parents) on the local filesystem if it does not exist yet.

  Plain os.makedirs raises when the folder already exists, which made a second
  generation run fail outside Databricks.
  """
  if not os.path.isdir(path):
    os.makedirs(path)

# Pick the folder-creation function: dbutils when running inside Databricks (where the
# name `dbutils` is defined), the local filesystem otherwise
try:
  dbutils
except NameError:
  mkdirs = _makedirs_if_missing
else:
  mkdirs = dbutils.fs.mkdirs

def _to_local_path(path):
  """Return the path Python's built-in file APIs must use to reach `path`.

  Outside Databricks (no `dbutils` defined) the path is returned unchanged.
  Inside Databricks, `path` is a DBFS path; local file APIs reach DBFS through
  the /dbfs prefix, so the path is rewritten accordingly.
  """
  try:
    dbutils
  except NameError:
    return path
  if path.startswith("dbfs:"):
    path = path[len("dbfs:"):]
  return "/dbfs/" + path.lstrip("/")

def create_file(*args):
  """Write one "dummy.avro" file with rows_per_file rows into the folder named by `args`.

  Args:
    *args: path components (converted with str and joined) of the target folder;
      the folder is created through `mkdirs` before the file is written.
  """
  folder_path = os.path.join(*map(str, args))
  mkdirs(folder_path)
  file_name = "dummy.avro"
  # mkdirs may address DBFS while open() always addresses the local filesystem
  file_path = os.path.join(_to_local_path(folder_path), file_name)
  # The with-block guarantees the handle is released even if a row fails to serialize
  with open(file_path, "wb") as avro_file:
    writer = DataFileWriter(avro_file, DatumWriter(), schema)
    for i in range(rows_per_file):
      body = "{\"id\":\"sensor-id-%s\",\"v\":%f,\"t\":%d}" % (i, random(), time() * 1000)
      # The schema declares Body as bytes, so the JSON text is encoded before appending
      writer.append({"Body": body.encode("utf-8")})
    # close() writes out any rows still buffered by the writer
    writer.close()

def create_files(root_folder):
  """Create one file in every <first>/<second>/<third> folder below `root_folder`.

  The number of folders on each level comes from first_folder_count,
  second_folder_count and third_folder_count.
  """
  folder_indexes = (
    (first, second, third)
    for first in range(first_folder_count)
    for second in range(second_folder_count)
    for third in range(third_folder_count)
  )
  for first, second, third in folder_indexes:
    create_file(root_folder, first, second, third)

In [3]:
# Generate the dummy Avro folder tree under the Blob Storage mount point,
# then list the mount point's top-level entries
create_files(storage_mount_point)
dbutils.fs.ls(storage_mount_point)

In [4]:
# Generate the dummy Avro folder tree under the Data Lake mount point,
# then list the mount point's top-level entries
create_files(data_lake_mount_point)
dbutils.fs.ls(data_lake_mount_point)

In [5]:
# Clean up: remove both mount points that were created in the first cell
dbutils.fs.unmount(data_lake_mount_point)
dbutils.fs.unmount(storage_mount_point)