In [0]:
# Predicate that drops the catalogs this notebook must not touch. It is written
# as adjacent string literals so each group of names fits on one line; the
# resulting predicate text is the same single string passed to filter().
excluded_catalogs_predicate = (
    "catalog not in ("
    "'hive_metastore', 'legacy_hive_catalog', 'sample', 'system',"
    "'__databricks_internal','clearsale','delta_share_test','delta_share_teste',"
    "'gcp_bq_lakeflow_prd','glue-odin-contatos-catalog','glue-odin-contatos_catalog',"
    "'clearsale_enriquecimento_contatos','nike_glue_prd','samples'"
    ")"
)

# One row per remaining catalog, exposed in the `catalog` column.
catalogs_df = spark.sql("show catalogs").filter(excluded_catalogs_predicate)

In [0]:
%skip
# NOTE(review): `%skip` is presumably the Databricks magic that skips this cell
# when the notebook runs; it has to stay on the first line of the cell — confirm.
# When enabled, this cell narrows catalogs_df to catalogs whose name starts with
# a lowercase character in the range 'o'..'z' (regex "^[o-z]").
from pyspark.sql.functions import col
catalogs_df = catalogs_df.filter(col("catalog").rlike("^[o-z]"))

In [0]:
%skip
# NOTE(review): `%skip` is presumably the Databricks magic that skips this cell
# when the notebook runs; it has to stay on the first line of the cell — confirm.
# When enabled, this cell replaces catalogs_df with a one-row DataFrame whose
# `catalog` column holds only "cross_prd".
catalogs_df = spark.createDataFrame([("cross_prd",)], ["catalog"])

In [0]:
def get_comment(catalog,schema,schema_path):
    product = catalog.replace('_prd', '')
    match schema:
        case "bronze":
          return f"To store raw data ingested from {product}. Contains unprocessed data in delta format, PII is already encrypted, serving as the first layer in the data pipeline. Stored in {schema_path}"
        case "silver":
          return f"To store cleansed and enriched data from {product}. Contains data that has been refined and partially processed, making it suitable for more detailed analysis. Stored in {schema_path}"
        case "gold":
          return f"To store highly processed and aggregated data from {product}. Contains final datasets that are ready for business intelligence and reporting purposes. Stored in {schema_path}"
        case "control":
          return f"To store control tables used during data processing. Contains metadata and control tables that guide and manage the processing workflows. Stored in {schema_path}"
        case "temp":
          return f"To store temporary tables and volumes from {product}. Contains intermediate datasets that are used for processing and analysis. Stored in {schema_path}"
        case "raw":
          return f"Schema for raw data tables and volumes from {product} data product, stored in {schema_path}"
        case "sensitive":
          return f"Schema for tables and volumes with unencrypted PII data from {product} data product, stored in {schema_path}"
        case "sandbox":
          return f"Schema for tables and volumes used in {product} team for experiments, stored in {schema_path}"
        case "stage":
          return f"Schema for tables and volumes used in {product} for staging and snapshots, stored in {schema_path}"
        case "ingestion":
          return f"Schema for tables and volumes used in ingestion from {product} data product, stored in {schema_path}"
        case "delivery":
          return f"Schema for tables and volumes used in delivery from {product} data product, stored in {schema_path}"
        case "control":
          return f"Schema for control tables from {product} data product, stored in {schema_path}"
        case "vector":
          return f"Schema for tables and volumes used in vector search index from {product} data product, stored in {schema_path}"
        case "models":
          return f"Schema for tables and volumesused to serve data science models for the {product} data product, stored in {schema_path}"
        case "features":
          return f"Schema for tables and volumes used in data science features from {product} data product, stored in {schema_path}"
        case _:
            return f"Schema for {schema} tables and volumes from {product} data product, stored in {schema_path}"

In [0]:
def create_schema(catalog,schema):
  """Create `schema` in `catalog` under the catalog's storage root, if missing.

  When the schema is already listed in the catalog, only a message is printed.
  Otherwise the schema is created with a managed location of
  ``<catalog storage root>/<schema>`` and the comment from get_comment(), its
  owner is set to `APP-ECSBR-DATABRICKS-ADMIN`, and it is set to inherit
  predictive optimization.

  Args:
    catalog: Name of the catalog that receives the schema.
    schema: Name of the schema to create.

  Raises:
    ValueError: If DESCRIBE CATALOG EXTENDED reports no 'Storage Root' value
      for the catalog, so no managed location can be derived.
  """
  # Identifiers are backtick-quoted so names with characters such as '-' parse.
  schema_exists = spark.sql(f"SHOW SCHEMAS IN `{catalog}` LIKE '{schema}'").count() > 0
  if schema_exists:
    print(f"Schema {schema} already exist in {catalog}")
    return

  # Switches the session's current catalog, as the original code did.
  spark.sql(f"USE CATALOG `{catalog}`")

  storage_root_rows = (
    spark.sql(f"DESCRIBE CATALOG EXTENDED `{catalog}`")
    .filter("info_name = 'Storage Root'")
    .select("info_value")
    .collect()
  )
  if not storage_root_rows or not storage_root_rows[0][0]:
    raise ValueError(
      f"Catalog {catalog} has no 'Storage Root'; cannot derive a managed location for schema {schema}"
    )

  catalog_path = storage_root_rows[0][0]
  # Make sure exactly one separator sits between the root and the schema name;
  # a root that already ends with '/' is used unchanged.
  if not catalog_path.endswith("/"):
    catalog_path += "/"
  schema_path = f"{catalog_path}{schema}"

  comment_schema = get_comment(catalog,schema,schema_path)
  spark.sql(f"CREATE SCHEMA IF NOT EXISTS `{catalog}`.`{schema}` MANAGED LOCATION '{schema_path}' COMMENT '{comment_schema}'")
  spark.sql(f"ALTER SCHEMA `{catalog}`.`{schema}` OWNER TO `APP-ECSBR-DATABRICKS-ADMIN`")
  spark.sql(f"ALTER SCHEMA `{catalog}`.`{schema}` INHERIT PREDICTIVE OPTIMIZATION")
  print(f"Schema {schema} created in {catalog}")

In [0]:
# Complete list of schemas; kept so a full run can be restored by removing
# the override below.
new_schemas = [
    'temp', 'raw', 'ingestion', 'bronze', 'delivery',
    'control', 'vector', 'sensitive', 'sandbox', 'stage',
    'silver', 'gold', 'models', 'features', 'semantic',
]

# Override: this assignment replaces the list above, so only `semantic`
# is processed by the cells that follow.
new_schemas = ['semantic']

In [0]:
# Grants that depend on the schema name, applied to each schema right after
# create_schema() handles it. Maps schema name -> list of (privileges, principal).
schema_grants = {
  "silver": [("USE SCHEMA, SELECT", "account users")],
  "gold": [("USE SCHEMA, SELECT", "account users")],
  "models": [
    ("USE SCHEMA, SELECT", "account users"),
    ("APPLY TAG, EXECUTE, READ VOLUME, REFRESH, SELECT, USE SCHEMA", "APP-ECSBR_Databricks_DataScience"),
  ],
  "features": [
    ("APPLY TAG, EXECUTE, READ VOLUME, REFRESH, SELECT, USE SCHEMA", "APP-ECSBR_Databricks_DataScience"),
  ],
  "semantic": [("USE SCHEMA, SELECT", "account users")],
  "raw": [
    ("APPLY TAG, EXECUTE, MODIFY, READ VOLUME, REFRESH, SELECT, USE SCHEMA", "APP-ECSBR_Databricks_Sustentacao"),
  ],
}

for catalog_row in catalogs_df.select("catalog").collect():
  catalog = catalog_row.catalog
  print(catalog)

  for schema_item in new_schemas:
    create_schema(catalog,schema_item)
    # The schema-specific grants run inside this loop so that every schema in
    # new_schemas is covered. They used to sit one level out, where only the
    # last schema of the list was ever checked.
    for privileges, principal in schema_grants.get(schema_item, []):
      spark.sql(f"GRANT {privileges} ON SCHEMA `{catalog}`.`{schema_item}` TO `{principal}`")

  # CI/CD principal: best effort per schema, a failure is reported and the
  # loop continues (presumably because not every catalog has every schema).
  for schema_sp_grant in ['raw','ingestion','bronze','delivery','control','vector','stage','silver','gold','models','features']:
    try:
      spark.sql(f"GRANT APPLY TAG, CREATE FUNCTION, CREATE MATERIALIZED VIEW, CREATE MODEL, CREATE MODEL VERSION, CREATE TABLE, CREATE VOLUME, EXECUTE, MODIFY, READ VOLUME,  REFRESH, SELECT, USE SCHEMA, WRITE VOLUME   ON SCHEMA `{catalog}`.`{schema_sp_grant}` TO `ecs_ci_cd_datalake@br.experian.com`")
    except Exception as e:
      # Include the underlying error so a failed grant can be diagnosed.
      print(f"Error granting {schema_sp_grant} schema to ecs_ci_cd_datalake@br.experian.com: {e}")

  # Read-only access for the principal below on each listed schema. The grant
  # targets the schema being iterated; it used to be hard-coded to `silver`,
  # which repeated the same grant and left the other schemas untouched.
  # NOTE(review): this widens what the principal can read — confirm the list.
  for schema_sp_download in ['temp','raw','ingestion','bronze','delivery','control','vector','sandbox','stage','silver','gold','models','features']:
    try:
      spark.sql(f"GRANT USE SCHEMA, SELECT ON SCHEMA `{catalog}`.`{schema_sp_download}` TO `544cb3ff-43e7-4e1c-bb8c-85c55b27b021`")
    except Exception as e:
      print(f"Error granting {schema_sp_download} schema to 544cb3ff-43e7-4e1c-bb8c-85c55b27b021: {e}")

  # Not guarded: a failure here still stops the run, as before.
  spark.sql(f"GRANT MANAGE ON SCHEMA `{catalog}`.`sensitive` TO `APP-ECSBR_Databricks_Governanca`")

In [0]:
# Create a `checkpoints` volume in each listed schema that exists, and give the
# listed principals read/write access to it.
#
# The catalogs are iterated explicitly. This cell used to read `catalog`, the
# variable left over from the loop in the previous cell, so it only ever
# processed the last catalog and failed on a fresh kernel if that cell had not run.
# NOTE(review): assumes every catalog in catalogs_df should get the volumes — confirm.
checkpoint_schemas = ['raw','bronze','silver','gold','sandbox','temp','features','models','semantic']
checkpoint_principals = ['NTT-MIGRATION','ecs_ci_cd_datalake@br.experian.com','APP-ECSBR-DATABRICKS-ADMIN','APP-ECSBR_Databricks_Sustentacao']

for catalog_row in catalogs_df.select("catalog").collect():
  catalog = catalog_row.catalog
  for schema_name in checkpoint_schemas:
    # Skip schemas that are not present in this catalog.
    if spark.sql(f"SHOW SCHEMAS IN `{catalog}` LIKE '{schema_name}'").count() > 0:
      spark.sql(f"CREATE VOLUME IF NOT EXISTS `{catalog}`.`{schema_name}`.checkpoints")
      for principal_name in checkpoint_principals:
        spark.sql(f"GRANT READ VOLUME, WRITE VOLUME ON VOLUME `{catalog}`.`{schema_name}`.checkpoints TO `{principal_name}`")