# Full Idempotent Setup

## 1. Setup

In [0]:
# Notebook parameters, exposed as Databricks widgets at the top of the notebook.
dbutils.widgets.text("catalog_name", "apex_bank_demo", "1. Catalog Name")
dbutils.widgets.dropdown("reset_mode", "False", ["True", "False"], "2. Nuke & Reset?")

# reset_mode holds one of the dropdown choices ("True" / "False") as a string,
# not a bool, so later cells compare it against the string "True".
catalog_name = dbutils.widgets.get("catalog_name")
reset_mode = dbutils.widgets.get("reset_mode")

## 2. Cleanup

In [0]:
# Destructive reset: only runs when the reset_mode widget is set to "True".
# CASCADE drops everything contained in the catalog along with it.
if reset_mode == "True":
    print(f"Dropping catalog {catalog_name}...")
    drop_catalog_sql = f"DROP CATALOG IF EXISTS {catalog_name} CASCADE"
    spark.sql(drop_catalog_sql)

## 3. Infrastructure

In [0]:
print(f"Building architecture in {catalog_name}...")

# Every statement uses IF NOT EXISTS (or is a plain USE), so re-running this
# cell against an already-provisioned catalog is harmless. Order matters: the
# catalog is selected before the unqualified schema/volume names are created.
setup_statements = [
    # Catalog
    f"CREATE CATALOG IF NOT EXISTS {catalog_name}",
    f"USE CATALOG {catalog_name}",
    # Schemas
    "CREATE SCHEMA IF NOT EXISTS raw_data",
    "CREATE SCHEMA IF NOT EXISTS analytics",
    # Volumes (the landing zone plus a checkpoints volume)
    "CREATE VOLUME IF NOT EXISTS raw_data.landing_zone",
    "CREATE VOLUME IF NOT EXISTS raw_data.checkpoints",
]
for statement in setup_statements:
    spark.sql(statement)

print("Infrastructure ready")

## 4. Seed Data

_Note: this step is blocked on shared clusters that restrict access to the local filesystem._

In [0]:
import os

def find_project_root(start_path, marker="databricks.yml"):
    """Walk up from ``start_path`` until a directory containing ``marker`` is found.

    Args:
        start_path: Directory to start searching from.
        marker: File name whose presence identifies the project root.

    Returns:
        The first directory, starting at ``start_path`` and moving upward,
        that contains ``marker``.

    Raises:
        FileNotFoundError: If no directory on the way up contains ``marker``.
            The search stops at "/" and "/Workspace" (neither is checked
            itself) or as soon as there is no further parent to move to.
    """
    current = start_path
    while current not in ("/", "/Workspace"):
        if os.path.exists(os.path.join(current, marker)):
            return current
        parent = os.path.dirname(current)
        # dirname() reaches a fixed point for relative or empty paths ("")
        # and for roots other than "/" (e.g. "//"). Without this check the
        # loop would spin forever on such inputs.
        if parent == current:
            break
        current = parent
    raise FileNotFoundError(f"Could not find {marker} in parent directories")

# Target Volume Path. The trailing slash is required because file names are
# appended directly to this string below. Defined before the repo lookup so the
# variable exists even if that lookup raises.
destination_volume_path = (
    f"/Volumes/{catalog_name}/raw_data/landing_zone/"
)

# Dynamic path retrieval for the Git repo
notebook_path = dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()
notebook_dir = f"/Workspace{os.path.dirname(notebook_path)}"

# Find project root by looking for databricks.yml marker
project_root = find_project_root(notebook_dir)
source_path = os.path.join(project_root, "data")

print(
    f"Copying data...\nFrom: {source_path}\nTo: {destination_volume_path}"
)

# Always bind `files` so the name exists on the error path too; previously the
# summary print raised NameError whenever the data directory was missing.
files = []

# isdir (rather than exists) also rejects a regular file named "data", which
# would otherwise make os.listdir raise.
if not os.path.isdir(source_path):
    print(f"Error: Data directory not found at {source_path}")
else:
    # Sorted so the copy order (and the log output) is deterministic.
    files = sorted(
        name for name in os.listdir(source_path)
        if name.endswith(".csv")
    )
    for file in files:
        # "file:" prefix marks the source as a local-filesystem path for dbutils.fs.
        source_file_uri = f"file:{os.path.join(source_path, file)}"
        target_file_uri = f"{destination_volume_path}{file}"
        dbutils.fs.cp(
            source_file_uri,
            target_file_uri
        )
        print(f"  -> Copied {file}")

    # Only report completion when a copy was actually attempted.
    print(
        f"Data seeding complete. {len(files)} files transferred."
    )

## 5. Load Reference Tables

Load the `accounts` and `fraud_labels` tables from CSV files into the `analytics` schema for use by `03_unity_catalog_setup.sql`.

In [0]:
# Load the accounts reference table from its CSV in the landing-zone volume.
accounts_path = f"/Volumes/{catalog_name}/raw_data/landing_zone/synthetic_accounts.csv"
print(f"Loading accounts from {accounts_path}...")

# First row is the header; column types are inferred from the data.
accounts_df = spark.read.options(header=True, inferSchema=True).csv(accounts_path)

# Overwrite mode replaces any existing table, so the cell can be re-run.
accounts_df.write.saveAsTable(
    f"{catalog_name}.analytics.accounts",
    mode="overwrite",
)

print(f"Loaded {accounts_df.count()} accounts into {catalog_name}.analytics.accounts")

In [0]:
# Load the fraud_labels reference table from its CSV in the landing-zone volume.
fraud_labels_path = f"/Volumes/{catalog_name}/raw_data/landing_zone/synthetic_fraud_labels.csv"
print(f"Loading fraud_labels from {fraud_labels_path}...")

# First row is the header; column types are inferred from the data.
fraud_labels_df = spark.read.options(header=True, inferSchema=True).csv(fraud_labels_path)

# Overwrite mode replaces any existing table, so the cell can be re-run.
fraud_labels_df.write.saveAsTable(
    f"{catalog_name}.analytics.fraud_labels",
    mode="overwrite",
)

print(f"Loaded {fraud_labels_df.count()} fraud_labels into {catalog_name}.analytics.fraud_labels")

## 6. Verify

In [0]:
# Rebuild the landing-zone path from catalog_name rather than relying on a
# variable set by the seed-data cell: that cell may be skipped or fail (see its
# note about restricted local filesystem access), and the load cells above
# already derive their paths the same way.
landing_zone_path = f"/Volumes/{catalog_name}/raw_data/landing_zone/"
display(
    dbutils.fs.ls(landing_zone_path)
)