In [0]:
# Databricks Notebook (Python)
import json, random
from datetime import datetime, timedelta, timezone

# =========================
# Configuration
# =========================
catalog, schema = "dataquality", "dq_demo"
base_path = f"/Volumes/{catalog}/{schema}/dq_demo_volume"

# event_ts values carry a full timestamp including the year; only the
# file and folder names are written without the year.
start_dt = datetime(2026, 1, 1, tzinfo=timezone.utc)  # midnight UTC
days, records_per_file = 7, 100

# Value pools sampled when building each record.
channels = ["CVS", "Supermarket", "Vending", "ECommerce"]
packages = [350, 500, 600]
quality_flags = ["OK", "DAMAGED", "RETURNED"]

# Generated identifier pools (zero-padded, 1-based).
plants = [f"PLANT_{i:02d}" for i in range(1, 4)]
warehouses = [f"WH_{i:02d}" for i in range(1, 4)]
stores = [f"STORE_{i:04d}" for i in range(1, 51)]
skus = [f"SKU_{i:03d}" for i in range(1, 31)]

def ensure_dir(path: str) -> None:
    """Create the directory at ``path`` by calling ``dbutils.fs.mkdirs``.

    Args:
        path: Directory path to create.

    The value returned by ``mkdirs`` is discarded.
    """
    fs = dbutils.fs
    fs.mkdirs(path)

# Create the root output directory up front.
ensure_dir(base_path)

# =========================
# Record generation
# =========================
def gen_record(hour_dt: datetime, idx: int, day_idx: int, dup_txn_pool: list) -> dict:
    """Build one synthetic sales event for the hour starting at ``hour_dt``.

    Baseline values are sampled with ``random`` from the module-level pools
    (``plants``, ``warehouses``, ``stores``, ``channels``, ``skus``,
    ``packages``, ``quality_flags``). Data-quality anomalies are then
    injected, each decided by its own independent random draw so that one
    anomaly does not imply another.

    Args:
        hour_dt: Start of the hour the record belongs to. Its month, day and
            hour are embedded (without the year) in ``source_file_hour`` and
            ``txn_id``.
        idx: Index of the record within its file; embedded in ``txn_id``
            zero-padded to three digits.
        day_idx: 0-based day index. ``6`` (Day 7) selects the high anomaly
            rates; any other value selects the low "minor dirt" rates.
        dup_txn_pool: Transaction IDs to pick from when a duplicate
            ``txn_id`` is injected. When empty, no duplicate is injected.

    Returns:
        A JSON-serializable dict describing the event. ``event_ts`` is an
        ISO-8601 string; ``warehouse_id`` is ``None`` when the missing-value
        anomaly was injected.
    """
    is_day7 = day_idx == 6

    # Year-less identifiers (cosmetic; event_ts below keeps the year).
    mm_dd = hour_dt.strftime("%m-%d")
    hh = hour_dt.strftime("%H")
    source_file_hour = f"{mm_dd}-{hh}"  # e.g. 01-07-09

    # -------------------------
    # Baseline (clean) values
    # -------------------------
    plant_id = random.choice(plants)
    warehouse_id = random.choice(warehouses)
    store_id = random.choice(stores)
    channel = random.choice(channels)
    sku_id = random.choice(skus)
    package_ml = random.choice(packages)
    qty = random.randint(1, 24)
    unit_price = random.randint(80, 250)
    sales = qty * unit_price
    lead = random.randint(2, 24)
    promo = random.random() < 0.2
    qflag = random.choices(quality_flags, weights=[0.95, 0.03, 0.02])[0]

    # txn_id is year-less as well; the random suffix keeps clean IDs distinct.
    txn_id = f"{mm_dd}-{hh}-{idx:03d}-{random.randint(1000,9999)}"

    # -------------------------
    # Anomaly injection probabilities
    # -------------------------
    if is_day7:
        # Day 7: rates meant to clearly exceed the data-quality thresholds
        # (the thresholds themselves are defined outside this function).
        p_null_wh = 0.30
        p_dup_txn = 0.10
        p_sales_mismatch = 0.03
        p_sku_bias = 0.70
    else:
        # Other days: realistic minor dirt, meant to stay below thresholds.
        p_null_wh = 0.005         # 0.5%
        p_dup_txn = 0.002         # 0.2%
        p_sales_mismatch = 0.001  # 0.1%
        p_sku_bias = 0.15

    # Each anomaly uses its own draw. Sharing a single draw would nest the
    # anomalies (every sales mismatch would also be a duplicate txn_id and a
    # missing warehouse_id) instead of injecting them independently.

    # Missing warehouse_id
    if random.random() < p_null_wh:
        warehouse_id = None

    # Duplicate txn_id (skipped when no pool is supplied)
    if dup_txn_pool and random.random() < p_dup_txn:
        txn_id = random.choice(dup_txn_pool)

    # Sales amount that no longer equals qty * unit_price
    if random.random() < p_sales_mismatch:
        sales += random.choice([-10, 10, 50, -50])

    # SKU skew (distribution drift) toward a single SKU
    if random.random() < p_sku_bias:
        sku_id = "SKU_001"

    # Spread event_ts across the hour (the timestamp keeps its year).
    event_ts = hour_dt + timedelta(
        minutes=random.randint(0, 59),
        seconds=random.randint(0, 59),
    )

    return {
        "event_ts": event_ts.isoformat(),
        "source_file_hour": source_file_hour,  # year-less
        "txn_id": txn_id,                      # year-less
        "plant_id": plant_id,
        "warehouse_id": warehouse_id,
        "store_id": store_id,
        "channel": channel,
        "sku_id": sku_id,
        "package_ml": package_ml,
        "qty": qty,
        "unit_price_yen": unit_price,
        "sales_yen": sales,
        "delivery_lead_hours": lead,
        "promo_flag": promo,
        "quality_flag": qflag,
    }

# =========================
# File generation (01-01 through 01-07)
# =========================
# Walk every hour of the configured period and write one newline-delimited
# JSON file per hour.
for hour_offset in range(days * 24):
    day_number = hour_offset // 24  # 0-based day index passed to gen_record
    hour_dt = start_dt + timedelta(hours=hour_offset)

    # Year-less labels used in folder names, file names and IDs.
    mm_dd = hour_dt.strftime("%m-%d")  # e.g. 01-07
    hh = hour_dt.strftime("%H")
    mmdd = hour_dt.strftime("%m%d")    # e.g. 0107

    folder = f"{base_path}/dt={mm_dd}/hr={hh}"
    ensure_dir(folder)

    # Pool of year-less txn_ids handed to gen_record for duplicate injection.
    dup_pool = [f"{mm_dd}-{hh}-DUP-{i:02d}" for i in range(10)]

    # One JSON document per line, in record-index order.
    payload = "\n".join(
        json.dumps(
            gen_record(hour_dt, record_no, day_number, dup_pool),
            ensure_ascii=False,
        )
        for record_no in range(records_per_file)
    )

    # The file name is year-less as well.
    dbutils.fs.put(
        f"{folder}/bev_events_{mmdd}_{hh}.json",
        payload,
        overwrite=True,
    )

print("Sample data generation completed:", base_path)