In [0]:
# install dependency untuk read_excel
%pip install openpyxl

# (opsional di beberapa runtime) restart kernel python agar paket terdeteksi
import sys
if "databricks" in sys.version.lower():
    try:
        dbutils.library.restartPython()
    except Exception:
        pass

In [0]:
%restart_python
# The magic above restarts the Python process so that the package installed in
# the previous cell is detected; all Python state defined before it is lost.

In [0]:
# STEP 1 — konfigurasi
import pandas as pd
import glob, os, datetime
from pyspark.sql import functions as F
from pyspark.sql import DataFrame
from functools import reduce

CATALOG = "lapse_scoring_dev"
SCHEMA_LANDING = "00_landing"
SCHEMA_BRONZE  = "01_bronze"
VOLUME_NAME    = "chandra"    # change if the volume has a different name

# Single source of truth for the period (YYYYMM) being loaded.
# Previously the target table was named for 201701 while the source path read
# the 201703 folder, so one month's file would overwrite another month's table.
# NOTE(review): set PERIOD to the month you actually intend to ingest.
PERIOD         = "201701"
TABLE_NAME     = f"chandra_{PERIOD}_trad_master_bronze"    # bronze target table
WRITE_MODE     = "overwrite"  # "overwrite" replaces the table contents; "append" adds rows

# Paths: pandas must use the /Volumes path; Spark may also use dbfs:/Volumes
LANDING_DIR_PANDAS = f"/Volumes/{CATALOG}/{SCHEMA_LANDING}/{VOLUME_NAME}"
LANDING_XLSX_GLOB  = os.path.join(LANDING_DIR_PANDAS, f"{PERIOD}/TRAD_MASTER.xlsx")
TARGET_TABLE       = f'{CATALOG}.`{SCHEMA_BRONZE}`.{TABLE_NAME}'

# STEP 2 — make sure the catalog and the bronze schema exist, then select them
for _ddl in (
    f"CREATE CATALOG IF NOT EXISTS {CATALOG}",
    f"USE CATALOG {CATALOG}",
    f"CREATE SCHEMA IF NOT EXISTS `{SCHEMA_BRONZE}`",
    f"USE SCHEMA `{SCHEMA_BRONZE}`",
):
    # Statements run in this exact order: create before use, catalog before schema.
    spark.sql(_ddl)

# STEP 3 — helper: normalisasi kolom & baca semua sheet dari 1 file
def normalize_cols(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` whose column labels are normalised.

    Each label is converted to ``str``, stripped of surrounding whitespace,
    lower-cased, and has its spaces replaced by underscores. Converting to
    ``str`` first matters because Excel headers may be read as non-string
    labels (for example the integer ``2017``), which have no ``.strip()``.

    Args:
        df: Frame whose columns should be renamed; it is not modified.

    Returns:
        A new DataFrame with the same data and normalised column labels.
    """
    return df.rename(
        columns={c: str(c).strip().lower().replace(" ", "_") for c in df.columns}
    )

def read_all_sheets_with_meta(xlsx_path: str) -> list[pd.DataFrame]:
    """Load every non-empty sheet of one workbook as a tagged DataFrame.

    Args:
        xlsx_path: Path of the Excel file to read.

    Returns:
        One DataFrame per sheet that has at least one row. Column names are
        passed through ``normalize_cols`` and two metadata columns are added:
        ``_source_path`` (the given path) and ``_sheet_name`` (the sheet name
        as a string). Sheets that are ``None`` or have zero rows are skipped.
    """
    # sheet_name=None makes pandas return a dict {sheet name: DataFrame}
    workbook = pd.read_excel(xlsx_path, sheet_name=None)
    return [
        normalize_cols(frame).assign(
            _source_path=xlsx_path,
            _sheet_name=str(name),
        )
        for name, frame in workbook.items()
        if frame is not None and len(frame) > 0
    ]

# STEP 4 — find the Excel file(s) matching LANDING_XLSX_GLOB and read them
xlsx_files = glob.glob(LANDING_XLSX_GLOB, recursive=True)
if len(xlsx_files) == 0:
    raise FileNotFoundError(f"Tidak ditemukan file .xlsx di {LANDING_DIR_PANDAS}. Upload dulu file Excel ke volume tersebut.")

pdf_all_list = []
for f in xlsx_files:
    try:
        sheets_of_file = read_all_sheets_with_meta(f)
    except Exception as e:
        # Best effort: a workbook that cannot be read is reported and skipped
        print(f"[WARN] Gagal baca: {f} -> {e}")
    else:
        pdf_all_list.extend(sheets_of_file)

if len(pdf_all_list) == 0:
    raise ValueError("Tidak ada sheet yang terbaca dari file-file Excel.")

# Stack every sheet of every file into one pandas DataFrame
pdf_all = pd.concat(pdf_all_list, ignore_index=True)

# STEP 5 — convert ke Spark DataFrame & tambah _ingest_ts (UPDATED)
import re
import pandas as pd
import numpy as np
from pyspark.sql import functions as F

# Work on a copy so that pdf_all (the concatenated sheets) is not touched by
# the column renaming and type casts applied to pdf below.
pdf = pdf_all.copy()

# --- A) Sanitizer nama kolom (Delta/UC safe) ---
def sanitize(name: str) -> str:
    """Turn an arbitrary column label into a lower-case snake_case identifier.

    Rules:
      * ``None`` is treated as an empty label.
      * A label that starts with ``_`` and already consists only of
        ``[a-z0-9_]`` (e.g. ``_source_path``, ``_sheet_name``) is returned
        unchanged.
      * Any other label is trimmed and lower-cased, every run of characters
        outside ``[a-z0-9_]`` becomes a single underscore, repeated
        underscores are collapsed, and leading/trailing underscores removed.
      * An empty result becomes ``"col"``.
      * A label that started with ``_`` but needed cleaning keeps one leading
        underscore; otherwise a result starting with a digit gets the prefix
        ``c_``.

    Args:
        name: Original column label (any object; it is converted with ``str``).

    Returns:
        The sanitised column name.
    """
    n = "" if name is None else str(name)

    # Underscore-prefixed labels are passed through only when they are already
    # safe; previously *any* such label skipped sanitising, so characters like
    # "(" or " " could leak into the table schema.
    keep_underscore = n.startswith("_")
    if keep_underscore and re.fullmatch(r"[a-z0-9_]+", n):
        return n

    n = n.strip().lower()
    # Every run of disallowed characters (spaces, punctuation, ...) -> "_"
    n = re.sub(r"[^a-z0-9_]+", "_", n)
    # Collapse repeated underscores and drop them at both ends
    n = re.sub(r"_+", "_", n).strip("_")
    if n == "":
        n = "col"

    if keep_underscore:
        return f"_{n}"
    # A leading digit gets a prefix so the name starts with a letter
    if re.match(r"^[0-9]", n):
        n = f"c_{n}"
    return n

# Apply the sanitizer to every column and keep the resulting names unique
safe_cols = []
seen = {}
for original_name in pdf.columns:
    base = sanitize(original_name)
    alias, k = base, 1
    # On a clash, try base_2, base_3, ... until the alias is unused
    while alias in seen:
        k += 1
        alias = f"{base}_{k}"
    seen[alias] = True
    safe_cols.append(alias)

pdf.columns = safe_cols

# --- B) Keep ID / policy-number columns as strings (drop a trailing ".0") ---
id_cols = [c for c in ("chdrnum", "clntnum", "lifenum") if c in pdf.columns]
for c in id_cols:
    id_as_text = pdf[c].astype("string")
    pdf[c] = id_as_text.str.replace(r"\.0$", "", regex=True)

# --- C) Cast object columns to string; numeric/datetime columns are left alone ---
object_cols = [c for c in pdf.columns if pdf[c].dtype == "object"]
for c in object_cols:
    pdf[c] = pdf[c].astype("string")

# --- D) Replace missing values (NaN/NaT) with None before handing over to Spark ---
# NOTE(review): depending on the pandas version, numeric columns may keep NaN
# after this call — confirm that nulls arrive in Spark as intended.
pdf = pdf.where(pdf.notna(), None)

# --- E) Build the Spark DataFrame and stamp it with the ingestion time ---
df = spark.createDataFrame(pdf).withColumn("_ingest_ts", F.current_timestamp())

df.printSchema()
df.show(10, truncate=False)

# STEP 6 — write to the Bronze Delta table using WRITE_MODE, with schema merge enabled
bronze_writer = (
    df.write
      .format("delta")
      .option("mergeSchema", "true")    # accept columns that are new in this run
      .mode(WRITE_MODE)                 # value comes from the WRITE_MODE constant
)
bronze_writer.saveAsTable(TARGET_TABLE)

print(f"Write OK → {TARGET_TABLE}")

# STEP 7 — quick verification: row count plus a 20-row preview of the target table
bronze_table = spark.table(TARGET_TABLE)
cnt = bronze_table.count()
print("Row count:", cnt)

preview = spark.sql(f"SELECT * FROM {TARGET_TABLE} LIMIT 20")
display(preview)

In [0]:
# STEP 2 — make sure the catalog and the bronze schema exist, then select them
_setup_statements = (
    f"CREATE CATALOG IF NOT EXISTS {CATALOG}",
    f"USE CATALOG {CATALOG}",
    f"CREATE SCHEMA IF NOT EXISTS `{SCHEMA_BRONZE}`",
    f"USE SCHEMA `{SCHEMA_BRONZE}`",
)
# Statements run in this exact order: create before use, catalog before schema.
for _ddl in _setup_statements[:-1]:
    spark.sql(_ddl)
# Kept as the cell's final expression so the cell still echoes its result.
spark.sql(_setup_statements[-1])

In [0]:
# STEP 3 — helper: normalisasi kolom & baca semua sheet dari 1 file
def normalize_cols(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` whose column labels are normalised.

    Each label is converted to ``str``, stripped of surrounding whitespace,
    lower-cased, and has its spaces replaced by underscores. Converting to
    ``str`` first matters because Excel headers may be read as non-string
    labels (for example the integer ``2017``), which have no ``.strip()``.

    Args:
        df: Frame whose columns should be renamed; it is not modified.

    Returns:
        A new DataFrame with the same data and normalised column labels.
    """
    return df.rename(
        columns={c: str(c).strip().lower().replace(" ", "_") for c in df.columns}
    )

def read_all_sheets_with_meta(xlsx_path: str) -> list[pd.DataFrame]:
    """Load every non-empty sheet of one workbook as a tagged DataFrame.

    Args:
        xlsx_path: Path of the Excel file to read.

    Returns:
        One DataFrame per sheet that has at least one row. Column names are
        passed through ``normalize_cols`` and two metadata columns are added:
        ``_source_path`` (the given path) and ``_sheet_name`` (the sheet name
        as a string). Sheets that are ``None`` or have zero rows are skipped.
    """
    # sheet_name=None makes pandas return a dict {sheet name: DataFrame}
    workbook = pd.read_excel(xlsx_path, sheet_name=None)
    return [
        normalize_cols(frame).assign(
            _source_path=xlsx_path,
            _sheet_name=str(name),
        )
        for name, frame in workbook.items()
        if frame is not None and len(frame) > 0
    ]

In [0]:
# STEP 4 — find the Excel file(s) matching LANDING_XLSX_GLOB and read them
xlsx_files = glob.glob(LANDING_XLSX_GLOB, recursive=True)
if len(xlsx_files) == 0:
    raise FileNotFoundError(f"Tidak ditemukan file .xlsx di {LANDING_DIR_PANDAS}. Upload dulu file Excel ke volume tersebut.")

pdf_all_list = []
for f in xlsx_files:
    try:
        sheets_of_file = read_all_sheets_with_meta(f)
    except Exception as e:
        # Best effort: a workbook that cannot be read is reported and skipped
        print(f"[WARN] Gagal baca: {f} -> {e}")
    else:
        pdf_all_list.extend(sheets_of_file)

if len(pdf_all_list) == 0:
    raise ValueError("Tidak ada sheet yang terbaca dari file-file Excel.")

# Stack every sheet of every file into one pandas DataFrame
pdf_all = pd.concat(pdf_all_list, ignore_index=True)

In [0]:
# STEP 5 — convert ke Spark DataFrame & tambah _ingest_ts (UPDATED)
import re
import pandas as pd
import numpy as np
from pyspark.sql import functions as F

# Work on a copy so that pdf_all (the concatenated sheets) is not touched by
# the column renaming and type casts applied to pdf below.
pdf = pdf_all.copy()

# --- A) Sanitizer nama kolom (Delta/UC safe) ---
def sanitize(name: str) -> str:
    """Turn an arbitrary column label into a lower-case snake_case identifier.

    Rules:
      * ``None`` is treated as an empty label.
      * A label that starts with ``_`` and already consists only of
        ``[a-z0-9_]`` (e.g. ``_source_path``, ``_sheet_name``) is returned
        unchanged.
      * Any other label is trimmed and lower-cased, every run of characters
        outside ``[a-z0-9_]`` becomes a single underscore, repeated
        underscores are collapsed, and leading/trailing underscores removed.
      * An empty result becomes ``"col"``.
      * A label that started with ``_`` but needed cleaning keeps one leading
        underscore; otherwise a result starting with a digit gets the prefix
        ``c_``.

    Args:
        name: Original column label (any object; it is converted with ``str``).

    Returns:
        The sanitised column name.
    """
    n = "" if name is None else str(name)

    # Underscore-prefixed labels are passed through only when they are already
    # safe; previously *any* such label skipped sanitising, so characters like
    # "(" or " " could leak into the table schema.
    keep_underscore = n.startswith("_")
    if keep_underscore and re.fullmatch(r"[a-z0-9_]+", n):
        return n

    n = n.strip().lower()
    # Every run of disallowed characters (spaces, punctuation, ...) -> "_"
    n = re.sub(r"[^a-z0-9_]+", "_", n)
    # Collapse repeated underscores and drop them at both ends
    n = re.sub(r"_+", "_", n).strip("_")
    if n == "":
        n = "col"

    if keep_underscore:
        return f"_{n}"
    # A leading digit gets a prefix so the name starts with a letter
    if re.match(r"^[0-9]", n):
        n = f"c_{n}"
    return n

# Apply the sanitizer to every column and keep the resulting names unique
safe_cols = []
seen = {}
for original_name in pdf.columns:
    base = sanitize(original_name)
    alias, k = base, 1
    # On a clash, try base_2, base_3, ... until the alias is unused
    while alias in seen:
        k += 1
        alias = f"{base}_{k}"
    seen[alias] = True
    safe_cols.append(alias)

pdf.columns = safe_cols

# --- B) Keep ID / policy-number columns as strings (drop a trailing ".0") ---
id_cols = [c for c in ("chdrnum", "clntnum", "lifenum") if c in pdf.columns]
for c in id_cols:
    id_as_text = pdf[c].astype("string")
    pdf[c] = id_as_text.str.replace(r"\.0$", "", regex=True)

# --- C) Cast object columns to string; numeric/datetime columns are left alone ---
object_cols = [c for c in pdf.columns if pdf[c].dtype == "object"]
for c in object_cols:
    pdf[c] = pdf[c].astype("string")

# --- D) Replace missing values (NaN/NaT) with None before handing over to Spark ---
# NOTE(review): depending on the pandas version, numeric columns may keep NaN
# after this call — confirm that nulls arrive in Spark as intended.
pdf = pdf.where(pdf.notna(), None)

# --- E) Build the Spark DataFrame and stamp it with the ingestion time ---
df = spark.createDataFrame(pdf).withColumn("_ingest_ts", F.current_timestamp())

df.printSchema()
df.show(10, truncate=False)

In [0]:
# STEP 6 — write to the Bronze Delta table using WRITE_MODE, with schema merge enabled
bronze_writer = (
    df.write
      .format("delta")
      .option("mergeSchema", "true")    # accept columns that are new in this run
      .mode(WRITE_MODE)                 # value comes from the WRITE_MODE constant
)
bronze_writer.saveAsTable(TARGET_TABLE)

print(f"Write OK → {TARGET_TABLE}")

In [0]:
# STEP 7 — quick verification: row count plus a 20-row preview of the target table
bronze_table = spark.table(TARGET_TABLE)
cnt = bronze_table.count()
print("Row count:", cnt)

preview = spark.sql(f"SELECT * FROM {TARGET_TABLE} LIMIT 20")
display(preview)