In [0]:
# Config with iterative process for each csv file in the list

# === CONFIG UMUM ===
from pyspark.sql import functions as F

# Unity Catalog coordinates of the landing volume and the Bronze target.
CATALOG        = "lapse_scoring_dev"
SCHEMA_LANDING = "00_landing"
SCHEMA_BRONZE  = "01_bronze"
VOLUME_NAME    = "chandra"
WRITE_MODE     = "append"   # or "overwrite" for an initial full refresh

# Root folder of the landing volume; every SRC_PATTERN is relative to it.
BASE_DIR = "/".join(["dbfs:/Volumes", CATALOG, SCHEMA_LANDING, VOLUME_NAME])

# Monthly folders to ingest: 201811, 201812, then 201901 through 201912.
_PERIODS = ["201811", "201812"] + [f"2019{month:02d}" for month in range(1, 13)]

# CSV base names ingested for each period; the table name uses the lower-cased form.
_MASTER_FILES = ["TRAD_MASTER", "UL_MASTER"]

# Work list: SRC_PATTERN (relative to BASE_DIR) -> TABLE_NAME in Bronze.
# To add more work, extend _PERIODS / _MASTER_FILES or append further
# {"SRC_PATTERN": ..., "TABLE_NAME": ...} dicts to JOBS.
JOBS = [
    {
        "SRC_PATTERN": f"{period}/{master}.csv",
        "TABLE_NAME": f"chandra_{period}_{master.lower()}_bronze",
    }
    for period in _PERIODS
    for master in _MASTER_FILES
]

# === PREPARE THE BRONZE CATALOG/SCHEMA ===
# Create the catalog and schema when missing and make them the session
# defaults. The statements run strictly in this order.
for _statement in (
    f"CREATE CATALOG IF NOT EXISTS {CATALOG}",
    f"USE CATALOG {CATALOG}",
    f"CREATE SCHEMA IF NOT EXISTS `{SCHEMA_BRONZE}`",
    f"USE SCHEMA `{SCHEMA_BRONZE}`",
):
    spark.sql(_statement)

def detect_sep(sample_path: str) -> str:
    """Guess the CSV delimiter (`,` or `;`) from the header line of a file.

    Only the first line of the first 4096 bytes is inspected. Counting over
    the data rows as well would let commas inside values (decimal commas,
    free text) outvote the real delimiter of a semicolon-separated file.

    Args:
        sample_path: Path of the file, readable by ``dbutils.fs.head``.

    Returns:
        ";" when the header line holds more semicolons than commas,
        otherwise "," (ties and empty files resolve to ",").
    """
    head = dbutils.fs.head(sample_path, 4096)
    # Everything before the first newline; the whole sample when there is none.
    header_line = head.partition("\n")[0]
    return "," if header_line.count(",") >= header_line.count(";") else ";"

# Normalised names of identifier columns whose raw text must be preserved.
_ID_COLUMNS = frozenset({"chdrnum", "clntnum", "lifenum"})

# Maps each character that Delta rejects in column names (unless column
# mapping is enabled on the table) to an underscore.
_COLUMN_NAME_FIXES = str.maketrans({ch: "_" for ch in " ,;{}()\n\t="})


def _normalize_column_name(raw_name: str) -> str:
    """Trim, lower-case, and replace Delta-invalid characters with `_`."""
    return raw_name.strip().lower().translate(_COLUMN_NAME_FIXES)


def ingest_one(src_pattern: str, table_name: str):
    """Load one landing CSV into a Bronze Delta table.

    Steps: detect the delimiter, infer the schema, re-read the file with the
    identifier columns pinned to string, normalise column names, add the
    technical columns ``_ingest_ts`` and ``_source_path``, write with
    ``WRITE_MODE`` and print the resulting table row count.

    Args:
        src_pattern: CSV path relative to ``BASE_DIR``.
        table_name: Target table name inside ``CATALOG``.``SCHEMA_BRONZE``.

    Returns:
        None. Errors from Spark or dbutils propagate to the caller.
    """
    # pyspark is already a dependency of this notebook; the local import
    # keeps the function usable without touching the import cell.
    from pyspark.sql.types import StringType, StructField, StructType

    src_path = f"{BASE_DIR}/{src_pattern}"
    target_table = f'{CATALOG}.`{SCHEMA_BRONZE}`.{table_name}'
    print(f"\n==> Ingest: {src_path} -> {target_table}")

    # Detect the delimiter
    sep = detect_sep(src_path)
    print(f"Detected delimiter: '{sep}'")

    csv_options = {
        "header": True,
        "sep": sep,
        "quote": '"',
        "escape": '"',
        "multiLine": True,
        "mode": "PERMISSIVE",
    }

    # Pass 1: let Spark infer the column types.
    inferred_schema = (spark.read
                       .options(**csv_options)
                       .option("inferSchema", True)
                       .csv(src_path)
                       .schema)

    # Pin the ID columns to string *before* parsing. Casting after inference
    # is too late: a value such as "00012345" has already been parsed as a
    # number and its leading zeros are gone.
    read_schema = StructType([
        StructField(field.name, StringType(), True)
        if _normalize_column_name(field.name) in _ID_COLUMNS
        else field
        for field in inferred_schema.fields
    ])

    # Pass 2: read with the explicit schema (no second inference pass).
    df = spark.read.options(**csv_options).schema(read_schema).csv(src_path)

    # Light normalisation of column names
    df = df.toDF(*[_normalize_column_name(c) for c in df.columns])

    # Technical metadata columns
    df = (df
          .withColumn("_ingest_ts", F.current_timestamp())
          .withColumn("_source_path", F.col("_metadata.file_path")))

    # Write to Bronze
    (df.write
        .mode(WRITE_MODE)
        .option("mergeSchema", "true")
        .format("delta")
        .saveAsTable(target_table))

    # Light verification: this is the row count of the whole target table,
    # not only of the rows written by this call.
    cnt = spark.table(target_table).count()
    print(f"Write OK → {target_table} | rows={cnt}")

# === RUN EVERY JOB ===
# A failing job is reported and recorded, then the loop continues with the
# next one; the collected failures are summarised at the end.
errors = []
for job in JOBS:
    try:
        ingest_one(job["SRC_PATTERN"], job["TABLE_NAME"])
    except Exception as exc:
        message = str(exc)
        print(f"[ERROR] {job['SRC_PATTERN']} -> {job['TABLE_NAME']} :: {message}")
        errors.append((job, message))

if not errors:
    print("\n=== ALL JOBS COMPLETED SUCCESSFULLY ===")
else:
    print("\n=== SUMMARY: SOME JOBS FAILED ===")
    for failed_job, failure_message in errors:
        print(f"- {failed_job['SRC_PATTERN']} -> {failed_job['TABLE_NAME']} :: {failure_message}")


In [0]:
# ==== STEP 1: konfigurasi ====
from pyspark.sql import functions as F

from pyspark.sql.types import StringType, StructField, StructType

CATALOG        = "lapse_scoring_dev"
SCHEMA_LANDING = "00_landing"
SCHEMA_BRONZE  = "01_bronze"
VOLUME_NAME    = "chandra"   # folder inside the Volume
SRC_PATTERN    = "201803/TRAD_MASTER 201803.csv"  # CSV path relative to the volume folder
TABLE_NAME     = "chandra_201803_trad_master_bronze"
WRITE_MODE     = "append"    # "overwrite" for an initial full refresh

SRC_PATH = f"dbfs:/Volumes/{CATALOG}/{SCHEMA_LANDING}/{VOLUME_NAME}/{SRC_PATTERN}"
TARGET_TABLE = f'{CATALOG}.`{SCHEMA_BRONZE}`.{TABLE_NAME}'

# ==== STEP 2: make sure the Bronze catalog & schema exist ====
spark.sql(f"CREATE CATALOG IF NOT EXISTS {CATALOG}")
spark.sql(f"USE CATALOG {CATALOG}")
spark.sql(f"CREATE SCHEMA IF NOT EXISTS `{SCHEMA_BRONZE}`")
spark.sql(f"USE SCHEMA `{SCHEMA_BRONZE}`")

# ==== STEP 3: READ the CSV from landing ====

# Detect the delimiter from the header line only. Counting over the data
# rows as well would let commas inside values (decimal commas, free text)
# outvote the real delimiter of a semicolon-separated file.
header = dbutils.fs.head(SRC_PATH, 4096).partition("\n")[0]
comma_ct = header.count(",")
semi_ct  = header.count(";")
sep = "," if comma_ct >= semi_ct else ";"

csv_options = {
    "header": True,
    "sep": sep,            # set explicitly (e.g. ";") if the detection is wrong
    "multiLine": True,     # for long text columns that contain newlines
    "quote": '"',
    "escape": '"',
    "mode": "PERMISSIVE",
}

# Characters Delta rejects in column names (unless column mapping is
# enabled on the table) are replaced with "_".
name_fixes = str.maketrans({ch: "_" for ch in " ,;{}()\n\t="})

# Normalised names of the ID columns (policy/client/life).
id_columns = {"chdrnum", "clntnum", "lifenum"}

# Pass 1: let Spark infer the column types.
inferred_schema = (spark.read
                   .options(**csv_options)
                   .option("inferSchema", True)
                   .csv(SRC_PATH)
                   .schema)

# Pin the ID columns to STRING before parsing. Casting after inference is
# too late: "00012345" has already been parsed as a number by then and its
# leading zeros are gone.
read_schema = StructType([
    StructField(field.name, StringType(), True)
    if field.name.strip().lower().translate(name_fixes) in id_columns
    else field
    for field in inferred_schema.fields
])

# Pass 2: read with the explicit schema (no second inference pass).
df = spark.read.options(**csv_options).schema(read_schema).csv(SRC_PATH)

# Light normalisation of column names
df = df.toDF(*[c.strip().lower().translate(name_fixes) for c in df.columns])

# Add technical metadata
df = (df
      .withColumn("_ingest_ts", F.current_timestamp())
      .withColumn("_source_path", F.col("_metadata.file_path"))  # replaces input_file_name()
)

# ==== STEP 4: WRITE to Bronze (Delta) ====
(df.write
   .mode(WRITE_MODE)
   .option("mergeSchema","true")
   .format("delta")
   .saveAsTable(TARGET_TABLE))

print(f"Write OK → {TARGET_TABLE}")

# ==== STEP 5: quick verification ====
print("Row count:", spark.table(TARGET_TABLE).count())
display(spark.sql(f"SELECT * FROM {TARGET_TABLE} LIMIT 20"))

In [0]:
# ==== STEP 1: konfigurasi ====
from pyspark.sql import functions as F

# Unity Catalog coordinates and the single file handled by the cells below.
CATALOG        = "lapse_scoring_dev"
SCHEMA_LANDING = "00_landing"
SCHEMA_BRONZE  = "01_bronze"
VOLUME_NAME    = "chandra"   # folder inside the Volume
SRC_PATTERN    = "201712/TRAD_MASTER 201712.csv"  # CSV path relative to the volume folder
TABLE_NAME     = "chandra_201712_trad_master_bronze"
WRITE_MODE     = "append"    # "overwrite" for an initial full refresh

# Full path of the landing file and the three-part name of the Bronze table
# (the schema part is backtick-quoted).
SRC_PATH = "/".join(["dbfs:/Volumes", CATALOG, SCHEMA_LANDING, VOLUME_NAME, SRC_PATTERN])
TARGET_TABLE = ".".join([CATALOG, f"`{SCHEMA_BRONZE}`", TABLE_NAME])

print(SRC_PATH)

In [0]:
# ==== STEP 2: make sure the Bronze catalog & schema exist ====
# Create-if-missing, then select them as the session defaults, in this order.
for _ddl in (
    f"CREATE CATALOG IF NOT EXISTS {CATALOG}",
    f"USE CATALOG {CATALOG}",
    f"CREATE SCHEMA IF NOT EXISTS `{SCHEMA_BRONZE}`",
    f"USE SCHEMA `{SCHEMA_BRONZE}`",
):
    spark.sql(_ddl)

In [0]:
# ==== STEP 3: READ the CSV from landing ====
# Imports sit at the top of the cell so every name is bound before first use.
from pyspark.sql import functions as F
from pyspark.sql.types import StringType, StructField, StructType

# Detect the delimiter from the header line only. Counting over the data
# rows as well would let commas inside values (decimal commas, free text)
# outvote the real delimiter of a semicolon-separated file.
header = dbutils.fs.head(SRC_PATH, 4096).partition("\n")[0]
comma_ct = header.count(",")
semi_ct  = header.count(";")
sep = "," if comma_ct >= semi_ct else ";"

csv_options = {
    "header": True,
    "sep": sep,            # set explicitly (e.g. ";") if the detection is wrong
    "multiLine": True,     # for long text columns that contain newlines
    "quote": '"',
    "escape": '"',
    "mode": "PERMISSIVE",
}

# Characters Delta rejects in column names (unless column mapping is
# enabled on the table) are replaced with "_".
name_fixes = str.maketrans({ch: "_" for ch in " ,;{}()\n\t="})

# Normalised names of the ID columns (policy/client/life).
id_columns = {"chdrnum", "clntnum", "lifenum"}

# Pass 1: let Spark infer the column types.
inferred_schema = (spark.read
                   .options(**csv_options)
                   .option("inferSchema", True)
                   .csv(SRC_PATH)
                   .schema)

# Pin the ID columns to STRING before parsing. Casting after inference is
# too late: "00012345" has already been parsed as a number by then and its
# leading zeros are gone.
read_schema = StructType([
    StructField(field.name, StringType(), True)
    if field.name.strip().lower().translate(name_fixes) in id_columns
    else field
    for field in inferred_schema.fields
])

# Pass 2: read with the explicit schema (no second inference pass).
df = spark.read.options(**csv_options).schema(read_schema).csv(SRC_PATH)

# Light normalisation of column names
df = df.toDF(*[c.strip().lower().translate(name_fixes) for c in df.columns])

# Add technical metadata
df = (df
      .withColumn("_ingest_ts", F.current_timestamp())
      .withColumn("_source_path", F.col("_metadata.file_path"))  # replaces input_file_name()
)

# (optional) peek at the result
df.printSchema()
display(df.limit(10))

In [0]:
# ==== STEP 4: WRITE to Bronze (Delta) ====
# Configure the Delta writer first, then persist `df` as the managed table
# named by TARGET_TABLE using the configured WRITE_MODE.
bronze_writer = (
    df.write
    .format("delta")
    .mode(WRITE_MODE)
    .option("mergeSchema", "true")
)
bronze_writer.saveAsTable(TARGET_TABLE)

print(f"Write OK → {TARGET_TABLE}")

In [0]:
# ==== STEP 5: quick verification ====
# Total rows now in the target table, followed by a 20-row preview.
bronze_row_count = spark.table(TARGET_TABLE).count()
print("Row count:", bronze_row_count)

bronze_preview = spark.sql(f"SELECT * FROM {TARGET_TABLE} LIMIT 20")
display(bronze_preview)