In [0]:
# Authenticate this Spark session against the ADLS Gen2 account with its access key.
# The key is fetched from a Databricks secret scope at run time so it never appears
# in the notebook source, in cell outputs, or in version control.
# NOTE(review): the scope/key names below are placeholders — point them at the
# secret that actually holds this storage account's access key.
spark.conf.set(
  "fs.azure.account.key.goodreadsreviews60300832.dfs.core.windows.net",
  dbutils.secrets.get(scope="goodreads-lakehouse", key="storage-account-key"),
)

In [0]:

# Storage coordinates for the lakehouse container.
storage_acct = "goodreadsreviews60300832"
container = "lakehouse"

# Both layer paths share one ABFSS root; only the trailing folder differs.
lake_root = "abfss://" + container + "@" + storage_acct + ".dfs.core.windows.net"
silver_path = lake_root + "/processed"
gold_path = lake_root + "/gold"


In [0]:
# --- INPUT: curated Gold table path ---
curated_path = f"{gold_path}/features_v1"   # adjust if your curated table lives elsewhere

# ================================
# I) Load curated reviews (Gold)
# ================================
curated = spark.read.format("delta").load(curated_path)

# Minimal columns required; keep any others you need later
required_cols = ["review_id","book_id","author_id","rating","review_text","date_added","n_votes"]

# Fail fast and name every absent column at once, so a schema mismatch can be
# fixed in one pass instead of one re-run per missing column.
missing_cols = [c for c in required_cols if c not in curated.columns]
if missing_cols:
    raise ValueError(f"Missing required column(s): {', '.join(missing_cols)}")

# Filter BEFORE de-duplicating: dropDuplicates keeps an arbitrary row per
# review_id, so de-duplicating first could keep a copy with null/short text and
# then lose that review entirely even though a valid copy existed.
curated = (
    curated
    .filter("review_text IS NOT NULL AND length(review_text) >= 10")
    .dropDuplicates(["review_id"])
)


In [0]:
# ===================================
# II) 70/15/15 Split (no data leakage)
# ===================================
# Each split is assigned by a salted hash of review_id rather than randomSplit.
# randomSplit re-evaluates its (uncached) input once per downstream action; with
# a non-deterministic upstream step such as dropDuplicates, the same review can
# then land in more than one split, or in none. A hash bucket depends only on
# the review_id value, so every review belongs to exactly one split and the
# assignment is identical on every re-run.
SPLIT_SALT = 42  # change to draw a different (still deterministic) partition

bucketed = curated.selectExpr(
    "*",
    f"pmod(xxhash64(review_id, {SPLIT_SALT}), 100) AS _split_bucket",  # bucket in 0..99
)

# Buckets 0-69 -> train (~70%), 70-84 -> val (~15%), 85-99 -> test (~15%).
train_df = bucketed.filter("_split_bucket < 70").drop("_split_bucket")
val_df = bucketed.filter("_split_bucket >= 70 AND _split_bucket < 85").drop("_split_bucket")
test_df = bucketed.filter("_split_bucket >= 85").drop("_split_bucket")

# Expose the splits as session-scoped temp views (nothing is written to storage
# here; we'll transform, then write the final feature tables).
for df, name in [(train_df,"train"), (val_df,"val"), (test_df,"test")]:
    df.createOrReplaceTempView(f"split_{name}")


In [0]:
# ===================================
# III) Save raw splits to Gold layer
# ===================================

# Each split is stored in its own Delta directory under gold/features_v2.
feature_v2_base = f"{gold_path}/features_v2"
train_out = f"{feature_v2_base}/train"
val_out = f"{feature_v2_base}/val"
test_out = f"{feature_v2_base}/test"

split_targets = [(train_df, train_out), (val_df, val_out), (test_df, test_out)]

# Overwrite mode replaces whatever Delta table already exists at the target.
for split_df, target in split_targets:
    split_df.write.format("delta").mode("overwrite").save(target)

# Report the destinations only after all three writes have completed.
print("Splits saved to:")
for _, target in split_targets:
    print(" -", target)

Splits saved to:
 - abfss://lakehouse@goodreadsreviews60300832.dfs.core.windows.net/gold/features_v2/train
 - abfss://lakehouse@goodreadsreviews60300832.dfs.core.windows.net/gold/features_v2/val
 - abfss://lakehouse@goodreadsreviews60300832.dfs.core.windows.net/gold/features_v2/test


In [0]:
# List the files in the train split's Delta transaction log directory.
train_delta_log = f"{gold_path}/features_v2/train/_delta_log"
display(dbutils.fs.ls(train_delta_log))

path,name,size,modificationTime
abfss://lakehouse@goodreadsreviews60300832.dfs.core.windows.net/gold/features_v2/train/_delta_log/00000000000000000000.crc,00000000000000000000.crc,3323,1762798435000
abfss://lakehouse@goodreadsreviews60300832.dfs.core.windows.net/gold/features_v2/train/_delta_log/00000000000000000000.json,00000000000000000000.json,332861,1762798434000
abfss://lakehouse@goodreadsreviews60300832.dfs.core.windows.net/gold/features_v2/train/_delta_log/__tmp_path_dir/,__tmp_path_dir/,0,1762798434000
abfss://lakehouse@goodreadsreviews60300832.dfs.core.windows.net/gold/features_v2/train/_delta_log/_staged_commits/,_staged_commits/,0,1762798342000
