## Dataset Model-Ready

In [1]:
# =======================================
# 0. Dataset Model-Ready (Load & Encode)
# =======================================

import sys
from pathlib import Path

import pandas as pd

# Make sure the project root is on sys.path so that the `src` package can be
# imported from this notebook (the root is taken to be the parent of the
# current working directory).
PROJECT_ROOT = Path("..").resolve()
_project_root_str = str(PROJECT_ROOT)
if _project_root_str not in sys.path:
    sys.path.append(_project_root_str)

from src.data_prep.clean_global_superstore import (
    load_raw_data,
    clean_global_superstore,
    encode_categoricals_and_process_for_model,
    save_clean_data,
    save_model_ready_data,
)

# Important paths (all resolved relative to the project root)
RAW_PATH = PROJECT_ROOT.joinpath("data", "raw", "Global_Superstore2.csv")
CLEAN_PATH = PROJECT_ROOT.joinpath("data", "processed", "global_superstore_clean.csv")
MODEL_READY_PATH = PROJECT_ROOT.joinpath(
    "data", "processed", "global_superstore_model_ready.csv"
)

# Name of the label column that is split off from the features in Step 3
TARGET_COL = "is_profitable"

# -------------------------------------------------
# Step 1: Make sure df_clean is available
# -------------------------------------------------
# If the cleaned CSV is missing, build it from the raw file and hand it to
# save_clean_data; otherwise read the existing cleaned CSV directly.
if not CLEAN_PATH.exists():
    print(f"[INFO] File clean belum ada, load raw dari {RAW_PATH} lalu cleaning...")
    df_raw = load_raw_data(RAW_PATH)
    df_clean = clean_global_superstore(df_raw)
    save_clean_data(df_clean, CLEAN_PATH)
else:
    print(f"[INFO] Load data bersih dari {CLEAN_PATH}")
    # Dates come back from CSV as strings, so parse the two date columns again.
    df_clean = pd.read_csv(
        CLEAN_PATH, parse_dates=["order_date", "ship_date"], low_memory=False
    )

print("Shape df_clean:", df_clean.shape)

# -------------------------------------------------
# Step 2: Build / load the model-ready (encoded) dataset
# -------------------------------------------------
# If no model-ready CSV exists yet, encode df_clean and hand the result to
# save_model_ready_data; otherwise read the existing file as-is.
if not MODEL_READY_PATH.exists():
    print("[INFO] Model-ready belum ada, lakukan encoding dari df_clean...")
    df_model = encode_categoricals_and_process_for_model(df_clean)
    save_model_ready_data(df_model, MODEL_READY_PATH)
else:
    print(f"[INFO] Load model-ready dari {MODEL_READY_PATH}")
    df_model = pd.read_csv(MODEL_READY_PATH, low_memory=False)

print("Shape df_model (siap modelling):", df_model.shape)

# -------------------------------------------------
# Step 3: Split features (X) and target (y)
# -------------------------------------------------
# Fail early with a clear message when the label column is absent.
if TARGET_COL not in df_model.columns:
    raise ValueError(f"Kolom target '{TARGET_COL}' tidak ditemukan di df_model.")

# Every column except the target is treated as a feature; column order is kept.
feature_cols = [col for col in df_model.columns if col != TARGET_COL]

y = df_model.loc[:, TARGET_COL]
X = df_model.loc[:, feature_cols]

print("Jumlah fitur:", len(feature_cols))
print("Contoh fitur:", feature_cols[:10])

[INFO] Load data bersih dari D:\Coding\git-repo\github\tubes-pda\tubes-if5100-global-superstore\data\processed\global_superstore_clean.csv
Shape df_clean: (51290, 30)
[INFO] Load model-ready dari D:\Coding\git-repo\github\tubes-pda\tubes-if5100-global-superstore\data\processed\global_superstore_model_ready.csv
Shape df_model (siap modelling): (51290, 52)
Jumlah fitur: 51
Contoh fitur: ['sales', 'quantity', 'discount', 'shipping_cost', 'order_priority', 'order_year', 'order_month', 'order_quarter', 'shipping_days', 'sales_per_quantity']
