## Load Data

In [None]:
import sys
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split

# Resolve the project root (the parent of the current working directory) and
# register it on sys.path once, so project packages can be imported from here.
PROJECT_ROOT = Path("..").resolve()
if sys.path.count(str(PROJECT_ROOT)) == 0:
    sys.path.append(str(PROJECT_ROOT))

from src.data_prep.clean_global_superstore import (
    load_raw_data,
    clean_global_superstore,
    encode_categoricals_and_process_for_model,
    save_clean_data,
    save_model_ready_data,
)

# --- Configuration: data locations (all relative to PROJECT_ROOT) ---
RAW_PATH = PROJECT_ROOT / "data" / "raw" / "Global_Superstore2.csv"
CLEAN_PATH = PROJECT_ROOT / "data" / "processed" / "global_superstore_clean.csv"
MODEL_READY_PATH = PROJECT_ROOT / "data" / "processed" / "global_superstore_model_ready.csv"

# Name of the label column; it is looked up in df_model further down the notebook.
TARGET_COL = "is_profitable"

# --- Step 1: load / generate df_clean ---
# Remember whether the cleaned data had to be rebuilt in this run. If it was,
# a model-ready file left over from an earlier run may no longer match df_clean
# and must not be reused in step 2.
clean_was_regenerated = not CLEAN_PATH.exists()

if clean_was_regenerated:
    print(f"[INFO] File clean belum ada, load raw dari {RAW_PATH} lalu cleaning...")
    df_raw = load_raw_data(RAW_PATH)
    df_clean = clean_global_superstore(df_raw)
    save_clean_data(df_clean, CLEAN_PATH)
else:
    print(f"[INFO] Load data bersih dari {CLEAN_PATH}")
    df_clean = pd.read_csv(
        CLEAN_PATH,
        # Ask pandas to parse these two columns as dates while reading the CSV.
        parse_dates=["order_date", "ship_date"],
        low_memory=False,
    )

print("Shape df_clean:", df_clean.shape)

# --- Step 2: load / generate df_model (model-ready) ---
# The cached model-ready file is only trusted when df_clean itself came from
# the cache; otherwise the encoding is recomputed from the fresh df_clean.
model_ready_cached = MODEL_READY_PATH.exists()

if model_ready_cached and not clean_was_regenerated:
    print(f"[INFO] Load model-ready dari {MODEL_READY_PATH}")
    df_model = pd.read_csv(MODEL_READY_PATH, low_memory=False)
else:
    if model_ready_cached:
        # Stale cache: the clean data was rebuilt after this file was written.
        print(
            "[INFO] Data bersih baru dibuat ulang; file model-ready lama diabaikan, "
            "lakukan encoding ulang dari df_clean..."
        )
    else:
        print("[INFO] File model-ready belum ada, lakukan encoding dari df_clean...")
    df_model = encode_categoricals_and_process_for_model(df_clean)
    # NOTE(review): presumably this overwrites any existing file at
    # MODEL_READY_PATH -- confirm in src.data_prep.clean_global_superstore.
    save_model_ready_data(df_model, MODEL_READY_PATH)

print("Shape df_model:", df_model.shape)



[INFO] Load data bersih dari D:\Coding\git-repo\github\tubes-pda\tubes-if5100-global-superstore\data\processed\global_superstore_clean.csv
Shape df_clean: (51290, 30)
[INFO] Load model-ready dari D:\Coding\git-repo\github\tubes-pda\tubes-if5100-global-superstore\data\processed\global_superstore_model_ready.csv
Shape df_model: (51290, 52)


In [4]:
# Preview the first rows of the model-ready frame; as the last expression of
# the cell it is rendered with the notebook's rich DataFrame display.
df_model.head()

Unnamed: 0,sales,quantity,discount,shipping_cost,order_priority,order_year,order_month,order_quarter,shipping_days,sales_per_quantity,...,sub_category_Envelopes,sub_category_Fasteners,sub_category_Furnishings,sub_category_Labels,sub_category_Machines,sub_category_Paper,sub_category_Phones,sub_category_Storage,sub_category_Supplies,sub_category_Tables
0,2309.65,7,0.0,933.57,3,2012,7,3,0,329.95,...,False,False,False,False,False,False,False,False,False,False
1,3709.395,9,0.1,923.63,3,2013,2,1,2,412.155,...,False,False,False,False,False,False,False,False,False,False
2,5175.171,9,0.1,915.49,1,2013,10,4,1,575.019,...,False,False,False,False,False,False,True,False,False,False
3,2892.51,5,0.1,910.16,1,2013,1,1,2,578.502,...,False,False,False,False,False,False,True,False,False,False
4,2832.96,8,0.0,903.04,3,2013,11,4,1,354.12,...,False,False,False,False,False,False,False,False,False,False


In [5]:
# --- Step 3: separate the features (X) from the target (y) ---
# Fail with an explicit error when the label column is absent from df_model.
if TARGET_COL in df_model.columns:
    y = df_model[TARGET_COL]
    X = df_model.drop(columns=[TARGET_COL])
else:
    raise ValueError(f"Kolom target '{TARGET_COL}' tidak ditemukan di df_model.")

# Quick look at how many feature columns remain and what the first ten are called.
print("Jumlah fitur:", len(X.columns))
print("Contoh nama fitur:", X.columns[:10].tolist())

Jumlah fitur: 51
Contoh nama fitur: ['sales', 'quantity', 'discount', 'shipping_cost', 'order_priority', 'order_year', 'order_month', 'order_quarter', 'shipping_days', 'sales_per_quantity']


## Split Data

In [6]:
# Split settings, named so they can be found and tuned in one place.
TEST_SIZE = 0.2     # fraction of rows held out as the test set
RANDOM_STATE = 42   # fixed seed so the split is reproducible across runs

# Stratify on y so the train and test sets keep similar class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=y,
)

# Sanity checks: every row lands in exactly one partition, and the features
# and labels of each partition stay aligned on the same index.
assert len(X_train) + len(X_test) == len(X), "train/test split lost or duplicated rows"
assert X_train.index.equals(y_train.index), "X_train and y_train are misaligned"
assert X_test.index.equals(y_test.index), "X_test and y_test are misaligned"