In [1]:
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import platform

# 시스템이 Windows일 경우
if platform.system() == 'Windows':
    plt.rc('font', family='Pretendard')
# macOS일 경우 예시:
elif platform.system() == 'Darwin':
    plt.rc('font', family='AppleGothic')

# 마이너스 부호 깨짐 방지
plt.rcParams['axes.unicode_minus'] = False

In [1]:
from modules.data_loader import load_and_process
from modules.feature_selector import stage_feature_map

file_path = "../data/통합_train_데이터.parquet"
test_path = "../data/통합_test_데이터.parquet"

stage_name = "ab" # <- 변경해서 사용
save_path = "./models"

if stage_name!="all":
    df_train, used_cols = load_and_process(file_path=file_path, stage=stage_name)
    print("📌 가공된 컬럼 수:", len(used_cols))

데이터 불러오기 완료.
✅ 범주형 인코딩 완료
📌 가공된 컬럼 수: 30


In [None]:
from sklearn.calibration import LabelEncoder
from modules.model_AB import run_AB_modeling
from modules.model_CD import run_cd_modeling
from modules.model_E import run_E_modeling
from modules.model_baseline import run_baseline_modeling
from modules.model_utils import combine_segment_predictions
from modules.feature_selector import pca_cols, ab_feat, cd_baseline, selected_cols
from modules.feature_selector import stage_feature_map
import pandas as pd
import numpy as np

# ✅ CD만 학습
if stage_name == "cd":
    result = run_cd_modeling(df_train, used_cols, save_path)

# ✅ AB만 학습
elif stage_name == "ab":
    result = run_AB_modeling(df_train, used_cols)

# ✅ E만 학습
elif stage_name == "e":
    result = run_E_modeling(df_train, used_cols, save_path)

# ✅ 베이스라인 학습
elif stage_name in ["baseline", "base", ""]:
    from modules.model_baseline import run_baseline_modeling
    result = run_baseline_modeling(df_train, used_cols, save_path)

# ✅ 모든 모델을 실행하고 test 예측 및 통합
if stage_name == "all":
    print("🚀 전체 모델 학습 및 test 예측 시작")
    
    # 🔸 1. E 모델
    e_cols = selected_cols
    train_e, e_cols = load_and_process(file_path, stage="e")
    print("📊 [train_e] Segment 분포:\n", train_e["Segment"].value_counts())

    e_result = run_E_modeling(train_e, e_cols, save_path)
    e_model = e_result["model"]
    e_thresh = e_result["threshold"]

    test_e, _ = load_and_process(test_path, stage="e", base_cols=["ID"]) 

    e_probs = e_model.predict_proba(test_e[e_cols])[:, 1]
    e_preds_bin = (e_probs > e_thresh).astype(int)

    # 🔹 e_preds_bin: 1 = 'E', 0 = Others
    e_preds = np.full(len(test_e), fill_value=None)  # 최종 레이블 저장용

    # 🔸 2. AB 모델 (E 제외 + A/B 예측)
    ab_mask = (e_preds_bin == 0)
    test_ab = test_e.loc[ab_mask].copy()
    train_ab, ab_cols = load_and_process(file_path, stage="ab")
    ab_result = run_AB_modeling(train_ab, ab_cols, save_path)
    ab_model = ab_result["model"]
    ab_thresh = ab_result["threshold"]

    ab_probs = ab_model.predict_proba(test_ab[ab_cols])[:, 1]
    ab_preds_bin = (ab_probs > ab_thresh).astype(int)
    ab_mapping = {0: 'A', 1: 'B'}

    # 🔸 3. CD 모델 (E 제외 + C/D 예측)
    train_cd, cd_cols = load_and_process(file_path, stage="cd")
    cd_result = run_cd_modeling(train_cd, cd_cols, save_path)
    cd_model = cd_result["model"]
    cd_thresh = cd_result["threshold"]

    test_cd = test_e.loc[ab_mask].copy()  # CD와 AB는 같은 대상에서 갈라짐
    cd_probs = cd_model.predict_proba(test_cd[cd_cols])[:, 1]
    cd_preds_bin = (cd_probs > cd_thresh).astype(int)
    cd_mapping = {0: 'C', 1: 'D'}

    # 🔸 4. 최종 레이블 통합
    final_preds = []

    for i in range(len(test_e)):
        if e_preds_bin[i] == 1:
            final_preds.append('E')
        else:
            # AB/CD 분기 기준 필요 (예: AB/CD 모델 기준 스코어 비교 or 별 기준)
            # 여기서는 예시로 AB 모델 우선 적용 (B vs A)
            if i in test_ab.index:
                final_preds.append(ab_mapping[ab_preds_bin[test_ab.index.get_loc(i)]])
            else:
                final_preds.append(cd_mapping[cd_preds_bin[test_cd.index.get_loc(i)]])


🔍 AB 세그먼트 모델링 시작
📌 A/B 샘플 수: 1116
✅ 데이터 분할 완료: (892, 30) (224, 30)
✅ SMOTE 적용 완료
  → Before: 892 samples
  → After:  1554 samples (class balanced)
🧠 XGBoost 모델 정의 완료
Parameters: { "device" } are not used.

Parameters: { "device" } are not used.

✅ [XGB_AB]
📊 성능 평가:
              precision    recall  f1-score   support

           0       0.97      0.94      0.95       195
           1       0.66      0.79      0.72        29

    accuracy                           0.92       224
   macro avg       0.81      0.87      0.84       224
weighted avg       0.93      0.92      0.92       224

F1 Macro : 0.8359375
F1 Micro : 0.9196428571428571
F1 Weighted : 0.9227818080357143

🌟 최적 Threshold: 0.786 (F1 Score: 0.7857)


XGBoostError: [00:44:53] D:\bld\xgboost-split_1700181085428\work\dmlc-core\src\io\local_filesys.cc:209: Check failed: allow_null:  LocalFileSystem::Open "./models": Permission denied

In [4]:
# 9. Booster로 저장 (JSON)
import os
if os.path.exists("./models") and not os.path.isdir("./models"):
    raise NotADirectoryError("❗ './models' 경로가 디렉토리가 아니라 파일입니다. 삭제하거나 경로명을 변경하세요.")
os.makedirs("./models", exist_ok=True)

booster = model.get_booster()
booster.save_model(save_path)


NameError: name 'model' is not defined

In [None]:
    # 🔸 5. 제출 파일 저장
submission = pd.read_csv("../data/sample_submission.csv")
submission["Segment"] = final_preds
submission.to_csv("prediction_binary.csv", index=False)
print("✅ submission_final.csv 저장 완료")

In [None]:
print("🧪 stage_feature_map keys:", stage_feature_map.keys())

🧪 stage_feature_map keys: dict_keys(['base', 'cd_base', 'cd', 'ab'])


In [None]:
df_train.head(5)

In [None]:
result

In [None]:
df_train["Segment"].value_counts()

In [None]:
# 예측값 확인
print(result["y_pred"][:5])

# F1 score 출력
print("📌 최적 F1 Score:", result["f1_score"])

# 모델 확인
print("모델 구조:")
print(result["model"])

# SHAP 분석용
import shap
explainer = shap.Explainer(result["model"])
shap_values = explainer(result["X_val"])
shap.plots.beeswarm(shap_values)

In [None]:
from modules.model_utils import get_xgb_model, train_and_evaluate, apply_smote
from modules.model_utils import get_model

X_resampled, y_resampled = apply_smote(X_train, y_train) # SMOTE 적용, k_neighbors 조절 가능
model = get_xgb_model() # xgb 모델 가져오기 (기본) / 파라미터 조절 가능

#get_model로 가져오기도 가능
model1 = get_model("xgb", {"max_depth": 4})

model.fit(X_train, y_train)
model = train_and_evaluate(model, X_train, y_train, X_val, y_val)

In [None]:
from modules.model_utils import apply_best_threshold

best_thresh, best_f1, y_pred = apply_best_threshold(model, X_val, y_val)

In [None]:
from modules.model_utils import save_model_and_results

model = get_model("xgb")
model.fit(X_train, y_train)

save_model_and_results(model, model_name="XGB_CD", save_path="./outputs/cd_models")

In [None]:
# 개별모델 반복실행
from modules.model_utils import get_model
from modules.ensemble import run_custom_experiments

model_list = [
    ("XGB_d4", get_model("xgb", {"max_depth": 4})),
    ("XGB_d6", get_model("xgb", {"max_depth": 6})),
    ("RF_100", get_model("rf", {"n_estimators": 100})),
    ("LR_C1", get_model("lr", {"C": 1.0}))
]
#results_df = run_custom_experiments(X_train, y_train, X_val, y_val, model_list) <- 위 조절후 주석지우고 사용

In [None]:
# Voting 앙상블 실행
from modules.model_utils import get_model
from modules.ensemble import run_voting_ensemble

models = [
    ("xgb", get_model("xgb")),
    ("rf", get_model("rf")),
    ("lgbm", get_model("lgbm"))
]
weights = [3, 1, 2]

# result = run_voting_ensemble(X_train, y_train, X_val, y_val, model_configs=models, weights=weights) 

In [None]:
# Stacking 앙상블 실행
from modules.model_utils import get_model
from modules.ensemble import run_stacking_ensemble

base_models = [
    ("lr", get_model("lr")),
    ("cat", CatBoostClassifier(verbose=0)),
    ("svm", SVC(probability=True))
]

# 메타 모델로 XGBoost 사용
#result = run_stacking_ensemble(X_train, y_train, X_val, y_val, base_models, meta_model_name="xgb")

In [None]:
import model_CD

stage_to_model = {
    "cd": train_model_CD
    #"ab": train_model_AB,
}

# 실행
if stage_name in stage_to_model:
    model = stage_to_model[stage_name](df_train, used_cols)

# 항목별 확인 예시

In [None]:
# 예측값 확인
print(result["y_pred"][:5])

# F1 score 출력
print("📌 최적 F1 Score:", result["f1_score"])

# 모델 확인
print("모델 구조:")
print(result["model"])

# SHAP 분석용
import shap
explainer = shap.Explainer(result["model"])
shap_values = explainer(result["X_val"])
shap.plots.beeswarm(shap_values)