In [None]:
# Location of the training parquet file on the mounted Google Drive.
# NOTE(review): absolute Colab path; it only resolves once Drive is mounted at /content/drive.
file_path = "/content/drive/MyDrive/data/통합_train_데이터.parquet"

In [None]:
import pandas as pd
from modules.feature_selector import selected_cols

# Columns to read: the selected features followed by the ID and Segment columns.
final_cols = [*selected_cols, "ID", "Segment"]

# Read only those columns from the parquet file instead of the whole table.
train_df = pd.read_parquet(
    file_path,
    columns=final_cols,
)

In [None]:
from modules.data_loader import map_categorical_columns
from sklearn.preprocessing import LabelEncoder
from modules.feature_selector import selected_cols, generate_vif_derived_features, generate_e_features

# Step 1. Pick the columns to encode: every object-dtype column except the
# identifier and the label.
exclude_cols = ["ID", "Segment"]
target_col = "Segment"
categorical_cols = [
    col
    for col in train_df.columns
    if train_df[col].dtype == "object" and col not in exclude_cols
]

# Step 2. Work on a copy so train_df stays exactly as loaded.
df_processed = train_df.copy()

# Step 3. Blank out placeholder values, then label-encode each categorical column.
# Every fitted encoder is kept in label_encoders (column name -> LabelEncoder) so
# the same category-to-integer mapping can be reapplied to other data or inverted
# later; previously each encoder was discarded on the next loop iteration.
label_encoders = {}
for col in categorical_cols:
    df_processed[col] = df_processed[col].replace(['?', '알파벳', '기타'], pd.NA)
    le = LabelEncoder()
    # astype(str) stringifies missing values too, so they are encoded as
    # categories of their own rather than raising inside the encoder.
    df_processed[col] = le.fit_transform(df_processed[col].astype(str))
    label_encoders[col] = le

# Step 4. Build the derived features on the encoded frame: first the output of
# generate_e_features, then generate_vif_derived_features applied to that result.
df_processed = generate_e_features(df_processed)
df_processed = generate_vif_derived_features(df_processed)


In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit a label encoder on the Segment labels and overwrite the column with the
# resulting integer codes.
le_segment = LabelEncoder()
segment_codes = le_segment.fit_transform(df_processed["Segment"])
df_processed["Segment"] = segment_codes

# Keep the label -> code mapping so predictions can be decoded later.
segment_classes = le_segment.classes_
segment_label_mapping = dict(
    zip(segment_classes, le_segment.transform(segment_classes))
)
print("📌 Segment 라벨 매핑:", segment_label_mapping)

In [None]:
# Binary target: 1 for Segment E, 0 for every other segment.
# The integer code for "E" is read from the fitted encoder instead of being
# hard-coded as 4, which is only correct when all five segments are present.
# transform() raises ValueError if "E" is not among the fitted classes, so a
# mismatch fails loudly instead of silently producing an all-zero target.
# NOTE(review): assumes the raw Segment label is the string "E" — confirm.
segment_e_code = le_segment.transform(["E"])[0]
df_processed["target"] = (df_processed["Segment"] == segment_e_code).astype(int)

In [None]:
# 3. Separate the binary target from the feature matrix.
# ID, the encoded Segment and the target itself are dropped from the features.
non_feature_cols = ["ID", "Segment", "target"]
y = df_processed["target"]
X = df_processed.drop(columns=non_feature_cols)

print("✅ X shape:", X.shape)
print("✅ y 분포:\n", y.value_counts())

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation. The split is stratified on y so both
# parts keep the same class ratio, and the seed is fixed for reproducibility.
split_options = {
    "test_size": 0.2,
    "random_state": 42,
    "stratify": y,
}
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

# Sanity check: sizes of both splits, then their class proportions.
train_class_ratio = y_train.value_counts(normalize=True)
val_class_ratio = y_val.value_counts(normalize=True)
print("✅ 학습셋 크기:", X_train.shape, y_train.shape)
print("✅ 검증셋 크기:", X_val.shape, y_val.shape)
print("✅ 학습셋 클래스 분포:\n", train_class_ratio)
print("✅ 검증셋 클래스 분포:\n", val_class_ratio)

In [None]:
import xgboost as xgb
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
# Wrap both splits as XGBoost DMatrix objects.
# NOTE(review): the fit() call at the end of this cell takes X_train / y_train
# directly and does not use dtrain / dval; confirm whether later cells need them.
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

# Hyperparameters for GPU training of the binary classifier.
# 'use_label_encoder' is intentionally omitted: XGBoost releases that accept the
# 'device' parameter no longer have that option, so passing it only forwards an
# unused parameter to the booster.
best_params = {
    'max_depth': 8,
    'learning_rate': 0.2435,
    'min_child_weight': 9,
    'subsample': 0.6043,
    'colsample_bytree': 0.8550,
    'gamma': 3.3658,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'tree_method': 'hist',   # histogram tree method, selected together with 'device'
    'device': 'cuda',        # train on the GPU
}

# Train through the scikit-learn wrapper on the training split.
model = XGBClassifier(**best_params, verbosity=0)
model.fit(X_train, y_train)