In [20]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [21]:
train = pd.read_csv("data-files/train.csv")
test = pd.read_csv("data-files/test.csv")
submission = pd.read_csv("data-files/sample_submission.csv")

In [22]:
except_cols = ["ID", "성공확률"]
X_train = train.drop(columns=except_cols)
y_train = train["성공확률"]

cat_cols = X_train.select_dtypes(include="object").columns.tolist()
X_train = pd.get_dummies(X_train, columns=cat_cols)
X_test = test.drop(columns="ID")
X_test = pd.get_dummies(X_test, columns=cat_cols)

X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

In [23]:
X_train['직원 수'] = X_train['직원 수'].fillna(X_train['직원 수'].median())
X_train['고객수(백만명)'] = X_train['고객수(백만명)'].fillna(X_train['고객수(백만명)'].median())

X_test['직원 수'] = X_test['직원 수'].fillna(X_test['직원 수'].median())
X_test['고객수(백만명)'] = X_test['고객수(백만명)'].fillna(X_test['고객수(백만명)'].median())

In [24]:
# XGBoost 모델 생성
xgb_model = XGBRegressor(
    n_estimators=300,    # 트리 개수
    max_depth=6,         # 트리 깊이
    learning_rate=0.1,   # 학습률
    random_state=42,
    n_jobs=-1            # CPU 모두 사용
)

In [25]:
xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)

In [26]:
import numpy as np

y_pred = np.round(y_pred, 3)

In [27]:
submission = pd.DataFrame({
    'ID': test['ID'],
    '성공확률': y_pred
})

In [29]:
submission.to_csv('submission_XG.csv', index=False)

In [None]:
# -----------------------------------------------------------
# MAE 측정

In [None]:
from sklearn.model_selection import train_test_split

# 학습 데이터/검증 데이터 나누기
X_train_sub, X_val, y_train_sub, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

In [None]:
# XGBoost 모델 생성
xgb_model2 = XGBRegressor(
    n_estimators=300,    # 트리 개수
    max_depth=6,         # 트리 깊이
    learning_rate=0.1,   # 학습률
    random_state=42,
    n_jobs=-1            # CPU 모두 사용
)

In [None]:
# 학습
xgb_model2.fit(X_train_sub, y_train_sub)

# 예측
y_val_pred_xgb = xgb_model.predict(X_val)

# 평가
mae_xgb = mean_absolute_error(y_val, y_val_pred_xgb)
print(f"✅ XGBoost 검증 MAE: {mae_xgb:.4f}")

✅ XGBoost 검증 MAE: 0.2102


In [18]:
import numpy as np

# 1. X_test로 최종 예측
y_pred_test = xgb_model.predict(X_test)

# 2. 예측 결과 소수점 3자리로 반올림
y_pred_test = np.round(y_pred_test, 3)

# 3. 제출 파일 만들기
submission = pd.DataFrame({
    'ID': test['ID'],          # 원본 test 파일에서 ID 가져오기
    '성공확률': y_pred_test     # 예측한 성공확률
})

# 4. 저장
submission.to_csv('submission_XG.csv', index=False)