In [20]:
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from category_encoders import TargetEncoder
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor

In [21]:
# Load the training set, the test set and the sample submission template.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data-files/train.csv",
        "data-files/test.csv",
        "data-files/sample_submission.csv",
    )
)

In [22]:
# Separate the target column from the feature columns; the ID column is an
# identifier and is removed from both feature frames.
TARGET_COL = "성공확률"
ID_COL = "ID"

X_train = train.drop(columns=[ID_COL, TARGET_COL])
y_train = train[TARGET_COL]
X_test = test.drop(columns=[ID_COL])

In [23]:
# Derive the company age from the founding year, then drop the raw year column.
REFERENCE_YEAR = 2025  # year the age is measured against

X_train["기업나이"] = REFERENCE_YEAR - X_train["설립연도"]
X_test["기업나이"] = REFERENCE_YEAR - X_test["설립연도"]
# Rebind instead of inplace=True so the operation stays explicit and chainable.
X_train = X_train.drop(columns=["설립연도"])
X_test = X_test.drop(columns=["설립연도"])

# Coerce these columns to numeric (unparseable entries become NaN) and fill
# the gaps with the column mean.
for col in ["직원 수", "고객수(백만명)", "기업가치(백억원)"]:
    X_train[col] = pd.to_numeric(X_train[col], errors="coerce")
    X_test[col] = pd.to_numeric(X_test[col], errors="coerce")

    # The fill value is learned from the training data only and reused for the
    # test data, so both frames go through the same transformation. Filling the
    # test set with its own mean would preprocess it differently from the data
    # the models were trained on.
    train_mean = X_train[col].mean()
    X_train[col] = X_train[col].fillna(train_mean)
    X_test[col] = X_test[col].fillna(train_mean)

In [24]:
# Add derived ratio features to both frames. Each denominator is offset by 1
# so that a zero value does not cause a division by zero.
for frame in (X_train, X_test):
    # SNS followers relative to company age.
    frame["SNS_강도"] = frame["SNS 팔로워 수(백만명)"] / (frame["기업나이"] + 1)
    # Annual revenue relative to total investment.
    frame["연매출_투자비"] = frame["연매출(억원)"] / (frame["총 투자금(억원)"] + 1)
    # Total investment relative to customer count.
    frame["고객당_투자금"] = frame["총 투자금(억원)"] / (frame["고객수(백만명)"] + 1)

In [25]:
# 3. Target encoding
# The encoder is fitted on the training rows together with the target, and the
# fitted encoder is then applied to the test rows.
target_cols = ["국가", "분야", "투자단계"]
encoder = TargetEncoder(cols=target_cols)

train_encoded = encoder.fit_transform(X_train[target_cols], y_train)
X_train[target_cols] = train_encoded

test_encoded = encoder.transform(X_test[target_cols])
X_test[target_cols] = test_encoded

In [26]:
# 4. Column alignment
# Make the test frame follow the training frame's columns (left join on the
# column axis); columns that have to be created are filled with 0.
aligned_train, aligned_test = X_train.align(
    X_test,
    join="left",
    axis=1,
    fill_value=0,
)
X_train, X_test = aligned_train, aligned_test

In [27]:
# 5. Model definitions
# Hyperparameters are collected in one dict per library and unpacked into the
# corresponding regressor.
xgb_params = {
    "n_estimators": 1500,
    "learning_rate": 0.01,
    "max_depth": 8,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 42,
    "verbosity": 0,
    "n_jobs": -1,
}
xgb = XGBRegressor(**xgb_params)

lgb_params = {
    "n_estimators": 1500,
    "learning_rate": 0.01,
    "max_depth": 10,
    "num_leaves": 31,
    "random_state": 42,
    "n_jobs": -1,
}
lgb = LGBMRegressor(**lgb_params)

cat_params = {
    "iterations": 1000,
    "learning_rate": 0.02,
    "depth": 8,
    "random_state": 42,
    "verbose": 0,
}
cat = CatBoostRegressor(**cat_params)

In [28]:
# 6. Cross-validation setup for fold-averaged test predictions
# KFold comes from sklearn.model_selection and is imported in the notebook's
# import cell, so this cell runs on a fresh kernel.
N_SPLITS = 5
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

# One accumulator per model, one slot per test row; the fold loop adds each
# fold's share of the prediction into these arrays.
n_test_rows = len(X_test)
xgb_preds = np.zeros(n_test_rows)
lgb_preds = np.zeros(n_test_rows)
cat_preds = np.zeros(n_test_rows)

In [30]:
# Convert the Yes/No flag columns to 1/0 in both frames. Values that are not
# keys of yesno_map become NaN (standard Series.map behaviour).
yesno_map = {"Yes": 1, "No": 0}

for frame in (X_train, X_test):
    for flag_col in ("인수여부", "상장여부"):
        frame[flag_col] = frame[flag_col].map(yesno_map)


In [31]:
# Fit every model on each training fold and average its test-set predictions
# across the folds.
n_splits = kf.get_n_splits()  # loop-invariant, so look it up once

# Start from zero inside this cell: re-running it must not add a second round
# of predictions on top of the previous totals. The later clip to [0, 1] can
# mask such inflated values, so the mistake would go unnoticed.
xgb_preds = np.zeros(len(X_test))
lgb_preds = np.zeros(len(X_test))
cat_preds = np.zeros(len(X_test))

# Only the training indices are needed; the held-out part of each fold is not
# scored here, the folds are used purely to bag test predictions.
for train_idx, _ in kf.split(X_train):
    x_tr = X_train.iloc[train_idx]
    y_tr = y_train.iloc[train_idx]

    for model, pred_sum in ((xgb, xgb_preds), (lgb, lgb_preds), (cat, cat_preds)):
        model.fit(x_tr, y_tr)
        # In-place add, so the *_preds arrays defined above are updated.
        pred_sum += model.predict(X_test) / n_splits

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000530 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1927
[LightGBM] [Info] Number of data points in the train set: 3500, number of used features: 14
[LightGBM] [Info] Start training from score 0.534486
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000539 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1926
[LightGBM] [Info] Number of data points in the train set: 3501, number of used features: 14
[LightGBM] [Info] Start training from score 0.537104
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000527 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1925
[LightGBM] [Info] Number of data points in the train set: 3501, number of used features: 14
[LightGBM] [Info] Start traini

In [32]:
# 7. Weighted ensemble
# Blend the three fold-averaged predictions (XGBoost 0.6, LightGBM 0.25,
# CatBoost 0.15), then bound the result to the [0, 1] range.
weighted_sum = 0.6 * xgb_preds + 0.25 * lgb_preds + 0.15 * cat_preds
final_pred = weighted_sum.clip(0, 1)

In [33]:
# 8. Save the submission file
# Put the blended predictions into the target column of the submission
# template and write it out without the index column.
submission_path = "submission_stacked_weighted.csv"
submission["성공확률"] = final_pred
submission.to_csv(submission_path, index=False)