In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

In [3]:
# Load the data: training set, test set, and the sample-submission template.
train, test, submission = [
    pd.read_csv(csv_path)
    for csv_path in (
        "data-files/train.csv",
        "data-files/test.csv",
        "data-files/sample_submission.csv",
    )
]

In [4]:
# Separate the target from the features.
# The "ID" column is dropped from both feature frames; the target column
# "성공확률" is additionally dropped from the training features.
X_test = test.drop(["ID"], axis="columns")
X_train = train.drop(["ID", "성공확률"], axis="columns")
y_train = train["성공확률"]

In [5]:
# Create the company-age feature ("기업나이") as 2025 minus the founding year.
# `pop` removes the "설립연도" column from the frame and returns it, so the
# raw year column is gone once the age column has been appended.
X_train["기업나이"] = 2025 - X_train.pop("설립연도")
X_test["기업나이"] = 2025 - X_test.pop("설립연도")

In [6]:
# Categorical encoding: every object-dtype column is label-encoded to integers.
cat_cols = X_train.select_dtypes(include="object").columns
for col in cat_cols:
    train_labels = X_train[col].astype(str)
    test_labels = X_test[col].astype(str)

    le = LabelEncoder()
    # Fit on the labels of BOTH frames. An encoder fitted on the training
    # labels alone raises ValueError in `transform` as soon as the test set
    # contains a category that never appears in training. Because the encoder
    # sorts its classes, the codes are unchanged whenever the test set has no
    # new categories.
    le.fit(pd.concat([train_labels, test_labels], ignore_index=True))

    X_train[col] = le.transform(train_labels)
    X_test[col] = le.transform(test_labels)

In [7]:
# Missing-value handling: mean imputation for the listed numeric columns.
# The mean is computed once on the training data and reused for the test data,
# so both frames go through exactly the same transformation. Filling the test
# set with its own mean would make the preprocessing depend on which rows
# happen to be in the test file.
for col in ['직원 수', '고객수(백만명)']:
    train_mean = X_train[col].mean()  # NaNs are skipped by default
    X_train[col] = X_train[col].fillna(train_mean)
    X_test[col] = X_test[col].fillna(train_mean)

In [8]:
# Derived features: three ratio columns, added identically to both frames.
# Every denominator has 1 added to it (presumably to avoid dividing by zero).
for frame in (X_train, X_test):
    revenue = frame['연매출(억원)']
    customers_plus_one = frame['고객수(백만명)'] + 1

    # Revenue relative to total investment.
    frame['매출_투자비'] = revenue / (frame['총 투자금(억원)'] + 1)
    # Revenue relative to customer count.
    frame['고객당매출'] = revenue / customers_plus_one
    # SNS followers relative to customer count.
    frame['SNS_비율'] = frame['SNS 팔로워 수(백만명)'] / customers_plus_one

In [9]:
# Model definition (tuned settings applied).
# The LightGBM hyperparameters are collected in a dict and unpacked into the
# constructor.
lgb_params = {
    "n_estimators": 1500,
    "learning_rate": 0.01,
    "max_depth": 10,
    "num_leaves": 31,
    "random_state": 42,
    "n_jobs": -1,
}
lgb = LGBMRegressor(**lgb_params)

In [10]:
# XGBoost regressor; hyperparameters are collected in a dict and unpacked
# into the constructor.
xgb_params = {
    "n_estimators": 1500,
    "learning_rate": 0.01,
    "max_depth": 8,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 42,
    "verbosity": 0,
    "n_jobs": -1,
}
xgb = XGBRegressor(**xgb_params)

In [11]:
# CatBoost regressor; hyperparameters are collected in a dict and unpacked
# into the constructor.
cat_params = {
    "iterations": 1000,
    "learning_rate": 0.02,
    "depth": 8,
    "random_state": 42,
    "verbose": 0,
}
cat = CatBoostRegressor(**cat_params)

In [12]:
# Model training: all three regressors are fitted on the same features/target.
fit_args = (X_train, y_train)
lgb.fit(*fit_args)
xgb.fit(*fit_args)
# Kept as the final expression so the cell still displays the fitted model.
cat.fit(*fit_args)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000226 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1932
[LightGBM] [Info] Number of data points in the train set: 4376, number of used features: 15
[LightGBM] [Info] Start training from score 0.537340


<catboost.core.CatBoostRegressor at 0x1411cf8d510>

In [13]:
# Prediction: run each fitted model on the test features, in the order
# LightGBM, XGBoost, CatBoost.
lgb_pred, xgb_pred, cat_pred = (
    model.predict(X_test) for model in (lgb, xgb, cat)
)

In [16]:
# Display the LightGBM test-set predictions as a quick visual check.
lgb_pred

array([0.52195865, 0.46053917, 0.46099733, ..., 0.57982434, 0.46916482,
       0.59636073], shape=(1755,))

In [17]:
# Soft voting (averaging ensemble), weighted rather than a plain mean.
# An equal-weight alternative would be (lgb_pred + xgb_pred + cat_pred) / 3.
# XGB was the best-performing model, so it receives the largest weight.
w_xgb, w_lgb, w_cat = 0.6, 0.25, 0.15
blended = xgb_pred * w_xgb + lgb_pred * w_lgb + cat_pred * w_cat
# The success probability lies in the 0-1 range, so clamp the blend to it.
final_pred = np.clip(blended, 0, 1)

In [18]:
# Put the weighted-ensemble predictions into the submission template and
# write it to disk without the index column.
ensemble_csv = "submission_best_ensemble2.csv"
submission["성공확률"] = final_pred
submission.to_csv(ensemble_csv, index=False)

In [14]:
# Clamp each model's raw predictions to the [0, 1] interval.
lgb_final, xgb_final, cat_final = (
    np.clip(raw_pred, 0, 1) for raw_pred in (lgb_pred, xgb_pred, cat_pred)
)

In [15]:
# Write one submission file per individual model. The shared template's
# prediction column is overwritten before each save, so after this cell it
# holds the CatBoost predictions.
for csv_name, clipped_pred in (
    ("submission_lgb_final.csv", lgb_final),
    ("submission_xgb_final.csv", xgb_final),
    ("submission_cat_final.csv", cat_final),
):
    submission["성공확률"] = clipped_pred
    submission.to_csv(csv_name, index=False)