In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

In [15]:
# Load the training set, the test set, and the sample submission template.
train, test, submission = map(
    pd.read_csv,
    (
        "data-files/train.csv",
        "data-files/test.csv",
        "data-files/sample_submission.csv",
    ),
)

In [16]:
# Split the target column from the predictors; "ID" is excluded from the model inputs.
X_test = test.drop("ID", axis=1)
X_train = train.drop(["ID", "성공확률"], axis=1)
y_train = train["성공확률"]

In [17]:
# Derive the company age ("기업나이") from the founding year ("설립연도"), then drop the raw
# year column from both frames.
# The reference year is a named constant instead of an inline literal so the choice is visible
# and can be changed in one place; it is pinned (not read from the clock) so the feature values
# stay the same whenever the notebook is re-executed.
REFERENCE_YEAR = 2025

# Build new frames with assign/drop rather than mutating with inplace=True.
# The new column is passed through a dict so the column label is used exactly as written.
X_train = X_train.assign(**{"기업나이": REFERENCE_YEAR - X_train["설립연도"]}).drop(columns=["설립연도"])
X_test = X_test.assign(**{"기업나이": REFERENCE_YEAR - X_test["설립연도"]}).drop(columns=["설립연도"])

In [18]:
# Label-encode every object-typed column. The category -> integer mapping is learned from the
# training data only and then applied to the test data.
cat_cols = X_train.select_dtypes(include="object").columns
for col in cat_cols:
    le = LabelEncoder()
    # Values are cast to str first, so missing values become the literal category "nan".
    X_train[col] = le.fit_transform(X_train[col].astype(str))

    # LabelEncoder.transform raises ValueError on a category it did not see during fit, so a
    # single new value in the test set would abort the notebook. Look the codes up against the
    # fitted classes instead: a seen category gets its position in le.classes_ (the same code
    # transform() returns), and an unseen category gets the sentinel code -1.
    test_codes = pd.Categorical(X_test[col].astype(str), categories=le.classes_).codes
    # Categorical codes use a compact integer dtype; widen to int64 to match the train column.
    X_test[col] = test_codes.astype(np.int64)

In [19]:
# Impute missing values in the listed columns with the column mean.
# The mean is computed on the training data once and used to fill BOTH frames, so train and
# test go through the same transformation and the fill value does not depend on the contents
# of the test set.
for col in ['직원 수', '고객수(백만명)']:
    train_mean = X_train[col].mean()
    X_train[col] = X_train[col].fillna(train_mean)
    X_test[col] = X_test[col].fillna(train_mean)

In [20]:
# Engineered ratio features, added in place to both the train and the test frame.
# Every denominator is offset by 1 (presumably to avoid dividing by zero).
for frame in (X_train, X_test):
    revenue = frame['연매출(억원)']
    customers_plus_one = frame['고객수(백만명)'] + 1

    # Revenue relative to total investment.
    frame['매출_투자비'] = revenue / (frame['총 투자금(억원)'] + 1)
    # Revenue relative to customer count.
    frame['고객당매출'] = revenue / customers_plus_one
    # SNS followers relative to customer count.
    frame['SNS_비율'] = frame['SNS 팔로워 수(백만명)'] / customers_plus_one

In [28]:
# Display the test feature frame to inspect the encoded, imputed, and derived columns.
X_test

Unnamed: 0,국가,분야,투자단계,직원 수,인수여부,상장여부,고객수(백만명),총 투자금(억원),연매출(억원),SNS 팔로워 수(백만명),기업가치(백억원),기업나이,매출_투자비,고객당매출,SNS_비율
0,9,9,4,3261.0,0,1,45.0,5021.0,6680.0,2.00,0,23,1.330147,145.217391,0.043478
1,0,8,4,3707.0,1,0,70.0,1601.0,4654.0,4.20,5,5,2.905119,65.549296,0.059155
2,5,6,0,236.0,1,1,89.0,4709.0,9289.0,1.00,4,11,1.972187,103.211111,0.011111
3,0,5,1,637.0,1,1,17.0,2145.0,7005.0,5.00,0,22,3.264212,389.166667,0.277778
4,9,9,1,4922.0,1,0,68.0,4995.0,7593.0,4.36,5,19,1.519816,110.043478,0.063188
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1750,2,6,1,2870.0,1,1,49.0,539.0,1942.0,4.00,0,10,3.596296,38.840000,0.080000
1751,0,9,0,278.0,1,1,35.0,2373.0,10847.0,3.00,3,19,4.569082,301.305556,0.083333
1752,0,3,0,1478.0,0,1,96.0,4215.0,8297.0,3.00,0,23,1.967979,85.536082,0.030928
1753,9,9,0,3570.0,0,0,59.0,3333.0,1399.0,5.00,3,3,0.419616,23.316667,0.083333


In [30]:
# XGBoost regressor settings, collected in one mapping and unpacked into the constructor.
# Trailing notes are translated from the original author's inline comments.
xgb_params = {
    "n_estimators": 2000,
    "learning_rate": 0.01,      # author's note: if this is too slow, training stalls
    "max_depth": 6,
    "min_child_weight": 1,      # author's note: relaxes the split restriction
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "gamma": 0,                 # author's note: removes the information-gain threshold
    "reg_alpha": 0.1,           # author's note: relaxed L1 regularization
    "reg_lambda": 1,            # author's note: L2 regularization kept
    "random_state": 42,
    "n_jobs": -1,
    "verbosity": 1,
}
xgb = XGBRegressor(**xgb_params)

In [31]:
# Train the model on all of X_train / y_train (no validation split is held out here).
xgb.fit(X=X_train, y=y_train)

In [32]:
# Predict the target for every row of the test features.

xgb_pred = xgb.predict(X_test)

In [33]:
# Bound the raw regression outputs to the closed interval [0, 1]; the target column is named
# "성공확률" (success probability), and a regressor's raw output is not limited to that range.
xgb_final = np.clip(xgb_pred, a_min=0, a_max=1)

In [34]:
# Display the clipped predictions as a quick sanity check.
xgb_final

array([0.5237289 , 0.42567077, 0.4199444 , ..., 0.5458875 , 0.4982186 ,
       0.5787137 ], shape=(1755,), dtype=float32)

In [35]:
# Put the clipped predictions into the submission template's target column and save the file.
# The values are assigned positionally, so this relies on the template rows being in the same
# order as the test rows — NOTE(review): confirm the row/ID alignment.
submission["성공확률"] = xgb_final
submission.to_csv(path_or_buf="submission_xgb_final2.csv", index=False)