In [32]:
# 라이브러리 import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor

In [43]:
# Read the training set, the test set and the sample submission, in that order.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data-files/train.csv",
        "data-files/test.csv",
        "data-files/sample_submission.csv",
    )
)

In [44]:
# Columns that must not be used as features: the row identifier and the target.
except_cols = ["ID", "성공확률"]

# Target vector first, then the feature matrices without the excluded columns.
y_train = train["성공확률"]
X_train = train.drop(except_cols, axis="columns")
X_test = test.drop(["ID"], axis="columns")

In [45]:
# Replace the founding-year column with the company's age relative to 2025.
for frame in (X_train, X_test):
    # pop() removes the founding-year column in place and returns its values,
    # so the new age column is appended and the old column is gone in one step.
    frame['기업나이'] = 2025 - frame.pop('설립연도')

In [46]:
# Convert a company-valuation entry (single number or "low-high" range) to one number.
def parse_value_range(x):
    """Convert a valuation entry to a float.

    Parameters
    ----------
    x : object
        A missing value, a number, a numeric string, or a string of the form
        "low-high".

    Returns
    -------
    float
        ``float(x)`` when ``x`` parses as a plain number (this includes
        negative numbers and scientific notation such as "1e-3"); the mean of
        the first two dash-separated parts when ``x`` is a range string;
        ``np.nan`` for missing values and for anything that cannot be parsed.
    """
    if pd.isnull(x):
        return np.nan

    # Try the plain-number interpretation first. Checking for '-' before this
    # would misroute negative numbers and exponents into the range branch.
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        pass

    # Only strings can describe a range; every other unparseable type is missing.
    if not isinstance(x, str) or '-' not in x:
        return np.nan

    parts = x.split('-')
    try:
        return (float(parts[0]) + float(parts[1])) / 2
    except (TypeError, ValueError, OverflowError):
        # Bounds that are not numeric (e.g. free text around the dash).
        return np.nan
    
# Turn the valuation column into numbers in both feature frames.
for frame in (X_train, X_test):
    frame['기업가치(백억원)'] = frame['기업가치(백억원)'].apply(parse_value_range)

In [47]:
# Missing-value handling.
# Numeric columns are filled with the median computed on the TRAINING data only,
# and that same value is reused for the test data, so both frames go through
# an identical transformation (no statistics are derived from the test set).
for col in ['직원 수', '고객수(백만명)', '연매출(억원)', '총 투자금(억원)', '기업가치(백억원)', 'SNS 팔로워 수(백만명)']:
    train_median = X_train[col].median()
    X_train[col] = X_train[col].fillna(train_median)
    X_test[col] = X_test[col].fillna(train_median)

# Missing sector labels become their own explicit category.
X_train['분야'] = X_train['분야'].fillna('Unknown')
X_test['분야'] = X_test['분야'].fillna('Unknown')


In [48]:
# Derived ratio features; +1 in each denominator avoids division by zero.
for frame in (X_train, X_test):
    # Total investment divided by (employee count + 1).
    frame['투자금_1인당'] = frame['총 투자금(억원)'].div(frame['직원 수'].add(1))
    # Annual revenue divided by (customer count + 1).
    frame['고객당_매출'] = frame['연매출(억원)'].div(frame['고객수(백만명)'].add(1))

In [49]:
# One-hot encode the categorical columns (pd.get_dummies), separately per frame.
cat_cols = ['국가', '투자단계', '인수여부', '상장여부', '분야']

X_train, X_test = (
    pd.get_dummies(frame, columns=cat_cols) for frame in (X_train, X_test)
)

# The two frames may end up with different dummy columns, so align the test
# frame to the training frame's columns; columns absent from test are filled with 0.
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

In [50]:
# Define and train a standalone LightGBM regressor.
# NOTE(review): the later visible cells fit their own estimators and do not
# read `model`; confirm whether this fit is still needed.
lgbm_params = {
    "n_estimators": 1000,
    "learning_rate": 0.05,
    "max_depth": 5,
    "num_leaves": 31,
    "random_state": 42,
}
model = LGBMRegressor(**lgbm_params)
model.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000310 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1705
[LightGBM] [Info] Number of data points in the train set: 4376, number of used features: 39
[LightGBM] [Info] Start training from score 0.537340


In [51]:
# Random forest: fit on the training features and predict the test set.
rf_model = RandomForestRegressor(n_estimators=300, random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)

# LightGBM: fit on the same features and predict the test set.
lgbm_model = LGBMRegressor(n_estimators=1000, learning_rate=0.05, max_depth=5, random_state=42)
lgbm_model.fit(X_train, y_train)
lgbm_pred = lgbm_model.predict(X_test)

# Ensemble: simple average of the two models' predictions.
# The target is a probability, but the regressors are unbounded, so the blend
# is clipped to the valid [0, 1] range before it is written out.
ensemble_pred = np.clip((rf_pred + lgbm_pred) / 2, 0.0, 1.0)

# Build the submission frame (ID + prediction rounded to 3 decimals) and save it.
submission = pd.DataFrame({
    "ID": test["ID"],
    "성공확률": np.round(ensemble_pred, 3)
})
submission.to_csv("submission_ensemble2.csv", index=False)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000325 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1705
[LightGBM] [Info] Number of data points in the train set: 4376, number of used features: 39
[LightGBM] [Info] Start training from score 0.537340


In [52]:
# Display the submission frame built in the previous cell as a sanity check.
submission

Unnamed: 0,ID,성공확률
0,TEST_0000,0.540
1,TEST_0001,0.508
2,TEST_0002,0.454
3,TEST_0003,0.538
4,TEST_0004,0.667
...,...,...
1750,TEST_1750,0.468
1751,TEST_1751,0.628
1752,TEST_1752,0.560
1753,TEST_1753,0.511
