In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from lightgbm import LGBMRegressor

In [2]:
# 1. 데이터 불러오기
train = pd.read_csv("data-files/train.csv")
test = pd.read_csv("data-files/test.csv")
submission = pd.read_csv("data-files/sample_submission.csv")

In [3]:
# 2. target과 feature 분리
y_train = train["성공확률"]
X_train = train.drop(columns=["ID", "성공확률"])
X_test = test.drop(columns=["ID"])

In [4]:
# 3. '설립연도' → '기업나이'로 변환
X_train['기업나이'] = 2025 - X_train['설립연도']
X_test['기업나이'] = 2025 - X_test['설립연도']

In [5]:
# 사용 후 기존 '설립연도' 제거
X_train.drop(columns=['설립연도'], inplace=True)
X_test.drop(columns=['설립연도'], inplace=True)

In [6]:
# 4. Label Encoding for 범주형 변수
cat_cols = X_train.select_dtypes(include='object').columns

for col in cat_cols:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col].astype(str))
    X_test[col] = le.transform(X_test[col].astype(str))

In [7]:
# 5. 결측치 처리 (평균 대체)
for col in ['직원 수', '고객수(백만명)']:
    X_train[col] = X_train[col].fillna(X_train[col].mean())
    X_test[col] = X_test[col].fillna(X_test[col].mean())

In [8]:
# 6. 파생 변수 생성
X_train['매출_투자비'] = X_train['연매출(억원)'] / (X_train['총 투자금(억원)'] + 1)
X_test['매출_투자비'] = X_test['연매출(억원)'] / (X_test['총 투자금(억원)'] + 1)

In [9]:
# 7. 모델 학습 (LightGBM)
model = LGBMRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=10,
    random_state=42,
    n_jobs=-1
)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000419 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1422
[LightGBM] [Info] Number of data points in the train set: 4376, number of used features: 13
[LightGBM] [Info] Start training from score 0.537340


In [12]:
# 8. 제출 파일 생성
submission = pd.DataFrame({
    'ID': test['ID'],
    '성공확률': y_pred
})

In [13]:
submission

Unnamed: 0,ID,성공확률
0,TEST_0000,0.517332
1,TEST_0001,0.460961
2,TEST_0002,0.449949
3,TEST_0003,0.491189
4,TEST_0004,0.546809
...,...,...
1750,TEST_1750,0.476337
1751,TEST_1751,0.550476
1752,TEST_1752,0.543421
1753,TEST_1753,0.445502


In [14]:
submission.to_csv("submission_lgbm3.csv", index=False)