In [29]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor
import matplotlib.pyplot as plt

In [30]:
# ------------------- 데이터 로드 ------------------- #
train = pd.read_csv("data-files/train.csv")
test = pd.read_csv("data-files/test.csv")
submission = pd.read_csv("data-files/sample_submission.csv")

y_train = train["성공확률"]
X_train = train.drop(columns=["ID", "성공확률"])
X_test = test.drop(columns=["ID"])

In [31]:
# ------------------- 기업나이 및 범위형 수치 변환 ------------------- #
X_train["기업나이"] = 2025 - X_train["설립연도"]
X_test["기업나이"] = 2025 - X_test["설립연도"]
X_train.drop(columns=["설립연도"], inplace=True)
X_test.drop(columns=["설립연도"], inplace=True)

def parse_value_range(x):
    if pd.isnull(x):
        return np.nan
    if '-' in str(x):
        try:
            parts = x.split('-')
            return (float(parts[0]) + float(parts[1])) / 2
        except:
            return np.nan
    try:
        return float(x)
    except:
        return np.nan

X_train["기업가치(백억원)"] = train["기업가치(백억원)"].apply(parse_value_range)
X_test["기업가치(백억원)"] = test["기업가치(백억원)"].apply(parse_value_range)

In [32]:
# ------------------- 결측값 처리 ------------------- #
for col in ['직원 수', '고객수(백만명)', '기업가치(백억원)']:
    X_train[col] = X_train[col].fillna(X_train[col].mean())
    X_test[col] = X_test[col].fillna(X_test[col].mean())

In [33]:
# ------------------- 범주형 인코딩 ------------------- #
cat_cols = X_train.select_dtypes(include="object").columns
for col in cat_cols:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col].astype(str))
    X_test[col] = le.transform(X_test[col].astype(str))

In [34]:
# ------------------- 파생 변수 생성 ------------------- #
X_train['매출_투자비'] = X_train['연매출(억원)'] / (X_train['총 투자금(억원)'] + 1)
X_test['매출_투자비'] = X_test['연매출(억원)'] / (X_test['총 투자금(억원)'] + 1)

X_train['고객당매출'] = X_train['연매출(억원)'] / (X_train['고객수(백만명)'] + 1)
X_test['고객당매출'] = X_test['연매출(억원)'] / (X_test['고객수(백만명)'] + 1)

X_train['SNS_비율'] = X_train['SNS 팔로워 수(백만명)'] / (X_train['고객수(백만명)'] + 1)
X_test['SNS_비율'] = X_test['SNS 팔로워 수(백만명)'] / (X_test['고객수(백만명)'] + 1)

X_train["직원당_매출"] = X_train["연매출(억원)"] / (X_train["직원 수"] + 1)
X_test["직원당_매출"] = X_test["연매출(억원)"] / (X_test["직원 수"] + 1)

X_train["투자_가치비"] = X_train["총 투자금(억원)"] / (X_train["기업가치(백억원)"] + 1)
X_test["투자_가치비"] = X_test["총 투자금(억원)"] / (X_test["기업가치(백억원)"] + 1)

In [35]:
# align
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

# ------------------- 1차 학습으로 피처 중요도 확인 ------------------- #
xgb_temp = XGBRegressor(n_estimators=300, random_state=42, verbosity=0)
xgb_temp.fit(X_train, y_train)

imp_df = pd.DataFrame({
    "feature": X_train.columns,
    "importance": xgb_temp.feature_importances_
}).sort_values(by="importance", ascending=False)

top_features = imp_df[imp_df["importance"] > 0]["feature"].tolist()
X_train_sel = X_train[top_features]
X_test_sel = X_test[top_features]

In [36]:
# ------------------- 최종 모델 학습 ------------------- #
# xgb = XGBRegressor(
#     n_estimators=3000,
#     learning_rate=0.008,
#     max_depth=7,
#     min_child_weight=4,
#     subsample=0.85,
#     colsample_bytree=0.85,
#     gamma=1,
#     reg_alpha=1,
#     reg_lambda=1,
#     random_state=42,
#     n_jobs=-1,
#     verbosity=1
# )

# xgb.fit(X_train_sel, y_train)
# xgb_pred = xgb.predict(X_test_sel)

# 모델 학습
xgb = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=7,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=42,
    n_jobs=-1,
    verbosity=1
)

xgb.fit(X_train, y_train)
xgb_pred = xgb.predict(X_test)
xgb_pred = np.clip(xgb_pred, 0, 1)

In [38]:
xgb_pred

array([0.5273898 , 0.50691444, 0.43532315, ..., 0.5493971 , 0.49042583,
       0.57573366], shape=(1755,), dtype=float32)

In [37]:
# ------------------- 제출 ------------------- #
submission["성공확률"] = xgb_pred
submission.to_csv("submission_xgb_top_final.csv", index=False)