In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from pytorch_tabnet.pretraining import TabNetPretrainer
from pytorch_tabnet.tab_model import TabNetRegressor
from sklearn.metrics import mean_absolute_error

In [61]:
train = pd.read_csv("data-files/train.csv")
test = pd.read_csv("data-files/test.csv")
submission = pd.read_csv("data-files/sample_submission.csv")

In [62]:
except_cols = ["ID", "성공확률"]
X_train = train.drop(columns=except_cols)
y_train = train["성공확률"]

cat_cols = X_train.select_dtypes(include="object").columns.tolist()
X_train = pd.get_dummies(X_train, columns=cat_cols)
X_test = test.drop(columns="ID")
X_test = pd.get_dummies(X_test, columns=cat_cols)

In [63]:
X_train, X_test = X_train.align(X_test, join="left", axis=1, fill_value=0)

In [None]:
train[]

In [64]:
# train_test_split 하기 전에
X_train = X_train.fillna(X_train.median())
X_test = X_test.fillna(X_test.median())

In [65]:
X_train = X_train.astype(float)
X_test = X_test.astype(float)

In [66]:
X_train.isnull().sum()
X_test.isnull().sum()

설립연도                   0
직원 수                   0
고객수(백만명)               0
총 투자금(억원)              0
연매출(억원)                0
SNS 팔로워 수(백만명)         0
국가_CT001               0
국가_CT002               0
국가_CT003               0
국가_CT004               0
국가_CT005               0
국가_CT006               0
국가_CT007               0
국가_CT008               0
국가_CT009               0
국가_CT010               0
분야_AI                  0
분야_게임                  0
분야_기술                  0
분야_물류                  0
분야_에너지                 0
분야_에듀테크                0
분야_이커머스                0
분야_푸드테크                0
분야_핀테크                 0
분야_헬스케어                0
투자단계_IPO               0
투자단계_Seed              0
투자단계_Series A          0
투자단계_Series B          0
투자단계_Series C          0
인수여부_No                0
인수여부_Yes               0
상장여부_No                0
상장여부_Yes               0
기업가치(백억원)_1500-2500    0
기업가치(백억원)_2500-3500    0
기업가치(백억원)_3500-4500    0
기업가치(백억원)_4500-6000    0
기업가치(백억원)_6000이상       0


In [67]:
model = TabNetRegressor(
    cat_idxs=[],        # get_dummies 했으니까 빈 리스트
    cat_dims=[],
    seed=42
)



In [68]:
# train 전체로 학습
model.fit(
    X_train=X_train.values,
    y_train=y_train.values.reshape(-1, 1),
    max_epochs=300,
    patience=30
)



epoch 0  | loss: 0.54889 |  0:00:00s
epoch 1  | loss: 0.1845  |  0:00:00s
epoch 2  | loss: 0.12609 |  0:00:01s
epoch 3  | loss: 0.10155 |  0:00:01s
epoch 4  | loss: 0.08987 |  0:00:01s
epoch 5  | loss: 0.07766 |  0:00:01s
epoch 6  | loss: 0.07138 |  0:00:02s
epoch 7  | loss: 0.06909 |  0:00:02s
epoch 8  | loss: 0.06863 |  0:00:02s
epoch 9  | loss: 0.06771 |  0:00:02s
epoch 10 | loss: 0.06393 |  0:00:03s
epoch 11 | loss: 0.06273 |  0:00:03s
epoch 12 | loss: 0.06269 |  0:00:03s
epoch 13 | loss: 0.06226 |  0:00:04s
epoch 14 | loss: 0.0615  |  0:00:04s
epoch 15 | loss: 0.06049 |  0:00:04s
epoch 16 | loss: 0.06099 |  0:00:04s
epoch 17 | loss: 0.06033 |  0:00:05s
epoch 18 | loss: 0.06041 |  0:00:05s
epoch 19 | loss: 0.0603  |  0:00:05s
epoch 20 | loss: 0.06081 |  0:00:06s
epoch 21 | loss: 0.06062 |  0:00:06s
epoch 22 | loss: 0.06003 |  0:00:06s
epoch 23 | loss: 0.05959 |  0:00:06s
epoch 24 | loss: 0.05976 |  0:00:07s
epoch 25 | loss: 0.05934 |  0:00:07s
epoch 26 | loss: 0.05985 |  0:00:07s
e

In [69]:
# test 데이터 예측
y_pred_test = model.predict(X_test.values)
y_pred_test = np.round(y_pred_test.flatten(), 3)

In [70]:
submission['성공확률'] = y_pred_test

In [72]:
submission.to_csv('submission_TabNet.csv', index=False)

In [None]:
# -----------------------------------------------------------
# MAE 측정

In [75]:
X_train_sub, X_val, y_train_sub, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [76]:
model2 = TabNetRegressor(
    cat_idxs=[],
    cat_dims=[],
    seed=42
)



In [77]:
model2.fit(
    X_train=X_train_sub.values,
    y_train=y_train_sub.values.reshape(-1, 1),
    max_epochs=300,
    patience=30
)



epoch 0  | loss: 0.70856 |  0:00:00s
epoch 1  | loss: 0.25161 |  0:00:00s
epoch 2  | loss: 0.16742 |  0:00:00s
epoch 3  | loss: 0.13023 |  0:00:01s
epoch 4  | loss: 0.10438 |  0:00:01s
epoch 5  | loss: 0.0886  |  0:00:01s
epoch 6  | loss: 0.08006 |  0:00:01s
epoch 7  | loss: 0.07605 |  0:00:02s
epoch 8  | loss: 0.0691  |  0:00:02s
epoch 9  | loss: 0.06847 |  0:00:02s
epoch 10 | loss: 0.06747 |  0:00:02s
epoch 11 | loss: 0.06348 |  0:00:03s
epoch 12 | loss: 0.06278 |  0:00:03s
epoch 13 | loss: 0.0618  |  0:00:03s
epoch 14 | loss: 0.06272 |  0:00:03s
epoch 15 | loss: 0.06171 |  0:00:04s
epoch 16 | loss: 0.06226 |  0:00:04s
epoch 17 | loss: 0.06162 |  0:00:04s
epoch 18 | loss: 0.06016 |  0:00:04s
epoch 19 | loss: 0.06137 |  0:00:04s
epoch 20 | loss: 0.06065 |  0:00:05s
epoch 21 | loss: 0.06013 |  0:00:05s
epoch 22 | loss: 0.05992 |  0:00:05s
epoch 23 | loss: 0.05985 |  0:00:05s
epoch 24 | loss: 0.05954 |  0:00:06s
epoch 25 | loss: 0.0593  |  0:00:06s
epoch 26 | loss: 0.0593  |  0:00:06s
e

In [78]:
y_val_pred = model.predict(X_val.values)
y_val_pred = y_val_pred.flatten()

In [79]:
from sklearn.metrics import mean_absolute_error

mae = mean_absolute_error(y_val.values, y_val_pred)
print(f"✅ TabNet 검증 MAE: {mae:.4f}")

✅ TabNet 검증 MAE: 0.1625
