In [None]:
import pandas as pd

In [24]:
# Load the competition tables: labelled training rows, unlabelled test rows,
# and the sample submission template.
train, test, submission = map(
    pd.read_csv,
    (
        "data-files/train.csv",
        "data-files/test.csv",
        "data-files/sample_submission.csv",
    ),
)

In [None]:
# Split the training table into features and target. "ID" is dropped from the
# features and "성공확률" is the regression target.
except_cols = ["ID", "성공확률"]
X_train = train.drop(columns=except_cols)
y_train = train["성공확률"]

# One-hot encode every object-dtype column found in the training features,
# applying the same column list to the test features.
cat_cols = X_train.select_dtypes(include="object").columns.tolist()
X_train = pd.get_dummies(X_train, columns=cat_cols)
X_test = test.drop(columns="ID")
X_test = pd.get_dummies(X_test, columns=cat_cols)

# get_dummies builds its indicator columns from the category levels present in
# each frame, so encoding train and test separately can yield different column
# sets or orders. Align the test matrix to the training layout: levels seen only
# in test are dropped, levels missing from test become all-zero indicators.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

In [None]:
# Fill missing values in the two numeric columns with the median.
# The median is learned from the training features only and then reused for the
# test features, so both sets go through the same preprocessing (imputing test
# with its own median would apply a different transformation than the one the
# model was trained on).
for col in ['직원 수', '고객수(백만명)']:
    train_median = X_train[col].median()
    X_train[col] = X_train[col].fillna(train_median)
    X_test[col] = X_test[col].fillna(train_median)

# TODO: try a KNN imputer (fill each gap with the mean of the nearest neighbours).
# TODO: try swapping in a different model as well.

In [27]:
from sklearn.ensemble import RandomForestRegressor

In [28]:
# Baseline: a 100-tree random forest with a fixed seed, trained on all training
# rows and used to predict the test rows for the submission.
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_test)

In [31]:
# Build the submission table: the ID column from the test data next to the
# predictions, with the predictions rounded to three decimals.
submission = pd.DataFrame(
    {'ID': test['ID'], '성공확률': y_pred}
).round({'성공확률': 3})

# Write without the index so the file holds only the two submission columns.
submission.to_csv('submission.csv', index=False)

In [None]:
# -----------------------------------------------------------
# Measuring MAE on a hold-out validation split

In [34]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [35]:
# Hold out 20% of the training rows for validation; the fixed seed keeps the
# split reproducible across runs.
X_train_sub, X_val, y_train_sub, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Refit the baseline `model` on the training portion of the hold-out split only,
# so that X_val is unseen when the validation metrics are computed.
# NOTE(review): this refits the same estimator object that was earlier fitted on
# the full X_train for the submission; after this cell `model` presumably no
# longer reflects the full-data fit — confirm before reusing it for test predictions.
model.fit(X_train_sub, y_train_sub)

In [37]:
# Predict the held-out validation rows with the baseline `model`.
# NOTE(review): assumes `model` was refit on X_train_sub just before this cell;
# if it still holds the fit on the full X_train, X_val was seen during training
# and the scores below would be optimistic — confirm the execution order.
y_val_pred = model.predict(X_val)

In [38]:
# Root-mean-squared error of the baseline predictions on the hold-out rows.
mse = mean_squared_error(y_true=y_val, y_pred=y_val_pred)
rmse = pow(mse, 0.5)

print(f"검증 RMSE: {rmse:.4f}")

검증 RMSE: 0.0906


In [39]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error of the baseline, computed from the validation
# predictions y_val_pred.
mae = mean_absolute_error(y_true=y_val, y_pred=y_val_pred)

print(f"검증 MAE: {mae:.4f}")

검증 MAE: 0.0747


In [None]:
from sklearn.ensemble import RandomForestRegressor

# RandomForest tuning: a larger, explicitly regularised forest evaluated on the
# same hold-out split as the baseline.
rf_params = dict(
    n_estimators=500,     # many more trees than the 100-tree baseline
    max_depth=12,         # caps tree depth (the baseline sets no max_depth)
    min_samples_split=4,  # a node needs at least 4 samples to be split
    min_samples_leaf=2,   # every leaf keeps at least 2 samples
    random_state=42,
    n_jobs=-1,
)

rf_model = RandomForestRegressor(**rf_params).fit(X_train_sub, y_train_sub)
y_val_pred_rf = rf_model.predict(X_val)

from sklearn.metrics import mean_absolute_error
mae_rf = mean_absolute_error(y_true=y_val, y_pred=y_val_pred_rf)
print(f"✅ 튜닝한 RandomForest 검증 MAE: {mae_rf:.4f}")

✅ 튜닝한 RandomForest 검증 MAE: 0.2033
