In [478]:
from google.colab import drive
!ls /content/drive/MyDrive/projects/ML_FinalProj
import sys
import os

drive.mount('/content/drive')

sys.path.append('/content/drive/MyDrive/projects/ML_FinalProj')

os.chdir("/content/drive/MyDrive/projects/ML_FinalProj")

dataset  MLTeamPredictionBasic.ipynb  xgb_top20_feature_importance.png	zips
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [479]:
from pathlib import Path
import pandas as pd
import numpy as np
import xgboost as xgb

from sklearn.ensemble import VotingClassifier, StackingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler


In [480]:
ROOT = Path("dataset")
TEAM_STAT_PATH = ROOT / "team-stat"
GAME_LOG_PATH = ROOT / "game-log"

TOTAL_TEAM_STAT_PATH = TEAM_STAT_PATH / "01-25_팀기록.csv"

In [481]:
team_stat_df = pd.read_csv(TOTAL_TEAM_STAT_PATH)
print(team_stat_df.head())
teams = team_stat_df["team"]
team_names = [t[2:] for t in teams]
unique_team_names = np.unique(team_names)


   team  G.x    E  Range RAA  종합 RAA.x  종합RAA/144  POSAdj  RAAwithPOS  \
0  25한화  144   86       0.27      8.18       0.48  -10.12       -2.23   
1  25키움  144  119     -35.45    -35.75      -2.17  -10.05      -45.79   
2  25삼성  144   87      19.58     26.38       1.55   -8.27       18.07   
3  25롯데  144  111     -11.02    -13.96      -0.78   -9.52      -23.57   
4  25두산  144  120      11.33      8.39       0.49   -6.77        1.62   

   WAAwoPOS  WAAwithPOS  ...  RA9.y  rRA9.y  FIP.y  kwERA  ERA-FIP    ERA-  \
0     0.817      -0.241  ...   3.86    3.86   3.57   3.45    -0.02   81.93   
1    -3.573      -4.374  ...   6.10    6.10   4.93   4.60     0.46  133.16   
2     2.637       1.761  ...   4.59    4.59   4.31   4.10    -0.19   86.05   
3    -1.395      -2.401  ...   5.27    5.27   4.33   4.12     0.42  105.09   
4     0.839       0.227  ...   4.82    4.82   4.39   4.30    -0.09  102.09   

    rRA9-    FIP-  AVG.y  OBP.y  
0   80.21   82.58  0.244  0.318  
1  145.45  123.71  0.287

In [482]:
x = team_stat_df["HR.x"].tolist()
y = team_stat_df["HR"].tolist()
diff_count = sum([x[i]!=y[i] for i in range(len(x))])
print(diff_count)

214


In [483]:
team_stat_df_edited = team_stat_df.drop(columns=["wRC+.y", "G.x", "G.y", "G.x.x", "G.y.y", "HR.y", "R/ePA.y",
                                                 "IsoP.y", "ERA.y", "RA9.y", "FIP.y", "rRA9.y"]) # 중복제거
team_stat_df_edited = team_stat_df_edited.drop(columns=["POSAdj", "RAAwithPOS", "WAAwithPOS", "HR/FB", "WAR당 연봉.y"]) # 결측치 존재하는 column 제거
# 일부 column 이름 정리
team_stat_df_edited = team_stat_df_edited.rename(columns={"wRC+.x": "wRC+"})
team_stat_df_edited = team_stat_df_edited.rename(columns={"R/ePA.x": "R/ePA"})
team_stat_df_edited = team_stat_df_edited.rename(columns={"IsoP.x": "IsoP"})
team_stat_df_edited = team_stat_df_edited.rename(columns={"ERA.x": "ERA"})
team_stat_df_edited = team_stat_df_edited.rename(columns={"RA9.x": "RA9"})
team_stat_df_edited = team_stat_df_edited.rename(columns={"FIP.x": "FIP"})
team_stat_df_edited = team_stat_df_edited.rename(columns={"rRA9.x": "rRA9"})
team_stat_df_edited = team_stat_df_edited.rename(columns={"WAR당 연봉.x": "WAR당 연봉"})

# 팀 이름 / 연도 분리
team_stat_df_edited.insert(loc=1, column="year", value=team_stat_df_edited["team"].str[:2])
team_stat_df_edited.insert(loc=2, column="name", value=team_stat_df_edited["team"].str[2:])

# 팀 이름 통일한 id 할당
# game-log에서도 동일 mapping 적용
team_groups = {
    0: ['KIA', '해태'],
    1: ['삼성'],
    2: ['두산'],
    3: ['LG'],
    4: ['KT'],
    5: ['SSG', 'SK'],
    6: ['롯데'],
    7: ['한화'],
    8: ['NC'],
    9: ['키움', '넥센', '우리', '히어로즈'],
    10: ['현대']
}
name_to_id = {name: group_id for group_id, names in team_groups.items() for name in names} # 팀 이름 -> id 변환 mapping
team_stat_df_edited.insert(loc=0, column='id', value=team_stat_df_edited['name'].map(name_to_id))
team_stat_df_edited = team_stat_df_edited.drop(columns=["team"])


In [484]:
# id, year별로 year-1년까지의 모든 지표 가중합하여 전처리
def build_decayed_team_features(df, decay, exclude_cols = ("id", "year", "name", "G")):
  suffix = "_decay"

  df_out = df.copy()
  df_out["year"] = pd.to_numeric(df_out["year"], errors='raise')
  df_out = df_out.sort_values(["id", "year"], kind="mergesort")
  cand_cols = [c for c in df_out.columns if c not in exclude_cols]

  # 연도 고려하여 각 지표의 decay 누적 계산
  # input: pd.DataFrame     output: pd.Series
  def _decayed_cumsum_with_gaps(sub, col):
    years = sub["year"].to_numpy()
    vals = pd.to_numeric(sub[col], errors="coerce").fillna(0).to_numpy(dtype=float)
    out = np.zeros_like(vals)
    s = 0.0
    prev_year = None
    for i in range(len(vals)):
      if prev_year is None:
        gap_decay = 1.0
      else:
        gap = int(years[i] - prev_year)
        gap_decay = decay ** max(1, gap)
      prev_val = vals[i-1] if i>0 else 0.0
      s = s * gap_decay + prev_val
      out[i] = s
      prev_year = years[i]
    return pd.Series(out, index=sub.index)

  new_cols_list = []

  for col in cand_cols:
    new_col = f"{col}{suffix}"

    new_series = (
        df_out.groupby("id", group_keys=False).apply(
            lambda x: _decayed_cumsum_with_gaps(x, col),
            include_groups=False
        )
    )
    new_series.name = new_col
    new_cols_list.append(new_series)
  df_out = pd.concat([df_out] + new_cols_list, axis=1)
  return df_out

In [485]:
# game-log csv 불러오기 및 정렬
game_log_csv_names = ["games_20" + str(i).zfill(2) + ".csv" for i in range(2, 26)]
game_log_df = pd.DataFrame()

for i in range(2, 26):
  GAME_LOG_PATH_PER_YEAR = GAME_LOG_PATH / game_log_csv_names[i-2]
  game_log_per_year_df = pd.read_csv(GAME_LOG_PATH_PER_YEAR)
  game_log_df = pd.concat([game_log_df, game_log_per_year_df])

# game-log에서 팀 이름을 id로 변환
game_log_df = game_log_df.assign(
    home_id = game_log_df["home_team"].map(name_to_id),
    away_id = game_log_df["away_team"].map(name_to_id),
)

### 해당 시즌 내 지표(~승률, 최근10경기) 계산
game_log_base = game_log_df[['date', 'season', 'home_id', 'away_id', 'home_runs', 'away_runs']].dropna(subset=['home_id', 'away_id'])
game_log_base['home_id'] = game_log_base['home_id'].astype(int)
game_log_base['away_id'] = game_log_base['away_id'].astype(int)

home_games = game_log_base.rename(columns={'home_id': 'team_id', 'away_id': 'opp_id'})
home_games['win'] = (home_games['home_runs'] > home_games['away_runs']).astype(int)
away_games = game_log_base.rename(columns={'away_id': 'team_id', 'home_id': 'opp_id'})
away_games['win'] = (away_games['away_runs'] > away_games['home_runs']).astype(int)

all_games_df = pd.concat([
    home_games[['date', 'season', 'team_id', 'win']],
    away_games[['date', 'season', 'team_id', 'win']]
])
all_games_df = all_games_df.sort_values(by=['team_id', 'date'])

# 해당 시즌 누적 승률(직전 경기까지 반영)
gb_season = all_games_df.groupby(['team_id', 'season'])
wins_season_total = gb_season['win'].cumsum()
games_season_total = gb_season.cumcount() + 1
prev_wins = wins_season_total.shift(1).fillna(0)
prev_games = games_season_total.shift(1).fillna(0)
all_games_df['in_season_win_rate'] = (prev_wins / prev_games).fillna(0)

# 최근 10경기 승률
gb_team = all_games_df.groupby('team_id')
all_games_df['L10_win_rate'] = gb_team['win'].shift(1).rolling(
    window=10, min_periods=1
).mean().reset_index(level=0, drop=True).fillna(0)

# 병합
stats_to_merge = all_games_df.drop_duplicates(
    subset=['date', 'season', 'team_id'], keep='last'
)
stats_to_merge = stats_to_merge[['date', 'season', 'team_id', 'in_season_win_rate', 'L10_win_rate']]

# 지표별 decay 적용
team_stat_df_edited = build_decayed_team_features(
    team_stat_df_edited,
    decay=0.9
)
feature_cols = ['id', 'year'] + [col for col in team_stat_df_edited.columns if col.endswith("_decay")]
team_features_df = team_stat_df_edited[feature_cols].copy()

home_features = team_features_df.rename(columns={
    col: f"home_{col}" for col in team_features_df.columns if col not in ['id', 'year']
})
away_features = team_features_df.rename(columns={
    col: f"away_{col}" for col in team_features_df.columns if col not in ['id', 'year']
})

# game-log 전처리
game_log_df_cleaned = game_log_df.dropna(subset=["home_id", "away_id"]).copy()
game_log_df_cleaned['year'] = (game_log_df_cleaned['season'] % 100).astype(int)
game_log_df_cleaned['home_id'] = game_log_df_cleaned['home_id'].astype(int)
game_log_df_cleaned['away_id'] = game_log_df_cleaned['away_id'].astype(int)
game_log_df_cleaned['home_win'] = (game_log_df_cleaned['home_runs'] > game_log_df_cleaned['away_runs']).astype(int)
game_data_df = game_log_df_cleaned[['date', 'season', 'year', 'home_id', 'away_id', 'home_runs', 'away_runs', 'home_win']]

# game-log와 연도별 지표 결합
merged_df = pd.merge(
    game_data_df, home_features,
    left_on=['home_id', 'year'], right_on=['id', 'year'], how='left'
)
final_df = pd.merge(
    merged_df, away_features,
    left_on=['away_id', 'year'], right_on=['id', 'year'], how='left'
)
final_df = final_df.drop(columns=['id_x', 'id_y'])

# 해당 연도 승률 지표 추가
final_df_with_momentum = pd.merge(
    final_df,
    stats_to_merge.rename(columns={
        'team_id': 'home_id',
        'in_season_win_rate': 'home_in_season_win_rate',
        'L10_win_rate': 'home_L10_win_rate'
    }),
    on=['date', 'season', 'home_id'], how='left'
)
final_df_with_momentum = pd.merge(
    final_df_with_momentum,
    stats_to_merge.rename(columns={
        'team_id': 'away_id',
        'in_season_win_rate': 'away_in_season_win_rate',
        'L10_win_rate': 'away_L10_win_rate'
    }),
    on=['date', 'season', 'away_id'], how='left'
)

# 결측치 처리
new_momentum_cols = ['home_in_season_win_rate', 'home_L10_win_rate', 'away_in_season_win_rate', 'away_L10_win_rate']
final_df_with_momentum[new_momentum_cols] = final_df_with_momentum[new_momentum_cols].fillna(0)

# 두 팀의 차이 지표
final_df_with_diff = final_df_with_momentum.copy()

# diff에 decay 적용
decay_cols = [col for col in team_features_df.columns if col.endswith('_decay')]
base_cols = [c for c in decay_cols if c not in ['id', 'year']]
home_mat = final_df_with_diff[[f'home_{c}' for c in base_cols]].copy()
home_mat.columns = base_cols
away_mat = final_df_with_diff[[f'away_{c}' for c in base_cols]].copy()
away_mat.columns = base_cols
diff_arr = home_mat.values - away_mat.values
diff_cols = [f'diff_{c}' for c in base_cols]
diff_df = pd.DataFrame(diff_arr, columns=diff_cols, index=final_df_with_diff.index)
final_df_with_diff = pd.concat([final_df_with_diff, diff_df], axis=1, copy=False)

final_df_with_diff['diff_in_season_win_rate'] = final_df_with_diff['home_in_season_win_rate'] - final_df_with_diff['away_in_season_win_rate']
final_df_with_diff['diff_L10_win_rate'] = final_df_with_diff['home_L10_win_rate'] - final_df_with_diff['away_L10_win_rate']

final_df_with_diff = final_df_with_diff.copy()
final_df_filtered = final_df_with_diff[final_df_with_diff['season'] > 2002].copy()
final_df_complete = final_df_filtered.dropna() # 최종 df


In [486]:
# print(final_df_cleaned.columns.tolist())

In [487]:
# Weighted Soft Voting ensemble
# 타격/투구/수비/종합 + diff 8개모델

final_df_with_diff = final_df_complete.copy()
home_cols = [col for col in final_df_with_diff.columns if col.startswith('home_') and col.endswith('_decay')]
base_cols = [col.replace('home_', '') for col in home_cols]
for base_col in base_cols:
    home_col_name = f'home_{base_col}'
    away_col_name = f'away_{base_col}'
    if home_col_name in final_df_with_diff.columns and away_col_name in final_df_with_diff.columns:
        final_df_with_diff[f'diff_{base_col}'] = final_df_with_diff[home_col_name] - final_df_with_diff[away_col_name]

y_new = final_df_with_diff['home_win']
drop_cols = ['date', 'season', 'home_id', 'away_id', 'home_runs', 'away_runs', 'home_win']
X_new = final_df_with_diff.drop(columns=drop_cols)

RANDOM_SEED = 42
X_train_val_new, X_test_new, y_train_val_new, y_test_new = train_test_split(
    X_new, y_new, test_size=0.1, random_state=RANDOM_SEED, shuffle=True
)
val_relative_size = 0.1 / 0.9
X_train_new, X_val_new, y_train_new, y_val_new = train_test_split(
    X_train_val_new, y_train_val_new, test_size=val_relative_size, random_state=RANDOM_SEED, shuffle=True
)


offense_base = ['WAR.x_decay', 'oWAR_decay', 'wRC+_decay', 'wOBA_decay', 'OPS_decay', 'wRC_decay', 'AVG.x_decay', 'OBP.x_decay', 'SLG_decay', 'IsoP_decay', 'BB%.x_decay', 'HR.x_decay', 'RBI_decay', 'RC27_decay'] # 타격 WAR
pitching_base = ['WAR.y_decay', '선발WAR_decay', '구원WAR_decay', 'FIP_decay', 'kwERA_decay', 'ERA_decay', 'WHIP_decay', 'K/9_decay', 'BB/9_decay', 'K%.y_decay', 'BB%.y_decay', 'FIP-_decay', 'ERA-_decay', 'LOB_decay', 'HR/9_decay', 'IP_decay', 'GS_decay', 'GR_decay'] # 투수 WAR
defense_base = ['dWAR_decay', '필딩 RAA_decay', 'Range RAA_decay', '수비 RAA_decay', '포지션 RAA_decay', 'E_decay']
others_base = ['Rep.Wins_decay', 'Pos.Wins_decay', 'RAR_decay']

offense_features = [f'home_{c}' for c in offense_base] + [f'away_{c}' for c in offense_base]
pitching_features = [f'home_{c}' for c in pitching_base] + [f'away_{c}' for c in pitching_base]
defense_features = [f'home_{c}' for c in defense_base] + [f'away_{c}' for c in defense_base]
others_features = [f'home_{c}' for c in others_base] + [f'away_{c}' for c in others_base]

diff_offense_features = [f'diff_{c}' for c in offense_base]
diff_pitching_features = [f'diff_{c}' for c in pitching_base]
diff_defense_features = [f'diff_{c}' for c in defense_base]
diff_others_features = [f'diff_{c}' for c in others_base]

print(f"Model 1 (Home/Away Offense) features: {len(offense_features)}")
print(f"Model 2 (Home/Away Pitching) features: {len(pitching_features)}")
print(f"Model 3 (Diff Offense) features: {len(diff_offense_features)}")
print(f"Model 4 (Diff Pitching) features: {len(diff_pitching_features)}")

common_dt_params = {
    'max_depth': 4,
    'min_samples_leaf': 20,
    'random_state': RANDOM_SEED
}
pipe_offense = Pipeline([('selector', ColumnTransformer([('subset', 'passthrough', offense_features)], remainder='drop')),('model', DecisionTreeClassifier(**common_dt_params))])
pipe_pitching = Pipeline([('selector', ColumnTransformer([('subset', 'passthrough', pitching_features)], remainder='drop')),('model', DecisionTreeClassifier(**common_dt_params))])
pipe_defense = Pipeline([('selector', ColumnTransformer([('subset', 'passthrough', defense_features)], remainder='drop')), ('model', DecisionTreeClassifier(**common_dt_params))])
pipe_others = Pipeline([('selector', ColumnTransformer([('subset', 'passthrough', others_features)], remainder='drop')), ('model', DecisionTreeClassifier(**common_dt_params))])

pipe_diff_offense = Pipeline([('selector', ColumnTransformer([('subset', 'passthrough', diff_offense_features)], remainder='drop')), ('model', DecisionTreeClassifier(**common_dt_params))])
pipe_diff_pitching = Pipeline([('selector', ColumnTransformer([ ('subset', 'passthrough', diff_pitching_features)], remainder='drop')), ('model', DecisionTreeClassifier(**common_dt_params))])
pipe_diff_defense = Pipeline([('selector', ColumnTransformer([ ('subset', 'passthrough', diff_defense_features)], remainder='drop')), ('model', DecisionTreeClassifier(**common_dt_params))])
pipe_diff_others = Pipeline([('selector', ColumnTransformer([ ('subset', 'passthrough', diff_others_features)], remainder='drop')), ('model', DecisionTreeClassifier(**common_dt_params))])


voting_model_v2 = VotingClassifier(
    estimators=[
        ('offense_tree', pipe_offense),
        ('pitching_tree', pipe_pitching),
        ('defense_tree', pipe_defense),
        ('others_tree', pipe_others),
        ('diff_offense_tree', pipe_diff_offense),
        ('diff_pitching_tree', pipe_diff_pitching),
        ('diff_defense_tree', pipe_diff_defense),
        ('diff_others_tree', pipe_diff_others)
    ],
    voting='soft',
    weights=[0.1, 0.1, 0.1, 0.05, 0.2, 0.2, 0.2, 0.05]
)

voting_model_v2.fit(X_train_new, y_train_new)

preds_vote_v2 = voting_model_v2.predict(X_test_new)
pred_probs_vote_v2 = voting_model_v2.predict_proba(X_test_new)[:, 1]

accuracy_vote_v2 = accuracy_score(y_test_new, preds_vote_v2)
roc_auc_vote_v2 = roc_auc_score(y_test_new, pred_probs_vote_v2)
logloss_vote_v2 = log_loss(y_test_new, pred_probs_vote_v2)

print(f"\n[Test Set 평가 결과]")
print(f"Accuracy (정확도): {accuracy_vote_v2:.4f}")
print(f"ROC AUC: {roc_auc_vote_v2:.4f}")
print(f"Log Loss: {logloss_vote_v2:.4f}")

Model 1 (Home/Away Offense) features: 28
Model 2 (Home/Away Pitching) features: 36
Model 3 (Diff Offense) features: 14
Model 4 (Diff Pitching) features: 18

[Test Set 평가 결과]
Accuracy (정확도): 0.5476
ROC AUC: 0.5528
Log Loss: 0.6921


In [488]:
# stacking ensemble
# 타격/투구/수비/종합 + diff 8개모델

final_df_with_diff = final_df_complete.copy()
home_cols = [col for col in final_df_with_diff.columns if col.startswith('home_') and col.endswith('_decay')]
base_cols = [col.replace('home_', '') for col in home_cols]

diff_cols_list = []
for base_col in base_cols:
    name = f'diff_{base_col}'
    home_col_name = f'home_{base_col}'
    away_col_name = f'away_{base_col}'
    if name in final_df_with_diff.columns:
        continue
    if home_col_name in final_df_with_diff.columns and away_col_name in final_df_with_diff.columns:
        diff_col = final_df_with_diff[home_col_name] - final_df_with_diff[away_col_name]
        diff_col.name = name
        diff_cols_list.append(diff_col)

if diff_cols_list:
    final_df_with_diff = pd.concat([final_df_with_diff] + diff_cols_list, axis=1)

y_new = final_df_with_diff['home_win']
drop_cols = ['date', 'season', 'home_id', 'away_id', 'home_runs', 'away_runs', 'home_win']
X_new = final_df_with_diff.drop(columns=drop_cols)
X_new = X_new.loc[:, ~X_new.columns.duplicated(keep='first')]

RANDOM_SEED = 42
X_train_val_new, X_test_new, y_train_val_new, y_test_new = train_test_split(
    X_new, y_new, test_size=0.1, random_state=RANDOM_SEED, shuffle=True
)
val_relative_size = 0.1 / 0.9
X_train_new, X_val_new, y_train_new, y_val_new = train_test_split(
    X_train_val_new, y_train_val_new, test_size=val_relative_size, random_state=RANDOM_SEED, shuffle=True
)

# sub-model 별 feature
offense_base = ['WAR.x_decay', 'oWAR_decay', 'wRC+_decay', 'wOBA_decay', 'OPS_decay', 'wRC_decay', 'AVG.x_decay', 'OBP.x_decay', 'SLG_decay', 'IsoP_decay', 'BB%.x_decay', 'HR.x_decay', 'RBI_decay', 'RC27_decay'] # 타격 WAR
pitching_base = ['WAR.y_decay', '선발WAR_decay', '구원WAR_decay', 'FIP_decay', 'kwERA_decay', 'ERA_decay', 'WHIP_decay', 'K/9_decay', 'BB/9_decay', 'K%.y_decay', 'BB%.y_decay', 'FIP-_decay', 'ERA-_decay', 'LOB_decay', 'HR/9_decay', 'IP_decay', 'GS_decay', 'GR_decay'] # 투수 WAR
defense_base = ['dWAR_decay', '필딩 RAA_decay', 'Range RAA_decay', '수비 RAA_decay', '포지션 RAA_decay', 'E_decay']
others_base = ['Rep.Wins_decay', 'Pos.Wins_decay', 'RAR_decay']

offense_features = [f'home_{c}' for c in offense_base] + [f'away_{c}' for c in offense_base]
pitching_features = [f'home_{c}' for c in pitching_base] + [f'away_{c}' for c in pitching_base]
defense_features = [f'home_{c}' for c in defense_base] + [f'away_{c}' for c in defense_base]
others_features = [f'home_{c}' for c in others_base] + [f'away_{c}' for c in others_base]
diff_offense_features = [f'diff_{c}' for c in offense_base]
diff_pitching_features = [f'diff_{c}' for c in pitching_base]
diff_defense_features = [f'diff_{c}' for c in defense_base]
diff_others_features = [f'diff_{c}' for c in others_base]


# sub-model 정의
common_dt_params = {
    'max_depth': 4,
    'min_samples_leaf': 20,
    'random_state': RANDOM_SEED
}

pipe_offense = Pipeline([
    ('selector', ColumnTransformer([('subset', 'passthrough', offense_features)], remainder='drop')),
    ('model', DecisionTreeClassifier(**common_dt_params))
])
pipe_pitching = Pipeline([
    ('selector', ColumnTransformer([('subset', 'passthrough', pitching_features)], remainder='drop')),
    ('model', DecisionTreeClassifier(**common_dt_params))
])
pipe_defense = Pipeline([
    ('selector', ColumnTransformer([('subset', 'passthrough', defense_features)], remainder='drop')),
    ('model', DecisionTreeClassifier(**common_dt_params))
])
pipe_others = Pipeline([
    ('selector', ColumnTransformer([('subset', 'passthrough', others_features)], remainder='drop')),
    ('model', DecisionTreeClassifier(**common_dt_params))
])
pipe_diff_offense = Pipeline([
    ('selector', ColumnTransformer([('subset', 'passthrough', diff_offense_features)], remainder='drop')),
    ('model', DecisionTreeClassifier(**common_dt_params))
])
pipe_diff_pitching = Pipeline([
    ('selector', ColumnTransformer([('subset', 'passthrough', diff_pitching_features)], remainder='drop')),
    ('model', DecisionTreeClassifier(**common_dt_params))
])
pipe_diff_defense = Pipeline([
    ('selector', ColumnTransformer([('subset', 'passthrough', diff_defense_features)], remainder='drop')),
    ('model', DecisionTreeClassifier(**common_dt_params))
])
pipe_diff_others = Pipeline([
    ('selector', ColumnTransformer([('subset', 'passthrough', diff_others_features)], remainder='drop')),
    ('model', DecisionTreeClassifier(**common_dt_params))
])

# sub-model 목록
estimators_list = [
    ('offense_tree', pipe_offense),
    ('pitching_tree', pipe_pitching),
    ('defense_tree', pipe_defense),
    ('others_tree', pipe_others),
    ('diff_offense_tree', pipe_diff_offense),
    ('diff_pitching_tree', pipe_diff_pitching),
    ('diff_defense_tree', pipe_diff_defense),
    ('diff_others_tree', pipe_diff_others)
]

stacking_model = StackingClassifier(
    estimators=estimators_list,
    final_estimator=LogisticRegression(random_state=RANDOM_SEED),
    cv=5,
    passthrough=False,
    n_jobs=-1
)

# 학습
stacking_model.fit(X_train_new, y_train_new)

# 평가
preds_stack = stacking_model.predict(X_test_new)
pred_probs_stack = stacking_model.predict_proba(X_test_new)[:, 1]

accuracy_stack = accuracy_score(y_test_new, preds_stack)
roc_auc_stack = roc_auc_score(y_test_new, pred_probs_stack)
logloss_stack = log_loss(y_test_new, pred_probs_stack)

print(f"\n[Test Set 평가 결과]")
print(f"Accuracy (정확도): {accuracy_stack:.4f}")
print(f"ROC AUC: {roc_auc_stack:.4f}")
print(f"Log Loss: {logloss_stack:.4f}")



[Test Set 평가 결과]
Accuracy (정확도): 0.5504
ROC AUC: 0.5550
Log Loss: 0.6915


In [489]:
# 이 아래로 해당 년도 전적 학습 데이터로 포함

In [490]:
# baseline 모델
# 간단한 logistic regression 기반 모델
# 두 팀 간의 차이만을 학습

y_base = final_df_complete['home_win']

# diff 데이터만 사용
diff_cols = [col for col in final_df_complete.columns if col.startswith('diff_')]
X_base = final_df_complete[diff_cols]


# 데이터 분할
RANDOM_SEED = 42
X_train_val_base, X_test_base, y_train_val_base, y_test_base = train_test_split(
    X_base, y_base, test_size=0.1, random_state=RANDOM_SEED, shuffle=True
)
val_relative_size = 0.1 / 0.9
X_train_base, X_val_base, y_train_base, y_val_base = train_test_split(
    X_train_val_base, y_train_val_base, test_size=val_relative_size, random_state=RANDOM_SEED, shuffle=True
)

# 모델 정의(logistic regression 이용)
baseline_model = Pipeline([
    ('scaler', StandardScaler()),
    ('model', LogisticRegression(random_state=RANDOM_SEED, max_iter=1000))
])

# 모델 학습
baseline_model.fit(X_train_base, y_train_base)

# 모델 평가
preds_base = baseline_model.predict(X_test_base)
pred_probs_base = baseline_model.predict_proba(X_test_base)[:, 1]

accuracy_base = accuracy_score(y_test_base, preds_base)
roc_auc_base = roc_auc_score(y_test_base, pred_probs_base)
logloss_base = log_loss(y_test_base, pred_probs_base)

print(f"\n[Test Set 평가 결과]")
print(f"Accuracy (정확도): {accuracy_base:.4f}")
print(f"ROC AUC: {roc_auc_base:.4f}")
print(f"Log Loss: {logloss_base:.4f}")



[Test Set 평가 결과]
Accuracy (정확도): 0.5469
ROC AUC: 0.5611
Log Loss: 0.6897


In [491]:
# stacking ensemble
# 투구/타격/수비/이외/해당년도전적 + diff 10개모델

y_new = final_df_complete['home_win']
drop_cols = ['date', 'season', 'home_id', 'away_id', 'home_runs', 'away_runs', 'home_win']
X_new = final_df_complete.drop(columns=drop_cols)

RANDOM_SEED = 42
X_train_val_new, X_test_new, y_train_val_new, y_test_new = train_test_split(
    X_new, y_new, test_size=0.1, random_state=RANDOM_SEED, shuffle=True
)
val_relative_size = 0.1 / 0.9
X_train_new, X_val_new, y_train_new, y_val_new = train_test_split(
    X_train_val_new, y_train_val_new, test_size=val_relative_size, random_state=RANDOM_SEED, shuffle=True
)


# sub-model feature 목록 정의
offense_base = ['WAR.x_decay', 'oWAR_decay', 'wRC+_decay', 'wOBA_decay', 'OPS_decay', 'wRC_decay', 'AVG.x_decay', 'OBP.x_decay', 'SLG_decay', 'IsoP_decay', 'BB%.x_decay', 'HR.x_decay', 'RBI_decay', 'RC27_decay']
pitching_base = ['WAR.y_decay', '선발WAR_decay', '구원WAR_decay', 'FIP_decay', 'kwERA_decay', 'ERA_decay', 'WHIP_decay', 'K/9_decay', 'BB/9_decay', 'K%.y_decay', 'BB%.y_decay', 'FIP-_decay', 'ERA-_decay', 'LOB_decay', 'HR/9_decay', 'IP_decay', 'GS_decay', 'GR_decay']
defense_base = ['dWAR_decay', '필딩 RAA_decay', 'Range RAA_decay', '수비 RAA_decay', '포지션 RAA_decay', 'E_decay']
others_base = ['Rep.Wins_decay', 'Pos.Wins_decay', 'RAR_decay']
momentum_base = ['in_season_win_rate', 'L10_win_rate']

# Home/Away
offense_features = [f'home_{c}' for c in offense_base] + [f'away_{c}' for c in offense_base]
pitching_features = [f'home_{c}' for c in pitching_base] + [f'away_{c}' for c in pitching_base]
defense_features = [f'home_{c}' for c in defense_base] + [f'away_{c}' for c in defense_base]
others_features = [f'home_{c}' for c in others_base] + [f'away_{c}' for c in others_base]
momentum_features = [f'home_{c}' for c in momentum_base] + [f'away_{c}' for c in momentum_base]
# diff
diff_offense_features = [f'diff_{c}' for c in offense_base]
diff_pitching_features = [f'diff_{c}' for c in pitching_base]
diff_defense_features = [f'diff_{c}' for c in defense_base]
diff_others_features = [f'diff_{c}' for c in others_base]
diff_momentum_features = [f'diff_{c}' for c in momentum_base]

common_dt_params = {
    'max_depth': 4,
    'min_samples_leaf': 20,
    'random_state': RANDOM_SEED
}

# Home/Away
pipe_offense = Pipeline([('s', ColumnTransformer([('p', 'passthrough', offense_features)], remainder='drop')), ('m', DecisionTreeClassifier(**common_dt_params))])
pipe_pitching = Pipeline([('s', ColumnTransformer([('p', 'passthrough', pitching_features)], remainder='drop')), ('m', DecisionTreeClassifier(**common_dt_params))])
pipe_defense = Pipeline([('s', ColumnTransformer([('p', 'passthrough', defense_features)], remainder='drop')), ('m', DecisionTreeClassifier(**common_dt_params))])
pipe_others = Pipeline([('s', ColumnTransformer([('p', 'passthrough', others_features)], remainder='drop')), ('m', DecisionTreeClassifier(**common_dt_params))])
pipe_momentum = Pipeline([('s', ColumnTransformer([('p', 'passthrough', momentum_features)], remainder='drop')), ('m', DecisionTreeClassifier(**common_dt_params))])
# diff
pipe_diff_offense = Pipeline([('s', ColumnTransformer([('p', 'passthrough', diff_offense_features)], remainder='drop')), ('m', DecisionTreeClassifier(**common_dt_params))])
pipe_diff_pitching = Pipeline([('s', ColumnTransformer([('p', 'passthrough', diff_pitching_features)], remainder='drop')), ('m', DecisionTreeClassifier(**common_dt_params))])
pipe_diff_defense = Pipeline([('s', ColumnTransformer([('p', 'passthrough', diff_defense_features)], remainder='drop')), ('m', DecisionTreeClassifier(**common_dt_params))])
pipe_diff_others = Pipeline([('s', ColumnTransformer([('p', 'passthrough', diff_others_features)], remainder='drop')), ('m', DecisionTreeClassifier(**common_dt_params))])
pipe_diff_momentum = Pipeline([('s', ColumnTransformer([('p', 'passthrough', diff_momentum_features)], remainder='drop')), ('m', DecisionTreeClassifier(**common_dt_params))])


# 모델 정의
# stacking ensemble
# sub-model: 투구/타격/수비/이외/해당년도전적 + diff 10개모델
estimators_list = [
    ('offense_tree', pipe_offense),
    ('pitching_tree', pipe_pitching),
    ('defense_tree', pipe_defense),
    ('others_tree', pipe_others),
    ('momentum_tree', pipe_momentum),
    ('diff_offense_tree', pipe_diff_offense),
    ('diff_pitching_tree', pipe_diff_pitching),
    ('diff_defense_tree', pipe_diff_defense),
    ('diff_others_tree', pipe_diff_others),
    ('diff_momentum_tree', pipe_diff_momentum)
]

stacking_model_v3 = StackingClassifier(
    estimators=estimators_list,
    final_estimator=LogisticRegression(random_state=RANDOM_SEED),
    cv=5,
    n_jobs=-1
)

# 학습
stacking_model_v3.fit(X_train_new, y_train_new)

# 평가
preds_stack_v3 = stacking_model_v3.predict(X_test_new)
pred_probs_stack_v3 = stacking_model_v3.predict_proba(X_test_new)[:, 1]

accuracy_stack_v3 = accuracy_score(y_test_new, preds_stack_v3)
roc_auc_stack_v3 = roc_auc_score(y_test_new, pred_probs_stack_v3)
logloss_stack_v3 = log_loss(y_test_new, pred_probs_stack_v3)

print(f"\n[Test Set 평가 결과]")
print(f"Accuracy (정확도): {accuracy_stack_v3:.4f}")
print(f"ROC AUC: {roc_auc_stack_v3:.4f}")
print(f"Log Loss: {logloss_stack_v3:.4f}")


[Test Set 평가 결과]
Accuracy (정확도): 0.5315
ROC AUC: 0.5550
Log Loss: 0.6894


In [492]:
# weighted soft voting
# 투구/타격/수비/이외/해당년도전적 + diff 10개모델

y_new = final_df_complete['home_win']
drop_cols = ['date', 'season', 'home_id', 'away_id', 'home_runs', 'away_runs', 'home_win']
X_new = final_df_complete.drop(columns=drop_cols)

RANDOM_SEED = 42
X_train_val_new, X_test_new, y_train_val_new, y_test_new = train_test_split(
    X_new, y_new, test_size=0.1, random_state=RANDOM_SEED, shuffle=True
)
val_relative_size = 0.1 / 0.9
X_train_new, X_val_new, y_train_new, y_val_new = train_test_split(
    X_train_val_new, y_train_val_new, test_size=val_relative_size, random_state=RANDOM_SEED, shuffle=True
)


# sub-model 별 feature 목록
offense_base = ['WAR.x_decay', 'oWAR_decay', 'wRC+_decay', 'wOBA_decay', 'OPS_decay', 'wRC_decay', 'AVG.x_decay', 'OBP.x_decay', 'SLG_decay', 'IsoP_decay', 'BB%.x_decay', 'HR.x_decay', 'RBI_decay', 'RC27_decay']
pitching_base = ['WAR.y_decay', '선발WAR_decay', '구원WAR_decay', 'FIP_decay', 'kwERA_decay', 'ERA_decay', 'WHIP_decay', 'K/9_decay', 'BB/9_decay', 'K%.y_decay', 'BB%.y_decay', 'FIP-_decay', 'ERA-_decay', 'LOB_decay', 'HR/9_decay', 'IP_decay', 'GS_decay', 'GR_decay']
defense_base = ['dWAR_decay', '필딩 RAA_decay', 'Range RAA_decay', '수비 RAA_decay', '포지션 RAA_decay', 'E_decay']
others_base = ['Rep.Wins_decay', 'Pos.Wins_decay', 'RAR_decay']
momentum_base = ['in_season_win_rate', 'L10_win_rate']

# Home/Away
offense_features = [f'home_{c}' for c in offense_base] + [f'away_{c}' for c in offense_base]
pitching_features = [f'home_{c}' for c in pitching_base] + [f'away_{c}' for c in pitching_base]
defense_features = [f'home_{c}' for c in defense_base] + [f'away_{c}' for c in defense_base]
others_features = [f'home_{c}' for c in others_base] + [f'away_{c}' for c in others_base]
momentum_features = [f'home_{c}' for c in momentum_base] + [f'away_{c}' for c in momentum_base]
# Diff
diff_offense_features = [f'diff_{c}' for c in offense_base]
diff_pitching_features = [f'diff_{c}' for c in pitching_base]
diff_defense_features = [f'diff_{c}' for c in defense_base]
diff_others_features = [f'diff_{c}' for c in others_base]
diff_momentum_features = [f'diff_{c}' for c in momentum_base]


# sub-model 정의
# 지표별 개별 트리 10개
common_dt_params = {
    'max_depth': 4,
    'min_samples_leaf': 20,
    'random_state': RANDOM_SEED
}

pipe_offense = Pipeline([('s', ColumnTransformer([('p', 'passthrough', offense_features)], remainder='drop')), ('m', DecisionTreeClassifier(**common_dt_params))])
pipe_pitching = Pipeline([('s', ColumnTransformer([('p', 'passthrough', pitching_features)], remainder='drop')), ('m', DecisionTreeClassifier(**common_dt_params))])
pipe_defense = Pipeline([('s', ColumnTransformer([('p', 'passthrough', defense_features)], remainder='drop')), ('m', DecisionTreeClassifier(**common_dt_params))])
pipe_others = Pipeline([('s', ColumnTransformer([('p', 'passthrough', others_features)], remainder='drop')), ('m', DecisionTreeClassifier(**common_dt_params))])
pipe_momentum = Pipeline([('s', ColumnTransformer([('p', 'passthrough', momentum_features)], remainder='drop')), ('m', DecisionTreeClassifier(**common_dt_params))])
pipe_diff_offense = Pipeline([('s', ColumnTransformer([('p', 'passthrough', diff_offense_features)], remainder='drop')), ('m', DecisionTreeClassifier(**common_dt_params))])
pipe_diff_pitching = Pipeline([('s', ColumnTransformer([('p', 'passthrough', diff_pitching_features)], remainder='drop')), ('m', DecisionTreeClassifier(**common_dt_params))])
pipe_diff_defense = Pipeline([('s', ColumnTransformer([('p', 'passthrough', diff_defense_features)], remainder='drop')), ('m', DecisionTreeClassifier(**common_dt_params))])
pipe_diff_others = Pipeline([('s', ColumnTransformer([('p', 'passthrough', diff_others_features)], remainder='drop')), ('m', DecisionTreeClassifier(**common_dt_params))])
pipe_diff_momentum = Pipeline([('s', ColumnTransformer([('p', 'passthrough', diff_momentum_features)], remainder='drop')), ('m', DecisionTreeClassifier(**common_dt_params))])


estimators_list = [
    ('offense_tree', pipe_offense),
    ('pitching_tree', pipe_pitching),
    ('defense_tree', pipe_defense),
    ('others_tree', pipe_others),
    ('momentum_tree', pipe_momentum),
    ('diff_offense_tree', pipe_diff_offense),
    ('diff_pitching_tree', pipe_diff_pitching),
    ('diff_defense_tree', pipe_diff_defense),
    ('diff_others_tree', pipe_diff_others),
    ('diff_momentum_tree', pipe_diff_momentum)
]

# 현재는 해당년도 전적에 높은 가중치 들어간 상태
# 조절해 보아도 유의미한 차이는 없었음
manual_weights = [0.05, 0.05, 0.05, 0.05, 0.30, 0.05, 0.05, 0.05, 0.05, 0.30]

# 최종 모델 정의
# soft voting, 수동 가중치
voting_model_manual = VotingClassifier(
    estimators=estimators_list,
    voting='soft',
    weights=manual_weights,
    n_jobs=-1
)

# 학습
voting_model_manual.fit(X_train_new, y_train_new)

# 모델 평가
preds_manual = voting_model_manual.predict(X_test_new)
pred_probs_manual = voting_model_manual.predict_proba(X_test_new)[:, 1]

accuracy_manual = accuracy_score(y_test_new, preds_manual)
roc_auc_manual = roc_auc_score(y_test_new, pred_probs_manual)
logloss_manual = log_loss(y_test_new, pred_probs_manual)

print(f"\n[Test Set 평가 결과]")
print(f"Accuracy (정확도): {accuracy_manual:.4f}")
print(f"ROC AUC: {roc_auc_manual:.4f}")
print(f"Log Loss: {logloss_manual:.4f}")


[Test Set 평가 결과]
Accuracy (정확도): 0.5301
ROC AUC: 0.5549
Log Loss: 0.6894


In [493]:
# XGBoost
# 모든 feature 사용


y_xgb = final_df_complete['home_win']
drop_cols = ['date', 'season', 'home_id', 'away_id', 'home_runs', 'away_runs', 'home_win']
X_xgb = final_df_complete.drop(columns=drop_cols)

print(f"X shape: {X_xgb.shape}")

RANDOM_SEED = 42
X_train_val_xgb, X_test_xgb, y_train_val_xgb, y_test_xgb = train_test_split(
    X_xgb, y_xgb, test_size=0.1, random_state=RANDOM_SEED, shuffle=True
)
val_relative_size = 0.1 / 0.9
X_train_xgb, X_val_xgb, y_train_xgb, y_val_xgb = train_test_split(
    X_train_val_xgb, y_train_val_xgb, test_size=val_relative_size, random_state=RANDOM_SEED, shuffle=True
)
# 모델 정의
model_xgb_all = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    n_estimators=100,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=RANDOM_SEED,
    early_stopping_rounds=50
)

model_xgb_all.fit(
    X_train_xgb,
    y_train_xgb,
    eval_set=[(X_val_xgb, y_val_xgb)],
    verbose=False
)

preds_xgb = model_xgb_all.predict(X_test_xgb)
pred_probs_xgb = model_xgb_all.predict_proba(X_test_xgb)[:, 1]

print(f"\n[Test Set 평가 결과]")
print(f"Accuracy (정확도): {accuracy_score(y_test_xgb, preds_xgb):.4f}")
print(f"ROC AUC: {roc_auc_score(y_test_xgb, pred_probs_xgb):.4f}")
print(f"Log Loss: {log_loss(y_test_xgb, pred_probs_xgb):.4f}")

X shape: (14279, 382)

[Test Set 평가 결과]
Accuracy (정확도): 0.5427
ROC AUC: 0.5591
Log Loss: 0.6888


In [494]:
# XGBoost
# 전체 feature 중 영향력 top-20 feature 선별하여 학습

y_xgb = final_df_complete['home_win']
drop_cols = ['date', 'season', 'home_id', 'away_id', 'home_runs', 'away_runs', 'home_win']
X_xgb = final_df_complete.drop(columns=[c for c in drop_cols if c in final_df_complete.columns])

RANDOM_SEED = 42
X_train_val_xgb, X_test_xgb, y_train_val_xgb, y_test_xgb = train_test_split(
    X_xgb, y_xgb, test_size=0.1, random_state=RANDOM_SEED, shuffle=True
)
val_relative_size = 0.1 / 0.9
X_train_xgb, X_val_xgb, y_train_xgb, y_val_xgb = train_test_split(
    X_train_val_xgb, y_train_val_xgb, test_size=val_relative_size, random_state=RANDOM_SEED, shuffle=True
)
# 영향력 확인 위함
model_xgb_all = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    n_estimators=400,
    learning_rate=0.03,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=RANDOM_SEED
)

model_xgb_all.fit(
    X_train_xgb, y_train_xgb,
    eval_set=[(X_val_xgb, y_val_xgb)],
    verbose=False
)
# 영향력 높은 feature 20개 선택해 새 dataset 정의
booster = model_xgb_all.get_booster()
imp = booster.get_score(importance_type='gain')
if len(imp) > 0 and all(k.startswith('f') for k in imp.keys()):
    name_map = {f"f{i}": n for i, n in enumerate(X_train_xgb.columns)}
    imp_named = {name_map[k]: v for k, v in imp.items() if k in name_map}
else:
    imp_named = imp

imp_series = pd.Series(imp_named, dtype=float).reindex(X_train_xgb.columns).fillna(0.0)
top_features = imp_series.sort_values(ascending=False).head(20).index.tolist()
print(f"Top20 features: {top_features}")

X_train_top = X_train_xgb[top_features]
X_val_top   = X_val_xgb[top_features]
X_test_top  = X_test_xgb[top_features]
# 선택한 feature 기반 학습
model_xgb_top20 = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    n_estimators=100,
    learning_rate=0.03,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=RANDOM_SEED
)
model_xgb_top20.fit(
    X_train_top, y_train_xgb,
    eval_set=[(X_val_top, y_val_xgb)],
    verbose=False
)

preds = model_xgb_top20.predict(X_test_top)
probs = model_xgb_top20.predict_proba(X_test_top)[:, 1]

print("\n[Test Set 평가 결과]")
print(f"Accuracy: {accuracy_score(y_test_xgb, preds):.4f}")
print(f"ROC AUC : {roc_auc_score(y_test_xgb, probs):.4f}")
print(f"LogLoss : {log_loss(y_test_xgb, probs):.4f}")


Top20 features: ['home_BABIP.x_decay', 'home_RPW.y_decay', 'home_FIP_decay', 'diff_AVG.x_decay', 'diff_ERA_decay', 'diff_wRC_decay', 'away_rRA9-_decay', 'home_AVG.x_decay', 'away_W_decay', 'home_SO.y_decay', 'home_FIP-_decay', 'home_LOB_decay', 'diff_kwERA_decay', 'away_K%.x_decay', 'home_WAR.x_decay', 'away_BB.y_decay', 'diff_K/BB_decay', 'diff_SLG_decay', 'away_연봉.x_decay', 'diff_ePA_decay']

[Test Set 평가 결과]
Accuracy: 0.5623
ROC AUC : 0.5761
LogLoss : 0.6869
