In [7]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from autogluon.tabular import TabularPredictor

# ======================
# 0. 설정
# ======================
BASE_PATH = "open_track1/"  # ✅ 너 환경에 맞게 폴더 경로만 확인

PATH_TRAIN = os.path.join(BASE_PATH, "train.csv")
PATH_TEST_INDEX = os.path.join(BASE_PATH, "test.csv")  # test 파일들의 path가 적힌 인덱스
PATH_MATCH_INFO = os.path.join(BASE_PATH, "match_info.csv")
PATH_SAMPLE_SUB = os.path.join(BASE_PATH, "sample_submission.csv")

# last K events
K = 20

# bin 설정
BIN_X = 5.0
BIN_Y = 5.0
NX = int(np.ceil(105 / BIN_X))  # 21
NY = int(np.ceil(68 / BIN_Y))   # 14

# 학습 시간 제한: 10분
TIME_LIMIT = 600
PRESETS = "medium_quality"  # ⭐ 10분 제한에서는 good_quality보다 medium이 안정적/빠름

HYPERPARAMETERS = {
    "GBM": {},
    "CAT": {},
}

TOP_K_SOFTAVG = 3  # 확률 가중 평균에서 사용할 top-k

# ======================
# 1. 데이터 로드
# ======================
train = pd.read_csv(PATH_TRAIN)
test_index = pd.read_csv(PATH_TEST_INDEX)
match_info = pd.read_csv(PATH_MATCH_INFO)
sample_sub = pd.read_csv(PATH_SAMPLE_SUB)

# test 이벤트 파일들 로드 (test.csv에 path가 존재한다고 가정)
test_events_list = []
for _, row in test_index.iterrows():
    test_path = os.path.join(BASE_PATH, str(row["path"]).lstrip("./"))
    df_ep = pd.read_csv(test_path)
    test_events_list.append(df_ep)

test_events = pd.concat(test_events_list, ignore_index=True)

# train/test 합치기
train["is_train"] = 1
test_events["is_train"] = 0
events = pd.concat([train, test_events], ignore_index=True)

# ======================
# 2. 기본 정렬 + episode 인덱스
# ======================
# (컬럼명은 네 데이터 기준: game_episode, time_seconds, action_id 사용)
events = events.sort_values(
    ["game_episode", "time_seconds", "action_id"]
).reset_index(drop=True)

events["event_idx"] = events.groupby("game_episode").cumcount()
events["n_events"] = events.groupby("game_episode")["event_idx"].transform("max") + 1
events["ep_idx_norm"] = events["event_idx"] / (events["n_events"] - 1).clip(lower=1)

# ======================
# 3. 기본 수치 피처 (시간/이동)
# ======================
events["prev_time"] = events.groupby("game_episode")["time_seconds"].shift(1)
events["dt"] = (events["time_seconds"] - events["prev_time"]).fillna(0)

events["dx"] = events["end_x"] - events["start_x"]
events["dy"] = events["end_y"] - events["start_y"]
events["dist"] = np.sqrt(events["dx"]**2 + events["dy"]**2)

# dt=0 회피
events["speed"] = events["dist"] / events["dt"].replace(0, 1e-3)

# 구역/레인
events["x_zone"] = (events["start_x"] / (105/7)).astype(int).clip(0, 6)

events["lane"] = pd.cut(
    events["start_y"],
    bins=[0, 68/3, 2*68/3, 68],
    labels=[0, 1, 2],
    include_lowest=True
).astype(int)

# ======================
# 4. 공간/각도/코너 관련 피처
# ======================
# 골대 중앙(105, 34) 기준 각도
events["angle_to_goal"] = np.arctan2(
    34 - events["start_y"],
    105 - events["start_x"]
) * 180 / np.pi

# 코너까지 거리 (상단/하단 코너)
events["dist_corner_top"] = np.sqrt((105 - events["start_x"])**2 + (0 - events["start_y"])**2)
events["dist_corner_bottom"] = np.sqrt((105 - events["start_x"])**2 + (68 - events["start_y"])**2)
events["dist_to_nearest_corner"] = events[["dist_corner_top", "dist_corner_bottom"]].min(axis=1)

# 코너 지역 플래그 (대충 x>100 & y near sideline)
events["is_corner_area"] = (
    (events["start_x"] > 100) &
    ((events["start_y"] < 5) | (events["start_y"] > 63))
).astype(int)

# 코너 상호작용
events["angle_goal_x_corner"] = events["angle_to_goal"] * events["is_corner_area"]
events["dist_corner_x_angle"] = events["dist_to_nearest_corner"] * events["angle_to_goal"]

# 핵심 3개 (네가 추가했던 것 유지)
events["dist_to_sideline"] = events["start_y"].apply(lambda y: min(y, 68 - y))

events["angle_to_goal_center"] = np.arctan2(
    34 - events["start_y"],
    105 - events["start_x"]
)  # radians

events["time_pos_inter"] = events["ep_idx_norm"] * events["start_x"]

# ======================
# 5. 라벨 생성 (Train의 마지막 이벤트 end_x/end_y -> bin)
# ======================
train_events = events[events["is_train"] == 1].copy()

last_events = (
    train_events
    .groupby("game_episode", as_index=False)
    .tail(1)
    .copy()
)

last_events["target_x_bin"] = (last_events["end_x"] / BIN_X).astype(int).clip(0, NX - 1)
last_events["target_y_bin"] = (last_events["end_y"] / BIN_Y).astype(int).clip(0, NY - 1)

labels = last_events[["game_episode", "target_x_bin", "target_y_bin"]].copy()

# ======================
# 6. episode-level meta 구성 (마지막 이벤트 기준)
# ======================
# (데이터에 team_id, is_home, period_id 등이 있다고 가정)
ep_meta = last_events[[
    "game_episode", "game_id", "team_id", "is_home", "period_id", "time_seconds"
]].rename(columns={"team_id": "final_team_id"}).copy()

# 경기 시간(분) 간단 변환
ep_meta["game_clock_min"] = np.where(
    ep_meta["period_id"] == 1,
    ep_meta["time_seconds"] / 60,
    45 + ep_meta["time_seconds"] / 60
)

# final_team_id 붙이기
events = events.merge(
    ep_meta[["game_episode", "final_team_id"]],
    on="game_episode",
    how="left"
)
events["is_final_team"] = (events["team_id"] == events["final_team_id"]).astype(int)

# ======================
# 7. 누수 방지: 마지막 이벤트 마스킹
# ======================
events["last_idx"] = events.groupby("game_episode")["event_idx"].transform("max")
events["is_last"] = (events["event_idx"] == events["last_idx"]).astype(int)

mask_last = events["is_last"] == 1
for col in ["end_x", "end_y", "dx", "dy", "dist", "speed"]:
    events.loc[mask_last, col] = np.nan

# ======================
# 8. 카테고리 인코딩
# ======================
events["type_name"] = events["type_name"].fillna("__NA__")
events["result_name"] = events["result_name"].fillna("__NA__")

events["type_id"] = LabelEncoder().fit_transform(events["type_name"])
events["res_id"] = LabelEncoder().fit_transform(events["result_name"])

# team_id가 문자열일 수도 있으니 처리
if events["team_id"].dtype == "object":
    events["team_id_enc"] = LabelEncoder().fit_transform(events["team_id"].astype(str))
else:
    events["team_id_enc"] = events["team_id"].astype(int)

# ======================
# 9. lastK 추출
# ======================
events["rev_idx"] = events.groupby("game_episode")["event_idx"].transform(lambda s: s.max() - s)
lastK = events[events["rev_idx"] < K].copy()

def assign_pos(df):
    df = df.sort_values("event_idx")
    L = len(df)
    df["pos_in_K"] = np.arange(K - L, K)
    return df

lastK = lastK.groupby("game_episode", group_keys=False).apply(assign_pos)

# ======================
# 10. wide feature 생성 (pivot)
# ======================
num_cols = [
    "start_x", "start_y", "end_x", "end_y",
    "dx", "dy", "dist", "speed", "dt",
    "ep_idx_norm", "x_zone", "lane",
    "is_final_team",
    "angle_to_goal", "dist_to_nearest_corner", "is_corner_area",
    "angle_goal_x_corner", "dist_corner_x_angle",
    "dist_to_sideline", "angle_to_goal_center", "time_pos_inter"
]

cat_cols = [
    "type_id", "res_id", "team_id_enc",
    "is_home", "period_id", "is_last"
]

wide = lastK[["game_episode", "pos_in_K"] + num_cols + cat_cols].copy()

wide_num = wide.pivot_table(
    index="game_episode", columns="pos_in_K",
    values=num_cols, aggfunc="first"
)
wide_cat = wide.pivot_table(
    index="game_episode", columns="pos_in_K",
    values=cat_cols, aggfunc="first"
)

wide_num.columns = [f"{c}_{int(p)}" for c, p in wide_num.columns]
wide_cat.columns = [f"{c}_{int(p)}" for c, p in wide_cat.columns]

X = pd.concat([wide_num, wide_cat], axis=1).reset_index()

# episode meta 붙이기
X = X.merge(
    ep_meta[["game_episode", "game_id", "game_clock_min", "final_team_id", "is_home", "period_id"]],
    on="game_episode",
    how="left"
)

# 라벨 붙이기 (train만 존재)
X = X.merge(labels, on="game_episode", how="left")

# ======================
# 11. Train / Test 분리
# ======================
train_mask = X["game_episode"].isin(labels["game_episode"])
X_train = X[train_mask].copy()
X_test = X[~train_mask].copy()

y_train_x = X_train["target_x_bin"].astype(int)
y_train_y = X_train["target_y_bin"].astype(int)

drop_cols = ["game_episode", "game_id", "target_x_bin", "target_y_bin"]

X_train_feat = X_train.drop(columns=drop_cols).fillna(0)
X_test_feat = X_test.drop(columns=[c for c in drop_cols if c in X_test.columns]).fillna(0)

# ======================
# 12. AutoGluon 학습 (10분 제한)
# ======================
# X-bin
train_x = X_train_feat.copy()
train_x["target_x_bin"] = y_train_x

predictor_x = TabularPredictor(
    label="target_x_bin",
    problem_type="multiclass",
    eval_metric="log_loss",
    path="ag_models_xbin_10min"
).fit(
    train_data=train_x,
    time_limit=TIME_LIMIT,
    presets=PRESETS,
    hyperparameters=HYPERPARAMETERS,
    verbosity=2
)

# Y-bin
train_y = X_train_feat.copy()
train_y["target_y_bin"] = y_train_y

predictor_y = TabularPredictor(
    label="target_y_bin",
    problem_type="multiclass",
    eval_metric="log_loss",
    path="ag_models_ybin_10min"
).fit(
    train_data=train_y,
    time_limit=TIME_LIMIT,
    presets=PRESETS,
    hyperparameters=HYPERPARAMETERS,
    verbosity=2
)

# ======================
# 13. 확률 → 연속 좌표 복원 (RMSE 대응 핵심)
# ======================
def proba_to_coord(proba: np.ndarray, bin_size: float, top_k: int = 3) -> float:
    """
    proba: (num_bins,) 확률벡터
    bin_size: BIN_X or BIN_Y
    """
    top_idx = np.argsort(proba)[-top_k:]
    weights = proba[top_idx]
    wsum = weights.sum()
    if wsum <= 0:
        # 예외 처리: 확률합이 0이면 argmax로 fallback
        idx = int(np.argmax(proba))
        return idx * bin_size + bin_size / 2

    weights = weights / wsum
    coords = top_idx * bin_size + bin_size / 2
    return float(np.sum(weights * coords))

# ======================
# 14. Test 예측 (SoftAvg)
# ======================
proba_x = predictor_x.predict_proba(X_test_feat).values  # (N, NX)
proba_y = predictor_y.predict_proba(X_test_feat).values  # (N, NY)

pred_x = np.array([proba_to_coord(p, BIN_X, TOP_K_SOFTAVG) for p in proba_x])
pred_y = np.array([proba_to_coord(p, BIN_Y, TOP_K_SOFTAVG) for p in proba_y])

pred_x = np.clip(pred_x, 0, 105)
pred_y = np.clip(pred_y, 0, 68)

# ======================
# 15. 제출 파일 생성 (형식 충돌 방지: 덮어쓰기 방식)
# ======================
pred_df = X_test[["game_episode"]].copy()
pred_df["end_x"] = pred_x
pred_df["end_y"] = pred_y

# sample_submission이 end_x/end_y를 이미 갖고 있으니 "merge suffix 충돌" 피해서 덮어쓰기
sub = sample_sub.copy().set_index("game_episode")
pred_df = pred_df.set_index("game_episode")

sub["end_x"] = pred_df["end_x"]
sub["end_y"] = pred_df["end_y"]

sub = sub.reset_index()

# 컬럼 순서 강제
sub = sub[["game_episode", "end_x", "end_y"]]

out_name = "submission_bin_softavg_10min_full.csv"
sub.to_csv(out_name, index=False)

print(f"✅ Saved submission: {out_name}")
print(sub.head(10))
print("Columns:", sub.columns.tolist())



  lastK = lastK.groupby("game_episode", group_keys=False).apply(assign_pos)
  X_train_feat = X_train.drop(columns=drop_cols).fillna(0)
  X_test_feat = X_test.drop(columns=[c for c in drop_cols if c in X_test.columns]).fillna(0)
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.5.0
Python Version:     3.13.9
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 25.1.0: Mon Oct 20 19:32:41 PDT 2025; root:xnu-12377.41.6~2/RELEASE_ARM64_T6000
CPU Count:          8
Pytorch Version:    2.9.1
CUDA Version:       CUDA is not available
Memory Avail:       3.76 GB / 16.00 GB (23.5%)
Disk Space Avail:   79.83 GB / 460.43 GB (17.3%)
Presets specified: ['medium_quality']
Beginning AutoGluon training ... Time limit = 600s
AutoGluon will save models to "/Users/yangjinmo/Desktop/k_league_ml/ag_models_xbin_10min"
Train Data Rows:    15435
Train Data Columns: 538
Label Column:       target_x_bin
Problem Type:       multiclass
Preprocessing data ...
Train Data

✅ Saved submission: submission_bin_softavg_10min_full.csv
  game_episode      end_x      end_y
0     153363_1  66.803431   4.271621
1     153363_2  37.273295  62.977507
2     153363_6  35.577678  65.158605
3     153363_7  52.682555   4.893350
4     153363_8  82.271261   7.022204
5     153363_9  76.906888  67.139524
6    153363_10  57.111227   5.347243
7    153363_12  66.878253   4.063411
8    153363_13  32.663202  66.192351
9    153363_15  59.366657   4.496272
Columns: ['game_episode', 'end_x', 'end_y']


In [13]:
# =======================
# 결과 확인 및 3단계 검증법
# (BIN Classification + Soft Average 버전)
# =======================

from autogluon.tabular import TabularPredictor
import pandas as pd
import numpy as np

# =======================
# 실행 플래그
# =======================
RUN_STEP1_FEATURE_IMPORTANCE = False   # ⚠️ 매우 느림
RUN_STEP2_LEADERBOARD = True
RUN_STEP3_SITUATIONAL_ANALYSIS = True

# =======================
# 모델 경로 & 제출 파일 (⭐ 중요: 새 경로)
# =======================
model_path_x = "ag_models_xbin_10min"
model_path_y = "ag_models_ybin_10min"
submission_filename = "submission_bin_softavg_10min_full.csv"

print(f"모델 경로: {model_path_x}, {model_path_y}")
print(f"제출 파일: {submission_filename}")

# =======================
# 모델 로드
# =======================
predictor_x = TabularPredictor.load(model_path_x)
predictor_y = TabularPredictor.load(model_path_y)

print("✓ 모델 로드 완료")
print("=" * 70)
print("3단계 검증법 (BIN + SoftAvg)")
print("=" * 70)

# ============================================================
# 1단계: Feature Importance (참고용)
# ============================================================
if RUN_STEP1_FEATURE_IMPORTANCE:
    print("\n[1단계] Feature Importance 확인")

    print("\n[X-bin 모델]")
    fi_x = predictor_x.feature_importance(data=X_train_feat)
    print(fi_x.iloc[:, 0].sort_values(ascending=False).head(20))

    print("\n[Y-bin 모델]")
    fi_y = predictor_y.feature_importance(data=X_train_feat)
    print(fi_y.iloc[:, 0].sort_values(ascending=False).head(20))

else:
    print("\n1단계: Feature Importance 건너뜀")

# ============================================================
# 2단계: Leaderboard (log_loss 기준)
# ============================================================
if RUN_STEP2_LEADERBOARD:
    print("\n" + "=" * 70)
    print("[2단계] Leaderboard (log_loss 기준)")
    print("=" * 70)

    print("\n[X-bin 모델]")
    lb_x = predictor_x.leaderboard(silent=True)
    print(lb_x.head(5))
    print(f"✓ Best log_loss (X): {lb_x.iloc[0]['score_val']:.5f}")

    print("\n[Y-bin 모델]")
    lb_y = predictor_y.leaderboard(silent=True)
    print(lb_y.head(5))
    print(f"✓ Best log_loss (Y): {lb_y.iloc[0]['score_val']:.5f}")

    # ⚠️ Train accuracy는 참고용 (RMSE 대회)
    print("\n[Train Accuracy 참고용]")
    pred_x_train = predictor_x.predict(X_train_feat)
    pred_y_train = predictor_y.predict(X_train_feat)

    acc_x = (pred_x_train == y_train_x).mean()
    acc_y = (pred_y_train == y_train_y).mean()

    print(f"Train Accuracy X-bin: {acc_x:.4f}")
    print(f"Train Accuracy Y-bin: {acc_y:.4f}")

else:
    print("\n2단계: Leaderboard 건너뜀")

# ============================================================
# 3단계: 상황별 예측 분포 분석 (⭐ 핵심)
# ============================================================
if RUN_STEP3_SITUATIONAL_ANALYSIS:
    print("\n" + "=" * 70)
    print("[3단계] 상황별 예측 분포 분석")
    print("=" * 70)

    # 제출 파일 로드
    sub = pd.read_csv(submission_filename)

    # --------------------------------------------------------
    # X_test에서 존재하는 컬럼만 안전하게 선택
    # --------------------------------------------------------
    merge_cols = ["game_episode"]

    if "period_id" in X_test.columns:
        merge_cols.append("period_id")

    if "game_clock_min" in X_test.columns:
        merge_cols.append("game_clock_min")

    # 병합
    merged = X_test[merge_cols].merge(
        sub, on="game_episode", how="left"
    )

    # --------------------------------------------------------
    # 컬럼 존재 sanity check
    # --------------------------------------------------------
    if ("end_x" not in merged.columns) or ("end_y" not in merged.columns):
        raise ValueError(
            f"❌ 제출 파일 컬럼 오류\n"
            f"merged columns: {merged.columns.tolist()}"
        )

    # --------------------------------------------------------
    # 전체 분포
    # --------------------------------------------------------
    print("\n[전체 예측 좌표 통계]")
    print(merged[["end_x", "end_y"]].describe())

    # --------------------------------------------------------
    # 전 / 후반 비교
    # --------------------------------------------------------
    if "period_id" in merged.columns:
        first_half = merged[merged["period_id"] == 1]
        second_half = merged[merged["period_id"] == 2]

        if len(first_half) > 0:
            print("\n[전반전 예측 분포]")
            print(first_half[["end_x", "end_y"]].describe())

        if len(second_half) > 0:
            print("\n[후반전 예측 분포]")
            print(second_half[["end_x", "end_y"]].describe())

    # --------------------------------------------------------
    # 공격 방향 sanity check (X축)
    # --------------------------------------------------------
    mean_x = merged["end_x"].mean()
    print("\n[공격 방향 sanity check]")
    print(
        f"end_x 평균: {mean_x:.2f} "
        "(값이 클수록 전진 패스 성향)"
    )

    # --------------------------------------------------------
    # Y축 중앙 쏠림 sanity check
    # --------------------------------------------------------
    std_y = merged["end_y"].std()
    print("\n[Y축 분산 sanity check]")
    print(
        f"end_y 표준편차: {std_y:.2f} "
        "(너무 작으면 중앙 쏠림 의심)"
    )

    # --------------------------------------------------------
    # 시간대별 (선택)
    # --------------------------------------------------------
    if "game_clock_min" in merged.columns:
        early = merged[merged["game_clock_min"] < 30]
        late = merged[merged["game_clock_min"] > 75]

        if len(early) > 0:
            print("\n[초반 (0~30분)]")
            print(early[["end_x", "end_y"]].describe())

        if len(late) > 0:
            print("\n[후반 막판 (75분~)]")
            print(late[["end_x", "end_y"]].describe())

else:
    print("\n3단계: 상황 분석 건너뜀")

print("\n" + "=" * 70)
print("검증 완료 (BIN + SoftAvg)")
print("=" * 70)




모델 경로: ag_models_xbin_10min, ag_models_ybin_10min
제출 파일: submission_bin_softavg_10min_full.csv
✓ 모델 로드 완료
3단계 검증법 (BIN + SoftAvg)

1단계: Feature Importance 건너뜀

[2단계] Leaderboard (log_loss 기준)

[X-bin 모델]
                 model  score_val eval_metric  pred_time_val    fit_time  \
0  WeightedEnsemble_L2  -2.104453    log_loss       0.034077  391.328376   
1             CatBoost  -2.119763    log_loss       0.007326  314.799729   
2             LightGBM  -2.155898    log_loss       0.025724   76.478824   

   pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  \
0                0.001027           0.049823            2       True   
1                0.007326         314.799729            1       True   
2                0.025724          76.478824            1       True   

   fit_order  
0          3  
1          2  
2          1  
✓ Best log_loss (X): -2.10445

[Y-bin 모델]
                 model  score_val eval_metric  pred_time_val    fit_time  \
0  WeightedEnsemble_L2  