In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from autogluon.tabular import TabularPredictor

# ----------------------
# 0. 설정
# ----------------------
BASE_PATH = "open_track1/"
PATH_TRAIN = os.path.join(BASE_PATH, "train.csv")
PATH_TEST = os.path.join(BASE_PATH, "test.csv")
PATH_MATCH_INFO = os.path.join(BASE_PATH, "match_info.csv")
PATH_SAMPLE_SUB = os.path.join(BASE_PATH, "sample_submission.csv")

K = 20   # 마지막 K 이벤트 사용 (20~32 사이 선택)

# ----------------------
# 1. 데이터 로드
# ----------------------
train = pd.read_csv(PATH_TRAIN)
test_index = pd.read_csv(PATH_TEST)
match_info = pd.read_csv(PATH_MATCH_INFO)
sample_sub = pd.read_csv(PATH_SAMPLE_SUB)

test_events_list = []
for _, row in test_index.iterrows():
    # path가 "./test/..." 형식이므로 BASE_PATH와 결합
    test_path = os.path.join(BASE_PATH, row["path"].lstrip("./"))
    df_ep = pd.read_csv(test_path)
    test_events_list.append(df_ep)

test_events = pd.concat(test_events_list, ignore_index=True)

train["is_train"] = 1
test_events["is_train"] = 0

events = pd.concat([train, test_events], ignore_index=True)

# ----------------------
# 2. 기본 정렬 + episode 내 인덱스
# ----------------------
events = events.sort_values(["game_episode", "time_seconds", "action_id"]).reset_index(drop=True)

events["event_idx"] = events.groupby("game_episode").cumcount()
events["n_events"] = events.groupby("game_episode")["event_idx"].transform("max") + 1
events["ep_idx_norm"] = events["event_idx"] / (events["n_events"] - 1).clip(lower=1)

# ----------------------
# 3. 시간/공간 feature
# ----------------------
# Δt
events["prev_time"] = events.groupby("game_episode")["time_seconds"].shift(1)
events["dt"] = events["time_seconds"] - events["prev_time"]
events["dt"] = events["dt"].fillna(0.0)

# 이동량/거리
events["dx"] = events["end_x"] - events["start_x"]
events["dy"] = events["end_y"] - events["start_y"]
events["dist"] = np.sqrt(events["dx"]**2 + events["dy"]**2)

# 속도 (dt=0 보호)
events["speed"] = events["dist"] / events["dt"].replace(0, 1e-3)

# zone / lane (필요시 범위 조정)
events["x_zone"] = (events["start_x"] / (105/7)).astype(int).clip(0, 6)
events["lane"] = pd.cut(
    events["start_y"],
    bins=[0, 68/3, 2*68/3, 68],
    labels=[0, 1, 2],
    include_lowest=True
).astype(int)

# ----------------------
# 4. 라벨 및 episode-level 메타 (train 전용)
# ----------------------
train_events = events[events["is_train"] == 1].copy()

last_events = (
    train_events
    .groupby("game_episode", as_index=False)
    .tail(1)
    .copy()
)

labels = last_events[["game_episode", "end_x", "end_y"]].rename(
    columns={"end_x": "target_x", "end_y": "target_y"}
)

# episode-level 메타 (마지막 이벤트 기준)
ep_meta = last_events[["game_episode", "game_id", "team_id", "is_home", "period_id", "time_seconds"]].copy()
ep_meta = ep_meta.rename(columns={"team_id": "final_team_id"})

# game_clock (분 단위, 0~90+)
ep_meta["game_clock_min"] = np.where(
    ep_meta["period_id"] == 1,
    ep_meta["time_seconds"] / 60.0,
    45.0 + ep_meta["time_seconds"] / 60.0
)

# ----------------------
# 5. 공격 팀 플래그 (final_team vs 상대)
# ----------------------
# final_team_id를 전체 events에 붙임
events = events.merge(
    ep_meta[["game_episode", "final_team_id"]],
    on="game_episode",
    how="left"
)

events["is_final_team"] = (events["team_id"] == events["final_team_id"]).astype(int)

# ----------------------
# 6. 입력용 events에서 마지막 이벤트 타깃 정보 가리기
# ----------------------
# is_last 플래그
events["last_idx"] = events.groupby("game_episode")["event_idx"].transform("max")
events["is_last"] = (events["event_idx"] == events["last_idx"]).astype(int)

# labels는 이미 뽑아놨으니, 입력쪽에서만 end_x, end_y, dx, dy, dist, speed 지움
mask_last = events["is_last"] == 1
for col in ["end_x", "end_y", "dx", "dy", "dist", "speed"]:
    events.loc[mask_last, col] = np.nan

# ----------------------
# 7. 카테고리 인코딩 (type_name, result_name, team_id 등)
# ----------------------
events["type_name"] = events["type_name"].fillna("__NA_TYPE__")
events["result_name"] = events["result_name"].fillna("__NA_RES__")

le_type = LabelEncoder()
le_res = LabelEncoder()

events["type_id"] = le_type.fit_transform(events["type_name"])
events["res_id"] = le_res.fit_transform(events["result_name"])

# team_id는 그대로 써도 되지만, 문자열이면 숫자로 매핑
if events["team_id"].dtype == "object":
    le_team = LabelEncoder()
    events["team_id_enc"] = le_team.fit_transform(events["team_id"])
else:
    events["team_id_enc"] = events["team_id"].astype(int)

# ----------------------
# 8. 마지막 K 이벤트만 사용 (lastK)
# ----------------------
# rev_idx: 0이 마지막 이벤트
events["rev_idx"] = events.groupby("game_episode")["event_idx"].transform(
    lambda s: s.max() - s
)

lastK = events[events["rev_idx"] < K].copy()

# pos_in_K: 0~(K-1), 앞쪽 패딩 고려해서 뒤에 실제 이벤트가 모이게
def assign_pos_in_K(df):
    df = df.sort_values("event_idx")  # 오래된 → 최근
    L = len(df)
    df = df.copy()
    df["pos_in_K"] = np.arange(K - L, K)
    return df

lastK = lastK.groupby("game_episode", group_keys=False).apply(assign_pos_in_K)

# ----------------------
# 9. wide feature pivot
# ----------------------
# 사용할 이벤트 피처 선택
num_cols = [
    "start_x", "start_y",
    "end_x", "end_y",
    "dx", "dy", "dist", "speed",
    "dt",
    "ep_idx_norm",
    "x_zone", "lane",
    "is_final_team",
]

cat_cols = [
    "type_id",
    "res_id",
    "team_id_enc",
    "is_home",
    "period_id",
    "is_last",
]

feature_cols = num_cols + cat_cols

wide = lastK[["game_episode", "pos_in_K"] + feature_cols].copy()

# 숫자형 pivot
wide_num = wide.pivot_table(
    index="game_episode",
    columns="pos_in_K",
    values=num_cols,
    aggfunc="first"
)

# 범주형 pivot
wide_cat = wide.pivot_table(
    index="game_episode",
    columns="pos_in_K",
    values=cat_cols,
    aggfunc="first"
)

# 컬럼 이름 평탄화
wide_num.columns = [f"{c}_{int(pos)}" for (c, pos) in wide_num.columns]
wide_cat.columns = [f"{c}_{int(pos)}" for (c, pos) in wide_cat.columns]

X = pd.concat([wide_num, wide_cat], axis=1).reset_index()  # game_episode 포함

# episode-level 메타 붙이기
X = X.merge(ep_meta[["game_episode", "game_id", "game_clock_min", "final_team_id", "is_home", "period_id"]],
            on="game_episode", how="left")

# train 라벨 붙이기
X = X.merge(labels, on="game_episode", how="left")  # test는 NaN

# ----------------------
# 10. train/test 분리
# ----------------------
train_mask = X["game_episode"].isin(labels["game_episode"])
X_train = X[train_mask].copy()
X_test = X[~train_mask].copy()

y_train_x = X_train["target_x"].astype(float)
y_train_y = X_train["target_y"].astype(float)

# 모델 입력에서 빼야 할 컬럼들
drop_cols = [
    "game_episode",
    "game_id",
    "target_x",
    "target_y",
]

X_train_feat = X_train.drop(columns=drop_cols)
X_test_feat = X_test.drop(columns=[c for c in drop_cols if c in X_test.columns])

# NaN 채우기 (LGBM은 NaN 다루긴 하지만, 깔끔하게)
X_train_feat = X_train_feat.fillna(0)
X_test_feat = X_test_feat.fillna(0)

# ----------------------
# 11. AutoGluon 학습
# ----------------------
# X 좌표 예측 모델
print("=" * 50)
print("X 좌표 모델 학습 시작...")
print("=" * 50)

train_data_x = X_train_feat.copy()
train_data_x["target_x"] = y_train_x

predictor_x = TabularPredictor(
    label="target_x",
    problem_type="regression",
    eval_metric="rmse",
    path="ag_models_x"  # 모델 저장 경로
).fit(
    train_data=train_data_x,
    time_limit=1800,  # 30분 (필요에 따라 조정: 600=10분, 1800=30분, 3600=1시간)
    presets="best_quality",  # 최고 품질 모드 (빠른 테스트: "good_quality" 또는 "medium_quality")
    verbosity=2
)

print("\n" + "=" * 50)
print("Y 좌표 모델 학습 시작...")
print("=" * 50)

train_data_y = X_train_feat.copy()
train_data_y["target_y"] = y_train_y

predictor_y = TabularPredictor(
    label="target_y",
    problem_type="regression",
    eval_metric="rmse",
    path="ag_models_y"  # 모델 저장 경로
).fit(
    train_data=train_data_y,
    time_limit=1800,  # 30분 (필요에 따라 조정: 600=10분, 1800=30분, 3600=1시간)
    presets="best_quality",  # 최고 품질 모드 (빠른 테스트: "good_quality" 또는 "medium_quality")
    verbosity=2
)

# ----------------------
# 12. test 예측
# ----------------------
print("=" * 50)
print("Test 데이터 예측 중...")
print("=" * 50)

pred_x = predictor_x.predict(X_test_feat)
pred_y = predictor_y.predict(X_test_feat)

# 필드 범위로 클립
pred_x = np.clip(pred_x, 0, 105)
pred_y = np.clip(pred_y, 0, 68)

# ----------------------
# 13. submission 생성
# ----------------------
sub = sample_sub.copy()

# X_test에는 game_episode가 있으니, test_index와 align
pred_df = X_test[["game_episode"]].copy()
pred_df["end_x"] = pred_x
pred_df["end_y"] = pred_y

sub = sub.drop(columns=["end_x", "end_y"], errors="ignore")
sub = sub.merge(pred_df, on="game_episode", how="left")

sub.to_csv("submission_autogluon_lastK.csv", index=False)
print("Saved submission_autogluon_lastK.csv")


  lastK = lastK.groupby("game_episode", group_keys=False).apply(assign_pos_in_K)
  X_train_feat = X_train_feat.fillna(0)
  X_test_feat = X_test_feat.fillna(0)
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.5.0
Python Version:     3.13.9
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 25.1.0: Mon Oct 20 19:32:41 PDT 2025; root:xnu-12377.41.6~2/RELEASE_ARM64_T6000
CPU Count:          8
Pytorch Version:    2.9.1
CUDA Version:       CUDA is not available
Memory Avail:       3.57 GB / 16.00 GB (22.3%)
Disk Space Avail:   15.38 GB / 460.43 GB (3.3%)
Presets specified: ['best_quality']
Using hyperparameters preset: hyperparameters='zeroshot'
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine 

X 좌표 모델 학습 시작...


Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    3687.36 MB
	Train Data (Original)  Memory Usage: 46.35 MB (1.3% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
			Note: Converting 44 features to boolean dtype as they only contain 2 unique values.
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Stage 5 Generators:
		Fitting DropDuplicatesFeatureGenerator...
	Useless Original Features (Count: 21): ['is_final_team_19', 'is_last_0', 'is_last_1', 'is_last_2', 'is_last_3', 'is_last_4', 'is_last_5', 'is_last_6', 'is_last_7', 'is_last_8', 'is_last_9', 'is_last_10', 'is_last_11', 'is_last_12', 'is_last_13', 'is_last_14',


Y 좌표 모델 학습 시작...


	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
			Note: Converting 44 features to boolean dtype as they only contain 2 unique values.
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Stage 5 Generators:
		Fitting DropDuplicatesFeatureGenerator...
	Useless Original Features (Count: 21): ['is_final_team_19', 'is_last_0', 'is_last_1', 'is_last_2', 'is_last_3', 'is_last_4', 'is_last_5', 'is_last_6', 'is_last_7', 'is_last_8', 'is_last_9', 'is_last_10', 'is_last_11', 'is_last_12', 'is_last_13', 'is_last_14', 'is_last_15', 'is_last_16', 'is_last_17', 'is_last_18', 'is_last_19']
		These features carry no predictive signal and should be manually investigated.
		This is typically a feature which has the same value for all ro

Test 데이터 예측 중...
Saved submission_autogluon_lastK.csv


In [3]:
# ----------------------
# 결과 확인 (학습 완료 후 실행)
# ----------------------
from autogluon.tabular import TabularPredictor
import pandas as pd
import numpy as np

# 저장된 모델 로드
predictor_x = TabularPredictor.load("ag_models_x")
predictor_y = TabularPredictor.load("ag_models_y")

print("=" * 50)
print("모델 성능 확인")
print("=" * 50)

# X 좌표 모델 리더보드
print("\n[X 좌표 모델 - 리더보드]")
leaderboard_x = predictor_x.leaderboard(silent=True)
print(leaderboard_x.head(10))

# Y 좌표 모델 리더보드
print("\n[Y 좌표 모델 - 리더보드]")
leaderboard_y = predictor_y.leaderboard(silent=True)
print(leaderboard_y.head(10))

# Train 데이터로 성능 평가 (첫 번째 셀 실행 후 사용 가능)
try:
    print("\n" + "=" * 50)
    print("Train 데이터 성능 평가")
    print("=" * 50)
    
    # X 좌표 평가
    y_pred_train_x = predictor_x.predict(X_train_feat)
    rmse_x = np.sqrt(np.mean((y_train_x - y_pred_train_x) ** 2))
    print(f"\nX 좌표 RMSE (Train): {rmse_x:.4f}")
    
    # Y 좌표 평가
    y_pred_train_y = predictor_y.predict(X_train_feat)
    rmse_y = np.sqrt(np.mean((y_train_y - y_pred_train_y) ** 2))
    print(f"Y 좌표 RMSE (Train): {rmse_y:.4f}")
except NameError:
    print("\n(첫 번째 셀을 먼저 실행해야 Train 데이터 평가가 가능합니다)")

# 제출 파일 확인
print("\n" + "=" * 50)
print("제출 파일 확인")
print("=" * 50)
sub = pd.read_csv("submission_autogluon_lastK.csv")
print(f"\n제출 파일 행 수: {len(sub)}")
print(f"\n제출 파일 샘플:")
print(sub.head(10))
print(f"\n제출 파일 통계:")
print(sub.describe())


모델 성능 확인

[X 좌표 모델 - 리더보드]
                  model  score_val              eval_metric  pred_time_val  \
0   WeightedEnsemble_L3 -11.738371  root_mean_squared_error      11.849119   
1   WeightedEnsemble_L2 -11.742939  root_mean_squared_error       2.881452   
2       CatBoost_BAG_L2 -11.793024  root_mean_squared_error       8.744761   
3  CatBoost_r177_BAG_L2 -11.801320  root_mean_squared_error       8.742393   
4     LightGBMXT_BAG_L2 -11.809666  root_mean_squared_error       8.838531   
5  LightGBM_r131_BAG_L1 -11.815296  root_mean_squared_error       0.538745   
6  ExtraTreesMSE_BAG_L2 -11.836941  root_mean_squared_error       9.913169   
7       LightGBM_BAG_L2 -11.842222  root_mean_squared_error       8.818286   
8  LightGBM_r131_BAG_L2 -11.843725  root_mean_squared_error       9.158671   
9  CatBoost_r177_BAG_L1 -11.851047  root_mean_squared_error       0.025972   

      fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  \
0  1071.772211                0.000380  