In [2]:
!pip install -Uq tabpfn

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.3/41.3 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m611.0/611.0 kB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m144.7/144.7 kB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.7/64.7 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires requests==2.32.4, but you have requests 2.32.5 which is incompatible.[0m[31m
[0m

In [49]:
import pandas as pd
import numpy as np
import os
import random
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import GroupKFold

from tabpfn import TabPFNRegressor
from tabpfn.constants import ModelVersion

import torch

import gc

In [50]:
def corr_feature_selection(df, threshold=0.95):
    """Drop numeric columns that are highly correlated with an earlier column.

    The absolute correlation matrix of the numeric columns is computed and only
    its strict upper triangle is inspected, so for every correlated pair the
    column that appears later in the frame is the one that gets removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; non-numeric columns are ignored for the correlation.
    threshold : float, default 0.95
        A column is dropped when its absolute correlation with any earlier
        column is strictly greater than this value.

    Returns
    -------
    (pandas.DataFrame, list)
        The frame without the dropped columns, and the names of those columns.
    """
    abs_corr = df.corr(numeric_only=True).abs()

    # Mask everything except the strict upper triangle (k=1 skips the diagonal).
    upper_mask = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
    upper_tri = abs_corr.where(upper_mask)

    # NaN entries (masked cells) compare as False, so they never trigger a drop.
    flagged = (upper_tri > threshold).any(axis=0)
    drop_cols = flagged[flagged].index.tolist()

    return df.drop(columns=drop_cols), drop_cols

# setting

In [51]:
# Base folder for every input/output file of this notebook (parquet features,
# .npy targets, sample submission and the written submission).
# NOTE(review): absolute Colab Drive path -- adjust when running elsewhere.
file_path = '/content/drive/MyDrive/1데이콘/K리그-서울시립대공개AI경진대회/dataset/'

# Single seed used for the RNGs and for the model's random_state.
SEED = 810

# Run on the GPU when one is visible to PyTorch, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = 'cpu'

In [52]:
def seed_every(seed=SEED):
    """Seed the Python, NumPy and PyTorch RNGs and make cuDNN deterministic.

    Parameters
    ----------
    seed : int
        Value handed to every seeding call; defaults to the notebook-wide SEED.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Same seed for every generator the notebook touches.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)

    # Trade cuDNN auto-tuning for repeatable kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


seed_every(SEED)

In [53]:
def euclid_np(pred_xy, true_xy):
    """Return the mean row-wise Euclidean distance between two (n, d) arrays.

    Parameters
    ----------
    pred_xy, true_xy : array-like of identical shape
        One point per row; the distance is taken across axis 1.

    Returns
    -------
    float
        Average of the per-row distances.
    """
    squared_diff = np.square(pred_xy - true_xy)
    row_dist = np.sqrt(squared_diff.sum(axis=1))
    return float(np.mean(row_dist))

# modeling

In [54]:
def make_model():
    """Create a TabPFN v2 regressor and apply the notebook's overrides.

    The model starts from `create_default_for_version(ModelVersion.V2)`; each
    override below is then written onto the instance as a plain attribute.

    NOTE(review): the overrides are set after construction rather than passed
    to the constructor -- confirm that the installed TabPFN version actually
    reads every one of them (in particular `batch_size_inference` and
    `subsample_features`).

    Returns
    -------
    TabPFNRegressor
        An unfitted regressor.
    """
    model = TabPFNRegressor.create_default_for_version(ModelVersion.V2)

    overrides = {
        "ignore_pretraining_limits": True,
        "n_estimators": 16,
        "average_before_softmax": True,
        "batch_size_inference": 32,
        "subsample_features": 0.8,
        "random_state": SEED,
        "device": device,
    }
    for attr_name, attr_value in overrides.items():
        setattr(model, attr_name, attr_value)

    return model

In [55]:
def fit(Xtr, yd):
    """Build a fresh model with make_model(), fit it on (Xtr, yd) and return it.

    Parameters
    ----------
    Xtr : feature matrix passed straight to the model's `fit`.
    yd : target vector passed straight to the model's `fit`.

    Returns
    -------
    The fitted model instance created by make_model().
    """
    regressor = make_model()
    regressor.fit(Xtr, yd)
    return regressor

# Data Load

In [96]:
# Raw inputs as stored on disk: feature tables plus the dx / dy target arrays.
train_df, test_df = (
    pd.read_parquet(file_path + name)
    for name in ('x_train_full.parquet', 'x_test_full.parquet')
)
y_dx_t, y_dy_t = (np.load(file_path + name) for name in ('y_dx.npy', 'y_dy.npy'))

# Working copies, so the objects loaded above stay untouched by later cells.
train, test = train_df.copy(), test_df.copy()
y_dx, y_dy = y_dx_t.copy(), y_dy_t.copy()

In [100]:
# Quick look at the player-id column, which is later used as a target-encoding key.
train['player_id_curr_t0']

Unnamed: 0,player_id_curr_t0
0,259
1,214
2,445
3,412
4,6
...,...
15430,303
15431,204
15432,294
15433,97


# CV Loop

In [97]:
# Start coordinates are set aside (and removed from the features below) so the
# predicted dx / dy can be added back to obtain absolute positions.
train_start_x, train_start_y = (train[c].to_numpy() for c in ("start_x_t0", "start_y_t0"))
test_start_x, test_start_y = (test[c].to_numpy() for c in ("start_x_t0", "start_y_t0"))

# Submission keys for the test rows and the grouping key for GroupKFold.
test_eps = test["game_episode"].tolist()
train_game = train["game_id"].to_numpy()
groups = train_game

# Feature matrices: identifier / start columns removed, missing values -> 0.0.
# `game_id` is intentionally still present here; it is dropped inside the CV loop.
drop_cols = ["game_episode", "start_x_t0", "start_y_t0"]
x_all, x_test = (frame.drop(columns=drop_cols).fillna(0.0) for frame in (train, test))

In [85]:
# Number of splits of the outer GroupKFold (one validation score and one set of
# test predictions is produced per outer fold).
N_OUTER_FOLDS = 5
# Number of splits of the inner GroupKFold that yields the out-of-fold stage-1
# predictions inside every outer fold.
N_INNER_FOLDS = 3

# When True, predicted coordinates are clipped to [0, 105] x [0, 68].
CLIP_OUTPUT = True

In [86]:
# Outer splitter; rows are grouped by `groups` when it is used in the CV loop.
outer_gkf = GroupKFold(n_splits=N_OUTER_FOLDS)

# Per-outer-fold accumulators: validation distance and test-set x / y predictions.
best_vals, pred_test_x_folds, pred_test_y_folds = [], [], []

In [102]:
def add_target_encoding(X_train, y_dx_train, y_dy_train, X_val, X_test, col_name="player_id_curr_t0"):
    """Append mean-target encodings of `col_name` to train / val / test frames.

    For every distinct value of `col_name` in `X_train` the mean of
    `y_dx_train` and of `y_dy_train` is computed. Two columns,
    `{col_name}_mean_dx` and `{col_name}_mean_dy`, are appended to a copy of
    each input frame. Values of `col_name` that were not seen in `X_train`
    receive the global training mean of the respective target.

    Unlike a `DataFrame.merge`-based implementation, the returned frames keep
    the row order AND the index of their inputs, and the inputs themselves are
    never modified.

    NOTE(review): the statistics are computed from, and applied to, the same
    training rows, so every training row's encoding contains its own target.
    Consider out-of-fold encoding if this leakage matters for the model.

    Parameters
    ----------
    X_train, X_val, X_test : pandas.DataFrame
        Frames that all contain `col_name`.
    y_dx_train, y_dy_train : array-like, length == len(X_train)
        Targets, matched to the rows of `X_train` by position.
    col_name : str
        Categorical key column to encode.

    Returns
    -------
    tuple of three pandas.DataFrame
        Encoded copies of `X_train`, `X_val` and `X_test`.

    Raises
    ------
    ValueError
        If a target array does not have one entry per training row.
    KeyError
        If `col_name` is missing from one of the frames.
    """
    # Positional arrays: avoids accidental index alignment when a Series is passed.
    dx = np.asarray(y_dx_train, dtype=float)
    dy = np.asarray(y_dy_train, dtype=float)
    if len(dx) != len(X_train) or len(dy) != len(X_train):
        raise ValueError(
            "y_dx_train and y_dy_train must have one entry per row of X_train"
        )

    # Per-category target means, indexed by the category value.
    per_key = (
        pd.DataFrame({"key": X_train[col_name].to_numpy(), "dx": dx, "dy": dy})
        .groupby("key")[["dx", "dy"]]
        .mean()
    )
    # Fallback for categories that never occur in the training rows.
    global_mean = {"dx": dx.mean(), "dy": dy.mean()}

    def _encode(frame):
        # Look the statistics up row by row; index and row order stay untouched.
        encoded = frame.copy()
        for target in ("dx", "dy"):
            encoded[f"{col_name}_mean_{target}"] = (
                encoded[col_name].map(per_key[target]).fillna(global_mean[target])
            )
        return encoded

    return _encode(X_train), _encode(X_val), _encode(X_test)

In [None]:
# Start from empty accumulators so that re-running this cell does not append a
# second set of fold results to lists filled by a previous run.
best_vals = []
pred_test_x_folds = []
pred_test_y_folds = []

for fold, (tr_idx, val_idx) in enumerate(outer_gkf.split(x_all, groups = groups)):
  print(f"\n===== Outer Fold {fold} =====")

  # ---- Data split ----------------------------------------------------------
  x_tr = x_all.iloc[tr_idx].copy()
  x_val = x_all.iloc[val_idx].copy()

  ydx_tr, ydy_tr = y_dx[tr_idx], y_dy[tr_idx]
  ydx_val, ydy_val = y_dx[val_idx], y_dy[val_idx]

  val_sx = train_start_x[val_idx]
  val_sy = train_start_y[val_idx]

  x_test_curr = x_test.copy()

  # ---- Target encoding (statistics come from this fold's training rows) ----
  for te_col in ('player_id_curr_t0', 'team_id_curr_t0', 'new_opp_team_id_curr_t0'):
    x_tr, x_val, x_test_curr = add_target_encoding(
        x_tr, ydx_tr, ydy_tr, x_val, x_test_curr, col_name=te_col
    )

  # game_id is only the grouping key; remove it before training.
  drop_features = ['game_id']
  x_tr = x_tr.drop(columns=drop_features, errors = 'ignore')
  x_val = x_val.drop(columns=drop_features, errors = 'ignore')
  x_test_curr = x_test_curr.drop(columns=drop_features, errors = 'ignore')

  # ---- Stage 1a: inner CV -> out-of-fold stage-1 predictions ---------------
  # NOTE(review): the target-encoding columns of x_tr were computed from all
  # outer-training targets, so the inner validation rows see statistics that
  # include their own targets; the OOF predictions may be optimistic.
  inner_groups = groups[tr_idx]
  inner_gkf = GroupKFold(n_splits=N_INNER_FOLDS)

  oof_pred1_dx = np.zeros(len(tr_idx), dtype=np.float64)
  oof_pred1_dy = np.zeros(len(tr_idx), dtype=np.float64)

  # Positional index so the inner split indices line up with the target arrays.
  x_tr_reset = x_tr.reset_index(drop=True)

  for itr, iva in inner_gkf.split(x_tr_reset, groups=inner_groups):
    x_itr = x_tr_reset.iloc[itr]
    x_iva = x_tr_reset.iloc[iva]

    mx1, my1 = fit(x_itr, ydx_tr[itr]), fit(x_itr, ydy_tr[itr])

    oof_pred1_dx[iva] = mx1.predict(x_iva)
    oof_pred1_dy[iva] = my1.predict(x_iva)
    gc.collect()

  # Residuals that stage 2 has to learn.
  res_tr_dx = ydx_tr - oof_pred1_dx
  res_tr_dy = ydy_tr - oof_pred1_dy

  x_tr_stage2 = x_tr_reset.copy()
  x_tr_stage2["pred1_dx"] = oof_pred1_dx
  x_tr_stage2["pred1_dy"] = oof_pred1_dy

  # ---- Stage 1b: final stage-1 models on all outer-training rows -----------
  mx1_full, my1_full = fit(x_tr, ydx_tr), fit(x_tr, ydy_tr)

  pred1_val_dx = mx1_full.predict(x_val)
  pred1_val_dy = my1_full.predict(x_val)
  pred1_test_dx = mx1_full.predict(x_test_curr)
  pred1_test_dy = my1_full.predict(x_test_curr)

  # ---- Stage 2: residual models on features + stage-1 predictions ----------
  mx2, my2 = fit(x_tr_stage2, res_tr_dx), fit(x_tr_stage2, res_tr_dy)

  x_val_stage2 = x_val.copy()
  x_val_stage2["pred1_dx"] = pred1_val_dx
  x_val_stage2["pred1_dy"] = pred1_val_dy

  x_te_stage2 = x_test_curr.copy()
  x_te_stage2["pred1_dx"] = pred1_test_dx
  x_te_stage2["pred1_dy"] = pred1_test_dy

  pred2_val_dx = mx2.predict(x_val_stage2)
  pred2_val_dy = my2.predict(x_val_stage2)

  pred2_test_dx = mx2.predict(x_te_stage2)
  pred2_test_dy = my2.predict(x_te_stage2)

  # ---- Combine: stage-1 prediction + predicted residual --------------------
  final_val_dx = pred1_val_dx + pred2_val_dx
  final_val_dy = pred1_val_dy + pred2_val_dy
  final_test_dx = pred1_test_dx + pred2_test_dx
  final_test_dy = pred1_test_dy + pred2_test_dy

  # ---- Validation score on absolute coordinates ----------------------------
  va_pred_x = val_sx + final_val_dx
  va_pred_y = val_sy + final_val_dy

  if CLIP_OUTPUT:
    va_pred_x = np.clip(va_pred_x, 0.0, 105.0)
    va_pred_y = np.clip(va_pred_y, 0.0, 68.0)

  va_true_x = val_sx + ydx_val
  va_true_y = val_sy + ydy_val

  val_dist = euclid_np(
      np.stack([va_pred_x, va_pred_y], axis=1),
      np.stack([va_true_x, va_true_y], axis=1),
      )
  best_vals.append(val_dist)
  print(f"Outer Fold {fold}: best val_dist = {val_dist:.5f}")

  # ---- Test predictions of this fold ---------------------------------------
  # The same names are used for computing, clipping and storing; the original
  # mixed `test_pred_*` and `te_pred_*`, which raised a NameError.
  te_pred_x = test_start_x + final_test_dx
  te_pred_y = test_start_y + final_test_dy

  if CLIP_OUTPUT:
    te_pred_x = np.clip(te_pred_x, 0.0, 105.0)
    te_pred_y = np.clip(te_pred_y, 0.0, 68.0)

  pred_test_x_folds.append(te_pred_x)
  pred_test_y_folds.append(te_pred_y)

  gc.collect()


===== Outer Fold 0 =====


tabpfn-v2-regressor.ckpt:   0%|          | 0.00/44.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/37.0 [00:00<?, ?B/s]

# summary & submission

In [29]:
print("\n========== CV Summary (2-Stage Residual) ==========")
for f, v in enumerate(best_vals):
    print(f"Fold {f}: best val_dist = {v:.5f}")
print(f"Mean val_dist = {np.mean(best_vals):.5f}")
print(f"Std  val_dist = {np.std(best_vals):.5f}")

# Average the test predictions of all outer folds.
pred_test_x = np.mean(np.stack(pred_test_x_folds, axis=0), axis=0)
pred_test_y = np.mean(np.stack(pred_test_y_folds, axis=0), axis=0)

if CLIP_OUTPUT:
    pred_test_x = np.clip(pred_test_x, 0.0, 105.0)
    pred_test_y = np.clip(pred_test_y, 0.0, 68.0)

out_df = pd.DataFrame({
    "game_episode": test_eps,
    "end_x": pred_test_x,
    "end_y": pred_test_y,
})

# Only the key column of the sample file is used: it fixes the row order of the
# submission, and merging on it alone cannot create `_x` / `_y` suffixed columns.
sample_sub = pd.read_csv(file_path + 'sample_submission.csv')
sub = sample_sub[["game_episode"]].merge(out_df, on="game_episode", how="left")

# Fail loudly instead of silently writing NaN rows into the submission.
missing = sub["end_x"].isna() | sub["end_y"].isna()
if missing.any():
    raise ValueError(
        f"{int(missing.sum())} row(s) of the sample submission have no valid prediction "
        "(game_episode missing from the test predictions, or NaN prediction)"
    )

# The written file is always clipped, independent of CLIP_OUTPUT.
sub["end_x"] = sub["end_x"].clip(0.0, 105.0)
sub["end_y"] = sub["end_y"].clip(0.0, 68.0)

sub = sub[["game_episode", "end_x", "end_y"]]

# NOTE(review): the file name has no `.csv` extension -- confirm this is intended.
out_path = file_path + 'output/submission4'
os.makedirs(os.path.dirname(out_path), exist_ok=True)
sub.to_csv(out_path, index=False)
print("Saved")