In [3]:
import pandas as pd
import numpy as np
import os
import random

import warnings
warnings.filterwarnings('ignore')

In [4]:
# 파일 경로
file_path = '/content/drive/MyDrive/1데이콘/K리그-서울시립대공개AI경진대회/dataset/'

In [29]:
train_df = pd.read_parquet(file_path + 'train_df.parquet')
test_df = pd.read_parquet(file_path + 'test_df.parquet')

train = train_df.copy()
test = test_df.copy()

#flatten to wide + Dataset

In [50]:
K = 3

num_cols = [
    'new_start_x_norm','new_start_y_norm',
    'new_end_x_norm','new_end_y_norm',
    'new_dx_norm','new_dy_norm','new_dist_norm',
    'new_cos_angle','new_sin_angle',
    'new_dist_from_center', 'new_goal_open_angle'
    'new_dist_to_goal','new_dist_to_touchline',
    'new_speed',
    'new_player_avg_dx','new_player_avg_dy','new_player_avg_dist'
]

obj_cols = ['is_home',
            'new_zone_x','new_zone_y',
            'new_final_third','new_tactical_zone',
            'new_is_attack','new_is_setpiece','new_player_roles',
            'new_type_id','result_name_curr','player_id_curr','team_id_curr','new_opp_team_id_curr'
            ]

context_cols = ['new_current_team_rest','new_opp_team_rest','new_rest_diff',
                'period_id','game_id','episode_id'
                'new_match_hour','new_match_phase','new_is_weekend','new_best_start_time',
                'new_player_avg_start_y','new_player_avg_start_x','new_player_avg_dx','new_player_avg_dy','new_player_avg_dist'
                ]

In [51]:
def build_wide_for_episode(df: pd.DataFrame, is_train: bool, K: int):
    df = df.copy()

    g = df.sort_values(['game_id']).reset_index(drop=True)
    L = len(df)
    if L <= 0:
        return None

    last = L - 1

    # Targets
    if is_train:
        ex = g.loc[last, "end_x"]
        ey = g.loc[last, "end_y"]
        sx = g.loc[last, "start_x"]
        sy = g.loc[last, "start_y"]
        if pd.isna(ex) or pd.isna(ey) or pd.isna(sx) or pd.isna(sy):
            return None
        y_dx = float(ex - sx)
        y_dy = float(ey - sy)

    # prev time
    time_norm = g["new_time_norm"].values
    dt = np.diff(time_norm, prepend=time_norm[0])
    dt = np.where(np.isfinite(dt), dt, 0.0)
    dt[0] = 0.0

    pad_num = 0.0
    pad_obj = -1

    feat = {}

    # Wide format construction (t0..tK)
    for i in range(K+1):
        tname = f"t{i}"
        idx = last - i

        if idx < 0:
            for c in num_cols:
                feat[f"{c}_{tname}"] = pad_num
            feat[f"dt_norm_{tname}"] = pad_num

            for c in obj_cols:
                feat[f"{c}_{tname}"] = pad_obj
            feat[f"mask_{tname}"] = 0
            continue

        row = g.iloc[idx]
        for c in num_cols:
            v = row.get(c, np.nan)
            if pd.isna(v):
                v = pad_num
            feat[f"{c}_{tname}"] = float(v)

        feat[f"dt_norm_{tname}"] = float(dt[idx]) if idx < len(dt) else 0.0

        for c in obj_cols:
            v = row.get(c, np.nan)
            if pd.isna(v):
                v = pad_obj
            feat[f"{c}_{tname}"] = int(v)

        feat[f"mask_{tname}"] = 1

    row0 = g.iloc[last]
    for c in context_cols:
      v = row0.get(c, -1)
      feat[c] = v

    # Prevent leakage: remove t0 end-based features
    leak_cols_t0 = ['new_end_x_norm','new_end_y_norm','new_dx_norm','new_dy_norm',
                    'new_dist_norm','new_cos_angle','new_sin_angle',
                    "new_speed",'new_time_norm']

    for c in leak_cols_t0:
        feat[f"{c}_t0"] = 0.0

    # Keep raw start coordinates for decoding
    feat["start_x_t0"] = float(g.loc[last, "start_x"])
    feat["start_y_t0"] = float(g.loc[last, "start_y"])

    if is_train:
        return feat, y_dx, y_dy
    else:
        return feat

In [52]:
def build_wide_dataset(df: pd.DataFrame, is_train: bool, K: int):
    rows = []
    ydxs, ydys = [], []

    df["game_episode"] = (
    df["game_id"].astype(str)
    + "_" + df["period_id"].astype(str)
    + "_" + df["episode_id"].astype(str)
)

    for ep, g in df.groupby("game_episode", sort=False):
        out = build_wide_for_episode(g, is_train=is_train, K=K)
        if out is None:
            continue

        if is_train:
            feat, ydx, ydy = out
            ydxs.append(ydx)
            ydys.append(ydy)
        else:
            feat = out

        feat["game_episode"] = ep
        feat["game_id"] = int(g["game_id"].iloc[0])
        rows.append(feat)

    X = pd.DataFrame(rows)
    if is_train:
        return X, np.array(ydxs, dtype=np.float32), np.array(ydys, dtype=np.float32)
    else:
        return X

In [53]:
print("Building wide train...")
x_train_full, y_dx, y_dy = build_wide_dataset(train, is_train=True, K=K)
print("Building wide test...")
x_test_full = build_wide_dataset(test, is_train=False, K=K)

Building wide train...
Building wide test...


# Save

In [54]:
print(x_train_full.shape)
print(x_train_full.memory_usage().sum() / 1024**2, "MB")

(15435, 142)
16.722003936767578 MB


In [55]:
x_train_full.to_parquet(file_path + "x_train_full.parquet")
x_test_full.to_parquet(file_path  + "x_test_full.parquet")

np.save(file_path + "y_dx.npy", y_dx)
np.save(file_path + "y_dy.npy", y_dy)

In [56]:
x_train_full['player_id_curr']

KeyError: 'player_id_curr'