In [None]:
import pandas as pd
import numpy as np
import os

import warnings
warnings.filterwarnings('ignore')

from sklearn.cluster import KMeans

In [None]:
# Dataset directory; the CSV reads and writes below are built by appending file names to this prefix
# (it must keep its trailing '/' because paths are formed with plain string concatenation).
file_path = '/content/drive/MyDrive/1데이콘/K리그-서울시립대공개AI경진대회/dataset/'

# train + test + match_info

In [None]:
# Load the three raw tables: train events, the test index and match metadata.
try:
  train_df = pd.read_csv(file_path + 'train.csv')
  test_df = pd.read_csv(file_path + 'test.csv')
  match_df = pd.read_csv(file_path + 'match_info.csv')
except Exception as e:
  print(f'data load fail: {e}')
  # Re-raise: swallowing the error here would only postpone the failure to the
  # shape print below, where it shows up as a misleading NameError.
  raise
else:
  print('data load success')

# Check the shape of each table
print(f'train: {train_df.shape}, test: {test_df.shape}, match: {match_df.shape}')

data load success
train: (356721, 15), test: (2414, 3), match: (228, 17)


In [None]:
# Read every per-episode event file listed in test_df['path'] and stack them into one frame.
test_events_list = []
for rel_path in test_df['path']:
  # Remove only a literal leading './'. str.lstrip('./') strips ANY run of '.' and '/'
  # characters, which would silently mangle paths such as '../x.csv' or '.hidden/x.csv'.
  if rel_path.startswith('./'):
    rel_path = rel_path[2:]
  test_events_list.append(pd.read_csv(os.path.join(file_path, rel_path)))

test_events = pd.concat(test_events_list, ignore_index=True)
print(test_events.shape)

In [None]:
# Tag each source frame (1 = train, 0 = test) and stack them into a single event table.
for source_frame, train_flag in ((train_df, 1), (test_events, 0)):
  source_frame['is_train'] = train_flag

all_df = pd.concat(
    [train_df, test_events],
    ignore_index=True,
    sort=False,
)
all_df['is_train'] = all_df['is_train'].astype(int)

In [None]:
# Missing-value count per column of the stacked test events
test_events.isna().sum()

Unnamed: 0,0
game_id,0
period_id,0
episode_id,0
time_seconds,0
team_id,0
player_id,0
action_id,0
type_name,0
result_name,21129
start_x,0


In [None]:
# Keep one match_info row per game_id, then attach the match columns to every event (left join).
match_df = match_df.drop_duplicates(subset='game_id')
merged_df = pd.merge(all_df, match_df, how='left', on='game_id')

In [None]:
# Persist the merged table. The path is built from file_path (same directory as before) so the
# write location cannot drift from the read that loads file_path + 'merged_df.csv'.
merged_df.to_csv(file_path + 'merged_df.csv', index=False)

# 데이터 로드

In [None]:
# Load the merged event + match table from merged_df.csv under file_path.
merged_df = pd.read_csv(file_path + 'merged_df.csv')

In [None]:
# Work on a copy of the merged table and split it back into train / test rows via the is_train flag.
all_df = merged_df.copy()

origin_train_df = all_df.loc[all_df['is_train'].eq(1)].copy()
origin_test_df = all_df.loc[all_df['is_train'].eq(0)].copy()

# 데이터 증강

In [None]:
def augment_data(df_to_augment):
    """Return a mirrored copy of the events.

    Both y coordinates are reflected as ``68.0 - y`` and ``'_aug'`` is appended to the
    (stringified) ``game_episode`` so mirrored episodes get their own keys.
    The input frame is left unchanged.
    """
    mirror_axis = 68.0
    return df_to_augment.assign(
        start_y=lambda frame: mirror_axis - frame['start_y'],
        end_y=lambda frame: mirror_axis - frame['end_y'],
        game_episode=lambda frame: frame['game_episode'].astype(str) + '_aug',
    )

# Mirror the training rows only, then stack: original train + mirrored train + test.
augment_train_df = augment_data(origin_train_df)

aug_df = pd.concat(
    [origin_train_df, augment_train_df, origin_test_df],
    ignore_index=True,
)

# Rows whose episode key does not carry the '_aug' marker, i.e. the non-mirrored events
is_augmented = aug_df['game_episode'].str.contains('_aug')
origin_df = aug_df.loc[~is_augmented].copy()

# match_info process

In [None]:
def match_process(match_df):
  """Add kick-off time, rest-day and start-slot features to a frame with match columns.

  Requires the columns ``game_date``, ``home_team_id`` and ``away_team_id``.
  Returns a copy with these extra columns: ``match_date_kst``, ``new_match_hour``,
  ``new_is_weekend``, ``date_str``, ``home_rest``, ``away_rest``, ``new_rest_diff``
  and ``new_best_start_time``.
  """
  out = match_df.copy()

  # Kick-off timestamp shifted by +9 hours (presumably UTC -> KST; confirm against the data source)
  out['match_date_kst'] = pd.to_datetime(out['game_date']) + pd.Timedelta(hours=9)
  kickoff = out['match_date_kst']
  out['new_match_hour'] = kickoff.dt.hour
  out['new_is_weekend'] = (kickoff.dt.weekday >= 5).astype(int)

  # Long schedule: the home rows followed by the away rows, one (kick-off, team) pair each
  per_side = [
      out[['match_date_kst', side_col]].rename(columns={side_col: 'team_id'})
      for side_col in ('home_team_id', 'away_team_id')
  ]
  schedule = pd.concat(per_side, ignore_index=True)
  schedule['only_date'] = schedule['match_date_kst'].dt.normalize()
  schedule = schedule.drop_duplicates().sort_values(['team_id', 'only_date'])

  # Days since the same team's previous match day; 7 when there is none, clipped to [0, 14]
  previous_day = schedule.groupby('team_id')['only_date'].shift(1)
  days_off = (schedule['only_date'] - previous_day).dt.days
  schedule['rest_days'] = days_off.fillna(7).clip(0, 14)

  # (team_id, 'YYYY-MM-DD') -> rest days
  rest_map = dict(zip(
      zip(schedule['team_id'], schedule['only_date'].dt.date.astype(str)),
      schedule['rest_days'],
  ))
  rest_lookup = pd.Series(rest_map)

  # Rest days of each side on the match day (7 when the key is missing) and their difference
  out['date_str'] = out['match_date_kst'].dt.date.astype(str)
  for side in ('home', 'away'):
    side_keys = list(zip(out[f'{side}_team_id'], out['date_str']))
    out[f'{side}_rest'] = rest_lookup.reindex(side_keys).fillna(7).values

  out['new_rest_diff'] = out['home_rest'] - out['away_rest']

  # Start-slot flag: weekend kick-offs at 14/16/19h, or weekday kick-offs at 19h
  weekend_slot = (out['new_is_weekend'] == 1) & out['new_match_hour'].isin([14, 16, 19])
  weekday_slot = (out['new_is_weekend'] == 0) & (out['new_match_hour'] == 19)
  out['new_best_start_time'] = np.where(weekend_slot | weekday_slot, 1, 0)

  return out

# train+test process

In [None]:
# Player role variable
def player_roles(df):
    """Map every ``player_id`` to a coarse role code.

    Role 0 is given to players who have at least one 'Catch' or 'Parry' event (treated as
    goalkeepers). The remaining players are clustered with KMeans on their mean ``start_x``
    and numbered 1..k by ascending cluster centre.

    k is ``min(3, number of field players)``: KMeans raises ValueError when asked for more
    clusters than samples, so with only one or two field players fewer roles are produced
    instead of crashing. With three or more field players the result is the same as a fixed
    3-cluster fit.

    Parameters: df - events with ``player_id``, ``start_x`` and ``type_name`` columns.
    Returns: dict ``{player_id: role}``; empty when df has no rows.
    """
    # Per-event keeper flag aggregated with any(), instead of collecting every event name
    # of a player into a Python list.
    events = pd.DataFrame({
        'player_id': df['player_id'].to_numpy(),
        'start_x': df['start_x'].to_numpy(),
        'is_gk': df['type_name'].isin(['Catch', 'Parry']).to_numpy(),
    })
    stats = (
        events.groupby('player_id')
        .agg(start_x=('start_x', 'mean'), is_gk=('is_gk', 'any'))
        .reset_index()
    )

    # Goalkeepers keep role 0; field players are overwritten below
    stats['player_role'] = 0
    field = stats.loc[~stats['is_gk'], ['start_x']]

    n_clusters = min(3, len(field))
    if n_clusters > 0:
      km = KMeans(n_clusters=n_clusters, random_state=810, n_init=10).fit(field)
      # Relabel clusters so that role 1 has the smallest centre and role k the largest
      centre_order = np.argsort(km.cluster_centers_.flatten())
      role_of_label = {label: rank + 1 for rank, label in enumerate(centre_order)}
      stats.loc[field.index, 'player_role'] = pd.Series(km.labels_, index=field.index).map(role_of_label)

    player_roles_map = dict(zip(stats['player_id'], stats['player_role']))

    return player_roles_map

# NOTE: this rebinds the name `player_roles` from the function to the dict it returns.
# add_player_features reads that dict through this global name, and the function can only be
# called again after its defining cell has been re-run.
player_roles = player_roles(origin_df)

In [None]:
# Per-player characteristic variables
def player_features(df):
    """Average movement profile of every player.

    For each ``player_id`` returns the mean of: step length (``step_dist``), x and y
    displacement (``step_dx``, ``step_dy``, computed as end - start) and the start
    coordinates (``start_x``, ``start_y``).

    Returns: dict ``{player_id: {'step_dist': .., 'step_dx': .., 'step_dy': ..,
    'start_x': .., 'start_y': ..}}``.
    """
    shift_x = df['end_x'] - df['start_x']
    shift_y = df['end_y'] - df['start_y']

    per_event = df[['player_id']].copy()
    per_event['step_dist'] = np.sqrt(shift_x ** 2 + shift_y ** 2)
    per_event['step_dx'] = shift_x
    per_event['step_dy'] = shift_y
    per_event['start_x'] = df['start_x']
    per_event['start_y'] = df['start_y']

    averages = per_event.groupby('player_id').mean()
    return averages.to_dict('index')

# NOTE: this rebinds the name `player_features` from the function to the dict it returns.
# add_player_features reads that dict through this global name.
player_features =  player_features(origin_df)

In [None]:
# Event type encoding
def types_encoder(df):
  """Return ``{type_name: integer code}``, codes assigned in order of first appearance."""
  distinct_types = df['type_name'].drop_duplicates().tolist()
  return dict(zip(distinct_types, range(len(distinct_types))))

# Build the type_name -> integer code lookup from the non-augmented events;
# add_player_features reads it through this global name.
type_map = types_encoder(origin_df)

In [None]:
def add_player_features(df):
  """Attach player-level and event-type features to every event row.

  Reads three module-level lookups: ``player_roles`` (player_id -> role code),
  ``player_features`` (player_id -> dict of mean statistics) and ``type_map``
  (type_name -> integer code).

  Added columns: ``new_player_roles``, ``new_player_avg_dist``, ``new_player_avg_dx``,
  ``new_player_avg_dy``, ``new_player_avg_start_x``, ``new_player_avg_start_y``,
  ``new_type_id`` and ``new_is_setpiece``. Players missing from a lookup get NaN.

  Returns a copy; the input frame is not modified.
  """
  df = df.copy()

  # Player role + per-player averages
  df['new_player_roles'] = df['player_id'].map(player_roles)

  # Output column -> key inside each player's statistics dict. The start-position keys are
  # 'start_x' / 'start_y'; looking up 'start_dx' / 'start_dy' never matches and fills both
  # columns with NaN.
  stat_key_for = {
      'new_player_avg_dist': 'step_dist',
      'new_player_avg_dx': 'step_dx',
      'new_player_avg_dy': 'step_dy',
      'new_player_avg_start_x': 'start_x',
      'new_player_avg_start_y': 'start_y',
  }
  for out_col, stat_key in stat_key_for.items():
    df[out_col] = df['player_id'].map(
        lambda pid, key=stat_key: player_features.get(pid, {}).get(key, np.nan)
    )

  # Event type code and set-piece flag
  df['new_type_id'] = df['type_name'].map(type_map)

  # Substring match: a type counts as a set piece when its name contains any of these keywords
  sp_keys = ['Corner','Freekick','Penalty','Kick off','Throw-In']
  df['new_is_setpiece'] = df['type_name'].astype(str).apply(lambda x: 1 if any(k in x for k in sp_keys) else 0)

  return df

In [None]:
def add_match_features(df):
  """Order events inside each episode and add team-relative rest-day columns.

  Rows are sorted by (``game_episode``, ``action_id``). Added columns:
  ``player_id_num`` (first run of digits in ``player_id`` as float),
  ``is_home`` (created as 0 when absent, then cast to int),
  ``new_current_team_rest`` and ``new_opp_team_rest`` (picked from ``home_rest`` /
  ``away_rest`` depending on ``is_home``).

  NOTE(review): when the input has no ``is_home`` column every row is treated as the away
  side - confirm this fallback is intended.
  """
  out = df.sort_values(['game_episode', 'action_id'])

  digits = out['player_id'].astype(str).str.extract(r'(\d+)', expand=False)
  out['player_id_num'] = digits.astype(float)

  if 'is_home' not in out.columns:
    out['is_home'] = 0
  out['is_home'] = out['is_home'].astype(int)

  # Rest days seen from the acting team's side and from its opponent's side
  acting_at_home = out['is_home'].eq(1)
  out['new_current_team_rest'] = np.where(acting_at_home, out['home_rest'], out['away_rest'])
  out['new_opp_team_rest'] = np.where(acting_at_home, out['away_rest'], out['home_rest'])

  return out

In [None]:
def add_motion_features(df):
  """Add displacement, direction, distance-statistics and speed features per event.

  Requires ``time_seconds``, ``start_x``, ``start_y``, ``end_x``, ``end_y`` and
  ``game_episode``. Rows are expected to be already ordered inside each episode, because
  the time difference is taken between consecutive rows of the same ``game_episode``.
  Returns a copy with the new columns appended.
  """
  out = df.copy()

  out['new_time_norm'] = out['time_seconds'] / 3600.0

  # Raw displacement of the event (end - start), its length and its heading
  move_x = out['end_x'] - out['start_x']
  move_y = out['end_y'] - out['start_y']
  out['new_current_dx'] = move_x
  out['new_current_dy'] = move_y
  out['new_current_dist'] = np.sqrt(move_x**2 + move_y**2)
  out['new_current_angle'] = np.arctan2(move_y, move_x)

  # Coordinates scaled by 105 (x) and 68 (y)
  for axis, extent in (('x', 105.0), ('y', 68.0)):
    for point in ('start', 'end'):
      out[f'new_{point}_{axis}_norm'] = out[f'{point}_{axis}'] / extent

  # NOTE(review): the scaled displacement is start - end, i.e. the opposite sign of
  # new_current_dx / new_current_dy above - confirm this is intended.
  scaled_dx = out['new_start_x_norm'] - out['new_end_x_norm']
  scaled_dy = out['new_start_y_norm'] - out['new_end_y_norm']
  out['new_dx_norm'] = scaled_dx
  out['new_dy_norm'] = scaled_dy
  out['new_dist_norm'] = np.sqrt(scaled_dx**2 + scaled_dy**2)
  scaled_heading = np.arctan2(scaled_dy, scaled_dx)
  out['new_angle_norm'] = scaled_heading

  # Per-episode spread / mean of the scaled distance, and each event's deviation from the mean
  episode_dist = out.groupby('game_episode')['new_dist_norm']
  out['new_dist_std'] = episode_dist.transform('std')
  out['new_dist_mean'] = episode_dist.transform('mean')
  out['new_dist_trend'] = out['new_dist_norm'] - out['new_dist_mean']

  # Direction of the scaled displacement (same angle as new_angle_norm) and its sine / cosine
  out['new_vector_angle'] = scaled_heading
  out['new_sin_angle'] = np.sin(out['new_vector_angle'])
  out['new_cos_angle'] = np.cos(out['new_vector_angle'])

  # Speed = raw distance / seconds since the previous event of the episode; 0 when that gap is 0
  gap = out.groupby('game_episode')['time_seconds'].diff().fillna(0)
  out['time_delta'] = gap
  out['new_speed'] = (out['new_current_dist'] / gap).mask(gap == 0, 0)

  return out

In [None]:
def add_spatial_features(df):
  """Add position-based features computed from ``start_x`` / ``start_y``.

  Added columns: distance to the point (52.5, 34), distance to the point (105, 34), the
  absolute difference of the angles to (105, 30.34) and (105, 37.66), a 6 x 3 grid zone
  (``new_zone_x``, ``new_zone_y``, ``new_tactical_zone``), a flag for ``start_x`` beyond
  two thirds of 105, and the distance to the nearer of y = 0 and y = 68.
  Returns a copy; the input frame is not modified.
  """
  out = df.copy()
  x = out['start_x']
  y = out['start_y']

  # Distance from the centre point
  out['new_dist_from_center'] = np.sqrt((52.5 - x) ** 2 + (34 - y) ** 2)

  # Distance to the middle of the goal line at x = 105
  goal_line_x, goal_mid_y = 105, 34
  out['new_dist_to_goal'] = np.sqrt((goal_line_x - x) ** 2 + (goal_mid_y - y) ** 2)

  # Opening angle between the two goal-post points
  post_low_y, post_high_y = 30.34, 37.66
  run_to_goal_line = goal_line_x - x
  angle_low = np.arctan2(post_low_y - y, run_to_goal_line)
  angle_high = np.arctan2(post_high_y - y, run_to_goal_line)
  out['new_goal_open_angle'] = np.abs(angle_high - angle_low)

  # Grid zone: 6 columns along x, 3 rows along y, flattened row-major
  out['new_zone_x'] = (x / 105.0 * 6).astype(int).clip(0, 5)
  out['new_zone_y'] = (y / 68.0 * 3).astype(int).clip(0, 2)
  out['new_tactical_zone'] = out['new_zone_y'] * 6 + out['new_zone_x']

  out['new_final_third'] = (x > (105 / 3 * 2)).astype(int)

  out['new_dist_to_touchline'] = np.minimum(y, 68 - y)

  return out

In [None]:
def add_time_series_features(df):
  """Add timing / ordering features computed inside each ``game_episode``.

  Rows are expected to be already ordered inside each episode. Added columns:
  ``new_time_delta``, ``new_time_mean``, ``new_episode_time_delta``, ``new_match_phase``,
  ``new_event_idx``, ``event_count``, ``new_event_norm``, ``final_team_id`` and
  ``new_is_attack``. The returned frame has a fresh 0..n-1 index because the last step
  is a merge.
  """
  out = df.copy()

  # Seconds since the previous event of the episode (0 for its first event) and the episode mean
  out['new_time_delta'] = out.groupby('game_episode')['time_seconds'].diff().fillna(0)
  out['new_time_mean'] = out.groupby('game_episode')['new_time_delta'].transform('mean')

  # Seconds elapsed since the earliest event of the episode
  episode_start = out.groupby('game_episode')['time_seconds'].transform('min')
  out['new_episode_time_delta'] = out['time_seconds'] - episode_start

  # 900-second (15-minute) bucket of the raw timestamp
  out['new_match_phase'] = (out['time_seconds'] // 900).astype(int)

  # 0-based position inside the episode, episode length, and position scaled to [0, 1]
  out['new_event_idx'] = out.groupby('game_episode').cumcount()
  out['event_count'] = out.groupby('game_episode')['new_event_idx'].transform('max') + 1
  out['new_event_norm'] = out['new_event_idx'] / (out['event_count'] - 1).clip(lower=1)

  # Team of the episode's last row; an event is flagged 1 when its team equals that team
  closing_rows = out.groupby('game_episode', as_index=False).tail(1)
  closing_team = closing_rows[['game_episode', 'team_id']].rename(columns={'team_id': 'final_team_id'})
  out = out.merge(closing_team, on='game_episode', how='left')
  out['new_is_attack'] = (out['team_id'] == out['final_team_id']).astype(int)

  return out

In [None]:
# def add_lag_features(df):
#   df = df.copy()

#   grp = df.groupby('game_episode')

#   num_lag = 1
#   # lag 1 features
#   shift_cols = ['start_x', 'start_y', 'is_home',
#                 'new_current_dx', 'new_current_dy', 'new_current_dist', 'new_current_angle','new_speed',
#                 'new_is_setpiece','new_tactical_zone','new_final_third',
#                 'new_player_roles', 'new_type_id','new_is_attack',
#                 'player_id_num','time_seconds']

#   for i in range(1, num_lag + 1):
#     s = grp[shift_cols].shift(i)
#     s.columns = [f'new_lag_{i}_{c}' for c in shift_cols]
#     df = pd.concat([df, s], axis=1)

#     df['new_angle_to_goal_diff'] = np.abs(df['new_current_angle'] - df[f'new_lag_{i}_new_current_angle'])
#     df.loc[df['new_angle_to_goal_diff'] > np.pi, 'new_angle_to_goal_diff'] = 2 * np.pi - df['new_angle_to_goal_diff']

#     df[f'new_is_same_player_{i}'] = (df['player_id_num'] == df[f'new_lag_{i}_player_id_num']).astype(int)
#     df[f'new_is_same_role_{i}'] = (df['new_player_roles'] == df[f'new_lag_{i}_new_player_roles']).astype(int)
#     df[f'new_type_transition_{i}'] = df['new_type_id'] * 100 + df[f'new_lag_{i}_new_type_id'].fillna(-100)

#   return df

In [75]:
def add_opponent_team_id(df, team_col='team_id'):
  """Add ``new_opp_team_id``: the other team that appears in the same ``game_id``.

  Games in which exactly two distinct teams appear get the opposing team's id; every row
  of a game with any other number of teams gets -1.

  Parameters:
    df: events with a ``game_id`` column and the column named by ``team_col``.
    team_col: name of the column holding the acting team's id.
  Returns a copy of ``df`` with the extra column.
  """
  out = df.copy()

  # game_id -> (first team seen, second team seen), or (-1, -1) when the game is not a clean pair
  pair_of_game = {}
  for game, members in out.groupby('game_id')[team_col].unique().items():
    pair_of_game[game] = (members[0], members[1]) if len(members) == 2 else (-1, -1)

  first_side = out['game_id'].map(lambda game: pair_of_game[game][0])
  second_side = out['game_id'].map(lambda game: pair_of_game[game][1])

  own_team = out[team_col]
  when_not_first = np.where(own_team == second_side, first_side, -1)
  out['new_opp_team_id'] = np.where(own_team == first_side, second_side, when_not_first)
  return out

In [76]:
def create_features(df):
  """Run the feature pipeline on ``df`` and return the enriched frame.

  The steps are applied in a fixed order, each receiving the previous step's output.
  Lag features (add_lag_features) are currently disabled.
  """
  steps = (
      match_process,
      add_match_features,
      add_player_features,
      add_motion_features,
      add_spatial_features,
      add_time_series_features,
      add_opponent_team_id,
  )
  for step in steps:
    df = step(df)

  return df

In [77]:
# Build features for the combined frame (original train + mirrored train + test rows).
process_df = create_features(aug_df)

In [78]:
# Write the feature table to process_df.csv under file_path, without the index column.
process_df.to_csv(file_path + 'process_df.csv', index=False)