In [None]:
import pandas as pd
import numpy as np
import os

import warnings
warnings.filterwarnings('ignore')

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

In [None]:
# Dataset directory; used as the string prefix for the CSV reads/writes in this notebook.
# NOTE(review): looks like a Colab Google Drive mount — adjust when running elsewhere.
file_path = '/content/drive/MyDrive/1데이콘/K리그-서울시립대공개AI경진대회/dataset/'

# train + test + match_info

In [None]:
# Load the raw tables (train events, test index, match metadata).
try:
  train_df = pd.read_csv(file_path + 'train.csv')
  test_df = pd.read_csv(file_path + 'test.csv')
  match_df = pd.read_csv(file_path + 'match_info.csv')
  print('data load success')
except Exception as e:
  # Report and re-raise: carrying on would only fail a few lines later with a
  # misleading NameError on the shape check below.
  print(f'data load fail: {e}')
  raise

# Check the shape of each table.
print(f'train: {train_df.shape}, test: {test_df.shape}, match: {match_df.shape}')

data load success
train: (356721, 15), test: (2414, 3), match: (228, 17)


In [None]:
# Read the per-episode event file referenced by each test row and stack them.
test_events_list = []
for re_path in test_df['path']:
  # Remove only a literal leading './'. str.lstrip('./') strips any run of '.'
  # and '/' characters, which would also mangle paths such as '../x' or '.hidden/x'.
  if re_path.startswith('./'):
    re_path = re_path[2:]
  re_path = os.path.join(file_path, re_path)
  df_ep = pd.read_csv(re_path)
  test_events_list.append(df_ep)

test_events = pd.concat(test_events_list, ignore_index=True)
print(test_events.shape)

In [None]:
# Stack train and test events into one frame.
# Each source is flagged first so the two splits can be separated again later.
train_df['is_train'] = 1
test_events['is_train'] = 0

all_df = (
    pd.concat([train_df, test_events], sort=False, ignore_index=True)
    .astype({'is_train': int})
)

In [None]:
# Count missing values per column in the stacked test events.
test_events.isnull().sum()

Unnamed: 0,0
game_id,0
period_id,0
episode_id,0
time_seconds,0
team_id,0
player_id,0
action_id,0
type_name,0
result_name,21129
start_x,0


In [None]:
# Attach match metadata to every event row.
# Keep a single match_info row per game_id so the left join cannot multiply event rows.
match_df = match_df.drop_duplicates('game_id')
merged_df = all_df.merge(match_df, on = 'game_id', how = 'left')

In [None]:
# Persist the merged table next to the raw data. The directory comes from
# file_path so that the location is configured in exactly one place.
merged_df.to_csv(file_path + 'merged_df.csv', index=False)

# 데이터 로드

In [None]:
# Reload the merged events + match table from merged_df.csv under file_path.
merged_df = pd.read_csv(file_path + 'merged_df.csv')

In [None]:
# Work on a copy so merged_df itself is left untouched by the steps below.
all_df = merged_df.copy()

# 데이터 증강

In [None]:
def augment_data(df_to_augment):
    """Return a copy of the frame mirrored across the pitch's long axis.

    'start_y' and 'end_y' are replaced by 68.0 minus their value and every
    'game_episode' gets the suffix '_aug', so mirrored rows stay
    distinguishable from the originals. The input frame is not modified.
    """
    pitch_width = 68.0
    return df_to_augment.assign(
        start_y=pitch_width - df_to_augment['start_y'],
        end_y=pitch_width - df_to_augment['end_y'],
        game_episode=df_to_augment['game_episode'].astype(str) + '_aug',
    )

# Split the combined frame back into its train and test rows.
origin_train_df = all_df[all_df['is_train'] == 1].copy()
origin_test_df = all_df[all_df['is_train'] == 0].copy()

# Only the training rows are mirrored; test rows are kept as they are.
augment_train_df = augment_data(origin_train_df)

# Full working set: original train + mirrored train + test.
aug_df = pd.concat([origin_train_df, augment_train_df, origin_test_df], ignore_index=True)

# Rows whose game_episode does not contain '_aug', i.e. the un-mirrored train rows plus the test rows.
origin_df = aug_df[~aug_df['game_episode'].str.contains('_aug')].copy()

# match_info process

In [None]:
def match_process(match_df):
  """Add kickoff-time and rest-day features to a frame carrying match columns.

  Reads 'game_date', 'home_team_id' and 'away_team_id'. Returns a copy with:
    match_date_kst       game_date shifted by +9 hours
    new_match_hour       hour of match_date_kst
    new_is_weekend       1 when match_date_kst falls on Saturday/Sunday
    date_str             calendar date of match_date_kst as text
    new_home_rest        days since the home team's previous match date
    new_away_rest        same for the away team
    new_rest_diff        new_home_rest - new_away_rest
    new_best_start_time  1 for weekend kickoffs at 14/16/19h or weekday kickoffs at 19h

  Rest days default to 7 when no earlier match is known and are clipped to [0, 14].
  """
  out = match_df.copy()

  # Kickoff time (+9h) and weekday information.
  kickoff = pd.to_datetime(out['game_date']) + pd.Timedelta(hours=9)
  out['match_date_kst'] = kickoff
  out['new_match_hour'] = kickoff.dt.hour
  out['new_is_weekend'] = (kickoff.dt.weekday >= 5).astype(int)

  # One schedule row per (team, kickoff), regardless of home/away side.
  schedule = pd.concat(
      [
          out[['match_date_kst', side]].rename(columns={side: 'team_id'})
          for side in ('home_team_id', 'away_team_id')
      ],
      ignore_index=True,
  )
  schedule['only_date'] = schedule['match_date_kst'].dt.normalize()
  schedule = schedule.drop_duplicates().sort_values(['team_id', 'only_date'])

  # Rest days: gap to the same team's previous match date.
  schedule['prev_date'] = schedule.groupby('team_id')['only_date'].shift(1)
  gap_days = (schedule['only_date'] - schedule['prev_date']).dt.days
  schedule['rest_days'] = gap_days.fillna(7).clip(0, 14)

  # Lookup keyed by (team_id, 'YYYY-MM-DD').
  lookup_keys = zip(schedule['team_id'], schedule['only_date'].dt.date.astype(str))
  rest_lookup = pd.Series(dict(zip(lookup_keys, schedule['rest_days'])))

  # Rest days of each side on the match date, and their difference.
  out['date_str'] = out['match_date_kst'].dt.date.astype(str)
  for side in ('home', 'away'):
    wanted = list(zip(out[f'{side}_team_id'], out['date_str']))
    out[f'new_{side}_rest'] = rest_lookup.reindex(wanted).fillna(7).values

  out['new_rest_diff'] = out['new_home_rest'] - out['new_away_rest']

  # Kickoff slot classification.
  weekend_slot = (out['new_is_weekend'] == 1) & out['new_match_hour'].isin([14, 16, 19])
  weekday_slot = (out['new_is_weekend'] == 0) & (out['new_match_hour'] == 19)
  out['new_best_start_time'] = np.where(weekend_slot | weekday_slot, 1, 0)

  return out

# Add the kickoff-time / rest-day columns to the event table (original + mirrored + test rows).
process_df = match_process(aug_df)

# train+test process

In [None]:
# 선수 유형 변수
def player_roles(df):
    """Map each player_id to a coarse role code.

    0 is assigned to players with at least one 'Catch' or 'Parry' event
    (treated as goalkeepers). Every other player is clustered with KMeans
    (3 clusters) on their mean start_x, and the clusters are numbered 1..3
    in increasing order of the cluster centre.

    Returns a dict {player_id: role}.
    """
    events = df.copy()
    per_player = (
        events.groupby('player_id')
        .agg({'start_x': 'mean', 'type_name': list})
        .reset_index()
    )

    # Goalkeeper flag: any keeper-only action among the player's events.
    keeper_actions = ['Catch', 'Parry']
    per_player['is_gk'] = per_player['type_name'].apply(
        lambda actions: any(action in keeper_actions for action in actions)
    )

    # Outfield players: rank the KMeans clusters by centre so role ids are ordered along x.
    per_player['player_role'] = 0
    outfield = per_player[~per_player['is_gk']].copy()
    if not outfield.empty:
        model = KMeans(n_clusters=3, random_state=810, n_init=10).fit(outfield[['start_x']])
        centre_order = np.argsort(model.cluster_centers_.flatten())
        rank_of = {cluster: rank for rank, cluster in enumerate(centre_order, start=1)}
        ranked = pd.Series(model.labels_, index=outfield.index).map(rank_of)
        per_player.loc[outfield.index, 'player_role'] = ranked

    return dict(zip(per_player['player_id'], per_player['player_role']))

# Build the player_id -> role lookup from the un-mirrored rows.
# NOTE: this rebinds the name `player_roles` from the function to the returned dict;
# create_features reads it as a dict.
player_roles = player_roles(origin_df)

In [None]:
# 선수별 특성 변수
def player_features(df):
    """Average movement profile per player.

    For every player_id, returns the mean of: step_dist (Euclidean length of
    start -> end), step_dx, step_dy, start_x and start_y.

    Returns a dict {player_id: {stat_name: mean_value}}.
    """
    moves = df.copy()
    moves['step_dx'] = moves['end_x'] - moves['start_x']
    moves['step_dy'] = moves['end_y'] - moves['start_y']
    moves['step_dist'] = np.sqrt(moves['step_dx'] ** 2 + moves['step_dy'] ** 2)

    stat_names = ['step_dist', 'step_dx', 'step_dy', 'start_x', 'start_y']
    averages = moves.groupby('player_id')[stat_names].mean()

    return averages.to_dict('index')

# Build the player_id -> average-stats lookup from the un-mirrored rows.
# NOTE: this rebinds the name `player_features` from the function to the returned dict.
player_features =  player_features(origin_df)

In [None]:
# 타입 정의
def types_encoder(df):
  """Assign an integer id to each distinct 'type_name', in order of first appearance.

  Returns a dict {type_name: id} with ids starting at 0.
  """
  type_ids = {}
  for name in df['type_name'].unique().tolist():
    type_ids[name] = len(type_ids)

  return type_ids

# type_name -> integer id lookup, built from the un-mirrored rows.
type_map = types_encoder(origin_df)

In [None]:
def create_features(df, player_role_map=None, player_stat_map=None, type_id_map=None):
  """Derive the per-event feature columns used downstream.

  Rows are ordered by ('game_episode', 'action_id'). Every sequence feature
  (cumulative score, next action, lags, counters) is computed inside one
  'game_episode' so that nothing leaks across episode boundaries.

  Args:
    df: event rows that already carry the match_process columns. Columns read
      here: game_episode, action_id, team_id, period_id, player_id, type_name,
      time_seconds, start_x, start_y, end_x, end_y, new_home_rest,
      new_away_rest and, optionally, is_home (set to 0 when absent).
    player_role_map: {player_id: role}. Defaults to the notebook-level
      `player_roles` lookup.
    player_stat_map: {player_id: {stat_name: value}} with the keys step_dist,
      step_dx, step_dy, start_x, start_y. Defaults to the notebook-level
      `player_features` lookup.
    type_id_map: {type_name: id}. Defaults to the notebook-level `type_map`.

  Returns:
    A new DataFrame (the input is not modified) with the added 'new_*' columns
    plus the helper columns player_id_num, vector_angle and final_team_id.
  """
  # Fall back to the notebook-level lookups so one-argument calls keep working.
  if player_role_map is None:
    player_role_map = player_roles
  if player_stat_map is None:
    player_stat_map = player_features
  if type_id_map is None:
    type_id_map = type_map

  df = df.sort_values(['game_episode', 'action_id'])

  # expand=False yields a Series (one capture group) instead of a one-column frame.
  df['player_id_num'] = df['player_id'].astype(str).str.extract(r'(\d+)', expand=False).astype(float)

  if 'is_home' not in df.columns:
    df['is_home'] = 0
  df['is_home'] = df['is_home'].astype(int)

  # Rest days of the acting team and of its opponent.
  home_side = df['is_home'] == 1
  df['new_current_team_rest'] = np.where(home_side, df['new_home_rest'], df['new_away_rest'])
  df['new_opp_team_rest'] = np.where(home_side, df['new_away_rest'], df['new_home_rest'])

  # Player role and per-player averages (NaN for players missing from the lookup).
  df['new_player_roles'] = df['player_id'].map(player_role_map)

  def _player_stat(stat_name):
    return df['player_id'].map(lambda pid: player_stat_map.get(pid, {}).get(stat_name, np.nan))

  df['new_player_avg_dist'] = _player_stat('step_dist')
  df['new_player_avg_dx'] = _player_stat('step_dx')
  df['new_player_avg_dy'] = _player_stat('step_dy')
  # The stats lookup stores these under 'start_x' / 'start_y'; asking for
  # 'start_dx' / 'start_dy' always returned NaN.
  df['new_player_avg_start_x'] = _player_stat('start_x')
  df['new_player_avg_start_y'] = _player_stat('start_y')

  # Event type id and set-piece flag.
  df['new_type_id'] = df['type_name'].map(type_id_map)

  sp_keys = ['Corner','Freekick','Penalty','Kick off','Throw-In']
  df['new_is_setpiece'] = df['type_name'].astype(str).apply(lambda x: 1 if any(k in x for k in sp_keys) else 0)

  # +1 for a goal event by the home side, -1 by the away side, 0 otherwise.
  df['new_is_goal'] = np.where(
      df['type_name'] == 'Goal',
      np.where(df['is_home'] == 1, 1, -1),
      0
  )

  # Score difference accumulated *before* the current event, within the episode.
  # (Exclusive cumulative sum; a global shift would hand the first event of an
  # episode the final value of the previous episode.)
  df['new_cumulative_score_diff'] = (
      df.groupby('game_episode')['new_is_goal'].cumsum() - df['new_is_goal']
  ).astype(float)

  # Whether the next event of the same episode is a shot or a goal.
  # The shift is done per episode, so the last event of an episode gets 0
  # instead of peeking at the first event of the following episode.
  next_type = df.groupby('game_episode')['type_name'].shift(-1)
  df['new_next_action_is_shot'] = next_type.isin(['Shoot', 'Goal']).astype(int)

  # Level score so far, and its interaction with the next-action flag.
  df['new_is_draw'] = (df['new_cumulative_score_diff'] == 0).astype(int)
  df['new_is_draw_next_shot'] = (df['new_is_draw'] * df['new_next_action_is_shot']).astype(int)

  # Displacement of the current event.
  df['new_current_dx'] = df['end_x'] - df['start_x']
  df['new_current_dy'] = df['end_y'] - df['start_y']
  df['new_current_dist'] = np.sqrt(df['new_current_dx']**2 + df['new_current_dy']**2)
  df['new_current_angle'] = np.arctan2(df['new_current_dy'], df['new_current_dx'])

  # Absolute displacement per axis and travelled distance.
  df['new_dx'] = df['new_current_dx'].abs()
  df['new_dy'] = df['new_current_dy'].abs()
  df['new_dist'] = np.sqrt(df['new_dx'] ** 2 + df['new_dy'] ** 2)

  # Direction of travel, also as sin/cos so the angle has no wrap-around jump.
  df['vector_angle'] = np.arctan2(
      df['end_y'] - df['start_y'],
      df['end_x'] - df['start_x']
  )
  df['new_sin_angle'] = np.sin(df['vector_angle'])
  df['new_cos_angle'] = np.cos(df['vector_angle'])

  # Speed; events with a zero time step get speed 0 instead of inf/NaN.
  df['new_time_delta'] = df.groupby('game_episode')['time_seconds'].diff().fillna(0)
  df['new_speed'] = df['new_dist'] / df['new_time_delta']
  df.loc[df['new_time_delta'] == 0, 'new_speed'] = 0

  # Distance from the pitch centre (52.5, 34).
  df['new_dist_from_center'] = np.sqrt((52.5 - df['start_x']) ** 2 + (34 - df['start_y']) ** 2)

  # Distance and bearing to the goal centre at x = 105.
  goal_x = 105
  goal_y = 34
  dx = goal_x - df['start_x']
  dy = goal_y - df['start_y']
  df['new_dist_to_away'] = np.sqrt(dx**2 + dy**2)
  df['new_angle_to_away'] = np.arctan2(dy, dx)

  # Distance from the goal centre at x = 0.
  goal_x = 0
  goal_y = 34
  dx = df['start_x'] - goal_x
  dy = df['start_y'] - goal_y
  df['new_dist_from_home'] = np.sqrt(dx**2 + dy**2)

  # Angle subtended by the goal mouth (y in [30.34, 37.66] at x = 105).
  y_min = 30.34
  y_max = 37.66
  x_goal = 105
  angle_min = np.arctan2(y_min - df['start_y'], x_goal - df['start_x'])
  angle_max = np.arctan2(y_max - df['start_y'], x_goal - df['start_x'])
  df['new_goal_open_angle'] = np.abs(angle_max - angle_min)

  # Pitch zones: 6 bands along x, 3 along y, combined into one id.
  df['new_zone_x'] = (df['start_x'] / 105.0 * 6).astype(int).clip(0,5)
  df['new_zone_y'] = (df['start_y'] / 68.0 * 3).astype(int).clip(0,2)
  df['new_tactical_zone'] = df['new_zone_y'] * 6 + df['new_zone_x']
  df['new_final_third'] = (df['start_x'] > (105 / 3 * 2)).astype(int)
  df['new_dist_to_touchline'] = np.minimum(df['start_y'], 68 - df['start_y'])
  df['new_is_near_touchline'] = (df['new_dist_to_touchline'] < 5).astype(int)

  # Minute within the period, and minute with 45 added for periods other than 1.
  df['new_min_times'] = df['time_seconds'] // 60
  df['new_total_min_times'] = np.where(
      df['period_id'] == 1,
      df['time_seconds'] // 60.0,
      df['time_seconds'] // 60.0 + 45
  )

  # Episode bookkeeping.
  per_episode = df.groupby('game_episode')
  # Time of the previous event (0 for the first event of an episode).
  df['new_prev_time'] = per_episode['time_seconds'].shift(1).fillna(0)
  # Time elapsed since the earliest event of the episode.
  df['new_episode_time_delta'] = df['time_seconds'] - per_episode['time_seconds'].transform('min')
  # 15-minute phase of the period.
  df['new_match_phase'] = (df['time_seconds'] // 900).astype(int)
  # Position of the event inside its episode, episode length, relative position.
  df['new_event_idx'] = per_episode.cumcount()
  last_idx = df.groupby('game_episode')['new_event_idx'].transform('max')
  df['new_event_count'] = last_idx + 1
  df['new_event_norm'] = df['new_event_idx'] / (df['new_event_count'] - 1).clip(lower=1)
  # 1 on the last event of each episode.
  df['new_last_episode'] = (df['new_event_idx'] == last_idx).astype(int)

  # Team of the episode's last event; an event counts as "attack" when its team matches it.
  last_events = df.groupby('game_episode', as_index=False).tail(1)
  meta_events = last_events[['game_episode', 'team_id']].rename(columns={'team_id': 'final_team_id'})
  df = df.merge(meta_events, on='game_episode', how='left')
  df['new_is_attack'] = (df['team_id'] == df['final_team_id']).astype(int)

  # Lag features: values of the previous event(s) in the same episode.
  num_lag = 1
  shift_cols = ['start_x', 'start_y', 'is_home',
                'new_current_dx', 'new_current_dy', 'new_current_dist', 'new_current_angle',
                'new_is_setpiece','new_tactical_zone','new_final_third',
                'new_player_roles', 'new_type_id','new_is_goal','new_is_attack',
                'player_id_num','time_seconds']

  for i in range(1, num_lag + 1):
    # Group on the merged frame so the lagged block lines up with its index.
    lagged = df.groupby('game_episode')[shift_cols].shift(i)
    lagged.columns = [f'new_lag_{i}_{c}' for c in shift_cols]
    df = pd.concat([df, lagged], axis=1)

    # Change of movement direction versus the lagged event, folded into [0, pi].
    # The column carries no lag suffix, so with num_lag > 1 the last lag wins.
    turn = np.abs(df['new_current_angle'] - df[f'new_lag_{i}_new_current_angle'])
    df['new_angle_to_goal_diff'] = turn.where(~(turn > np.pi), 2 * np.pi - turn)

    df[f'new_is_same_player_{i}'] = (df['player_id_num'] == df[f'new_lag_{i}_player_id_num']).astype(int)
    df[f'new_is_same_role_{i}'] = (df['new_player_roles'] == df[f'new_lag_{i}_new_player_roles']).astype(int)
    # Encodes the (current type, lagged type) pair; a missing lag counts as -100.
    df[f'new_type_transition_{i}'] = df['new_type_id'] * 100 + df[f'new_lag_{i}_new_type_id'].fillna(-100)

  return df

# Build the event-level feature table from the match-enriched events.
final_process_df = create_features(process_df)

In [None]:
# final_process_df = final_process_df.drop('Unnamed: 0', axis=1)

In [None]:
# Persist the engineered feature table under file_path (without the index column).
final_process_df.to_csv(file_path + 'final_process_df5.csv', index=False)

# fill missing

In [None]:
# Reload the engineered feature table from disk (this rebinds process_df)
# and continue on a copy.
process_df = pd.read_csv(file_path + 'final_process_df5.csv')
select_df = process_df.copy()

In [None]:
# Preview the first rows of the reloaded feature table.
select_df.head()

Unnamed: 0,game_id,period_id,episode_id,time_seconds,team_id,player_id,action_id,type_name,result_name,start_x,...,new_lag_1_new_player_roles,new_lag_1_new_type_id,new_lag_1_new_is_goal,new_lag_1_new_is_attack,new_lag_1_player_id_num,new_lag_1_time_seconds,new_angle_to_goal_diff,new_is_same_player_1,new_is_same_role_1,new_type_transition_1
0,126283,1,1,0.667,2354,344559,0,Pass,Successful,52.418205,...,,,,,,,,0,0,-100.0
1,126283,1,1,3.667,2354,250036,2,Pass,Successful,32.01324,...,3.0,0.0,0.0,1.0,344559.0,0.667,2.416403,0,0,0.0
2,126283,1,1,4.968,2354,500145,4,Carry,,37.371285,...,1.0,0.0,0.0,1.0,250036.0,3.667,0.454472,0,0,100.0
3,126283,1,1,8.2,2354,500145,5,Pass,Successful,38.39157,...,2.0,1.0,0.0,1.0,500145.0,4.968,0.365523,1,1,1.0
4,126283,1,1,11.633,2354,142106,7,Pass,Successful,34.578705,...,2.0,0.0,0.0,1.0,500145.0,8.2,2.122548,0,1,0.0


In [None]:
def drop_features(df):
  """Keep only the engineered features and the bookkeeping columns.

  Selected columns are every column whose name starts with 'new_' or 'lag_',
  plus the meta columns listed below. Meta columns that are absent from df
  (for example 'target_x' / 'target_y' on a frame without targets) are
  reported and skipped instead of raising a KeyError.

  Returns a new DataFrame with the selected columns in sorted name order.
  """
  meta_cols = ['start_x', 'start_y', 'end_x', 'end_y', 'game_episode', 'target_x', 'target_y',
               'period_id','result_name','is_home','is_train']

  present_meta = [c for c in meta_cols if c in df.columns]
  missing_meta = [c for c in meta_cols if c not in df.columns]
  if missing_meta:
    print(f"missing meta columns skipped: {missing_meta}")

  feature_cols = [c for c in df.columns if str(c).startswith(('new_', 'lag_'))]

  # A set removes overlaps between the two lists; sorting fixes the column order.
  used_cols = sorted(set(feature_cols + present_meta))

  df_filtered = df[used_cols].copy()
  print(f"최종 사용 피처 수: {len(used_cols)}")
  print(f"전체 데이터 건수: {len(df_filtered)}")

  return df_filtered

# Reduce the table to the 'new_' / 'lag_' feature columns plus the meta columns.
select_df = drop_features(select_df)

최종 사용 피처 수: 84
전체 데이터 건수: 33284


In [None]:
# Missing-value handling
def fill_missing(df):
  """Return a copy of df without missing values.

  Numeric columns: NaN -> 0.0.
  All other columns: NaN -> 'missing', then the column is cast to str.
  """
  filled = df.copy()
  numeric_names = set(filled.select_dtypes(include='number').columns)

  for name in filled.columns:
    if name in numeric_names:
      filled[name] = filled[name].fillna(0.0)
    else:
      filled[name] = filled[name].fillna('missing').astype(str)

  return filled

# Replace missing values: 0.0 in numeric columns, 'missing' in the others.
select_df = fill_missing(select_df)

# features selection + normalize

In [None]:
# Split back into train and test rows using the is_train flag
# (this rebinds train_df / test_df from the raw tables loaded at the top).
train_df = select_df[select_df['is_train'] == 1].copy()
test_df = select_df[select_df['is_train'] == 0].copy()

In [None]:
# # 변수 상관도
# def corr_pairs(df):
#     corr_features = df.columns
#     drop_cols = ['game_episode','result_name','target_x','target_y']
#     corr_features = corr_features.drop(drop_cols)

#     corr_matrix = df[corr_features].corr().drop_duplicates()
#     pairs = (
#         corr_matrix.stack()
#         .reset_index()
#         .rename(columns={'level_0': 'feature_1',
#                          'level_1': 'feature_2',
#                          0: 'corr'})
#     )

#     pairs = pairs[pairs['feature_1'] != pairs['feature_2']]
#     pairs['pair'] = pairs.apply(lambda x: '-'.join(sorted([x['feature_1'], x['feature_2']])), axis=1)
#     pairs = pairs.drop_duplicates('pair').drop(columns='pair')

#     pairs = pairs.sort_values('corr', key=lambda x: x.abs(), ascending=False)

#     return pairs

# pairs_df = corr_pairs(train_df)

In [None]:
def normalize_features(df, fit_on=None):
  """Label-encode the non-numeric columns of df (every one except 'game_episode').

  Despite the name, no scaling is applied; only label encoding is performed.

  Args:
    df: frame to encode. It is not modified; an encoded copy is returned.
    fit_on: optional frame whose values define the label vocabulary. Passing
      the same frame for several splits guarantees that a category receives
      the same integer code in each of them. When None, the encoder is fitted
      on df itself.

  Returns:
    Encoded copy of df.
  """
  reference = df if fit_on is None else fit_on
  encoded = df.copy()

  obj_cols = encoded.select_dtypes(exclude='number').columns.drop('game_episode').to_list()
  for c in obj_cols:
    encoder = LabelEncoder().fit(reference[c])
    encoded[c] = encoder.transform(encoded[c])

  return encoded

# Fit the vocabulary on the full (train + test) table so that both splits share
# one encoding; fitting each split separately can map the same category to
# different codes in train and test.
train_df = normalize_features(train_df, fit_on=select_df)
test_df = normalize_features(test_df, fit_on=select_df)

In [None]:
# Persist the encoded train / test feature tables under file_path (without the index column).
train_df.to_csv(file_path + 'final_train_df.csv', index=False)
test_df.to_csv(file_path + 'final_test_df.csv', index=False)