In [None]:
import pandas as pd
import numpy as np
import os

import warnings
warnings.filterwarnings('ignore')

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

In [None]:
# Base directory of the competition dataset; every relative file name below is appended to it.
# NOTE(review): looks like a Google Drive mount inside Colab — confirm before running elsewhere.
file_path = '/content/drive/MyDrive/1데이콘/K리그-서울시립대공개AI경진대회/dataset/'

# train + test + match_info

In [None]:
# Load the train / test / match-info tables from the dataset directory.
try:
  train_df = pd.read_csv(file_path + 'train.csv')
  test_df = pd.read_csv(file_path + 'test.csv')
  match_df = pd.read_csv(file_path + 'match_info.csv')
  print('data load success')
except Exception as e:
  # Report the failure, then re-raise: swallowing it would only defer the
  # problem to a confusing NameError in the shape report below.
  print(f'data load fail: {e}')
  raise

# Report the shape of each loaded table.
print(f'train: {train_df.shape}, test: {test_df.shape}, match: {match_df.shape}')

data load success
train: (356721, 15), test: (2414, 3), match: (228, 17)


In [None]:
# 여기서부터

In [None]:
# Read every event file referenced by test_df['path'] (relative to file_path)
# and stack them into a single frame.
test_events_list = [
    pd.read_csv(os.path.normpath(file_path + rel_path))
    for rel_path in test_df['path']
]

test_events = pd.concat(test_events_list, ignore_index=True)
print(test_events.shape)

In [None]:
# Tag each source frame (1 = train, 0 = test) so the rows can be told apart
# after they are stacked into one frame.
for frame, flag in ((train_df, 1), (test_events, 0)):
  frame['is_train'] = flag

all_df = (
    pd.concat([train_df, test_events], sort=False, ignore_index=True)
    .astype({'is_train': int})
)

In [None]:
# Missing-value count per column of the stacked test events.
test_events.isna().sum()

In [None]:
# Attach match metadata to every event: keep one match row per game_id,
# then left-join so events without match info are preserved.
match_df = match_df.drop_duplicates(subset='game_id')
merged_df = pd.merge(all_df, match_df, how='left', on='game_id')

In [None]:
# Persist the merged frame next to the raw data (same path as before, built from file_path).
merged_df.to_csv(file_path + 'merged_df.csv', index=False)

# 데이터 로드

In [None]:
# Reload the merged frame written earlier (same path as before, built from file_path).
merged_df = pd.read_csv(file_path + 'merged_df.csv')

In [None]:
# Work on a copy so merged_df stays as loaded.
all_df = merged_df.copy()

# origin_process_split

In [None]:
# def origin_process_split(df):
#   pro_data = df.copy()
#   pro_data = pro_data.sort_values(['game_id', 'episode_id', 'time_seconds', 'action_id'])
#   pro_data['game_episode'] = 'p_' + pro_data['game_episode'].astype(str)

#   split_df = pd.concat([df, pro_data], ignore_index=True).sort_values(['game_id', 'episode_id', 'time_seconds', 'action_id'])

#   # 원본/전처리용 데이터 구분
#   origin_df = split_df[~split_df['game_episode'].str.contains('p_')].copy()
#   process_df = split_df[split_df['game_episode'].str.contains('p_')].copy()

#   return origin_df, process_df

In [None]:
# origin_df, process_df = origin_process_split(all_df)

# display(f'Orgin Data Shape: {origin_df.shape}')
# display(f'Preprocess Data Shape: {process_df.shape}')

# match_info process

In [None]:
def match_process(match_df):
  """Add kick-off time and rest-day features to a frame carrying match columns.

  Reads `game_date`, `home_team_id`, `away_team_id` and `is_home`.
  Adds, in order: `p_match_date_kst`, `p_match_hour`, `p_is_weekend`,
  `p_date_str`, `p_home_rest`, `p_away_rest`, `p_rest_diff`,
  `p_current_team_rest`, `p_away_team_rest`, `p_start_time`.

  Returns a modified copy; the input frame is left untouched.
  """
  df = match_df.copy()

  # Kick-off timestamp shifted by +9 hours, with its hour and weekend flag.
  # NOTE(review): the shift presumes game_date is stored in UTC — confirm.
  kickoff = pd.to_datetime(df['game_date']) + pd.Timedelta(hours=9)
  df['p_match_date_kst'] = kickoff
  df['p_match_hour'] = kickoff.dt.hour
  df['p_is_weekend'] = (kickoff.dt.weekday >= 5).astype(int)

  # One (timestamp, team) record for each side of every fixture.
  sides = [
      df[['p_match_date_kst', side_col]].rename(columns={side_col: 'team_id'})
      for side_col in ('home_team_id', 'away_team_id')
  ]
  schedule = pd.concat(sides, ignore_index=True)
  schedule['p_only_date'] = schedule['p_match_date_kst'].dt.normalize()
  schedule = schedule.drop_duplicates().sort_values(['team_id', 'p_only_date'])

  # Rest days = gap to the team's previous match day; a team's first match
  # gets 7, and every value is clipped to the range [0, 14].
  schedule['p_prev_date'] = schedule.groupby('team_id')['p_only_date'].shift(1)
  gap_days = (schedule['p_only_date'] - schedule['p_prev_date']).dt.days
  schedule['p_rest_days'] = gap_days.fillna(7).clip(0, 14)

  # Lookup keyed by (team_id, 'YYYY-MM-DD').
  date_keys = schedule['p_only_date'].dt.date.astype(str)
  rest_map = dict(zip(zip(schedule['team_id'], date_keys), schedule['p_rest_days']))
  rest_lookup = pd.Series(rest_map)

  df['p_date_str'] = df['p_match_date_kst'].dt.date.astype(str)

  def rest_for(team_col):
    # Rest days of the team in `team_col` on the row's match date (7 if unknown).
    keys = list(zip(df[team_col], df['p_date_str']))
    return rest_lookup.reindex(keys).fillna(7).values

  df['p_home_rest'] = rest_for('home_team_id')
  df['p_away_rest'] = rest_for('away_team_id')
  df['p_rest_diff'] = df['p_home_rest'] - df['p_away_rest']

  # Rest days from the point of view of the team performing the event:
  # its own rest, and the rest of the opposing side.
  acting_home = df['is_home'] == 1
  df['p_current_team_rest'] = np.where(acting_home, df['p_home_rest'], df['p_away_rest'])
  df['p_away_team_rest'] = np.where(acting_home, df['p_away_rest'], df['p_home_rest'])

  # Flag listed kick-off hours: 14/16/19 on weekends, 19 on weekdays.
  weekend_slot = (df['p_is_weekend'] == 1) & df['p_match_hour'].isin([14, 16, 19])
  weekday_slot = (df['p_is_weekend'] == 0) & (df['p_match_hour'] == 19)
  df['p_start_time'] = np.where(weekend_slot | weekday_slot, 1, 0)

  return df

# process_df = match_process(process_df)

# train+test process

In [None]:
# Coordinate mirroring (augmentation) helper
def normalize_coord(df):
  """Append a mirrored copy of every row and return the stacked frame.

  The copy gets `nor_start_x` / `nor_end_x` (105 - x) and `nor_start_y` /
  `nor_end_y` (68 - y), and its `game_episode` is suffixed with '_aug'.
  The original rows come first; unless `df` already had the `nor_*`
  columns, those rows hold NaN there after the concat.
  """
  mirrored = df.assign(
      nor_start_x=105.0 - df['start_x'],
      nor_end_x=105.0 - df['end_x'],
      nor_start_y=68.0 - df['start_y'],
      nor_end_y=68.0 - df['end_y'],
      game_episode=df['game_episode'].astype(str) + '_aug',
  )

  return pd.concat([df, mirrored], ignore_index=True)

# aug_df = normalize_coord(all_df)

In [None]:
def origin_coord_feature(df):
  """Attach per-player mean step statistics computed on non-augmented rows.

  Rows whose `game_episode` contains '_aug' are excluded from the
  statistics. For the remaining rows the step vector (end - start) and its
  length are averaged per `player_id`, then left-joined onto every row of
  `df` as `nor_step_dist`, `nor_step_dx`, `nor_step_dy`.
  """
  is_augmented = df['game_episode'].str.contains('_aug')
  base_rows = df[~is_augmented].copy()

  # Step vector and its Euclidean length on the raw coordinates.
  base_rows['nor_step_dx'] = base_rows['end_x'] - base_rows['start_x']
  base_rows['nor_step_dy'] = base_rows['end_y'] - base_rows['start_y']
  base_rows['nor_step_dist'] = np.sqrt(
      base_rows['nor_step_dx'] ** 2 + base_rows['nor_step_dy'] ** 2
  )

  stat_cols = ['nor_step_dist', 'nor_step_dx', 'nor_step_dy']
  player_stats = base_rows.groupby('player_id')[stat_cols].mean().reset_index()

  return df.merge(player_stats, on='player_id', how='left')

# aug_df = origin_coord_feature(aug_df)

In [None]:
# Time-based features
def add_minute_feature(df):
  """Add clock and per-episode timing features; returns a modified copy.

  Reads `time_seconds`, `period_id` and `game_episode`. Adds:
    p_game_time_min        whole minutes of `time_seconds`
    p_game_time__min_total minutes, plus 45 whenever period_id != 1
    p_prev_time            previous event's time in the episode (0 for the first)
    p_time_delta           time since the previous event in the episode (0 for the first)
    p_time_episode_time    time since the episode's earliest event
    p_match_phase          index of the 900-second bucket of `time_seconds`
  """
  # Bug fix: the original called `df.copy()` without binding the result, so
  # every column below was written into the caller's frame.
  df = df.copy()

  df['p_game_time_min'] = df['time_seconds'] // 60
  # The double underscore in the column name is kept as-is; downstream code
  # and saved CSVs may depend on it.
  df['p_game_time__min_total'] = np.where(
      df['period_id'] == 1,
      df['time_seconds'] / 60.0,
      df['time_seconds'] / 60.0 + 45
  )

  episode_time = df.groupby('game_episode')['time_seconds']

  # Time of the previous event in the same episode
  df['p_prev_time'] = episode_time.shift(1).fillna(0)
  # Gap between consecutive events in the same episode
  df['p_time_delta'] = episode_time.diff().fillna(0)
  # Elapsed time since the earliest event of the episode
  df['p_time_episode_time'] = df['time_seconds'] - episode_time.transform('min')
  # 15-minute bucket
  df['p_match_phase'] = (df['time_seconds'] // 900).astype(int)

  return df

# process_df = add_minute_feature(process_df)

In [None]:
# Coordinate-derived features
def create_coord_features(df):
  """Add movement, distance, angle and pitch-zone features; returns a copy.

  Reads `start_x`, `start_y`, `end_x`, `end_y` and `p_time_delta` (so the
  timing features must already be present). Coordinates are treated as
  lying on a 105 x 68 pitch with the attacked goal centred at (105, 34)
  and the defended goal at (0, 34).
  """
  df = df.copy()

  pitch_len, pitch_wid = 105.0, 68.0
  goal_mid_y = 34
  post_low_y, post_high_y = 30.34, 37.66

  step_x = df['end_x'] - df['start_x']
  step_y = df['end_y'] - df['start_y']

  # Absolute displacement per axis and its Euclidean length
  df['p_dx'] = step_x.abs()
  df['p_dy'] = step_y.abs()
  df['p_dist'] = np.sqrt(df['p_dx'] ** 2 + df['p_dy'] ** 2)

  # Direction of movement, in radians and in degrees
  df['p_vector_angle'] = np.arctan2(step_y, step_x)
  df['p_angle'] = np.degrees(df['p_vector_angle'])

  # Speed; rows with a zero time delta are forced to 0 instead of inf/NaN
  df['p_speed'] = df['p_dist'] / df['p_time_delta']
  df.loc[df['p_time_delta'] == 0, 'p_speed'] = 0

  # Distance from the centre spot
  df['p_dist_from_center'] = np.sqrt(
      (52.5 - df['start_x']) ** 2 + (goal_mid_y - df['start_y']) ** 2
  )

  # Distance and angle to the centre of the attacked goal.
  # Bug fix: the original wrote np.sqrt(dx**2, dy**2); the second positional
  # argument of a ufunc is `out`, so dy was ignored and the result was |dx|.
  to_goal_x = pitch_len - df['start_x']
  to_goal_y = goal_mid_y - df['start_y']
  df['p_dist_to_away'] = np.sqrt(to_goal_x ** 2 + to_goal_y ** 2)
  df['p_angle_to_away'] = np.arctan2(to_goal_y, to_goal_x)

  # Distance from the centre of the defended goal (same fix as above)
  from_goal_x = df['start_x'] - 0
  from_goal_y = df['start_y'] - goal_mid_y
  df['p_dist_from_home'] = np.sqrt(from_goal_x ** 2 + from_goal_y ** 2)

  # Angle subtended by the two goal posts as seen from the start position
  angle_min = np.arctan2(post_low_y - df['start_y'], to_goal_x)
  angle_max = np.arctan2(post_high_y - df['start_y'], to_goal_x)
  df['p_goal_open_angle'] = np.abs(angle_max - angle_min)

  # Pitch zones: 6 bands along x, 3 along y, combined into one id (0..17)
  df['p_zone_x'] = (df['start_x'] / pitch_len * 6).astype(int).clip(0, 5)
  df['p_zone_y'] = (df['start_y'] / pitch_wid * 3).astype(int).clip(0, 2)
  df['p_tactical_zone'] = df['p_zone_y'] * 6 + df['p_zone_x']

  # Final third, and proximity to the nearer touchline (within 5 units)
  df['p_final_third'] = (df['start_x'] > (105/3*2)).astype(int)
  df['p_dist_to_touchline'] = np.minimum(df['start_y'], 68 - df['start_y'])
  df['p_is_near_touchline'] = (df['p_dist_to_touchline'] < 5).astype(int)

  return df

# process_df = create_coord_features(process_df)

In [None]:
# # 패스 파생변수
# def create_pass_features(df):
#   df.copy()

#   # 패스 각도
#   df['p_pass_vector_angle'] = np.arctan2(
#       df['end_y'] - df['start_y'],
#       df['end_x'] - df['start_x']
#     )

#   df['p_pass_angle'] = np.degrees(
#     np.arctan2(df['end_y'] - df['start_y'],
#                df['end_x'] - df['start_x'])
#     )

#   # 패스 유형
#   threshold = 0.0
#   diff_x = df['end_x'] - df['start_x']
#   df['p_pass_Front_Back'] = np.where(
#           diff_x > threshold, '전방패스', '후방패스'
#   )

#   df['p_pass_Left_Right'] = np.where(
#       (np.abs(df['p_pass_angle']) >= 60) & (np.abs(df['p_pass_angle']) <= 120), '횡패스', '비횡패스'
#   )

#   # 패스 거리
#   # df['p_pass_dist'] = np.sqrt(
#   #     (df['end_x'] - df['start_x']) ** 2 +
#   #     (df['end_y'] - df['start_y']) ** 2
#   # )

#   # # 패스 유형
#   # bins = [0.0, 15.0, 30.0, 105.0]
#   # labels = ['숏패스', '중거리패스', '롱패스']
#   # df['pre_pass_type'] = pd.cut(df['pre_pass_dist'], bins=bins, labels=labels)

#   return df

# process_df = create_pass_features(process_df)
# # https://portal.kleague.com/main/schedule/popupKleagueHelp.do

In [None]:
# type_name-derived features
def create_type_features(df):
  """Add event-type, set-piece, score and next-shot features; returns a copy.

  Reads `type_name`, `is_home` and `game_episode`. Adds:
    p_type_id               integer code per type_name, in order of first appearance
    p_is_setpiece           1 if type_name contains one of the set-piece keywords
    p_goal_event            +1 / -1 for a 'Goal' by the home / non-home side, else 0
    p_cumulative_score_diff sum of p_goal_event over the EARLIER events of the episode
    p_next_action_is_shot   1 if the next event in the same episode is 'Shoot' or 'Goal'
    p_is_draw               1 if p_cumulative_score_diff is 0
    p_is_draw_next_shot     p_is_draw * p_next_action_is_shot
  """
  df = df.copy()

  # Integer-encode the event type
  event_types = df['type_name'].unique().tolist()
  type_map = {name: code for code, name in enumerate(event_types)}
  df['p_type_id'] = df['type_name'].map(type_map)

  # Set pieces: any type_name containing one of these keywords
  sp_keys = ['Corner','Freekick','Penalty','Kick off','Throw-In']
  df['p_is_setpiece'] = (
      df['type_name'].astype(str).str.contains('|'.join(sp_keys)).astype(int)
  )

  # Signed goal marker
  is_goal = df['type_name'] == 'Goal'
  df['p_goal_event'] = np.where(is_goal, np.where(df['is_home'] == 1, 1, -1), 0)

  # Score difference accumulated BEFORE each event, per episode.
  # Bug fix: the original applied .shift(1) to the whole column after the
  # grouped cumsum, so the first event of an episode inherited the running
  # total of the previous episode. Subtracting the row's own contribution
  # gives the exclusive running sum without crossing episode boundaries.
  running = df.groupby('game_episode')['p_goal_event'].cumsum()
  df['p_cumulative_score_diff'] = (running - df['p_goal_event']).fillna(0).astype(float)

  # Whether the next event of the SAME episode is a shot or a goal.
  # Bug fix: the original shifted globally and then zeroed the first row of
  # each episode; that erased valid flags on first rows and left the last
  # row of each episode looking into the next episode. A per-episode shift
  # yields NaN (-> 0) on the last row instead.
  next_type = df.groupby('game_episode')['type_name'].shift(-1)
  df['p_next_action_is_shot'] = next_type.isin(['Shoot', 'Goal']).astype(int)

  # Level score so far
  df['p_is_draw'] = (df['p_cumulative_score_diff'] == 0).astype(int)

  # Level score AND the next event is a shot
  df['p_is_draw_next_shot'] = (df['p_is_draw'] * df['p_next_action_is_shot']).astype(int)

  return df

# process_df = create_type_features(process_df)

In [None]:
# Whether an event was performed by the team that ends the episode
def create_attack_status(df):
  """Add `final_team_id` and `p_is_attack`; returns a modified copy.

  `final_team_id` is the `team_id` of the last row of each `game_episode`
  (in the frame's current row order). `p_is_attack` is 1 when the row's
  `team_id` equals it, else 0.
  """
  events = df.copy()

  # Last row of every episode, reduced to the episode key and its team.
  closing_rows = events.groupby('game_episode', as_index=False).tail(1)
  closing_team = (
      closing_rows[['game_episode', 'game_id', 'team_id', 'period_id']]
      .rename(columns={'team_id': 'final_team_id'})
      [['game_episode', 'final_team_id']]
  )

  # Broadcast the closing team back onto every event of the episode.
  events = events.merge(closing_team, on='game_episode', how='left')
  events['p_is_attack'] = events['team_id'].eq(events['final_team_id']).astype(int)

  return events

# process_df = create_attack_status(process_df)

In [None]:
# Position-within-episode features
def create_eposide_feature(df):
  """Add event index, count, normalised position and last-event flag per episode.

  Adds `p_event_idx` (0-based position within `game_episode`),
  `p_event_count` (largest index + 1), `p_event_norm`
  (index / max(count - 1, 1)) and `p_last_episode` (1 on the row holding
  the largest index). Returns a modified copy.
  """
  out = df.copy()

  # 0-based running position of each event inside its episode
  out['p_event_idx'] = out.groupby('game_episode').cumcount()

  # Largest index of the episode, reused for the count and the last-event flag
  final_idx = out.groupby('game_episode')['p_event_idx'].transform('max')
  out['p_event_count'] = final_idx + 1

  # Relative position in [0, 1]; the denominator is floored at 1 so that
  # single-event episodes do not divide by zero
  denominator = (out['p_event_count'] - 1).clip(lower=1)
  out['p_event_norm'] = out['p_event_idx'] / denominator

  out['p_last_episode'] = out['p_event_idx'].eq(final_idx).astype(int)

  return out

# process_df = create_eposide_feature(process_df)

In [None]:
# Player role feature
def create_player_roles(p_df):
  """Add `p_player_roles`, a coarse role id per player; returns a modified copy.

  Role 0 is assigned to players with at least one 'Catch' or 'Parry'
  event. The remaining players are split into 3 KMeans clusters on their
  mean `start_x`, numbered 1..3 by ascending cluster centre.
  """
  def roles(df):
    # Build {player_id: role} from the event rows of `df`.
    events = df.copy()
    per_player = (
        events
        .groupby('player_id')
        .agg({'start_x': 'mean', 'type_name': lambda names: list(names)})
        .reset_index()
    )

    # Goalkeeper rule: the player has any keeper-only action
    keeper_actions = ('Catch', 'Parry')
    per_player['is_gk'] = per_player['type_name'].apply(
        lambda names: any(name in keeper_actions for name in names)
    )

    # Everyone starts as role 0; non-keepers are overwritten below
    per_player['player_role'] = 0
    outfield = per_player[~per_player['is_gk']].copy()
    if len(outfield) > 0:
      km = KMeans(n_clusters=3, random_state=810, n_init=10).fit(outfield[['start_x']])
      # Rank the clusters by centre so labels are ordered along start_x
      centre_order = np.argsort(km.cluster_centers_.flatten())
      relabel = {cluster: rank + 1 for rank, cluster in enumerate(centre_order)}
      outfield['role'] = pd.Series(km.labels_, index=outfield.index).map(relabel)
      per_player.loc[outfield.index, 'player_role'] = outfield['role']

    return dict(zip(per_player['player_id'], per_player['player_role']))

  # Map the per-player role back onto every event row
  out = p_df.copy()
  out['p_player_roles'] = out['player_id'].map(roles(out))

  return out

# process_df = create_player_roles(process_df)

In [None]:
# Per-player aggregate features
def create_player_features(df):
  """Attach each player's mean of selected event features to every row.

  For every column in the list below, the per-`player_id` mean is joined
  back as `p_player_<column>_mean`. Returns a new frame.
  """
  stat_cols = [
      'p_dx', 'p_dy', 'p_dist', 'p_speed',
      'p_dist_from_center', 'p_dist_to_away', 'p_dist_from_home',
      'p_vector_angle', 'p_angle', 'p_event_count',
  ]

  events = df.copy()
  player_means = (
      events.groupby('player_id')[stat_cols]
      .mean()
      .rename(columns=lambda col: f'p_player_{col}_mean')
  )

  return events.merge(player_means, on='player_id', how='left')

# process_df = create_player_features(process_df)

# final process

In [None]:
def features_process(df):
  """Run every feature-building stage in order on a copy of `df`.

  The order matters: create_coord_features reads `p_time_delta` written by
  add_minute_feature, and create_player_features reads columns written by
  create_coord_features and create_eposide_feature.
  """
  stages = (
      normalize_coord,
      origin_coord_feature,
      match_process,
      add_minute_feature,
      create_coord_features,
      create_type_features,
      create_attack_status,
      create_eposide_feature,
      create_player_roles,
      create_player_features,
  )

  out = df.copy()
  for stage in stages:
    out = stage(out)

  return out

process_df = features_process(all_df)

In [None]:
# Missing-value counts for columns 30..39 (by position).
process_df.isna().sum().iloc[30:40]

Unnamed: 0,0
away_team_name,0
away_team_name_ko,0
nor_start_x,409831
nor_end_x,412245
nor_start_y,409831
nor_end_y,412245
nor_step_dist,0
nor_step_dx,0
nor_step_dy,0
p_match_date_kst,0


In [None]:
# Previous-event (lag) features
num_lag = 1

def last_events_features(df):
  """Add lag features per episode and keep only the last event of each episode.

  For every lag i in 1..num_lag the columns in `shift_cols` are shifted by i
  rows within `game_episode` and stored as `lag_{i}_<col>`, followed by a
  few derived comparison columns. The result holds one row per episode (its
  last row) plus `target_dx` / `target_dy`, the displacement of that row.

  Note: `p_angle_to_goal_diff` has no lag suffix, so with num_lag > 1 only
  the value for the largest lag survives.
  """
  df = df.copy()

  # Guarantee an integer is_home column (all zeros when it is absent)
  df['is_home'] = df['is_home'].astype(int) if 'is_home' in df.columns else 0

  shift_cols = ['player_id', 'is_home',
                'start_x', 'start_y', 'p_dx', 'p_dy', 'p_dist', 'p_angle', 'p_vector_angle', 'p_angle_to_away',
                'p_is_setpiece','p_tactical_zone','p_final_third',
                'p_player_roles', 'p_type_id',
                'p_goal_event','p_is_attack',
                'time_seconds','p_time_delta']

  by_episode = df.groupby('game_episode')

  for i in range(1, num_lag + 1):
    prefix = f'lag_{i}_'
    lagged = by_episode[shift_cols].shift(i).add_prefix(prefix)
    df = pd.concat([df, lagged], axis=1)

    # Short aliases of the lagged movement columns
    for short in ('dx', 'dy', 'dist'):
      df[f'p_lag_{i}_{short}'] = df[f'{prefix}p_{short}']
    df[f'p_lag_{i}_p_angle_to_away'] = df[f'{prefix}p_angle_to_away']

    # Absolute change of the angle to goal, folded into [0, pi]
    angle_gap = (df['p_angle_to_away'] - df[f'{prefix}p_angle_to_away']).abs()
    df['p_angle_to_goal_diff'] = angle_gap.mask(angle_gap > np.pi, 2 * np.pi - angle_gap)

    # Same player / same role as the lagged event, and a combined type code
    df[f'p_is_same_player_{i}'] = df['player_id'].eq(df[f'{prefix}player_id']).astype(int)
    df[f'p_is_same_role_{i}'] = df['p_player_roles'].eq(df[f'{prefix}p_player_roles']).astype(int)
    df[f'p_type_transition_{i}'] = df['p_type_id'] * 100 + df[f'{prefix}p_type_id'].fillna(-100)

  # One row per episode: its final event, with the displacement as target
  final_df = df.groupby('game_episode').tail(1).reset_index(drop=True)
  final_df['target_dx'] = final_df['end_x'] - final_df['start_x']
  final_df['target_dy'] = final_df['end_y'] - final_df['start_y']

  return final_df

# Reduce the event-level frame to one row per game_episode with lag features.
# Note: this rebinds process_df, so re-running the cell feeds it its own output.
process_df = last_events_features(process_df)

In [None]:
# final_process_df = final_process_df.drop('Unnamed: 0', axis=1)

In [None]:
# Persist the per-episode feature frame.
# NOTE(review): this directory differs from the one in file_path — confirm it is intentional.
process_df.to_csv('/content/drive/MyDrive/1데이콘/Track1알고리즘부문:K리그-서울시립대공개AI경진대회/dataset/process_df.csv', index=False)

# features selection

In [None]:
# Reload the saved per-episode feature frame and work on a copy for feature selection.
# NOTE(review): this directory differs from the one in file_path — confirm it is intentional.
process_df = pd.read_csv('/content/drive/MyDrive/1데이콘/Track1알고리즘부문:K리그-서울시립대공개AI경진대회/dataset/process_df.csv')
select_df = process_df.copy()

In [None]:
def drop_features(df):
  """Return `df` without the unused metadata columns, columns sorted by name.

  Every name in the drop list must be present in `df`; `Index.drop` raises
  a KeyError otherwise. Prints the number of kept columns and of rows.
  """
  # Columns that are not used as features
  unused_cols = [
     'season_id',
     'competition_id',
     'game_date',
     'competition_name',
     'country_name',
     'season_name',
     'home_team_name',
     'home_team_name_ko',
     'away_team_name',
     'away_team_name_ko',
     'result_name',
     'p_date_str',
     'p_match_date_kst',
     'type_name'
     ]

  # Alphabetical order of the remaining columns (sorting only, no de-duplication)
  kept_cols = sorted(df.columns.drop(unused_cols))

  print(f"최종 사용 피처 수: {len(kept_cols)}")
  print(f"전체 데이터 건수: {len(df)}")

  return df[kept_cols]

# select_df = drop_features(select_df)

In [None]:
# Missing-value imputation
def fill_null(df):
  """Fill NaNs of every numeric column with that column's median, in place.

  Non-numeric columns are left untouched. The same frame object that was
  passed in is modified and returned.
  """
  for name in list(df.select_dtypes(include='number').columns):
    column = df[name]
    df[name] = column.fillna(column.median())

  return df

# select_df = fill_null(select_df)

In [None]:
# Pairwise feature correlation
def corr_pairs(df):
    """Return one row per unordered feature pair, sorted by |correlation|.

    `game_episode`, `venue`, `target_dx` and `target_dy` are excluded (and
    must exist in `df`). The result has the columns `feature_1`,
    `feature_2` and `corr`, strongest absolute correlation first.
    """
    excluded = ['game_episode', 'venue', 'target_dx','target_dy']
    candidates = df.columns.drop(excluded)

    # Correlation matrix flattened to (feature_1, feature_2, corr) rows
    long_form = (
        df[candidates]
        .corr()
        .stack()
        .reset_index()
        .rename(columns={'level_0': 'feature_1',
                         'level_1': 'feature_2',
                         0: 'corr'})
    )

    def pair_key(row):
        # Order-independent key so (a, b) and (b, a) collapse to one pair
        return '-'.join(sorted([row['feature_1'], row['feature_2']]))

    # Drop the diagonal, then keep the first occurrence of every pair
    off_diagonal = long_form[long_form['feature_1'] != long_form['feature_2']].copy()
    off_diagonal['pair'] = off_diagonal.apply(pair_key, axis=1)
    unique_pairs = off_diagonal.drop_duplicates('pair').drop(columns='pair')

    return unique_pairs.sort_values('corr', key=lambda values: values.abs(), ascending=False)

# pairs_df = corr_pairs(select_df)

In [None]:
def normalize_features(df):
  """Standard-scale numeric features and label-encode the rest, in place.

  Numeric columns other than the identifier-like ones listed below are
  replaced by their StandardScaler output. Every non-numeric column except
  `game_episode` is replaced by LabelEncoder codes. All listed columns and
  `game_episode` must exist; `Index.drop` raises a KeyError otherwise.
  The frame passed in is modified and returned.
  """
  # Identifier / flag columns that must keep their raw values
  keep_raw = [
      'game_id',
      'period_id',
      'episode_id',
      'time_seconds',
      'team_id',
      'player_id',
      'action_id',
      'is_home',
      'is_train',
      'game_day',
      'home_team_id',
      'away_team_id',
      'home_score',
      'away_score',
  ]

  scaler = StandardScaler()
  to_scale = df.select_dtypes(include='number').columns.drop(keep_raw).to_list()
  df[to_scale] = scaler.fit_transform(df[to_scale])

  # The encoder is refitted for each column, so codes are independent per column
  encoder = LabelEncoder()
  to_encode = df.select_dtypes(exclude='number').columns.drop('game_episode').to_list()
  for name in to_encode:
    df[name] = encoder.fit_transform(df[name])

  return df

# select_df = normalize_features(select_df)

In [None]:
def final_process(df):
  """Select columns, impute missing numerics, then scale / encode the result.

  Runs drop_features, fill_null and normalize_features in that order;
  corr_pairs is deliberately not part of the chain.
  """
  out = df
  for stage in (drop_features, fill_null, normalize_features):
    out = stage(out)

  return out

final_dfs = final_process(select_df)

최종 사용 피처 수: 111
전체 데이터 건수: 35698


In [None]:
# Persist the final model-ready frame.
# NOTE(review): this directory differs from the one in file_path — confirm it is intentional.
final_dfs.to_csv('/content/drive/MyDrive/1데이콘/Track1알고리즘부문:K리그-서울시립대공개AI경진대회/dataset/final_dfs.csv', index=False)

In [None]:
# Sanity check: (rows, columns) of the final frame.
final_dfs.shape

(35698, 111)