In [4]:
!pip install optuna





In [29]:
from scipy.spatial import cKDTree
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import RFE
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import optuna
import xgboost as xgb

In [30]:
# Project directory path
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere until it is changed.
project_dir = r'C:\Users\82106\Desktop\코드 리뷰\data\nfl-big-data-bowl-2025'

# Load the required datasets
games = pd.read_csv(f'{project_dir}/games.csv')
plays = pd.read_csv(f'{project_dir}/plays.csv')
players = pd.read_csv(f'{project_dir}/players.csv')
player_play = pd.read_csv(f'{project_dir}/player_play.csv')
# NOTE(review): the week 1 and week 2 tracking files are read again by the
# per-week loading loop further down; confirm these two frames are still needed.
tracking_week_1 = pd.read_csv(f'{project_dir}/tracking_week_1.csv')
tracking_week_2 = pd.read_csv(f'{project_dir}/tracking_week_2.csv')

In [31]:
# Convert the gameDate column to datetime so it can be compared against dates
games['gameDate'] = pd.to_datetime(games['gameDate'])

In [32]:
# Week 1 game between San Francisco (SF) and Chicago (CHI), whichever side hosted
sf_hosts_chi = (games['homeTeamAbbr'] == 'SF') & (games['visitorTeamAbbr'] == 'CHI')
chi_hosts_sf = (games['homeTeamAbbr'] == 'CHI') & (games['visitorTeamAbbr'] == 'SF')
played_in_week_1 = games['week'] == 1
sf_chi_games = games[(sf_hosts_chi | chi_hosts_sf) & played_in_week_1]

### San Francisco 49ers(SF)의 2022시즌 1-8주차 경기들을 필터링

In [33]:
def _select_sf_game(opponent, week, game_date):
    """Return the rows of ``games`` for one San Francisco 49ers matchup.

    Parameters
    ----------
    opponent : str
        Team abbreviation of the opponent (e.g. ``'SEA'``).
    week : int
        Value the ``week`` column must equal.
    game_date : str
        Date the ``gameDate`` column must equal (``'YYYY-MM-DD'``).

    Returns
    -------
    pandas.DataFrame
        Rows of the notebook-level ``games`` frame where SF and ``opponent``
        meet (either team at home) in the given week on the given date.
    """
    sf_at_home = (games['homeTeamAbbr'] == 'SF') & (games['visitorTeamAbbr'] == opponent)
    sf_on_road = (games['homeTeamAbbr'] == opponent) & (games['visitorTeamAbbr'] == 'SF')
    return games[
        (sf_at_home | sf_on_road) &
        (games['week'] == week) &
        (games['gameDate'] == game_date)
    ]


# Week 2: San Francisco 49ers vs Seattle Seahawks on 09/18/2022
sf_sea_games = _select_sf_game('SEA', week=2, game_date='2022-09-18')

# Week 3: San Francisco 49ers vs Denver Broncos on 09/25/2022
sf_den_games = _select_sf_game('DEN', week=3, game_date='2022-09-25')

# Week 4: San Francisco 49ers vs Los Angeles Rams on 10/03/2022
sf_lar_games_w4 = _select_sf_game('LAR', week=4, game_date='2022-10-03')

# Week 5: San Francisco 49ers vs Carolina Panthers on 10/09/2022
sf_car_games = _select_sf_game('CAR', week=5, game_date='2022-10-09')

# Week 6: San Francisco 49ers vs Atlanta Falcons on 10/16/2022
sf_atl_games = _select_sf_game('ATL', week=6, game_date='2022-10-16')

# Week 7: San Francisco 49ers vs Kansas City Chiefs on 10/23/2022
sf_kc_games = _select_sf_game('KC', week=7, game_date='2022-10-23')

# Week 8: San Francisco 49ers vs Los Angeles Rams on 10/30/2022
sf_lar_games_w8 = _select_sf_game('LAR', week=8, game_date='2022-10-30')

In [34]:
# Stack the eight single-game selections and collect their game ids as a list
sf_game_frames = [
    sf_chi_games,
    sf_sea_games,
    sf_den_games,
    sf_lar_games_w4,
    sf_car_games,
    sf_atl_games,
    sf_kc_games,
    sf_lar_games_w8,
]
selected_games = pd.concat(sf_game_frames)
selected_game_ids = selected_games['gameId'].to_list()

In [35]:
# Keep only the rows of plays and player_play that belong to the selected games
play_in_selected_game = plays['gameId'].isin(selected_game_ids)
plays = plays.loc[play_in_selected_game]
player_play = player_play.loc[player_play['gameId'].isin(selected_game_ids)]

In [36]:

tracking_data_list = []
# Iterate over weeks 1 through 8 (range(1, 9) stops before 9)
for week in range(1, 9):
    # Load this week's tracking data
    tracking_week = pd.read_csv(f'{project_dir}/tracking_week_{week}.csv')
    # Keep only the tracking rows whose gameId is in selected_game_ids
    tracking_week_filtered = tracking_week[tracking_week['gameId'].isin(selected_game_ids)]
    tracking_data_list.append(tracking_week_filtered)


In [37]:
# Stack the per-week tracking frames into one table with a fresh index
tracking_data = pd.concat(tracking_data_list, ignore_index=True)
# Keep only the frames whose frameType is 'BEFORE_SNAP'
tracking_data = tracking_data.loc[tracking_data['frameType'].eq('BEFORE_SNAP')]

In [38]:
# Attach each game's home / visitor team abbreviations to its plays
team_abbr_by_game = games[['gameId', 'homeTeamAbbr', 'visitorTeamAbbr']]
plays = plays.merge(team_abbr_by_game, how='left', on='gameId')

# Target variable 'playType': isDropback True -> 'pass', False -> 'run'
play_type_by_dropback = {True: 'pass', False: 'run'}
plays['playType'] = plays['isDropback'].map(play_type_by_dropback)
# Discard plays that did not map to either label
plays = plays.loc[plays['playType'].isin(['run', 'pass'])]

### Feature Engineering


#### 게임 시간 정보

In [39]:
# Pre-snap score margin: home score minus visitor score (negative when the visitor leads)
plays['score_difference'] = plays['preSnapHomeScore'].sub(plays['preSnapVisitorScore'])

# Turns a clock string into seconds, e.g. '1:30' -> 90
def parse_game_clock(clock_str):
    """Convert a 'minutes:seconds' game-clock string into total seconds.

    Raises ValueError when the string does not consist of exactly two
    integer parts separated by ':'.
    """
    minute_part, second_part = clock_str.split(':')
    return int(minute_part) * 60 + int(second_part)

# Seconds left in regulation: 15-minute quarters still to come after the
# current one, plus the time left on the current quarter's clock.
# Computed column-wise instead of with a row-wise apply(axis=1): same values,
# much faster, and it also works when `plays` is empty.
# NOTE(review): for quarter 5 (overtime) the first term is negative, so the
# result can drop below the clock value -- confirm this is intended.
plays['seconds_remaining'] = (
    (4 - plays['quarter']) * 15 * 60
    + plays['gameClock'].map(parse_game_clock)
)

#### 플레이어 포메이션 변수

In [40]:
# One-hot encode the offensive formation and append the indicator columns to plays
formation_dummies = pd.get_dummies(data=plays['offenseFormation'], prefix='formation')
plays = pd.concat(objs=[plays, formation_dummies], axis='columns')

#### 플레이어 모션 변수

In [41]:
# Per play (gameId, playId), sum the three motion / shift flags over its player rows
motion_flag_columns = ['inMotionAtBallSnap', 'shiftSinceLineset', 'motionSinceLineset']
motion_features = (
    player_play
    .groupby(['gameId', 'playId'])
    .agg({flag: 'sum' for flag in motion_flag_columns})
    .reset_index()
)

# Attach the per-play totals; plays without player_play rows get NaN
plays = plays.merge(motion_features, how='left', on=['gameId', 'playId'])

#### 디펜스 커버 변수

In [42]:
# One-hot encode the pass-coverage label; `prefix` is prepended to the name
# of every generated indicator column
coverage_dummies = pd.get_dummies(data=plays['pff_passCoverage'], prefix='coverage')
plays = pd.concat(objs=[plays, coverage_dummies], axis='columns')

#### Pre_snap player 포지션 변수

In [43]:
# Compute the mean pre-snap position of the offense and of the defense
def get_presnap_positions(group):
    """Average pre-snap x/y position of the offense and the defense for one play.

    Written to be used as
    ``tracking_data.groupby(['gameId', 'playId']).apply(get_presnap_positions)``.
    The team in possession is looked up in the notebook-level ``plays`` frame.

    Parameters
    ----------
    group : pandas.DataFrame
        Tracking rows of a single (gameId, playId). Reads the columns
        ``frameType``, ``gameId``, ``playId``, ``club``, ``x``, ``y`` and,
        when present, ``nflId``.

    Returns
    -------
    pandas.Series
        ``presnap_offense_x_mean``, ``presnap_offense_y_mean``,
        ``presnap_defense_x_mean`` and ``presnap_defense_y_mean``. All four are
        NaN when the group has no 'BEFORE_SNAP' rows or when the play is not
        found in ``plays``.
    """
    missing = pd.Series({'presnap_offense_x_mean': np.nan, 'presnap_offense_y_mean': np.nan,
                         'presnap_defense_x_mean': np.nan, 'presnap_defense_y_mean': np.nan})

    presnap = group[group['frameType'] == 'BEFORE_SNAP']
    if presnap.empty:
        return missing

    # Every row of the group shares the same keys, so the first row is enough
    game_id = group['gameId'].iloc[0]
    play_id = group['playId'].iloc[0]

    possession = plays.loc[
        (plays['gameId'] == game_id) & (plays['playId'] == play_id),
        'possessionTeam'
    ]
    if possession.empty:
        # The play is absent from `plays`; report missing values instead of
        # failing with IndexError on .iloc[0].
        return missing
    possession_team = possession.iloc[0]

    # Rows without an nflId are the football (same rule as identify_team_type).
    # They must not be counted as defenders, or the ball's position would
    # distort the defensive means.
    if 'nflId' in presnap.columns:
        presnap = presnap[presnap['nflId'].notna()]

    # Split with a boolean mask rather than writing a helper column onto the
    # slice, which would trigger SettingWithCopyWarning.
    on_offense = presnap['club'] == possession_team
    offense = presnap[on_offense]
    defense = presnap[~on_offense]

    return pd.Series({
        'presnap_offense_x_mean': offense['x'].mean(),
        'presnap_offense_y_mean': offense['y'].mean(),
        'presnap_defense_x_mean': defense['x'].mean(),
        'presnap_defense_y_mean': defense['y'].mean()
    })

In [44]:
# One row of mean pre-snap offense / defense coordinates per (gameId, playId)
presnap_positions = (
    tracking_data
    .groupby(['gameId', 'playId'])
    .apply(get_presnap_positions)
    .reset_index()
)
# Attach those coordinates to the play-level table
plays = plays.merge(presnap_positions, how='left', on=['gameId', 'playId'])

In [45]:
# Give every tracking row its play's possession team and the two team abbreviations
play_context_columns = ['gameId', 'playId', 'possessionTeam', 'homeTeamAbbr', 'visitorTeamAbbr']
tracking_data = tracking_data.merge(
    plays[play_context_columns],
    how='left',
    on=['gameId', 'playId']
)

In [46]:
def identify_team_type(row):
    """Classify one tracking row as 'football', 'offense' or 'defense'.

    A row with a missing ``nflId`` is the football. Otherwise the row is
    'offense' when its ``club`` equals ``possessionTeam`` and 'defense' when
    it does not.
    """
    if pd.isna(row['nflId']):
        return 'football'
    return 'offense' if row['club'] == row['possessionTeam'] else 'defense'
    
# Label every tracking row with the same rules as identify_team_type
# (missing nflId -> 'football'; club equal to possessionTeam -> 'offense';
# anything else -> 'defense'). Done column-wise with np.where because calling
# a Python function once per row via apply(axis=1) is very slow on a table of
# tracking frames, and apply(axis=1) misbehaves on an empty frame.
tracking_data['teamType'] = np.where(
    tracking_data['nflId'].isna(),
    'football',
    np.where(tracking_data['club'] == tracking_data['possessionTeam'], 'offense', 'defense')
)

In [None]:
def is_offense(row):
    """Return 1 when the row's ``teamType`` is 'offense', otherwise 0."""
    if row['teamType'] == 'offense':
        return 1
    return 0