This notebook generates model-ready features from raw season statistics.

**Goals:**
- Create efficiency metrics (e.g., points per game, yards per play).
- Calculate ratios and composite stats (e.g., turnover %, score %).
- Add the target variable `superbowl_winner`: 1 if the team won the Super Bowl for that season, 0 otherwise.

**Tools:**
- pandas, numpy

In [1]:
import pandas as pd

# Load the cleaned team statistics table.
df = pd.read_csv('../data/processed/nfl_team_stats_2003_2023_cleaned.csv')

# Super Bowl champion, keyed by the `year` value of the rows it labels.
winners = {
    2003: 'New England Patriots',
    2004: 'New England Patriots',
    2005: 'Pittsburgh Steelers',
    2006: 'Indianapolis Colts',
    2007: 'New York Giants',
    2008: 'Pittsburgh Steelers',
    2009: 'New Orleans Saints',
    2010: 'Green Bay Packers',
    2011: 'New York Giants',
    2012: 'Baltimore Ravens',
    2013: 'Seattle Seahawks',
    2014: 'New England Patriots',
    2015: 'Denver Broncos',
    2016: 'New England Patriots',
    2017: 'Philadelphia Eagles',
    2018: 'New England Patriots',
    2019: 'Kansas City Chiefs',
    2020: 'Tampa Bay Buccaneers',
    2021: 'Los Angeles Rams',
    2022: 'Kansas City Chiefs',
    2023: 'Kansas City Chiefs',
}

# Target label: 1 on rows whose (year, team) pair appears in `winners`, else 0.
# Mapping year -> champion and comparing against `team` labels every row in a
# single vectorised pass; years absent from `winners` map to NaN and therefore
# compare False.
df['superbowl_winner'] = (df['year'].map(winners) == df['team']).astype('int64')

# A misspelled or renamed team would otherwise leave a season silently without
# a positive label. Require exactly one labelled row for every listed season
# that is present in the data (seasons missing from the data are skipped).
winners_per_season = df.groupby('year')['superbowl_winner'].sum()
bad_seasons = [season for season in winners if winners_per_season.get(season, 1) != 1]
assert not bad_seasons, f"seasons without exactly one labelled champion: {bad_seasons}"

# Choose features
features = ['points_diff', 'score_pct', 'turnover_pct', 'pass_td', 'rush_td', 'penalties']
X = df[features]
y = df['superbowl_winner']

# Show balance of labels
print(y.value_counts())

superbowl_winner
0    651
1     21
Name: count, dtype: int64


In [2]:
# Raw season-total columns used by the efficiency ratios, grouped as
# (numerator, denominator) pairs and flattened into one list.
ratio_input_pairs = [
    ('points', 'g'),
    ('total_yards', 'plays_offense'),
    ('pass_cmp', 'pass_att'),
    ('rush_yds', 'rush_att'),
]
eff_cols = [column for pair in ratio_input_pairs for column in pair]

In [3]:
# Efficiency ratios: engineered column -> (numerator column, denominator column).
ratio_features = {
    'points_per_game': ('points', 'g'),
    'yards_per_play': ('total_yards', 'plays_offense'),
    'completion_rate': ('pass_cmp', 'pass_att'),
    'rush_avg': ('rush_yds', 'rush_att'),
}

for feature, (numerator, denominator) in ratio_features.items():
    ratio = df[numerator] / df[denominator]
    # A zero denominator yields +/-inf (NaN only for 0/0), and fillna() leaves
    # infinities untouched. Map them to NaN so the fill below zeroes them too.
    df[feature] = ratio.replace([float('inf'), float('-inf')], float('nan'))

# NOTE: this fills NaN in every column of the frame, not only the ratios above.
df.fillna(0, inplace=True)

In [4]:
# Summary statistics of the engineered efficiency columns, shown as the cell output.
engineered_cols = ['points_per_game', 'yards_per_play', 'completion_rate', 'rush_avg']
df[engineered_cols].describe()

Unnamed: 0,points_per_game,yards_per_play,completion_rate,rush_avg
count,672.0,672.0,672.0,672.0
mean,22.221327,5.341469,0.618942,4.200477
std,4.450797,0.505036,0.042933,0.440002
min,10.5,3.946225,0.487705,3.13948
25%,18.875,4.991153,0.589315,3.907316
50%,22.150735,5.321297,0.62042,4.172836
75%,25.3125,5.703374,0.652562,4.476632
max,37.875,6.837349,0.734104,5.533333


In [5]:
# Final model inputs: the raw season statistics plus the engineered
# efficiency ratios, in that order.
raw_stat_features = [
    'points_diff', 'score_pct', 'turnover_pct',
    'pass_td', 'rush_td', 'penalties',
]
efficiency_features = [
    'points_per_game', 'yards_per_play', 'completion_rate', 'rush_avg',
]
features = raw_stat_features + efficiency_features

# Feature matrix and target vector taken from the same frame.
y = df['superbowl_winner']
X = df[features]