In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
# Ensure that all tables have been downloaded
print("Data Availabe: ")
os.listdir('./nfl-big-data-bowl-2025/')

Data Availabe: 


['plays.csv',
 'tracking_week_5.csv',
 'tracking_week_4.csv',
 'tracking_week_6.csv',
 'tracking_week_7.csv',
 'tracking_week_3.csv',
 'tracking_week_2.csv',
 'tracking_week_1.csv',
 'players.csv',
 'games.csv',
 'tracking_week_9.csv',
 'tracking_week_8.csv',
 'player_play.csv']

In [3]:
!du -sh nfl-big-data-bowl-2025

7.6G	nfl-big-data-bowl-2025


In [19]:
plays_df = pd.read_csv('./nfl-big-data-bowl-2025/plays.csv')
players_df = pd.read_csv('./nfl-big-data-bowl-2025/players.csv')
games_df = pd.read_csv('./nfl-big-data-bowl-2025/games.csv')
player_play_df = pd.read_csv('./nfl-big-data-bowl-2025/player_play.csv')
tracking_weeks = pd.DataFrame()
for week in range(1, 10):
    current_week = pd.read_csv(f'./nfl-big-data-bowl-2025/tracking_week_{week}.csv')
    tracking_weeks = pd.concat([tracking_weeks, current_week], ignore_index=True)


# NFL Big Data Bowl 2025 Dataset Summary for Blitz Prediction

## 1. `games.csv`
- **Purpose:** Info about each game.
- **Important Variables:**
  - `gameId` (primary key)
  - `season`, `week`
  - `homeTeamAbbr`, `visitorTeamAbbr`
- **Usefulness:** Mostly for joining, basic game context (e.g., week, matchup). Not critical for blitz prediction itself.

---

## 2. `plays.csv`
- **Purpose:** Play-level metadata.
- **Important Variables:**
  - `gameId`, `playId` (keys)
  - `quarter`, `down`, `yardsToGo`
  - `possessionTeam`, `defensiveTeam`
  - `offenseFormation`
  - `playDescription`
  - `isDropback` (Boolean: did QB drop back)
  - `pff_passCoverage` (type of defensive coverage)
  - `pff_manZone` (man vs zone coverage)
  - `playAction` (play-action pass or not)
- **Usefulness:** Crucial for understanding pre-snap situation and defensive alignment; used for labels or features in blitz prediction.

---

## 3. `players.csv`
- **Purpose:** Static player information.
- **Important Variables:**
  - `nflId` (player key)
  - `position`
  - `displayName`
- **Usefulness:** Helpful for interpreting player roles; minor for pure blitz prediction unless modeling specific player tendencies.

---

## 4. `player_play.csv`
- **Purpose:** Player-level stats per play.
- **Important Variables:**
  - `gameId`, `playId`, `nflId` (keys)
  - `teamAbbr`
  - `wasInitialPassRusher` (binary, key for identifying blitzers!)
  - `causedPressure`
  - `timeToPressureAsPassRusher`
  - `getOffAsPassRusher`
- **Usefulness:** **Very important** — allows you to know who blitzed (extra rushers) and pressure dynamics.

---

## 5. `tracking_week_[1-9].csv`
- **Purpose:** Frame-by-frame tracking data (player movement).
- **Important Variables:**
  - `gameId`, `playId`, `nflId`
  - `frameId`, `time`
  - `x`, `y` (player positions)
  - `s`, `a`, `dis` (speed, acceleration, distance moved)
  - `o` (orientation) and `dir` (direction)
  - `event` (snap, ball release, etc.)
- **Usefulness:** **Critical** — Used to generate **pre-snap features** like player alignments, movement, speed, and timing at snap. Core inputs for neural networks.

In [None]:
plays_df.head()

Unnamed: 0,gameId,playId,playDescription,quarter,down,yardsToGo,possessionTeam,defensiveTeam,yardlineSide,yardlineNumber,...,yardsGained,homeTeamWinProbabilityAdded,visitorTeamWinProbilityAdded,expectedPointsAdded,isDropback,pff_runConceptPrimary,pff_runConceptSecondary,pff_runPassOption,pff_passCoverage,pff_manZone
0,2022102302,2655,(1:54) (Shotgun) J.Burrow pass short middle to...,3,1,10,CIN,ATL,CIN,21,...,9,0.004634,-0.004634,0.702717,True,,,0,Cover-3,Zone
1,2022091809,3698,(2:13) (Shotgun) J.Burrow pass short right to ...,4,1,10,CIN,DAL,CIN,8,...,4,0.002847,-0.002847,-0.240509,True,,,0,Quarters,Zone
2,2022103004,3146,(2:00) (Shotgun) D.Mills pass short right to D...,4,3,12,HOU,TEN,HOU,20,...,6,0.000205,-0.000205,-0.21848,True,,,0,Quarters,Zone
3,2022110610,348,(9:28) (Shotgun) P.Mahomes pass short left to ...,1,2,10,KC,TEN,TEN,23,...,4,-0.001308,0.001308,-0.427749,True,,,0,Quarters,Zone
4,2022102700,2799,(2:16) (Shotgun) L.Jackson up the middle to TB...,3,2,8,BAL,TB,TB,27,...,-1,0.027141,-0.027141,-0.638912,False,MAN,READ OPTION,0,Cover-1,Man


In [None]:
players_df.head()

Unnamed: 0,nflId,height,weight,birthDate,collegeName,position,displayName
0,25511,6-4,225,1977-08-03,Michigan,QB,Tom Brady
1,29550,6-4,328,1982-01-22,Arkansas,T,Jason Peters
2,29851,6-2,225,1983-12-02,California,QB,Aaron Rodgers
3,30842,6-6,267,1984-05-19,UCLA,TE,Marcedes Lewis
4,33084,6-4,217,1985-05-17,Boston College,QB,Matt Ryan


In [None]:
games_df.head()

Unnamed: 0,gameId,season,week,gameDate,gameTimeEastern,homeTeamAbbr,visitorTeamAbbr,homeFinalScore,visitorFinalScore
0,2022090800,2022,1,9/8/2022,20:20:00,LA,BUF,10,31
1,2022091100,2022,1,9/11/2022,13:00:00,ATL,NO,26,27
2,2022091101,2022,1,9/11/2022,13:00:00,CAR,CLE,24,26
3,2022091102,2022,1,9/11/2022,13:00:00,CHI,SF,19,10
4,2022091103,2022,1,9/11/2022,13:00:00,CIN,PIT,20,23


In [None]:
player_play_df.head()

Unnamed: 0,gameId,playId,nflId,teamAbbr,hadRushAttempt,rushingYards,hadDropback,passingYards,sackYardsAsOffense,hadPassReception,...,wasRunningRoute,routeRan,blockedPlayerNFLId1,blockedPlayerNFLId2,blockedPlayerNFLId3,pressureAllowedAsBlocker,timeToPressureAllowedAsBlocker,pff_defensiveCoverageAssignment,pff_primaryDefensiveCoverageMatchupNflId,pff_secondaryDefensiveCoverageMatchupNflId
0,2022090800,56,35472,BUF,0,0,0,0,0,0,...,,,47917.0,,,0.0,,,,
1,2022090800,56,42392,BUF,0,0,0,0,0,0,...,,,47917.0,,,0.0,,,,
2,2022090800,56,42489,BUF,0,0,0,0,0,1,...,1.0,IN,,,,,,,,
3,2022090800,56,44875,BUF,0,0,0,0,0,0,...,,,43335.0,,,0.0,,,,
4,2022090800,56,44985,BUF,0,0,0,0,0,0,...,1.0,OUT,,,,,,,,


In [None]:
tracking_weeks.head()

Unnamed: 0,gameId,playId,nflId,displayName,frameId,frameType,time,jerseyNumber,club,playDirection,x,y,s,a,dis,o,dir,event
0,2022091200,64,35459.0,Kareem Jackson,1,BEFORE_SNAP,2022-09-13 00:16:03.5,22.0,DEN,right,51.06,28.55,0.72,0.37,0.07,246.17,68.34,huddle_break_offense
1,2022091200,64,35459.0,Kareem Jackson,2,BEFORE_SNAP,2022-09-13 00:16:03.6,22.0,DEN,right,51.13,28.57,0.71,0.36,0.07,245.41,71.21,
2,2022091200,64,35459.0,Kareem Jackson,3,BEFORE_SNAP,2022-09-13 00:16:03.7,22.0,DEN,right,51.2,28.59,0.69,0.23,0.07,244.45,69.9,
3,2022091200,64,35459.0,Kareem Jackson,4,BEFORE_SNAP,2022-09-13 00:16:03.8,22.0,DEN,right,51.26,28.62,0.67,0.22,0.07,244.45,67.98,
4,2022091200,64,35459.0,Kareem Jackson,5,BEFORE_SNAP,2022-09-13 00:16:03.9,22.0,DEN,right,51.32,28.65,0.65,0.34,0.07,245.74,62.83,


We only include BEFORE_SNAP data because we are trying to predict if they are going to blitz, and that is the only information available for making this decision in the actual play.

In [None]:
tracking_weeks_filtered = tracking_weeks[tracking_weeks['frameType'] == 'BEFORE_SNAP' ]
tracking_weeks_filtered.head()

Unnamed: 0,gameId,playId,nflId,displayName,frameId,frameType,time,jerseyNumber,club,playDirection,x,y,s,a,dis,o,dir,event
0,2022091200,64,35459.0,Kareem Jackson,1,BEFORE_SNAP,2022-09-13 00:16:03.5,22.0,DEN,right,51.06,28.55,0.72,0.37,0.07,246.17,68.34,huddle_break_offense
1,2022091200,64,35459.0,Kareem Jackson,2,BEFORE_SNAP,2022-09-13 00:16:03.6,22.0,DEN,right,51.13,28.57,0.71,0.36,0.07,245.41,71.21,
2,2022091200,64,35459.0,Kareem Jackson,3,BEFORE_SNAP,2022-09-13 00:16:03.7,22.0,DEN,right,51.2,28.59,0.69,0.23,0.07,244.45,69.9,
3,2022091200,64,35459.0,Kareem Jackson,4,BEFORE_SNAP,2022-09-13 00:16:03.8,22.0,DEN,right,51.26,28.62,0.67,0.22,0.07,244.45,67.98,
4,2022091200,64,35459.0,Kareem Jackson,5,BEFORE_SNAP,2022-09-13 00:16:03.9,22.0,DEN,right,51.32,28.65,0.65,0.34,0.07,245.74,62.83,


In [None]:
master_df = pd.DataFrame()

In [None]:
# Merge player_play_df with players_df to get player names
master_df = player_play_df.merge(players_df[['nflId', 'displayName', 'position']], on='nflId', how='left')
master_df.head()

Unnamed: 0,gameId,playId,nflId,teamAbbr,hadRushAttempt,rushingYards,hadDropback,passingYards,sackYardsAsOffense,hadPassReception,...,blockedPlayerNFLId1,blockedPlayerNFLId2,blockedPlayerNFLId3,pressureAllowedAsBlocker,timeToPressureAllowedAsBlocker,pff_defensiveCoverageAssignment,pff_primaryDefensiveCoverageMatchupNflId,pff_secondaryDefensiveCoverageMatchupNflId,displayName,position
0,2022090800,56,35472,BUF,0,0,0,0,0,0,...,47917.0,,,0.0,,,,,Rodger Saffold,G
1,2022090800,56,42392,BUF,0,0,0,0,0,0,...,47917.0,,,0.0,,,,,Mitch Morse,C
2,2022090800,56,42489,BUF,0,0,0,0,0,1,...,,,,,,,,,Stefon Diggs,WR
3,2022090800,56,44875,BUF,0,0,0,0,0,0,...,43335.0,,,0.0,,,,,Dion Dawkins,T
4,2022090800,56,44985,BUF,0,0,0,0,0,0,...,,,,,,,,,Isaiah McKenzie,WR


In [14]:
# Merge player_play_df with plays_df on both playId and gameId
master_df = master_df.merge(plays_df, on=['playId', 'gameId'], how='left')
master_df.head()

Unnamed: 0,gameId,playId,nflId,teamAbbr,hadRushAttempt,rushingYards,hadDropback,passingYards,sackYardsAsOffense,hadPassReception,...,yardsGained,homeTeamWinProbabilityAdded,visitorTeamWinProbilityAdded,expectedPointsAdded,isDropback,pff_runConceptPrimary,pff_runConceptSecondary,pff_runPassOption,pff_passCoverage,pff_manZone
0,2022090800,56,35472,BUF,0,0,0,0,0,0,...,6,-3.1e-05,3.1e-05,0.00442,True,,,0,Cover 6-Left,Zone
1,2022090800,56,42392,BUF,0,0,0,0,0,0,...,6,-3.1e-05,3.1e-05,0.00442,True,,,0,Cover 6-Left,Zone
2,2022090800,56,42489,BUF,0,0,0,0,0,1,...,6,-3.1e-05,3.1e-05,0.00442,True,,,0,Cover 6-Left,Zone
3,2022090800,56,44875,BUF,0,0,0,0,0,0,...,6,-3.1e-05,3.1e-05,0.00442,True,,,0,Cover 6-Left,Zone
4,2022090800,56,44985,BUF,0,0,0,0,0,0,...,6,-3.1e-05,3.1e-05,0.00442,True,,,0,Cover 6-Left,Zone


In [None]:
# Merge master_df with games_df on gameId
master_df = master_df.merge(games_df, on='gameId', how='left')
master_df.head()

Unnamed: 0,gameId,playId,nflId,teamAbbr,hadRushAttempt,rushingYards,hadDropback,passingYards,sackYardsAsOffense,hadPassReception,...,pff_passCoverage,pff_manZone,season,week,gameDate,gameTimeEastern,homeTeamAbbr,visitorTeamAbbr,homeFinalScore,visitorFinalScore
0,2022090800,56,35472,BUF,0,0,0,0,0,0,...,Cover 6-Left,Zone,2022,1,9/8/2022,20:20:00,LA,BUF,10,31
1,2022090800,56,42392,BUF,0,0,0,0,0,0,...,Cover 6-Left,Zone,2022,1,9/8/2022,20:20:00,LA,BUF,10,31
2,2022090800,56,42489,BUF,0,0,0,0,0,1,...,Cover 6-Left,Zone,2022,1,9/8/2022,20:20:00,LA,BUF,10,31
3,2022090800,56,44875,BUF,0,0,0,0,0,0,...,Cover 6-Left,Zone,2022,1,9/8/2022,20:20:00,LA,BUF,10,31
4,2022090800,56,44985,BUF,0,0,0,0,0,0,...,Cover 6-Left,Zone,2022,1,9/8/2022,20:20:00,LA,BUF,10,31


: 

In [None]:
# Merge master_df with tracking_weeks on gameId, playId, and nflId
master_df = master_df.merge(tracking_weeks, on=['gameId', 'playId', 'nflId'], how='left')
master_df.head()

In [17]:
tracking_weeks.head()

Unnamed: 0,gameId,playId,nflId,displayName,frameId,frameType,time,jerseyNumber,club,playDirection,x,y,s,a,dis,o,dir,event,original_playId
0,2022091200,W1_64,35459.0,Kareem Jackson,1,BEFORE_SNAP,2022-09-13 00:16:03.5,22.0,DEN,right,51.06,28.55,0.72,0.37,0.07,246.17,68.34,huddle_break_offense,64
1,2022091200,W1_64,35459.0,Kareem Jackson,2,BEFORE_SNAP,2022-09-13 00:16:03.6,22.0,DEN,right,51.13,28.57,0.71,0.36,0.07,245.41,71.21,,64
2,2022091200,W1_64,35459.0,Kareem Jackson,3,BEFORE_SNAP,2022-09-13 00:16:03.7,22.0,DEN,right,51.2,28.59,0.69,0.23,0.07,244.45,69.9,,64
3,2022091200,W1_64,35459.0,Kareem Jackson,4,BEFORE_SNAP,2022-09-13 00:16:03.8,22.0,DEN,right,51.26,28.62,0.67,0.22,0.07,244.45,67.98,,64
4,2022091200,W1_64,35459.0,Kareem Jackson,5,BEFORE_SNAP,2022-09-13 00:16:03.9,22.0,DEN,right,51.32,28.65,0.65,0.34,0.07,245.74,62.83,,64


## Generate IsBlitz Outcome Variable

In [22]:
# how many unique plays are there, figure this out by finding unique combinations of gameId and playId using tracking_weeks
unique_plays = tracking_weeks[['gameId', 'playId']].drop_duplicates()
print(f"Number of unique plays: {len(unique_plays)}")


Number of unique plays: 16124


In [20]:
plays_subset = plays_df[['gameId', 'playId', 'yardlineNumber', 'yardlineSide', 'absoluteYardlineNumber']]
tracking = tracking_weeks.merge(plays_subset, on=['gameId', 'playId'], how='left')

# Keep only frames right at ball snap, where frameType is 'SNAP'
snap_tracking = tracking[tracking['frameType'] == 'SNAP']
# Calculate LOS x-coordinate
def calculate_los_x(playDirection, absoluteYardlineNumber):
    if playDirection == 'left':
        return 100 - absoluteYardlineNumber
    else:  # playDirection == 'right'
        return absoluteYardlineNumber

snap_tracking['LOS_x'] = snap_tracking.apply(
    lambda row: calculate_los_x(row['playDirection'], row['absoluteYardlineNumber']), axis=1
)

# Distance from LOS
snap_tracking['dist_from_LOS'] = np.abs(snap_tracking['x'] - snap_tracking['LOS_x'])

# Motion toward ball based on play direction
def motion_toward_ball(row):
    if row['playDirection'] == 'left':
        return np.cos(np.deg2rad(row['dir']))  # left = 180
    else:  # right
        return np.cos(np.deg2rad(row['dir'] - 180))

snap_tracking['motion_toward_ball'] = snap_tracking.apply(motion_toward_ball, axis=1)

# Velocity toward LOS
snap_tracking['velocity_toward_LOS'] = snap_tracking['s'] * snap_tracking['motion_toward_ball']

# Likely blitzer: close to LOS and moving fast toward ball
snap_tracking['isLikelyBlitzer'] = (
    (snap_tracking['dist_from_LOS'] <= 5) &
    (snap_tracking['velocity_toward_LOS'] > 1.5)
).astype(int)

# Count total likely blitzers
num_likely_blitzers = snap_tracking['isLikelyBlitzer'].sum()
print(f"Total number of players flagged as likely blitzers: {num_likely_blitzers}")

# Optional: count blitzers per play
blitzer_counts_by_play = snap_tracking.groupby(['gameId', 'playId'])['isLikelyBlitzer'].sum().reset_index()
print(blitzer_counts_by_play.head())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  snap_tracking['LOS_x'] = snap_tracking.apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  snap_tracking['dist_from_LOS'] = np.abs(snap_tracking['x'] - snap_tracking['LOS_x'])


Total number of players flagged as likely blitzers: 1890
       gameId  playId  isLikelyBlitzer
0  2022090800      56                0
1  2022090800      80                0
2  2022090800     101                0
3  2022090800     122                0
4  2022090800     167                0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  snap_tracking['motion_toward_ball'] = snap_tracking.apply(motion_toward_ball, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  snap_tracking['velocity_toward_LOS'] = snap_tracking['s'] * snap_tracking['motion_toward_ball']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  snap_tracking['isLikely

In [25]:
# Count plays with at least one blitzer
plays_with_blitzers = blitzer_counts_by_play[blitzer_counts_by_play['isLikelyBlitzer'] > 0]
num_plays_with_blitzers = len(plays_with_blitzers)

# Count total unique plays
total_plays = len(blitzer_counts_by_play)

# Calculate percentage
blitz_percentage = (num_plays_with_blitzers / total_plays) * 100 if total_plays > 0 else 0

print(f"Total plays: {total_plays}")
print(f"Plays with at least one blitzer: {num_plays_with_blitzers}")
print(f"Percentage of plays with blitzers: {blitz_percentage:.2f}%")

# Calculate total pass plays:
# how many pass plays are there in tracking_weeks using isDropback
pass_plays = plays_df[plays_df['isDropback'] == 1]
print(f"Number of pass plays: {len(pass_plays)}")

# Calculate percentage of pass plays with blitzers
pass_plays_with_blitzers = pass_plays[pass_plays['playId'].isin(plays_with_blitzers['playId'])]
num_pass_plays_with_blitzers = len(pass_plays_with_blitzers)
pass_plays_total = len(pass_plays)
pass_blitz_percentage = (num_pass_plays_with_blitzers / pass_plays_total) * 100 if pass_plays_total > 0 else 0
print(f"Total pass plays: {pass_plays_total}")
print(f"Pass plays with at least one blitzer: {num_pass_plays_with_blitzers}")
print(f"Percentage of pass plays with blitzers: {pass_blitz_percentage:.2f}%")

# Optional: Breakdown of plays by number of blitzers
blitzer_distribution = blitzer_counts_by_play['isLikelyBlitzer'].value_counts().sort_index()
print("\nDistribution of blitzers per play:")
print(blitzer_distribution)

# Optional: You might want to classify a "blitz play" as having more than 1 blitzer
threshold_for_blitz_play = 1  # Adjust this threshold as needed
plays_with_significant_blitz = blitzer_counts_by_play[blitzer_counts_by_play['isLikelyBlitzer'] > threshold_for_blitz_play]
print(f"\nPlays with more than {threshold_for_blitz_play} blitzers: {len(plays_with_significant_blitz)}")
print(f"Percentage: {(len(plays_with_significant_blitz) / total_plays) * 100:.2f}%")

Total plays: 16124
Plays with at least one blitzer: 1237
Percentage of plays with blitzers: 7.67%
Number of pass plays: 9736
Total pass plays: 9736
Pass plays with at least one blitzer: 3051
Percentage of pass plays with blitzers: 31.34%

Distribution of blitzers per play:
isLikelyBlitzer
0    14887
1      751
2      370
3       81
4       23
5        9
6        2
7        1
Name: count, dtype: int64

Plays with more than 1 blitzers: 486
Percentage: 3.01%
