In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

# 🏈 Predicting Blitzes Using Pre-Snap Behavior

**Authors:**  
- Chris Doyle (christopherdoyle@college.harvard.edu)  
- Hans Elasri (hanselasri@college.harvard.edu)  
- Thomas Garity (tgarity@college.harvard.edu)  
- Rishi Hazra (rishihazra@college.harvard.edu)  
- Chris Ruaño (cruano@college.harvard.edu)

---

## Project Summary

Blitzing is one of the most aggressive and high-risk strategies in football. When executed well, it can disrupt an offensive drive by forcing the quarterback into rushed decisions; when misread or mistimed, it can leave the defense vulnerable to big plays. Offensive coaches and quarterbacks spend countless hours studying pre-snap cues to anticipate incoming blitzes, while defenses work just as hard to disguise them through subtle shifts, delayed rushes, and simulated pressures.

Our project aims to bring analytics into this equation by predicting whether a defense will blitz, using only pre-snap player tracking data from the NFL Big Data Bowl 2025 dataset. By extracting features such as player positioning, movement trends, alignment depth, and formation structure, we seek to develop machine learning models that systematically classify plays as blitz or non-blitz scenarios. 

A successful model would not only help identify the most telling pre-snap indicators of pressure but also provide a practical tool for offensive strategists to better anticipate and counter defensive blitzes, enhancing both game preparation and real-time decision-making.

---

## Data Structure

Our data can be downloaded from the 2025 Big Data Bowl on [Kaggle](https://www.kaggle.com/competitions/nfl-big-data-bowl-2025/data)  
or using the Kaggle API:

```
kaggle competitions download -c nfl-big-data-bowl-2025
```

**Instructions:**
- Download the dataset and unzip the file `nfl-big-data-bowl-2025.zip`.
- The datasets will be saved in a `\nfl-big-data-bowl-2025` directory within the project repo.
- Ensure that this directory is listed in your `.gitignore` file to avoid pushing large data files to GitHub.

Now, let's check that the data has been downloaded correctly:

**SETUP + EXTRA PREPROCESSING**
1. Read csv (master_df)
2. Split into train, validation, and test sets
3. Preprocess:
   1. Adjust player positions to be relative to line of scrimmage (add LOS to plays)
   2. Player IDs: one hot encode? Or is there a way to somehow use the player IDs as features?
      1. Incorporate the player's previous stats?
   3. One hot teams, positions, formations


**APPROACH 1:**
1. Preprocess: create samples where each sample is:
   1. A sequence of X plays leading up to the current play
   2. Label of whether the X+1th play was a blitz
2. Run that through an RNN (sequence --> binary classification)
3. Save accuracy

**APPROACH 2:**
1. Preprocess: create samples where each sample is:
   1. Sequence of timestamps leading up to curent play
   2. High-level overview of past X plays
   3. Label of whether the X+1th play was a blitz
2. Run that through a multi-pronged model:
   1. RNN for timestamps
   2. RNN for past plays + current plays (minus the things that they would know -- but you do have team, offense, defense formations etc)
   3. Concatenate activations
   4. Pass through dense layers
   5. Predict
3. Save accuracy

In [3]:
# Import dataframe
df = pd.read_csv('master_df.csv')

  df = pd.read_csv('master_df.csv')


In [None]:
# Inspect the dataframe
df.shape

(381419, 881)

In [None]:
def summarize_plays(plays):
  playIds = plays['playId'].unique()
  summary_ = []
  # Look at a play
  for playId in playIds:
    play = df[df['playId'] == playId]
    # Ensure that the play is sorted by frameId (lowest is first)
    play = play.sort_values(['frameId'], ascending=[True])
    print(len(play))

    # Create a summary row for that play
    summary_row = play.iloc[0]
  

In [None]:
seq_length = 5

# Get list of all unique games
gameIds = df['gameId'].unique()

X = []
y = []

# Look at a game
for gameId in gameIds:
  plays = df[df['gameId'] == gameId].sort_values(['quarter', 'gameClock'], ascending=[True, False])
  # Group by playId and keep only the last frame
  last_frames_df = plays.groupby('playId').tail(1).reset_index(drop=True)
  
  game_length = len(last_frames_df)
  if game_length < seq_length:
    continue

  # Pull target labels
  plays_array = plays[feature_cols].values
  targets_array = plays[target_col].values if target_col in game_plays.columns else None
  play_ids_array = plays[play_id_cols].values


  # Get the sequence of plays usling sliding window
  for i in range(0, game_length - seq_length, step):
    # Get n consecutive plays for X
    sequence = plays_array[i:i+n]
    
    # Skip sequences with NaN values if needed
    if np.isnan(sequence).any():
        continue
        
    # Add the sequence to our dataset
    X.append(sequence)
    
    # Get target from the n+1th play (if target column exists)
    if targets_array is not None:
        y.append(targets_array[i+n])
        
    # Keep track of which play this prediction is for
    play_ids.append(tuple(play_ids_array[i+n]))


  # Convert to numpy arrays
  X = np.array(X)
  y = np.array(y) if targets_array is not None else None

In [53]:

non_numeric_cols = []
for col in df.columns:
    try:
        # Try to convert to numeric - this will work for actual numeric data
        pd.to_numeric(df[col])
        numeric_cols.append(col)
    except:
        # If conversion fails, it's not numeric
        non_numeric_cols.append(col)


In [56]:
print(len(non_numeric_cols))

881


In [58]:
# Simple approach - just examine the column types directly
column_types = df.dtypes
print(f"DataFrame shape: {df.shape}")

# Group by type
type_counts = column_types.value_counts()
print("\nColumn type counts:")
print(type_counts)


DataFrame shape: (381419, 881)

Column type counts:
int64      406
float64    335
object     117
bool        23
Name: count, dtype: int64


In [None]:
# Columns to not include in final data
drop_cols = ['gameId', 'playId', 'frameId']

# One hot encode columns
object_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()


In [60]:
def create_sequences(df, n=5, target_col='blitz', play_id_cols=['gameId', 'playId'], 
                     time_cols=['quarter', 'gameClock'], step=1):
    """
    Create sequences of n consecutive plays for RNN input with overlapping windows.
    
    Parameters:
    -----------
    df : pandas DataFrame
        DataFrame with last frame of each play
    n : int
        Sequence length (number of plays to include in each sequence)
    target_col : str
        Column name for the target variable (blitz indicator)
    play_id_cols : list
        Columns that identify a unique play
    time_cols : list
        Columns to sort by to ensure chronological order
    step : int
        Step size for sliding window (1 = maximum overlap, n = no overlap)
        
    Returns:
    --------
    X : numpy array of shape (num_sequences, n, num_features)
        Sequences of n plays with features
    y : numpy array of shape (num_sequences,)
        Target values indicating whether the n+1th play was a blitz
    play_ids : list of tuples
        Identifiers for the play following each sequence (for reference)
    """
    X = []
    y = []
    play_ids = []
    
    # Get list of unique games
    games = df[play_id_cols[0]].unique()
    
    # Define feature columns (exclude target and any columns we don't want as features)
    exclude_cols = play_id_cols + [target_col] 
    feature_cols = [col for col in df.columns if col not in exclude_cols]
    
    for game_id in games:
        # Get plays for this game and sort chronologically
        game_plays = df[df[play_id_cols[0]] == game_id].sort_values(time_cols)
        
        # Get the length of this game in plays
        game_length = len(game_plays)
        
        # Skip games that are too short for our sequence length
        if game_length <= n:
            continue
            
        # Convert to numpy for faster operations
        plays_array = game_plays[feature_cols].values
        targets_array = game_plays[target_col].values if target_col in game_plays.columns else None
        play_ids_array = game_plays[play_id_cols].values
        
        # Create overlapping windows
        for i in range(0, game_length - n, step):
            # Get n consecutive plays for X
            sequence = plays_array[i:i+n]
            
            # Skip sequences with NaN values if needed
            if np.isnan(sequence).any():
                continue
                
            # Add the sequence to our dataset
            X.append(sequence)
            
            # Get target from the n+1th play (if target column exists)
            if targets_array is not None:
                y.append(targets_array[i+n])
                
            # Keep track of which play this prediction is for
            play_ids.append(tuple(play_ids_array[i+n]))
    
    # Convert to numpy arrays
    X = np.array(X)
    y = np.array(y) if targets_array is not None else None
    
    print(f"Created {len(X)} sequences of length {n}")
    print(f"X shape: {X.shape}")
    if y is not None:
        print(f"y shape: {y.shape}")
    
    return X, y, play_ids

In [None]:
X, y, play_ids = create_sequences(df)