# 📊 NFL Combine & Draft Analytics
### Predicting NFL Success from Combine Metrics
This notebook explores the relationship between NFL Combine performance and career success.

In [6]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import requests
from bs4 import BeautifulSoup
import glob

## 📂 Load Dataset
We'll load a dataset containing NFL Combine stats taken from [ProFootball Reference](https://www.pro-football-reference.com/draft/2024-combine.htm) from 2000-2024.

In [7]:
# Load dataset (update with actual URL or file path)
csv_files = glob.glob("data/2024.csv")

df_list = [pd.read_csv(f) for f in csv_files]
raw_df = pd.concat(df_list, ignore_index=True)
raw_df.head()

Unnamed: 0,Player,Pos,School,College,Ht,Wt,40yd,Vertical,Bench,Broad Jump,3Cone,Shuttle,Drafted (tm/rnd/yr),Player-additional
0,Kris Abrams-Draine,CB,Missouri,College Stats,5-11,179,4.44,33.5,,,,,Denver Broncos / 5th / 145th pick / 2024,AbraKr00
1,Isaiah Adams,G,Illinois,College Stats,6-4,315,5.22,24.5,,102.0,7.77,4.73,Arizona Cardinals / 3rd / 71st pick / 2024,AdamIs01
2,Rasheen Ali,RB,Marshall,College Stats,5-11,206,,,,,,,Baltimore Ravens / 5th / 165th pick / 2024,AlixRa00
3,Erick All,TE,Iowa,College Stats,6-4,252,,,,,,,Cincinnati Bengals / 4th / 115th pick / 2024,AllxEr00
4,Braelon Allen,RB,Wisconsin,College Stats,6-1,235,,32.0,26.0,117.0,,,New York Jets / 4th / 134th pick / 2024,AlleBr05


## 🧹 Data Cleaning & Preprocessing
Expand the "Drafted" column and convert relevant columns to numeric and handle missing values.

In [8]:

df = raw_df.copy()
df[["Team", "Round", "Pick", "Year"]] = df["Drafted (tm/rnd/yr)"].str.split(" / ", expand=True)
df.drop(columns=['Drafted (tm/rnd/yr)'], inplace=True)

duplicate_players = df["Player"].duplicated(keep=False)
print(df[duplicate_players])

df


Empty DataFrame
Columns: [Player, Pos, School, College, Ht, Wt, 40yd, Vertical, Bench, Broad Jump, 3Cone, Shuttle, Player-additional, Team, Round, Pick, Year]
Index: []


Unnamed: 0,Player,Pos,School,College,Ht,Wt,40yd,Vertical,Bench,Broad Jump,3Cone,Shuttle,Player-additional,Team,Round,Pick,Year
0,Kris Abrams-Draine,CB,Missouri,College Stats,5-11,179,4.44,33.5,,,,,AbraKr00,Denver Broncos,5th,145th pick,2024
1,Isaiah Adams,G,Illinois,College Stats,6-4,315,5.22,24.5,,102.0,7.77,4.73,AdamIs01,Arizona Cardinals,3rd,71st pick,2024
2,Rasheen Ali,RB,Marshall,College Stats,5-11,206,,,,,,,AlixRa00,Baltimore Ravens,5th,165th pick,2024
3,Erick All,TE,Iowa,College Stats,6-4,252,,,,,,,AllxEr00,Cincinnati Bengals,4th,115th pick,2024
4,Braelon Allen,RB,Wisconsin,College Stats,6-1,235,,32.0,26.0,117.0,,,AlleBr05,New York Jets,4th,134th pick,2024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
316,Roman Wilson,WR,Michigan,College Stats,5-11,185,4.39,,12.0,,,,WilsRo02,Pittsburgh Steelers,3rd,84th pick,2024
317,Mekhi Wingo,DT,LSU,College Stats,6-0,284,4.85,31.5,25.0,109.0,,,WingMe00,Detroit Lions,6th,189th pick,2024
318,Xavier Worthy,WR,Texas,College Stats,5-11,165,4.21,41.0,,131.0,,,WortXa00,Kansas City Chiefs,1st,28th pick,2024
319,Jaylen Wright,RB,Tennessee,College Stats,5-11,210,4.38,38.0,,134.0,,,WrigJa04,Miami Dolphins,4th,120th pick,2024


## 📈 Correlation Analysis
Let's analyze how Combine metrics correlate with draft position and career success.

In [9]:
plt.figure(figsize=(10, 6))
sns.heatmap(df[['40yd', 'BenchReps', 'Vertical', 'Broad Jump', 'Draft Pick']].corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Between NFL Combine Metrics & Draft Position')
plt.show()

KeyError: "['BenchReps', 'Draft Pick'] not in index"

<Figure size 1000x600 with 0 Axes>

## 🚀 Does Speed Predict NFL Success?
A scatterplot showing the relationship between 40-yard dash times and career Approximate Value (AV).

In [None]:
plt.figure(figsize=(10, 6))
sns.scatterplot(x=df['40yd'], y=df['Career AV'], hue=df['Position'], alpha=0.7)
plt.xlabel('40-Yard Dash Time (s)')
plt.ylabel('Career Approximate Value (AV)')
plt.title('Does Speed Predict NFL Success?')
plt.show()

NameError: name 'plt' is not defined

## 🏆 Next Steps
- Build a predictive model to estimate career success from Combine stats.
- Compare trends for different positions (QB, WR, etc.).
- Identify potential draft steals and busts.

# Linear Regression to Model Rookie NFL Receiving Yards per Catch:

Question: How relevant is combine data alone in predicting NFL WR Stars?

Isolating NFL Rookies from year in the combine, linear regressing combine data to try and predict their receiving yards over the next 3 years
 

In [2]:
import pandas as pd
import glob

combine_list = []

for i in range(2000, 2021):
    file_path = glob.glob(f"data/{i}.csv") 
    
    if file_path:  
        temp = pd.read_csv(file_path[0]) 
        temp["Year"] = i 
        combine_list.append(temp)  

combine_csv = pd.concat(combine_list, ignore_index=True)
combine_csv = combine_csv[combine_csv["Pos"] == "WR"]
combine_csv.tail(20)



Unnamed: 0,Player,Pos,School,College,Ht,Wt,40yd,Vertical,Bench,Broad Jump,3Cone,Shuttle,Drafted (tm/rnd/yr),Player-additional,Year
6751,Kalija Lipscomb,WR,Vanderbilt,College Stats,6-0,207,4.57,32.0,16.0,127.0,,,,LipsKa01,2020
6755,Austin Mack,WR,Ohio State,College Stats,6-1,208,4.59,31.5,,117.0,,4.42,,MackAu01,2020
6767,Denzel Mims,WR,Baylor,College Stats,6-3,207,4.38,38.5,16.0,131.0,6.66,4.43,New York Jets / 2nd / 59th pick / 2020,MimsDe00,2020
6771,Darnell Mooney,WR,Tulane,College Stats,5-10,176,4.38,37.0,9.0,124.0,,,Chicago Bears / 5th / 173rd pick / 2020,MoonDa00,2020
6789,K.J. Osborn,WR,Miami,College Stats,5-11,203,4.48,37.5,18.0,123.0,7.0,4.35,Minnesota Vikings / 5th / 176th pick / 2020,OsboKJ00,2020
6790,Aaron Parker,WR,Rhode Island,,6-2,209,4.57,26.5,12.0,112.0,6.94,4.23,,ParkAa01,2020
6792,Dezmon Patmon,WR,Washington State,College Stats,6-4,225,4.48,36.0,15.0,132.0,7.28,4.38,Indianapolis Colts / 6th / 212th pick / 2020,PatmDe00,2020
6797,Donovan Peoples-Jones,WR,Michigan,College Stats,6-2,212,4.48,44.5,,139.0,,,Cleveland Browns / 6th / 187th pick / 2020,PeopDo00,2020
6799,Malcolm Perry,WR,Navy,College Stats,5-9,186,4.63,36.0,10.0,122.0,7.12,4.31,Miami Dolphins / 7th / 246th pick / 2020,PerrMa00,2020
6807,Michael Pittman,WR,USC,College Stats,6-4,223,4.52,36.5,13.0,121.0,6.96,4.14,Indianapolis Colts / 2nd / 34th pick / 2020,PittMi01,2020


### At this point, we have run into our first issue with our predictors. Many rookies opt to not do some of the tests or it was not recorded in the past. These can be seen with NaN. We can see that a lot of the data is missing at least 1 component, and there is very little data with all metrics recorded in this dataset.

In [3]:
forty_yd_missing = combine_csv["40yd"].isna().sum()
vert_missing = combine_csv["Vertical"].isna().sum()
bench_missing = combine_csv["Bench"].isna().sum()
bj_missing = combine_csv["Broad Jump"].isna().sum()
three_cone_missing = combine_csv["3Cone"].isna().sum()
shuttle_missing = combine_csv["Shuttle"].isna().sum()

print(f"Number of NaN values in '40yd' column: {forty_yd_missing}")
print(f"Number of NaN values in 'Vertical' column: {vert_missing}")
print(f"Number of NaN values in 'Bench' column: {bench_missing}")
print(f"Number of NaN values in 'Broad Jump' column: {bj_missing}")
print(f"Number of NaN values in '3Cone' column: {three_cone_missing}")
print(f"Number of NaN values in 'Shuttle' column: {shuttle_missing}")

columns_to_check = ["Ht", "Wt", "40yd", "Vertical", "Bench", "Broad Jump", "3Cone", "Shuttle"]

rows_without_nan = combine_csv[columns_to_check].notna().all(axis=1).sum()
print(f"Number of rows without any NaN in the specified columns: {rows_without_nan}")

## TODO: Figure out which columns to exclude in data for simplicity 

Number of NaN values in '40yd' column: 36
Number of NaN values in 'Vertical' column: 181
Number of NaN values in 'Bench' column: 492
Number of NaN values in 'Broad Jump' column: 202
Number of NaN values in '3Cone' column: 353
Number of NaN values in 'Shuttle' column: 336
Number of rows without any NaN in the specified columns: 315


In [4]:
cleaned_df = combine_csv.dropna(subset=columns_to_check)
# print(cleaned_df.columns)

# forty_yd_missing = cleaned_df["40yd"].isna().sum()
# vert_missing = cleaned_df["Vertical"].isna().sum()
# bench_missing = cleaned_df["Bench"].isna().sum()
# bj_missing = cleaned_df["Broad Jump"].isna().sum()
# three_cone_missing = cleaned_df["3Cone"].isna().sum()
# shuttle_missing = cleaned_df["Shuttle"].isna().sum()

# print(f"Number of NaN values in '40yd' column: {forty_yd_missing}")
# print(f"Number of NaN values in 'Vertical' column: {vert_missing}")
# print(f"Number of NaN values in 'Bench' column: {bench_missing}")
# print(f"Number of NaN values in 'Broad Jump' column: {bj_missing}")
# print(f"Number of NaN values in '3Cone' column: {three_cone_missing}")
# print(f"Number of NaN values in 'Shuttle' column: {shuttle_missing}")
cleaned_df.head()

Unnamed: 0,Player,Pos,School,College,Ht,Wt,40yd,Vertical,Bench,Broad Jump,3Cone,Shuttle,Drafted (tm/rnd/yr),Player-additional,Year
1434,Ryan Krause,WR,Nebraska-Omaha,,6-2,244,4.6,33.5,18.0,119.0,6.78,4.16,San Diego Chargers / 6th / 169th pick / 2004,KrauRy00,2004
1960,Miles Austin,WR,Monmouth (NJ),,6-2,215,4.47,40.5,21.0,123.0,7.09,4.14,,AustMi00,2006
1970,Damarius Bilbo,WR,Georgia Tech,College Stats,6-2,214,4.7,36.5,19.0,115.0,6.9,4.18,,-9999,2006
1996,Marques Colston,WR,Hofstra,,6-5,224,4.5,37.0,16.0,123.0,6.94,4.43,New Orleans Saints / 7th / 252nd pick / 2006,ColsMa00,2006
2008,Chris Denney,WR,Nebraska-Omaha,,6-3,224,4.65,33.0,13.0,110.0,7.02,4.33,,-9999,2006


In [5]:
cleaned_combine = cleaned_df.drop(columns=["Pos", "School", "College", "Drafted (tm/rnd/yr)", "Player-additional"])
# We are keeping year to find the Receiving data in the first 3 years since the combine
cleaned_combine.head()

Unnamed: 0,Player,Ht,Wt,40yd,Vertical,Bench,Broad Jump,3Cone,Shuttle,Year
1434,Ryan Krause,6-2,244,4.6,33.5,18.0,119.0,6.78,4.16,2004
1960,Miles Austin,6-2,215,4.47,40.5,21.0,123.0,7.09,4.14,2006
1970,Damarius Bilbo,6-2,214,4.7,36.5,19.0,115.0,6.9,4.18,2006
1996,Marques Colston,6-5,224,4.5,37.0,16.0,123.0,6.94,4.43,2006
2008,Chris Denney,6-3,224,4.65,33.0,13.0,110.0,7.02,4.33,2006


### At this point, we have cleaned and isolated our rookie combine data for receivers, now we need to sift through the NFL receiving data. A very similar process in gathering the data.

In [6]:
### Matching the year and name to the nfl draft year, and summing up the receiving yards
NFL_list = []

for i in range(2000, 2021):
    file_path = glob.glob(f"data/{i}NFL.csv") 
    
    if file_path:  
        temp = pd.read_csv(file_path[0]) 
        temp["Year"] = i 
        NFL_list.append(temp)  

NFL_csv = pd.concat(NFL_list, ignore_index=True)
NFL_csv = NFL_csv[NFL_csv["Pos"] == "WR"]
NFL_csv.columns

Index(['Rk', 'Player', 'Age', 'Team', 'Pos', 'G', 'GS', 'Tgt', 'Rec', 'Yds',
       'Y/R', 'TD', '1D', 'Succ%', 'Lng', 'R/G', 'Y/G', 'Ctch%', 'Y/Tgt',
       'Fmb', 'Awards', '-9999', 'Year'],
      dtype='object')

In [13]:
## Soring the dataframe for ease of use
sortedNFL = NFL_csv.sort_values(by=["Player", "Year"], ascending=[True, False])
sortedNFL.shape[0]

sortedNFL.head()
name_counts = sortedNFL["Player"].value_counts()
valid_names = name_counts[name_counts >= 3].index
filtered_df = sortedNFL[sortedNFL["Player"].isin(valid_names)]
sortedNFL = filtered_df.sort_values(by=["Player", "Year"], ascending=[True, False])
sortedNFL.head(20)

Unnamed: 0,Rk,Player,Age,Team,Pos,G,GS,Tgt,Rec,Yds,...,Succ%,Lng,R/G,Y/G,Ctch%,Y/Tgt,Fmb,Awards,-9999,Year
11700,88.0,A.J. Green,32.0,CIN,WR,16.0,14.0,104.0,47.0,523.0,...,40.4,33.0,2.9,32.7,45.2,5.0,0.0,,GreeA.00,2020
10433,78.0,A.J. Green,30.0,CIN,WR,9.0,9.0,77.0,46.0,694.0,...,57.1,38.0,5.1,77.1,59.7,9.0,2.0,,GreeA.00,2018
9768,17.0,A.J. Green,29.0,CIN,WR,16.0,16.0,143.0,75.0,1078.0,...,45.5,77.0,4.7,67.4,52.4,7.5,2.0,PB,GreeA.00,2017
9185,42.0,A.J. Green,28.0,CIN,WR,10.0,10.0,100.0,66.0,964.0,...,54.0,54.0,6.6,96.4,66.0,9.6,0.0,PB,GreeA.00,2016
8542,14.0,A.J. Green,27.0,CIN,WR,16.0,16.0,132.0,86.0,1297.0,...,60.6,80.0,5.4,81.1,65.2,9.8,1.0,PB,GreeA.00,2015
7980,35.0,A.J. Green,26.0,CIN,WR,13.0,13.0,117.0,69.0,1041.0,...,53.0,81.0,5.3,80.1,59.0,8.9,3.0,PB,GreeA.00,2014
7354,6.0,A.J. Green,25.0,CIN,WR,16.0,16.0,178.0,98.0,1426.0,...,47.2,82.0,6.1,89.1,55.1,8.0,1.0,PBAP-2,GreeA.00,2013
6757,6.0,A.J. Green,25.0,CIN,WR,16.0,16.0,178.0,98.0,1426.0,...,47.2,82.0,6.1,89.1,55.1,8.0,1.0,PBAP-2,GreeA.00,2012
6193,34.0,A.J. Green,23.0,CIN,WR,15.0,15.0,115.0,65.0,1057.0,...,46.1,58.0,4.3,70.5,56.5,9.2,1.0,PB,GreeA.00,2011
8231,278.0,A.J. Jenkins,25.0,KAN,WR,9.0,2.0,15.0,9.0,93.0,...,40.0,27.0,1.0,10.3,60.0,6.2,1.0,,JenkA.00,2014


In [None]:
sortedNFL_shrunk = sortedNFL.drop(columns=["Rk", "Age", "Team", "Pos", ""])

### TODO: Begin matching rookie names to their total 3 year rec yards, and eliminate any rookies that have less than 3 years in the NFL
Final dataframe should have the rookie name, combine stats, and 3 year rec yards as the last column and running regression from there should be easy