# 📊 NFL Combine & Draft Analytics
### Predicting NFL Success from Combine Metrics
This notebook explores the relationship between NFL Combine performance and career success.

In [24]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import requests
from bs4 import BeautifulSoup
import glob

## 📂 Load Dataset
We'll load a dataset containing NFL Combine stats taken from [ProFootball Reference](https://www.pro-football-reference.com/draft/2024-combine.htm) from 2000-2024.

In [25]:
# Load dataset (update with actual URL or file path)
csv_files = glob.glob("data/2024.csv")

df_list = [pd.read_csv(f) for f in csv_files]
raw_df = pd.concat(df_list, ignore_index=True)
raw_df.head()

Unnamed: 0,Player,Pos,School,College,Ht,Wt,40yd,Vertical,Bench,Broad Jump,3Cone,Shuttle,Drafted (tm/rnd/yr),Player-additional
0,Kris Abrams-Draine,CB,Missouri,College Stats,5-11,179,4.44,33.5,,,,,Denver Broncos / 5th / 145th pick / 2024,AbraKr00
1,Isaiah Adams,G,Illinois,College Stats,6-4,315,5.22,24.5,,102.0,7.77,4.73,Arizona Cardinals / 3rd / 71st pick / 2024,AdamIs01
2,Rasheen Ali,RB,Marshall,College Stats,5-11,206,,,,,,,Baltimore Ravens / 5th / 165th pick / 2024,AlixRa00
3,Erick All,TE,Iowa,College Stats,6-4,252,,,,,,,Cincinnati Bengals / 4th / 115th pick / 2024,AllxEr00
4,Braelon Allen,RB,Wisconsin,College Stats,6-1,235,,32.0,26.0,117.0,,,New York Jets / 4th / 134th pick / 2024,AlleBr05


## 🧹 Data Cleaning & Preprocessing
Expand the "Drafted" column and convert relevant columns to numeric and handle missing values.

In [26]:

df = raw_df.copy()
df[["Team", "Round", "Pick", "Year"]] = df["Drafted (tm/rnd/yr)"].str.split(" / ", expand=True)
df.drop(columns=['Drafted (tm/rnd/yr)'], inplace=True)

duplicate_players = df["Player"].duplicated(keep=False)
print(df[duplicate_players])

df


Empty DataFrame
Columns: [Player, Pos, School, College, Ht, Wt, 40yd, Vertical, Bench, Broad Jump, 3Cone, Shuttle, Player-additional, Team, Round, Pick, Year]
Index: []


Unnamed: 0,Player,Pos,School,College,Ht,Wt,40yd,Vertical,Bench,Broad Jump,3Cone,Shuttle,Player-additional,Team,Round,Pick,Year
0,Kris Abrams-Draine,CB,Missouri,College Stats,5-11,179,4.44,33.5,,,,,AbraKr00,Denver Broncos,5th,145th pick,2024
1,Isaiah Adams,G,Illinois,College Stats,6-4,315,5.22,24.5,,102.0,7.77,4.73,AdamIs01,Arizona Cardinals,3rd,71st pick,2024
2,Rasheen Ali,RB,Marshall,College Stats,5-11,206,,,,,,,AlixRa00,Baltimore Ravens,5th,165th pick,2024
3,Erick All,TE,Iowa,College Stats,6-4,252,,,,,,,AllxEr00,Cincinnati Bengals,4th,115th pick,2024
4,Braelon Allen,RB,Wisconsin,College Stats,6-1,235,,32.0,26.0,117.0,,,AlleBr05,New York Jets,4th,134th pick,2024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
316,Roman Wilson,WR,Michigan,College Stats,5-11,185,4.39,,12.0,,,,WilsRo02,Pittsburgh Steelers,3rd,84th pick,2024
317,Mekhi Wingo,DT,LSU,College Stats,6-0,284,4.85,31.5,25.0,109.0,,,WingMe00,Detroit Lions,6th,189th pick,2024
318,Xavier Worthy,WR,Texas,College Stats,5-11,165,4.21,41.0,,131.0,,,WortXa00,Kansas City Chiefs,1st,28th pick,2024
319,Jaylen Wright,RB,Tennessee,College Stats,5-11,210,4.38,38.0,,134.0,,,WrigJa04,Miami Dolphins,4th,120th pick,2024


## 📈 Correlation Analysis
Let's analyze how Combine metrics correlate with draft position and career success.

In [27]:
plt.figure(figsize=(10, 6))
sns.heatmap(df[['40yd', 'BenchReps', 'Vertical', 'Broad Jump', 'Draft Pick']].corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Between NFL Combine Metrics & Draft Position')
plt.show()

KeyError: "['BenchReps', 'Draft Pick'] not in index"

<Figure size 1000x600 with 0 Axes>

## 🚀 Does Speed Predict NFL Success?
A scatterplot showing the relationship between 40-yard dash times and career Approximate Value (AV).

In [None]:
plt.figure(figsize=(10, 6))
sns.scatterplot(x=df['40yd'], y=df['Career AV'], hue=df['Position'], alpha=0.7)
plt.xlabel('40-Yard Dash Time (s)')
plt.ylabel('Career Approximate Value (AV)')
plt.title('Does Speed Predict NFL Success?')
plt.show()

NameError: name 'plt' is not defined

## 🏆 Next Steps
- Build a predictive model to estimate career success from Combine stats.
- Compare trends for different positions (QB, WR, etc.).
- Identify potential draft steals and busts.

# Linear Regression to Model Rookie NFL Receiving Yards per Catch:

Question: How relevant is combine data alone in predicting NFL WR Stars?

Isolating NFL Rookies from year in the combine, linear regressing combine data to try and predict their receiving yards over the next 3 years
 

In [130]:
import pandas as pd
import glob

combine_list = []

for i in range(2000, 2024):
    file_path = glob.glob(f"data/{i}.csv") 
    
    if file_path:  
        temp = pd.read_csv(file_path[0]) 
        temp["Year"] = i 
        combine_list.append(temp)  

combine_csv = pd.concat(combine_list, ignore_index=True)
combine_csv = combine_csv[combine_csv["Pos"] == "WR"]
combine_csv.tail(20)

Unnamed: 0,Player,Pos,School,College,Ht,Wt,40yd,Vertical,Bench,Broad Jump,3Cone,Shuttle,Drafted (tm/rnd/yr),Player-additional,Year
7851,Matt Landers,WR,Arkansas,College Stats,6-4,200.0,4.37,37.0,,130.0,,,,-9999,2023
7872,Marvin Mims,WR,Oklahoma,College Stats,5-11,183.0,4.38,39.5,,129.0,6.9,,Denver Broncos / 2nd / 63rd pick / 2023,MimsMa00,2023
7873,Jonathan Mingo,WR,Mississippi,College Stats,6-2,220.0,4.46,39.5,22.0,129.0,,,Carolina Panthers / 2nd / 39th pick / 2023,MingJo00,2023
7878,Jalen Moreno-Cropper,WR,Fresno St.,,5-11,172.0,4.4,,,121.0,,,,-9999,2023
7886,Puka Nacua,WR,BYU,College Stats,6-2,201.0,,,,,,,Los Angeles Rams / 5th / 177th pick / 2023,NacuPu00,2023
7888,Joseph Ngata,WR,Clemson,College Stats,6-3,217.0,4.54,34.5,,124.0,,,,NgatJo01,2023
7897,Trey Palmer,WR,Nebraska,College Stats,6-0,192.0,4.33,,,,,,Tampa Bay Buccaneers / 6th / 191st pick / 2023,PalmTr00,2023
7902,A.T. Perry,WR,Wake Forest,College Stats,6-4,198.0,4.47,35.0,,133.0,,,New Orleans Saints / 6th / 195th pick / 2023,PerrAT00,2023
7910,Jayden Reed,WR,Michigan St.,College Stats,5-11,187.0,4.45,33.5,,121.0,,4.29,Green Bay Packers / 2nd / 50th pick / 2023,ReedJa03,2023
7911,Rashee Rice,WR,SMU,College Stats,6-1,204.0,4.51,41.0,,128.0,,,Kansas City Chiefs / 2nd / 55th pick / 2023,RiceRa01,2023


### At this point, we have run into our first issue with our predictors. Many rookies opt to not do some of the tests or it was not recorded in the past. These can be seen with NaN. We can see that a lot of the data is missing at least 1 component, and there is very little data with all metrics recorded in this dataset.

In [131]:
forty_yd_missing = combine_csv["40yd"].isna().sum()
vert_missing = combine_csv["Vertical"].isna().sum()
bench_missing = combine_csv["Bench"].isna().sum()
bj_missing = combine_csv["Broad Jump"].isna().sum()
three_cone_missing = combine_csv["3Cone"].isna().sum()
shuttle_missing = combine_csv["Shuttle"].isna().sum()

print(f"Number of NaN values in '40yd' column: {forty_yd_missing}")
print(f"Number of NaN values in 'Vertical' column: {vert_missing}")
print(f"Number of NaN values in 'Bench' column: {bench_missing}")
print(f"Number of NaN values in 'Broad Jump' column: {bj_missing}")
print(f"Number of NaN values in '3Cone' column: {three_cone_missing}")
print(f"Number of NaN values in 'Shuttle' column: {shuttle_missing}")

columns_to_check = ["Ht", "Wt", "40yd", "Vertical", "Broad Jump", "3Cone", "Shuttle"]

rows_without_nan = combine_csv[columns_to_check].notna().all(axis=1).sum()
print(f"Number of rows without any NaN in the specified columns: {rows_without_nan}")

## TODO: Figure out which columns to exclude in data for simplicity 

Number of NaN values in '40yd' column: 54
Number of NaN values in 'Vertical' column: 201
Number of NaN values in 'Bench' column: 585
Number of NaN values in 'Broad Jump' column: 223
Number of NaN values in '3Cone' column: 425
Number of NaN values in 'Shuttle' column: 401
Number of rows without any NaN in the specified columns: 639


In [132]:
cleaned_df = combine_csv.dropna(subset=columns_to_check)
# print(cleaned_df.columns)

# forty_yd_missing = cleaned_df["40yd"].isna().sum()
# vert_missing = cleaned_df["Vertical"].isna().sum()
# bench_missing = cleaned_df["Bench"].isna().sum()
# bj_missing = cleaned_df["Broad Jump"].isna().sum()
# three_cone_missing = cleaned_df["3Cone"].isna().sum()
# shuttle_missing = cleaned_df["Shuttle"].isna().sum()

# print(f"Number of NaN values in '40yd' column: {forty_yd_missing}")
# print(f"Number of NaN values in 'Vertical' column: {vert_missing}")
# print(f"Number of NaN values in 'Bench' column: {bench_missing}")
# print(f"Number of NaN values in 'Broad Jump' column: {bj_missing}")
# print(f"Number of NaN values in '3Cone' column: {three_cone_missing}")
# print(f"Number of NaN values in 'Shuttle' column: {shuttle_missing}")
cleaned_df.head()

Unnamed: 0,Player,Pos,School,College,Ht,Wt,40yd,Vertical,Bench,Broad Jump,3Cone,Shuttle,Drafted (tm/rnd/yr),Player-additional,Year
58,Chris Cole,WR,Texas A&M,College Stats,6-0,191.0,4.37,37.5,,125.0,6.76,4.09,Denver Broncos / 3rd / 70th pick / 2000,ColeCh00,2000
59,Chris Coleman,WR,North Carolina State,College Stats,6-0,211.0,4.61,32.5,,110.0,6.79,4.1,,ColeCh01,2000
61,Laveranues Coles,WR,Florida State,College Stats,5-11,192.0,4.41,34.0,,115.0,6.89,4.39,New York Jets / 3rd / 78th pick / 2000,ColeLa00,2000
70,Chris Daniels,WR,Purdue,,6-3,217.0,4.74,33.0,,115.0,7.2,4.24,,-9999,2000
75,JaJuan Dawson,WR,Tulane,College Stats,6-1,199.0,4.55,34.0,,117.0,6.96,4.16,Cleveland Browns / 3rd / 79th pick / 2000,DawsJa00,2000


In [133]:
cleaned_combine = cleaned_df.drop(columns=["Pos", "School", "College", "Drafted (tm/rnd/yr)", "Player-additional"])
# We are keeping year to find the Receiving data in the first 3 years since the combine
cleaned_combine.head()

Unnamed: 0,Player,Ht,Wt,40yd,Vertical,Bench,Broad Jump,3Cone,Shuttle,Year
58,Chris Cole,6-0,191.0,4.37,37.5,,125.0,6.76,4.09,2000
59,Chris Coleman,6-0,211.0,4.61,32.5,,110.0,6.79,4.1,2000
61,Laveranues Coles,5-11,192.0,4.41,34.0,,115.0,6.89,4.39,2000
70,Chris Daniels,6-3,217.0,4.74,33.0,,115.0,7.2,4.24,2000
75,JaJuan Dawson,6-1,199.0,4.55,34.0,,117.0,6.96,4.16,2000


### At this point, we have cleaned and isolated our rookie combine data for receivers, now we need to sift through the NFL receiving data. A very similar process in gathering the data.

In [134]:
### Matching the year and name to the nfl draft year, and summing up the receiving yards
NFL_list = []

for i in range(2000, 2021):
    file_path = glob.glob(f"data/{i}NFL.csv") 
    
    if file_path:  
        temp = pd.read_csv(file_path[0]) 
        temp["Year"] = i 
        NFL_list.append(temp)  

NFL_csv = pd.concat(NFL_list, ignore_index=True)
NFL_csv = NFL_csv[NFL_csv["Pos"] == "WR"]
NFL_csv.columns

Index(['Rk', 'Player', 'Age', 'Team', 'Pos', 'G', 'GS', 'Tgt', 'Rec', 'Yds',
       'Y/R', 'TD', '1D', 'Succ%', 'Lng', 'R/G', 'Y/G', 'Ctch%', 'Y/Tgt',
       'Fmb', 'Awards', '-9999', 'Year'],
      dtype='object')

In [135]:
## Soring the dataframe for ease of use
sortedNFL = NFL_csv.sort_values(by=["Player", "Year"], ascending=[True, False])
sortedNFL.shape[0]

sortedNFL.head()
name_counts = sortedNFL["Player"].value_counts()
valid_names = name_counts[name_counts >= 3]
valid_names = name_counts[name_counts >= 3].index
filtered_df = sortedNFL[sortedNFL["Player"].isin(valid_names)]
sortedNFL = filtered_df.sort_values(by=["Player", "Year"], ascending=[True, True])
sortedNFL.head(20)

Unnamed: 0,Rk,Player,Age,Team,Pos,G,GS,Tgt,Rec,Yds,...,Succ%,Lng,R/G,Y/G,Ctch%,Y/Tgt,Fmb,Awards,-9999,Year
6193,34.0,A.J. Green,23.0,CIN,WR,15.0,15.0,115.0,65.0,1057.0,...,46.1,58.0,4.3,70.5,56.5,9.2,1.0,PB,GreeA.00,2011
6757,6.0,A.J. Green,25.0,CIN,WR,16.0,16.0,178.0,98.0,1426.0,...,47.2,82.0,6.1,89.1,55.1,8.0,1.0,PBAP-2,GreeA.00,2012
7354,6.0,A.J. Green,25.0,CIN,WR,16.0,16.0,178.0,98.0,1426.0,...,47.2,82.0,6.1,89.1,55.1,8.0,1.0,PBAP-2,GreeA.00,2013
7980,35.0,A.J. Green,26.0,CIN,WR,13.0,13.0,117.0,69.0,1041.0,...,53.0,81.0,5.3,80.1,59.0,8.9,3.0,PB,GreeA.00,2014
8542,14.0,A.J. Green,27.0,CIN,WR,16.0,16.0,132.0,86.0,1297.0,...,60.6,80.0,5.4,81.1,65.2,9.8,1.0,PB,GreeA.00,2015
9185,42.0,A.J. Green,28.0,CIN,WR,10.0,10.0,100.0,66.0,964.0,...,54.0,54.0,6.6,96.4,66.0,9.6,0.0,PB,GreeA.00,2016
9768,17.0,A.J. Green,29.0,CIN,WR,16.0,16.0,143.0,75.0,1078.0,...,45.5,77.0,4.7,67.4,52.4,7.5,2.0,PB,GreeA.00,2017
10433,78.0,A.J. Green,30.0,CIN,WR,9.0,9.0,77.0,46.0,694.0,...,57.1,38.0,5.1,77.1,59.7,9.0,2.0,,GreeA.00,2018
11700,88.0,A.J. Green,32.0,CIN,WR,16.0,14.0,104.0,47.0,523.0,...,40.4,33.0,2.9,32.7,45.2,5.0,0.0,,GreeA.00,2020
7041,282.0,A.J. Jenkins,24.0,KAN,WR,16.0,1.0,17.0,8.0,130.0,...,29.4,48.0,0.5,8.1,47.1,7.6,0.0,,JenkA.00,2012


In [136]:
sortedNFL_shrunk = sortedNFL.drop(columns=["Rk", "Age", "Pos", "G", "GS", "Lng", "1D", "Awards", "-9999"])
sortedNFL_shrunk.shape

(3548, 14)

In [137]:
# Identify players and years where the team is 'XTM'
xtm_rows = sortedNFL_shrunk[sortedNFL_shrunk["Team"].str.endswith("TM")]

# Get a set of (Player, Year) pairs where the team is XTM
xtm_players_years = set(zip(xtm_rows["Player"], xtm_rows["Year"]))

# Filter dataframe:
#   - Keep only "XTM" row if the player was traded that year
#   - Keep all rows for non-traded players
filtered_df = sortedNFL_shrunk[
    (sortedNFL_shrunk["Team"].str.endswith("TM")) | 
    (~sortedNFL_shrunk.apply(lambda row: (row["Player"], row["Year"]) in xtm_players_years, axis=1))
]

# Display the cleaned DataFrame=
filtered_df.tail(40)

sortedNFL_shrunk = filtered_df

### TODO: Begin matching rookie names to their total 3 year rec yards, and eliminate any rookies that have less than 3 years in the NFL
Final dataframe should have the rookie name, combine stats, and 3 year rec yards as the last column and running regression from there should be easy

In [138]:
### Create final.csv, use R to model and get the info from it. 

first_year_playing = sortedNFL_shrunk.groupby("Player")["Year"].min()
first_year_playing

temp = sortedNFL_shrunk
temp["Years_Since_Rookie"] = temp["Year"] - temp["Player"].map(first_year_playing)
temp = temp[temp["Years_Since_Rookie"] < 3]
temp = temp.drop(columns=["Years_Since_Rookie"])

sortedNFL_shrunk = temp


sortedNFL = sortedNFL_shrunk.sort_values(by=["Player", "Year"], ascending=[True, False])
sortedNFL.shape[0]

sortedNFL.head()
name_counts = sortedNFL["Player"].value_counts()
valid_names = name_counts[name_counts >= 3]
valid_names = name_counts[name_counts >= 3].index
filtered_df = sortedNFL[sortedNFL["Player"].isin(valid_names)]
sortedNFL = filtered_df.sort_values(by=["Player", "Year"], ascending=[True, True])
sortedNFL.tail(42)

Unnamed: 0,Player,Team,Tgt,Rec,Yds,Y/R,TD,Succ%,R/G,Y/G,Ctch%,Y/Tgt,Fmb,Year
1790,Tyrone Calico,TEN,43.0,18.0,297.0,16.5,4.0,37.2,1.3,21.2,41.9,6.9,0.0,2003
2556,Tyrone Calico,TEN,4.0,2.0,13.0,6.5,0.0,25.0,2.0,13.0,50.0,3.3,0.0,2004
2916,Tyrone Calico,TEN,42.0,22.0,191.0,8.7,0.0,38.1,1.8,15.9,52.4,4.5,1.0,2005
6166,Victor Cruz,NYG,131.0,82.0,1536.0,18.7,9.0,56.5,5.1,96.0,62.6,11.7,1.0,2011
6781,Victor Cruz,NYG,122.0,73.0,998.0,13.7,4.0,51.6,5.2,71.3,59.8,8.2,1.0,2012
7378,Victor Cruz,NYG,122.0,73.0,998.0,13.7,4.0,51.6,5.2,71.3,59.8,8.2,1.0,2013
6350,Vincent Brown,SDG,40.0,19.0,329.0,17.3,2.0,47.5,1.4,23.5,47.5,8.2,0.0,2011
6852,Vincent Brown,SDG,70.0,41.0,472.0,11.5,1.0,50.0,2.6,29.5,58.6,6.7,0.0,2012
7449,Vincent Brown,SDG,70.0,41.0,472.0,11.5,1.0,50.0,2.6,29.5,58.6,6.7,0.0,2013
3100,Vincent Jackson,SDG,8.0,3.0,59.0,19.7,0.0,37.5,0.4,7.4,37.5,7.4,0.0,2005


In [139]:
sortedNFL = sortedNFL.drop(columns=["Tgt", "Rec", "Yds", "TD", "Y/G", "Y/Tgt", "Fmb"])
sortedNFL.tail(40)

columns_to_check = ["Y/R", "R/G", "Ctch%", "Year"]

# Count the number of rows with NaN in each specified column
missing_counts = sortedNFL[columns_to_check].isna().sum()
sortedNFL.shape


(1551, 7)

In [140]:
sortedNFL_simplified = sortedNFL.dropna(subset=columns_to_check)

In [141]:
sortedNFL_simplified.head()

Unnamed: 0,Player,Team,Y/R,Succ%,R/G,Ctch%,Year
6193,A.J. Green,CIN,16.3,46.1,4.3,56.5,2011
6757,A.J. Green,CIN,14.6,47.2,6.1,55.1,2012
7354,A.J. Green,CIN,14.6,47.2,6.1,55.1,2013
7041,A.J. Jenkins,KAN,16.3,29.4,0.5,47.1,2012
7638,A.J. Jenkins,KAN,16.3,29.4,0.5,47.1,2013


In [142]:
averaged_df = sortedNFL_simplified.groupby("Player", as_index=False).mean(numeric_only=True).round(3)
averaged_df = averaged_df.drop(columns=["Year"])
averaged_df.head(10)

Unnamed: 0,Player,Y/R,Succ%,R/G,Ctch%
0,A.J. Green,15.167,46.833,5.5,55.567
1,A.J. Jenkins,14.3,32.933,0.667,51.4
2,Aaron Dobson,13.567,50.533,2.333,54.267
3,Aaron Moorehead,10.7,49.6,1.533,55.633
4,Ace Sanders,9.4,37.3,2.433,67.633
5,Adam Humphries,10.4,52.8,3.2,69.1
6,Adam Jennings,10.3,100.0,0.4,100.0
7,Adam Thielen,14.367,51.2,1.867,67.733
8,Ahmad Merritt,11.267,23.767,0.8,39.033
9,Albert Wilson,12.733,45.5,1.9,59.767


In [143]:
merged_df = pd.merge(cleaned_combine ,averaged_df, on="Player", how="inner")
merged_df = merged_df.drop(columns=["Bench"])
merged_df.head()

Unnamed: 0,Player,Ht,Wt,40yd,Vertical,Broad Jump,3Cone,Shuttle,Year,Y/R,Succ%,R/G,Ctch%
0,Laveranues Coles,5-11,192.0,4.41,34.0,115.0,6.89,4.39,2000,15.233,53.2,3.667,58.867
1,JaJuan Dawson,6-1,199.0,4.55,34.0,117.0,6.96,4.16,2000,12.4,49.133,2.533,60.933
2,Larry Foster,5-10,198.0,4.55,37.0,121.0,6.88,4.06,2000,11.367,35.433,1.5,52.167
3,Trevor Gaylor,6-3,199.0,4.56,37.5,121.0,7.03,4.29,2000,14.967,39.767,1.6,43.133
4,Scottie Montgomery,6-0,196.0,4.57,36.0,117.0,7.16,4.11,2000,10.6,63.133,0.667,72.233


In [144]:
# Function to convert height in ft-in format to decimal feet
def convert_height_to_decimal(height):
    # Split the height into feet and inches
    feet, inches = height.split('-')
    
    # Convert feet and inches to decimal feet
    decimal_height = int(feet) + int(inches) / 12
    
    return decimal_height

# Example usage: apply to a DataFrame column
merged_df['Ht'] = merged_df['Ht'].apply(convert_height_to_decimal)

merged_df.to_csv("output.csv", index=False)

In [None]:
### Regression Time

