# NFL Fair Value Analysis (2024)

**Goal**: Identify market inefficiencies by comparing Player Cost (Cap Hit/Salary) vs. Player Performance (Approximate Value).

**Metrics**:
- **Cost**: 2024 Cap Hit / Cash (Spotrac)
- **Performance**: 2024 Approximate Value (PFR)
- **Efficiency**: `AV / $1M` (Approximate Value produced per $1M of cost; higher means better value)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Notebook-wide display settings: show up to 50 columns when printing
# DataFrames, and draw seaborn plots on a white grid background.
pd.options.display.max_columns = 50
sns.set_style("whitegrid")

## 1. Load Data

In [None]:
# Input files. These names are tied to a specific scrape; point them at the
# latest files before re-running the notebook.
SPOTRAC_PATH = "../data/raw/spotrac_player_rankings_2024_2026w05_20260130_183232.csv"
PFR_PATH = "../data/raw/pfr/rosters_2024.csv"

# Read both sources in one step: cost data (Spotrac) and performance data (PFR).
spotrac, pfr = map(pd.read_csv, (SPOTRAC_PATH, PFR_PATH))

# Quick sanity check on how many rows each source delivered.
print(f"Spotrac rows: {len(spotrac)}")
print(f"PFR rows: {len(pfr)}")

## 2. Data Cleaning & Normalization

In [None]:
# Clean Names
import re

# Generational suffix at the very END of an (already lower-cased) name:
# "jr", "sr", "ii", "iii", "iv", "v", with an optional trailing period and an
# optional comma before it (e.g. "smith, jr."). Anchoring at the end keeps the
# pattern from deleting text in the middle of a name.
_NAME_SUFFIX_RE = re.compile(r",?\s+(?:jr|sr|iii|ii|iv|v)\.?$")


def clean_name(name):
    """Return a normalized join key for a player name.

    The name is stripped, lower-cased, and a single trailing generational
    suffix (Jr/Sr/II/III/IV/V, with or without a period) is removed.

    Args:
        name: Raw player name. Missing values (None/NaN) are accepted, and
            non-string values are converted with ``str()``.

    Returns:
        The normalized name, or ``""`` when ``name`` is missing.
    """
    if pd.isna(name):
        return ""
    # Lower-case first so the suffix match is case-insensitive.
    normalized = str(name).strip().lower()
    return _NAME_SUFFIX_RE.sub("", normalized).strip()

# Build the normalized name key on both sources so they can be joined.
spotrac['clean_name'] = spotrac['player_name'].apply(clean_name)
pfr['clean_name'] = pfr['Player'].apply(clean_name)

# Team code normalization: map long-form codes (e.g. 'GNB') to the short form
# (e.g. 'GB') so both sources share one vocabulary.
# NOTE(review): the keys look like PFR-style codes and the values like
# Spotrac-style codes; confirm against the actual data. The map is applied to
# BOTH frames below, so the direction does not matter for correctness.
TEAM_MAP = {
    'GNB': 'GB', 'KAN': 'KC', 'LVR': 'LV', 'NWE': 'NE', 'NOR': 'NO', 'SFO': 'SF', 'TAM': 'TB',
    'JAX': 'JAX', 'LAC': 'LAC', 'LAR': 'LAR',  # Add mismatches as found
}
# Apply the map before de-duplicating. Previously TEAM_MAP was defined but
# never used, so rows whose team codes differed between sources could not
# match in the later merge on ['clean_name', 'team'].
spotrac['team'] = spotrac['team'].replace(TEAM_MAP)
pfr['team'] = pfr['team'].replace(TEAM_MAP)

# Handle duplicates (keep highest Salary/AV if dupes exist): sort descending
# so drop_duplicates (which keeps the first occurrence) retains the top row
# per (clean_name, team).
spotrac = spotrac.sort_values('total_contract_value_millions', ascending=False).drop_duplicates(subset=['clean_name', 'team'])
pfr = pfr.sort_values('AV', ascending=False).drop_duplicates(subset=['clean_name', 'team'])

# Check team codes overlap in both directions; any code printed here still
# needs an entry in TEAM_MAP.
spotrac_teams = set(spotrac['team'].unique())
pfr_teams = set(pfr['team'].unique())
print("Difference:", spotrac_teams - pfr_teams)
print("Only in PFR:", pfr_teams - spotrac_teams)

## 3. Merge

In [None]:
# Inner-join cost and performance rows on the normalized name plus team code;
# only players present in both sources survive.
join_keys = ['clean_name', 'team']
merged = spotrac.merge(pfr, how='inner', left_on=join_keys, right_on=join_keys)

# Report how many Spotrac rows found a PFR counterpart.
print(f"Merged rows: {len(merged)}")
print(f"Match rate: {len(merged)/len(spotrac):.1%}")

# Last expression of the cell: preview the first rows of the joined table.
merged.head()

## 4. Feature Engineering: Efficiency Metrics

In [None]:
# Calculate AV per $1 Million
# Salary in $M: use the pre-computed millions column when present, otherwise
# derive it from the raw 'value' column.
if 'total_contract_value_millions' in merged.columns:
    merged['salary_m'] = merged['total_contract_value_millions']
else:
    # Strip '$' and ',' before converting to float. The pattern is a raw
    # string: the previous non-raw '[\$,]' contained an invalid escape
    # sequence, which emits a SyntaxWarning on Python 3.12+.
    # NOTE(review): assumes 'value' holds dollar amounts such as "$1,234,567" - confirm.
    merged['salary_m'] = merged['value'].replace(r'[\$,]', '', regex=True).astype(float) / 1_000_000

# Filter for meaningful participation (e.g., > 0 AV)
df_active = merged[merged['AV'] > 0].copy()

# Metric: Efficiency (AV / Salary)
# Floor the salary at $0.8M (league min approx) so $0 or tiny salaries cannot
# produce infinite or wildly inflated efficiency values for cheap rookies.
df_active['eff_salary_m'] = df_active['salary_m'].clip(lower=0.8)
df_active['efficiency'] = df_active['AV'] / df_active['eff_salary_m']

# Last expression of the cell: preview the ten most efficient players.
df_active[['player_name', 'team', 'Pos', 'AV', 'salary_m', 'efficiency']].sort_values('efficiency', ascending=False).head(10)

## 5. Visualizations

In [None]:
def _mark_players(rows, text_color, marker_color, marker):
    """Label each player in ``rows`` and overlay a highlight marker on its point."""
    for cost, av, who in zip(rows['salary_m'], rows['AV'], rows['player_name']):
        # Nudge the label slightly to the right so it does not cover the marker.
        plt.text(cost + 0.5, av, who, fontsize=9, fontweight='bold', color=text_color)
        plt.scatter(cost, av, color=marker_color, s=100, marker=marker, zorder=5)


plt.figure(figsize=(12, 8))

# Shaded regions: cheap-but-productive (green) and expensive-but-unproductive
# (red). ymin/ymax are fractions of the axes height, not AV values.
plt.axvspan(0, 5, ymin=0.5, ymax=1, color='green', alpha=0.1, label='High Value Zone')
plt.axvspan(10, 60, ymin=0, ymax=0.25, color='red', alpha=0.1, label='Overvalued Zone')

# Every active player: cost on x, performance on y, colored by position.
sns.scatterplot(data=df_active, x='salary_m', y='AV', hue='Pos', alpha=0.6, s=60)
plt.title("NFL 2024: Performance (AV) vs. Cost (Cap Hit) - Efficiency Analysis")
plt.xlabel("Cap Hit ($M)")
plt.ylabel("Approximate Value (AV)")

# Steals: the 10 most efficient players among those with AV >= 10.
steals = (
    df_active.loc[df_active['AV'] >= 10]
    .sort_values('efficiency', ascending=False)
    .head(10)
)
_mark_players(steals, text_color='darkgreen', marker_color='green', marker='*')

# Overpays: the 10 least efficient players among those with salary >= $10M,
# so the call-outs focus on big contracts.
overpays = (
    df_active.loc[df_active['salary_m'] >= 10]
    .sort_values('efficiency', ascending=True)
    .head(10)
)
_mark_players(overpays, text_color='darkred', marker_color='red', marker='X')

# Put the legend outside the plotting area so it does not hide points.
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

In [None]:
# Best value: the ten highest-efficiency players, restricted to those with
# AV >= 10 so that only players with significant production are ranked.
has_significant_av = df_active['AV'] >= 10
steals = df_active.loc[has_significant_av].sort_values(by='efficiency', ascending=False).iloc[:10]

# Horizontal bars, one per player, bar length = efficiency.
# NOTE(review): recent seaborn releases may warn about `palette` without
# `hue` - confirm against the installed seaborn version.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=steals, y='player_name', x='efficiency', palette='viridis', ax=ax)
ax.set_title("Top 10 Most Efficient Players (AV >= 10)")
ax.set_xlabel("Efficiency (AV / $1M)")
plt.show()

In [None]:
# Worst value: the ten lowest-efficiency players among those whose salary is
# at least $5M, so the ranking is not dominated by minimum-salary players.
is_big_contract = df_active['salary_m'] >= 5
overpays = df_active.loc[is_big_contract].sort_values(by='efficiency', ascending=True).iloc[:10]

# Horizontal bars, one per player, bar length = efficiency.
# NOTE(review): recent seaborn releases may warn about `palette` without
# `hue` - confirm against the installed seaborn version.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=overpays, y='player_name', x='efficiency', palette='magma', ax=ax)
ax.set_title("Top 10 Least Efficient Players (Salary >= $5M)")
ax.set_xlabel("Efficiency (AV / $1M)")
plt.show()