# Play Selection for YouTube Overlay Visualization

This notebook analyzes the NFL tracking data to identify the most compelling plays for YouTube overlay visualization.

**Goals:**
1. Load and analyze play inventory
2. Score plays by YouTube availability and visualization quality
3. Generate top 20 recommendations with YouTube search queries
4. Create detailed recommendation report

In [None]:
import sys
from pathlib import Path

sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from data.loader import NFLDataLoader, extract_ball_in_air_frames
from selection.play_recommender import PlayRecommender

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 100)

%matplotlib inline

## 1. Load Data

In [None]:
# Create the loader over the raw data directory; later cells read its
# games / plays / players collections.
print("Loading NFL data...")
loader = NFLDataLoader(data_dir='../data/raw')

print("\nData loaded successfully!")
# Report the size of each collection, looked up by attribute name so each one
# is only touched right before its own line is printed.
for table_label in ("Games", "Plays", "Players"):
    table = getattr(loader, table_label.lower())
    print(f"{table_label}: {len(table)}")

In [None]:
# Quick look at what the games table covers: seasons, weeks and teams.
games_table = loader.games

print("Season(s):", games_table['season'].unique())

week_numbers = sorted(games_table['week'].unique())
print("Weeks:", week_numbers)

# Team list is taken from the home-team column only
home_teams = sorted(games_table['homeTeamAbbr'].unique())
print("\nTeams:", home_teams)

## 2. Analyze Play Inventory

In [None]:
# Pass plays are the rows of the plays table flagged as dropbacks
dropback_mask = loader.plays['isDropback'].eq(True)
pass_plays = loader.plays[dropback_mask]

total_play_count = len(loader.plays)
pass_play_count = len(pass_plays)

print(f"Total Plays: {total_play_count}")
pass_share = pass_play_count / total_play_count * 100
print(f"Pass Plays: {pass_play_count} ({pass_share:.1f}%)")

# Frequency of each pass result among the pass plays
print("\nPass Results:")
result_counts = pass_plays['passResult'].value_counts()
print(result_counts)

In [None]:
# Three-panel overview of pass depth: overall histogram, depth by pass result,
# and the weekly count of deep passes.
fig, (ax_hist, ax_box, ax_weekly) = plt.subplots(1, 3, figsize=(14, 5))
pass_length = pass_plays['passLength']

# Panel 1: histogram of pass length with the 20- and 30-yard guides marked
ax_hist.hist(pass_length.dropna(), bins=50, edgecolor='black')
ax_hist.set_xlabel('Air Yards')
ax_hist.set_ylabel('Frequency')
ax_hist.set_title('Distribution of Pass Lengths')
ax_hist.axvline(20, color='red', linestyle='--', label='Min for YouTube (20 yds)')
ax_hist.axvline(30, color='green', linestyle='--', label='Ideal (30+ yds)')
ax_hist.legend()

# Panel 2: pass length broken down by pass result
sns.boxplot(data=pass_plays, x='passResult', y='passLength', ax=ax_box)
ax_box.set_xlabel('Pass Result')
ax_box.set_ylabel('Air Yards')
ax_box.set_title('Air Yards by Result')

# Panel 3: number of 20+ yard passes per week
deep_passes = pass_plays[pass_length >= 20]
deep_passes_by_week = deep_passes.groupby('week').size()
ax_weekly.bar(deep_passes_by_week.index, deep_passes_by_week.values)
ax_weekly.set_xlabel('Week')
ax_weekly.set_ylabel('Count')
ax_weekly.set_title('Deep Passes (20+ yds) by Week')

fig.tight_layout()
plt.show()

# Headline counts at the 20 / 30 / 40 yard cut-offs
deep_count = len(deep_passes)
deep_share = deep_count / len(pass_plays) * 100
very_deep_count = int((pass_length >= 30).sum())
bomb_count = int((pass_length >= 40).sum())

print(f"\nDeep Passes (20+ yards): {deep_count} ({deep_share:.1f}%)")
print(f"Very Deep Passes (30+ yards): {very_deep_count}")
print(f"Bombs (40+ yards): {bomb_count}")

## 3. Score Plays for YouTube Overlay Suitability

In [None]:
# Build the recommender on top of the already-loaded data
print("Initializing PlayRecommender...\n")
recommender = PlayRecommender(loader)

# Score the play inventory; the scored table feeds the plots and rankings below
print("Scoring all pass plays...")
scored_plays = recommender.analyze_play_inventory()

scored_count = len(scored_plays)
print(f"\nScored {scored_count} plays")

In [None]:
# Distribution of total scores (left) and of the estimated YouTube
# availability label (right).
plt.figure(figsize=(14, 5))

plt.subplot(1, 2, 1)
plt.hist(scored_plays['total_score'], bins=30, edgecolor='black')
plt.xlabel('Total Score')
plt.ylabel('Frequency')
plt.title('Distribution of Play Scores')
# NOTE(review): 7.5 and 6.0 are drawn as the HIGH / MEDIUM cut-offs — confirm
# they match the thresholds PlayRecommender actually applies.
plt.axvline(7.5, color='green', linestyle='--', label='HIGH threshold')
plt.axvline(6.0, color='orange', linestyle='--', label='MEDIUM threshold')
plt.legend()

plt.subplot(1, 2, 2)
# Colours are keyed by label rather than by position: value_counts() sorts by
# frequency, so a positional colour list would paint whichever label happens
# to be most common green (and breaks if fewer or more than three labels occur).
# NOTE(review): 'MEDIUM' and 'LOW' are assumed label spellings — confirm
# against PlayRecommender. Labels not listed here are drawn in gray.
likelihood_colors = {'HIGH': 'green', 'MEDIUM': 'orange', 'LOW': 'red'}
youtube_counts = scored_plays['youtube_likelihood'].value_counts()

# Stable bar order: known labels first (HIGH → MEDIUM → LOW), then any others
label_order = [label for label in likelihood_colors if label in youtube_counts.index]
label_order += [label for label in youtube_counts.index if label not in likelihood_colors]
youtube_counts = youtube_counts.reindex(label_order)
bar_colors = [likelihood_colors.get(label, 'gray') for label in youtube_counts.index]

plt.bar(youtube_counts.index, youtube_counts.values, color=bar_colors)
plt.xlabel('YouTube Likelihood')
plt.ylabel('Count')
plt.title('YouTube Footage Availability Estimate')

plt.tight_layout()
plt.show()

print("\nYouTube Likelihood Distribution:")
print(scored_plays['youtube_likelihood'].value_counts())

## 4. Get Top 20 Recommendations

In [None]:
# Get top 20 plays with HIGH YouTube likelihood (minimum 20 air yards)
top_20 = recommender.get_top_recommendations(
    n=20,
    min_air_yards=20,
    youtube_quality='HIGH'
)

print("Top 20 Recommended Plays:\n")
print("="*80)

# Print a ranked, three-line summary per play. Everything shown comes from the
# play row itself, so no per-row lookup into the games table is needed.
for rank, (_, play) in enumerate(top_20.iterrows(), 1):
    # A missing description is not a string and cannot be sliced; substitute
    # a placeholder instead of raising TypeError mid-listing.
    description = play['playDescription']
    if not isinstance(description, str):
        description = "(no description)"

    print(f"\n{rank}. {play['possessionTeam']} vs {play['defensiveTeam']} (Week {play['week']})")
    print(f"   Score: {play['total_score']:.2f}/10 | Air Yards: {play['passLength']:.0f} | Result: {play['passResult']}")
    print(f"   {description[:80]}...")

    # YouTube search query
    query = recommender.generate_youtube_query(play)
    print(f"   YouTube: {query}")

print("\n" + "="*80)

In [None]:
# Headline averages for the recommended plays
print("Top 20 Summary Statistics:\n")
mean_air_yards = top_20['passLength'].mean()
print(f"Average Air Yards: {mean_air_yards:.1f}")
mean_score = top_20['total_score'].mean()
print(f"Average Score: {mean_score:.2f}/10")
print("\nBreakdown:")

# Each breakdown line is the number of rows matching one boolean condition.
# Conditions are wrapped in lambdas so each is evaluated only when its line
# is about to be printed.
breakdown_conditions = (
    ("Touchdowns", lambda: top_20['playDescription'].str.contains('TOUCHDOWN', case=False, na=False)),
    ("Interceptions", lambda: top_20['passResult'] == 'IN'),
    ("Completions", lambda: top_20['passResult'] == 'C'),
    ("4th Quarter", lambda: top_20['quarter'] == 4),
    ("3rd/4th Down", lambda: top_20['down'] >= 3),
    ("40+ Yard Passes", lambda: top_20['passLength'] >= 40),
)
for breakdown_label, build_mask in breakdown_conditions:
    matching_rows = top_20[build_mask()]
    print(f"  {breakdown_label}: {len(matching_rows)}")

## 5. Test Ball-in-Air Extraction on Top Play

In [None]:
# Take the first row of top_20 as the test play (treated here as the #1
# recommendation) and keep its identifiers for the cells that follow.
test_play = top_20.iloc[0]
game_id, play_id = test_play['gameId'], test_play['playId']

print("Testing Play #1:")
print(f"Game ID: {game_id}")
print(f"Play ID: {play_id}")
description_preview = test_play['playDescription'][:100]
print(f"Description: {description_preview}...")
print("\nLoading tracking data...")

# Fetch the play's metadata and list every field it contains
metadata = loader.get_play_metadata(game_id, play_id)
print("\nPlay Metadata:")
for field_name, field_value in metadata.items():
    print(f"  {field_name}: {field_value}")

In [None]:
# Load the test play's tracking rows; the week is taken from its metadata
tracking = loader.get_play_tracking(game_id, play_id, week=metadata['week'])

frame_ids = tracking['frameId'].unique()
print(f"Total frames in play: {len(frame_ids)}")
tracked_player_count = tracking['nflId'].nunique()
print(f"Players tracked: {tracked_player_count}")

# Split out the ball-in-air period; `info` carries its summary fields
ball_in_air, info = extract_ball_in_air_frames(tracking)

print("\nBall-in-Air Period:")
print(f"  Frames: {info['frames_in_air']}")
air_time = info['time_in_air']
print(f"  Duration: {air_time:.2f} seconds")
print(f"  Release frame: {info['pass_forward_frame']}")
print(f"  Arrival frame: {info['outcome_frame']}")

# Grade the air time: 2.0 s or more is ideal, 1.5 s or more is workable,
# anything else is flagged as short.
if air_time >= 2.0:
    verdict = f"\n✅ Excellent! {air_time:.2f} seconds is perfect for overlay visualization"
elif air_time >= 1.5:
    verdict = f"\n✓ Good! {air_time:.2f} seconds is workable"
else:
    verdict = f"\n⚠ Short! {air_time:.2f} seconds might be too brief"
print(verdict)

In [None]:
# Data-quality check on the ball-in-air slice of the test play: missing values
# in the key tracking columns, events present, and ball-row coverage.
print("Data Quality Check:\n")

# Check for missing values in key columns
# NOTE(review): if the football's own rows carry no direction/orientation,
# 'dir' and 'o' will always show some missing values — confirm before reading
# those as a data problem.
key_cols = ['x', 'y', 's', 'dir', 'o']
total = len(ball_in_air)  # loop-invariant, so computed once
for col in key_cols:
    missing = ball_in_air[col].isna().sum()
    # An empty slice would otherwise raise ZeroDivisionError; report 0.0% so
    # the check itself still completes and shows the 0/0 counts.
    missing_pct = missing / total * 100 if total else 0.0
    print(f"  {col}: {missing}/{total} missing ({missing_pct:.1f}%)")

# Check events
events = ball_in_air[ball_in_air['event'].notna()]['event'].unique()
print(f"\nEvents captured: {list(events)}")

# Check ball tracking: rows whose club is 'football', relative to the number
# of frames in the air reported by the extraction step.
frames_in_air = info['frames_in_air']
ball_frames = ball_in_air[ball_in_air['club'] == 'football']
ball_pct = len(ball_frames) / frames_in_air * 100 if frames_in_air else 0.0
print(f"\nBall tracked: {len(ball_frames)} frames ({ball_pct:.1f}%)")

# Zero frames in the air can never count as good tracking
if frames_in_air and len(ball_frames) >= frames_in_air * 0.8:
    print("✅ Good ball tracking!")
else:
    print("⚠ Incomplete ball tracking")

## 6. Generate Detailed Recommendation Report

In [None]:
# Ask the recommender to write the markdown report one level above the
# notebook directory, then describe what the report contains.
print("Generating detailed recommendation report...\n")

report_path = recommender.create_recommendation_report(
    output_path='../RECOMMENDED_PLAYS.md'
)

print(f"\n✅ Report generated: {report_path}")
print("\nThis report includes:")
report_contents = (
    "  - Top 20 plays with full details",
    "  - YouTube search queries for each play",
    "  - Direct YouTube search links",
    "  - Why each play is compelling",
    "  - Game/Play IDs for data loading",
    "  - Summary statistics",
)
for content_line in report_contents:
    print(content_line)

## 7. Export Top Plays to CSV

In [None]:
# Export the recommended plays, restricted to the columns useful for lookup
export_cols = [
    'gameId', 'playId', 'week', 'quarter', 'down', 'yardsToGo',
    'possessionTeam', 'defensiveTeam', 'passLength', 'passResult',
    'total_score', 'youtube_likelihood', 'playDescription'
]

# to_csv does not create missing parent directories, so make sure the output
# folder exists first (a no-op when it is already there).
export_path = Path('../outputs/top_20_plays.csv')
export_path.parent.mkdir(parents=True, exist_ok=True)

top_20[export_cols].to_csv(export_path, index=False)
print("✅ Top 20 plays exported to: outputs/top_20_plays.csv")

## Next Steps

1. **Review** `RECOMMENDED_PLAYS.md` for detailed play information
2. **Search YouTube** using the provided queries
3. **Download footage** for plays with available video
4. **Test synchronization** with Play #1 (the highest-scoring play)
5. **Create proof of concept** overlay for 1 play
6. **Scale to 5-7** final visualizations