# Referee-Playstyle-Discipline Analytics Demo

This notebook demonstrates a referee analytics system that models how a team's playstyle affects its disciplinary outcomes under each referee, broken down by pitch zone.

## Setup and Imports

In [None]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import yaml
from pathlib import Path

# Import our modules
from src.io_load import StatsBombLoader
from src.features import PlaystyleFeatureExtractor
from src.discipline import DisciplineAnalyzer
from src.modeling_zone_nb import ZoneNBModeler
from src.viz_referee import RefereeVisualizer
from backend.server import GitHubAPIClient

# Setup
# The 'seaborn-v0_8' style sheet name was introduced in matplotlib 3.6; older
# releases ship the same style under the name 'seaborn'. Pick whichever one
# this installation knows so the cell does not fail on older environments.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
sns.set_palette("husl")

print("✓ All modules imported successfully")

## 1. Data Loading and Feature Engineering

Load StatsBomb data and extract playstyle and discipline features.

In [None]:
# Load configuration
# Decode the file explicitly as UTF-8 so parsing does not depend on the
# platform's default locale encoding (which differs e.g. on Windows).
with open('../config.yaml', 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

# Echo a few settings as a quick sanity check that the expected sections exist
print("Configuration loaded:")
print(f"- Default competitions: {len(config['default_analysis']['competitions'])}")
zone_cfg = config['features']['discipline']['zones']
print(f"- Zone grid: {zone_cfg['x_bins']}x{zone_cfg['y_bins']}")
print(f"- Interaction features: {config['modeling']['zone_nb']['interaction_features']}")

In [None]:
# Initialize components (you'll need to provide GitHub token)
import os

# Prefer the GITHUB_TOKEN environment variable so a real token never has to be
# pasted into (and accidentally committed with) the notebook. The placeholder
# is kept as a fallback; with it, the data-loading cells are expected to fail.
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "your_github_token_here")

# Initialize clients
github_client = GitHubAPIClient(GITHUB_TOKEN)
loader = StatsBombLoader(github_client, cache_dir="../data/cache")

# Initialize analyzers, each configured from its own section of config.yaml
feature_extractor = PlaystyleFeatureExtractor(config['features']['playstyle'])
discipline_analyzer = DisciplineAnalyzer(config['features']['discipline'])

print("✓ Data loading components initialized")

## 2. Sample Match Analysis

Analyze a single match to demonstrate feature extraction.

In [None]:
# Get sample matches from La Liga
# Start from an empty frame so that a failed (re-)run of this cell cannot
# leave events from an earlier execution behind for the following cells.
events_df = pd.DataFrame()

try:
    matches_df = loader.get_matches(11, 90)  # La Liga 2020/21
    if matches_df.empty:
        # Report this explicitly instead of an opaque positional-index error
        raise ValueError("no matches returned for competition 11, season 90")
    sample_match = matches_df.iloc[0]

    print(f"Sample match: {sample_match['home_team_name']} vs {sample_match['away_team_name']}")
    print(f"Date: {sample_match['match_date']}")
    print(f"Referee: {sample_match.get('referee_name', 'Unknown')}")

    # Load events for this match
    events_df = loader.get_events(sample_match['match_id'])
    print(f"Events loaded: {len(events_df)}")

except Exception as e:
    # Demo boundary: report the problem and let the notebook continue with the
    # synthetic-data sections, which do not need StatsBomb access.
    print(f"Error loading sample data: {e}")
    print("This is expected if GitHub token is not provided")

## 3. Feature Extraction Demo

In [None]:
# Run both extractors on the sample match for the home side, provided the
# previous cell managed to load events.
events_available = 'events_df' in locals() and not events_df.empty

if not events_available:
    print("Skipping feature extraction - no sample data available")
else:
    home_team, away_team = sample_match['home_team_name'], sample_match['away_team_name']

    # Playstyle features of the home team in this match
    home_playstyle = feature_extractor.extract_team_match_features(events_df, home_team, away_team)

    # Discipline features of the home team in this match
    home_discipline = discipline_analyzer.extract_team_match_discipline(events_df, home_team, away_team)

    # Show only the first eight entries of each result to keep the output short
    print("Home Team Playstyle Features:")
    for feature_name, feature_value in list(home_playstyle.items())[:8]:
        print(f"  {feature_name}: {feature_value:.3f}")

    print("\nHome Team Discipline Features:")
    for feature_name, feature_value in list(home_discipline.items())[:8]:
        print(f"  {feature_name}: {feature_value}")

## 4. Zone-wise Modeling Demo

Demonstrate the statistical modeling approach with synthetic data.

In [None]:
# Build a synthetic team-match table with planted referee effects, so the
# modeling cells can be demonstrated without StatsBomb access.
np.random.seed(42)

n_matches = 200
referees = ['Referee A', 'Referee B', 'Referee C', 'Referee D', 'Referee E']

# Overall strictness multiplier per referee (enters the predictor via its log)
referee_strictness = {'Referee A': 1.2, 'Referee B': 0.8, 'Referee C': 1.0,
                      'Referee D': 1.1, 'Referee E': 0.9}
# Extra referee-specific slope on directness: A is stricter on direct play,
# B is more lenient; the remaining referees get no extra slope.
directness_slope_shift = {'Referee A': 0.3, 'Referee B': -0.2}
# 5 x 3 zone grid, enumerated x-major so the column order stays stable
zone_cells = [(gx, gy) for gx in range(5) for gy in range(3)]
# Reference exposure used to normalise the opponent-pass offset
log_reference_passes = np.log(400)

synthetic_data = []

for match_idx in range(n_matches):
    # NOTE: the order of the random draws below determines the generated data
    referee = np.random.choice(referees)

    # Standardized playstyle features
    z_directness = np.random.normal(0, 1)
    z_ppda = np.random.normal(0, 1)
    z_possession_share = np.random.normal(0, 1)
    z_block_height_x = np.random.normal(0, 1)
    z_wing_share = np.random.normal(0, 1)

    log_strictness = np.log(referee_strictness[referee])

    # Venue
    home_away = np.random.choice(['home', 'away'])
    home_indicator = int(home_away == 'home')

    # Exposure: opponent passes and their log (used as offset)
    opp_passes = np.random.poisson(400)
    log_opp_passes = np.log(opp_passes)

    row = {
        'match_id': 1000 + match_idx,
        'team': f'Team_{match_idx % 20}',
        'referee_name': referee,
        'home_away': home_away,
        'home_indicator': home_indicator,
        'z_directness': z_directness,
        'z_ppda': z_ppda,
        'z_possession_share': z_possession_share,
        'z_block_height_x': z_block_height_x,
        'z_wing_share': z_wing_share,
        'opp_passes': opp_passes,
        'log_opp_passes': log_opp_passes
    }

    # One Poisson foul count per zone
    for gx, gy in zone_cells:
        # Higher base rate in the three central x-bands than in the end bands
        base_rate = 0.5 + (1.0 if 1 <= gx <= 3 else 0.2)

        linear_pred = (base_rate +
                       0.2 * z_directness +
                       0.15 * z_ppda +
                       0.1 * home_indicator +
                       0.2 * log_strictness)

        # Referee-specific directness interaction
        if referee in directness_slope_shift:
            linear_pred += directness_slope_shift[referee] * z_directness

        # Scale the rate by exposure relative to the 400-pass reference
        lambda_param = np.exp(linear_pred + log_opp_passes - log_reference_passes)
        row[f'foul_grid_x{gx}_y{gy}'] = np.random.poisson(max(0.1, lambda_param))

    synthetic_data.append(row)

synthetic_df = pd.DataFrame(synthetic_data)
foul_columns = [c for c in synthetic_df.columns if c.startswith('foul_grid')]
print(f"Created synthetic dataset: {len(synthetic_df)} team-matches")
print(f"Referees: {synthetic_df['referee_name'].nunique()}")
print(f"Total fouls: {synthetic_df[foul_columns].sum().sum()}")

## 5. Fit Zone-wise Models

In [None]:
# Build the zone-wise modeler from the shared configuration
modeler = ZoneNBModeler(config)

# Let the modeler prepare the synthetic table; it also returns a summary
df_prepared, prep_info = modeler.prepare_modeling_data(synthetic_df)
print(f"Data prepared: {prep_info}")

# Fit one model per zone using three playstyle features plus a
# directness interaction
print("\nFitting zone-wise NB models...")
main_effects = ['z_directness', 'z_ppda', 'z_possession_share']
interaction_terms = ['directness']
fitted_models = modeler.fit_zone_nb_models(
    df_prepared,
    feature_list=main_effects,
    interaction_features=interaction_terms,
)

print(f"Successfully fitted {len(fitted_models)} zone models")

# Summarise fit quality across zones
diagnostics = modeler.get_model_diagnostics()
print("\nModel Diagnostics:")
print(f"- Convergence rate: {diagnostics['convergence_rate']:.1%}")
print(f"- Average AIC: {diagnostics['average_aic']:.2f}")
print(f"- Zones analyzed: {len(diagnostics['zones_analyzed'])}")

## 6. Extract Referee Effects

In [None]:
# Pull the referee-specific directness slopes out of the fitted models
directness_slopes = modeler.extract_referee_slopes('directness')

if directness_slopes.empty:
    print("No interaction slopes found (expected with limited synthetic data)")
else:
    print("Referee-specific directness effects:")
    # Per referee: mean of the slope column and sum of the significant column
    per_referee = directness_slopes.groupby('referee_name').agg(
        {'slope': 'mean', 'significant': 'sum'}
    )
    print(per_referee.round(3))

## 7. Visualization Examples

In [None]:
# Set up the visualizer from the shared configuration
visualizer = RefereeVisualizer(config)

# Feature profile of a hypothetical high-directness home team under Referee A
sample_team_features = dict(
    z_directness=1.0,  # High directness
    z_ppda=0.5,
    z_possession_share=-0.5,
    home_indicator=1,
    referee_name='Referee A',
    log_opp_passes=np.log(400),
)

# The prediction input also needs the zone columns; fill them with zeros
sample_team_features.update(
    {f'foul_grid_x{gx}_y{gy}': 0 for gx in range(5) for gy in range(3)}
)

# Predict expected fouls for this profile
sample_row = pd.Series(sample_team_features)
expected_fouls = modeler.predict_expected_fouls(sample_row)

print("Expected fouls per zone for high-directness team with Referee A:")
for zone_name, zone_fouls in expected_fouls.items():
    print(f"{zone_name}: {zone_fouls:.2f}")

In [None]:
# Create basic heatmap visualization of the expected fouls per zone
fig, ax = plt.subplots(figsize=(12, 8))

# Plot extent in data coordinates and the zone grid laid over it
pitch_length, pitch_width = 120, 80
n_x_zones, n_y_zones = 5, 3
cell_length = pitch_length / n_x_zones
cell_width = pitch_width / n_y_zones

# Create 5x3 grid (rows = y zones, columns = x zones)
foul_grid = np.zeros((n_y_zones, n_x_zones))
for zone_id, fouls in expected_fouls.items():
    # Only keys of the form 'zone_<x>_<y>' are placed on the grid
    if zone_id.startswith('zone_'):
        parts = zone_id.split('_')
        x_zone = int(parts[1])
        y_zone = int(parts[2])
        # Flip y so that y zone 0 ends up in the bottom row of the image
        foul_grid[n_y_zones - 1 - y_zone, x_zone] = fouls

# Create heatmap; with the default origin='upper', array row 0 is drawn at
# the TOP of the extent (y between 2/3 and 3/3 of the pitch width).
im = ax.imshow(foul_grid, cmap='Reds', aspect='auto',
               extent=[0, pitch_length, 0, pitch_width])

# Add values to cells. The label's y position must be measured from the top
# edge to match imshow's row order; measuring from the bottom would put every
# number on the vertically mirrored cell.
contrast_threshold = np.max(foul_grid) * 0.5
for row in range(n_y_zones):
    for col in range(n_x_zones):
        value = foul_grid[row, col]
        label_x = (col + 0.5) * cell_length
        label_y = pitch_width - (row + 0.5) * cell_width
        ax.text(label_x, label_y, f'{value:.1f}',
                ha='center', va='center', fontsize=12, fontweight='bold',
                color='white' if value > contrast_threshold else 'black')

# Formatting
# NOTE(review): a 120 x 80 extent looks like StatsBomb pitch units rather
# than metres - confirm the unit in the axis labels.
ax.set_xlim(0, pitch_length)
ax.set_ylim(0, pitch_width)
ax.set_xlabel('Field Length (m)')
ax.set_ylabel('Field Width (m)')
ax.set_title('Expected Fouls per Zone\nHigh-Directness Team with Referee A',
             fontsize=14, fontweight='bold')

# Colorbar
cbar = plt.colorbar(im, ax=ax, shrink=0.8)
cbar.set_label('Expected Fouls per Match', rotation=270, labelpad=20)

plt.tight_layout()
plt.show()

print(f"Total expected fouls: {np.sum(foul_grid):.1f}")

## 8. Summary and Next Steps

This demo has shown:

1. **Feature Engineering**: Extraction of playstyle (directness, pressing, possession) and disciplinary features (fouls by zone)
2. **Zone-wise Modeling**: Negative Binomial GLMs with referee interactions for each field zone
3. **Prediction**: Expected foul counts based on team playstyle and referee
4. **Visualization**: Heatmaps showing spatial distribution of expected fouls

### To use with real data:

1. **Provide GitHub token** for StatsBomb data access
2. **Run dataset builder**: `python src/run_build_dataset.py --github-token YOUR_TOKEN`
3. **Fit models**: `python src/run_fit_models.py --dataset data/team_match_features.parquet`
4. **Generate reports**: `python src/run_report.py --models-dir data/models_nb_zone --heatmap --referee "Referee Name" --feature directness`

### Key insights this system is designed to surface:

- **Referee consistency**: Some referees are systematically stricter/more lenient
- **Playstyle interactions**: Certain referees penalize specific playing styles more
- **Spatial patterns**: Foul calling varies by field zone and referee
- **Tactical intelligence**: Teams can adapt strategy based on referee assignment