# Tactical Analysis & Team Clustering
Analyze team playing styles and cluster teams based on tactical metrics.

In [1]:
# NOTE(review): presumably installed so plotly figures can render in the notebook (`fig.show()`) — confirm.
# `--upgrade` is used without a version pin, so a re-run may pull a different nbformat release.
%pip install --upgrade nbformat

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import plotly.express as px
import plotly.graph_objects as go

# Folder that holds the CSV files this notebook reads and writes.
DATA_DIR = "./data"

# Startup banner: title line followed by a 60-character rule.
print("âš½ Tactical Analysis Pipeline", "=" * 60, sep="\n")

âš½ Tactical Analysis Pipeline


## 1. Load Team Features

In [3]:
print("\nðŸ“‚ Loading team features...")

# Read the team feature table and convert its 'date' column to datetimes.
features_path = os.path.join(DATA_DIR, "team_features_2023.csv")
team_features = pd.read_csv(features_path).assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)

# Quick sanity summary of what was loaded.
n_records = len(team_features)
n_teams = team_features['team_name'].nunique()
last_matchweek = team_features['matchweek'].max()
print(f"âœ“ Team features loaded: {n_records} records")
print(f"  Teams: {n_teams}")
print(f"  Matchweeks: {last_matchweek}")


ðŸ“‚ Loading team features...
âœ“ Team features loaded: 760 records
  Teams: 20
  Matchweeks: 38


## 2. Create Team Style Vectors
Aggregate season-level tactical metrics for each team.

In [4]:
print("\n" + "="*60)
print("CREATING TEAM STYLE VECTORS")
print("="*60)

# Calculate season averages for tactical features
print("\n  Aggregating tactical metrics...")

tactical_features = [
    'xG', 'xGA', 'shots', 'shots_against',
    'shots_on_target', 'deep', 'ppda',
    'shot_quality', 'pressing_intensity'
]

# Build every per-team aggregate in ONE groupby so that all columns are aligned
# on team_name by pandas itself, rather than pasting separately grouped results
# together by position (which silently relies on identical group ordering).
style_aggregations = {metric: (metric, 'mean') for metric in tactical_features}
style_aggregations.update(
    # Form metrics
    avg_goals_for=('goals_for', 'mean'),
    avg_goals_against=('goals_against', 'mean'),
    ppg=('points', 'mean'),
    # Share of a team's rows whose result equals 'W'.
    win_rate=('result', lambda results: (results == 'W').mean()),
)

# Resulting column order: team_name, the tactical metrics, then the form metrics.
team_styles = (
    team_features
    .groupby('team_name')
    .agg(**style_aggregations)
    .reset_index()
)

print(f"âœ“ Team style vectors created: {len(team_styles)} teams")
print(f"  Features: {len(tactical_features)} tactical metrics")


CREATING TEAM STYLE VECTORS

  Aggregating tactical metrics...
âœ“ Team style vectors created: 20 teams
  Features: 9 tactical metrics


## 3. Cluster Teams by Playing Style
Use K-means clustering to group teams with similar tactics.

In [5]:
print("\n" + "="*60)
print("CLUSTERING TEAMS")
print("="*60)

# Prepare features for clustering
print("\n  Standardizing features...")
feature_cols = tactical_features
X_raw = team_styles[feature_cols]
# Impute gaps with the column mean so a missing metric lands at the centre of
# the standardized scale instead of becoming an extreme outlier (a raw 0 would).
# The trailing fillna(0) only matters for a column that is entirely missing.
X = X_raw.fillna(X_raw.mean()).fillna(0)

# Put every metric on zero mean / unit variance so no single metric dominates
# the Euclidean distances used by K-means.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# K-means clustering
n_clusters = 5
# Report the configured k rather than a hard-coded number so the message
# cannot drift from the value actually used.
print(f"\n  Running K-means clustering (k={n_clusters})...")
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
team_styles['cluster'] = kmeans.fit_predict(X_scaled)

print(f"âœ“ Teams clustered into {n_clusters} groups")

# Show cluster distribution
for cluster_id in range(n_clusters):
    cluster_teams = team_styles.loc[team_styles['cluster'] == cluster_id, 'team_name'].tolist()
    print(f"\n  Cluster {cluster_id}: {len(cluster_teams)} teams")
    print(f"    {', '.join(cluster_teams)}")


CLUSTERING TEAMS

  Standardizing features...

  Running K-means clustering (k=5)...
âœ“ Teams clustered into 5 groups

  Cluster 0: 8 teams
    Aston Villa, Burnley, Crystal Palace, Everton, Fulham, Manchester United, Nottingham Forest, Wolverhampton Wanderers

  Cluster 1: 2 teams
    Arsenal, Manchester City

  Cluster 2: 6 teams
    Bournemouth, Brentford, Brighton, Chelsea, Newcastle United, Tottenham

  Cluster 3: 3 teams
    Luton, Sheffield United, West Ham

  Cluster 4: 1 teams
    Liverpool


## 4. Analyze Cluster Characteristics
Describe the tactical profile of each cluster.

In [6]:
print("\n" + "="*60)
print("CLUSTER CHARACTERISTICS")
print("="*60)

# Calculate cluster centroids (mean of each metric over the teams in a cluster)
cluster_profiles = team_styles.groupby('cluster')[tactical_features + ['ppg', 'win_rate']].mean()

# Reference level per metric: the unweighted mean of the cluster centroids.
# Computed once here instead of being re-evaluated inside every branch.
centroid_baseline = cluster_profiles[['xG', 'xGA', 'ppda', 'pressing_intensity']].mean()


def _style_label(profile, baseline):
    """Map one cluster centroid to a coarse style name.

    Args:
        profile: Series with the centroid's 'xG', 'xGA', 'ppda' and
            'pressing_intensity' values.
        baseline: Series with the reference value for the same metrics.

    Returns:
        One of "Attacking Pressing", "Attacking Possession",
        "Defensive Deep Block", "High Pressing" or "Balanced".
        The first matching rule wins.
    """
    above_avg_xg = profile['xG'] > baseline['xG']
    above_avg_press = profile['pressing_intensity'] > baseline['pressing_intensity']

    if above_avg_xg and above_avg_press:
        return "Attacking Pressing"
    if above_avg_xg:
        return "Attacking Possession"
    # NOTE(review): assumes the conventional PPDA definition (opponent passes
    # allowed per defensive action), where LOW values mean intense pressing.
    # A passive deep block therefore shows ABOVE-average PPDA; the previous
    # `<` comparison selected pressing sides for this label.
    if profile['xGA'] < baseline['xGA'] and profile['ppda'] > baseline['ppda']:
        return "Defensive Deep Block"
    if above_avg_press:
        return "High Pressing"
    return "Balanced"


# Define cluster labels based on characteristics
cluster_labels = {
    cluster_id: _style_label(cluster_profiles.loc[cluster_id], centroid_baseline)
    for cluster_id in range(n_clusters)
}

team_styles['cluster_label'] = team_styles['cluster'].map(cluster_labels)

print("\nðŸ“Š Cluster Profiles:")
for cluster_id in range(n_clusters):
    profile = cluster_profiles.loc[cluster_id]
    label = cluster_labels[cluster_id]
    teams_in_cluster = team_styles[team_styles['cluster'] == cluster_id]

    print(f"\n  ðŸŽ¯ Cluster {cluster_id}: {label}")
    print(f"     Teams: {len(teams_in_cluster)}")
    print(f"     Avg xG: {profile['xG']:.2f}")
    print(f"     Avg xGA: {profile['xGA']:.2f}")
    print(f"     Pressing: {profile['pressing_intensity']:.2f}")
    print(f"     Shot Quality: {profile['shot_quality']:.2f}")
    print(f"     PPG: {profile['ppg']:.2f}")


CLUSTER CHARACTERISTICS

ðŸ“Š Cluster Profiles:

  ðŸŽ¯ Cluster 0: Balanced
     Teams: 8
     Avg xG: 1.45
     Avg xGA: 1.75
     Pressing: 0.08
     Shot Quality: 0.12
     PPG: 1.24

  ðŸŽ¯ Cluster 1: Attacking Possession
     Teams: 2
     Avg xG: 2.29
     Avg xGA: 0.91
     Pressing: 0.10
     Shot Quality: 0.14
     PPG: 2.37

  ðŸŽ¯ Cluster 2: Attacking Pressing
     Teams: 6
     Avg xG: 1.88
     Avg xGA: 1.67
     Pressing: 0.12
     Shot Quality: 0.13
     PPG: 1.42

  ðŸŽ¯ Cluster 3: Balanced
     Teams: 3
     Avg xG: 1.28
     Avg xGA: 2.24
     Pressing: 0.07
     Shot Quality: 0.12
     PPG: 0.82

  ðŸŽ¯ Cluster 4: Attacking Pressing
     Teams: 1
     Avg xG: 2.49
     Avg xGA: 1.25
     Pressing: 0.16
     Shot Quality: 0.12
     PPG: 2.16


## 5. Visualize Clusters with PCA
Reduce dimensions and visualize team clusters.

In [7]:
print("\n" + "="*60)
print("PCA VISUALIZATION")
print("="*60)

# Apply PCA
print("\n  Applying PCA (2 components)...")
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_scaled)

# Store the 2-D coordinates next to the other per-team columns.
team_styles['pca1'] = X_pca[:, 0]
team_styles['pca2'] = X_pca[:, 1]

print("âœ“ PCA complete")
print(f"  Explained variance: {pca.explained_variance_ratio_.sum():.2%}")

# Several clusters can receive the same heuristic style label, so colouring by
# 'cluster_label' alone would draw distinct clusters in one colour. Colour by a
# per-cluster display string instead. It lives on a plotting copy only, so the
# columns of team_styles are not changed by this step.
plot_frame = team_styles.assign(
    cluster_display=(
        "Cluster "
        + team_styles['cluster'].astype(str)
        + ": "
        + team_styles['cluster_label'].astype(str)
    )
).sort_values('cluster')  # legend entries appear in cluster-id order

# Create interactive scatter plot
fig = px.scatter(
    plot_frame,
    x='pca1',
    y='pca2',
    color='cluster_display',
    hover_name='team_name',
    hover_data={
        'pca1': False,
        'pca2': False,
        'cluster_display': False,  # already conveyed by the legend/colour
        'cluster_label': True,
        'xG': ':.2f',
        'xGA': ':.2f',
        'ppg': ':.2f',
        'pressing_intensity': ':.2f'
    },
    title='Premier League Team Tactical Clusters (2023/24)',
    labels={
        'pca1': 'PC1',
        'pca2': 'PC2',
        'cluster_display': 'Cluster',
        'cluster_label': 'Tactical Style'
    },
    width=900,
    height=600
)

fig.update_traces(marker=dict(size=15, line=dict(width=1, color='white')))
fig.show()

print("\nðŸ“Š Interactive visualization displayed above")


PCA VISUALIZATION

  Applying PCA (2 components)...
âœ“ PCA complete
  Explained variance: 83.78%



ðŸ“Š Interactive visualization displayed above


## 6. Team Comparison Radar Charts
Create radar charts to compare team tactical profiles.

In [8]:
# Section banner: heading framed by two 60-character rules.
section_rule = "=" * 60
print("\n" + section_rule)
print("TEAM COMPARISON VISUALIZATION")
print(section_rule)

def _radar_percentiles(team_name, metrics, lower_is_better=('xGA',)):
    """Return a team's percentile rank (0-100) for each metric within team_styles.

    For a regular metric the rank is the share of teams whose value is less
    than or equal to the team's value. For metrics in `lower_is_better` the
    comparison is flipped, so a higher rank always means "better".

    Raises:
        IndexError: if `team_name` is not present in team_styles.
    """
    matches = team_styles[team_styles['team_name'] == team_name]
    if matches.empty:
        raise IndexError(f"Team {team_name!r} not found in team_styles")
    team_row = matches.iloc[0]

    ranks = []
    for metric in metrics:
        league_values = team_styles[metric]
        if metric in lower_is_better:
            share = (league_values >= team_row[metric]).mean()
        else:
            share = (league_values <= team_row[metric]).mean()
        ranks.append(share * 100)
    return ranks


def create_radar_chart(team_name, comparison_teams=None):
    """Create a radar chart of a team's percentile ranks, optionally with other teams.

    Args:
        team_name: Name of the main team, as it appears in team_styles['team_name'].
        comparison_teams: Optional iterable of further team names; each one is
            drawn as an additional, semi-transparent trace.

    Returns:
        A plotly Figure with one Scatterpolar trace per team.

    Raises:
        IndexError: if any requested team is not present in team_styles.
    """
    radar_metrics = ['xG', 'xGA', 'shots', 'pressing_intensity', 'shot_quality', 'deep']

    # Create figure
    fig = go.Figure()

    # Add team trace
    fig.add_trace(go.Scatterpolar(
        r=_radar_percentiles(team_name, radar_metrics),
        theta=radar_metrics,
        fill='toself',
        name=team_name,
        line=dict(color='#00ff87', width=2)
    ))

    # Add comparison teams (None or an empty collection adds nothing)
    for comp_team in (comparison_teams or []):
        fig.add_trace(go.Scatterpolar(
            r=_radar_percentiles(comp_team, radar_metrics),
            theta=radar_metrics,
            fill='toself',
            name=comp_team,
            opacity=0.6
        ))

    fig.update_layout(
        polar=dict(
            # Percentile ranks live on a fixed 0-100 scale.
            radialaxis=dict(visible=True, range=[0, 100])
        ),
        showlegend=True,
        title=f'Tactical Profile: {team_name} (Percentile Ranks)',
        width=700,
        height=500
    )

    return fig

# Example: chart the two best teams by points per game, each against the
# next team in the top-3 list.
print("\n  Creating radar charts for top teams...")
top_teams = team_styles.nlargest(3, 'ppg')['team_name'].tolist()
n_top = len(top_teams)

for position in range(min(2, n_top)):  # Show first 2
    team = top_teams[position]
    rival = top_teams[(position + 1) % n_top]
    fig = create_radar_chart(team, comparison_teams=[rival])
    fig.show()
    print(f"âœ“ Radar chart for {team}")


TEAM COMPARISON VISUALIZATION

  Creating radar charts for top teams...


âœ“ Radar chart for Manchester City


âœ“ Radar chart for Arsenal


## 7. Save Results

In [9]:
rule = "=" * 60
print("\n" + rule)
print("SAVING RESULTS")
print(rule)

# Write both result tables to DATA_DIR: the per-team styles (with clusters)
# and the per-cluster profiles (cluster id restored as a regular column).
exports = (
    (team_styles, "team_tactical_styles_2023.csv"),
    (cluster_profiles.reset_index(), "cluster_profiles_2023.csv"),
)
for frame, filename in exports:
    frame.to_csv(os.path.join(DATA_DIR, filename), index=False)
    print(f"ðŸ’¾ Saved: {filename}")

print("\n" + rule)
print("âœ… TACTICAL ANALYSIS COMPLETE!")
print(rule)

# Closing summary of the run.
print("\nðŸ“Š Results Summary:")
summary_lines = [
    f"  {n_clusters} tactical clusters identified",
    f"  {len(team_styles)} teams analyzed",
    f"  Cluster labels: {list(cluster_labels.values())}",
]
print("\n".join(summary_lines))


SAVING RESULTS
ðŸ’¾ Saved: team_tactical_styles_2023.csv
ðŸ’¾ Saved: cluster_profiles_2023.csv

âœ… TACTICAL ANALYSIS COMPLETE!

ðŸ“Š Results Summary:
  5 tactical clusters identified
  20 teams analyzed
  Cluster labels: ['Balanced', 'Attacking Possession', 'Attacking Pressing', 'Balanced', 'Attacking Pressing']
