# Player Scouting & Similarity Analysis
Find similar players and generate scouting reports for the 2023/24 season.

In [9]:
import os
import sys
import pandas as pd
import numpy as np
from pathlib import Path

# Add src directory to path
sys.path.append(str(Path('./src').resolve()))
from player_scout import PlayerScout

import plotly.express as px
import plotly.graph_objects as go

# Folder the notebook reads its input CSV from and writes its result CSVs to.
DATA_DIR = "./data"

# Title line followed by a 60-character rule, emitted in one call.
print("üîç Player Scouting Pipeline", "=" * 60, sep="\n")

üîç Player Scouting Pipeline


## 1. Load Player Data

In [10]:
print("\nüìÇ Loading player data...")

# One row per qualified player; every later cell works on this frame.
player_profiles = pd.read_csv(Path(DATA_DIR) / "player_profiles_2023.csv")

# Quick sanity summary: row count, distinct teams, raw position codes.
print(
    f"‚úì Player profiles loaded: {len(player_profiles)} qualified players",
    f"  Teams: {player_profiles['team_name'].nunique()}",
    f"  Positions: {player_profiles['position'].unique()}",
    sep="\n",
)


üìÇ Loading player data...
‚úì Player profiles loaded: 435 qualified players
  Teams: 20
  Positions: ['AML' 'DC' 'MC' 'GK' 'DMC' 'Sub' 'AMC' 'FW' 'AMR' 'DR' 'DL' 'MR' 'FWL'
 'FWR' 'ML' 'DMR' 'DML']


## 2. Create Player Feature Vectors
Assign each player to a position group and pull the list of per-90 metrics used for similarity comparison.

In [11]:
print("\n" + "=" * 60, "CREATING PLAYER FEATURE VECTORS", "=" * 60, sep="\n")

# The scout object provides the position mapping and the metric list.
scout = PlayerScout()

# Metric names and grouping metadata exposed by the scout; the percentile
# cell iterates over all_features_per90.
all_features_per90 = scout.all_features_per90
position_groups = scout.position_groups

# Collapse the raw position codes into the scout's broader groups.
# NOTE(review): the recorded output lists 'GK' and 'Sub' codes but only three
# groups covering all 435 rows — confirm how normalize_position buckets them.
player_profiles['position_group'] = player_profiles['position'].apply(scout.normalize_position)

print(f"‚úì Position groups assigned:")
print(player_profiles['position_group'].value_counts())


CREATING PLAYER FEATURE VECTORS
‚úì Position groups assigned:
position_group
Midfielder    221
Defender      152
Forward        62
Name: count, dtype: int64


## 3. Calculate Player Similarity
Use cosine similarity to find similar players within each position group.

In [12]:
print("\n" + "=" * 60, "CALCULATING PLAYER SIMILARITY", "=" * 60, sep="\n")

# Example: Find similar players using PlayerScout
print("\nüîç Finding similar players...")

# Reference player: the row with the highest goals_per90 value.
top_scorer = player_profiles.nlargest(1, 'goals_per90')['player_name'].iloc[0]
print(f"\nüìä Players similar to {top_scorer}:")

similar = scout.find_similar_players(player_profiles, top_scorer, n=5)
if len(similar) > 0:
    # rename() returns a new frame and ignores labels that are absent, so the
    # score column is relabelled for display only when it exists.
    display_df = similar.rename(columns={'similarity_score': 'similarity'})
    print(display_df.to_string(index=False))


CALCULATING PLAYER SIMILARITY

üîç Finding similar players...

üìä Players similar to Cole Palmer:
 player_name       team_name position  minutes  goals_per90  assists_per90  xG_per90  similarity
  Alex Iwobi          Fulham      AMR      757     0.356671       0.237781  0.330709    0.976019
  Phil Foden Manchester City      AMC      535     0.841121       0.504673  0.374846    0.951963
Moussa Diaby     Aston Villa      AMC      841     0.428062       0.428062  0.436389    0.946080
Eberechi Eze  Crystal Palace      AMC     1280     0.492187       0.210938  0.431953    0.939589
Noni Madueke         Chelsea      AMR      964     0.373444       0.186722  0.176901    0.921011


## 4. Player Percentile Rankings
Calculate percentile ranks for each metric within position groups.

In [13]:
print("\n" + "=" * 60, "CALCULATING PERCENTILE RANKINGS", "=" * 60, sep="\n")

# Calculate percentiles by position group
print("\n  Computing percentile ranks...")

# One '<metric>_pct' column per feature: the rank inside the player's own
# position group, scaled from a fraction to a 0-100 value.
percentile_cols = [f'{col}_pct' for col in all_features_per90]
for col, percentile_col in zip(all_features_per90, percentile_cols):
    player_profiles[percentile_col] = (
        player_profiles.groupby('position_group')[col].rank(pct=True) * 100
    )

print(f"‚úì Percentile rankings calculated for {len(all_features_per90)} metrics")

# Percentile columns that feed the overall rating, per position group.
key_metrics = {
    'Forward': [
        'goals_per90_pct',
        'xG_per90_pct',
        'shots_per90_pct',
    ],
    'Midfielder': [
        'assists_per90_pct',
        'xA_per90_pct',
        'key_passes_per90_pct',
        'xGChain_per90_pct',
    ],
    'Defender': [
        'xGBuildup_per90_pct',
        'xGChain_per90_pct',
    ],
}

def calculate_overall_rating(row, metrics_by_group=None, default_metrics=None):
    """Calculate a position-specific overall rating for one player row.

    The rating is the mean of the percentile columns configured for the
    player's position group. Missing (NaN) percentiles are ignored rather
    than turning the whole rating into NaN.

    Args:
        row: pandas Series for one player; must contain 'position_group'
            and whichever percentile columns are available.
        metrics_by_group: Optional mapping of position group -> metric names.
            Defaults to the notebook-level ``key_metrics``.
        default_metrics: Optional metric names used when the row's group is
            not in ``metrics_by_group``. Defaults to the notebook-level
            ``all_features_per90``.

    Returns:
        Mean of the usable percentile values, or 0 when none are usable.
    """
    # Resolve the notebook globals lazily so the function also works (and can
    # be tested) when the caller supplies its own configuration.
    if metrics_by_group is None:
        metrics_by_group = key_metrics
    if default_metrics is None:
        default_metrics = all_features_per90

    metrics = metrics_by_group.get(row['position_group'], default_metrics)

    # Accept metric names with or without the '_pct' suffix.
    pct_columns = [m if m.endswith('_pct') else m + '_pct' for m in metrics]

    # Skip columns the row lacks and values that are NaN: one missing
    # percentile should not erase the player's rating.
    values = [
        row[column]
        for column in pct_columns
        if column in row.index and pd.notna(row[column])
    ]

    return np.mean(values) if values else 0

# Row-wise apply: the set of metrics averaged depends on each row's group.
overall_ratings = player_profiles.apply(calculate_overall_rating, axis=1)
player_profiles['overall_rating'] = overall_ratings

print(f"‚úì Overall ratings calculated")


CALCULATING PERCENTILE RANKINGS

  Computing percentile ranks...
‚úì Percentile rankings calculated for 8 metrics
‚úì Overall ratings calculated


## 5. Generate Scouting Reports
Print a detailed scouting report for the top-rated player in each position group.

In [14]:
# Section banner: blank line, rule, title, rule.
print("\n" + "=" * 60, "GENERATING SCOUTING REPORTS", "=" * 60, sep="\n")

def create_scouting_report(player_name):
    """Print a scouting report for one player to stdout.

    Reads the notebook-level ``scout`` and ``player_profiles`` objects. The
    report data comes from ``scout.create_player_report``; similar players
    come from ``scout.find_similar_players``.

    Args:
        player_name: Name passed unchanged to both scout calls.

    Returns:
        None. If the scout's report contains an 'error' key, that message is
        printed and nothing else is shown.
    """
    
    # Get report from scout
    report = scout.create_player_report(player_profiles, player_name)
    
    # The scout signals failure via an 'error' entry rather than an exception.
    if 'error' in report:
        print(f"‚ö†Ô∏è {report['error']}")
        return
    
    print(f"\n{'='*60}")
    print(f"SCOUTING REPORT: {report['player_name']}")
    print(f"{'='*60}")
    print(f"Team: {report['team']}")
    print(f"Position: {report['position']} ({report['position_group']})")
    print(f"Minutes Played: {report['minutes']:.0f} ({report['matches']:.0f} appearances)")
    
    # Calculate overall rating if percentiles available
    # NOTE(review): this is the plain mean of every percentile the scout
    # returns. It is computed independently of the position-specific
    # 'overall_rating' column used to pick the top players, so the two
    # numbers can differ — confirm which one the report should show.
    if 'percentiles' in report and report['percentiles']:
        overall_rating = np.mean(list(report['percentiles'].values()))
        print(f"\nüìä OVERALL RATING: {overall_rating:.1f}/100")
    
    print(f"\n‚öΩ ATTACKING METRICS:")
    print(f"  Goals per 90: {report['goals_per90']:.2f}")
    print(f"  xG per 90: {report['xG_per90']:.2f}")
    print(f"  Assists per 90: {report['assists_per90']:.2f}")
    print(f"  xA per 90: {report['xA_per90']:.2f}")
    
    print(f"\nüéØ INVOLVEMENT METRICS:")
    print(f"  Shots per 90: {report['shots_per90']:.2f}")
    print(f"  Key Passes per 90: {report['key_passes_per90']:.2f}")
    print(f"  xGChain per 90: {report['xGChain_per90']:.2f}")
    
    # Find similar players
    # The section header is printed even when no similar players come back.
    print(f"\nüîç SIMILAR PLAYERS:")
    similar = scout.find_similar_players(player_profiles, player_name, n=3)
    if len(similar) > 0:
        for idx, sim_player in similar.iterrows():
            # Accept either score column name; fall back to 0 when neither exists.
            # The '%' format presumably expects a 0-1 score — TODO confirm.
            similarity = sim_player.get('similarity_score', sim_player.get('similarity', 0))
            print(f"  ‚Ä¢ {sim_player['player_name']} ({sim_player['team_name']}) - {similarity:.2%} similar")
    
    print(f"\n{'='*60}\n")

# One report per position group, for the player with the highest overall_rating.
print("\nüìã Generating reports for top players by position...")

for pos_group in ('Forward', 'Midfielder', 'Defender'):
    top_player = player_profiles.loc[
        player_profiles['position_group'] == pos_group
    ].nlargest(1, 'overall_rating')
    # A group with no players yields no report.
    if len(top_player) == 0:
        continue
    create_scouting_report(top_player['player_name'].iloc[0])


GENERATING SCOUTING REPORTS

üìã Generating reports for top players by position...

SCOUTING REPORT: Erling Haaland
Team: Manchester City
Position: FW (Forward)
Minutes Played: 2540 (31 appearances)

üìä OVERALL RATING: 77.4/100

‚öΩ ATTACKING METRICS:
  Goals per 90: 0.92
  xG per 90: 1.10
  Assists per 90: 0.18
  xA per 90: 0.17

üéØ INVOLVEMENT METRICS:
  Shots per 90: 4.18
  Key Passes per 90: 1.03
  xGChain per 90: 1.04

üîç SIMILAR PLAYERS:
  • Alexander Isak (Newcastle United) - 93.41% similar
  • Julián Álvarez (Manchester City) - 83.97% similar
  • Callum Wilson (Newcastle United) - 79.20% similar



SCOUTING REPORT: Kevin De Bruyne
Team: Manchester City
Position: AMC (Midfielder)
Minutes Played: 640 (17 appearances)

üìä OVERALL RATING: 91.3/100

‚öΩ ATTACKING METRICS:
  Goals per 90: 0.28
  xG per 90: 0.17
  Assists per 90: 0.70
  xA per 90: 0.98

üéØ INVOLVEMENT METRICS:
  Shots per 90: 3.09
  Key Passes per 90: 4.22
  xGChain per 90: 1.23

üîç SIMILAR PLAYE

## 6. Visualize Player Comparisons
Create scatter plots and radar charts for player analysis.

In [15]:
print("\n" + "=" * 60, "CREATING VISUALIZATIONS", "=" * 60, sep="\n")

# Scatter plot: Goals vs Assists per 90
print("\n  Creating scatter plot...")

# One marker per player: coloured by position group, sized by minutes played.
fig = px.scatter(
    player_profiles,
    title='Premier League Players: Goals vs Assists per 90 (2023/24)',
    x='goals_per90',
    y='assists_per90',
    color='position_group',
    size='minutes',
    width=1000,
    height=600,
    labels={
        'goals_per90': 'Goals per 90',
        'assists_per90': 'Assists per 90',
        'position_group': 'Position'
    },
    hover_name='player_name',
    hover_data={
        'team_name': True,
        'position': True,
        'goals_per90': ':.2f',
        'assists_per90': ':.2f',
        'xG_per90': ':.2f',
        'minutes': True,
        'overall_rating': ':.1f'
    },
)

# Thin white outline so overlapping markers stay distinguishable.
fig.update_traces(marker={'line': {'width': 0.5, 'color': 'white'}})
fig.show()

# NOTE(review): fig.show() is called just above, yet the message says display
# was skipped — confirm which is intended.
print("‚úì Scatter plot created (display skipped in batch mode)")

# Radar chart for player comparison
def create_player_radar(player_name, comparison_players=None):
    """Build a radar (polar) chart of a player's percentile ranks.

    Reads the notebook-level ``player_profiles`` frame and ``percentile_cols``
    list; each entry of ``percentile_cols`` becomes one axis of the chart.

    Args:
        player_name: Matched exactly against ``player_profiles['player_name']``;
            the first matching row is plotted as the main trace.
        comparison_players: Optional iterable of further player names to
            overlay. Names with no matching row are skipped.

    Returns:
        A plotly ``go.Figure`` with one ``Scatterpolar`` trace per plotted player.

    Raises:
        IndexError: If ``player_name`` has no row in ``player_profiles``.
    """
    pct_suffix = '_pct'

    def _axis_label(pct_col):
        """Readable axis label, e.g. 'key_passes_per90_pct' -> 'Key Passes'."""
        # Strip only the trailing suffix: str.replace would also delete an
        # interior '_pct' and corrupt the metric name.
        metric = pct_col[:-len(pct_suffix)] if pct_col.endswith(pct_suffix) else pct_col
        words = metric.replace('_per90', '').split('_')
        # str.title() turns 'xG' into 'Xg'; capitalise all-lowercase words
        # only and leave mixed-case metric names untouched.
        return ' '.join(w.capitalize() if w == w.lower() else w for w in words)

    def _first_row(name):
        """First profile row for ``name``, or None when the name is absent."""
        rows = player_profiles[player_profiles['player_name'] == name]
        return rows.iloc[0] if len(rows) > 0 else None

    player_data = _first_row(player_name)
    if player_data is None:
        # Same exception type as the bare .iloc[0] it replaces, but with a
        # message that names the missing player.
        raise IndexError(f"Player not found in player_profiles: {player_name!r}")

    # Index the percentile columns directly instead of stripping and
    # re-appending the suffix.
    display_names = [_axis_label(col) for col in percentile_cols]

    fig = go.Figure()

    # Main player
    fig.add_trace(go.Scatterpolar(
        r=[player_data[col] for col in percentile_cols],
        theta=display_names,
        fill='toself',
        name=player_name,
        line=dict(color='#00ff87', width=2)
    ))

    # Comparison players (semi-transparent so the main trace stays visible)
    for comp_name in comparison_players or []:
        comp_data = _first_row(comp_name)
        if comp_data is None:
            continue
        fig.add_trace(go.Scatterpolar(
            r=[comp_data[col] for col in percentile_cols],
            theta=display_names,
            fill='toself',
            name=comp_name,
            opacity=0.6
        ))

    fig.update_layout(
        # Percentiles are on a 0-100 scale, so fix the radial axis to match.
        polar=dict(radialaxis=dict(visible=True, range=[0, 100])),
        showlegend=True,
        title=f'Player Profile: {player_name} (Percentile Ranks)',
        width=800,
        height=600
    )

    return fig

# Example: overlay the two highest-rated forwards on one radar chart.
print("\n  Creating radar chart...")
top_forwards = player_profiles.loc[
    player_profiles['position_group'] == 'Forward'
].nlargest(2, 'overall_rating')

# Needs two distinct rows; with fewer forwards the chart is skipped.
if len(top_forwards) >= 2:
    player1, player2 = top_forwards['player_name'].iloc[:2]

    fig = create_player_radar(player1, comparison_players=[player2])
    fig.show()
    print(f"‚úì Radar chart created for {player1} vs {player2} (display skipped in batch mode)")


CREATING VISUALIZATIONS

  Creating scatter plot...


‚úì Scatter plot created (display skipped in batch mode)

  Creating radar chart...


‚úì Radar chart created for Erling Haaland vs Alexander Isak (display skipped in batch mode)


## 7. Save Results

In [16]:
print("\n" + "="*60)
print("SAVING RESULTS")
print("="*60)

# Save enhanced player profiles with percentiles
player_profiles.to_csv(os.path.join(DATA_DIR, "player_profiles_enhanced_2023.csv"), index=False)
print(f"üíæ Saved: player_profiles_enhanced_2023.csv")

# Create leaderboards
print("\nüìä Creating position-specific leaderboards...")

# Filter each position group once and reuse the result for both the
# leaderboard file and the closing summary.
group_summary = []  # (position group, player count, top-rated player name)
for pos_group in ['Forward', 'Midfielder', 'Defender']:
    group_players = player_profiles[player_profiles['position_group'] == pos_group]
    leaderboard = group_players.nlargest(20, 'overall_rating')
    filename = f"leaderboard_{pos_group.lower()}_2023.csv"
    leaderboard.to_csv(os.path.join(DATA_DIR, filename), index=False)
    print(f"üíæ Saved: {filename}")

    # The leaderboard is sorted by rating, so its first row is the top player.
    # An empty group reports "n/a" instead of raising on .iloc[0].
    top_name = leaderboard.iloc[0]['player_name'] if len(leaderboard) > 0 else "n/a"
    group_summary.append((pos_group, len(group_players), top_name))

print("\n" + "="*60)
print("‚úÖ PLAYER SCOUTING ANALYSIS COMPLETE!")
print("="*60)

print("\nüìä Summary:")
print(f"  Total players analyzed: {len(player_profiles)}")
for pos_group, player_count, _ in group_summary:
    print(f"  {pos_group}s: {player_count}")

# A blank line separates the counts from the top-rated list.
print()
for pos_group, _, top_name in group_summary:
    print(f"  Top-rated {pos_group}: {top_name}")


SAVING RESULTS
üíæ Saved: player_profiles_enhanced_2023.csv

üìä Creating position-specific leaderboards...
üíæ Saved: leaderboard_forward_2023.csv
üíæ Saved: leaderboard_midfielder_2023.csv
üíæ Saved: leaderboard_defender_2023.csv

‚úÖ PLAYER SCOUTING ANALYSIS COMPLETE!

üìä Summary:
  Total players analyzed: 435
  Forwards: 62
  Midfielders: 221
  Defenders: 152

  Top-rated Forward: Erling Haaland
  Top-rated Midfielder: Kevin De Bruyne
  Top-rated Defender: John Stones
