# Composite IQ Testing

This notebook tests different ways to interpret and combine the percentiles from our 11 IQ metrics to create a final composite Basketball IQ score.

In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats


## Load the Percentile Data

First, let's load the raw IQ metrics, compute each player's percentile for every metric inside the notebook, and see what we're working with.

In [24]:
# Read the raw per-player IQ metrics; percentiles are derived from them later in this cell
iq_metrics = pd.read_csv('../data/processed/all_player_iq_metrics.csv')

print(f"Raw IQ metrics shape: {iq_metrics.shape}")
print(f"\nColumns: {iq_metrics.columns.tolist()}")

# The 11 IQ metric columns (intended to match the README)
metric_columns = [
    'ast_tov_ratio',
    'late_clock_efficiency',
    'clutch_ast_tov',
    'efg_pct',
    'deflections_per_36',
    'screen_assists_per_36',
    'loose_balls_per_36',
    'shooting_foul_pct',
    'personal_foul_rate',
    'age',
    'ast_pct',
]

print(f"\nCalculating percentiles for {iq_metrics.shape[0]} players across {len(metric_columns)} metrics...")

# Start the percentile frame from the identifying columns only;
# the copy keeps later column additions from touching iq_metrics
base_columns = ['PLAYER_ID', 'PLAYER_NAME', 'TEAM_ID', 'POSITION', 'GP', 'MIN']
df = iq_metrics.loc[:, base_columns].copy()

# Metrics where a smaller raw value is the better outcome, so the ranking is inverted
LOWER_IS_BETTER = {'shooting_foul_pct', 'personal_foul_rate'}
# Metrics ranked against players at the same position instead of the whole league
POSITION_RELATIVE = {'screen_assists_per_36', 'shooting_foul_pct'}


def percentile_vs_reference(values, reference, lower_is_better=False):
    """Return, for each value, the percentage of finite reference values it beats.

    Parameters
    ----------
    values : array-like of numbers
        Values to score.
    reference : array-like of numbers
        Population to compare against; non-finite entries are ignored.
    lower_is_better : bool, default False
        If False, a value beats every reference entry strictly below it.
        If True, a value beats every reference entry strictly above it.

    Returns
    -------
    numpy.ndarray
        Percentages in [0, 100], same length as ``values``. Entries are NaN
        where the value is not finite or the reference has no finite entries.
    """
    values = np.asarray(values, dtype=float)
    reference = np.asarray(reference, dtype=float)
    # Sorting once lets every value be scored with a binary search
    reference = np.sort(reference[np.isfinite(reference)])

    result = np.full(values.shape, np.nan)
    scorable = np.isfinite(values)
    if reference.size == 0 or not scorable.any():
        return result

    if lower_is_better:
        # Count of reference entries strictly greater than each value
        beaten = reference.size - np.searchsorted(reference, values[scorable], side='right')
    else:
        # Count of reference entries strictly less than each value
        beaten = np.searchsorted(reference, values[scorable], side='left')
    result[scorable] = beaten / reference.size * 100
    return result


def position_relative_percentiles(frame, metric, lower_is_better=False):
    """Score ``metric`` for every row of ``frame`` against same-position players.

    Rows whose POSITION is missing or 'Unknown' are scored against all players.
    A position with fewer than two finite values yields NaN for its rows.

    Returns a numpy.ndarray aligned positionally with ``frame``.
    """
    values = frame[metric].to_numpy(dtype=float)
    positions = frame['POSITION']
    result = np.full(len(frame), np.nan)

    # Fall back to league-wide percentiles when the position is not known
    no_position = (positions.isna() | (positions == 'Unknown')).to_numpy()
    result[no_position] = percentile_vs_reference(values[no_position], values, lower_is_better)

    for position in positions[~no_position].unique():
        members = (positions == position).to_numpy()
        peers = values[members]
        # A single finite value has nothing to be ranked against
        if np.isfinite(peers).sum() > 1:
            result[members] = percentile_vs_reference(peers, peers, lower_is_better)
    return result


# Calculate percentiles for each metric
for metric in metric_columns:
    print(f"  Calculating percentiles for {metric}...")
    lower_is_better = metric in LOWER_IS_BETTER

    if metric in POSITION_RELATIVE:
        percentiles = position_relative_percentiles(iq_metrics, metric, lower_is_better)
    else:
        percentiles = percentile_vs_reference(iq_metrics[metric], iq_metrics[metric], lower_is_better)

    # Round to 1 decimal place; players with no usable value get the neutral 50th percentile
    percentiles = np.round(percentiles, 1)
    df[f'{metric}_percentile'] = np.where(np.isnan(percentiles), 50.0, percentiles)

# Sanity check: every percentile must be a real number inside [0, 100]
percentile_frame = df[[f'{metric}_percentile' for metric in metric_columns]]
assert percentile_frame.ge(0).all().all() and percentile_frame.le(100).all().all(), \
    "percentiles must lie in [0, 100]"

print(f"\nDataset shape after adding percentiles: {df.shape}")
print(f"Columns: {list(df.columns)}")

# Show first few rows
df.head()

Raw IQ metrics shape: (300, 17)

Columns: ['PLAYER_ID', 'PLAYER_NAME', 'TEAM_ID', 'POSITION', 'GP', 'MIN', 'ast_tov_ratio', 'late_clock_efficiency', 'clutch_ast_tov', 'efg_pct', 'deflections_per_36', 'screen_assists_per_36', 'loose_balls_per_36', 'shooting_foul_pct', 'personal_foul_rate', 'age', 'ast_pct']

Calculating percentiles for 300 players across 11 metrics...
  Calculating percentiles for ast_tov_ratio...
  Calculating percentiles for late_clock_efficiency...
  Calculating percentiles for clutch_ast_tov...
  Calculating percentiles for efg_pct...
  Calculating percentiles for deflections_per_36...
  Calculating percentiles for screen_assists_per_36...
  Calculating percentiles for loose_balls_per_36...
  Calculating percentiles for shooting_foul_pct...
  Calculating percentiles for personal_foul_rate...
  Calculating percentiles for age...
  Calculating percentiles for ast_pct...

Dataset shape after adding percentiles: (300, 17)
Columns: ['PLAYER_ID', 'PLAYER_NAME', 'TEAM_ID',

Unnamed: 0,PLAYER_ID,PLAYER_NAME,TEAM_ID,POSITION,GP,MIN,ast_tov_ratio_percentile,late_clock_efficiency_percentile,clutch_ast_tov_percentile,efg_pct_percentile,deflections_per_36_percentile,screen_assists_per_36_percentile,loose_balls_per_36_percentile,shooting_foul_pct_percentile,personal_foul_rate_percentile,age_percentile,ast_pct_percentile
0,1628983,Shai Gilgeous-Alexander,1610612760,Unknown,76,34.2,85.0,55.1,71.3,67.0,72.3,16.0,88.7,78.3,66.0,50.3,92.7
1,203507,Giannis Antetokounmpo,1610612749,Unknown,67,34.2,68.3,89.0,89.7,88.3,18.0,83.0,42.7,89.2,61.7,79.7,96.0
2,203999,Nikola Jokić,1610612743,Unknown,70,36.7,93.7,81.5,76.0,94.7,90.3,94.0,10.3,83.4,69.3,74.3,99.3
3,1629029,Luka Dončić,1610612747,Unknown,50,35.4,69.3,47.3,82.0,41.3,87.0,5.0,10.3,60.3,50.7,40.7,96.7
4,1630162,Anthony Edwards,1610612750,Unknown,79,36.3,21.7,30.1,23.0,50.7,39.7,5.0,24.7,94.2,85.7,19.3,71.0


In [25]:
# Collect every derived percentile column by its naming suffix
percentile_cols = [name for name in df.columns if name.endswith('_percentile')]
print(f"Percentile columns ({len(percentile_cols)}): {percentile_cols}")

# Report how many nulls each percentile column contains
print("\nMissing values in percentiles:")
null_counts = df[percentile_cols].isnull().sum()
for col_name, n_missing in null_counts.items():
    print(f"{col_name}: {n_missing}")

Percentile columns (11): ['ast_tov_ratio_percentile', 'late_clock_efficiency_percentile', 'clutch_ast_tov_percentile', 'efg_pct_percentile', 'deflections_per_36_percentile', 'screen_assists_per_36_percentile', 'loose_balls_per_36_percentile', 'shooting_foul_pct_percentile', 'personal_foul_rate_percentile', 'age_percentile', 'ast_pct_percentile']

Missing values in percentiles:
ast_tov_ratio_percentile: 0
late_clock_efficiency_percentile: 0
clutch_ast_tov_percentile: 0
efg_pct_percentile: 0
deflections_per_36_percentile: 0
screen_assists_per_36_percentile: 0
loose_balls_per_36_percentile: 0
shooting_foul_pct_percentile: 0
personal_foul_rate_percentile: 0
age_percentile: 0
ast_pct_percentile: 0


## Method 1: Simple Average

The most straightforward approach - equal weight to all 11 metrics.

In [26]:
# Method 1: unweighted mean across every percentile column
df['composite_simple_avg'] = df[percentile_cols].mean(axis='columns')

# Rank players best-first; tied players share the lowest rank number
simple_rank = df['composite_simple_avg'].rank(method='min', ascending=False)
df['rank_simple_avg'] = simple_rank.astype(int)

# Print the ten highest composite scores
print("=== TOP 10 - SIMPLE AVERAGE METHOD ===")
simple_display_cols = ['PLAYER_NAME', 'POSITION', 'TEAM_ID', 'composite_simple_avg', 'rank_simple_avg']
top_10_simple = df.nlargest(10, 'composite_simple_avg')[simple_display_cols]
for row in top_10_simple.itertuples(index=False):
    print(f"{row.rank_simple_avg:2d}. {row.PLAYER_NAME:<25} ({row.POSITION:<3}, {row.TEAM_ID}) - {row.composite_simple_avg:.1f}%")

=== TOP 10 - SIMPLE AVERAGE METHOD ===
 1. Nikola Jokić              (Unknown, 1610612743) - 78.8%
 2. T.J. McConnell            (Unknown, 1610612754) - 74.9%
 3. LeBron James              (Unknown, 1610612747) - 74.8%
 4. Jimmy Butler III          (Unknown, 1610612744) - 74.8%
 5. Giannis Antetokounmpo     (Unknown, 1610612749) - 73.2%
 6. Chris Paul                (Unknown, 1610612759) - 73.1%
 7. Stephen Curry             (Unknown, 1610612744) - 72.7%
 8. Tyrese Haliburton         (Unknown, 1610612754) - 72.6%
 9. Matisse Thybulle          (Unknown, 1610612757) - 71.6%
10. Josh Hart                 (Unknown, 1610612752) - 69.5%


## Method 2: Traditional IQ Scale

Convert percentiles to traditional IQ scale (mean=100, std=15) as mentioned in README.

In [27]:
# Method 2: Convert to traditional IQ scale
def percentile_to_iq(percentile, eps=0.05):
    """Convert a percentile (0-100) to the IQ scale (mean=100, std=15).

    The percentile is mapped to a standard-normal z-score with the inverse
    normal CDF and then rescaled as ``100 + 15 * z``.

    Parameters
    ----------
    percentile : float or array-like
        Percentile(s) on a 0-100 scale. NaN passes through as NaN.
    eps : float, default 0.05
        Percentiles are clipped to ``[eps, 100 - eps]`` before conversion.
        The inverse normal CDF is -inf at 0 and +inf at 1, so an unclipped
        0th or 100th percentile would turn any average that includes it
        into +/-inf. The default is half of the 0.1 rounding step used for
        the percentile columns in this notebook. Pass ``eps=0`` to disable
        clipping.

    Returns
    -------
    float or numpy.ndarray
        IQ-scale score(s) with the same shape as ``percentile``.
    """
    # Keep the input strictly inside (0, 100) so the z-score stays finite
    bounded = np.clip(percentile, eps, 100 - eps)
    z_score = stats.norm.ppf(bounded / 100)
    iq_score = 100 + 15 * z_score
    return iq_score

# Map every percentile column onto the IQ scale, then average the per-metric IQ scores
iq_cols = [name.replace('_percentile', '_iq') for name in percentile_cols]
for source_col, target_col in zip(percentile_cols, iq_cols):
    df[target_col] = df[source_col].map(percentile_to_iq)

df['composite_iq_scale'] = df[iq_cols].mean(axis='columns')

# Rank players best-first; tied players share the lowest rank number
iq_rank = df['composite_iq_scale'].rank(method='min', ascending=False)
df['rank_iq_scale'] = iq_rank.astype(int)

print("=== TOP 10 - TRADITIONAL IQ SCALE METHOD ===")
iq_display_cols = ['PLAYER_NAME', 'POSITION', 'TEAM_ID', 'composite_iq_scale', 'rank_iq_scale']
top_10_iq = df.nlargest(10, 'composite_iq_scale')[iq_display_cols]
for row in top_10_iq.itertuples(index=False):
    print(f"{row.rank_iq_scale:2d}. {row.PLAYER_NAME:<25} ({row.POSITION:<3}, {row.TEAM_ID}) - {row.composite_iq_scale:.1f} IQ")

=== TOP 10 - TRADITIONAL IQ SCALE METHOD ===
 1. Nikola Jokić              (Unknown, 1610612743) - 114.9 IQ
 2. LeBron James              (Unknown, 1610612747) - 114.4 IQ
 3. Tyrese Haliburton         (Unknown, 1610612754) - 114.2 IQ
 4. Jimmy Butler III          (Unknown, 1610612744) - 113.5 IQ
 5. Chris Paul                (Unknown, 1610612759) - 113.2 IQ
 6. Tyus Jones                (Unknown, 1610612756) - 112.0 IQ
 7. Matisse Thybulle          (Unknown, 1610612757) - 111.5 IQ
 8. Stephen Curry             (Unknown, 1610612744) - 111.3 IQ
 9. Giannis Antetokounmpo     (Unknown, 1610612749) - 111.1 IQ
10. Brandon Williams          (Unknown, 1610612742) - 110.0 IQ


In [None]:
# Method 3: Custom weighted approach emphasizing true basketball IQ
weights = {
    # Decision making and playmaking (40% total)
    'ast_tov_ratio_percentile': 0.15,         # Most indicative of decision making
    'clutch_ast_tov_percentile': 0.10,        # Pressure decisions
    'ast_pct_percentile': 0.15,               # Playmaking ability shows intellect

    # Awareness and team play (24% total)
    'deflections_per_36_percentile': 0.10,    # Anticipation and awareness
    'screen_assists_per_36_percentile': 0.10, # Team play IQ
    'late_clock_efficiency_percentile': 0.04, # Pressure execution

    # Efficiency (10% total) - Shot selection
    'efg_pct_percentile': 0.10,               # Efficient shot selection

    # Discipline and hustle (25% total)
    'shooting_foul_pct_percentile': 0.10,     # Defensive discipline
    'loose_balls_per_36_percentile': 0.05,    # Hustle (athletic)
    'personal_foul_rate_percentile': 0.10,    # Personal foul discipline

    # Age (1% total) - Experience, kept at a token weight
    'age_percentile': 0.01
}

# Fail loudly if the weights stop summing to 100% after tuning;
# the ':.0%' display below would round small drift away
total_weight = sum(weights.values())
assert np.isclose(total_weight, 1.0), f"Weights must sum to 1.0, got {total_weight:.4f}"

# Display label for each weighted column, in the order the formula is printed
weight_labels = {
    'ast_tov_ratio_percentile': 'AST/TOV',
    'clutch_ast_tov_percentile': 'Clutch AST/TOV',
    'ast_pct_percentile': 'Assist Percentage',
    'deflections_per_36_percentile': 'Deflection Rate',
    'screen_assists_per_36_percentile': 'Screen Assist Rate',
    'late_clock_efficiency_percentile': 'Late Clock Efficiency',
    'efg_pct_percentile': 'EFG Percentage',
    'shooting_foul_pct_percentile': 'Shooting Foul Rate',
    'loose_balls_per_36_percentile': 'Loose Ball Retrieval Rate',
    'personal_foul_rate_percentile': 'Personal Foul Rate',
    'age_percentile': 'Age',
}

print("Weighted Formula:")
for metric, label in weight_labels.items():
    print(f"  {label}: {weights[metric]:.0%}")
print(f"\nTotal weight: {total_weight:.0%}")

# Weighted composite: sum of percentile * weight, accumulated in the order of `weights`
df['composite_weighted_iq'] = sum(df[metric] * weight for metric, weight in weights.items())

# Rank players best-first; tied players share the lowest rank number
df['rank_weighted_iq'] = df['composite_weighted_iq'].rank(ascending=False, method='min').astype(int)

# Header in the same style as the Method 1 and Method 2 listings
print("\n=== TOP 15 - WEIGHTED BASKETBALL IQ METHOD ===")
top_15_weighted_iq = df.nlargest(15, 'composite_weighted_iq')[['PLAYER_NAME', 'POSITION', 'TEAM_ID', 'composite_weighted_iq', 'rank_weighted_iq']]
for _, player in top_15_weighted_iq.iterrows():
    print(f"{player['rank_weighted_iq']:2d}. {player['PLAYER_NAME']:<25} ({player['POSITION']:<3}, {player['TEAM_ID']}) - {player['composite_weighted_iq']:.1f}%")

Weighted Basketball IQ Formula:
  AST/TOV: 15%
  Clutch AST/TOV: 10%
  Assist Percentage: 15%
  Deflection Rate: 10%
  Screen Assist Rate: 10%
  Late Clock Efficiency: 4%
  EFG Percentage: 10%
  Shooting Foul Rate: 10%
  Loose Ball Retrieval Rate: 5%
  Personal Foul Rate: 10%
  Age: 1%

Total weight: 100%

=== TOP 15 - WEIGHTED BASKETBALL IQ METHOD ==
 1. Nikola Jokić              (Unknown, 1610612743) - 84.2%
 2. Tyrese Haliburton         (Unknown, 1610612754) - 82.0%
 3. LeBron James              (Unknown, 1610612747) - 75.7%
 4. Jimmy Butler III          (Unknown, 1610612744) - 75.4%
 5. Tyus Jones                (Unknown, 1610612756) - 74.8%
 6. T.J. McConnell            (Unknown, 1610612754) - 74.2%
 7. Giannis Antetokounmpo     (Unknown, 1610612749) - 74.1%
 8. Chris Paul                (Unknown, 1610612759) - 74.1%
 9. Matisse Thybulle          (Unknown, 1610612757) - 72.0%
10. Stephen Curry             (Unknown, 1610612744) - 72.0%
11. Ty Jerome                 (Unknown, 161061