In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

## Load the Percentile Data

First, let's look at the percentile columns we calculated and check whether any of them contain missing values.

In [25]:
# Gather every column of `df` whose name carries the "_percentile" suffix.
# NOTE(review): `df` must already exist in the kernel when this cell runs.
percentile_cols = [name for name in df.columns if name.endswith('_percentile')]
print(f"Percentile columns ({len(percentile_cols)}): {percentile_cols}")

# Count the null entries of each percentile column and list them one per line.
print("\nMissing values in percentiles:")
null_counts = df[percentile_cols].isnull().sum()
for col, missing in null_counts.items():
    print(f"{col}: {missing}")

Percentile columns (11): ['ast_tov_ratio_percentile', 'late_clock_efficiency_percentile', 'clutch_ast_tov_percentile', 'efg_pct_percentile', 'deflections_per_36_percentile', 'screen_assists_per_36_percentile', 'loose_balls_per_36_percentile', 'shooting_foul_pct_percentile', 'personal_foul_rate_percentile', 'age_percentile', 'ast_pct_percentile']

Missing values in percentiles:
ast_tov_ratio_percentile: 0
late_clock_efficiency_percentile: 0
clutch_ast_tov_percentile: 0
efg_pct_percentile: 0
deflections_per_36_percentile: 0
screen_assists_per_36_percentile: 0
loose_balls_per_36_percentile: 0
shooting_foul_pct_percentile: 0
personal_foul_rate_percentile: 0
age_percentile: 0
ast_pct_percentile: 0


In [ ]:
# Read the processed per-player IQ metrics and summarise what was loaded.
metrics_path = '../data/processed/all_player_iq_metrics.csv'
iq_metrics = pd.read_csv(metrics_path)

loaded_shape = iq_metrics.shape
loaded_columns = list(iq_metrics.columns)
print(f"raw iq metrics shape: {loaded_shape}")
print(f"\ncolumns: {loaded_columns}")

metric_columns = [
    'ast_tov_ratio', 'late_clock_efficiency', 'clutch_ast_tov', 'efg_pct',
    'deflections_per_36', 'screen_assists_per_36', 'loose_balls_per_36',
    'shooting_foul_pct', 'personal_foul_rate', 'age', 'ast_pct'
]
# Metrics ranked against players at the same position instead of the whole league.
position_adjusted_metrics = ['screen_assists_per_36', 'shooting_foul_pct']
# Metrics where a lower raw value is better, so the ranking direction is flipped.
lower_is_better_metrics = ['shooting_foul_pct', 'personal_foul_rate']


def _percentile_ranks(values, pool, lower_is_better=False, min_pool_size=1):
    """Return, for each value, the percentage of finite pool entries it strictly beats.

    Parameters
    ----------
    values : array-like of numbers
        Values to score. Non-finite entries (NaN, +/-inf) are scored as NaN.
    pool : array-like of numbers
        Reference population. Non-finite entries are dropped before ranking.
    lower_is_better : bool
        If True a value "beats" pool entries strictly greater than it;
        otherwise it beats pool entries strictly smaller than it.
    min_pool_size : int
        Minimum number of finite pool entries required; with fewer, every
        result is NaN.

    Returns
    -------
    numpy.ndarray
        Float array shaped like `values`, holding percentages in [0, 100) or NaN.
    """
    values = np.asarray(values, dtype=float)
    pool = np.asarray(pool, dtype=float)
    sorted_pool = np.sort(pool[np.isfinite(pool)])
    ranks = np.full(values.shape, np.nan)
    # An empty pool can never rank anything, whatever min_pool_size says.
    if sorted_pool.size < max(min_pool_size, 1):
        return ranks

    scorable = np.isfinite(values)
    if lower_is_better:
        # side='right' skips ties, leaving the count of entries strictly greater.
        beaten = sorted_pool.size - np.searchsorted(sorted_pool, values[scorable], side='right')
    else:
        # side='left' stops before ties, giving the count of entries strictly smaller.
        beaten = np.searchsorted(sorted_pool, values[scorable], side='left')
    ranks[scorable] = beaten / sorted_pool.size * 100
    return ranks


def _position_adjusted_ranks(values, positions, lower_is_better=False):
    """Rank each value against the other values that share its position label.

    Rows whose position is missing or the literal 'Unknown' are ranked against
    the whole population instead. A position group needs at least two finite
    values; otherwise its rows are scored as NaN.

    Parameters
    ----------
    values : array-like of numbers
        One metric value per row.
    positions : array-like
        One position label per row, aligned with `values` by order.
    lower_is_better : bool
        Ranking direction, as in `_percentile_ranks`.

    Returns
    -------
    numpy.ndarray
        Float array of percentile ranks (or NaN), one per row.
    """
    values = np.asarray(values, dtype=float)
    positions = pd.Series(positions).reset_index(drop=True)
    lacks_position = (positions.isna() | (positions == 'Unknown')).to_numpy(dtype=bool, na_value=False)

    ranks = np.full(values.shape, np.nan)
    ranks[lacks_position] = _percentile_ranks(values[lacks_position], values, lower_is_better)
    # One pass per distinct position instead of re-filtering the frame per row.
    for position in positions[~lacks_position].unique():
        in_group = (positions == position).to_numpy(dtype=bool, na_value=False)
        ranks[in_group] = _percentile_ranks(
            values[in_group], values[in_group], lower_is_better, min_pool_size=2
        )
    return ranks


print(f"\ncalculating percentiles for {len(iq_metrics)} players across {len(metric_columns)} metrics...")

# `df` carries the identifying columns plus one `<metric>_percentile` column per metric.
base_columns = ['PLAYER_ID', 'PLAYER_NAME', 'TEAM_ID', 'POSITION', 'GP', 'MIN']
df = iq_metrics[base_columns].copy()

for metric in metric_columns:
    print(f"  calculating percentiles for {metric}...")

    lower_is_better = metric in lower_is_better_metrics
    if metric in position_adjusted_metrics:
        percentiles = _position_adjusted_ranks(
            iq_metrics[metric], iq_metrics['POSITION'], lower_is_better
        )
    else:
        percentiles = _percentile_ranks(iq_metrics[metric], iq_metrics[metric], lower_is_better)

    df[f'{metric}_percentile'] = np.round(percentiles, 1)
    # Rows that could not be ranked (non-finite value or too-small position group)
    # fall back to the neutral midpoint.
    df[f'{metric}_percentile'] = df[f'{metric}_percentile'].fillna(50.0)

print(f"\ndataset shape after adding percentiles: {df.shape}")
print(f"columns: {list(df.columns)}")

df.head()

In [None]:
# Rebuild the list of "_percentile" columns currently present on `df`.
percentile_cols = [label for label in df.columns if label.endswith('_percentile')]
print(f"percentile columns ({len(percentile_cols)}): {percentile_cols}")

# Show how many nulls each percentile column holds.
print("\nmissing values in percentiles:")
nulls_per_column = df[percentile_cols].isnull().sum()
for col, missing in nulls_per_column.items():
    print(f"{col}: {missing}")

In [27]:
# Method 2: Convert to traditional IQ scale
def percentile_to_iq(percentile, min_percentile=0.1, max_percentile=99.9):
    """Convert a percentile (0-100) to the IQ scale (mean=100, std=15).

    The percentile is clamped to ``[min_percentile, max_percentile]`` before
    the inverse normal CDF is applied. Without the clamp a percentile of
    exactly 0 or 100 maps to -inf / +inf, and a single infinite metric score
    makes any average that includes it infinite as well.

    Parameters
    ----------
    percentile : float or array-like of float
        Percentile rank(s) on a 0-100 scale.
    min_percentile, max_percentile : float
        Clamp bounds on the same 0-100 scale. The defaults match percentiles
        that have been rounded to one decimal place.

    Returns
    -------
    float or numpy.ndarray
        IQ-scale score(s). NaN input yields NaN.
    """
    bounded = np.clip(percentile, min_percentile, max_percentile)
    # Percentile -> z-score via the standard normal inverse CDF, then rescale.
    z_score = stats.norm.ppf(bounded / 100)
    iq_score = 100 + 15 * z_score
    return iq_score

# Map every percentile column onto the IQ scale; the composite is their row mean.
iq_cols = [name.replace('_percentile', '_iq') for name in percentile_cols]
for source_col, target_col in zip(percentile_cols, iq_cols):
    df[target_col] = df[source_col].apply(percentile_to_iq)

composite_iq = df[iq_cols].mean(axis=1)
df['composite_iq_scale'] = composite_iq
# Rank 1 is the highest composite; ties share the lowest rank number.
df['rank_iq_scale'] = composite_iq.rank(ascending=False, method='min').astype(int)

print("=== TOP 10 - TRADITIONAL IQ SCALE METHOD ===")
iq_summary_cols = ['PLAYER_NAME', 'POSITION', 'TEAM_ID', 'composite_iq_scale', 'rank_iq_scale']
top_10_iq = df.nlargest(10, 'composite_iq_scale')[iq_summary_cols]
for _, row in top_10_iq.iterrows():
    rank, name = row['rank_iq_scale'], row['PLAYER_NAME']
    position, team = row['POSITION'], row['TEAM_ID']
    score = row['composite_iq_scale']
    print(f"{rank:2d}. {name:<25} ({position:<3}, {team}) - {score:.1f} IQ")

=== TOP 10 - TRADITIONAL IQ SCALE METHOD ===
 1. Nikola Jokić              (Unknown, 1610612743) - 114.9 IQ
 2. LeBron James              (Unknown, 1610612747) - 114.4 IQ
 3. Tyrese Haliburton         (Unknown, 1610612754) - 114.2 IQ
 4. Jimmy Butler III          (Unknown, 1610612744) - 113.5 IQ
 5. Chris Paul                (Unknown, 1610612759) - 113.2 IQ
 6. Tyus Jones                (Unknown, 1610612756) - 112.0 IQ
 7. Matisse Thybulle          (Unknown, 1610612757) - 111.5 IQ
 8. Stephen Curry             (Unknown, 1610612744) - 111.3 IQ
 9. Giannis Antetokounmpo     (Unknown, 1610612749) - 111.1 IQ
10. Brandon Williams          (Unknown, 1610612742) - 110.0 IQ


In [None]:
# Composite score: unweighted row mean of all metric percentiles.
simple_avg = df[percentile_cols].mean(axis=1)
df['composite_simple_avg'] = simple_avg
# Rank 1 is the highest composite; ties share the lowest rank number.
df['rank_simple_avg'] = simple_avg.rank(ascending=False, method='min').astype(int)

print("top 10 - simple average method")
simple_summary_cols = ['PLAYER_NAME', 'POSITION', 'TEAM_ID', 'composite_simple_avg', 'rank_simple_avg']
top_10_simple = df.nlargest(10, 'composite_simple_avg')[simple_summary_cols]
for _, row in top_10_simple.iterrows():
    rank, name = row['rank_simple_avg'], row['PLAYER_NAME']
    position, team = row['POSITION'], row['TEAM_ID']
    score = row['composite_simple_avg']
    print(f"{rank:2d}. {name:<25} ({position:<3}, {team}) - {score:.1f}%")