In [20]:
import pandas as pd
from nba_api.stats.endpoints import LeagueLeaders

# Define the target season
SEASON = '2023-24'
# Define the statistic to sort by. 'PTS' is a good default to get all qualified players.
STAT_CATEGORY = 'PTS'

# 1. Instantiate the LeagueLeaders endpoint
# This object makes the API call to stats.nba.com
leaders = LeagueLeaders(
    # The Season parameter is a string in the format 'YYYY-YY'
    # It specifies which NBA season data to retrieve.
    season=SEASON,
    # The StatCategory parameter determines which statistic the results are ranked by.
    # Crucially, it causes the endpoint to return a comprehensive set of player
    # season-long averages (PPG, RPG, APG, etc.) in the resulting table.
    stat_category_abbreviation=STAT_CATEGORY,
    # The Scope parameter is set to 'S' for 'Season' (season totals/averages).
    # Other common scopes include 'RS' for 'Regular Season' or 'P' for 'Playoffs', but 'S' is generally
    # the correct value for season-long stats which are available for all players.
    scope='S'
)

# 2. Extract the data into a pandas DataFrame
# The LeagueLeaders endpoint returns its data in a ResultSets object,
# which has a method called 'get_data_frames()'.
# This method returns a list of pandas DataFrames. The first element ([0])
# contains the primary table of player statistics.
all_player_stats_df = leaders.get_data_frames()[0]

# 3. Print the first few rows and column information
print(f"--- First 5 rows of Player Stats for {SEASON} ---")
print(all_player_stats_df.columns)
print(all_player_stats_df[['PLAYER', 'GP', 'PTS', 'REB', 'AST', 'STL', 'BLK', 'MIN']])
print("\n--- All columns available in the DataFrame ---")
print(all_player_stats_df.columns.tolist())

--- First 5 rows of Player Stats for 2023-24 ---
Index(['PLAYER_ID', 'RANK', 'PLAYER', 'TEAM_ID', 'TEAM', 'GP', 'MIN', 'FGM',
       'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT',
       'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'EFF',
       'AST_TOV', 'STL_TOV'],
      dtype='object')
                      PLAYER  GP   PTS  REB  AST  STL  BLK   MIN
0                Luka Dončić  70  2370  647  686   99   38  2624
1    Shai Gilgeous-Alexander  75  2254  415  465  150   67  2553
2      Giannis Antetokounmpo  73  2222  841  476   87   79  2567
3              Jalen Brunson  77  2212  278  519   70   13  2726
4               Nikola Jokić  79  2085  976  708  108   68  2737
..                       ...  ..   ...  ...  ...  ...  ...   ...
567           Justin Jackson   2     0    0    0    0    0     1
568             Kaiser Gates   1     0    1    0    0    0     7
569          Malcolm Cazalon   1     0    0    0    0    0     3
570           Ron H

In [21]:
all_player_stats_df = all_player_stats_df[['PLAYER', 'GP', 'PTS', 'REB', 'AST', 'STL', 'BLK', 'MIN']]
all_player_stats_df

Unnamed: 0,PLAYER,GP,PTS,REB,AST,STL,BLK,MIN
0,Luka Dončić,70,2370,647,686,99,38,2624
1,Shai Gilgeous-Alexander,75,2254,415,465,150,67,2553
2,Giannis Antetokounmpo,73,2222,841,476,87,79,2567
3,Jalen Brunson,77,2212,278,519,70,13,2726
4,Nikola Jokić,79,2085,976,708,108,68,2737
...,...,...,...,...,...,...,...,...
567,Justin Jackson,2,0,0,0,0,0,1
568,Kaiser Gates,1,0,1,0,0,0,7
569,Malcolm Cazalon,1,0,0,0,0,0,3
570,Ron Harper Jr.,1,0,0,1,0,0,4


In [22]:
all_player_stats_df["PTS_pm"] = all_player_stats_df["PTS"] / all_player_stats_df["MIN"]
all_player_stats_df["REB_pm"] = all_player_stats_df["REB"] / all_player_stats_df["MIN"]
all_player_stats_df["AST_pm"] = all_player_stats_df["AST"] / all_player_stats_df["MIN"]
all_player_stats_df["STL_pm"] = all_player_stats_df["STL"] / all_player_stats_df["MIN"]
all_player_stats_df["BLK_pm"] = all_player_stats_df["BLK"] / all_player_stats_df["MIN"]
all_player_stats_df

Unnamed: 0,PLAYER,GP,PTS,REB,AST,STL,BLK,MIN,PTS_pm,REB_pm,AST_pm,STL_pm,BLK_pm
0,Luka Dončić,70,2370,647,686,99,38,2624,0.903201,0.246570,0.261433,0.037729,0.014482
1,Shai Gilgeous-Alexander,75,2254,415,465,150,67,2553,0.882883,0.162554,0.182139,0.058754,0.026244
2,Giannis Antetokounmpo,73,2222,841,476,87,79,2567,0.865602,0.327620,0.185430,0.033892,0.030775
3,Jalen Brunson,77,2212,278,519,70,13,2726,0.811445,0.101981,0.190389,0.025679,0.004769
4,Nikola Jokić,79,2085,976,708,108,68,2737,0.761783,0.356595,0.258677,0.039459,0.024845
...,...,...,...,...,...,...,...,...,...,...,...,...,...
567,Justin Jackson,2,0,0,0,0,0,1,0.000000,0.000000,0.000000,0.000000,0.000000
568,Kaiser Gates,1,0,1,0,0,0,7,0.000000,0.142857,0.000000,0.000000,0.000000
569,Malcolm Cazalon,1,0,0,0,0,0,3,0.000000,0.000000,0.000000,0.000000,0.000000
570,Ron Harper Jr.,1,0,0,1,0,0,4,0.000000,0.000000,0.250000,0.000000,0.000000


In [23]:
new_stats_df = all_player_stats_df[["PLAYER", "PTS_pm", "REB_pm", "AST_pm", "STL_pm", "BLK_pm"]]
new_stats_df

Unnamed: 0,PLAYER,PTS_pm,REB_pm,AST_pm,STL_pm,BLK_pm
0,Luka Dončić,0.903201,0.246570,0.261433,0.037729,0.014482
1,Shai Gilgeous-Alexander,0.882883,0.162554,0.182139,0.058754,0.026244
2,Giannis Antetokounmpo,0.865602,0.327620,0.185430,0.033892,0.030775
3,Jalen Brunson,0.811445,0.101981,0.190389,0.025679,0.004769
4,Nikola Jokić,0.761783,0.356595,0.258677,0.039459,0.024845
...,...,...,...,...,...,...
567,Justin Jackson,0.000000,0.000000,0.000000,0.000000,0.000000
568,Kaiser Gates,0.000000,0.142857,0.000000,0.000000,0.000000
569,Malcolm Cazalon,0.000000,0.000000,0.000000,0.000000,0.000000
570,Ron Harper Jr.,0.000000,0.000000,0.250000,0.000000,0.000000


In [24]:
# Define the target season
SEASON = '2022-23'
# Define the statistic to sort by. 'PTS' is a good default to get all qualified players.
STAT_CATEGORY = 'PTS'

# 1. Instantiate the LeagueLeaders endpoint
# This object makes the API call to stats.nba.com
leaders = LeagueLeaders(
    # The Season parameter is a string in the format 'YYYY-YY'
    # It specifies which NBA season data to retrieve.
    season=SEASON,
    # The StatCategory parameter determines which statistic the results are ranked by.
    # Crucially, it causes the endpoint to return a comprehensive set of player
    # season-long averages (PPG, RPG, APG, etc.) in the resulting table.
    stat_category_abbreviation=STAT_CATEGORY,
    # The Scope parameter is set to 'S' for 'Season' (season totals/averages).
    # Other common scopes include 'RS' for 'Regular Season' or 'P' for 'Playoffs', but 'S' is generally
    # the correct value for season-long stats which are available for all players.
    scope='S'
)

# 2. Extract the data into a pandas DataFrame
# The LeagueLeaders endpoint returns its data in a ResultSets object,
# which has a method called 'get_data_frames()'.
# This method returns a list of pandas DataFrames. The first element ([0])
# contains the primary table of player statistics.
all_player_stats_df2 = leaders.get_data_frames()[0]

# 3. Print the first few rows and column information
print(f"--- First 5 rows of Player Stats for {SEASON} ---")
print(all_player_stats_df2.columns)
print(all_player_stats_df2[['PLAYER', 'GP', 'PTS', 'REB', 'AST', 'STL', 'BLK', 'MIN']])
print("\n--- All columns available in the DataFrame ---")
print(all_player_stats_df2.columns.tolist())

--- First 5 rows of Player Stats for 2022-23 ---
Index(['PLAYER_ID', 'RANK', 'PLAYER', 'TEAM_ID', 'TEAM', 'GP', 'MIN', 'FGM',
       'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT',
       'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'EFF',
       'AST_TOV', 'STL_TOV'],
      dtype='object')
                      PLAYER  GP   PTS  REB  AST  STL  BLK   MIN
0               Jayson Tatum  74  2225  649  342   78   51  2732
1                Joel Embiid  66  2183  670  274   66  112  2284
2                Luka Dončić  66  2138  569  529   90   33  2391
3    Shai Gilgeous-Alexander  68  2135  329  371  112   65  2416
4      Giannis Antetokounmpo  63  1959  742  359   52   51  2024
..                       ...  ..   ...  ...  ...  ...  ...   ...
534         Alondes Williams   1     0    1    0    0    0     5
535            Deonte Burton   2     0    0    0    0    0     7
536            Frank Jackson   1     0    2    1    0    0     5
537       Michael F

In [25]:
all_player_stats_df2 = all_player_stats_df2[['PLAYER', 'GP', 'PTS', 'REB', 'AST', 'STL', 'BLK', 'MIN']]
all_player_stats_df2

Unnamed: 0,PLAYER,GP,PTS,REB,AST,STL,BLK,MIN
0,Jayson Tatum,74,2225,649,342,78,51,2732
1,Joel Embiid,66,2183,670,274,66,112,2284
2,Luka Dončić,66,2138,569,529,90,33,2391
3,Shai Gilgeous-Alexander,68,2135,329,371,112,65,2416
4,Giannis Antetokounmpo,63,1959,742,359,52,51,2024
...,...,...,...,...,...,...,...,...
534,Alondes Williams,1,0,1,0,0,0,5
535,Deonte Burton,2,0,0,0,0,0,7
536,Frank Jackson,1,0,2,1,0,0,5
537,Michael Foster Jr.,1,0,0,0,0,0,1


In [26]:
all_player_stats_df2["PTS_pm"] = all_player_stats_df2["PTS"] / all_player_stats_df2["MIN"]
all_player_stats_df2["REB_pm"] = all_player_stats_df2["REB"] / all_player_stats_df2["MIN"]
all_player_stats_df2["AST_pm"] = all_player_stats_df2["AST"] / all_player_stats_df2["MIN"]
all_player_stats_df2["STL_pm"] = all_player_stats_df2["STL"] / all_player_stats_df2["MIN"]
all_player_stats_df2["BLK_pm"] = all_player_stats_df2["BLK"] / all_player_stats_df2["MIN"]
all_player_stats_df2

Unnamed: 0,PLAYER,GP,PTS,REB,AST,STL,BLK,MIN,PTS_pm,REB_pm,AST_pm,STL_pm,BLK_pm
0,Jayson Tatum,74,2225,649,342,78,51,2732,0.814422,0.237555,0.125183,0.028551,0.018668
1,Joel Embiid,66,2183,670,274,66,112,2284,0.955779,0.293345,0.119965,0.028897,0.049037
2,Luka Dončić,66,2138,569,529,90,33,2391,0.894187,0.237976,0.221246,0.037641,0.013802
3,Shai Gilgeous-Alexander,68,2135,329,371,112,65,2416,0.883692,0.136175,0.153560,0.046358,0.026904
4,Giannis Antetokounmpo,63,1959,742,359,52,51,2024,0.967885,0.366601,0.177372,0.025692,0.025198
...,...,...,...,...,...,...,...,...,...,...,...,...,...
534,Alondes Williams,1,0,1,0,0,0,5,0.000000,0.200000,0.000000,0.000000,0.000000
535,Deonte Burton,2,0,0,0,0,0,7,0.000000,0.000000,0.000000,0.000000,0.000000
536,Frank Jackson,1,0,2,1,0,0,5,0.000000,0.400000,0.200000,0.000000,0.000000
537,Michael Foster Jr.,1,0,0,0,0,0,1,0.000000,0.000000,0.000000,0.000000,0.000000


In [27]:
old_stats_df = all_player_stats_df2[["PLAYER", "PTS_pm", "REB_pm", "AST_pm", "STL_pm", "BLK_pm"]]
old_stats_df

Unnamed: 0,PLAYER,PTS_pm,REB_pm,AST_pm,STL_pm,BLK_pm
0,Jayson Tatum,0.814422,0.237555,0.125183,0.028551,0.018668
1,Joel Embiid,0.955779,0.293345,0.119965,0.028897,0.049037
2,Luka Dončić,0.894187,0.237976,0.221246,0.037641,0.013802
3,Shai Gilgeous-Alexander,0.883692,0.136175,0.153560,0.046358,0.026904
4,Giannis Antetokounmpo,0.967885,0.366601,0.177372,0.025692,0.025198
...,...,...,...,...,...,...
534,Alondes Williams,0.000000,0.200000,0.000000,0.000000,0.000000
535,Deonte Burton,0.000000,0.000000,0.000000,0.000000,0.000000
536,Frank Jackson,0.000000,0.400000,0.200000,0.000000,0.000000
537,Michael Foster Jr.,0.000000,0.000000,0.000000,0.000000,0.000000


In [33]:
df = pd.merge(old_stats_df, new_stats_df, on="PLAYER")
df.to_markdown()

"|     | PLAYER                   |   PTS_pm_x |   REB_pm_x |   AST_pm_x |   STL_pm_x |   BLK_pm_x |   PTS_pm_y |   REB_pm_y |   AST_pm_y |   STL_pm_y |   BLK_pm_y |\n|----:|:-------------------------|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|\n|   0 | Jayson Tatum             |  0.814422  |  0.237555  |  0.125183  | 0.0285505  | 0.0186676  |   0.751229 |  0.227221  |  0.137618  | 0.0283554  | 0.0162571  |\n|   1 | Joel Embiid              |  0.955779  |  0.293345  |  0.119965  | 0.0288967  | 0.0490368  |   1.03361  |  0.328495  |  0.167303  | 0.0351413  | 0.0504202  |\n|   2 | Luka Dončić              |  0.894187  |  0.237976  |  0.221246  | 0.0376412  | 0.0138018  |   0.903201 |  0.24657   |  0.261433  | 0.0377287  | 0.0144817  |\n|   3 | Shai Gilgeous-Alexander  |  0.883692  |  0.136175  |  0.15356   | 0.0463576  | 0.026904   |   0.882883 |  0.162554  |  0.182139  | 0.0587544  | 0.0262436  |\n|  

In [35]:
stat_categories = ['PTS', 'REB', 'AST', 'STL', 'BLK']

# Re-calculate Differences (Step 1)
for stat in stat_categories:
    df[f'{stat}_DIFF'] = df[f'{stat}_pm_y'] - df[f'{stat}_pm_x']

# Calculate Standard Deviation Thresholds (Step 2)
# Threshold: Mean + 1 * Standard Deviation
std_thresholds = {
    stat: df[f'{stat}_DIFF'].mean() + df[f'{stat}_DIFF'].std()
    for stat in stat_categories
}

# --- [NEW STEP: Top 20% Percentile Calculation] ---

# 4. Calculate the 80th Percentile (Top 20%) Threshold for the Raw Stat (_y)
PERCENTILE_THRESHOLD = 0.80 # The 80th percentile is the cutoff for the top 20%
raw_stat_thresholds = {}

print("## 💯 Step 4: Calculate 80th Percentile Thresholds for Raw Stats")

for stat in stat_categories:
    raw_col = f'{stat}_pm_y'
    
    # Use the pandas .quantile() method to find the value at the 80th percentile.
    # Players must be GREATER THAN or EQUAL to this value to be in the top 20%.
    percentile_value = df[raw_col].quantile(PERCENTILE_THRESHOLD)
    raw_stat_thresholds[stat] = percentile_value
    
    print(f"**{stat}_pm_y** (Raw Stat): 80th Percentile Cutoff = {percentile_value:.4f}")
print("-" * 50)


# --- [Final Combined Filter] ---

print("## ✨ Step 5: Apply Two-Layer Filter (StdDev Change AND Top 20% Raw Stat)")

# Start with a boolean mask where all values are False
overall_two_layer_mask = pd.Series([False] * len(df), index=df.index)

# Check each stat category
for stat in stat_categories:
    diff_col = f'{stat}_DIFF'
    raw_col = f'{stat}_pm_y'
    
    # Condition A: Player's change is greater than 1 Std Dev above the mean
    std_dev_mask = df[diff_col] > std_thresholds[stat]
    
    # Condition B: Player's raw stat is in the top 20%
    percentile_mask = df[raw_col] >= raw_stat_thresholds[stat]
    
    # Combined Mask for THIS stat: (Significant Change AND High Raw Performance)
    # The bitwise AND operator (&) is used.
    combined_stat_mask = std_dev_mask & percentile_mask
    
    # Update the overall mask with the bitwise OR operator (|)
    # Player is included if they meet the combined criteria for PTS OR REB OR AST...
    overall_two_layer_mask = overall_two_layer_mask | combined_stat_mask

# Apply the overall two-layer mask to the original DataFrame
filtered_high_performance_gain_df = df[overall_two_layer_mask].copy()

# 6. Display the final filtered DataFrame
print("\n--- Final Filtered Players (Significant Change AND Top 20% Raw Stat) ---")
display_cols = ['PLAYER'] + [f'{stat}_pm_y' for stat in stat_categories] + [f'{stat}_DIFF' for stat in stat_categories]
if filtered_high_performance_gain_df.empty:
    print("No players met both the 1 Std Dev change AND Top 20% raw performance criteria in the same category.")
else:
    print(filtered_high_performance_gain_df[display_cols].sort_values(by='PTS_DIFF', ascending=False).reset_index(drop=True))

## 💯 Step 4: Calculate 80th Percentile Thresholds for Raw Stats
**PTS_pm_y** (Raw Stat): 80th Percentile Cutoff = 0.5417
**REB_pm_y** (Raw Stat): 80th Percentile Cutoff = 0.2541
**AST_pm_y** (Raw Stat): 80th Percentile Cutoff = 0.1539
**STL_pm_y** (Raw Stat): 80th Percentile Cutoff = 0.0402
**BLK_pm_y** (Raw Stat): 80th Percentile Cutoff = 0.0327
--------------------------------------------------
## ✨ Step 5: Apply Two-Layer Filter (StdDev Change AND Top 20% Raw Stat)

--- Final Filtered Players (Significant Change AND Top 20% Raw Stat) ---
                PLAYER  PTS_pm_y  REB_pm_y  AST_pm_y  STL_pm_y  BLK_pm_y  \
0         Kevon Harris  0.666667  0.333333  0.166667  0.000000  0.000000   
1     Alondes Williams  0.312500  0.062500  0.000000  0.000000  0.062500   
2          Daishen Nix  0.540000  0.060000  0.120000  0.100000  0.000000   
3      James Bouknight  0.617284  0.111111  0.074074  0.012346  0.012346   
4         Ryan Rollins  0.564103  0.166667  0.179487  0.128205  0.038462 