In [15]:
import pandas as pd
from nba_api.stats.endpoints import LeagueLeaders

# Define the target season
SEASON = '2023-24'
# Define the statistic to sort by. 'PTS' is a good default to get all qualified players.
STAT_CATEGORY = 'PTS'

# 1. Instantiate the LeagueLeaders endpoint
# This object makes the API call to stats.nba.com
leaders = LeagueLeaders(
    # The Season parameter is a string in the format 'YYYY-YY'
    # It specifies which NBA season data to retrieve.
    season=SEASON,
    # The StatCategory parameter determines which statistic the results are ranked by.
    # Crucially, it causes the endpoint to return a comprehensive set of player
    # season-long averages (PPG, RPG, APG, etc.) in the resulting table.
    stat_category_abbreviation=STAT_CATEGORY,
    # The Scope parameter is set to 'S' for 'Season' (season totals/averages).
    # Other common scopes include 'RS' for 'Regular Season' or 'P' for 'Playoffs', but 'S' is generally
    # the correct value for season-long stats which are available for all players.
    scope='S'
)

# 2. Extract the data into a pandas DataFrame
# The LeagueLeaders endpoint returns its data in a ResultSets object,
# which has a method called 'get_data_frames()'.
# This method returns a list of pandas DataFrames. The first element ([0])
# contains the primary table of player statistics.
all_player_stats_df = leaders.get_data_frames()[0]

# 3. Print the first few rows and column information
print(f"--- First 5 rows of Player Stats for {SEASON} ---")
print(all_player_stats_df.columns)
print(all_player_stats_df[['PLAYER', 'GP', 'PTS', 'REB', 'AST', 'STL', 'BLK', 'MIN']])
print("\n--- All columns available in the DataFrame ---")
print(all_player_stats_df.columns.tolist())

--- First 5 rows of Player Stats for 2023-24 ---
Index(['PLAYER_ID', 'RANK', 'PLAYER', 'TEAM_ID', 'TEAM', 'GP', 'MIN', 'FGM',
       'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT',
       'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'EFF',
       'AST_TOV', 'STL_TOV'],
      dtype='object')
                      PLAYER  GP   PTS  REB  AST  STL  BLK   MIN
0                Luka Dončić  70  2370  647  686   99   38  2624
1    Shai Gilgeous-Alexander  75  2254  415  465  150   67  2553
2      Giannis Antetokounmpo  73  2222  841  476   87   79  2567
3              Jalen Brunson  77  2212  278  519   70   13  2726
4               Nikola Jokić  79  2085  976  708  108   68  2737
..                       ...  ..   ...  ...  ...  ...  ...   ...
567           Justin Jackson   2     0    0    0    0    0     1
568             Kaiser Gates   1     0    1    0    0    0     7
569          Malcolm Cazalon   1     0    0    0    0    0     3
570           Ron H

In [16]:
all_player_stats_df = all_player_stats_df[['PLAYER', 'GP', 'PTS', 'REB', 'AST', 'STL', 'BLK', 'MIN']]
all_player_stats_df

Unnamed: 0,PLAYER,GP,PTS,REB,AST,STL,BLK,MIN
0,Luka Dončić,70,2370,647,686,99,38,2624
1,Shai Gilgeous-Alexander,75,2254,415,465,150,67,2553
2,Giannis Antetokounmpo,73,2222,841,476,87,79,2567
3,Jalen Brunson,77,2212,278,519,70,13,2726
4,Nikola Jokić,79,2085,976,708,108,68,2737
...,...,...,...,...,...,...,...,...
567,Justin Jackson,2,0,0,0,0,0,1
568,Kaiser Gates,1,0,1,0,0,0,7
569,Malcolm Cazalon,1,0,0,0,0,0,3
570,Ron Harper Jr.,1,0,0,1,0,0,4


In [17]:
all_player_stats_df["PTS_pg"] = all_player_stats_df["PTS"] / all_player_stats_df["GP"]
all_player_stats_df["REB_pg"] = all_player_stats_df["REB"] / all_player_stats_df["GP"]
all_player_stats_df["AST_pg"] = all_player_stats_df["AST"] / all_player_stats_df["GP"]
all_player_stats_df["STL_pg"] = all_player_stats_df["STL"] / all_player_stats_df["GP"]
all_player_stats_df["BLK_pg"] = all_player_stats_df["BLK"] / all_player_stats_df["GP"]
all_player_stats_df

Unnamed: 0,PLAYER,GP,PTS,REB,AST,STL,BLK,MIN,PTS_pg,REB_pg,AST_pg,STL_pg,BLK_pg
0,Luka Dončić,70,2370,647,686,99,38,2624,33.857143,9.242857,9.800000,1.414286,0.542857
1,Shai Gilgeous-Alexander,75,2254,415,465,150,67,2553,30.053333,5.533333,6.200000,2.000000,0.893333
2,Giannis Antetokounmpo,73,2222,841,476,87,79,2567,30.438356,11.520548,6.520548,1.191781,1.082192
3,Jalen Brunson,77,2212,278,519,70,13,2726,28.727273,3.610390,6.740260,0.909091,0.168831
4,Nikola Jokić,79,2085,976,708,108,68,2737,26.392405,12.354430,8.962025,1.367089,0.860759
...,...,...,...,...,...,...,...,...,...,...,...,...,...
567,Justin Jackson,2,0,0,0,0,0,1,0.000000,0.000000,0.000000,0.000000,0.000000
568,Kaiser Gates,1,0,1,0,0,0,7,0.000000,1.000000,0.000000,0.000000,0.000000
569,Malcolm Cazalon,1,0,0,0,0,0,3,0.000000,0.000000,0.000000,0.000000,0.000000
570,Ron Harper Jr.,1,0,0,1,0,0,4,0.000000,0.000000,1.000000,0.000000,0.000000


In [18]:
new_stats_df = all_player_stats_df[["PLAYER", "PTS_pg", "REB_pg", "AST_pg", "STL_pg", "BLK_pg"]]
new_stats_df

Unnamed: 0,PLAYER,PTS_pg,REB_pg,AST_pg,STL_pg,BLK_pg
0,Luka Dončić,33.857143,9.242857,9.800000,1.414286,0.542857
1,Shai Gilgeous-Alexander,30.053333,5.533333,6.200000,2.000000,0.893333
2,Giannis Antetokounmpo,30.438356,11.520548,6.520548,1.191781,1.082192
3,Jalen Brunson,28.727273,3.610390,6.740260,0.909091,0.168831
4,Nikola Jokić,26.392405,12.354430,8.962025,1.367089,0.860759
...,...,...,...,...,...,...
567,Justin Jackson,0.000000,0.000000,0.000000,0.000000,0.000000
568,Kaiser Gates,0.000000,1.000000,0.000000,0.000000,0.000000
569,Malcolm Cazalon,0.000000,0.000000,0.000000,0.000000,0.000000
570,Ron Harper Jr.,0.000000,0.000000,1.000000,0.000000,0.000000


In [19]:
# Define the target season
SEASON = '2022-23'
# Define the statistic to sort by. 'PTS' is a good default to get all qualified players.
STAT_CATEGORY = 'PTS'

# 1. Instantiate the LeagueLeaders endpoint
# This object makes the API call to stats.nba.com
leaders = LeagueLeaders(
    # The Season parameter is a string in the format 'YYYY-YY'
    # It specifies which NBA season data to retrieve.
    season=SEASON,
    # The StatCategory parameter determines which statistic the results are ranked by.
    # Crucially, it causes the endpoint to return a comprehensive set of player
    # season-long averages (PPG, RPG, APG, etc.) in the resulting table.
    stat_category_abbreviation=STAT_CATEGORY,
    # The Scope parameter is set to 'S' for 'Season' (season totals/averages).
    # Other common scopes include 'RS' for 'Regular Season' or 'P' for 'Playoffs', but 'S' is generally
    # the correct value for season-long stats which are available for all players.
    scope='S'
)

# 2. Extract the data into a pandas DataFrame
# The LeagueLeaders endpoint returns its data in a ResultSets object,
# which has a method called 'get_data_frames()'.
# This method returns a list of pandas DataFrames. The first element ([0])
# contains the primary table of player statistics.
all_player_stats_df2 = leaders.get_data_frames()[0]

# 3. Print the first few rows and column information
print(f"--- First 5 rows of Player Stats for {SEASON} ---")
print(all_player_stats_df2.columns)
print(all_player_stats_df2[['PLAYER', 'GP', 'PTS', 'REB', 'AST', 'STL', 'BLK', 'MIN']])
print("\n--- All columns available in the DataFrame ---")
print(all_player_stats_df2.columns.tolist())

--- First 5 rows of Player Stats for 2022-23 ---
Index(['PLAYER_ID', 'RANK', 'PLAYER', 'TEAM_ID', 'TEAM', 'GP', 'MIN', 'FGM',
       'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT',
       'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'EFF',
       'AST_TOV', 'STL_TOV'],
      dtype='object')
                      PLAYER  GP   PTS  REB  AST  STL  BLK   MIN
0               Jayson Tatum  74  2225  649  342   78   51  2732
1                Joel Embiid  66  2183  670  274   66  112  2284
2                Luka Dončić  66  2138  569  529   90   33  2391
3    Shai Gilgeous-Alexander  68  2135  329  371  112   65  2416
4      Giannis Antetokounmpo  63  1959  742  359   52   51  2024
..                       ...  ..   ...  ...  ...  ...  ...   ...
534         Alondes Williams   1     0    1    0    0    0     5
535            Deonte Burton   2     0    0    0    0    0     7
536            Frank Jackson   1     0    2    1    0    0     5
537       Michael F

In [20]:
all_player_stats_df2 = all_player_stats_df2[['PLAYER', 'GP', 'PTS', 'REB', 'AST', 'STL', 'BLK', 'MIN']]
all_player_stats_df2

Unnamed: 0,PLAYER,GP,PTS,REB,AST,STL,BLK,MIN
0,Jayson Tatum,74,2225,649,342,78,51,2732
1,Joel Embiid,66,2183,670,274,66,112,2284
2,Luka Dončić,66,2138,569,529,90,33,2391
3,Shai Gilgeous-Alexander,68,2135,329,371,112,65,2416
4,Giannis Antetokounmpo,63,1959,742,359,52,51,2024
...,...,...,...,...,...,...,...,...
534,Alondes Williams,1,0,1,0,0,0,5
535,Deonte Burton,2,0,0,0,0,0,7
536,Frank Jackson,1,0,2,1,0,0,5
537,Michael Foster Jr.,1,0,0,0,0,0,1


In [21]:
all_player_stats_df2["PTS_pg"] = all_player_stats_df2["PTS"] / all_player_stats_df2["GP"]
all_player_stats_df2["REB_pg"] = all_player_stats_df2["REB"] / all_player_stats_df2["GP"]
all_player_stats_df2["AST_pg"] = all_player_stats_df2["AST"] / all_player_stats_df2["GP"]
all_player_stats_df2["STL_pg"] = all_player_stats_df2["STL"] / all_player_stats_df2["GP"]
all_player_stats_df2["BLK_pg"] = all_player_stats_df2["BLK"] / all_player_stats_df2["GP"]
all_player_stats_df2

Unnamed: 0,PLAYER,GP,PTS,REB,AST,STL,BLK,MIN,PTS_pg,REB_pg,AST_pg,STL_pg,BLK_pg
0,Jayson Tatum,74,2225,649,342,78,51,2732,30.067568,8.770270,4.621622,1.054054,0.689189
1,Joel Embiid,66,2183,670,274,66,112,2284,33.075758,10.151515,4.151515,1.000000,1.696970
2,Luka Dončić,66,2138,569,529,90,33,2391,32.393939,8.621212,8.015152,1.363636,0.500000
3,Shai Gilgeous-Alexander,68,2135,329,371,112,65,2416,31.397059,4.838235,5.455882,1.647059,0.955882
4,Giannis Antetokounmpo,63,1959,742,359,52,51,2024,31.095238,11.777778,5.698413,0.825397,0.809524
...,...,...,...,...,...,...,...,...,...,...,...,...,...
534,Alondes Williams,1,0,1,0,0,0,5,0.000000,1.000000,0.000000,0.000000,0.000000
535,Deonte Burton,2,0,0,0,0,0,7,0.000000,0.000000,0.000000,0.000000,0.000000
536,Frank Jackson,1,0,2,1,0,0,5,0.000000,2.000000,1.000000,0.000000,0.000000
537,Michael Foster Jr.,1,0,0,0,0,0,1,0.000000,0.000000,0.000000,0.000000,0.000000


In [22]:
old_stats_df = all_player_stats_df2[["PLAYER", "PTS_pg", "REB_pg", "AST_pg", "STL_pg", "BLK_pg"]]
old_stats_df

Unnamed: 0,PLAYER,PTS_pg,REB_pg,AST_pg,STL_pg,BLK_pg
0,Jayson Tatum,30.067568,8.770270,4.621622,1.054054,0.689189
1,Joel Embiid,33.075758,10.151515,4.151515,1.000000,1.696970
2,Luka Dončić,32.393939,8.621212,8.015152,1.363636,0.500000
3,Shai Gilgeous-Alexander,31.397059,4.838235,5.455882,1.647059,0.955882
4,Giannis Antetokounmpo,31.095238,11.777778,5.698413,0.825397,0.809524
...,...,...,...,...,...,...
534,Alondes Williams,0.000000,1.000000,0.000000,0.000000,0.000000
535,Deonte Burton,0.000000,0.000000,0.000000,0.000000,0.000000
536,Frank Jackson,0.000000,2.000000,1.000000,0.000000,0.000000
537,Michael Foster Jr.,0.000000,0.000000,0.000000,0.000000,0.000000


In [23]:
df = pd.merge(old_stats_df, new_stats_df, on="PLAYER")
df.to_markdown()

"|     | PLAYER                   |   PTS_pg_x |   REB_pg_x |   AST_pg_x |   STL_pg_x |   BLK_pg_x |   PTS_pg_y |   REB_pg_y |   AST_pg_y |   STL_pg_y |   BLK_pg_y |\n|----:|:-------------------------|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|\n|   0 | Jayson Tatum             |  30.0676   |   8.77027  |   4.62162  |  1.05405   |  0.689189  |  26.8514   |   8.12162  |  4.91892   |  1.01351   |  0.581081  |\n|   1 | Joel Embiid              |  33.0758   |  10.1515   |   4.15152  |  1         |  1.69697   |  34.6923   |  11.0256   |  5.61538   |  1.17949   |  1.69231   |\n|   2 | Luka Dončić              |  32.3939   |   8.62121  |   8.01515  |  1.36364   |  0.5       |  33.8571   |   9.24286  |  9.8       |  1.41429   |  0.542857  |\n|   3 | Shai Gilgeous-Alexander  |  31.3971   |   4.83824  |   5.45588  |  1.64706   |  0.955882  |  30.0533   |   5.53333  |  6.2       |  2         |  0.893333  |\n|  

In [25]:
stat_categories = ['PTS', 'REB', 'AST', 'STL', 'BLK']

# Re-calculate Differences (Step 1)
for stat in stat_categories:
    df[f'{stat}_DIFF'] = df[f'{stat}_pg_y'] - df[f'{stat}_pg_x']

# Calculate Standard Deviation Thresholds (Step 2)
# Threshold: Mean + 1 * Standard Deviation
std_thresholds = {
    stat: df[f'{stat}_DIFF'].mean() + df[f'{stat}_DIFF'].std()
    for stat in stat_categories
}

# --- [NEW STEP: Top 20% Percentile Calculation] ---

# 4. Calculate the 80th Percentile (Top 20%) Threshold for the Raw Stat (_y)
PERCENTILE_THRESHOLD = 0.80 # The 80th percentile is the cutoff for the top 20%
raw_stat_thresholds = {}

print("## 💯 Step 4: Calculate 80th Percentile Thresholds for Raw Stats")

for stat in stat_categories:
    raw_col = f'{stat}_pg_y'
    
    # Use the pandas .quantile() method to find the value at the 80th percentile.
    # Players must be GREATER THAN or EQUAL to this value to be in the top 20%.
    percentile_value = df[raw_col].quantile(PERCENTILE_THRESHOLD)
    raw_stat_thresholds[stat] = percentile_value
    
    print(f"**{stat}_pg_y** (Raw Stat): 80th Percentile Cutoff = {percentile_value:.4f}")
print("-" * 50)


# --- [Final Combined Filter] ---

print("## ✨ Step 5: Apply Two-Layer Filter (StdDev Change AND Top 20% Raw Stat)")

# Start with a boolean mask where all values are False
overall_two_layer_mask = pd.Series([False] * len(df), index=df.index)

# Check each stat category
for stat in stat_categories:
    diff_col = f'{stat}_DIFF'
    raw_col = f'{stat}_pg_y'
    
    # Condition A: Player's change is greater than 1 Std Dev above the mean
    std_dev_mask = df[diff_col] > std_thresholds[stat]
    
    # Condition B: Player's raw stat is in the top 20%
    percentile_mask = df[raw_col] >= raw_stat_thresholds[stat]
    
    # Combined Mask for THIS stat: (Significant Change AND High Raw Performance)
    # The bitwise AND operator (&) is used.
    combined_stat_mask = std_dev_mask & percentile_mask
    
    # Update the overall mask with the bitwise OR operator (|)
    # Player is included if they meet the combined criteria for PTS OR REB OR AST...
    overall_two_layer_mask = overall_two_layer_mask | combined_stat_mask

# Apply the overall two-layer mask to the original DataFrame
filtered_high_performance_gain_df = df[overall_two_layer_mask].copy()

# 6. Display the final filtered DataFrame
print("\n--- Final Filtered Players (Significant Change AND Top 20% Raw Stat) ---")
display_cols = ['PLAYER'] + [f'{stat}_pg_y' for stat in stat_categories] + [f'{stat}_DIFF' for stat in stat_categories]
if filtered_high_performance_gain_df.empty:
    print("No players met both the 1 Std Dev change AND Top 20% raw performance criteria in the same category.")
else:
    print(filtered_high_performance_gain_df[display_cols].sort_values(by='PTS_DIFF', ascending=False).reset_index(drop=True))

## 💯 Step 4: Calculate 80th Percentile Thresholds for Raw Stats
**PTS_pg_y** (Raw Stat): 80th Percentile Cutoff = 15.1757
**REB_pg_y** (Raw Stat): 80th Percentile Cutoff = 5.2484
**AST_pg_y** (Raw Stat): 80th Percentile Cutoff = 3.6293
**STL_pg_y** (Raw Stat): 80th Percentile Cutoff = 0.9670
**BLK_pg_y** (Raw Stat): 80th Percentile Cutoff = 0.6545
--------------------------------------------------
## ✨ Step 5: Apply Two-Layer Filter (StdDev Change AND Top 20% Raw Stat)

--- Final Filtered Players (Significant Change AND Top 20% Raw Stat) ---
                PLAYER   PTS_pg_y  REB_pg_y  AST_pg_y  STL_pg_y  BLK_pg_y  \
0           Cam Thomas  22.454545  3.242424  2.909091  0.681818  0.242424   
1    Scotty Pippen Jr.  12.904762  3.190476  4.666667  1.714286  0.476190   
2        Jalen Johnson  16.000000  8.714286  3.625000  1.196429  0.839286   
3           Coby White  19.101266  4.531646  5.126582  0.670886  0.227848   
4   Vince Williams Jr.  10.019231  5.576923  3.365385  0.903846  0.