# League Dash Player Stats

This endpoint returns the stats of every player for a whole season in a single request, which makes pulling multiple seasons more feasible.

In [31]:
import json
import numpy as np
import pickle
import pandas as pd
from nba_api.stats.endpoints import leaguedashplayerstats
from nba_api.stats.static import players

In [32]:
league_dash = leaguedashplayerstats.LeagueDashPlayerStats(
    measure_type_detailed_defense="Base",
    season_type_all_star="Regular Season",
    season="2022-23",
    # must set to "N"; otherwise weird negatives appear
    # also, p_m is included regardless of this setting.
    plus_minus="N",
    per_mode_detailed="Per36",
)

In [18]:
league_dash.get_request_url()

'https://stats.nba.com/stats/leaguedashplayerstats?College=&Conference=&Country=&DateFrom=&DateTo=&Division=&DraftPick=&DraftYear=&GameScope=&GameSegment=&Height=&LastNGames=0&LeagueID=&Location=&MeasureType=Base&Month=0&OpponentTeamID=0&Outcome=&PORound=&PaceAdjust=N&PerMode=Per36&Period=0&PlayerExperience=&PlayerPosition=&PlusMinus=N&Rank=N&Season=2001-02&SeasonSegment=&SeasonType=Regular+Season&ShotClockRange=&StarterBench=&TeamID=&TwoWay=&VsConference=&VsDivision=&Weight='

In [33]:
df_list = league_dash.get_data_frames()
df_list[0]

Unnamed: 0,PLAYER_ID,PLAYER_NAME,NICKNAME,TEAM_ID,TEAM_ABBREVIATION,AGE,GP,W,L,W_PCT,...,PF_RANK,PFD_RANK,PTS_RANK,PLUS_MINUS_RANK,NBA_FANTASY_PTS_RANK,DD2_RANK,TD3_RANK,WNBA_FANTASY_PTS_RANK,CFID,CFPARAMS
0,1630639,A.J. Lawson,A.J.,1610612750,MIN,22.0,1,1,0,1.000,...,2,440,3,472,6,157,15,4,5,16306391610612750
1,1631260,AJ Green,AJ,1610612749,MIL,23.0,5,5,0,1.000,...,33,440,455,465,447,157,15,450,5,16312601610612749
2,1631100,AJ Griffin,AJ,1610612737,ATL,19.0,17,7,10,0.412,...,345,397,146,304,282,157,15,238,5,16311001610612737
3,203932,Aaron Gordon,Aaron,1610612743,DEN,27.0,17,12,5,0.706,...,363,117,92,48,91,54,15,98,5,2039321610612743
4,1628988,Aaron Holiday,Aaron,1610612737,ATL,26.0,19,11,8,0.579,...,169,345,404,255,432,157,15,435,5,16289881610612737
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
469,1629139,Yuta Watanabe,Yuta,1610612751,BKN,28.0,14,5,9,0.357,...,370,406,182,299,277,157,15,234,5,16291391610612751
470,1628380,Zach Collins,Zach,1610612759,SAS,25.0,11,5,6,0.455,...,36,176,166,366,35,157,15,41,5,16283801610612759
471,203897,Zach LaVine,Zach,1610612741,CHI,27.0,16,8,8,0.500,...,300,208,62,281,146,157,15,104,5,2038971610612741
472,1630192,Zeke Nnaji,Zeke,1610612743,DEN,21.0,15,9,6,0.600,...,45,234,159,435,381,157,15,365,5,16301921610612743


In [34]:
df = df_list[0]
df.columns

Index(['PLAYER_ID', 'PLAYER_NAME', 'NICKNAME', 'TEAM_ID', 'TEAM_ABBREVIATION',
       'AGE', 'GP', 'W', 'L', 'W_PCT', 'MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M',
       'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST',
       'TOV', 'STL', 'BLK', 'BLKA', 'PF', 'PFD', 'PTS', 'PLUS_MINUS',
       'NBA_FANTASY_PTS', 'DD2', 'TD3', 'WNBA_FANTASY_PTS', 'GP_RANK',
       'W_RANK', 'L_RANK', 'W_PCT_RANK', 'MIN_RANK', 'FGM_RANK', 'FGA_RANK',
       'FG_PCT_RANK', 'FG3M_RANK', 'FG3A_RANK', 'FG3_PCT_RANK', 'FTM_RANK',
       'FTA_RANK', 'FT_PCT_RANK', 'OREB_RANK', 'DREB_RANK', 'REB_RANK',
       'AST_RANK', 'TOV_RANK', 'STL_RANK', 'BLK_RANK', 'BLKA_RANK', 'PF_RANK',
       'PFD_RANK', 'PTS_RANK', 'PLUS_MINUS_RANK', 'NBA_FANTASY_PTS_RANK',
       'DD2_RANK', 'TD3_RANK', 'WNBA_FANTASY_PTS_RANK', 'CFID', 'CFPARAMS'],
      dtype='object')

In [23]:
# Stats that are not needed for the overview table below.
drop_stats = [
    "W", "L", "W_PCT",
    "FG_PCT", "FG3_PCT", "FT_PCT",
    "REB",
    "NBA_FANTASY_PTS", "DD2", "TD3", "WNBA_FANTASY_PTS",
]
# Each dropped stat has a companion "<stat>_RANK" column that goes with it.
drop_ranks = [f"{stat}_RANK" for stat in drop_stats]
drop_cols = ["NICKNAME", "TEAM_ID"] + drop_stats + drop_ranks + ["CFID", "CFPARAMS"]

# Show the ten rows with the largest MIN value after trimming the columns.
(
    df.drop(columns=drop_cols)
    .sort_values(by="MIN", ascending=False)
    .head(10)
)

Unnamed: 0,PLAYER_ID,PLAYER_NAME,TEAM_ABBREVIATION,AGE,GP,MIN,FGM,FGA,FG3M,FG3A,...,DREB_RANK,AST_RANK,TOV_RANK,STL_RANK,BLK_RANK,BLKA_RANK,PF_RANK,PFD_RANK,PTS_RANK,PLUS_MINUS_RANK
24,952,Antoine Walker,BOS,25.0,81,3406.435,7.0,17.8,2.3,6.8,...,84,105,78,138,262,46,333,153,32,95
395,1495,Tim Duncan,SAS,26.0,82,3331.66,8.3,16.3,0.0,0.1,...,12,137,56,366,33,67,352,153,9,22
258,84,Latrell Sprewell,NYK,31.0,81,3327.863333,6.2,15.4,1.6,4.4,...,382,132,118,241,367,206,420,153,67,273
33,1884,Baron Davis,CHH,23.0,82,3319.765,6.1,14.5,1.8,5.2,...,353,19,77,36,219,255,321,153,79,123
320,1718,Paul Pierce,BOS,24.0,82,3298.856667,7.7,17.4,2.3,5.7,...,134,162,80,62,135,100,326,153,7,62
158,56,Gary Payton,SEA,33.0,82,3297.143333,8.0,17.2,0.8,2.6,...,284,13,151,103,305,222,409,152,20,106
380,950,Stephon Marbury,PHX,25.0,82,3186.473333,7.1,16.0,0.8,2.8,...,411,20,31,293,369,105,394,151,33,227
245,708,Kevin Garnett,MIN,26.0,81,3171.478333,7.5,15.9,0.4,1.3,...,11,85,86,216,81,274,397,70,23,44
189,147,Jalen Rose,CHI,29.0,83,3155.568333,7.6,16.6,1.0,2.8,...,260,110,147,285,217,250,284,100,25,239
135,224,Eddie Jones,MIA,30.0,81,3155.218333,5.9,13.7,1.7,4.4,...,271,149,301,122,146,363,275,150,68,152


Get both traditional *and* advanced stats, *and* multiple seasons?

In [3]:
def get_leaguedash(
    measure_type: str = "Base",
    season_type: str = "Regular Season",
    season: str = "2001-02",
    per_mode: str = "Per36",
    df: bool = True,
):
    """Issue one LeagueDashPlayerStats request and return its result.

    Parameters
    ---------

    measure_type: str
        Forwarded to the endpoint as ``measure_type_detailed_defense``
    season_type: str
        Forwarded to the endpoint as ``season_type_all_star``
    season: str
        Forwarded to the endpoint as ``season``
    per_mode: str
        Forwarded to the endpoint as ``per_mode_detailed``
    df: bool
        Selects the return format (see Returns)

    Returns
    --------

    The first frame of ``get_data_frames()`` when ``df`` is truthy, otherwise
    the endpoint's normalized JSON parsed with ``json.loads``.
    """
    endpoint = leaguedashplayerstats.LeagueDashPlayerStats(
        season=season,
        season_type_all_star=season_type,
        measure_type_detailed_defense=measure_type,
        per_mode_detailed=per_mode,
        # Kept at "N": other settings were observed to produce odd negative
        # values, and the PLUS_MINUS column is returned regardless.
        plus_minus="N",
    )

    if not df:
        return json.loads(endpoint.get_normalized_json())
    return endpoint.get_data_frames()[0]

In [4]:
measure_type = "|".join(["Base", "Advanced"])
season_type = "|".join(["Regular Season", "Playoffs"])
seasons = "|".join(["2005-06", "2006-07"])

# res = get_leaguedash(measure_type, season_type, seasons)
# res = get_leaguedash(measure_type=measure_type)


# res = get_leaguedash(season=seasons)
res_df = get_leaguedash()

No bueno: combining values in a single request does not work. The endpoint takes one measure type, one season, and one season type at a time.

In [14]:
res_df.drop(drop_cols, axis=1).sort_values("MIN", ascending=False).head(10)

Unnamed: 0,PLAYER_ID,PLAYER_NAME,TEAM_ABBREVIATION,AGE,GP,MIN,FGM,FGA,FG_PCT,FG3M,...,REB,AST,TOV,STL,BLK,BLKA,PF,PFD,PTS,PLUS_MINUS
24,952,Antoine Walker,BOS,25.0,81,3406.435,0.7,1.8,-0.001,1.6,...,-2.7,-0.5,-2.3,1.7,-1.6,1.6,0.1,-0.1,2.5,2.5
395,1495,Tim Duncan,SAS,26.0,82,3331.66,0.7,-3.7,0.039,0.8,...,0.3,1.3,-0.1,-0.4,1.2,-1.2,-3.3,3.3,6.1,6.1
258,84,Latrell Sprewell,NYK,31.0,81,3327.863333,-0.9,0.0,-0.014,0.3,...,-1.7,-0.1,0.5,-0.8,-0.8,0.8,1.0,-1.0,-2.6,-2.6
33,1884,Baron Davis,CHH,23.0,82,3319.765,0.7,0.7,0.006,-0.5,...,2.3,1.3,-0.1,0.3,0.4,-0.4,0.6,-0.6,1.6,1.6
320,1718,Paul Pierce,BOS,24.0,82,3298.856667,1.0,1.5,0.005,1.6,...,-2.6,-0.5,-2.4,1.6,-1.5,1.5,0.0,0.0,3.4,3.4
158,56,Gary Payton,SEA,33.0,82,3297.143333,1.5,0.7,0.019,0.3,...,-1.5,0.7,-1.3,1.1,-0.4,0.4,1.1,-1.1,2.2,2.2
380,950,Stephon Marbury,PHX,25.0,82,3186.473333,0.9,2.6,-0.004,-1.0,...,-0.7,0.0,-0.6,-0.1,-0.8,0.8,2.3,-2.3,-1.1,-1.1
245,708,Kevin Garnett,MIN,26.0,81,3171.478333,2.3,1.9,0.024,-1.1,...,4.8,2.0,0.1,-0.3,0.6,-0.6,-0.1,0.1,4.1,4.1
189,147,Jalen Rose,CHI,29.0,83,3155.568333,0.0,-0.6,0.005,-0.1,...,-1.6,0.0,0.1,0.0,0.3,-0.3,1.3,-1.3,-1.4,-1.4
135,224,Eddie Jones,MIA,30.0,81,3155.218333,2.0,1.1,0.027,-0.5,...,0.9,2.6,0.4,-0.4,1.2,-1.2,1.0,-1.0,0.8,0.8


In [25]:
import json

res_json = json.loads(league_dash.get_normalized_json())
print(f"res_json type: {type(res_json)}\nkeys: {list(res_json.keys())}")

res_json type: <class 'dict'>
keys: ['LeagueDashPlayerStats']


In [26]:
print(
    f"dict value type: {type(res_json['LeagueDashPlayerStats'])}\nlen: {len(res_json['LeagueDashPlayerStats'])}"
)

dict value type: <class 'list'>
len: 440


In [27]:
sample_record = res_json["LeagueDashPlayerStats"][0]
print(f"type: {type(sample_record)}\ncontent: {sample_record}")

type: <class 'dict'>
content: {'PLAYER_ID': 2062, 'PLAYER_NAME': 'A.J. Guyton', 'NICKNAME': 'A.J.', 'TEAM_ID': 1610612741, 'TEAM_ABBREVIATION': 'CHI', 'AGE': 24.0, 'GP': 45, 'W': 10, 'L': 35, 'W_PCT': 0.222, 'MIN': 603.93, 'FGM': 5.2, 'FGA': 14.5, 'FG_PCT': 0.361, 'FG3M': 2.7, 'FG3A': 7.3, 'FG3_PCT': 0.374, 'FTM': 1.3, 'FTA': 1.6, 'FT_PCT': 0.815, 'OREB': 0.7, 'DREB': 1.9, 'REB': 2.6, 'AST': 4.8, 'TOV': 2.2, 'STL': 0.6, 'BLK': 0.4, 'BLKA': 0.2, 'PF': 1.4, 'PFD': 0.0, 'PTS': 14.5, 'PLUS_MINUS': -2.6, 'NBA_FANTASY_PTS': 25.8, 'DD2': 0, 'TD3': 0, 'WNBA_FANTASY_PTS': 26.8, 'GP_RANK': 296, 'W_RANK': 360, 'L_RANK': 129, 'W_PCT_RANK': 416, 'MIN_RANK': 301, 'FGM_RANK': 159, 'FGA_RANK': 73, 'FG_PCT_RANK': 385, 'FG3M_RANK': 7, 'FG3A_RANK': 7, 'FG3_PCT_RANK': 90, 'FTM_RANK': 349, 'FTA_RANK': 379, 'FT_PCT_RANK': 100, 'OREB_RANK': 347, 'DREB_RANK': 422, 'REB_RANK': 418, 'AST_RANK': 83, 'TOV_RANK': 168, 'STL_RANK': 385, 'BLK_RANK': 251, 'BLKA_RANK': 420, 'PF_RANK': 433, 'PFD_RANK': 153, 'PTS_RANK': 

In [31]:
some_recs = res_json["LeagueDashPlayerStats"][:5]
df_some_recs = pd.DataFrame.from_dict(
    some_recs,
    orient="columns",
).set_index("PLAYER_ID", drop=True)
df_some_recs

Unnamed: 0_level_0,PLAYER_NAME,NICKNAME,TEAM_ID,TEAM_ABBREVIATION,AGE,GP,W,L,W_PCT,MIN,...,PF_RANK,PFD_RANK,PTS_RANK,PLUS_MINUS_RANK,NBA_FANTASY_PTS_RANK,DD2_RANK,TD3_RANK,WNBA_FANTASY_PTS_RANK,CFID,CFPARAMS
PLAYER_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2062,A.J. Guyton,A.J.,1610612741,CHI,24.0,45,10,35,0.222,603.93,...,433,153,138,271,304,225,18,193,5,20621610612741
243,Aaron McKie,Aaron,1610612755,PHI,29.0,48,26,22,0.542,1469.013333,...,415,153,144,84,177,176,18,152,5,2431610612755
1425,Aaron Williams,Aaron,1610612751,NJN,30.0,82,52,30,0.634,1549.918333,...,72,153,165,82,123,225,18,146,5,14251610612751
1502,Adonal Foyle,Adonal,1610612744,GSW,27.0,79,19,60,0.241,1485.29,...,125,96,365,353,56,127,18,112,5,15021610612744
1559,Adrian Griffin,Adrian,1610612742,DAL,27.0,58,40,18,0.69,1386.62,...,185,153,291,53,243,225,18,292,5,15591610612742


## Transform

See `Transform` in `data-src.md`

In [26]:
from pathlib import Path
import pickle
import pandas as pd
from typing import Any

data_path = Path("../data/")

season = "2018-19"
reg_pkl = data_path / f"leaguedash_regular_{season}.pkl"
post_pkl = data_path / f"leaguedash_playoffs_{season}.pkl"

# with open(reg_pkl, "rb") as reg_file, open(post_pkl, "rb") as post_file:
#     reg_json = pickle.load(reg_file)
#     post_json = pickle.load(post_file)

In [27]:
# loading pickles
def load_pickle(fp: Path) -> pd.DataFrame:
    """Load a pickled list of player records and return it as a DataFrame.

    Parameters
    ---------

    fp: Path
        Location of the pickle file. It must contain a non-empty list whose
        elements are all dicts, each carrying a "PLAYER_ID" key.

    Returns
    --------

    df: pd.DataFrame
        One row per record, indexed by "PLAYER_ID"

    Raises
    ------

    TypeError
        If the unpickled object is not a non-empty list of dicts
    """
    # NOTE(security): pickle.load can execute arbitrary code while
    # deserializing; only point this at files produced by this project.
    with open(fp, "rb") as f_in:
        res = pickle.load(f_in)

    # Validate every element (not just the first) and reject an empty list,
    # which would otherwise surface as an IndexError on res[0].
    is_records = (
        isinstance(res, list)
        and len(res) > 0
        and all(isinstance(record, dict) for record in res)
    )
    if not is_records:
        raise TypeError("Expected list of dicts")

    return pd.DataFrame(res).set_index("PLAYER_ID")

In [28]:
reg_df = load_pickle(reg_pkl)
post_df = load_pickle(post_pkl)

In [4]:
# Columns removed by feature_engineer (which first derives FG2M / FG2A from
# FGM / FGA and the three-point columns, then drops everything listed here).
ALL_COLS = list(reg_df.columns)
DROP_STATS = [
    "NICKNAME", "TEAM_ID",
    "W", "L",
    "FGM", "FGA",
    "REB",
    "NBA_FANTASY_PTS", "DD2", "TD3", "WNBA_FANTASY_PTS",
    "CFID", "CFPARAMS",
]
# Substring match (not suffix): any column whose name contains "_RANK" or "_PCT".
DROP_RANK_PCT = [
    col for col in ALL_COLS if any(tag in col for tag in ("_RANK", "_PCT"))
]
DROP_COLS = DROP_STATS + DROP_RANK_PCT

In [35]:
reg_df.head(10)

Unnamed: 0_level_0,PLAYER_NAME,NICKNAME,TEAM_ID,TEAM_ABBREVIATION,AGE,GP,W,L,W_PCT,MIN,...,PF_RANK,PFD_RANK,PTS_RANK,PLUS_MINUS_RANK,NBA_FANTASY_PTS_RANK,DD2_RANK,TD3_RANK,WNBA_FANTASY_PTS_RANK,CFID,CFPARAMS
PLAYER_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
203932,Aaron Gordon,Aaron,1610612753,ORL,23.0,78,40,38,0.513,2632.533333,...,423,121,157,165,159,34,38,147,5,2039321610612753
1628988,Aaron Holiday,Aaron,1610612754,IND,22.0,50,31,19,0.62,645.83,...,147,170,175,58,236,261,38,210,5,16289881610612754
1627846,Abdel Nader,Abdel,1610612760,OKC,25.0,61,38,23,0.623,693.656667,...,204,382,353,469,426,261,38,420,5,16278461610612760
201143,Al Horford,Al,1610612738,BOS,33.0,68,41,27,0.603,1972.683333,...,435,319,161,75,81,70,16,88,5,2011431610612738
202329,Al-Farouq Aminu,Al-Farouq,1610612757,POR,28.0,81,52,29,0.642,2291.698333,...,450,296,380,52,290,60,38,320,5,2023291610612757
1626210,Alan Williams,Alan,1610612751,BKN,26.0,5,1,4,0.2,25.533333,...,331,81,19,6,3,261,38,3,5,16262101610612751
202692,Alec Burks,Alec,1610612758,SAC,27.0,64,19,45,0.297,1374.946667,...,419,176,251,462,276,189,38,273,5,2026921610612758
203518,Alex Abrines,Alex,1610612760,OKC,25.0,31,21,10,0.677,588.273333,...,252,365,451,154,511,261,38,496,5,2035181610612760
1627936,Alex Caruso,Alex,1610612747,LAL,25.0,25,8,17,0.32,531.126667,...,185,99,205,117,215,131,38,204,5,16279361610612747
203458,Alex Len,Alex,1610612737,ATL,26.0,77,28,49,0.364,1544.075,...,79,112,79,351,101,78,38,78,5,2034581610612737


In [36]:
set(df_list[0].columns) - set(reg_df.columns)

{'PLAYER_ID'}

In [7]:
def dump_pickle(obj, fp: Path) -> None:
    """Pickle ``obj`` into the file at ``fp``, overwriting any existing file.

    Parameters
    ---------

    obj:
        Any picklable object
    fp: Path
        Destination file, opened in binary write mode
    """
    with open(fp, mode="wb") as sink:
        pickle.Pickler(sink).dump(obj)

In [9]:
# Save the loaded regular-season and playoff frames as intermediate results
# for testing. Uses dump_pickle (defined in the cell above) instead of
# repeating the open/pickle.dump boilerplate.
test_reg_df_path = data_path / "test_regular_df.pkl"
test_playoffs_df_path = data_path / "test_playoffs_df.pkl"
print(test_reg_df_path.resolve())
dump_pickle(reg_df, test_reg_df_path)
dump_pickle(post_df, test_playoffs_df_path)

/home/kohada/project-weekly/streamlit/data/test_regular_df.pkl


### Merging to one df, then .apply()

In [28]:
# Trial run: join regular-season and playoff rows on the PLAYER_ID index,
# using only the first ten rows of each frame.
cols = reg_df.columns
fo = reg_df.head(10)
ba = post_df.head(10)
foo = fo.drop(columns=DROP_COLS)
bar = ba.drop(columns=DROP_COLS)

# Only the columns from "GP" onward are taken from the playoff side, so the
# leading bio columns come from the regular-season frame alone. With an outer
# join, players present only in `bar` therefore end up with NaN bio fields.
baz = foo.merge(
    bar.loc[:, "GP":],
    how="outer",
    left_index=True,
    right_index=True,
    suffixes=["_reg", "_post"],
)
baz

Unnamed: 0_level_0,PLAYER_NAME,TEAM_ABBREVIATION,AGE,GP_reg,MIN_reg,FG3M_reg,FG3A_reg,FTM_reg,FTA_reg,OREB_reg,...,DREB_post,AST_post,TOV_post,STL_post,BLK_post,BLKA_post,PF_post,PFD_post,PTS_post,PLUS_MINUS_post
PLAYER_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2738,,,,,,,,,,,...,3.7,4.7,1.2,1.4,1.3,0.2,2.5,2.7,11.8,3.9
101106,,,,,,,,,,,...,10.3,4.2,2.0,1.0,1.2,0.2,7.7,1.4,10.5,-2.2
101161,,,,,,,,,,,...,3.7,3.7,1.8,1.8,0.0,0.0,5.5,0.0,14.7,-33.0
201143,Al Horford,BOS,33.0,68.0,1972.683333,1.3,3.7,1.4,1.7,2.2,...,7.9,4.6,2.2,0.5,0.8,0.6,2.4,2.1,14.5,3.3
202329,Al-Farouq Aminu,POR,28.0,81.0,2291.698333,1.5,4.4,2.4,2.7,1.8,...,6.4,1.9,1.4,0.8,0.9,1.0,3.1,2.3,10.6,-1.8
202692,Alec Burks,SAC,27.0,64.0,1374.946667,1.6,4.4,3.0,3.7,0.8,...,,,,,,,,,,
203083,,,,,,,,,,,...,9.0,2.5,3.1,1.7,1.4,3.1,4.8,5.7,16.1,-27.1
203458,Alex Len,ATL,26.0,77.0,1544.075,1.7,4.8,3.3,5.0,3.7,...,,,,,,,,,,
203518,Alex Abrines,OKC,25.0,31.0,588.273333,2.5,7.8,0.7,0.8,0.3,...,,,,,,,,,,
203932,Aaron Gordon,ORL,23.0,78.0,2632.533333,1.7,4.7,2.5,3.5,1.8,...,5.5,4.0,2.9,1.3,0.2,0.4,3.5,5.1,16.7,-17.4


In [30]:
only_post_mask = baz["AGE"].isna()
# baz[only_post_mask]
player_bio_cols = ["PLAYER_NAME", "TEAM_ABBREVIATION", "AGE"]


def fill_player_bio(player):
    """Overwrite the bio fields of one merged row with the values from `bar`.

    `player` is a row Series whose ``name`` is the PLAYER_ID; every column
    listed in `player_bio_cols` is looked up in `bar` under that id and
    written into the row, which is then returned.
    """
    player_id = player.name
    for bio_col in player_bio_cols:
        player[bio_col] = bar.loc[player_id, bio_col]
    return player


bae = baz[only_post_mask].apply(lambda x: fill_player_bio(x), axis=1)

Unnamed: 0_level_0,PLAYER_NAME,TEAM_ABBREVIATION,AGE,GP_reg,MIN_reg,FG3M_reg,FG3A_reg,FTM_reg,FTA_reg,OREB_reg,...,DREB_post,AST_post,TOV_post,STL_post,BLK_post,BLKA_post,PF_post,PFD_post,PTS_post,PLUS_MINUS_post
PLAYER_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2738,Andre Iguodala,GSW,35.0,,,,,,,,...,3.7,4.7,1.2,1.4,1.3,0.2,2.5,2.7,11.8,3.9
101106,Andrew Bogut,GSW,34.0,,,,,,,,...,10.3,4.2,2.0,1.0,1.2,0.2,7.7,1.4,10.5,-2.2
101161,Amir Johnson,PHI,32.0,,,,,,,,...,3.7,3.7,1.8,1.8,0.0,0.0,5.5,0.0,14.7,-33.0
203083,Andre Drummond,DET,25.0,,,,,,,,...,9.0,2.5,3.1,1.7,1.4,3.1,4.8,5.7,16.1,-27.1
1628035,Alfonzo McKinnie,GSW,26.0,,,,,,,,...,5.1,0.6,0.9,0.3,0.2,0.6,4.9,1.5,10.1,1.1


Messy: the player bio has to be filled in from the other dataframe.

In [30]:
cols = reg_df.columns
rm_rank = [col for col in cols if "_RANK" in col or "_PCT" in col]
print(rm_rank)

['W_PCT', 'FG_PCT', 'FG3_PCT', 'FT_PCT', 'GP_RANK', 'W_RANK', 'L_RANK', 'W_PCT_RANK', 'MIN_RANK', 'FGM_RANK', 'FGA_RANK', 'FG_PCT_RANK', 'FG3M_RANK', 'FG3A_RANK', 'FG3_PCT_RANK', 'FTM_RANK', 'FTA_RANK', 'FT_PCT_RANK', 'OREB_RANK', 'DREB_RANK', 'REB_RANK', 'AST_RANK', 'TOV_RANK', 'STL_RANK', 'BLK_RANK', 'BLKA_RANK', 'PF_RANK', 'PFD_RANK', 'PTS_RANK', 'PLUS_MINUS_RANK', 'NBA_FANTASY_PTS_RANK', 'DD2_RANK', 'TD3_RANK', 'WNBA_FANTASY_PTS_RANK']


In [5]:
def feature_engineer(df: pd.DataFrame) -> pd.DataFrame:
    """Add two-point shooting columns and strip the columns in DROP_COLS.

    FG2M and FG2A are derived as the difference between the overall field-goal
    columns (FGM / FGA) and the three-point columns (FG3M / FG3A). The input
    frame is left untouched; a new frame is returned.
    """
    two_pt_made = df["FGM"] - df["FG3M"]
    two_pt_attempted = df["FGA"] - df["FG3A"]
    return df.assign(FG2M=two_pt_made, FG2A=two_pt_attempted).drop(columns=DROP_COLS)

In [6]:
feat_reg = feature_engineer(reg_df)
feat_post = feature_engineer(post_df)
# feat_reg

In [8]:
# saving intermediate result for testing
test_regular_feat = data_path / "test_regular_feat.pkl"
test_playoffs_feat = data_path / "test_playoffs_feat.pkl"
dump_pickle(feat_reg, test_regular_feat)
dump_pickle(feat_post, test_playoffs_feat)

In [20]:
feat_reg.columns

Index(['PLAYER_NAME', 'TEAM_ABBREVIATION', 'AGE', 'GP', 'MIN', 'FG3M', 'FG3A',
       'FTM', 'FTA', 'OREB', 'DREB', 'AST', 'TOV', 'STL', 'BLK', 'BLKA', 'PF',
       'PFD', 'PTS', 'PLUS_MINUS', 'FG2M', 'FG2A'],
      dtype='object')

In [18]:
player = feat_reg.loc[203932]
player.name

203932

In [10]:
# Raw stat columns that are replaced by their "<stat>_merge" counterpart and
# dropped from the merged frame.
MERGE_STATS = [
    "GP",
    "MIN",
    "FG3M",
    "FG3A",
    "FTM",
    "FTA",
    "OREB",
    "DREB",
    "AST",
    "TOV",
    "STL",
    "BLK",
    "BLKA",
    "PF",
    "PFD",
    "PTS",
    "PLUS_MINUS",
    "FG2M",
    "FG2A",
]
# Descriptive columns that are carried through unchanged (never merged).
PLAYER_BIO = {"PLAYER_NAME", "TEAM_ABBREVIATION", "AGE"}


def _reg_post_merge(
    player: pd.Series,
    post_df: pd.DataFrame,
    post_wt: float = 2.0,
) -> pd.Series:
    """Folds regular and post season stats of one player into one via a
    weight coefficient. To be used in df.apply(..., axis=1)

    For every non-bio field ``s`` a new field ``s_merge`` is added:

        (GP_reg * s_reg + post_wt * GP_post * s_post) / (GP_reg + post_wt * GP_post)

    GP and MIN are not bio fields, so they go through the same formula.

    Parameters
    ---------

    player: pd.Series
        One regular-season row; ``player.name`` is the id used to look the
        player up in ``post_df``
    post_df: pd.DataFrame
        Post-season frame, indexed like the regular-season frame
    post_wt: float
        Multiplier applied to post-season games played when weighting

    Returns
    --------

    player: pd.Series
        Copy of the input row with the additional ``*_merge`` fields
    """
    player = player.copy()  # avoids mutating the df as it's being iterated

    # Index membership is a hash lookup; building set(post_df.index) here
    # would redo O(len(post_df)) work for every row.
    if player.name in post_df.index:
        post_season = post_df.loc[player.name]
    else:
        # No post-season row: reuse the regular-season values, which makes
        # every merged stat equal to its regular-season value.
        post_season = player

    gp_tot = player["GP"] + post_wt * post_season["GP"]
    # The two weights do not depend on the stat, so compute them once.
    reg_share = player["GP"] / gp_tot
    post_share = post_wt * post_season["GP"] / gp_tot

    # Snapshot the field names before new "*_merge" fields are appended.
    stat_names = [stat for stat in player.index if stat not in PLAYER_BIO]
    for stat in stat_names:
        player[stat + "_merge"] = (
            reg_share * player[stat] + post_share * post_season[stat]
        )
    return player


def reg_post_merge(reg_df, post_df, post_wt: float = 2.0) -> pd.DataFrame:
    """Wrapper for df.apply(_reg_post_merge)
    This pattern can include the subsequent dropping of columns, for
    better unit testing

    Only rows of ``reg_df`` are iterated, so players that appear in
    ``post_df`` alone are not part of the result. The raw MERGE_STATS columns
    are dropped; the bio columns and the ``*_merge`` columns remain.
    """
    merged = reg_df.apply(_reg_post_merge, post_df=post_df, post_wt=post_wt, axis=1)
    return merged.drop(MERGE_STATS, axis=1)

In [12]:
# saving intermediates for testing
merge_df = reg_post_merge(reg_df=feat_reg, post_df=feat_post)
merge_df_fp = data_path / "test_merge_df.pkl"
dump_pickle(merge_df, merge_df_fp)

In [37]:
haf = feat_reg.sample(10)

Unnamed: 0_level_0,AGE,AST,AST_merge,BLK,BLKA,BLKA_merge,BLK_merge,DREB,DREB_merge,FG2A,...,PLAYER_NAME,PLUS_MINUS,PLUS_MINUS_merge,PTS,PTS_merge,STL,STL_merge,TEAM_ABBREVIATION,TOV,TOV_merge
PLAYER_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
203089,28.0,2.7,,2.1,0.2,,,10.5,,6.9,...,John Henson,0.2,,15.1,,1.3,,CLE,2.3,
1626149,25.0,2.7,2.746392,1.8,1.2,1.153608,1.660825,5.9,5.729897,14.5,...,Montrezl Harrell,1.0,-1.907216,22.7,23.071134,1.2,1.12268,LAC,2.2,2.138144
1628396,21.0,1.0,,2.0,0.0,,,6.0,,16.0,...,Tony Bradley,-18.1,,17.0,,2.0,,UTA,3.0,
201950,29.0,7.8,,0.8,1.0,,,3.9,,12.0,...,Jrue Holiday,3.5,,21.3,,1.6,,NOP,3.1,
201568,30.0,3.1,3.063855,0.4,0.7,0.808434,0.363855,6.3,6.06506,9.0,...,Danilo Gallinari,1.8,0.083133,23.5,23.10241,0.9,0.990361,LAC,1.7,1.754217
2585,35.0,3.7,3.446575,0.7,0.7,0.768493,0.652055,6.8,7.245205,7.7,...,Zaza Pachulia,-4.0,-3.609589,10.9,10.721918,1.3,1.210959,DET,2.3,2.258904
1628971,22.0,2.3,2.180952,0.9,1.0,0.952381,0.864286,3.4,3.519048,5.5,...,Bruce Brown,-0.1,-3.111905,7.9,7.947619,1.0,1.035714,DET,1.1,1.12381
201573,30.0,6.5,,0.1,0.4,,,2.8,,7.2,...,Jerryd Bayless,-8.8,,11.5,,1.0,,MIN,1.8,
1627816,25.0,2.0,,1.2,0.8,,,5.5,,6.9,...,Alex Poythress,-8.9,,12.6,,0.5,,ATL,1.5,
203918,26.0,2.4,2.042857,0.2,0.4,0.364286,0.235714,2.5,2.535714,8.3,...,Rodney Hood,-5.2,-3.485714,15.3,15.264286,1.1,0.921429,POR,1.0,0.892857


In [44]:
# Row-wise application needs the row-level helper: reg_post_merge is the
# frame-level wrapper (reg_df, post_df) and cannot be called with a single row.
# Going through the helper directly also keeps the raw stat columns alongside
# the "*_merge" columns, which is what this inspection cell displays.
tok = haf.apply(_reg_post_merge, post_df=feat_post, post_wt=3.0, axis=1)
tok

Unnamed: 0_level_0,PLAYER_NAME,TEAM_ABBREVIATION,AGE,GP,MIN,FG3M,FG3A,FTM,FTA,OREB,...,TOV_merge,STL_merge,BLK_merge,BLKA_merge,PF_merge,PFD_merge,PTS_merge,PLUS_MINUS_merge,FG2M_merge,FG2A_merge
PLAYER_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
203089,John Henson,CLE,28.0,14,188.408333,2.1,5.9,1.1,1.9,3.1,...,2.3,1.3,2.1,0.2,3.1,1.7,15.1,0.2,3.8,6.9
1626149,Montrezl Harrell,LAC,25.0,82,2158.148333,0.1,0.3,4.4,6.9,3.1,...,2.128,1.11,1.638,1.146,4.3,5.82,23.132,-2.384,9.27,14.446
1628396,Tony Bradley,UTA,21.0,3,35.9,0.0,0.0,1.0,2.0,9.0,...,3.0,2.0,2.0,0.0,6.0,2.0,17.0,-18.1,8.0,16.0
201950,Jrue Holiday,NOP,29.0,67,2401.928333,1.8,5.4,3.1,4.1,1.1,...,3.1,1.6,0.8,1.0,2.2,3.6,21.3,3.5,6.4,12.0
201568,Danilo Gallinari,LAC,30.0,68,2059.211667,2.8,6.5,6.4,7.1,0.9,...,1.762791,1.004651,0.35814,0.825581,2.34186,5.590698,23.039535,-0.188372,4.462791,9.648837
2585,Zaza Pachulia,DET,35.0,68,878.165,0.0,0.2,4.0,5.1,4.1,...,2.251351,1.194595,0.643243,0.781081,6.102703,5.643243,10.689189,-3.537838,3.483784,7.886486
1628971,Bruce Brown,DET,22.0,74,1449.316667,0.6,2.3,1.1,1.5,1.2,...,1.127907,1.04186,0.85814,0.944186,4.497674,1.386047,7.955814,-3.630233,2.513953,5.527907
201573,Jerryd Bayless,MIN,30.0,34,656.996667,1.6,5.4,0.9,1.5,0.6,...,1.8,1.0,0.1,0.4,3.1,1.4,11.5,-8.8,2.9,7.2
1627816,Alex Poythress,ATL,25.0,21,304.9,1.1,2.7,2.1,3.4,3.4,...,1.5,0.5,1.2,0.8,5.5,3.3,12.6,-8.9,3.6,6.9
203918,Rodney Hood,POR,26.0,72,1892.563333,1.6,4.5,2.6,2.9,0.5,...,0.88,0.9,0.24,0.36,2.76,2.56,15.26,-3.28,3.72,7.3


In [None]:
# step 3 - re-rank given merged stats
def _leaguedash_rerank(stat: pd.Series) -> pd.Series:
    """Ranks all the values in the given stat column.
    Largest values will be given top ranks
    To be used in df.apply()

    Ties are broken by order of appearance, so the result is deterministic;
    missing values are ranked last. Ranks are divided by the number of rows,
    giving values in (0, 1] where the smallest value is the top rank.

    Parameters
    ---------

    stat: pd.Series
        A statistical field with numeric values to be ranked

    Returns
    --------

    stat_rank: pd.Series
        Ranking of the stat Series, aligned to ``stat.index`` and named
        ``"<stat.name>_RANK"``
    """
    ranks = stat.rank(method="first", ascending=False, na_option="bottom")

    # standardize by dividing by num of players
    # NOTE: df.apply() labels the resulting columns with the input column
    # names, so the Series name set here is not what ends up in the frame;
    # the wrapper renames the columns afterwards.
    return (ranks / len(stat)).rename(f"{stat.name}_RANK")


def leaguedash_rerank(merge_df: pd.DataFrame) -> pd.DataFrame:
    """Wrapper for df.apply(_leaguedash_rerank)

    Ranks every column whose name ends in "_merge" and appends the result as
    "<stat>_RANK" columns to a copy of the input. Columns without the
    "_merge" suffix (player bio and anything else) are passed through
    unranked, which avoids producing duplicate column names.

    Parameters
    ---------

    merge_df: pd.DataFrame
        Frame holding "<stat>_merge" columns, one row per player

    Returns
    --------

    merge_rerank: pd.DataFrame
        ``merge_df`` plus one "<stat>_RANK" column per "<stat>_merge" column
    """
    suffix = "_merge"
    merge_cols = [col for col in merge_df.columns if str(col).endswith(suffix)]

    rerank = merge_df[merge_cols].apply(_leaguedash_rerank, axis="index")
    rerank.columns = [col[: -len(suffix)] + "_RANK" for col in merge_cols]

    merge_rerank = pd.concat([merge_df, rerank], axis="columns")
    return merge_rerank

In [None]:
rerank_df = leaguedash_rerank(merge_df=merge_df)
rerank_fp = data_path / "test_rerank_df.pkl"

In [None]:
dump_pickle(rerank_df, rerank_fp)

In [52]:
# step 4 filter for minutes and games played
def _player_meets_standard(
    player: pd.Series, min_thd: int = 800, gp_thd: int = 40
) -> bool:
    """Does this player pass the minutes or games played threshold?
    Considers the folded minutes/games played

    Parameters
    ---------

    player: pd.Series
        One row with "MIN_merge" and "GP_merge" fields
    min_thd: int
        Minimum value of "MIN_merge" to qualify
    gp_thd: int
        Minimum value of "GP_merge" to qualify

    Returns
    --------

    bool
        True when either threshold is met (inclusive)
    """
    # bool() turns a numpy boolean into a plain bool, matching the annotation
    return bool(player["MIN_merge"] >= min_thd or player["GP_merge"] >= gp_thd)


def player_meets_standard(
    df: pd.DataFrame, min_thd: int = 800, gp_thd: int = 40
) -> pd.DataFrame:
    """
    Wrapper for df.apply(_player_meets_standard, ...)

    Returns a new frame with a boolean "gametime_threshold" column added;
    the frame passed in is not modified.

    Parameters
    ---------

    df: pd.DataFrame
        Frame with "MIN_merge" and "GP_merge" columns, one row per player
    min_thd: int
        Forwarded to _player_meets_standard
    gp_thd: int
        Forwarded to _player_meets_standard

    Returns
    --------

    pd.DataFrame
        Copy of ``df`` with the "gametime_threshold" column
    """
    if len(df.index) == 0:
        # apply(axis=1) on a zero-row frame hands back a frame rather than a
        # Series, which cannot be assigned to a single column.
        meets = pd.Series(False, index=df.index, dtype=bool)
    else:
        meets = df.apply(
            _player_meets_standard, min_thd=min_thd, gp_thd=gp_thd, axis=1
        )
    return df.assign(gametime_threshold=meets)

In [None]:
gametime_fp = data_path / "test_gametime_df.pkl"
gametime_df = player_meets_standard(df=rerank_df)

In [None]:
dump_pickle(gametime_df, gametime_fp)

In [54]:
time_filter = tok.apply(_player_meets_standard, min_thd=1000, axis=1)
tok[time_filter]

Unnamed: 0_level_0,PLAYER_NAME,TEAM_ABBREVIATION,AGE,GP,MIN,FG3M,FG3A,FTM,FTA,OREB,...,TOV_merge,STL_merge,BLK_merge,BLKA_merge,PF_merge,PFD_merge,PTS_merge,PLUS_MINUS_merge,FG2M_merge,FG2A_merge
PLAYER_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1626149,Montrezl Harrell,LAC,25.0,82,2158.148333,0.1,0.3,4.4,6.9,3.1,...,2.128,1.11,1.638,1.146,4.3,5.82,23.132,-2.384,9.27,14.446
201950,Jrue Holiday,NOP,29.0,67,2401.928333,1.8,5.4,3.1,4.1,1.1,...,3.1,1.6,0.8,1.0,2.2,3.6,21.3,3.5,6.4,12.0
201568,Danilo Gallinari,LAC,30.0,68,2059.211667,2.8,6.5,6.4,7.1,0.9,...,1.762791,1.004651,0.35814,0.825581,2.34186,5.590698,23.039535,-0.188372,4.462791,9.648837
2585,Zaza Pachulia,DET,35.0,68,878.165,0.0,0.2,4.0,5.1,4.1,...,2.251351,1.194595,0.643243,0.781081,6.102703,5.643243,10.689189,-3.537838,3.483784,7.886486
1628971,Bruce Brown,DET,22.0,74,1449.316667,0.6,2.3,1.1,1.5,1.2,...,1.127907,1.04186,0.85814,0.944186,4.497674,1.386047,7.955814,-3.630233,2.513953,5.527907
203918,Rodney Hood,POR,26.0,72,1892.563333,1.6,4.5,2.6,2.9,0.5,...,0.88,0.9,0.24,0.36,2.76,2.56,15.26,-3.28,3.72,7.3
