# Altair Demo

Trying out the charts here before moving them to Streamlit.

In [1]:
import altair as alt
import pandas as pd
from pathlib import Path

# Turn off Altair's max-rows check on chart data.
alt.data_transformers.disable_max_rows()
# Make "json" the active data transformer.
# NOTE(review): presumably this writes chart data to side files instead of
# embedding it in the chart spec — confirm against the Altair documentation.
alt.data_transformers.enable("json")

DataTransformerRegistry.enable('json')

In [2]:
# Labelled league-dashboard table for the 2018-19 season.
fp = Path("../data/leaguedash_labelled_2018-19.pkl")
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
tor = pd.read_pickle(fp)

In [3]:
# Labelled league-dashboard table for the 2004-05 season.
fp = Path("../data/leaguedash_labelled_2004-05.pkl")
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
det = pd.read_pickle(fp)

## Scatterplot

Scatterplot will be a broad overview of the season's stats:

* FG2A vs FG2M, color=PTS
* FG3A vs FG3M, color=PTS
* FG3A vs FG2A, color=PFD
* AST vs TOV
* BLK vs STL, color=PF

In [4]:
# List the available columns before picking chart encodings.
tor.columns

Index(['PLAYER_NAME', 'TEAM_ABBREVIATION', 'AGE', 'GP_merge', 'MIN_merge',
       'FG3M_merge', 'FG3A_merge', 'FTM_merge', 'FTA_merge', 'OREB_merge',
       'DREB_merge', 'AST_merge', 'TOV_merge', 'STL_merge', 'BLK_merge',
       'BLKA_merge', 'PF_merge', 'PFD_merge', 'PTS_merge', 'PLUS_MINUS_merge',
       'FG2M_merge', 'FG2A_merge', 'GP_RANK', 'MIN_RANK', 'FG3M_RANK',
       'FG3A_RANK', 'FTM_RANK', 'FTA_RANK', 'OREB_RANK', 'DREB_RANK',
       'AST_RANK', 'TOV_RANK', 'STL_RANK', 'BLK_RANK', 'BLKA_RANK', 'PF_RANK',
       'PFD_RANK', 'PTS_RANK', 'PLUS_MINUS_RANK', 'FG2M_RANK', 'FG2A_RANK',
       'gametime_threshold', 'label_pred', 'label_names'],
      dtype='object')

In [5]:
# Non-null count of every column within each predicted cluster label.
tor.groupby("label_pred").count()

Unnamed: 0_level_0,PLAYER_NAME,TEAM_ABBREVIATION,AGE,GP_merge,MIN_merge,FG3M_merge,FG3A_merge,FTM_merge,FTA_merge,OREB_merge,...,BLK_RANK,BLKA_RANK,PF_RANK,PFD_RANK,PTS_RANK,PLUS_MINUS_RANK,FG2M_RANK,FG2A_RANK,gametime_threshold,label_names
label_pred,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,121,121,121,121,121,121,121,121,121,121,...,121,121,121,121,121,121,121,121,121,121
1,12,12,12,12,12,12,12,12,12,12,...,12,12,12,12,12,12,12,12,12,12
2,397,397,397,397,397,397,397,397,397,397,...,397,397,397,397,397,397,397,397,397,397


In [6]:
# Adapted from the Altair example gallery:
# https://altair-viz.github.io/gallery/scatter_linked_table.html

# Build ONE boolean mask for both conditions. Chaining two boolean selections
# (`tor[mask_a][mask_b]`) applies a full-length mask to an already-filtered
# frame, which makes pandas warn "Boolean Series key will be reindexed to
# match DataFrame index".
src = tor[tor["gametime_threshold"] & (tor["label_pred"] == 2)]

# Click-and-drag interval selection shared by the scatterplot and the table.
brush = alt.selection_interval()

# Scatterplot: points inside the brush are coloured by points scored,
# everything else is greyed out.
points = (
    alt.Chart(src)
    .mark_point()
    .encode(
        x="FG3A_merge:Q",
        y="FG2A_merge:Q",
        color=alt.condition(brush, "PTS_merge:Q", alt.value("grey")),
    )
    .add_selection(brush)
)

# Base chart for the data table: number the rows, keep only brushed rows,
# re-rank them and show at most the first 19.
ranked_text = (
    alt.Chart(src)
    .mark_text()
    .encode(y=alt.Y("row_number:O", axis=None))
    .transform_window(row_number="row_number()")
    .transform_filter(brush)
    .transform_window(rank="rank(row_number)")
    .transform_filter(alt.datum.rank < 20)
)

# One text column per field, all built on the shared base.
player_name = ranked_text.encode(text="PLAYER_NAME:N").properties(title="Name")
team = ranked_text.encode(text="TEAM_ABBREVIATION:N").properties(title="Team")
pts = ranked_text.encode(text="PTS_merge:Q").properties(title="Points")
text = alt.hconcat(player_name, team, pts)

# Scatterplot on the left, linked table on the right; the legends are resolved
# independently so the scatter's colour scale is not shared with the table.
alt.hconcat(
    points,
    text,
).resolve_legend(color="independent")

  src = tor[tor["gametime_threshold"]][tor["label_pred"] == 2]


The click and drag selection feature is kind of amazing. Perhaps I can further categorize by the cluster labels?

Use `st.checkbox('label_')` to let the user select which labels to plot and, if multiple labels are selected, perhaps encode them via shape.

## Violinplot

Showcase the distribution of each stat for the two seasons in a shotgun array. Each subplot will have two violins, one for each season.

Let's plot FG2A and FG3A

In [7]:
# Tag each frame with its season. A scalar assignment broadcasts to every row,
# so there is no need for a row-wise `apply` that returns a constant.
tor["season"] = "2018-19"
det["season"] = "2004-05"

# Stack both seasons into one frame; the "season" column tells them apart.
src = pd.concat([tor, det], axis=0)
src.sample(10)

Unnamed: 0_level_0,PLAYER_NAME,TEAM_ABBREVIATION,AGE,GP_merge,MIN_merge,FG3M_merge,FG3A_merge,FTM_merge,FTA_merge,OREB_merge,...,PF_RANK,PFD_RANK,PTS_RANK,PLUS_MINUS_RANK,FG2M_RANK,FG2A_RANK,gametime_threshold,label_pred,label_names,season
PLAYER_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1626169,Stanley Johnson,NOP,23.0,66.0,1207.37,1.8,6.4,1.5,1.9,1.0,...,0.433962,0.70566,0.54717,0.803774,0.592453,0.611321,True,2,Brook Lopez-James Harden-Pascal Siakam,2018-19
970,Othella Harrington,CHI,31.0,60.634146,1106.40187,0.0,0.0,3.380488,4.82439,2.868293,...,0.155172,0.096983,0.213362,0.215517,0.086207,0.140086,True,0,Derrick Favors-Clint Capela-Myles Turner,2004-05
990,Malik Rose,NYK,30.0,76.0,1481.77,0.0,0.2,3.3,4.5,3.1,...,0.19181,0.232759,0.428879,0.338362,0.306034,0.282328,True,0,Derrick Favors-Clint Capela-Myles Turner,2004-05
1913,Michael Ruffin,WAS,28.0,66.010309,1052.747148,0.0,0.0,1.041237,2.234021,4.314433,...,0.114224,0.116379,0.989224,0.659483,0.978448,0.984914,True,0,Derrick Favors-Clint Capela-Myles Turner,2004-05
203915,Spencer Dinwiddie,BKN,26.0,59.923077,1685.475556,2.325641,6.948718,5.053846,6.328205,0.576923,...,0.437736,0.086792,0.10566,0.596226,0.34717,0.381132,True,2,Brook Lopez-James Harden-Pascal Siakam,2018-19
203914,Gary Harris,DEN,24.0,42.835294,1269.677373,1.668235,4.871765,2.434118,3.002353,0.801176,...,0.739623,0.498113,0.384906,0.124528,0.456604,0.439623,True,2,Brook Lopez-James Harden-Pascal Siakam,2018-19
762,Shawn Bradley,DAL,33.0,66.230769,749.939487,0.0,0.0,1.738462,2.630769,3.315385,...,0.071121,0.021552,0.780172,0.092672,0.556034,0.635776,True,0,Derrick Favors-Clint Capela-Myles Turner,2004-05
202327,Ekpe Udoh,UTA,32.0,47.436364,297.182939,0.0,0.0,1.947273,3.152727,2.781818,...,0.330189,0.460377,0.669811,0.886792,0.273585,0.471698,True,0,Derrick Favors-Clint Capela-Myles Turner,2018-19
1628963,Marvin Bagley III,SAC,20.0,62.0,1567.495,0.7,2.2,4.2,6.0,3.7,...,0.654717,0.081132,0.109434,0.583019,0.066038,0.058491,True,2,Brook Lopez-James Harden-Pascal Siakam,2018-19
1628369,Jayson Tatum,BOS,21.0,66.010309,2053.968419,1.607216,4.451546,3.01134,3.578351,0.981443,...,0.743396,0.364151,0.237736,0.145283,0.284906,0.250943,True,2,Brook Lopez-James Harden-Pascal Siakam,2018-19


In [8]:
def make_longform(df):
    """Melt the wide stats table into long form, which Altair's grammar prefers.

    Only rows where ``df["gametime_threshold"]`` is truthy are kept. The
    identifying columns (name, team, season, label names, threshold flag) are
    carried along as id columns; every other column is unpivoted into a
    ``variable`` / ``value`` pair. The original index is preserved and
    ``season`` is appended to it as an extra index level.
    """
    id_columns = [
        "PLAYER_NAME",
        "TEAM_ABBREVIATION",
        "season",
        "label_names",
        "gametime_threshold",
    ]
    # Everything that is not an id column gets unpivoted.
    stat_columns = df.drop(columns=id_columns).columns
    qualified = df[df["gametime_threshold"]]
    melted = pd.melt(
        qualified,
        id_vars=id_columns,
        value_vars=stat_columns,
        var_name="variable",
        value_name="value",
        ignore_index=False,  # keep the original row index on the melted rows
    )
    return melted.set_index("season", append=True)


# Long-form copy of the combined two-season table; peek at a few random rows.
src_long = make_longform(src)
src_long.sample(5)

# make_longform already appends "season" to the index, so one season can be
# selected directly from the index, e.g.:
# src_long.xs("2018-19", level="season").head()
# src_long.loc[(slice(None), "2018-19"), :].head()

Unnamed: 0_level_0,Unnamed: 1_level_0,PLAYER_NAME,TEAM_ABBREVIATION,label_names,gametime_threshold,variable,value
PLAYER_ID,season,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
201973,2018-19,Jonas Jerebko,GSW,Brook Lopez-James Harden-Pascal Siakam,True,BLKA_RANK,0.696226
467,2004-05,Jason Kidd,NJN,Brook Lopez-James Harden-Pascal Siakam,True,FG3M_RANK,0.090517
949,2004-05,Shareef Abdur-Rahim,POR,Brook Lopez-James Harden-Pascal Siakam,True,TOV_RANK,0.390086
203090,2018-19,Maurice Harkless,POR,Derrick Favors-Clint Capela-Myles Turner,True,GP_merge,44.695652
2052,2004-05,DeShawn Stevenson,ORL,Brook Lopez-James Harden-Pascal Siakam,True,PF_merge,2.4


In [23]:
# NOTE(review): `vars` shadows the Python builtin of the same name and is not
# referenced by any later cell visible here — consider renaming or removing it.
vars = ["FG2A_merge", "FG3A_merge", "PFD_merge"]


def make_violins(df, var, gametime_threshold: bool = True):
    """Build a violin chart of one stat, faceted into one violin per season.

    Parameters:
    -----------

    df: dataframe, wide-format
        needs a "season" column and, when filtering is on, a boolean
        "gametime_threshold" column

    var: str
        column name for which the values will be density transformed

    gametime_threshold: bool
        when True (the default), only rows where df["gametime_threshold"]
        is truthy are charted

    Returns:
    ---------

    violin: alt.Chart() object
    """
    data = df.loc[df["gametime_threshold"]] if gametime_threshold else df

    # Density runs along x, stacked around the centre line to get the mirrored
    # violin outline; the axis itself is stripped of labels, grid and ticks.
    density_axis = alt.Axis(labels=False, values=[0], grid=False, ticks=False)
    density_x = alt.X(
        "density:Q",
        stack="center",
        impute=None,
        title=None,
        axis=density_axis,
    )

    # One facet column per season; headers are hidden because the colour
    # legend already identifies the season.
    season_facet = alt.Column(
        "season:N",
        header=alt.Header(title=None, labels=False),
    )

    violin = (
        alt.Chart(data)
        .transform_density(
            density=var,
            as_=[var, "density"],
            groupby=["season"],  # plain field name here, no ":N" type suffix
        )
        .mark_area(orient="horizontal")
        .encode(
            x=density_x,
            y=f"{var}:Q",
            color="season:N",
            column=season_facet,
        )
        .properties(width=80)
    )
    return violin

In [24]:
# One violin chart for each of the first eight "*merge*" stat columns.
merge_stats = [stat for stat in src.columns if "merge" in stat][:8]
violins = {stat: make_violins(df=src, var=stat) for stat in merge_stats}

# Lay the charts out four per row, in the same order as `merge_stats`.
# Slicing a list (rather than draining the dict with popitem()) leaves
# `violins` intact so the cell can be re-run, and avoids popitem()'s
# last-in-first-out order, which reversed the stats on the grid.
charts_per_row = 4
violin_charts = list(violins.values())
violin_rows = [
    alt.hconcat(*violin_charts[start : start + charts_per_row])
    for start in range(0, len(violin_charts), charts_per_row)
]
chart = alt.vconcat(*violin_rows)

chart

In [11]:
# Boolean mask of long-form rows whose stat name contains "merge".
# The vectorised string method replaces a row-wise `apply`, which calls a
# Python lambda once per row; `regex=False` keeps it a literal substring test.
violin_stat_filter = src_long["variable"].str.contains("merge", regex=False)
src_long[violin_stat_filter]

Unnamed: 0_level_0,Unnamed: 1_level_0,PLAYER_NAME,TEAM_ABBREVIATION,label_names,gametime_threshold,variable,value
PLAYER_ID,season,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
203932,2018-19,Aaron Gordon,ORL,Brook Lopez-James Harden-Pascal Siakam,True,GP_merge,69.704545
1628988,2018-19,Aaron Holiday,IND,Brook Lopez-James Harden-Pascal Siakam,True,GP_merge,44.964286
1627846,2018-19,Abdel Nader,OKC,Brook Lopez-James Harden-Pascal Siakam,True,GP_merge,55.805970
201143,2018-19,Al Horford,BOS,Brook Lopez-James Harden-Pascal Siakam,True,GP_merge,55.651163
202329,2018-19,Al-Farouq Aminu,POR,Brook Lopez-James Harden-Pascal Siakam,True,GP_merge,62.592920
...,...,...,...,...,...,...,...
2216,2004-05,Zach Randolph,POR,Brook Lopez-James Harden-Pascal Siakam,True,FG2A_merge,16.500000
2560,2004-05,Zarko Cabarkapa,GSW,Brook Lopez-James Harden-Pascal Siakam,True,FG2A_merge,10.700000
2585,2004-05,Zaza Pachulia,MIL,Derrick Favors-Clint Capela-Myles Turner,True,FG2A_merge,9.100000
1442,2004-05,Zeljko Rebraca,LAC,Derrick Favors-Clint Capela-Myles Turner,True,FG2A_merge,9.100000


In [12]:
# First five entries of the mask (head() defaults to five rows).
violin_stat_filter.head()

PLAYER_ID  season 
203932     2018-19    False
1628988    2018-19    False
1627846    2018-19    False
201143     2018-19    False
202329     2018-19    False
dtype: bool

## Bargraphs for direct player comps

When the user chooses a player, season A, and season B, the model will return the three most similar players from season B according to the label.

In addition to just showing the names, we'll also visualize the similarities via bar graph of each basic stat with the chosen players and the players identified by the model.

To give context for those values, the top and bottom measurements for those stats will also be displayed, from both seasons, according to player label. 

For example, choosing 2018-19 Fred VanVleet may return 2004-05 players like Derek Fisher and Mo Pete. When the assists bar graph is drawn, those players will be displayed alongside the top-ranked players, like Steve Nash or Chauncey Billups, and the bottom-ranked ones, to show the range.

In [13]:
# Composite rank used to compare players: plus-minus rank plus minutes rank.
src["comp_rank"] = src["PLUS_MINUS_RANK"] + src["MIN_RANK"]

player_name = "Fred VanVleet"
season_a = "2018-19"  # season the chosen player is taken from
season_b = "2004-05"  # season the comparison players are drawn from

# `src` holds both seasons, so match on season as well as name. A player who
# appears in both seasons would otherwise yield two rows, and the later
# `comp_pool["comp_rank"] - rec["comp_rank"].values` only broadcasts when `rec`
# has a single row.
# NOTE(review): assumes at most one row per player per season — confirm.
rec = src.loc[(src["PLAYER_NAME"] == player_name) & (src["season"] == season_a)]
if rec.empty:
    raise ValueError(f"{player_name!r} not found in the {season_a} data")

# Cluster label of the chosen player (first matching row).
player_label = rec["label_pred"].iloc[0]

# Candidate comps: season-B players with the same cluster label who passed
# the game-time threshold.
comp_pool = src[
    (src["season"] == season_b)
    & (src["label_pred"] == player_label)
    & (src["gametime_threshold"])
]

To find the most similarly ranked players given the `comp_rank` value, compute the absolute difference `(comp_pool['comp_rank'] - rec['comp_rank']).abs()`, sort the resulting deltas in ascending order, and take the top three.

In [14]:
# Absolute comp_rank distance between every pool player and the chosen player;
# `.values` strips the index so the single value broadcasts across the pool.
comp_rank_gap = (comp_pool["comp_rank"] - rec["comp_rank"].values).abs()

# Pool index ordered from the smallest gap to the largest.
similarity_index = comp_rank_gap.sort_values().index

In [15]:
# Reorder the comparison pool from the closest comp_rank match to the farthest.
similarity_rank = comp_pool.loc[similarity_index]
similarity_rank.head()

Unnamed: 0_level_0,PLAYER_NAME,TEAM_ABBREVIATION,AGE,GP_merge,MIN_merge,FG3M_merge,FG3A_merge,FTM_merge,FTA_merge,OREB_merge,...,PFD_RANK,PTS_RANK,PLUS_MINUS_RANK,FG2M_RANK,FG2A_RANK,gametime_threshold,label_pred,label_names,season,comp_rank
PLAYER_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2399,Mike Dunleavy,GSW,24.0,79.0,2569.408333,1.5,3.9,1.9,2.4,1.4,...,0.133621,0.288793,0.44181,0.411638,0.489224,True,2,Brook Lopez-James Harden-Pascal Siakam,2004-05,0.512931
2768,Chris Duhon,CHI,22.0,72.297872,1915.336099,1.561702,4.46383,0.953191,1.265957,0.514894,...,0.536638,0.862069,0.286638,0.976293,0.969828,True,2,Brook Lopez-James Harden-Pascal Siakam,2004-05,0.510776
1894,Corey Maggette,LAC,25.0,66.0,2446.563333,0.8,2.5,8.3,9.7,1.0,...,0.577586,0.040948,0.418103,0.159483,0.163793,True,2,Brook Lopez-James Harden-Pascal Siakam,2004-05,0.510776
468,Jon Barry,HOU,35.0,58.542169,1279.687711,1.76747,4.06747,1.483133,1.666265,0.53253,...,0.849138,0.625,0.077586,0.898707,0.928879,True,2,Brook Lopez-James Harden-Pascal Siakam,2004-05,0.517241
1747,Rafer Alston,TOR,28.0,80.0,2722.86,1.8,5.1,2.5,3.4,0.6,...,0.43319,0.278017,0.478448,0.605603,0.625,True,2,Brook Lopez-James Harden-Pascal Siakam,2004-05,0.517241


Choosing the top ranked and bottom ranked player for a specific stat:

In [16]:
# Stat column used for the comparison bar chart.
bar_stat = "FG3A_merge"


def get_stat_ends(bar_stat: str, comp_pool: pd.DataFrame):
    """Return the highest and lowest rows of ``comp_pool`` for one stat.

    Parameters
    ----------
    bar_stat: str
        name of the column to rank by
    comp_pool: pd.DataFrame
        pool of candidate players; must contain the ``bar_stat`` column

    Returns
    -------
    (top, bot): tuple of single-row DataFrames
        the row with the largest ``bar_stat`` value and the row with the
        smallest one (both empty when ``comp_pool`` is empty)

    Raises
    ------
    KeyError
        if ``bar_stat`` is not a column of ``comp_pool``
    """
    # Rank by the requested stat; the column was previously hard-coded to
    # "FG3A_merge", which silently ignored the `bar_stat` argument.
    ranked = comp_pool.sort_values(by=bar_stat, ascending=False)
    return ranked.head(1), ranked.tail(1)


# Highest- and lowest-ranked pool players for the chosen stat, used to frame
# the range in the bar chart.
top, bot = get_stat_ends(bar_stat=bar_stat, comp_pool=comp_pool)

In [17]:
# Rows to plot: the chosen player, the two closest comps, and the pool's top
# and bottom player for the stat.
df_stat = (
    pd.concat([rec, similarity_rank.head(2), top, bot], axis=0)
    # A comp can also be the pool's top or bottom player; keep one row per
    # player-season so the same bar is not drawn (and stacked) twice.
    .drop_duplicates(subset=["PLAYER_NAME", "season"])
    # Vectorised flag marking the chosen player, used for the bar colour.
    .assign(selected_player=lambda frame: frame["PLAYER_NAME"] == player_name)
)
df_stat.columns

Index(['PLAYER_NAME', 'TEAM_ABBREVIATION', 'AGE', 'GP_merge', 'MIN_merge',
       'FG3M_merge', 'FG3A_merge', 'FTM_merge', 'FTA_merge', 'OREB_merge',
       'DREB_merge', 'AST_merge', 'TOV_merge', 'STL_merge', 'BLK_merge',
       'BLKA_merge', 'PF_merge', 'PFD_merge', 'PTS_merge', 'PLUS_MINUS_merge',
       'FG2M_merge', 'FG2A_merge', 'GP_RANK', 'MIN_RANK', 'FG3M_RANK',
       'FG3A_RANK', 'FTM_RANK', 'FTA_RANK', 'OREB_RANK', 'DREB_RANK',
       'AST_RANK', 'TOV_RANK', 'STL_RANK', 'BLK_RANK', 'BLKA_RANK', 'PF_RANK',
       'PFD_RANK', 'PTS_RANK', 'PLUS_MINUS_RANK', 'FG2M_RANK', 'FG2A_RANK',
       'gametime_threshold', 'label_pred', 'label_names', 'season',
       'comp_rank', 'selected_player'],
      dtype='object')


In [18]:
def make_stat_bar(bar_stat: str, df_stat):
    """Bar chart of one stat per player, highlighting the selected player.

    Parameters
    ----------
    bar_stat: str
        column of ``df_stat`` plotted on the y axis
    df_stat: dataframe
        one row per player; needs "PLAYER_NAME", "selected_player" and the
        ``bar_stat`` column

    Returns
    -------
    alt.Chart with players on x (ordered by the y value) and bars coloured by
    the "selected_player" flag
    """
    name_axis = alt.Axis(labels=True, title="PLAYER NAME", labelAngle=-30)
    # sort="y" orders the player names by their bar value rather than by name.
    x_players = alt.X("PLAYER_NAME:N", sort="y", axis=name_axis)

    bars = alt.Chart(df_stat).mark_bar(width=30)
    return bars.encode(
        x=x_players,
        y=f"{bar_stat}:Q",
        color="selected_player:N",
    ).properties(width=300)


# Render the comparison bars for the stat held in `bar_stat`.
make_stat_bar(bar_stat=bar_stat, df_stat=df_stat)