# Altair Demo

Trying out the charts here before moving to streamlit

In [2]:
import altair as alt
import pandas as pd
from pathlib import Path

alt.data_transformers.enable("json")

DataTransformerRegistry.enable('json')

In [3]:
fp = Path("../data/leaguedash_labelled_2018-19.pkl")
# with open(fp, "rb") as f:
#     tor = pickle.load(f)
tor = pd.read_pickle(fp)

In [4]:
fp = Path("../data/leaguedash_labelled_2004-05.pkl")
# with open(fp, "rb") as f:
#     det = pickle.load(f)
det = pd.read_pickle(fp)

## Scatterplot

Scatterplot will be a broad overview of the season's stats:

* FG2A vs FG2M, color=PTS
* FG3A vs FG3M, color=PTS
* FG3A vs FG2A, color=PFD
* AST vs TOV, 
* BLK vs STL, color=PF

In [5]:
tor.columns

Index(['PLAYER_NAME', 'TEAM_ABBREVIATION', 'AGE', 'GP_merge', 'MIN_merge',
       'FG3M_merge', 'FG3A_merge', 'FTM_merge', 'FTA_merge', 'OREB_merge',
       'DREB_merge', 'AST_merge', 'TOV_merge', 'STL_merge', 'BLK_merge',
       'BLKA_merge', 'PF_merge', 'PFD_merge', 'PTS_merge', 'PLUS_MINUS_merge',
       'FG2M_merge', 'FG2A_merge', 'GP_RANK', 'MIN_RANK', 'FG3M_RANK',
       'FG3A_RANK', 'FTM_RANK', 'FTA_RANK', 'OREB_RANK', 'DREB_RANK',
       'AST_RANK', 'TOV_RANK', 'STL_RANK', 'BLK_RANK', 'BLKA_RANK', 'PF_RANK',
       'PFD_RANK', 'PTS_RANK', 'PLUS_MINUS_RANK', 'FG2M_RANK', 'FG2A_RANK',
       'gametime_threshold', 'label_pred', 'label_names'],
      dtype='object')

In [6]:
tor.groupby(by="label_pred").agg("count")

Unnamed: 0_level_0,PLAYER_NAME,TEAM_ABBREVIATION,AGE,GP_merge,MIN_merge,FG3M_merge,FG3A_merge,FTM_merge,FTA_merge,OREB_merge,...,BLK_RANK,BLKA_RANK,PF_RANK,PFD_RANK,PTS_RANK,PLUS_MINUS_RANK,FG2M_RANK,FG2A_RANK,gametime_threshold,label_names
label_pred,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,135,135,135,135,135,135,135,135,135,135,...,135,135,135,135,135,135,135,135,135,135
1,253,253,253,253,253,253,253,253,253,253,...,253,253,253,253,253,253,253,253,253,253
2,142,142,142,142,142,142,142,142,142,142,...,142,142,142,142,142,142,142,142,142,142


In [7]:
# from example gallery
# https://altair-viz.github.io/gallery/scatter_linked_table.html

src = tor[tor["gametime_threshold"]][tor["label_pred"] == 2]
brush = alt.selection(type="interval")

# scatterplot
points = (
    alt.Chart(src)
    .mark_point()
    .encode(
        x="FG3A_merge:Q",
        y="FG2A_merge:Q",
        color=alt.condition(brush, "PTS_merge:Q", alt.value("grey")),
    )
    .add_selection(brush)
)
# points

# base chart for data tables
ranked_text = (
    alt.Chart(src)
    .mark_text()
    .encode(y=alt.Y("row_number:O", axis=None))
    .transform_window(row_number="row_number()")
    .transform_filter(brush)
    .transform_window(rank="rank(row_number)")
    .transform_filter(alt.datum.rank < 20)
)

# encoding our data table onto the base
player_name = ranked_text.encode(text="PLAYER_NAME:N").properties(title="Name")
team = ranked_text.encode(text="TEAM_ABBREVIATION:N").properties(title="Team")
pts = ranked_text.encode(text="PTS_merge:Q").properties(title="Points")
text = alt.hconcat(player_name, team, pts)

# build chart
alt.hconcat(
    points,
    text,
).resolve_legend(color="independent")

  src = tor[tor["gametime_threshold"]][tor["label_pred"] == 2]


The click and drag selection feature is kind of amazing. Perhaps I can further categorize by the cluster labels?

Use `st.checkbox('label_')` for user to select which labels to plot, and perhaps encode via shape, if multiple labels are selected

## Violinplot

Showcase the distribution of each stat for the two seasons in a shotgun array. Each subplot will have two violins, one for each season.

Let's plot FG2A and FG3A

In [8]:
tor["season"] = tor.apply(lambda x: "2018-19", axis=1)
det["season"] = det.apply(lambda x: "2004-05", axis=1)
src = pd.concat([tor, det], axis=0)
src.sample(10)

Unnamed: 0_level_0,PLAYER_NAME,TEAM_ABBREVIATION,AGE,GP_merge,MIN_merge,FG3M_merge,FG3A_merge,FTM_merge,FTA_merge,OREB_merge,...,PF_RANK,PFD_RANK,PTS_RANK,PLUS_MINUS_RANK,FG2M_RANK,FG2A_RANK,gametime_threshold,label_pred,label_names,season
PLAYER_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
203518,Alex Abrines,OKC,25.0,31.0,588.273333,2.5,7.8,0.7,0.8,0.3,...,0.498113,0.703774,0.85283,0.266038,0.954717,0.969811,False,1,Cory Joseph-Eric Gordon-Jae Crowder,2018-19
1628973,Jalen Brunson,DAL,22.0,73.0,1591.258333,1.4,4.1,2.0,2.7,0.6,...,0.60566,0.232075,0.396226,0.54717,0.354717,0.392453,True,0,Damian Lillard-Khris Middleton-Paul George,2018-19
1629133,Daryl Macon,DAL,23.0,8.0,89.846667,2.0,4.4,1.6,2.8,0.8,...,0.49434,0.801887,0.75283,0.581132,0.841509,0.639623,False,1,Cory Joseph-Eric Gordon-Jae Crowder,2018-19
2557,Luke Ridnour,SEA,24.0,66.980769,2108.886923,0.794231,2.309615,2.115385,2.373077,0.842308,...,0.846983,0.243534,0.605603,0.413793,0.642241,0.612069,True,1,Bruce Bowen-Jeff McInnis-Shane Battier,2004-05
201949,James Johnson,MIA,32.0,55.0,1163.838333,1.5,4.6,1.5,2.2,0.7,...,0.386792,0.715094,0.571698,0.630189,0.530189,0.564151,True,1,Cory Joseph-Eric Gordon-Jae Crowder,2018-19
2407,Jared Jeffries,WAS,23.0,63.185567,1643.369639,0.320619,0.9,1.820619,2.976289,2.720619,...,0.431034,0.786638,0.767241,0.349138,0.676724,0.778017,True,1,Bruce Bowen-Jeff McInnis-Shane Battier,2004-05
201961,Wayne Ellington,DET,31.0,46.57377,1143.982978,3.55082,9.736066,1.021311,1.281967,0.452459,...,0.769811,0.890566,0.488679,0.645283,0.937736,0.950943,True,1,Cory Joseph-Eric Gordon-Jae Crowder,2018-19
2617,Udonis Haslem,MIA,39.0,10.0,74.5,0.0,5.8,1.4,1.9,1.4,...,0.216981,0.273585,0.698113,0.966038,0.25283,0.269811,False,2,Paul Millsap-Jusuf Nurkic-Willie Cauley-Stein,2018-19
2775,Ha Ha,POR,19.0,19.0,100.161667,0.0,0.0,2.2,4.0,2.2,...,0.051724,0.954741,0.773707,0.713362,0.590517,0.560345,False,2,Udonis Haslem-Zydrunas Ilgauskas-Rasheed Wallace,2004-05
2060,Marko Jaric,LAC,26.0,50.0,1662.016667,1.2,3.3,1.3,1.8,0.4,...,0.892241,0.077586,0.670259,0.280172,0.767241,0.788793,True,1,Bruce Bowen-Jeff McInnis-Shane Battier,2004-05


In [9]:
def make_longform(df):
    """Altair prefers longform structures in its grammar"""
    player_bios = [
        "PLAYER_NAME",
        "TEAM_ABBREVIATION",
        "season",
        "label_names",
        "gametime_threshold",
    ]
    longform = df[df["gametime_threshold"]].melt(
        id_vars=player_bios,
        value_vars=df.drop(player_bios, axis="columns").columns,
        var_name="variable",
        value_name="value",
        ignore_index=False,
    )
    longform = longform.set_index("season", append=True)
    # longform.set_index()
    return longform


src_long = make_longform(src)
src_long.sample(5)

# src_long.set_index('season', append=True).xs("2018-19", level="season").head()
# src_long.set_index('season', append=True).loc[(slice(None) ,"2018-19"),:].head()

Unnamed: 0_level_0,Unnamed: 1_level_0,PLAYER_NAME,TEAM_ABBREVIATION,label_names,gametime_threshold,variable,value
PLAYER_ID,season,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2804,2004-05,Andres Nocioni,CHI,Dirk Nowitzki-Kevin Garnett-Shawn Marion,True,PLUS_MINUS_merge,-3.425806
2045,2004-05,Hedo Turkoglu,ORL,Dirk Nowitzki-Kevin Garnett-Shawn Marion,True,OREB_RANK,0.579741
202357,2018-19,Nemanja Bjelica,SAC,Cory Joseph-Eric Gordon-Jae Crowder,True,FG2M_RANK,0.520755
1531,2004-05,Marc Jackson,PHI,Udonis Haslem-Zydrunas Ilgauskas-Rasheed Wallace,True,AST_merge,1.335165
458,2004-05,Howard Eisley,UTA,Bruce Bowen-Jeff McInnis-Shane Battier,True,PTS_merge,10.5


In [10]:
vars = ["FG2A_merge", "FG3A_merge", "PFD_merge"]
# src_a = tor[tor["gametime_threshold"]]
# src_b = det[det["gametime_threshold"]]


def make_violins(df, var, gametime_threshold: bool = True):
    """Make violin charts of var stat grouped by seasons

    Parameters:
    -----------

    df: dataframe, wide-format

    var: str
        column name for which the values will be density transformed

    Returns:
    ---------

    violin: alt.Chart() object
    """
    # use .facet()
    if gametime_threshold:
        df = df.loc[df["gametime_threshold"]]

    # hover = alt.selection_single(on="mouseover", nearest=True, empty="none")

    base = (
        alt.Chart(df)
        .transform_density(
            density=var,
            as_=[var, "density"],
            groupby=["season"],  # don't put :N here
        )
        .mark_area(orient="horizontal")
        .encode(
            y=f"{var}:Q",
            color="season:N",
            x=alt.X(
                "density:Q",
                stack="center",
                impute=None,
                title=None,
                axis=alt.Axis(labels=False, values=[0], grid=False, ticks=False),
            ),
            column=alt.Column(
                "season:N",
                header=alt.Header(
                    titleOrient="bottom",
                    labelAnchor="end",
                    labelOrient="bottom",
                    labelAngle=-30,
                    labelPadding=0,
                ),
            ),
        )
        .properties(width=80)
    )

    return base

In [12]:
alt.data_transformers.disable_max_rows()
alt.data_transformers.enable("json")

DataTransformerRegistry.enable('json')

In [13]:
merge_stats = [stat for stat in src.columns if "merge" in stat]
violins = {stat: make_violins(df=src, var=stat) for stat in merge_stats}
# violin_fg3 = make_violins(src, "FG3A_merge")
# violin_fg2 = make_violins(src, "FG2A_merge")
# violin_fg3 | violin_fg2
chart = alt.vconcat()
while violins:
    rows = alt.hconcat()
    for i in range(4):
        if violins:
            rows |= violins.popitem()[1]
    chart &= rows

chart

In [14]:
violin_stat_filter = src_long.apply(lambda x: "merge" in x["variable"], axis=1)
src_long[violin_stat_filter]

Unnamed: 0_level_0,Unnamed: 1_level_0,PLAYER_NAME,TEAM_ABBREVIATION,label_names,gametime_threshold,variable,value
PLAYER_ID,season,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
203932,2018-19,Aaron Gordon,ORL,Damian Lillard-Khris Middleton-Paul George,True,GP_merge,69.704545
1628988,2018-19,Aaron Holiday,IND,Cory Joseph-Eric Gordon-Jae Crowder,True,GP_merge,44.964286
1627846,2018-19,Abdel Nader,OKC,Cory Joseph-Eric Gordon-Jae Crowder,True,GP_merge,55.805970
201143,2018-19,Al Horford,BOS,Damian Lillard-Khris Middleton-Paul George,True,GP_merge,55.651163
202329,2018-19,Al-Farouq Aminu,POR,Cory Joseph-Eric Gordon-Jae Crowder,True,GP_merge,62.592920
...,...,...,...,...,...,...,...
2216,2004-05,Zach Randolph,POR,Udonis Haslem-Zydrunas Ilgauskas-Rasheed Wallace,True,FG2A_merge,16.500000
2560,2004-05,Zarko Cabarkapa,GSW,Udonis Haslem-Zydrunas Ilgauskas-Rasheed Wallace,True,FG2A_merge,10.700000
2585,2004-05,Zaza Pachulia,MIL,Udonis Haslem-Zydrunas Ilgauskas-Rasheed Wallace,True,FG2A_merge,9.100000
1442,2004-05,Zeljko Rebraca,LAC,Udonis Haslem-Zydrunas Ilgauskas-Rasheed Wallace,True,FG2A_merge,9.100000


In [15]:
violin_stat_filter.head(5)

PLAYER_ID  season 
203932     2018-19    False
1628988    2018-19    False
1627846    2018-19    False
201143     2018-19    False
202329     2018-19    False
dtype: bool

## Bargraphs for direct player comps

When user chooses the player, season A, and season B, the model will return three most similar players from season B according to the label.

In addition to just showing the names, we'll also visualize the similarities via bar graph of each basic stat with the chosen players and the players identified by the model.

To give context for those values, the top and bottom measurements for those stats will also be displayed, from both seasons, according to player label. 

For example, choosing 2018-19 Fred Vanvleet may return 2004-05 players like Derek Fisher and Mo pete. Come time for bar graph visualization of assists, those players will be displayed, as well as the top players like Steve Nash or Chauncey Billups, and the bottom ranked, to show the range

In [19]:
src["comp_rank"] = src["PLUS_MINUS_RANK"] + src["MIN_RANK"]
player_name = "Fred VanVleet"
rec = src.loc[src["PLAYER_NAME"] == player_name]
# returns a pd.Series of len 1
player_label = rec["label_pred"].values[0]

# choosing similar players from season_b
comp_pool = src[
    (src["season"] == "2004-05")
    & (src["label_pred"] == player_label)
    & (src["gametime_threshold"])
]

To find the most similarly ranked player given the `comp_rank` value, calculate a new series, `comp_pool['comp_rank'] - rec['comp_rank']`, sort the resulting deltas in ascending order, and take the top three

In [20]:
similarity_index = (
    (comp_pool["comp_rank"] - rec["comp_rank"].values)
    .abs()
    .sort_values(ascending=True)
    .index
)

In [24]:
similarity_rank = comp_pool.loc[similarity_index]
similarity_rank.head()

Unnamed: 0_level_0,PLAYER_NAME,TEAM_ABBREVIATION,AGE,GP_merge,MIN_merge,FG3M_merge,FG3A_merge,FTM_merge,FTA_merge,OREB_merge,...,PFD_RANK,PTS_RANK,PLUS_MINUS_RANK,FG2M_RANK,FG2A_RANK,gametime_threshold,label_pred,label_names,season,comp_rank
PLAYER_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2768,Chris Duhon,CHI,22.0,72.297872,1915.336099,1.561702,4.46383,0.953191,1.265957,0.514894,...,0.536638,0.862069,0.286638,0.976293,0.969828,True,1,Bruce Bowen-Jeff McInnis-Shane Battier,2004-05,0.510776
468,Jon Barry,HOU,35.0,58.542169,1279.687711,1.76747,4.06747,1.483133,1.666265,0.53253,...,0.849138,0.625,0.077586,0.898707,0.928879,True,1,Bruce Bowen-Jeff McInnis-Shane Battier,2004-05,0.517241
2204,Eddie Griffin,MIN,23.0,70.0,1492.668333,1.6,4.9,1.4,1.9,3.1,...,0.900862,0.456897,0.135776,0.648707,0.644397,True,1,Bruce Bowen-Jeff McInnis-Shane Battier,2004-05,0.506466
133,David Wesley,HOU,34.0,69.12766,2405.48656,1.470213,3.891489,1.965957,2.295745,0.5,...,0.036638,0.571121,0.422414,0.836207,0.803879,True,1,Bruce Bowen-Jeff McInnis-Shane Battier,2004-05,0.521552
109,Robert Horry,SAS,34.0,55.231405,1111.06522,1.642149,4.032231,2.009917,2.647934,2.452066,...,0.398707,0.547414,0.032328,0.855603,0.887931,True,1,Bruce Bowen-Jeff McInnis-Shane Battier,2004-05,0.521552


Choosing the top ranked and bottom ranked player for a specific stat:

In [22]:
# FG3A_merge
bar_stat = "FG3A_merge"


def get_stat_ends(bar_stat: str, comp_pool: pd.DataFrame):
    bar_ranked = comp_pool["FG3A_merge"].sort_values(ascending=False).index
    top = comp_pool.loc[bar_ranked].head(1)
    bot = comp_pool.loc[bar_ranked].tail(1)
    return top, bot


top, bot = get_stat_ends(bar_stat=bar_stat, comp_pool=comp_pool)

In [26]:
df_stat = pd.concat([rec, similarity_rank.head(2), top, bot], axis=0)
df_stat["selected_player"] = df_stat.apply(
    lambda x: x["PLAYER_NAME"] == player_name, axis=1
)
print(df_stat.columns)

Index(['PLAYER_NAME', 'TEAM_ABBREVIATION', 'AGE', 'GP_merge', 'MIN_merge',
       'FG3M_merge', 'FG3A_merge', 'FTM_merge', 'FTA_merge', 'OREB_merge',
       'DREB_merge', 'AST_merge', 'TOV_merge', 'STL_merge', 'BLK_merge',
       'BLKA_merge', 'PF_merge', 'PFD_merge', 'PTS_merge', 'PLUS_MINUS_merge',
       'FG2M_merge', 'FG2A_merge', 'GP_RANK', 'MIN_RANK', 'FG3M_RANK',
       'FG3A_RANK', 'FTM_RANK', 'FTA_RANK', 'OREB_RANK', 'DREB_RANK',
       'AST_RANK', 'TOV_RANK', 'STL_RANK', 'BLK_RANK', 'BLKA_RANK', 'PF_RANK',
       'PFD_RANK', 'PTS_RANK', 'PLUS_MINUS_RANK', 'FG2M_RANK', 'FG2A_RANK',
       'gametime_threshold', 'label_pred', 'label_names', 'season',
       'comp_rank', 'selected_player'],
      dtype='object')


In [38]:
def make_stat_bar(bar_stat: str, df_stat):
    stat_bar = (
        alt.Chart(df_stat)
        .mark_bar(width=30)
        .encode(
            y=f"{bar_stat}:Q",
            x=alt.X(
                "PLAYER_NAME:N",
                # sort=df_stat.sort_values(by=bar_stat)['PLAYER_NAME'].values,
                # sort='ascending', # sorts X-axis string vals
                sort="y",
                axis=alt.Axis(
                    labels=True,
                    title="PLAYER NAME",
                    labelAngle=-30,
                ),
            ),
            color=f"selected_player:N",
        )
        .properties(width=300)
    )
    return stat_bar


make_stat_bar(bar_stat=bar_stat, df_stat=df_stat)