# Altair Demo

Trying out the charts here before moving to streamlit

In [1]:
import altair as alt
import pandas as pd
from pathlib import Path

alt.data_transformers.disable_max_rows()
alt.data_transformers.enable("json")

DataTransformerRegistry.enable('json')

In [2]:
fp = Path("../data/leaguedash_labelled_2018-19.pkl")
# with open(fp, "rb") as f:
#     tor = pickle.load(f)
tor = pd.read_pickle(fp)

In [3]:
fp = Path("../data/leaguedash_labelled_2004-05.pkl")
# with open(fp, "rb") as f:
#     det = pickle.load(f)
det = pd.read_pickle(fp)

## Scatterplot

Scatterplot will be a broad overview of the season's stats:

* FG2A vs FG2M, color=PTS
* FG3A vs FG3M, color=PTS
* FG3A vs FG2A, color=PFD
* AST vs TOV, 
* BLK vs STL, color=PF

In [4]:
tor.columns

Index(['PLAYER_NAME', 'TEAM_ABBREVIATION', 'AGE', 'GP_merge', 'MIN_merge',
       'FG3M_merge', 'FG3A_merge', 'FTM_merge', 'FTA_merge', 'OREB_merge',
       'DREB_merge', 'AST_merge', 'TOV_merge', 'STL_merge', 'BLK_merge',
       'BLKA_merge', 'PF_merge', 'PFD_merge', 'PTS_merge', 'PLUS_MINUS_merge',
       'FG2M_merge', 'FG2A_merge', 'GP_RANK', 'MIN_RANK', 'FG3M_RANK',
       'FG3A_RANK', 'FTM_RANK', 'FTA_RANK', 'OREB_RANK', 'DREB_RANK',
       'AST_RANK', 'TOV_RANK', 'STL_RANK', 'BLK_RANK', 'BLKA_RANK', 'PF_RANK',
       'PFD_RANK', 'PTS_RANK', 'PLUS_MINUS_RANK', 'FG2M_RANK', 'FG2A_RANK',
       'gametime_threshold', 'label_pred', 'label_names'],
      dtype='object')

In [5]:
tor.groupby(by="label_pred").agg("count")

Unnamed: 0_level_0,PLAYER_NAME,TEAM_ABBREVIATION,AGE,GP_merge,MIN_merge,FG3M_merge,FG3A_merge,FTM_merge,FTA_merge,OREB_merge,...,BLK_RANK,BLKA_RANK,PF_RANK,PFD_RANK,PTS_RANK,PLUS_MINUS_RANK,FG2M_RANK,FG2A_RANK,gametime_threshold,label_names
label_pred,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,121,121,121,121,121,121,121,121,121,121,...,121,121,121,121,121,121,121,121,121,121
1,12,12,12,12,12,12,12,12,12,12,...,12,12,12,12,12,12,12,12,12,12
2,397,397,397,397,397,397,397,397,397,397,...,397,397,397,397,397,397,397,397,397,397


In [6]:
# from example gallery
# https://altair-viz.github.io/gallery/scatter_linked_table.html

src = tor[tor["gametime_threshold"]][tor["label_pred"] == 2]
brush = alt.selection(type="interval")

# scatterplot
points = (
    alt.Chart(src)
    .mark_point()
    .encode(
        x="FG3A_merge:Q",
        y="FG2A_merge:Q",
        color=alt.condition(brush, "PTS_merge:Q", alt.value("grey")),
    )
    .add_selection(brush)
)
# points

# base chart for data tables
ranked_text = (
    alt.Chart(src)
    .mark_text()
    .encode(y=alt.Y("row_number:O", axis=None))
    .transform_window(row_number="row_number()")
    .transform_filter(brush)
    .transform_window(rank="rank(row_number)")
    .transform_filter(alt.datum.rank < 20)
)

# encoding our data table onto the base
player_name = ranked_text.encode(text="PLAYER_NAME:N").properties(title="Name")
team = ranked_text.encode(text="TEAM_ABBREVIATION:N").properties(title="Team")
pts = ranked_text.encode(text="PTS_merge:Q").properties(title="Points")
text = alt.hconcat(player_name, team, pts)

# build chart
alt.hconcat(
    points,
    text,
).resolve_legend(color="independent")

  src = tor[tor["gametime_threshold"]][tor["label_pred"] == 2]


The click and drag selection feature is kind of amazing. Perhaps I can further categorize by the cluster labels?

Use `st.checkbox('label_')` for user to select which labels to plot, and perhaps encode via shape, if multiple labels are selected

## Violinplot

Showcase the distribution of each stat for the two seasons in a shotgun array. Each subplot will have two violins, one for each season.

Let's plot FG2A and FG3A

In [7]:
tor["season"] = tor.apply(lambda x: "2018-19", axis=1)
det["season"] = det.apply(lambda x: "2004-05", axis=1)
src = pd.concat([tor, det], axis=0)
src.sample(10)

Unnamed: 0_level_0,PLAYER_NAME,TEAM_ABBREVIATION,AGE,GP_merge,MIN_merge,FG3M_merge,FG3A_merge,FTM_merge,FTA_merge,OREB_merge,...,PF_RANK,PFD_RANK,PTS_RANK,PLUS_MINUS_RANK,FG2M_RANK,FG2A_RANK,gametime_threshold,label_pred,label_names,season
PLAYER_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
202083,Wesley Matthews,IND,32.0,62.246753,1886.09658,2.548052,6.848052,2.154545,2.602597,0.537662,...,0.711321,0.477358,0.518868,0.539623,0.832075,0.803774,True,2,Brook Lopez-James Harden-Pascal Siakam,2018-19
2592,James Jones,IND,24.0,59.039604,1036.590545,1.774257,4.448515,1.219802,1.674257,1.048515,...,0.648707,0.797414,0.721983,0.31681,0.950431,0.961207,True,2,Brook Lopez-James Harden-Pascal Siakam,2004-05
692,Andrew DeClercq,ORL,32.0,8.0,50.628333,0.0,0.0,0.7,2.1,5.0,...,0.015086,0.657328,0.93319,0.920259,0.773707,0.810345,False,0,Derrick Favors-Clint Capela-Myles Turner,2004-05
1628424,Kobi Simmons,CLE,21.0,1.0,1.8,0.0,0.0,0.0,0.0,0.0,...,0.971698,0.967925,0.998113,0.003774,0.996226,0.984906,False,1,Vincent Edwards-Okaro White-Gary Clark,2018-19
201571,D.J. Augustin,ORL,31.0,72.648352,2035.096099,2.154945,4.954945,2.976923,3.387912,0.656044,...,0.928302,0.337736,0.413208,0.34717,0.675472,0.713208,True,2,Brook Lopez-James Harden-Pascal Siakam,2018-19
1628605,Dusty Hannahs,MEM,25.0,2.0,25.916667,0.0,6.9,2.8,2.8,0.0,...,0.950943,0.869811,0.796226,0.958491,0.411321,0.283019,False,2,Brook Lopez-James Harden-Pascal Siakam,2018-19
1626220,Royce O'Neale,UTA,26.0,73.630435,1504.375109,1.565217,4.03913,0.71087,0.91087,0.5,...,0.326415,0.903774,0.875472,0.19434,0.813208,0.881132,True,2,Brook Lopez-James Harden-Pascal Siakam,2018-19
990,Malik Rose,NYK,30.0,76.0,1481.77,0.0,0.2,3.3,4.5,3.1,...,0.19181,0.232759,0.428879,0.338362,0.306034,0.282328,True,0,Derrick Favors-Clint Capela-Myles Turner,2004-05
200794,Paul Millsap,DEN,34.0,54.0,1487.190714,1.042857,2.971429,3.457143,4.7,2.585714,...,0.375472,0.124528,0.307547,0.092453,0.30566,0.318868,True,2,Brook Lopez-James Harden-Pascal Siakam,2018-19
203518,Alex Abrines,OKC,25.0,31.0,588.273333,2.5,7.8,0.7,0.8,0.3,...,0.498113,0.703774,0.85283,0.266038,0.954717,0.969811,False,2,Brook Lopez-James Harden-Pascal Siakam,2018-19


In [8]:
def make_longform(df):
    """Altair prefers longform structures in its grammar"""
    player_bios = [
        "PLAYER_NAME",
        "TEAM_ABBREVIATION",
        "season",
        "label_names",
        "gametime_threshold",
    ]
    longform = df[df["gametime_threshold"]].melt(
        id_vars=player_bios,
        value_vars=df.drop(player_bios, axis="columns").columns,
        var_name="variable",
        value_name="value",
        ignore_index=False,
    )
    longform = longform.set_index("season", append=True)
    # longform.set_index()
    return longform


src_long = make_longform(src)
src_long.sample(5)

# src_long.set_index('season', append=True).xs("2018-19", level="season").head()
# src_long.set_index('season', append=True).loc[(slice(None) ,"2018-19"),:].head()

Unnamed: 0_level_0,Unnamed: 1_level_0,PLAYER_NAME,TEAM_ABBREVIATION,label_names,gametime_threshold,variable,value
PLAYER_ID,season,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2571,2004-05,Leandro Barbosa,PHX,Brook Lopez-James Harden-Pascal Siakam,True,FG2A_merge,7.293103
203087,2018-19,Jeremy Lamb,CHA,Brook Lopez-James Harden-Pascal Siakam,True,PTS_merge,19.3
1628397,2018-19,Ivan Rabb,MEM,Derrick Favors-Clint Capela-Myles Turner,True,OREB_RANK,0.111321
1626144,2018-19,Emmanuel Mudiay,NYK,Brook Lopez-James Harden-Pascal Siakam,True,MIN_merge,1607.061667
101139,2018-19,CJ Miles,MEM,Brook Lopez-James Harden-Pascal Siakam,True,DREB_RANK,0.758491


In [9]:
vars = ["FG2A_merge", "FG3A_merge", "PFD_merge"]
# src_a = tor[tor["gametime_threshold"]]
# src_b = det[det["gametime_threshold"]]


def make_violins(df, var, gametime_threshold: bool = True):
    """Make violin charts of var stat grouped by seasons

    Parameters:
    -----------

    df: dataframe, wide-format

    var: str
        column name for which the values will be density transformed

    Returns:
    ---------

    violin: alt.Chart() object
    """
    # use .facet()
    if gametime_threshold:
        df = df.loc[df["gametime_threshold"]]

    # hover = alt.selection_single(on="mouseover", nearest=True, empty="none")

    base = (
        alt.Chart(df)
        .transform_density(
            density=var,
            as_=[var, "density"],
            groupby=["season"],  # don't put :N here
        )
        .mark_area(orient="horizontal")
        .encode(
            y=f"{var}:Q",
            color="season:N",
            x=alt.X(
                "density:Q",
                stack="center",
                impute=None,
                title=None,
                axis=alt.Axis(labels=False, values=[0], grid=False, ticks=False),
            ),
            column=alt.Column(
                "season:N",
                header=alt.Header(
                    title=None,
                    labels=False,  # use color legend to define color
                    # titleOrient="bottom",
                    # labelAnchor="end",
                    # labelOrient="bottom",
                    # labelAngle=-30,
                    # labelPadding=0,
                ),
            ),
        )
        .properties(width=80)
    )

    return base

In [10]:
merge_stats = [stat for stat in src.columns if "merge" in stat][:8]
violins = {stat: make_violins(df=src, var=stat) for stat in merge_stats}
# violin_fg3 = make_violins(src, "FG3A_merge")
# violin_fg2 = make_violins(src, "FG2A_merge")
# violin_fg3 | violin_fg2
chart = alt.vconcat()
while violins:
    rows = alt.hconcat()
    for i in range(4):
        if violins:
            rows |= violins.popitem()[1]
    chart &= rows

chart

In [11]:
violin_stat_filter = src_long.apply(lambda x: "merge" in x["variable"], axis=1)
src_long[violin_stat_filter]

Unnamed: 0_level_0,Unnamed: 1_level_0,PLAYER_NAME,TEAM_ABBREVIATION,label_names,gametime_threshold,variable,value
PLAYER_ID,season,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
203932,2018-19,Aaron Gordon,ORL,Brook Lopez-James Harden-Pascal Siakam,True,GP_merge,69.704545
1628988,2018-19,Aaron Holiday,IND,Brook Lopez-James Harden-Pascal Siakam,True,GP_merge,44.964286
1627846,2018-19,Abdel Nader,OKC,Brook Lopez-James Harden-Pascal Siakam,True,GP_merge,55.805970
201143,2018-19,Al Horford,BOS,Brook Lopez-James Harden-Pascal Siakam,True,GP_merge,55.651163
202329,2018-19,Al-Farouq Aminu,POR,Brook Lopez-James Harden-Pascal Siakam,True,GP_merge,62.592920
...,...,...,...,...,...,...,...
2216,2004-05,Zach Randolph,POR,Brook Lopez-James Harden-Pascal Siakam,True,FG2A_merge,16.500000
2560,2004-05,Zarko Cabarkapa,GSW,Brook Lopez-James Harden-Pascal Siakam,True,FG2A_merge,10.700000
2585,2004-05,Zaza Pachulia,MIL,Derrick Favors-Clint Capela-Myles Turner,True,FG2A_merge,9.100000
1442,2004-05,Zeljko Rebraca,LAC,Derrick Favors-Clint Capela-Myles Turner,True,FG2A_merge,9.100000


In [12]:
violin_stat_filter.head(5)

PLAYER_ID  season 
203932     2018-19    False
1628988    2018-19    False
1627846    2018-19    False
201143     2018-19    False
202329     2018-19    False
dtype: bool

## Bargraphs for direct player comps

When user chooses the player, season A, and season B, the model will return three most similar players from season B according to the label.

In addition to just showing the names, we'll also visualize the similarities via bar graph of each basic stat with the chosen players and the players identified by the model.

To give context for those values, the top and bottom measurements for those stats will also be displayed, from both seasons, according to player label. 

For example, choosing 2018-19 Fred Vanvleet may return 2004-05 players like Derek Fisher and Mo pete. Come time for bar graph visualization of assists, those players will be displayed, as well as the top players like Steve Nash or Chauncey Billups, and the bottom ranked, to show the range

In [28]:
src["comp_rank"] = src["PLUS_MINUS_RANK"] + src["MIN_RANK"]
src["selected_player"] = src["PLAYER_NAME"].apply(lambda x: x == player_name)
player_name = "Fred VanVleet"
rec = src.loc[src["PLAYER_NAME"] == player_name]
# returns a pd.Series of len 1
player_label = rec["label_pred"].values[0]

# choosing similar players from season_b
comp_pool = src[
    (src["season"] == "2004-05")
    & (src["label_pred"] == player_label)
    & (src["gametime_threshold"])
]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  comp_pool["selected_player"] = comp_pool["PLAYER_NAME"].apply(


To find the most similarly ranked player given the `comp_rank` value, calculate a new series, `comp_pool['comp_rank'] - rec['comp_rank']`, sort the resulting deltas in ascending order, and take the top three

In [31]:
similarity_index = (
    (comp_pool["comp_rank"] - rec["comp_rank"].values)
    .abs()
    .sort_values(ascending=True)
    .index
)
similars = comp_pool.loc[similarity_index].head(2)

Choosing the top ranked and bottom ranked player for a specific stat:

In [16]:
# FG3A_merge
bar_stat = "FG3A_merge"


def get_stat_ends(bar_stat: str, comp_pool: pd.DataFrame):
    bar_ranked = comp_pool[bar_stat].sort_values(ascending=False).index
    top = comp_pool.loc[bar_ranked].head(1)
    bot = comp_pool.loc[bar_ranked].tail(1)
    return top, bot


top, bot = get_stat_ends(bar_stat=bar_stat, comp_pool=comp_pool)

In [17]:
df_stat = pd.concat([rec, similars, top, bot], axis=0)
print(df_stat.columns)

Index(['PLAYER_NAME', 'TEAM_ABBREVIATION', 'AGE', 'GP_merge', 'MIN_merge',
       'FG3M_merge', 'FG3A_merge', 'FTM_merge', 'FTA_merge', 'OREB_merge',
       'DREB_merge', 'AST_merge', 'TOV_merge', 'STL_merge', 'BLK_merge',
       'BLKA_merge', 'PF_merge', 'PFD_merge', 'PTS_merge', 'PLUS_MINUS_merge',
       'FG2M_merge', 'FG2A_merge', 'GP_RANK', 'MIN_RANK', 'FG3M_RANK',
       'FG3A_RANK', 'FTM_RANK', 'FTA_RANK', 'OREB_RANK', 'DREB_RANK',
       'AST_RANK', 'TOV_RANK', 'STL_RANK', 'BLK_RANK', 'BLKA_RANK', 'PF_RANK',
       'PFD_RANK', 'PTS_RANK', 'PLUS_MINUS_RANK', 'FG2M_RANK', 'FG2A_RANK',
       'gametime_threshold', 'label_pred', 'label_names', 'season',
       'comp_rank', 'selected_player'],
      dtype='object')


In [54]:
def make_stat_bar(bar_stat: str, player_stat, similars, comparison_pool: pd.DataFrame):
    bar_ranked = comparison_pool[bar_stat].sort_values(ascending=False).index
    top = comparison_pool.loc[bar_ranked].head(1)
    bot = comparison_pool.loc[bar_ranked].tail(1)
    df_stat = pd.concat([player_stat, similars, top, bot], axis=0)
    stat_bar = (
        alt.Chart(df_stat, title=bar_stat)
        .mark_bar(width=30)
        .encode(
            y=alt.Y(
                f"{bar_stat}:Q",
                axis=alt.Axis(
                    title=None,
                ),
            ),
            x=alt.X(
                "PLAYER_NAME:N",
                # sort=df_stat.sort_values(by=bar_stat)['PLAYER_NAME'].values,
                # sort='ascending', # sorts X-axis string vals
                sort="y",
                axis=alt.Axis(
                    labels=True,
                    title=None,
                    labelAngle=-30,
                ),
            ),
            color=alt.Color(f"selected_player:N", legend=None),
        )
        .properties(width=300)
    )
    return stat_bar


make_stat_bar(bar_stat, rec, similars, comparison_pool=comp_pool)

In [51]:
bars = {
    stat: make_stat_bar(stat, rec, similars, comparison_pool=comp_pool)
    for stat in merge_stats
}

In [34]:
def make_chart_arrays(charts: dict, title: str, rowlen: int = 4):
    """Given a dict of altair charts, display in an array"""
    base = alt.vconcat(title=title)
    while charts:
        rows = alt.hconcat()
        for _ in range(rowlen):
            if charts:
                rows |= charts.popitem()[1]
        base &= rows

    return base

In [52]:
charts = make_chart_arrays(bars, "Bar title", rowlen=3)

In [53]:
charts