In [1]:
import pandas as pd
import numpy as np

import plotly.graph_objects as go
from plotly import subplots

# Turn off pandas' chained-assignment check (SettingWithCopyWarning) for the
# whole notebook; None means "neither warn nor raise".
pd.options.mode.chained_assignment = None

The data was downloaded from [Kaggle](https://www.kaggle.com/nathanlauga/nba-games?select=games_details.csv)

In [2]:
# Path of the CSV that holds the per-player season statistics.
season_stats_csv = "~/Downloads/Seasons_Stats.csv"
player_stats = pd.read_csv(season_stats_csv)

Since we are only interested in 3-point data for this analysis, we select only those columns from the dataframe.

In [3]:
# Player/season identifiers plus the two 3-point columns used in the analysis.
three_point_columns = ["Player", "Year", "3P%", "3PA"]
three_point_data = player_stats[three_point_columns]

We then filter out players where 3-point data is not available (i.e. **3P%** column is set to `NaN`). This is possibly because there was no 3-point line at the time or simply because the player did not attempt any 3-pointers.

In [4]:
# Keep only rows that carry a 3P% value (NaN means no 3PT data is available),
# then order the rows by player, season and percentage.
has_three_point_pct = three_point_data["3P%"].notna()
three_point_data = three_point_data[has_three_point_pct].sort_values(
    by=["Player", "Year", "3P%"]
)

We then group the 3-point data by player and season, since we want to understand how a given player's 3-point percentage changed from one season to the next.

In [5]:
# Collapse to one row per (Player, Year): attempts are summed and the
# percentage is a plain (unweighted) mean of the rows in the group.
# NOTE(review): if a player-season can span several rows with very different
# attempt counts, an attempt-weighted percentage may be preferable - confirm.
three_point_data = three_point_data.groupby(["Player", "Year"]).agg(
    total_atp_3=("3PA", "sum"),
    avg_3_pct=("3P%", "mean"),
)

We set a minimum number of 3-pointers attempted (more than 200 in a season) so that small-sample outliers are excluded.

In [6]:
# A player-season is kept only when its total 3-point attempts exceed this value.
min_three_point_attempts = 200
enough_attempts = three_point_data["total_atp_3"] > min_three_point_attempts
filtered_three_point_data = three_point_data[enough_attempts]

We then compare each player's current 3-point percentage to the same percentage in the previous year.
NOTE: This can likely be improved upon from an efficiency and readability standpoint.

In [7]:
# Change in 3P% versus the player's previous qualifying season, expressed in
# percentage points and rounded to 4 decimals. The difference is taken within
# each player (index level "Player"), so a player's first row is NaN instead of
# being compared against the last row of the preceding player.
# NOTE(review): "previous" means the previous row that passed the attempts
# filter; it is not necessarily the immediately preceding calendar year.
three_point_delta = (
    filtered_three_point_data.groupby(level="Player")["avg_3_pct"].diff() * 100
).round(4)

# assign() builds a new frame rather than writing a column into a filtered
# slice of three_point_data; reset_index() turns Player/Year back into columns.
filtered_three_point_data = filtered_three_point_data.assign(
    diff=three_point_delta
).reset_index()

Since the data is sorted by player and year, there are instances where we are comparing 3-point percentages for 2 different players - in those cases, I set the **diff** column value to `None`.

In [8]:
# True on every row whose player differs from the player on the row above,
# i.e. the first row of each player (the very first row compares against NaN
# and is therefore True as well).
previous_player = filtered_three_point_data["Player"].shift(1)
mask = filtered_three_point_data["Player"] != previous_player

# Blank out differences that were computed across two different players.
# A single .loc call writes into the frame itself; the chained form
# df["diff"][mask] = ... writes into a temporary object and is not guaranteed
# to update the frame (it never does under pandas Copy-on-Write).
filtered_three_point_data.loc[mask, "diff"] = np.nan

Finally some results! First, I was curious to know who had the biggest drop from the 3-point line from one year to the next. Interestingly, Toni Kukoc had a big drop from 1999 to 2000 but one of the biggest jumps in history from 2000 to 2001.

In [9]:
# Rank player-seasons from the largest drop in 3P% to the largest gain.
# Grouping on every column except "diff" and taking the minimum leaves one
# "diff" value per key combination; rows with NaN "diff" sort to the end.
season_keys = ["Player", "Year", "total_atp_3", "avg_3_pct"]
lowest_diff_per_season = (
    filtered_three_point_data.groupby(season_keys).min().reset_index()
)
ranked_by_drop = lowest_diff_per_season.sort_values(by="diff", ascending=True)

# The final reset_index() renumbers the rows 0..n-1 in ranked order and keeps
# the pre-sort position in an "index" column.
min_player_change = ranked_by_drop.reset_index()

Here are the top 5 players with the largest one-season increase from the 3-point line.

In [10]:
# Rank player-seasons from the largest gain in 3P% to the largest drop.
# Grouping on every column except "diff" and taking the maximum leaves one
# "diff" value per key combination.
season_keys = ["Player", "Year", "total_atp_3", "avg_3_pct"]
max_player_change = (
    filtered_three_point_data.groupby(season_keys)
    .max()
    .reset_index()
    .sort_values(by="diff", ascending=False)
)

# Only seasons in which the player shot at least 35% from three.
shot_at_least_35_pct = max_player_change["avg_3_pct"] >= 0.35

# The original mask was `(...) & (max_player_change["diff"])`, which relied on
# pandas coercing the float column to booleans (NaN -> False, 0 -> False,
# anything else -> True). The same rows are selected here, but explicitly.
has_nonzero_diff = max_player_change["diff"].notna() & max_player_change["diff"].ne(0)

# reset_index() renumbers the rows 0..n-1 in ranked order and keeps the
# pre-filter position in an "index" column.
max_player_change = max_player_change[
    shot_at_least_35_pct & has_nonzero_diff
].reset_index()

We then visualize the top 9 players in a 3 by 3 chart with the year of their largest increase in brackets in the chart title.

In [11]:
# Grid of bar charts, one per player, for the players with the largest
# one-season increase in 3P%. Each chart shows the player's 3P% for every
# season in three_point_data; the title carries the season of the big change.
player_counter = 9
grid_cols = 3
# Ceiling division: the grid always has enough rows for player_counter charts,
# and the column count matches the placement arithmetic below.
grid_rows = -(-player_counter // grid_cols)

# Take the leading rows by position so the loop does not depend on index labels.
top_players = max_player_change.head(player_counter)

names = tuple(
    f"{player} ({int(year)})"
    for player, year in zip(top_players["Player"], top_players["Year"])
)

fig = subplots.make_subplots(rows=grid_rows, cols=grid_cols, subplot_titles=names)

for position, player in enumerate(top_players["Player"]):
    # Fill the grid left to right, top to bottom (plotly rows/cols are 1-based).
    row_offset, col_offset = divmod(position, grid_cols)

    # All seasons of this player, with Player/Year restored as columns.
    trace_data = three_point_data[
        three_point_data.index.get_level_values("Player") == player
    ].reset_index()

    trace = go.Bar(x=trace_data["Year"].tolist(), y=trace_data["avg_3_pct"].tolist())

    # add_trace(row=, col=) is the supported replacement for append_trace.
    fig.add_trace(trace, row=row_offset + 1, col=col_offset + 1)

fig.update_layout(height=1000, width=1000, showlegend=False)
fig.write_image("../images/nba_analysis_1.jpeg")

In [12]:
# Grid of bar charts, one per player, for the players with the largest
# one-season drop in 3P%. Each chart shows the player's 3P% for every season
# in three_point_data; the title carries the season of the big change.
player_counter = 9
grid_cols = 3
# Ceiling division: the grid always has enough rows for player_counter charts,
# and the column count matches the placement arithmetic below.
grid_rows = -(-player_counter // grid_cols)

# Take the leading rows by position so the loop does not depend on index labels.
bottom_players = min_player_change.head(player_counter)

names = tuple(
    f"{player} ({int(year)})"
    for player, year in zip(bottom_players["Player"], bottom_players["Year"])
)

fig = subplots.make_subplots(rows=grid_rows, cols=grid_cols, subplot_titles=names)

for position, player in enumerate(bottom_players["Player"]):
    # Fill the grid left to right, top to bottom (plotly rows/cols are 1-based).
    row_offset, col_offset = divmod(position, grid_cols)

    # All seasons of this player, with Player/Year restored as columns.
    trace_data = three_point_data[
        three_point_data.index.get_level_values("Player") == player
    ].reset_index()

    trace = go.Bar(x=trace_data["Year"].tolist(), y=trace_data["avg_3_pct"].tolist())

    # add_trace(row=, col=) is the supported replacement for append_trace.
    fig.add_trace(trace, row=row_offset + 1, col=col_offset + 1)

fig.update_layout(height=1000, width=1000, showlegend=False)
fig.write_image("../images/nba_analysis_2.jpeg")