In [None]:
import pandas as pd
import numpy as np

# Turn off pandas' chained-assignment warning for the rest of the notebook.
pd.set_option("mode.chained_assignment", None)

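The data was downloaded from [Kaggle](https://www.kaggle.com/nathanlauga/nba-games?select=games_details.csv). Note that this link points to `games_details.csv`, while the notebook reads `Seasons_Stats.csv` below (TODO: confirm the correct source).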

In [None]:
# Location of the per-season player statistics CSV read by this notebook.
SEASONS_STATS_CSV = "~/Downloads/Seasons_Stats.csv"

player_stats = pd.read_csv(SEASONS_STATS_CSV)

Since we are only interested in 3-point data for this analysis, we select only those columns from the dataframe.

In [None]:
# Keep the player/season identifiers plus the two 3-point columns used below.
three_point_columns = ["Player", "Year", "3P%", "3PA"]
three_point_data = player_stats[three_point_columns]

# Show the first rows as plain text.
preview = three_point_data.head()
print(preview.to_string())

We then filter out players where 3-point data is not available (i.e. **3P%** column is set to `NaN`). This is possibly because there was no 3-point line at the time or simply because the player did not attempt any 3-pointers.

In [None]:
# Drop rows with no 3P% value (no 3-point line yet, or no 3-point attempts),
# then order the remaining rows by player, season and percentage.
rows_with_pct = three_point_data.dropna(subset=["3P%"])
three_point_data = rows_with_pct.sort_values(by=["Player", "Year", "3P%"])

print(three_point_data.head().to_string())

We then group the 3-point data by player and season, since we want to understand how a given player's 3-point percentage changed from one season to the next.

In [None]:
# Collapse to one row per (Player, Year): total attempts and mean percentage.
# NOTE(review): when a player has several rows for the same season, 3PA is
# summed across them and 3P% is an unweighted mean of the rows. Confirm this
# is the intended treatment.
per_player_season = three_point_data.groupby(["Player", "Year"])
three_point_data = per_player_season.agg(
    total_atp_3=("3PA", "sum"),
    avg_3_pct=("3P%", "mean"),
)

print(three_point_data.head().to_string())

We set a minimum number of 3-point attempts (more than 200 in a season) so that seasons with too few attempts do not show up as outliers.

In [None]:
# A season must have strictly more than this many 3-point attempts to be kept.
MIN_THREE_POINT_ATTEMPTS = 200

has_enough_attempts = three_point_data["total_atp_3"] > MIN_THREE_POINT_ATTEMPTS
filtered_three_point_data = three_point_data[has_enough_attempts]

print(filtered_three_point_data.head().to_string())

We then compare each player's current 3-point percentage to their percentage in the previous year.
NOTE: This can likely be improved upon from an efficiency and readability standpoint.

In [None]:
# Change in 3-point percentage versus the previous row, in percentage points
# and rounded to 4 decimals. The previous row can belong to a different
# player; those rows are dealt with in a later cell.
season_pct = filtered_three_point_data["avg_3_pct"]
row_over_row_change = (season_pct.diff() * 100).round(4)

# Attach the new column and turn the (Player, Year) index back into columns.
filtered_three_point_data = filtered_three_point_data.assign(
    diff=row_over_row_change
).reset_index()

print(filtered_three_point_data.head().to_string())

Since the data is sorted by player and year, there are instances where we are comparing 3-point percentages for 2 different players - in those cases, I set the **diff** column value to `NaN`.

In [None]:
# True where the preceding row belongs to a different player (this includes
# the very first row), i.e. where "diff" compared two different players.
previous_player = filtered_three_point_data["Player"].shift(1)
mask = filtered_three_point_data["Player"] != previous_player

# Blank out those comparisons with one .loc assignment. The chained form
# `df["diff"][mask] = ...` depends on the intermediate Series being a view of
# the frame and does not update the frame under pandas Copy-on-Write.
filtered_three_point_data.loc[mask, "diff"] = np.nan

# NOTE(review): a player's consecutive rows are not necessarily consecutive
# years (seasons below the attempts threshold were filtered out), so "diff"
# can still span a gap of several seasons. Confirm whether that is acceptable.
print(filtered_three_point_data.head().to_string())

Finally some results! First, I was curious to know who had the biggest drop from the 3-point line from one year to the next. Interestingly, Toni Kukoc had a big drop from 1999 to 2000 but one of the biggest jumps in history from 2000 to 2001.

In [None]:
# One group per player-season row; the only column left to aggregate is "diff".
season_keys = ["Player", "Year", "total_atp_3", "avg_3_pct"]
per_season_min = filtered_three_point_data.groupby(season_keys).min().reset_index()

# Largest year-over-year drops first.
min_player_change = per_season_min.sort_values(by="diff", ascending=True)

print(min_player_change.head().to_string())

Here are the top 5 players with the largest one-season increase from the 3-point line.

In [None]:
# Minimum season 3-point percentage for a season to be listed as an improvement.
MIN_AVG_3_PCT = 0.35

# One group per player-season row, ordered with the largest increases first.
# (This cell previously contained the same computation and print twice; it is
# now done once.)
season_keys = ["Player", "Year", "total_atp_3", "avg_3_pct"]
max_player_change = (
    filtered_three_point_data.groupby(season_keys)
    .max()
    .reset_index()
    .sort_values(by="diff", ascending=False)
)

# Keep seasons at or above the percentage threshold that have a computed diff.
# Test for missing values explicitly: using the float column itself as a
# boolean mask relies on implicit coercion and also drops rows whose diff is
# exactly 0.
is_good_shooter = max_player_change["avg_3_pct"] >= MIN_AVG_3_PCT
has_diff = max_player_change["diff"].notna()
max_player_change = max_player_change[is_good_shooter & has_diff].reset_index()

print(max_player_change.head().to_string())

In [None]:
# NOTE(review): this cell repeats the earlier "biggest drop" cell verbatim;
# consider removing one of the two.
# One group per player-season row; the only column left to aggregate is "diff".
season_keys = ["Player", "Year", "total_atp_3", "avg_3_pct"]
per_season_min = filtered_three_point_data.groupby(season_keys).min().reset_index()

# Largest year-over-year drops first.
min_player_change = per_season_min.sort_values(by="diff", ascending=True)

print(min_player_change.head().to_string())

Here are the top 5 players with the largest one-season increase from the 3-point line.

In [None]:
# NOTE(review): this cell repeats the earlier "largest increase" cell;
# consider removing one of the two.
# Minimum season 3-point percentage for a season to be listed as an improvement.
MIN_AVG_3_PCT = 0.35

# One group per player-season row, ordered with the largest increases first.
season_keys = ["Player", "Year", "total_atp_3", "avg_3_pct"]
max_player_change = (
    filtered_three_point_data.groupby(season_keys)
    .max()
    .reset_index()
    .sort_values(by="diff", ascending=False)
)

# Keep seasons at or above the percentage threshold that have a computed diff.
# Test for missing values explicitly: using the float column itself as a
# boolean mask relies on implicit coercion and also drops rows whose diff is
# exactly 0.
is_good_shooter = max_player_change["avg_3_pct"] >= MIN_AVG_3_PCT
has_diff = max_player_change["diff"].notna()
max_player_change = max_player_change[is_good_shooter & has_diff].reset_index()

print(max_player_change.head().to_string())