In [1]:
from nba_api.stats.static import players
from nba_api.stats.endpoints import PlayerGameLog
import pandas as pd
import nba_api.stats.endpoints
import time
import random

In [2]:
draft_df = nba_api.stats.endpoints.drafthistory.DraftHistory().get_data_frames()[0]

In [3]:
draft_2003 = draft_df[draft_df["SEASON"] == "2003"]
draft_2003

Unnamed: 0,PERSON_ID,PLAYER_NAME,SEASON,ROUND_NUMBER,ROUND_PICK,OVERALL_PICK,DRAFT_TYPE,TEAM_ID,TEAM_CITY,TEAM_NAME,TEAM_ABBREVIATION,ORGANIZATION,ORGANIZATION_TYPE,PLAYER_PROFILE_FLAG
1312,2544,LeBron James,2003,1,1,1,Draft,1610612739,Cleveland,Cavaliers,CLE,Saint Vincent-Saint Mary,High School,1
1313,2545,Darko Milicic,2003,1,2,2,Draft,1610612765,Detroit,Pistons,DET,KK Vrsac (Serbia),Other Team/Club,1
1314,2546,Carmelo Anthony,2003,1,3,3,Draft,1610612743,Denver,Nuggets,DEN,Syracuse,College/University,1
1315,2547,Chris Bosh,2003,1,4,4,Draft,1610612761,Toronto,Raptors,TOR,Georgia Tech,College/University,1
1316,2548,Dwyane Wade,2003,1,5,5,Draft,1610612748,Miami,Heat,MIA,Marquette,College/University,1
1317,2549,Chris Kaman,2003,1,6,6,Draft,1610612746,Los Angeles,Clippers,LAC,Central Michigan,College/University,1
1318,2550,Kirk Hinrich,2003,1,7,7,Draft,1610612741,Chicago,Bulls,CHI,Kansas,College/University,1
1319,2551,T.J. Ford,2003,1,8,8,Draft,1610612749,Milwaukee,Bucks,MIL,Texas,College/University,1
1320,2552,Michael Sweetney,2003,1,9,9,Draft,1610612752,New York,Knicks,NYK,Georgetown,College/University,1
1321,2553,Jarvis Hayes,2003,1,10,10,Draft,1610612764,Washington,Wizards,WAS,Georgia,College/University,1


In [4]:
players_name_id = draft_2003[["PLAYER_NAME", "PERSON_ID"]]

In [5]:
players_10_games = []
for player_id, player_name in zip(players_name_id["PERSON_ID"], players_name_id["PLAYER_NAME"]):
	game_log = PlayerGameLog(player_id=player_id, season="2003-04", season_type_all_star="Regular Season")
	df = game_log.get_data_frames()[0]
	df["GAME_DATE"] = pd.to_datetime(df["GAME_DATE"])
	df_sorted = df.sort_values("GAME_DATE")
	first_10_games = df_sorted.head(10)
	if not first_10_games.empty:	# Ensuring they have played at least a game
		player_stats = {
			"player_name": player_name,
			"player_id": player_id,
			"fta": first_10_games["FTA"].sum(),
			"ftm": first_10_games["FTM"].sum(),
			"ft_pct": first_10_games["FTM"].sum() / first_10_games["FTA"].sum() if first_10_games["FTA"].sum() > 0 else 0
		}
		players_10_games.append(player_stats)
	time.sleep(random.uniform(0.5, 1.5))  # To avoid hitting rate limits
df_players_10_games = pd.DataFrame(players_10_games)

In [6]:
# removing players who didnt shoot free throws in their first 10 games
# since they dont provide meaningful data for free throw percentage
df_players_10_games = df_players_10_games[df_players_10_games["fta"] > 0]
df_players_10_games

Unnamed: 0,player_name,player_id,fta,ftm,ft_pct
0,LeBron James,2544,38,23,0.605263
1,Darko Milicic,2545,3,1,0.333333
2,Carmelo Anthony,2546,60,43,0.716667
3,Chris Bosh,2547,27,23,0.851852
4,Dwyane Wade,2548,36,24,0.666667
5,Chris Kaman,2549,17,12,0.705882
6,Kirk Hinrich,2550,22,15,0.681818
7,T.J. Ford,2551,17,15,0.882353
8,Michael Sweetney,2552,6,5,0.833333
9,Jarvis Hayes,2553,11,9,0.818182


In [7]:
df_players_10_games.to_csv("2003_players_10_games.csv", index=True)

Because each player is a binomial and their free throw variance is different we need to apply shrinkage differently to each player.

Reason for applying different shrinkages is beacause if:
Player A:
- has shot 200FT

Player B:
- has shot 20FT

it is not accurate to apply the same shrinkage to both of them

shrinkage equation

$c = 1 - \frac{(k - 3)\sigma^2}{\sum (y - \bar{y})^2}$

$z = \bar{y} + c (y - \bar{y})$

where:
- z is the JS adjusted average
- $\bar{y}$ is the "grand" average


In [8]:
import numpy as np

In [9]:
# applying james stein
# lets try with changing shrinkage c
grand_average = df_players_10_games["ft_pct"].mean()
k = len(df_players_10_games)

def js_avg(df):
    # free throws are a binomial process
	variance = df["ft_pct"] * (1-df["ft_pct"]) / df["fta"]

	shrinkage_c = 1 - ((k-3)*variance) / sum((df["ft_pct"] - grand_average)**2)
	# lowkey apply a sigmoid function here to limit shrinkage c between 0 and 1
	shrinkage_c = 1 / (1 + np.exp(shrinkage_c))
	df["variance"] = variance
	df["shrinkage_c"] = shrinkage_c
	df["js_ft_pct"] = grand_average + shrinkage_c * (df["ft_pct"] - grand_average)

	return df

df_players_10_games_js = js_avg(df_players_10_games)

In [10]:
df_players_10_games_js

Unnamed: 0,player_name,player_id,fta,ftm,ft_pct,variance,shrinkage_c,js_ft_pct
0,LeBron James,2544,38,23,0.605263,0.006287,0.29828,0.675069
1,Darko Milicic,2545,3,1,0.333333,0.074074,0.668712,0.456376
2,Carmelo Anthony,2546,60,43,0.716667,0.003384,0.284505,0.708134
3,Chris Bosh,2547,27,23,0.851852,0.004674,0.290578,0.747488
4,Dwyane Wade,2548,36,24,0.666667,0.006173,0.297729,0.693405
5,Chris Kaman,2549,17,12,0.705882,0.012212,0.32754,0.705115
6,Kirk Hinrich,2550,22,15,0.681818,0.009861,0.31575,0.697503
7,T.J. Ford,2551,17,15,0.882353,0.006106,0.297409,0.757564
8,Michael Sweetney,2552,6,5,0.833333,0.023148,0.385087,0.75426
9,Jarvis Hayes,2553,11,9,0.818182,0.013524,0.334211,0.742654


In [57]:
grand_average

np.float64(0.7047410108871327)

In [63]:
k

38

In [67]:
# lets try transforming the data a lil
transform = lambda k, ft_pct: np.sqrt(4*k) * np.arcsin(np.sqrt(ft_pct))
untransform = lambda k, transformed_value: (np.sin(transformed_value / np.sqrt(4*k)))**2

In [70]:
def js_transformed(df):
	x = transform(k, df["ft_pct"])
	grand_average = np.mean(x)
	diff = x - grand_average
	shrinkage_c = 1 - (k-3) / sum(diff**2)
	df["shrinkage_c"] = shrinkage_c

	# untransform
	js_trans_avg = grand_average + shrinkage_c * diff
	df["js_ft_pct"] = untransform(k, js_trans_avg)
	return df

df_players_10_games_js_transformed = js_transformed(df_players_10_games)

In [71]:
df_players_10_games_js_transformed

Unnamed: 0,player_name,player_id,fta,ftm,ft_pct,shrinkage_c,js_ft_pct,variance
0,LeBron James,2544,38,23,0.605263,0.928306,0.616359,0.006287
1,Darko Milicic,2545,3,1,0.333333,0.928306,0.36302,0.074074
2,Carmelo Anthony,2546,60,43,0.716667,0.928306,0.719291,0.003384
3,Chris Bosh,2547,27,23,0.851852,0.928306,0.845401,0.004674
4,Dwyane Wade,2548,36,24,0.666667,0.928306,0.673062,0.006173
5,Chris Kaman,2549,17,12,0.705882,0.928306,0.709311,0.012212
6,Kirk Hinrich,2550,22,15,0.681818,0.928306,0.687062,0.009861
7,T.J. Ford,2551,17,15,0.882353,0.928306,0.874362,0.006106
8,Michael Sweetney,2552,6,5,0.833333,0.928306,0.82795,0.023148
9,Jarvis Hayes,2553,11,9,0.818182,0.928306,0.81373,0.013524


In [78]:
grand_average

np.float64(0.7047410108871327)

im stuck.

i want the shrinkage applied to each player to be different since variances across players differ depending on the amount of free throws they take.

1. ive tried the appraoch where i calculate individual variances for players but the shrinkage becomes wild especially for players with smaller sample sizes. For example someone who is a 33% ft shooter with 3 attempts gets shrunk to 97%. players with 100% ft% remain at 100%

2. ive tried another approach where i assume constant variance across all players. shrinkage is too powerful and everyone is assumed to shoot at the grand average.

3. ive also tried arcsin transformation. shrinkage is negligible and it also assumes constant variance. 

i think the approach im gonna try next is a modification approach of 1. where i apply a weighting factor on the variance. should look at existing approaches for this