# Exploratory Analysis

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import datetime

In [2]:
os.chdir('c:/Users/tyler/OneDrive/Documents/Python/NBA')
os.getcwd()

'c:\\Users\\tyler\\OneDrive\\Documents\\Python\\NBA'

### Load Data

In [3]:
df = pd.read_csv('backend\data\details\game_details.csv', index_col = 0)

In [4]:
df.head()

Unnamed: 0,date,visitor,home,team,starter,player,mp,fg,fga,fg_perc,...,orb,drb,trb,ast,stl,blk,tov,pf,pts,plus_minus
0,"Tue, Oct 31, 2006",Chicago Bulls,Miami Heat,0,1.0,Kirk Hinrich,34:38,10.0,18.0,0.556,...,0.0,4.0,4.0,3.0,2.0,0.0,0.0,2.0,26.0,23.0
1,"Tue, Oct 31, 2006",Chicago Bulls,Miami Heat,0,1.0,Ben Wallace,28:29,2.0,5.0,0.4,...,6.0,5.0,11.0,1.0,0.0,1.0,0.0,2.0,5.0,13.0
2,"Tue, Oct 31, 2006",Chicago Bulls,Miami Heat,0,1.0,Luol Deng,24:07,4.0,9.0,0.444,...,1.0,1.0,2.0,1.0,1.0,0.0,3.0,2.0,12.0,8.0
3,"Tue, Oct 31, 2006",Chicago Bulls,Miami Heat,0,1.0,Ben Gordon,22:56,1.0,9.0,0.111,...,0.0,1.0,1.0,2.0,1.0,0.0,2.0,3.0,6.0,7.0
4,"Tue, Oct 31, 2006",Chicago Bulls,Miami Heat,0,1.0,P.J. Brown,18:23,1.0,2.0,0.5,...,0.0,3.0,3.0,1.0,1.0,1.0,2.0,3.0,4.0,15.0


In [5]:
df.columns

Index(['date', 'visitor', 'home', 'team', 'starter', 'player', 'mp', 'fg',
       'fga', 'fg_perc', '3p', '3pa', '3p_perc', 'ft', 'fta', 'ft_perc', 'orb',
       'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts', 'plus_minus'],
      dtype='object')

Drop duplicates and NA's

In [6]:
df.drop_duplicates(subset = ['date', 'visitor', 'home', 'team', 'player'], inplace = True)

In [7]:
df.dropna(subset = ['player'], inplace = True)

In [8]:
df.isna().sum()

date               0
visitor            0
home               0
team               0
starter            0
player             0
mp                 0
fg                 0
fga                0
fg_perc        22553
3p                 0
3pa                0
3p_perc       155661
ft                 0
fta                0
ft_perc       186618
orb                0
drb                0
trb                0
ast                0
stl                0
blk                0
tov                0
pf                 0
pts                0
plus_minus        75
dtype: int64

Clean data

In [9]:
df['fg_perc'] = df['fg'] / df['fga']
df['3p_perc'] = df['3p'] / df['3pa']
df['ft_perc'] = df['ft'] / df['fta']
df['min'] = df['mp'].apply(lambda x: int(x.split(':')[0]))
df['sec'] = df['mp'].apply(lambda x: int(x.split(':')[-1]))
df['pt'] = df['min'] + df['sec'] / 60
df['date'] = pd.to_datetime(df['date'])
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['player'] = df['player'].apply(lambda x: x.strip())

In [10]:
df.describe()

Unnamed: 0,team,starter,fg,fga,fg_perc,3p,3pa,3p_perc,ft,fta,...,blk,tov,pf,pts,plus_minus,min,sec,pt,year,month
count,531060.0,531060.0,531060.0,531060.0,419261.0,531060.0,531060.0,286153.0,531060.0,531060.0,...,531060.0,531060.0,531060.0,531060.0,530985.0,531060.0,531060.0,531060.0,531060.0,531060.0
mean,0.500616,0.398674,3.064236,6.700768,0.442902,0.699271,1.956086,0.325678,1.427577,1.871369,...,0.385793,1.085922,1.63647,8.25532,-2.4e-05,18.868209,24.395987,19.274809,2014.489481,5.727396
std,0.5,0.489626,3.12737,6.042869,0.23724,1.203178,2.575641,0.304963,2.238496,2.750662,...,0.801036,1.369826,1.544867,8.366175,9.831873,13.244989,19.279683,13.362198,4.757105,4.352128
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,-57.0,0.0,0.0,0.0,2006.0,1.0
25%,0.0,0.0,0.0,1.0,0.318182,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,-5.0,6.0,5.0,6.5,2010.0,2.0
50%,1.0,0.0,2.0,6.0,0.45,0.0,1.0,0.333333,0.0,0.0,...,0.0,1.0,1.0,6.0,0.0,20.0,23.0,20.45,2015.0,4.0
75%,1.0,1.0,5.0,10.0,0.571429,1.0,3.0,0.5,2.0,3.0,...,1.0,2.0,3.0,13.0,5.0,30.0,41.0,30.4,2018.0,11.0
max,1.0,1.0,24.0,50.0,1.0,14.0,24.0,1.0,26.0,39.0,...,12.0,12.0,6.0,71.0,57.0,64.0,59.0,64.966667,2023.0,12.0


Feature Engineer

In [11]:
x = df.groupby(['date', 'visitor', 'home', 'team'])[['fga', 'orb', 'tov', 'fta']].transform('sum')
df['possessions'] = x.fga - x.orb + x.tov + 0.4 * x.fta

In [12]:
def get_season(month, year):
    if 10 <= month <= 12:
        return year
    else:
        return year - 1

In [13]:
df['season'] = df.apply(lambda x: get_season(x.month, x.year), 1)
df['opponent'] = np.where(df['team'], df['visitor'], df['home'])

Merge Schedule

In [14]:
schedules = pd.read_csv("C:/Users/tyler/OneDrive/Documents/Python/NBA/backend/data/schedules/2022.csv", index_col = 0)
schedules['date'] = pd.to_datetime(schedules['date'])

home_schedule = schedules.copy()
home_schedule['team'] = 1
home_schedule['opponent'] = home_schedule['visitor']

visitor_schedule = schedules.copy()
visitor_schedule['team'] = 0
visitor_schedule['opponent'] = visitor_schedule['home']

schedules = pd.concat([home_schedule, visitor_schedule])

df = pd.merge(schedules, df, left_on = ['date', 'visitor', 'home', 'team', 'opponent'], right_on = ['date', 'visitor', 'home', 'team', 'opponent'], how = 'left')

Merge Opposing Teams Defense

In [15]:
defense_df = pd.read_csv('backend/data/totals/game_totals.csv', index_col = 0)
defense_df['date'] = pd.to_datetime(defense_df['date'])

defense_df = pd.merge(schedules, defense_df, left_on = ['date', 'visitor', 'home', 'team'], right_on = ['date', 'visitor', 'home', 'team'], how = 'left')

In [16]:
defense_df.drop_duplicates(subset = ['date', 'visitor', 'home', 'team'], inplace = True)
defense_df['team'] = np.where(defense_df['team'], defense_df['visitor'], defense_df['home'])

In [17]:
defense_df['date'] = pd.to_datetime(defense_df['date'])
defense_df['year'] = defense_df['date'].dt.year
defense_df['month'] = defense_df['date'].dt.month
defense_df['season'] = defense_df.apply(lambda x: get_season(x.month, x.year), 1)

In [18]:
defense_df['rb'] = defense_df.groupby(['season', 'team'])['trb'].shift(1)
defense_df['trb_sum'] = defense_df.groupby(['season', 'team'])['rb'].expanding(5).sum().sort_index(axis = 0, level = 2).values
defense_df['trb_count'] = defense_df.groupby(['season', 'team'])['rb'].expanding(5).count().sort_index(axis = 0, level = 2).values
defense_df['trb_mean'] = defense_df.groupby(['season', 'team'])['rb'].expanding(5).mean().sort_index(axis = 0, level = 2).values

In [19]:
rb_lg_sum = defense_df.groupby(['season', 'date'])['trb'].sum().groupby(['season']).shift(1)
rb_lg_sum = rb_lg_sum.groupby(['season']).expanding(1).sum()
rb_lg_count = defense_df.groupby(['season', 'date'])['trb'].count().groupby(['season']).shift(1)
rb_lg_count = rb_lg_count.groupby(['season']).expanding(1).sum()
rb_lg_avg = rb_lg_sum / rb_lg_count
rb_lg_avg.index = defense_df.groupby(['season', 'date'])['trb'].sum().index
rb_lg_avg = rb_lg_avg.reset_index()
defense_df = pd.merge(defense_df, rb_lg_avg, on = ['season', 'date'], how = 'left', suffixes = ('', '_lg_avg'))


In [20]:
df = pd.merge(
    df, 
    defense_df[['team', 'date', 'trb_mean', 'trb_lg_avg']], 
    left_on = ['opponent', 'date'], 
    right_on = ['team', 'date'], 
    how = 'left', 
    suffixes = ('', '_opp')
)

In [21]:
x = df.groupby(['date', 'visitor', 'home', 'team'])[['fga', 'orb', 'tov', 'fta']].transform('sum')
df['possessions'] = x.fga - x.orb + x.tov + 0.4 * x.fta

Enter Player Name and Line

In [22]:
def convert_perc_to_odds(perc):
    if perc > 0.5:
        return round((100 * perc) / (1 - perc)) * -1
    else:
        return round((1 - perc) * 100 / perc)


In [23]:
def convert_odds_to_perc(odds):
    if odds < 0:
        return round(abs(odds) / (abs(odds) + 100), 3)
    else:
        return round(100 / (abs(odds) + 100), 3)

In [24]:
def expected_value(prob, odds):
    return prob * odds - (1 - prob)

In [713]:
today = "2023-01-23"
visitor = "Memphis Grizzlies"
home = "Sacramento Kings"

In [758]:
player = "Kevin Huerter"
total = 3.5
implied_over = convert_odds_to_perc(100)
implied_under = convert_odds_to_perc(-130)

### Player Analysis

In [759]:
player_df = df[(df['player'] == player) | ((df['date'] == today) & (df['visitor'] == visitor) & (df['home'] == home))].copy()

In [760]:
player_df = player_df[player_df['mp'] != '0']

In [761]:
player_df['rb'] = player_df['trb'].shift(1)
player_df['rb_sum'] = player_df['rb'].expanding(1).sum().values
player_df['rb_count'] = player_df['rb'].expanding(1).count().values
player_df['rb_mean'] = player_df['rb'].expanding(1).mean().values

In [762]:
player_df.iloc[-1, :][['rb_sum', 'rb_count', 'trb_mean', 'trb_lg_avg']]

rb_sum            140.0
rb_count           42.0
trb_mean      42.066667
trb_lg_avg    43.529078
Name: 18563, dtype: object

In [763]:
mu_df = player_df.iloc[-1, :][['rb_sum', 'rb_count', 'trb_mean', 'trb_lg_avg']]
normalize = 1 + (mu_df['trb_mean'] - mu_df['trb_lg_avg']) / mu_df['trb_lg_avg']
mu = (mu_df['rb_sum'] / mu_df['rb_count']) * normalize
mu

3.221346020159889

In [764]:
mu_df['rb_sum'] / mu_df['rb_count']

3.3333333333333335

In [765]:
normalize

0.9664038060479666

Calculate Odds

In [766]:
print(f"Implied Over Odds: {implied_over:.1%} ({convert_perc_to_odds(implied_over)})")
print(f"Under Odds: {implied_under:.1%} ({convert_perc_to_odds(implied_under)})")

Implied Over Odds: 50.0% (100)
Under Odds: 56.5% (-130)


In [767]:
under = stats.poisson.cdf(k = total, mu = mu)
over = 1 - under
print(f"Over Odds: {over:.1%} ({convert_perc_to_odds(over)})")
print(f"Under Odds: {under:.1%} ({convert_perc_to_odds(under)})")

Over Odds: 40.2% (149)
Under Odds: 59.8% (-149)


In [768]:
over_ev = expected_value(over, implied_over)
under_ev = expected_value(under, implied_under)
print(f"Over EV: {over_ev:.1%}")
print(f"Under EV: {under_ev:.1%}")

Over EV: -39.7%
Under EV: -6.4%
