# Exploratory Analysis

In [592]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

In [593]:
os.chdir('c:/Users/tyler/OneDrive/Documents/Python/NBA')
os.getcwd()

'c:\\Users\\tyler\\OneDrive\\Documents\\Python\\NBA'

### Load Data

In [594]:
df = pd.read_csv('backend\data\details\game_details.csv', index_col = 0)

In [595]:
df.head()

Unnamed: 0,date,visitor,home,team,starter,player,mp,fg,fga,fg_perc,...,orb,drb,trb,ast,stl,blk,tov,pf,pts,plus_minus
0,"Tue, Oct 31, 2006",Chicago Bulls,Miami Heat,0,1.0,Kirk Hinrich,34:38,10.0,18.0,0.556,...,0.0,4.0,4.0,3.0,2.0,0.0,0.0,2.0,26.0,23.0
1,"Tue, Oct 31, 2006",Chicago Bulls,Miami Heat,0,1.0,Ben Wallace,28:29,2.0,5.0,0.4,...,6.0,5.0,11.0,1.0,0.0,1.0,0.0,2.0,5.0,13.0
2,"Tue, Oct 31, 2006",Chicago Bulls,Miami Heat,0,1.0,Luol Deng,24:07,4.0,9.0,0.444,...,1.0,1.0,2.0,1.0,1.0,0.0,3.0,2.0,12.0,8.0
3,"Tue, Oct 31, 2006",Chicago Bulls,Miami Heat,0,1.0,Ben Gordon,22:56,1.0,9.0,0.111,...,0.0,1.0,1.0,2.0,1.0,0.0,2.0,3.0,6.0,7.0
4,"Tue, Oct 31, 2006",Chicago Bulls,Miami Heat,0,1.0,P.J. Brown,18:23,1.0,2.0,0.5,...,0.0,3.0,3.0,1.0,1.0,1.0,2.0,3.0,4.0,15.0


In [596]:
df.columns

Index(['date', 'visitor', 'home', 'team', 'starter', 'player', 'mp', 'fg',
       'fga', 'fg_perc', '3p', '3pa', '3p_perc', 'ft', 'fta', 'ft_perc', 'orb',
       'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts', 'plus_minus'],
      dtype='object')

Drop duplicates and NA's

In [597]:
df.drop_duplicates(subset = ['date', 'visitor', 'home', 'team', 'player'], inplace = True)

In [598]:
df.dropna(subset = ['player'], inplace = True)

In [599]:
df.isna().sum()

date               0
visitor            0
home               0
team               0
starter            0
player             0
mp                 0
fg                 0
fga                0
fg_perc        22376
3p                 0
3pa                0
3p_perc       154839
ft                 0
fta                0
ft_perc       185026
orb                0
drb                0
trb                0
ast                0
stl                0
blk                0
tov                0
pf                 0
pts                0
plus_minus        75
dtype: int64

Clean data

In [600]:
df['fg_perc'] = df['fg'] / df['fga']
df['3p_perc'] = df['3p'] / df['3pa']
df['ft_perc'] = df['ft'] / df['fta']
df['min'] = df['mp'].apply(lambda x: int(x.split(':')[0]))
df['sec'] = df['mp'].apply(lambda x: int(x.split(':')[-1]))
df['pt'] = df['min'] + df['sec'] / 60
df['date'] = pd.to_datetime(df['date'])
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['player'] = df['player'].apply(lambda x: x.strip())

In [601]:
df.describe()

Unnamed: 0,team,starter,fg,fga,fg_perc,3p,3pa,3p_perc,ft,fta,...,blk,tov,pf,pts,plus_minus,min,sec,pt,year,month
count,526380.0,526380.0,526380.0,526380.0,415642.0,526380.0,526380.0,283179.0,526380.0,526380.0,...,526380.0,526380.0,526380.0,526380.0,526305.0,526380.0,526380.0,526380.0,526380.0,526380.0
mean,0.500585,0.398742,3.06214,6.698904,0.442612,0.696871,1.949801,0.325494,1.426952,1.870985,...,0.385972,1.086493,1.637049,8.248102,-2.5e-05,18.871752,24.401957,19.278452,2014.414972,5.7567
std,0.5,0.48964,3.124927,6.038618,0.237202,1.201017,2.570178,0.305074,2.236639,2.749087,...,0.801284,1.370012,1.544871,8.356823,9.834385,13.245091,19.278091,13.362211,4.711716,4.346285
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,-57.0,0.0,0.0,0.0,2006.0,1.0
25%,0.0,0.0,0.0,1.0,0.315789,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,-5.0,6.0,5.0,6.516667,2010.0,2.0
50%,1.0,0.0,2.0,6.0,0.45,0.0,1.0,0.333333,0.0,0.0,...,0.0,1.0,1.0,6.0,0.0,20.0,23.0,20.45,2014.0,4.0
75%,1.0,1.0,5.0,10.0,0.571429,1.0,3.0,0.5,2.0,3.0,...,1.0,2.0,3.0,13.0,5.0,30.0,41.0,30.4,2018.0,11.0
max,1.0,1.0,24.0,50.0,1.0,14.0,24.0,1.0,26.0,39.0,...,12.0,12.0,6.0,70.0,57.0,64.0,59.0,64.966667,2022.0,12.0


Feature Engineer

In [602]:
x = df.groupby(['date', 'visitor', 'home', 'team'])[['fga', 'orb', 'tov', 'fta']].transform('sum')
df['possessions'] = x.fga - x.orb + x.tov + 0.4 * x.fta

In [603]:
def get_season(month, year):
    if 10 <= month <= 12:
        return year
    else:
        return year - 1

In [604]:
df['season'] = df.apply(lambda x: get_season(x.month, x.year), 1)
df['opponent'] = np.where(df['team'], df['visitor'], df['home'])

Merge Opposing Teams Defense

In [605]:
defense_df = pd.read_csv('backend/data/totals/game_totals.csv', index_col = 0)

In [606]:
defense_df.drop_duplicates(subset = ['date', 'visitor', 'home', 'team'], inplace = True)
defense_df.dropna(subset = ['date', 'visitor', 'home', 'team'], inplace = True)
defense_df['team'] = np.where(defense_df['team'], defense_df['visitor'], defense_df['home'])

In [607]:
defense_df['date'] = pd.to_datetime(defense_df['date'])
defense_df['year'] = defense_df['date'].dt.year
defense_df['month'] = defense_df['date'].dt.month
defense_df['season'] = defense_df.apply(lambda x: get_season(x.month, x.year), 1)

In [608]:
defense_df['rb'] = defense_df.groupby(['season', 'team'])['trb'].shift(1)
defense_df['trb_sum'] = defense_df.groupby(['season', 'team'])['rb'].expanding(5).sum().sort_index(axis = 0, level = 2).values
defense_df['trb_count'] = defense_df.groupby(['season', 'team'])['rb'].expanding(5).count().sort_index(axis = 0, level = 2).values
defense_df['trb_mean'] = defense_df.groupby(['season', 'team'])['rb'].expanding(5).mean().sort_index(axis = 0, level = 2).values

In [609]:
rb_lg_sum = defense_df.groupby(['season', 'date'])['trb'].sum().groupby(['season']).shift(1)
rb_lg_sum = rb_lg_sum.groupby(['season']).expanding(1).sum()
rb_lg_count = defense_df.groupby(['season', 'date'])['trb'].count().groupby(['season']).shift(1)
rb_lg_count = rb_lg_count.groupby(['season']).expanding(1).sum()
rb_lg_avg = rb_lg_sum / rb_lg_count
rb_lg_avg.index = defense_df.groupby(['season', 'date'])['trb'].sum().index
rb_lg_avg = rb_lg_avg.reset_index()
defense_df = pd.merge(defense_df, rb_lg_avg, on = ['season', 'date'], how = 'left', suffixes = ('', '_lg_avg'))


In [610]:
df = pd.merge(
    df, 
    defense_df[['team', 'date', 'trb_mean', 'trb_lg_avg']], 
    left_on = ['opponent', 'date'], 
    right_on = ['team', 'date'], 
    how = 'left', 
    suffixes = ('', '_opp')
)

In [611]:
x = df.groupby(['date', 'visitor', 'home', 'team'])[['fga', 'orb', 'tov', 'fta']].transform('sum')
df['possessions'] = x.fga - x.orb + x.tov + 0.4 * x.fta

In [612]:
df

Unnamed: 0,date,visitor,home,team,starter,player,mp,fg,fga,fg_perc,...,sec,pt,year,month,possessions,season,opponent,team_opp,trb_mean,trb_lg_avg
0,2006-10-31,Chicago Bulls,Miami Heat,0,1.0,Kirk Hinrich,34:38,10.0,18.0,0.555556,...,38,34.633333,2006,10,94.8,2006,Miami Heat,Miami Heat,,
1,2006-10-31,Chicago Bulls,Miami Heat,0,1.0,Ben Wallace,28:29,2.0,5.0,0.400000,...,29,28.483333,2006,10,94.8,2006,Miami Heat,Miami Heat,,
2,2006-10-31,Chicago Bulls,Miami Heat,0,1.0,Luol Deng,24:07,4.0,9.0,0.444444,...,7,24.116667,2006,10,94.8,2006,Miami Heat,Miami Heat,,
3,2006-10-31,Chicago Bulls,Miami Heat,0,1.0,Ben Gordon,22:56,1.0,9.0,0.111111,...,56,22.933333,2006,10,94.8,2006,Miami Heat,Miami Heat,,
4,2006-10-31,Chicago Bulls,Miami Heat,0,1.0,P.J. Brown,18:23,1.0,2.0,0.500000,...,23,18.383333,2006,10,94.8,2006,Miami Heat,Miami Heat,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
526375,2022-12-28,Denver Nuggets,Sacramento Kings,1,0.0,Neemias Queta,0,0.0,0.0,,...,0,0.000000,2022,12,100.0,2022,Denver Nuggets,Denver Nuggets,40.212121,43.529183
526376,2022-12-28,Denver Nuggets,Sacramento Kings,1,0.0,Matthew Dellavedova,0,0.0,0.0,,...,0,0.000000,2022,12,100.0,2022,Denver Nuggets,Denver Nuggets,40.212121,43.529183
526377,2022-12-28,Denver Nuggets,Sacramento Kings,1,0.0,Chimezie Metu,0,0.0,0.0,,...,0,0.000000,2022,12,100.0,2022,Denver Nuggets,Denver Nuggets,40.212121,43.529183
526378,2022-12-28,Denver Nuggets,Sacramento Kings,1,0.0,Terence Davis,0,0.0,0.0,,...,0,0.000000,2022,12,100.0,2022,Denver Nuggets,Denver Nuggets,40.212121,43.529183


Enter Player Name and Line

In [613]:
def convert_perc_to_odds(perc):
    if perc > 0.5:
        return round((100 * perc) / (1 - perc)) * -1
    else:
        return round((1 - perc) * 100 / perc)


In [614]:
def convert_odds_to_perc(odds):
    if odds < 0:
        return round(abs(odds) / (abs(odds) + 100), 3)
    else:
        return round(100 / (abs(odds) + 100), 3)

In [615]:
def expected_value(prob, odds):
    return prob * odds - (1 - prob)

In [920]:
player = "Mike Conley"
total = 2.5
implied_over = convert_odds_to_perc(145)
implied_under = convert_odds_to_perc(-195)

### Player Analysis

In [921]:
player_df = df[df['player'] == player].copy()

In [922]:
player_df['rb'] = player_df.groupby('season')['trb'].shift(1)
player_df['rb_sum'] = player_df.groupby('season')['rb'].expanding(5).sum().values
player_df['rb_count'] = player_df.groupby('season')['rb'].expanding(5).count().values
player_df['rb_mean'] = player_df.groupby('season')['rb'].expanding(5).mean().values

In [923]:
mu_df = player_df.iloc[-1, :][['trb', 'rb_sum', 'rb_count', 'trb_mean', 'trb_lg_avg']]
normalize = 1 + (mu_df['trb_mean'] - mu_df['trb_lg_avg']) / mu_df['trb_lg_avg']
mu = (mu_df['trb'] + mu_df['rb_sum']) / (mu_df['rb_count'] + 1) * normalize
mu

2.2131420202216665

Calculate Odds

In [924]:
print(f"Implied Over Odds: {implied_over:.1%} ({convert_perc_to_odds(implied_over)})")
print(f"Under Odds: {implied_under:.1%} ({convert_perc_to_odds(implied_under)})")

Implied Over Odds: 40.8% (145)
Under Odds: 66.1% (-195)


In [925]:
under = stats.poisson.cdf(k = total, mu = mu)
over = 1 - under
print(f"Over Odds: {over:.1%} ({convert_perc_to_odds(over)})")
print(f"Under Odds: {under:.1%} ({convert_perc_to_odds(under)})")

Over Odds: 38.1% (163)
Under Odds: 61.9% (-163)


In [926]:
over_ev = expected_value(over, implied_over)
under_ev = expected_value(under, implied_under)
print(f"Over EV: {over_ev:.1%}")
print(f"Under EV: {under_ev:.1%}")

Over EV: -46.4%
Under EV: 2.8%
