# Rebounds

In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import datetime
from unidecode import unidecode

In [3]:
os.chdir('/home/tylerengland/NBA')
os.getcwd()

'/home/tylerengland/NBA'

### Load Data

In [4]:
df = pd.read_csv('backend/data/details/game_details.csv')

In [5]:
df.head()

Unnamed: 0,date,visitor,home,team,starter,player,mp,fg,fga,fg_perc,...,drb,trb,ast,stl,blk,tov,pf,pts,plus_minus,season
0,"Tue, Oct 31, 2006",Chicago Bulls,Miami Heat,0,1.0,Kirk Hinrich,34:38,10.0,18.0,0.556,...,4.0,4.0,3.0,2.0,0.0,0.0,2.0,26.0,23.0,
1,"Tue, Oct 31, 2006",Chicago Bulls,Miami Heat,0,1.0,Ben Wallace,28:29,2.0,5.0,0.4,...,5.0,11.0,1.0,0.0,1.0,0.0,2.0,5.0,13.0,
2,"Tue, Oct 31, 2006",Chicago Bulls,Miami Heat,0,1.0,Luol Deng,24:07,4.0,9.0,0.444,...,1.0,2.0,1.0,1.0,0.0,3.0,2.0,12.0,8.0,
3,"Tue, Oct 31, 2006",Chicago Bulls,Miami Heat,0,1.0,Ben Gordon,22:56,1.0,9.0,0.111,...,1.0,1.0,2.0,1.0,0.0,2.0,3.0,6.0,7.0,
4,"Tue, Oct 31, 2006",Chicago Bulls,Miami Heat,0,1.0,P.J. Brown,18:23,1.0,2.0,0.5,...,3.0,3.0,1.0,1.0,1.0,2.0,3.0,4.0,15.0,


In [6]:
df.columns

Index(['date', 'visitor', 'home', 'team', 'starter', 'player', 'mp', 'fg',
       'fga', 'fg_perc', '3p', '3pa', '3p_perc', 'ft', 'fta', 'ft_perc', 'orb',
       'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts', 'plus_minus',
       'season'],
      dtype='object')

Drop duplicates and NA's

In [7]:
df.drop_duplicates(subset=['date', 'visitor', 'home', 'team', 'player'], inplace=True)

In [8]:
df.dropna(subset = ['player'], inplace = True)

In [9]:
df.isna().sum()

date               0
visitor            0
home               0
team               0
starter            0
player             0
mp                 0
fg                 0
fga                0
fg_perc        22692
3p                 0
3pa                0
3p_perc       156463
ft                 0
fta                0
ft_perc       188212
orb                0
drb                0
trb                0
ast                0
stl                0
blk                0
tov                0
pf                 0
pts                0
plus_minus        80
season        535562
dtype: int64

Clean data

In [10]:
df['fg_perc'] = df['fg'] / df['fga']
df['3p_perc'] = df['3p'] / df['3pa']
df['ft_perc'] = df['ft'] / df['fta']
df['min'] = df['mp'].apply(lambda x: int(x.split(':')[0]))
df['sec'] = df['mp'].apply(lambda x: int(x.split(':')[-1]))
df['pt'] = df['min'] + df['sec'] / 60
df['date'] = pd.to_datetime(df['date'])
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['player'] = df['player'].apply(lambda x: x.strip())

In [11]:
df.describe()

Unnamed: 0,team,starter,fg,fga,fg_perc,3p,3pa,3p_perc,ft,fta,...,tov,pf,pts,plus_minus,season,min,sec,pt,year,month
count,535640.0,535640.0,535640.0,535640.0,422832.0,535640.0,535640.0,289061.0,535640.0,535640.0,...,535640.0,535640.0,535640.0,535560.0,78.0,535640.0,535640.0,535640.0,535640.0,535640.0
mean,0.500653,0.398607,3.066339,6.70228,0.443158,0.701382,1.961721,0.325734,1.427621,1.871005,...,1.085348,1.635632,8.261681,-2.4e-05,2022.0,18.865008,24.390902,19.271523,2014.562251,5.692508
std,0.5,0.489612,3.129845,6.045804,0.237249,1.205301,2.580121,0.304822,2.239252,2.751188,...,1.369652,1.544588,8.374334,9.83227,0.0,13.243943,19.280511,13.361279,4.801099,4.349959
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-57.0,2022.0,0.0,0.0,0.0,2006.0,1.0
25%,0.0,0.0,0.0,1.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-5.0,2022.0,6.0,5.0,6.5,2010.0,2.0
50%,1.0,0.0,2.0,6.0,0.45,0.0,1.0,0.333333,0.0,0.0,...,1.0,1.0,6.0,0.0,2022.0,20.0,23.0,20.45,2015.0,4.0
75%,1.0,1.0,5.0,10.0,0.571429,1.0,3.0,0.5,2.0,3.0,...,2.0,3.0,13.0,5.0,2022.0,30.0,41.0,30.4,2019.0,11.0
max,1.0,1.0,24.0,50.0,1.0,14.0,24.0,1.0,26.0,39.0,...,12.0,6.0,71.0,57.0,2022.0,64.0,59.0,64.966667,2023.0,12.0


Feature Engineer

In [12]:
x = df.groupby(['date', 'visitor', 'home', 'team'])[['fga', 'orb', 'tov', 'fta']].transform('sum')
df['possessions'] = x.fga - x.orb + x.tov + 0.4 * x.fta

In [13]:
def get_season(month, year):
    if 10 <= month <= 12:
        return year
    else:
        return year - 1

In [14]:
df['season'] = df.apply(lambda x: get_season(x.month, x.year), 1)
df['opponent'] = np.where(df['team'], df['visitor'], df['home'])

Merge Schedule

In [15]:
schedules = pd.read_csv("backend/data/schedules/2022.csv", index_col = 0)
schedules['date'] = pd.to_datetime(schedules['date'])

home_schedule = schedules.copy()
home_schedule['team'] = 1
home_schedule['opponent'] = home_schedule['visitor']

visitor_schedule = schedules.copy()
visitor_schedule['team'] = 0
visitor_schedule['opponent'] = visitor_schedule['home']

schedules = pd.concat([home_schedule, visitor_schedule])

df = pd.merge(schedules, df, left_on = ['date', 'visitor', 'home', 'team', 'opponent'], right_on = ['date', 'visitor', 'home', 'team', 'opponent'], how = 'left')

Teams Defense

In [16]:
defense_df = pd.read_csv('backend/data/totals/game_totals.csv', index_col = 0)
defense_df['date'] = pd.to_datetime(defense_df['date'])

defense_df = pd.merge(schedules, defense_df, left_on = ['date', 'visitor', 'home', 'team'], right_on = ['date', 'visitor', 'home', 'team'], how = 'left')
defense_df.drop_duplicates(subset = ['date', 'visitor', 'home', 'team'], inplace = True)

Feature engineer for defense

In [17]:
defense_df['team'] = np.where(defense_df['team'], defense_df['visitor'], defense_df['home'])
defense_df['date'] = pd.to_datetime(defense_df['date'])
defense_df['year'] = defense_df['date'].dt.year
defense_df['month'] = defense_df['date'].dt.month
defense_df['season'] = defense_df.apply(lambda x: get_season(x.month, x.year), 1)

Cumulative moving average for teams 

In [18]:
defense_df['rb'] = defense_df.groupby(['season', 'team'])['trb'].shift(1)
defense_df['trb_sum'] = defense_df.groupby(['season', 'team'])['rb'].expanding(5).sum().sort_index(axis = 0, level = 2).values
defense_df['trb_count'] = defense_df.groupby(['season', 'team'])['rb'].expanding(5).count().sort_index(axis = 0, level = 2).values
defense_df['trb_mean'] = defense_df.groupby(['season', 'team'])['rb'].expanding(5).mean().sort_index(axis = 0, level = 2).values

League average

In [19]:
rb_lg_sum = defense_df.groupby(['season', 'date'])['trb'].sum().groupby(['season']).shift(1)
rb_lg_sum = rb_lg_sum.groupby(['season']).expanding(1).sum()
rb_lg_count = defense_df.groupby(['season', 'date'])['trb'].count().groupby(['season']).shift(1)
rb_lg_count = rb_lg_count.groupby(['season']).expanding(1).sum()
rb_lg_avg = rb_lg_sum / rb_lg_count
rb_lg_avg.index = defense_df.groupby(['season', 'date'])['trb'].sum().index
rb_lg_avg = rb_lg_avg.reset_index()
defense_df = pd.merge(defense_df, rb_lg_avg, on = ['season', 'date'], how = 'left', suffixes = ('', '_lg_avg'))


Merge in opposing defense

In [20]:
df = pd.merge(
    df, 
    defense_df[['team', 'date', 'trb_mean', 'trb_lg_avg']], 
    left_on = ['opponent', 'date'], 
    right_on = ['team', 'date'], 
    how = 'left', 
    suffixes = ('', '_opp')
)

Functions to convert odds

In [21]:
def convert_perc_to_odds(perc):
    if perc > 0.5:
        return round((100 * perc) / (1 - perc)) * -1
    else:
        return round((1 - perc) * 100 / perc)


In [22]:
def convert_odds_to_perc(odds):
    if odds < 0:
        return round(abs(odds) / (abs(odds) + 100), 3)
    else:
        return round(100 / (abs(odds) + 100), 3)

Function to calculate EV

In [23]:
def expected_value(prob, odds):
    return prob * odds - (1 - prob)

Player Analysis

In [24]:
def player_analysis(player, date, df=df):
    player_df = df.loc[(df['player'] == player) & (df['date'] <= date)].sort_values(by=['date'], ascending=True).copy()

    # Keep relevant team data (addresses trades)
    player_df.loc[:, 'team'] = np.where(player_df['team'] == 1, player_df['home'], player_df['visitor'])
    most_recent_team = player_df.loc[player_df['date'] == player_df['date'].max(), 'team'].values[0]
    player_df = player_df.loc[player_df['team'] == most_recent_team, :]

    # Keep games where player played
    player_df = player_df[player_df['pt'] != '0']

    # Add row to hole next game predictions
    player_df.loc['next_game', :] = None

    # Shift opposing defense and league average down a game
    player_df.loc[:, 'trb_mean'] = player_df.loc[:, 'trb_mean'].shift(1)
    player_df.loc[:, 'trb_lg_avg'] = player_df.loc[:, 'trb_lg_avg'].shift(1)

    # Cumulative moving average for rebounds per min
    player_df['rb'] = player_df['trb'].shift(1) / player_df['pt'].shift(1)
    player_df['rb_sum'] = player_df['rb'].expanding(1).sum().values
    player_df['rb_count'] = player_df['rb'].expanding(1).count().values
    player_df['rb_mean'] = player_df['rb'].expanding(1).mean().values

    return player_df


Load minute projections

In [25]:
def load_minute_projections():
    minute_projections = pd.read_csv(
        'backend/data/rotowire-nba-projections.csv', 
        header=1, 
        usecols=[0, 4], 
        names=['player', 'min_proj']
    )
    minute_projections = dict(zip(minute_projections.loc[:, 'player'], minute_projections.loc[:, 'min_proj']))

    return minute_projections

Apply minute projections and normalize projected rebounds

In [26]:
def project_and_normalize(player, player_df):
    # Select next game
    mu_df = player_df.iloc[-1, :][['rb_mean', 'trb_mean', 'trb_lg_avg']]

    # Load minute projections
    minute_projections = load_minute_projections()

    # Edge cases where there a difference in player names
    minutes_players = {
        'Wendell Carter Jr.': 'Wendell Carter'
    }

    # Apply minute projections to season average per min
    mu_df['rb_mean'] = mu_df['rb_mean'] * minute_projections[unidecode(minutes_players[player])]

    # Normalize data
    normalize = 1 + (mu_df['trb_mean'] - mu_df['trb_lg_avg']) / mu_df['trb_lg_avg']
    mu = mu_df['rb_mean'] * normalize
    print(f"Normalized projection: {mu}")

    return mu

Calculate odds

In [27]:
def calculate_expected_value(mu, odds):
    # Poisson distribution probability
    under = stats.poisson.cdf(k = odds["total"], mu = mu)
    over = 1 - under

    # Expected value
    over_ev = expected_value(over, convert_odds_to_perc(odds["over"]))
    under_ev = expected_value(under, convert_odds_to_perc(odds["under"]))
    print(f"Over EV: {over_ev:.1%}")
    print(f"Under EV: {under_ev:.1%}")



Enter player name, date and odds

In [28]:
today = "2023-02-23"
player = "Wendell Carter Jr."
odds = {
    "total": 8.5,
    "over": -120,
    "under": -109
}

player_df = player_analysis(player, today)
mu = project_and_normalize(player, player_df)
calculate_expected_value(mu, odds)

Normalized projection: 8.273483601933451
Over EV: -31.2%
Under EV: -15.6%
