### Import packages

In [12]:
import os
import pandas as pd
import numpy as np

# Remove the cap on how many columns pandas shows when displaying a DataFrame
pd.set_option("display.max_columns", None)

### Set working directory

In [13]:
# Project root that the relative paths used later in the notebook
# (e.g. 'backend/data/...') are resolved against.
# Set the NBA_PROJECT_DIR environment variable to run the notebook from another
# machine or checkout; when it is unset or empty the original location is used.
PROJECT_DIR = os.environ.get('NBA_PROJECT_DIR') or '/Users/tyler/OneDrive/Documents/Python/NBA'

# Print working directory before the change
cwd = os.getcwd()
print(f'Directory: {cwd}')

# Change working directory
os.chdir(PROJECT_DIR)

# Print working directory after the change
cwd = os.getcwd()
print(f'Directory: {cwd}')

Directory: C:\Users\tyler\OneDrive\Documents\Python\NBA
Directory: C:\Users\tyler\OneDrive\Documents\Python\NBA


## Exploratory Data Analysis

### Import data

In [14]:
# Columns that identify a single game in both input files
game_keys = ['date', 'visitor', 'home']

# Load the game totals (only the keys and the '3p' column are kept) and the model predictions
totals_df = pd.read_csv('backend/data/totals/game_totals.csv').drop(['Unnamed: 0'], axis=1)[game_keys + ['3p']]
predict_df = pd.read_csv('backend/predictions/3p_predictions.csv').drop(['Unnamed: 0'], axis=1)

# Convert date to datetime
totals_df['date'] = pd.to_datetime(totals_df['date'])
predict_df['date'] = pd.to_datetime(predict_df['date'])

# Total 3pt by both teams
totals_df = totals_df.groupby(game_keys).sum()

# Merge dataframes: the left join keeps every predicted game; games without a
# totals row get NaN in '3p' and are removed by dropna below
df = pd.merge(predict_df, totals_df, on=game_keys, how='left')
df = df.drop_duplicates(game_keys).dropna(axis=0)

# Separate current games (3p total still 0) from past games (3p total above 0).
# Explicit copies make both frames independent objects, so the column
# assignments made in later cells never write into a slice of the merged frame.
today_df = df[df['3p'] == 0].copy()
df = df[df['3p'] > 0].copy()

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 153 entries, 0 to 159
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   date              153 non-null    datetime64[ns]
 1   visitor           153 non-null    object        
 2   home              153 non-null    object        
 3   linear            153 non-null    int64         
 4   decision_tree     153 non-null    int64         
 5   gradient_boosted  153 non-null    int64         
 6   neural_network    153 non-null    int64         
 7   random_forest     153 non-null    int64         
 8   line              153 non-null    float64       
 9   avg               153 non-null    float64       
 10  over              153 non-null    float64       
 11  under             153 non-null    float64       
 12  3p                153 non-null    float64       
dtypes: datetime64[ns](1), float64(5), int64(5), object(2)
memory usage: 16.7+ KB


In [15]:
def myround(x, base=.5):
    """Snap ``x`` to the nearest multiple of ``base``.

    Parameters
    ----------
    x : value to round; anything that supports division by ``base`` and the
        built-in ``round`` (the notebook also calls this with a pandas Series).
    base : step size of the grid to round onto (default 0.5).

    Returns
    -------
    ``base`` multiplied by the rounded number of steps; ties are resolved the
    same way the built-in ``round`` resolves them.
    """
    steps = round(x / base)
    return base * steps

In [16]:
models = ['linear', 'decision_tree', 'gradient_boosted', 'neural_network', 'random_forest', 'avg']
for model in models:
    # A model takes the over only when its prediction is strictly above the line;
    # a prediction equal to the line is treated as an under pick.
    picked_over = df[model] > df['line']
    df[model + '_predict'] = np.where(picked_over, 'o', 'u')

    # Compute if model predicted correctly.
    # The result is graded against the side that was actually picked, so `_hit`
    # always agrees with `_predict` and with the odds selected below (previously a
    # prediction equal to the line was an under pick that could never be a hit).
    # A total exactly on the line is a miss for either side.
    df[model + '_hit'] = np.where(picked_over, df['3p'] > df['line'], df['3p'] < df['line'])

    # Compute models profit per bet: start from the odds of the side picked...
    side_odds = df['over'].where(picked_over, df['under'])

    # ...and convert them to the profit won per unit staked:
    # positive odds pay odds / 100, negative odds pay 100 / |odds|
    df[model + '_potential_profit'] = np.where(side_odds > 0, side_odds / 100, -100 / side_odds)

    # A winning bet earns the potential profit, a losing bet costs the one-unit stake
    df[model + '_profit'] = np.where(df[model + '_hit'], df[model + '_potential_profit'], -1)

    # Compute difference between line and prediction (rounded by myround)
    df[model + '_diff'] = myround(df[model] - df['line'])

df.tail()

Unnamed: 0,date,visitor,home,linear,decision_tree,gradient_boosted,neural_network,random_forest,line,avg,over,under,3p,linear_predict,linear_hit,linear_potential_profit,linear_profit,linear_diff,decision_tree_predict,decision_tree_hit,decision_tree_potential_profit,decision_tree_profit,decision_tree_diff,gradient_boosted_predict,gradient_boosted_hit,gradient_boosted_potential_profit,gradient_boosted_profit,gradient_boosted_diff,neural_network_predict,neural_network_hit,neural_network_potential_profit,neural_network_profit,neural_network_diff,random_forest_predict,random_forest_hit,random_forest_potential_profit,random_forest_profit,random_forest_diff,avg_predict,avg_hit,avg_potential_profit,avg_profit,avg_diff
155,2022-03-16,Oklahoma City Thunder,San Antonio Spurs,24,22,23,23,24,21.5,23.2,-160.0,120.0,32.0,o,True,0.625,0.625,2.5,o,True,0.625,0.625,0.5,o,True,0.625,0.625,1.5,o,True,0.625,0.625,1.5,o,True,0.625,0.625,2.5,o,True,0.625,0.625,1.5
156,2022-03-16,Philadelphia 76ers,Cleveland Cavaliers,24,24,23,22,22,21.5,23.0,-160.0,120.0,21.0,o,False,0.625,-1.0,2.5,o,False,0.625,-1.0,2.5,o,False,0.625,-1.0,1.5,o,False,0.625,-1.0,0.5,o,False,0.625,-1.0,0.5,o,False,0.625,-1.0,1.5
157,2022-03-16,Phoenix Suns,Houston Rockets,26,26,26,25,26,24.5,25.8,-140.0,100.0,29.0,o,True,0.714286,0.714286,1.5,o,True,0.714286,0.714286,1.5,o,True,0.714286,0.714286,1.5,o,True,0.714286,0.714286,0.5,o,True,0.714286,0.714286,1.5,o,True,0.714286,0.714286,1.5
158,2022-03-16,Portland Trail Blazers,New York Knicks,26,25,26,25,25,26.5,25.4,130.0,-170.0,27.0,u,False,0.588235,-1.0,-0.5,u,False,0.588235,-1.0,-1.5,u,False,0.588235,-1.0,-0.5,u,False,0.588235,-1.0,-1.5,u,False,0.588235,-1.0,-1.5,u,False,0.588235,-1.0,-1.0
159,2022-03-16,Toronto Raptors,Los Angeles Clippers,23,24,23,22,22,24.5,22.8,-115.0,-125.0,22.0,u,True,0.8,0.8,-1.5,u,True,0.8,0.8,-0.5,u,True,0.8,0.8,-1.5,u,True,0.8,0.8,-2.5,u,True,0.8,0.8,-2.5,u,True,0.8,0.8,-1.5


In [17]:
# Stake size in dollars used to scale the per-unit profit
unit = 100

for model in models:
    outcomes = df[f'{model}_hit']

    # Hits are stored as booleans, so the sum is the number of winning bets
    correct = outcomes.sum()
    wrong = outcomes.size - correct
    profit = df[f'{model}_profit'].sum()

    report = [
        model,
        f'\tRecord: {correct} - {wrong}',
        f'\tAccuracy: {round((correct / (correct + wrong)) * 100)}%',
        f'\tProfit: ${round(profit * unit)}  (given a ${unit} unit)\n',
    ]
    print('\n'.join(report))

linear
	Record: 82 - 71
	Accuracy: 54%
	Profit: $-666  (given a $100 unit)

decision_tree
	Record: 86 - 67
	Accuracy: 56%
	Profit: $195  (given a $100 unit)

gradient_boosted
	Record: 87 - 66
	Accuracy: 57%
	Profit: $237  (given a $100 unit)

neural_network
	Record: 85 - 68
	Accuracy: 56%
	Profit: $28  (given a $100 unit)

random_forest
	Record: 89 - 64
	Accuracy: 58%
	Profit: $714  (given a $100 unit)

avg
	Record: 88 - 65
	Accuracy: 58%
	Profit: $484  (given a $100 unit)



In [18]:
for model in models:
    hit_col, profit_col = f'{model}_hit', f'{model}_profit'

    # Sum and count of hits and profit for every distinct prediction-minus-line value
    by_diff = df.groupby([f'{model}_diff'])[[hit_col, profit_col]].agg(['sum', 'count'])

    bets = by_diff[(hit_col, 'count')]
    wins = by_diff[(hit_col, 'sum')]

    # Share of winning bets as a whole-number percentage, and total profit in units
    by_diff['accuracy'] = (wins / bets * 100).round()
    by_diff['profit'] = by_diff[(profit_col, 'sum')].round(2)
    by_diff['count'] = bets
    print(by_diff[['profit', 'accuracy', 'count']])

            profit accuracy count
                                 
linear_diff                      
-2.5         -1.20     33.0     3
-1.5          0.72     61.0    18
-0.5         -0.30     55.0    51
 0.5        -10.12     43.0    47
 1.5          2.40     62.0    24
 2.5          2.84     78.0     9
 3.5         -1.00      0.0     1
                   profit accuracy count
                                        
decision_tree_diff                      
-3.5                -1.00      0.0     1
-2.5                -0.77     50.0     6
-1.5                 3.67     65.0    26
-0.5                 1.59     56.0    43
 0.5                -1.98     51.0    39
 1.5                 3.92     65.0    26
 2.5                -1.78     43.0     7
 3.5                 0.30     67.0     3
 4.5                -1.00      0.0     1
 5.5                -1.00      0.0     1
                      profit accuracy count
                                           
gradient_boosted_diff                  

## Make picks

In [19]:
models = ['linear', 'decision_tree', 'gradient_boosted', 'neural_network', 'random_forest', 'avg']

# Work on an independent copy so the assignments below never write into a slice
today_df = today_df.copy()

# Zero-filled columns kept so the frame has the same columns as before; the
# per-model statistics are stored under explicit '<stat>_<model>' names
today_df['profit'] = 0
today_df['accuracy'] = 0
today_df['count'] = 0

for model in models:
    diff_col = model + '_diff'
    hit_col = model + '_hit'
    profit_col = model + '_profit'

    # Differential profit and accuracy: for every historical prediction-minus-line
    # value, the share of winning bets (%), the total profit and the number of bets
    grouped = df.groupby(diff_col)
    wins = grouped[hit_col].sum()
    bets = grouped[hit_col].count()
    history = pd.DataFrame({
        'accuracy_' + model: (wins / bets * 100).round(),
        'profit_' + model: grouped[profit_col].sum().round(2),
        'count_' + model: bets,
    }).reset_index()

    # Compute difference between line and prediction
    today_df[diff_col] = myround(today_df[model] - today_df['line'])

    # Drop statistics left over from a previous run of this cell so that
    # re-running it does not produce clashing column names in the merge
    stat_cols = [col for col in history.columns if col != diff_col]
    today_df = today_df.drop(columns=stat_cols, errors='ignore')

    # Merge differential's profit and accuracy; a difference that never occurred
    # historically has no match and gets NaN statistics from the left join
    today_df = pd.merge(today_df, history, on=diff_col, how='left')

In [20]:
models = ['linear', 'decision_tree', 'gradient_boosted', 'neural_network', 'random_forest', 'avg']

# Map every expected-profit column to the model it belongs to. A separate name
# is used so `models` keeps meaning "list of model names" after this cell.
expected_profit_cols = {model + '_expected_profit': model for model in models}

# Expected profit: the profit column of each model multiplied by its accuracy column
for col, model in expected_profit_cols.items():
    today_df[col] = today_df['profit_' + model] * today_df['accuracy_' + model]

# Replace every missing value in the frame with 0 so the sum and the
# max-selection below are never fed NaN
today_df = today_df.fillna(0)

# Sum over all models; derived from the model list so it cannot drift out of sync
score_cols = list(expected_profit_cols)
today_df['total_expected_profit'] = today_df[score_cols].sum(axis=1)

# Find model with max expected profit (column name translated back to the model name)
today_df['max_expected_profit'] = today_df[score_cols].idxmax(axis=1).map(expected_profit_cols)

# Predictions for models that result in max expected profit:
# for every row, read the prediction column of its best model
today_df['prediction'] = [
    today_df.at[idx, best] for idx, best in today_df['max_expected_profit'].items()
]
today_df['over_under'] = np.where(today_df['prediction'] > today_df['line'], 'o', 'u')

In [21]:
models = ['linear', 'decision_tree', 'gradient_boosted', 'neural_network', 'random_forest', 'avg']
expected_profits = [model + '_expected_profit' for model in models]

# Format a copy for display: replacing the numeric predictions in today_df itself
# with 'o'/'u' strings would make this cell (and the one above) fail when re-run,
# because the strings can no longer be compared with the line.
picks_view = today_df.copy()
for model in models:
    picks_view[model] = np.where(picks_view[model] > picks_view['line'], 'o', 'u')

cols = ['date', 'visitor', 'home', 'line'] + ['total_expected_profit', 'max_expected_profit'] + models + expected_profits
picks_view[cols].sort_values(['total_expected_profit'], ascending=False)

Unnamed: 0,date,visitor,home,line,total_expected_profit,max_expected_profit,linear,decision_tree,gradient_boosted,neural_network,random_forest,avg,linear_expected_profit,decision_tree_expected_profit,gradient_boosted_expected_profit,neural_network_expected_profit,random_forest_expected_profit,avg_expected_profit
0,2022-03-17,Detroit Pistons,Orlando Magic,24.5,2123.02,avg,u,u,u,u,o,u,-16.5,238.55,342.6,452.4,120.84,985.13


In [22]:
# Keep only the columns that are stored in the saved picks file
today_df = today_df[['date', 'visitor', 'home', 'over_under', 'prediction', 'line', 'over', 'under']]

# Add today's picks to the previously saved ones.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
algo_predictions = pd.read_csv('backend/predictions/3p_predictions_algo.csv').drop(['Unnamed: 0'], axis=1)
algo_predictions = pd.concat([algo_predictions, today_df], ignore_index=True)
algo_predictions['date'] = pd.to_datetime(algo_predictions['date'])

# drop_duplicates keeps the first occurrence, so a pick already saved for a game
# is not replaced by the one computed in this run
algo_predictions = algo_predictions.drop_duplicates(['date', 'home', 'visitor'])

# The index is written as well; it is read back as the 'Unnamed: 0' column that the
# read_csv call above drops
algo_predictions.to_csv('backend/predictions/3p_predictions_algo.csv')
algo_predictions.tail(12)

Unnamed: 0,date,visitor,home,over_under,prediction,line,over,under
27,2022-03-16,Boston Celtics,Golden State Warriors,u,26.8,27.5,-110.0,-130.0
28,2022-03-16,Chicago Bulls,Utah Jazz,u,25.0,26.5,110.0,-150.0
29,2022-03-16,Dallas Mavericks,Brooklyn Nets,u,24.0,24.5,-110.0,-130.0
30,2022-03-16,Denver Nuggets,Washington Wizards,u,24.0,24.5,125.0,-165.0
31,2022-03-16,Los Angeles Lakers,Minnesota Timberwolves,u,26.0,26.5,-145.0,105.0
32,2022-03-16,Milwaukee Bucks,Sacramento Kings,u,25.0,26.5,110.0,-150.0
33,2022-03-16,Oklahoma City Thunder,San Antonio Spurs,o,24.0,21.5,-160.0,120.0
34,2022-03-16,Philadelphia 76ers,Cleveland Cavaliers,o,24.0,21.5,-160.0,120.0
35,2022-03-16,Phoenix Suns,Houston Rockets,o,26.0,24.5,-140.0,100.0
36,2022-03-16,Portland Trail Blazers,New York Knicks,u,25.0,26.5,130.0,-170.0
