# Imports

In [1]:
# import lux
import db_func
import pandas as pd
import numpy as np
from tqdm import notebook
#notebook.tqdm(looping)

try:
    %load_ext autotime
    %load_ext jupyterlab_notify
except:
    !pip3 install ipython-autotime
    %load_ext autotime

<jupyterlab_notify.magics._Notification at 0x7f054a83eac0>

time: 4.73 ms (started: 2021-08-01 22:48:48 -04:00)


## Get Database Connection

In [2]:
conn = db_func.get_conn()

time: 3.29 ms (started: 2021-08-01 22:48:48 -04:00)


# Preprocessing

match_df: The final processed dataset to be used in the machine learning models 

## Populate Dataframes From Database

### SQL Query

In [3]:
match_query = '''SELECT
                    m.match_id,  m.away_id, m.home_id,
                    m.date, m.away_pts, m.home_pts, m.playoff_game,
                    h_ml.home_ml, a_ml.away_ml,
                    h_ps.home_spread, a_ps.away_spread,
                    h_ps.home_ps_odds, a_ps.away_ps_odds,
                    over.over, under.under, ou.spread
                FROM match AS m
                LEFT OUTER JOIN
                (
                    SELECT
                        AVG(decimal_odds) AS home_ml,
                        m.match_id AS match_id
                    FROM
                        odds AS o, team AS t1, team as t2,
                        match AS m
                    WHERE
                        o.bet_type_id = 1 AND
                        o.match_id = m.match_id AND
                        o.team_id = m.home_id
                    GROUP BY m.match_id
                ) AS h_ml ON m.match_id = h_ml.match_id
                LEFT OUTER JOIN
                (
                    SELECT
                        AVG(decimal_odds) AS away_ml,
                        m.match_id AS match_id
                    FROM
                        odds AS o, team AS t1, team as t2,
                        match AS m
                    WHERE
                        o.bet_type_id = 1 AND
                        o.match_id = m.match_id AND
                        o.team_id = m.away_id
                    GROUP BY m.match_id
                ) AS a_ml ON m.match_id = a_ml.match_id
                LEFT OUTER JOIN
                (
                    SELECT
                        AVG(decimal_odds) AS home_ps_odds,
                        AVG(spread) AS home_spread,
                        m.match_id AS match_id
                    FROM
                        odds AS o, team AS t1, team as t2,
                        match AS m
                    WHERE
                        o.bet_type_id = 2 AND
                        o.match_id = m.match_id AND
                        o.team_id = m.home_id
                    GROUP BY m.match_id
                ) AS h_ps ON m.match_id = h_ps.match_id
                LEFT OUTER JOIN
                (
                    SELECT
                        AVG(decimal_odds) AS away_ps_odds,
                        AVG(spread) AS away_spread,
                        m.match_id AS match_id
                    FROM
                        odds AS o, team AS t1, team as t2,
                        match AS m
                    WHERE
                        o.bet_type_id = 2 AND
                        o.match_id = m.match_id AND
                        o.team_id = m.away_id
                    GROUP BY m.match_id
                ) AS a_ps ON m.match_id = a_ps.match_id
                LEFT OUTER JOIN
                (
                    SELECT
                        AVG(decimal_odds) AS under,
                        m.match_id AS match_id
                    FROM
                        odds AS o, match AS m
                    WHERE
                        o.bet_type_id = 3 AND
                        o.over_under = 'under' AND
                        o.match_id = m.match_id
                    GROUP BY m.match_id
                ) AS under ON m.match_id = under.match_id
                LEFT OUTER JOIN
                (
                    SELECT
                        AVG(decimal_odds) AS over,
                        m.match_id AS match_id
                    FROM
                        odds AS o, match AS m
                    WHERE
                        o.bet_type_id = 3 AND
                        o.over_under = 'over' AND
                        o.match_id = m.match_id
                    GROUP BY m.match_id
                ) AS over ON m.match_id = over.match_id
                LEFT OUTER JOIN
                (
                    SELECT
                        AVG(spread) AS spread,
                        m.match_id AS match_id
                    FROM
                        odds AS o, match AS m
                    WHERE
                        o.bet_type_id = 3 AND
                        o.match_id = m.match_id
                    GROUP BY m.match_id
                ) AS ou ON m.match_id = ou.match_id
                WHERE
                    m.date >= DATE('2007-10-30')
                ORDER BY date ASC
                '''


season_query = '''SELECT *
                  FROM season'''

player_performance_query = '''SELECT *
                              FROM player_performance'''


match_df = pd.read_sql(match_query, conn)
match_df.set_index('match_id', inplace=True)

season_df = pd.read_sql(season_query, conn)

playerperf_df = pd.read_sql(player_performance_query, conn)

time: 22.4 s (started: 2021-08-01 22:48:48 -04:00)


## Season

Set the season for each match

In [4]:
def get_season(date):
    return season_df[(season_df['start_date'] <= date) &
                     (season_df['end_date'] >= date)]['season'].values[0]

time: 249 µs (started: 2021-08-01 22:49:11 -04:00)


In [5]:
match_df['season'] = match_df['date'].map(get_season)

time: 4.42 s (started: 2021-08-01 22:49:11 -04:00)


Only matches from seasons 2008-2021 will be used 

# Feature Engineering

## Margin of Victory/Loss (MOVL) with respect to the home team

In [6]:
match_df['movl'] = match_df['home_pts'] - match_df['away_pts']

time: 1.45 ms (started: 2021-08-01 22:49:15 -04:00)


## Team Elo Rating

Each team starts at 1500 elo

In [7]:
match_df['home_elo'] = 1500
match_df['away_elo'] = 1500

time: 1.08 ms (started: 2021-08-01 22:49:15 -04:00)


### Get the previous match of each team to aid elo calculation

In [8]:
def get_prev_match(date, team_id, match_df):
    return match_df[(match_df["date"] < date) &
                    ((match_df["home_id"] == team_id) |
                     (match_df["away_id"] == team_id))].tail(1)

time: 524 µs (started: 2021-08-01 22:49:15 -04:00)


In [9]:
from datetime import datetime
get_prev_match(datetime(2021,7,8).date(), 17, match_df)

Unnamed: 0_level_0,away_id,home_id,date,away_pts,home_pts,playoff_game,home_ml,away_ml,home_spread,away_spread,home_ps_odds,away_ps_odds,over,under,spread,season,movl,home_elo,away_elo
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
17824,17,24,2021-07-06,105.0,118.0,0.0,1.476624,2.796667,-4.944444,4.944444,1.933069,1.90748,1.917301,1.92247,219.555556,2021,13.0,1500,1500


time: 10.4 ms (started: 2021-08-01 22:49:15 -04:00)


### Team elo calculation

In [10]:
def get_prev_elo(team_id, season, prev_match):

    if prev_match.empty:
        prev_elo = 1500
    elif team_id == prev_match['home_id'].values[0]:
        prev_elo = prev_match['home_elo'].values[0]
    elif team_id == prev_match['away_id'].values[0]:
        prev_elo = prev_match['away_elo'].values[0]
    else: 
        print('err')

    if (not prev_match.empty and
            (prev_match['season'].values[0]
             != season)):
        #print(prev_elo, prev_elo * 0.75 + 1505 * 0.25)
        prev_elo = prev_elo * 0.75 + 1505 * 0.25

    return prev_elo

time: 284 µs (started: 2021-08-01 22:49:15 -04:00)


In [11]:
# def update_elo(prev_elo, prev_opp_elo, movl, home_team):
#     if home_team:
#         prev_elo += 100
#     if movl > 0:
#         score = 1
#     else:
#         score = 0

#     exp_score = 1. / (1 + math.pow(10, ((prev_opp_elo - prev_elo)/400.)))

#     k = 20 * (math.pow((abs(movl)+3), 0.8)) / \
#         (7.5 + 0.006 * abs(prev_opp_elo-prev_elo))
# #     if movl > 0:
# #         k = 20 * (math.pow((movl+3), 0.8)) / \
# #         (7.5 + 0.006 * prev_elo - prev_opp_elo)
# #     else:
# #         k = 20 * (math.pow((abs(movl)+3), 0.8)) / \
# #         (7.5 + 0.006 * -1*(prev_elo - prev_opp_elo))

#     updated_elo = prev_elo + k * (score - exp_score)
#     return updated_elo

time: 1.82 ms (started: 2021-08-01 22:49:15 -04:00)


In [12]:

def update_elo(home_elo, away_elo, movl):
    elo_diff = home_elo + 100.0 - away_elo
    if movl > 0:
        h_s = 1.0
        a_s = 0.0
        multiplier = ((movl+3)**(0.8))/(7.5+0.006*elo_diff)

    else:
        h_s = 0.0
        a_s = 1.0
        multiplier = ((-movl+3)**(0.8))/(7.5+0.006*(-elo_diff))
        
    exp_h_s = 1.0 / (1.0 + 10.0 ** (-elo_diff/400.0))
    exp_a_s = 1.0 - exp_h_s
    
    k = 20.0 * multiplier

    new_home_elo = home_elo + k * (h_s - exp_h_s)
    new_away_elo = away_elo + k * (a_s - exp_a_s)

    return (new_home_elo, new_away_elo)

time: 694 µs (started: 2021-08-01 22:49:15 -04:00)


In [13]:
match_df['home_elo'] = 1500.0
match_df['away_elo'] = 1500.0
import math
for idx, row in match_df.iterrows():
    prev_h_match = get_prev_match(row['date'], row['home_id'], match_df)
    prev_a_match = get_prev_match(row['date'], row['away_id'], match_df)
    
    prev_h_elo = get_prev_elo(
        row['home_id'], row['season'], prev_h_match)
    prev_a_elo = get_prev_elo(
        row['away_id'], row['season'], prev_a_match)    
    new_elos = update_elo(prev_h_elo, prev_a_elo, row['movl'])
    match_df.at[idx, 'home_elo'] = new_elos[0]
    match_df.at[idx, 'away_elo'] = new_elos[1]

time: 37.5 s (started: 2021-08-01 22:49:15 -04:00)


In [50]:
get_prev_elo(
    17, 2021,
    get_prev_match(datetime(2021, 7, 8).date(), 17, match_df))

1500.0

time: 3.35 ms (started: 2021-08-01 05:10:35 -04:00)


In [14]:
# import math
# for idx, row in match_df.iterrows():
#     prev_h_match = get_prev_match(row['date'], row['home_id'], match_df)
#     prev_a_match = get_prev_match(row['date'], row['away_id'], match_df)


#     prev_h_elo = get_prev_elo(
#         row['home_id'], row['season'], prev_h_match)
#     prev_a_elo = get_prev_elo(
#         row['away_id'], row['season'], prev_a_match)
#     match_df.at[idx, 'home_elo'] = update_elo(
#         prev_h_elo, prev_a_elo, row['movl'], 1)
#     match_df.at[idx, 'away_elo'] = update_elo(
#         prev_a_elo, prev_h_elo, -1*row['movl'], 0)

time: 154 µs (started: 2021-08-01 04:39:53 -04:00)


## Player Efficiency Rating (PER) Sum of Last 5 Games

In [15]:
def player_efficiency_rating(match_stats):
    per = 0
    if match_stats['sp'] > 0:
        per = match_stats['fgm'] * 85.910 +
        match_stats['steals']
    return per


# calculate per for all matches

def get_injured_players(match):
    return match[match['inactive'] == 1]['player_id']


def team_per(match):
    get_injured_players(match)
    # adjust per for injured players

# PER sum of the last 'x' matches played by a team


def per_sum(match_id, x):

time: 4.78 ms (started: 2021-08-01 04:39:53 -04:00)


### Calculate PER for all matches

In [16]:
# match_df['sum_per'] =
# match_df.apply(lambda x: team_per)

time: 374 µs (started: 2021-08-01 04:39:53 -04:00)


## Odds 

### Moneyline Odds

In [17]:
# match_df['vegas_odds']

time: 322 µs (started: 2021-08-01 04:39:53 -04:00)


### Point Spread Odds

### Over/Under

In [53]:
s = 0
season = 2011
for i in range(1, 31):
    print(match_df[(match_df['home_id'] == i) & (match_df['season'] == season)].tail(1)['home_elo'].values[0],
          match_df[(match_df['home_id'] == i) & (match_df['season'] == season)].tail(1)['home_id'].values[0])
    s += match_df[(match_df['home_id'] == i) & (match_df['season']
                                                == season)].tail(1)['home_elo'].values[0]
s/30

1482.1478553934462 1
1630.9944622054159 2
1298.0382314689004 3
1394.3242449972176 4
1673.8590292962251 5
1324.3022593622798 6
1721.099277299603 7
1621.2227386822135 8
1382.4925310882318 9
1489.3917139323496 10
1563.3412850962 11
1466.9265606028896 12
1436.5175891766519 13
1641.2818202925184 14
1615.9080963709823 15
1700.8512668468118 16
1470.2376404664806 17
1268.6478415797794 18
1499.2723144575862 19
1474.1856281978787 20
1661.6858139398244 21
1614.5014149293957 22
1523.633954985225 23
1492.8499876958601 24
1548.1477552856668 25
1420.3354772427538 26
1618.110974130699 27
1301.215526602053 28
1432.7940162691789 29
1332.1401222700247 30


1503.3485810054783

time: 47 ms (started: 2021-08-01 05:11:53 -04:00)


In [19]:
match_df[(match_df['home_id'] == 10) & (
    match_df['season'] == 2017)].tail(1)['home_elo'].values[0]

1412

time: 2.06 ms (started: 2021-08-01 04:39:53 -04:00)


In [20]:
# elos_df = match_df[(match_df['home_id'] == 1) | (match_df['away_id'] == 1)]

time: 332 µs (started: 2021-08-01 04:39:53 -04:00)


In [21]:
# okc_df = match_df[(match_df['home_id'] == 21) & (match_df['season'] == '2021')]

time: 470 µs (started: 2021-08-01 04:39:53 -04:00)


In [48]:
match_df[match_df['home_elo'] > 1700]['home_elo']

Series([], Name: home_elo, dtype: int64)

time: 1.75 ms (started: 2021-08-01 05:10:11 -04:00)
