# Imports

In [1]:
# import lux
import db_func
import pandas as pd
import numpy as np
from tqdm import notebook
#notebook.tqdm(looping)

try:
    %load_ext autotime
    %load_ext jupyterlab_notify
except:
    !pip3 install ipython-autotime
    %load_ext autotime

<jupyterlab_notify.magics._Notification at 0x7f7d853f2cd0>

time: 8.78 ms (started: 2021-08-01 02:11:52 -04:00)


## Get Database Connection

In [2]:
# Open the database connection that the pd.read_sql calls in this notebook use.
# NOTE(review): get_conn() is a project-local helper; the connection type and
# who is responsible for closing it are not visible here — confirm.
conn = db_func.get_conn()

time: 34 ms (started: 2021-08-01 02:11:53 -04:00)


# Preprocessing

`match_df`: the final processed dataset that is fed to the machine learning models.

## Populate Dataframes From Database

### SQL Query

In [3]:
# One row per match (dated 2007-10-30 or later, oldest first) from `match`,
# LEFT JOINed to seven per-match averages taken over the `odds` table:
#   h_ml / a_ml  - mean decimal odds for the home / away team, bet_type_id = 1
#   h_ps / a_ps  - mean decimal odds and mean spread for the home / away team,
#                  bet_type_id = 2
#   under / over - mean decimal odds for each over_under side, bet_type_id = 3
#   ou           - mean spread over all bet_type_id = 3 rows
# (presumably 1 = moneyline, 2 = point spread, 3 = over/under, judging by the
# aliases — TODO confirm against the bet type table).
# LEFT JOINs keep matches that have no odds rows; their odds columns are NULL.
#
# The odds subqueries join only `odds` and `match`. The `team` table is not
# listed in any FROM clause: none of its columns are used, and an
# unconstrained comma join against it (twice) would multiply every odds row by
# the square of the team count before averaging, and would yield no odds at
# all if `team` were empty.
match_query = '''SELECT
                    m.match_id,  m.away_id, m.home_id,
                    m.date, m.away_pts, m.home_pts, m.playoff_game,
                    h_ml.home_ml, a_ml.away_ml,
                    h_ps.home_spread, a_ps.away_spread,
                    h_ps.home_ps_odds, a_ps.away_ps_odds,
                    over.over, under.under, ou.spread
                FROM match AS m
                LEFT OUTER JOIN
                (
                    SELECT
                        AVG(decimal_odds) AS home_ml,
                        m.match_id AS match_id
                    FROM
                        odds AS o, match AS m
                    WHERE
                        o.bet_type_id = 1 AND
                        o.match_id = m.match_id AND
                        o.team_id = m.home_id
                    GROUP BY m.match_id
                ) AS h_ml ON m.match_id = h_ml.match_id
                LEFT OUTER JOIN
                (
                    SELECT
                        AVG(decimal_odds) AS away_ml,
                        m.match_id AS match_id
                    FROM
                        odds AS o, match AS m
                    WHERE
                        o.bet_type_id = 1 AND
                        o.match_id = m.match_id AND
                        o.team_id = m.away_id
                    GROUP BY m.match_id
                ) AS a_ml ON m.match_id = a_ml.match_id
                LEFT OUTER JOIN
                (
                    SELECT
                        AVG(decimal_odds) AS home_ps_odds,
                        AVG(spread) AS home_spread,
                        m.match_id AS match_id
                    FROM
                        odds AS o, match AS m
                    WHERE
                        o.bet_type_id = 2 AND
                        o.match_id = m.match_id AND
                        o.team_id = m.home_id
                    GROUP BY m.match_id
                ) AS h_ps ON m.match_id = h_ps.match_id
                LEFT OUTER JOIN
                (
                    SELECT
                        AVG(decimal_odds) AS away_ps_odds,
                        AVG(spread) AS away_spread,
                        m.match_id AS match_id
                    FROM
                        odds AS o, match AS m
                    WHERE
                        o.bet_type_id = 2 AND
                        o.match_id = m.match_id AND
                        o.team_id = m.away_id
                    GROUP BY m.match_id
                ) AS a_ps ON m.match_id = a_ps.match_id
                LEFT OUTER JOIN
                (
                    SELECT
                        AVG(decimal_odds) AS under,
                        m.match_id AS match_id
                    FROM
                        odds AS o, match AS m
                    WHERE
                        o.bet_type_id = 3 AND
                        o.over_under = 'under' AND
                        o.match_id = m.match_id
                    GROUP BY m.match_id
                ) AS under ON m.match_id = under.match_id
                LEFT OUTER JOIN
                (
                    SELECT
                        AVG(decimal_odds) AS over,
                        m.match_id AS match_id
                    FROM
                        odds AS o, match AS m
                    WHERE
                        o.bet_type_id = 3 AND
                        o.over_under = 'over' AND
                        o.match_id = m.match_id
                    GROUP BY m.match_id
                ) AS over ON m.match_id = over.match_id
                LEFT OUTER JOIN
                (
                    SELECT
                        AVG(spread) AS spread,
                        m.match_id AS match_id
                    FROM
                        odds AS o, match AS m
                    WHERE
                        o.bet_type_id = 3 AND
                        o.match_id = m.match_id
                    GROUP BY m.match_id
                ) AS ou ON m.match_id = ou.match_id
                WHERE
                    m.date >= DATE('2007-10-30')
                ORDER BY date ASC
                '''


# Every column of the `season` table; get_season() later filters the resulting
# frame on its start_date / end_date columns and reads its season column.
season_query = '''SELECT *
                  FROM season'''

# Every column and row of the `player_performance` table.
player_performance_query = '''SELECT *
                              FROM player_performance'''


# Run the three queries over the shared connection. match_df is keyed by
# match_id; the other two frames keep the default integer index.
match_df = pd.read_sql(match_query, conn).set_index('match_id')
season_df = pd.read_sql(season_query, conn)
playerperf_df = pd.read_sql(player_performance_query, conn)

time: 50.6 s (started: 2021-08-01 02:11:53 -04:00)


## Season

Set the season for each match

In [4]:
def get_season(date):
    """Return the 'season' values of every season whose date range covers `date`.

    Reads the module-level `season_df` and keeps the rows where
    start_date <= date <= end_date (both bounds inclusive).

    Returns a pandas Series, not a scalar: it is empty when no season covers
    the date and has more than one entry when date ranges overlap.
    """
    has_started = season_df['start_date'] <= date
    has_not_ended = season_df['end_date'] >= date
    return season_df.loc[has_started & has_not_ended, 'season']

time: 472 µs (started: 2021-08-01 02:12:44 -04:00)


In [5]:
# Look up the season of every match from its date.
# get_season returns the filtered 'season' Series rather than a scalar, so each
# cell of this column holds a Series; get_prev_elo unwraps it with `.values[0]`.
match_df['season'] = match_df['date'].map(get_season)

time: 10.9 s (started: 2021-08-01 02:12:44 -04:00)


Only matches from the 2008-2021 seasons are used; the SQL query above enforces the lower bound by keeping only matches dated 2007-10-30 or later.

# Feature Engineering

## Margin of Victory/Loss (MOVL) with respect to the home team

In [6]:
# Home points minus away points: positive means the home team won.
match_df['movl'] = match_df['home_pts'].sub(match_df['away_pts'])

time: 3.7 ms (started: 2021-08-01 02:12:55 -04:00)


## Team Elo Rating

Each team starts with an Elo rating of 1500.

In [7]:
# Seed both rating columns with the starting Elo as a float.
# The Elo loop later writes fractional ratings into these columns cell by cell
# with `.at`. Seeding with the int literal 1500 creates int64 columns, and
# writing a float into an int64 column is version dependent in pandas: recent
# releases upcast with an incompatible-dtype deprecation warning (slated to
# become an error), and older releases could silently truncate the value.
match_df['home_elo'] = 1500.0
match_df['away_elo'] = 1500.0

time: 19.5 ms (started: 2021-08-01 02:12:55 -04:00)


### Get the previous match of each team to aid elo calculation

In [9]:
def get_prev_match(date, team_id, match_df):
    """Return the last row of `match_df` played strictly before `date` by `team_id`.

    A row qualifies when its 'date' is earlier than `date` and `team_id`
    appears as either 'home_id' or 'away_id'. The result is a DataFrame with
    at most one row: the final qualifying row in `match_df`'s current row
    order, so it is the most recent match only if the frame is sorted by date
    ascending. It is empty when the team has no earlier match.
    """
    played_earlier = match_df["date"] < date
    team_involved = ((match_df["home_id"] == team_id) |
                     (match_df["away_id"] == team_id))
    earlier_matches = match_df.loc[played_earlier & team_involved]
    return earlier_matches.iloc[-1:]

time: 17 ms (started: 2021-08-01 02:12:55 -04:00)


### Team elo calculation

In [11]:
def get_prev_elo(team_id, season, prev_match):
    """Return the rating `team_id` carries into a match.

    Parameters
    ----------
    team_id : id of the team whose rating is wanted.
    season : Series whose first value identifies the season of the match being
        rated (the shape produced by get_season).
    prev_match : DataFrame holding the team's previous match as its first row,
        or an empty DataFrame when there is none. Each cell of its 'season'
        column is itself a Series.

    Returns
    -------
    1500 when there is no previous match. Otherwise the team's rating from the
    previous match ('home_elo' if it was the home side there, else
    'away_elo'), blended as 0.75 * rating + 0.25 * 1505 when the previous
    match belongs to a different season.
    """
    # No history yet: start from the default rating. `season` is not read here.
    if prev_match.empty:
        return 1500

    was_home = team_id == prev_match['home_id'].values[0]
    rating_column = 'home_elo' if was_home else 'away_elo'
    rating = prev_match[rating_column].values[0]

    # Both sides of the comparison are Series-wrapped, hence the extra
    # `.values[0]` on the stored cell.
    prev_season = prev_match['season'].values[0].values[0]
    if prev_season != season.values[0]:
        # Season rollover: pull the rating a quarter of the way towards 1505.
        rating = rating * 0.75 + 1505 * 0.25

    return rating

time: 399 µs (started: 2021-08-01 02:14:06 -04:00)


In [12]:
def update_elo(prev_elo, prev_opp_elo, movl):
    """Return a team's rating after one match.

    Parameters
    ----------
    prev_elo : the team's rating before the match.
    prev_opp_elo : the opponent's rating before the match.
    movl : margin of victory (positive) or loss (negative) from this team's
        point of view; 0 is a draw.

    The actual score is 1 for a win, 0.5 for a draw and 0 otherwise. The
    expected score is the logistic curve 1 / (1 + 10 ** (gap / 400)) of the
    rating gap. The update step k grows with the absolute margin and shrinks
    as the absolute rating gap widens.
    """
    # Actual result from this team's point of view.
    score = 1 if movl > 0 else (0.5 if movl == 0 else 0)

    # Expected result given the two pre-match ratings.
    rating_gap = prev_opp_elo - prev_elo
    exp_score = 1 / (1 + 10 ** (rating_gap / 400))

    # Margin-scaled step size.
    margin = abs(movl)
    k = 20 * ((margin + 3) ** 0.8) / (7.5 + 0.006 * abs(prev_elo - prev_opp_elo))

    return prev_elo + k * (score - exp_score)

time: 46.4 ms (started: 2021-08-01 02:14:06 -04:00)


In [None]:
# Walk the matches in frame order and overwrite each row's home_elo / away_elo
# with the two teams' ratings after that match. Later rows read the ratings
# written for earlier rows, so the frame must stay sorted by date ascending.
# get_prev_match scans the whole frame twice per row, so the cost grows
# quadratically with the number of matches.
for match_id, game in match_df.iterrows():
    # Each side's latest earlier match (an empty frame when there is none).
    last_home_game = get_prev_match(game['date'], game['home_id'], match_df)
    last_away_game = get_prev_match(game['date'], game['away_id'], match_df)

    # Ratings each side brings into this match.
    home_before = get_prev_elo(game['home_id'], game['season'], last_home_game)
    away_before = get_prev_elo(game['away_id'], game['season'], last_away_game)

    # movl is from the home side's point of view; flip the sign for the away side.
    margin = game['movl']
    match_df.at[match_id, 'home_elo'] = update_elo(home_before, away_before, margin)
    match_df.at[match_id, 'away_elo'] = update_elo(away_before, home_before, -1 * margin)

## Player Efficiency Rating (PER) Sum of Last 5 Games

In [None]:
def player_efficiency_rating(match_stats):
    """Return a player's efficiency rating for one match.

    `match_stats` is any mapping-like object (dict, pandas Series/row) with
    'sp', 'fgm' and 'steals' entries. The rating is fgm * 85.910 + steals when
    'sp' is positive, and 0 otherwise.

    NOTE(review): 'sp' presumably means seconds played — confirm. Only two
    stat terms are included, which looks like a partial PER formula still to
    be completed.
    """
    per = 0
    if match_stats['sp'] > 0:
        # Parentheses let the sum span two lines; a bare trailing `+` is a
        # syntax error.
        per = (match_stats['fgm'] * 85.910 +
               match_stats['steals'])
    return per


# calculate per for all matches

def get_injured_players(match):
    """Return the 'player_id' values of the rows of `match` flagged inactive.

    `match` is a DataFrame with 'inactive' and 'player_id' columns; a row is
    selected when its 'inactive' value equals 1. The result is a Series that
    keeps the original row index.

    NOTE(review): the filter is on 'inactive', which the function name equates
    with injured — confirm that inactive always means injured.
    """
    is_inactive = match['inactive'] == 1
    inactive_rows = match[is_inactive]
    return inactive_rows['player_id']


def team_per(match):
    """Placeholder for a team's PER in one match; currently returns None.

    The only work done is a call to get_injured_players(match), whose result
    is discarded.
    """
    # TODO: adjust per for injured players
    _ = get_injured_players(match)
    return None

def per_sum(match_id, x):
    """PER sum of the last `x` matches played by a team.

    Not implemented yet. The function had no body, which is a syntax error;
    it now raises instead of silently returning None, so an accidental call
    cannot write empty values into the dataset.

    Raises
    ------
    NotImplementedError
        Always, until the calculation is written.
    """
    raise NotImplementedError("per_sum is not implemented yet")

### Calculate PER for all matches

In [None]:
# Evaluate team_per once per match row (axis=1) and store the result.
# Three fixes: the assignment is on one line (splitting it after `=` is a
# syntax error); team_per is passed directly so it is actually called (a
# `lambda x: team_per` returns the function object itself); and axis=1 applies
# it per row rather than per column.
# NOTE(review): team_per is still a stub that reads 'inactive' / 'player_id'
# columns, which match_df is not shown to contain — it probably needs the
# player-performance rows of each match instead. Confirm before running.
match_df['sum_per'] = match_df.apply(team_per, axis=1)

## Odds 

### Moneyline Odds

In [None]:
# NOTE(review): no 'vegas_odds' column is created in the visible part of this
# notebook, so this lookup raises KeyError unless it is defined elsewhere.
match_df['vegas_odds']

### Point Spread Odds

### Over/Under