# Develop an initial model

In [1]:
# Python modules
import dateutil
from datetime import datetime
import re

# Data science packages
import pandas as pd
import numpy as np

# Scikit Learn utility classes & functions
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import make_scorer, mean_absolute_error

# Scikit Learn models
from sklearn.linear_model import Lasso, ElasticNet, Ridge
from sklearn.ensemble import (AdaBoostRegressor, BaggingRegressor, ExtraTreesRegressor, GradientBoostingRegressor,
                              RandomForestRegressor)
from sklearn.svm import SVR, LinearSVR

# Seed NumPy's legacy global random number generator with a fixed value.
# NOTE(review): train_test_split is later called without random_state, so the split
# presumably depends on this global seed — confirm before moving or removing it.
np.random.seed(42)

  from numpy.core.umath_tests import inner1d


In [2]:
# Merge match data & betting data

# Match results: the parsed timestamp is kept as `datetime`, while its calendar date
# becomes part of the index used to line matches up with the betting rows
match_keys = ['date', 'venue', 'home_team', 'away_team']
raw_match_df = pd.read_csv('../data/ft_match_list.csv', parse_dates=['date'])
match_df = (raw_match_df.rename(columns={'date': 'datetime'})
                        .assign(date=lambda frame: frame['datetime'].map(lambda stamp: stamp.date()))
                        .set_index(match_keys, drop=True))

# Betting data has rows flagged home == 1 and rows flagged home == 0; split on the flag and
# prefix each half's columns so the two halves can sit side by side after the merge
# (presumably one row per team per match — verify against the CSV)
betting_df = pd.read_csv('../data/afl_betting.csv', index_col=('date', 'venue'), parse_dates=['date'])
is_home_row = betting_df['home'] == 1
is_away_row = betting_df['home'] == 0
home_df = betting_df[is_home_row].drop('home', axis=1).add_prefix('home_')
away_df = betting_df[is_away_row].drop('home', axis=1).add_prefix('away_')
paired_betting = home_df.merge(away_df, on=('date', 'venue'))
home_away_betting_df = paired_betting.reset_index().set_index(match_keys)

# Only matches after the start of 2010 are joined column-wise with the betting rows
matches_since_2010 = match_df[match_df['datetime'] > '2010-01-01']
df = (pd.concat([matches_since_2010, home_away_betting_df], axis=1)
        # The 2017 Grand Final is missing from the betting data for some reason,
        # but that's the only row that should get dropped
        .dropna()
        .reset_index()
        .drop('date', axis=1))
df

Unnamed: 0,venue,home_team,away_team,datetime,crowd,season_round,home_score,away_score,home_win_odds,home_line_odds,away_win_odds,away_line_odds
0,MCG,Richmond,Carlton,2010-03-25 19:10:00,72010,Round 1,64.0,120.0,2.89,16.5,1.42,-16.5
1,MCG,Geelong,Essendon,2010-03-26 19:40:00,57772,Round 1,125.0,94.0,1.21,-28.5,4.50,28.5
2,ANZ Stadium,Sydney,St Kilda,2010-03-27 19:10:00,31330,Round 1,88.0,96.0,2.64,14.5,1.49,-14.5
3,Gabba,Brisbane,West Coast,2010-03-27 19:30:00,29201,Round 1,114.0,82.0,1.32,-21.5,3.42,21.5
4,MCG,Melbourne,Hawthorn,2010-03-27 14:10:00,45615,Round 1,61.0,117.0,4.95,31.5,1.18,-31.5
5,AAMI Stadium,Port Adelaide,North Melbourne,2010-03-28 12:40:00,21205,Round 1,96.0,82.0,1.42,-16.5,2.89,16.5
6,Domain Stadium,Fremantle,Adelaide,2010-03-28 17:10:00,30976,Round 1,118.0,62.0,1.96,1.5,1.85,-1.5
7,Etihad Stadium,Western Bulldogs,Collingwood,2010-03-28 14:10:00,49000,Round 1,93.0,129.0,1.62,-8.5,2.31,8.5
8,Gabba,Brisbane,Carlton,2010-04-01 19:40:00,36780,Round 2,107.0,88.0,1.34,-19.5,3.30,19.5
9,Domain Stadium,West Coast,Port Adelaide,2010-04-03 17:40:00,37010,Round 2,86.0,89.0,1.63,-10.5,2.30,10.5


In [3]:
# Clean & simplify data for basic scikit learn model

# Regular-season labels end in "round <number>"; finals are recognised by keyword
DIGITS = re.compile(r'round\s+(\d+)$', flags=re.I)
QUALIFYING = re.compile('qualifying', flags=re.I)
ELIMINATION = re.compile('elimination', flags=re.I)
SEMI = re.compile('semi', flags=re.I)
PRELIMINARY = re.compile('preliminary', flags=re.I)
GRAND = re.compile('grand', flags=re.I)

# Finals patterns paired with the round number they map to, checked in this order.
# Qualifying and elimination finals share the same number.
_FINALS_ROUND_NUMBERS = (
    (QUALIFYING, 25),
    (ELIMINATION, 25),
    (SEMI, 26),
    (PRELIMINARY, 27),
    (GRAND, 28),
)

def get_round_number(x):
    """Convert a round label into an integer round number.

    Parameters
    ----------
    x : str
        Round label, e.g. 'Round 7' or a finals label containing one of the
        keywords 'qualifying', 'elimination', 'semi', 'preliminary' or 'grand'
        (matching is case-insensitive).

    Returns
    -------
    int
        The number captured from a 'round <n>' label, or 25-28 for finals labels.

    Raises
    ------
    ValueError
        If the label matches none of the known patterns. ValueError is a subclass
        of Exception, so callers that catch Exception keep working.
    """
    digits = DIGITS.search(x)
    if digits is not None:
        return int(digits.group(1))

    for pattern, round_number in _FINALS_ROUND_NUMBERS:
        if pattern.search(x) is not None:
            return round_number

    raise ValueError(f"Round label {x} doesn't match any known patterns")

# Filter out 2017 & 2018 seasons, because they will eventually serve as test sets.
# The derived columns are computed on the full frame and aligned on the index by assign().
derived_columns = {
    'home_score_diff': df['home_score'] - df['away_score'],
    'away_score_diff': df['away_score'] - df['home_score'],
    'round_number': df['season_round'].map(get_round_number),
    'year': df['datetime'].map(lambda x: x.year),
}
# Columns not carried forward into the simplified frame
unused_columns = ['venue', 'datetime', 'crowd', 'season_round']

match_df = (df.loc[df['datetime'] < '2017-01-01']
              .assign(**derived_columns)
              .drop(unused_columns, axis=1))
match_df

Unnamed: 0,home_team,away_team,home_score,away_score,home_win_odds,home_line_odds,away_win_odds,away_line_odds,home_score_diff,away_score_diff,round_number,year
0,Richmond,Carlton,64.0,120.0,2.89,16.5,1.42,-16.5,-56.0,56.0,1,2010
1,Geelong,Essendon,125.0,94.0,1.21,-28.5,4.50,28.5,31.0,-31.0,1,2010
2,Sydney,St Kilda,88.0,96.0,2.64,14.5,1.49,-14.5,-8.0,8.0,1,2010
3,Brisbane,West Coast,114.0,82.0,1.32,-21.5,3.42,21.5,32.0,-32.0,1,2010
4,Melbourne,Hawthorn,61.0,117.0,4.95,31.5,1.18,-31.5,-56.0,56.0,1,2010
5,Port Adelaide,North Melbourne,96.0,82.0,1.42,-16.5,2.89,16.5,14.0,-14.0,1,2010
6,Fremantle,Adelaide,118.0,62.0,1.96,1.5,1.85,-1.5,56.0,-56.0,1,2010
7,Western Bulldogs,Collingwood,93.0,129.0,1.62,-8.5,2.31,8.5,-36.0,36.0,1,2010
8,Brisbane,Carlton,107.0,88.0,1.34,-19.5,3.30,19.5,19.0,-19.0,2,2010
9,West Coast,Port Adelaide,86.0,89.0,1.63,-10.5,2.30,10.5,-3.0,3.0,2,2010


In [4]:
# Reshape match_df to split each match into 2 rows (1 per participating team) in order to calculate cumulative stats
# Add cumulative stats & ladder position features

# Get cumulative stats by team & year, then group by team and shift one row
# in order to carry over end of last season for a team's first round ranking
def team_year_cum_col(df, stat_label):
    """Running total of a column within each (level 0, level 1) group, lagged by one row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a MultiIndex; callers in this notebook pass frames indexed by
        team, year and round number.
    stat_label : str
        Name of the column to accumulate.

    Returns
    -------
    pandas.Series
        Cumulative sum of `stat_label` per (level 0, level 1) group, shifted down one
        row within each level-0 group. The first row of every level-0 group is NaN,
        and the first row of a later level-1 group holds the previous group's final total.
    """
    running_total = df.groupby(level=[0, 1])[stat_label].cumsum()
    # Shift per level-0 group only, so totals carry over across level-1 boundaries
    return running_total.groupby(level=[0]).shift()

def team_year_percent(df):
    """Ratio of lagged cumulative 'score' to lagged cumulative 'oppo_score'.

    Both totals come from team_year_cum_col, so each row reflects results up to
    (but not including) that row. The result is a Series named 'cum_percent'.
    """
    points_for = team_year_cum_col(df, 'score')
    points_against = team_year_cum_col(df, 'oppo_score')
    return (points_for / points_against).rename('cum_percent')

def team_year_win_points(df):
    """Lagged cumulative win points: 4 per win and 2 per draw.

    A win is a row where 'score' exceeds 'oppo_score'; a draw is a row where they
    are equal. The running total is produced by team_year_cum_col, which lags it by
    one row so that each row only reflects earlier results. The result is a Series
    named 'cum_win_points'.
    """
    is_win = df['score'] > df['oppo_score']
    is_draw = df['score'] == df['oppo_score']
    # Boolean flags multiply as 0/1, giving 4, 2 or 0 points per row
    results = ((is_win * 4) + (is_draw * 2)).to_frame('win_points')

    return team_year_cum_col(results, 'win_points').rename('cum_win_points')

def team_year_ladder_position(df):
    """Rank teams within each (year, round_number) by cumulative win points, then percent.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame accepted by team_year_percent and team_year_win_points, whose index
        levels include 'team', 'year' and 'round_number'.

    Returns
    -------
    pandas.Series
        Series named 'ladder_position', indexed by (team, year, round_number), where
        1 is the team with the most cumulative win points (ties broken by higher
        cumulative percent) in that round's pivot row.
    """
    # Pivot to get round-by-round match points and cumulative percent:
    # one row per (year, round_number), one column per (stat, team)
    ladder_pivot_table = (pd.concat([team_year_percent(df), team_year_win_points(df)], axis=1)
                            .pivot_table(index=['year', 'round_number'],
                                         values=['cum_win_points', 'cum_percent'],
                                         columns='team',
                                         aggfunc={'cum_win_points': np.sum, 'cum_percent': np.mean}))

    # To get round-by-round ladder ranks, we sort each round by win points & percent,
    # then record each team's 1-based position in the sorted order
    ladder_index = []
    ladder_values = []

    for idx, row in ladder_pivot_table.iterrows():
        # unstack(level=0) turns the (stat, team) row into a team x stat table
        sorted_row = (row.unstack(level=0)
                         .sort_values(['cum_win_points', 'cum_percent'], ascending=False))

        # Iterate the index directly: Index.get_values() was deprecated in pandas 0.25
        # and removed in 1.0
        for position, team_name in enumerate(sorted_row.index, start=1):
            ladder_index.append((team_name, *idx))
            ladder_values.append(position)

    return pd.Series(ladder_values,
                     index=pd.MultiIndex.from_tuples(ladder_index, names=('team', 'year', 'round_number')),
                     name='ladder_position')

def team_year_oppo_feature(column_label):
    """Build a callable that re-keys a feature column by opponent.

    Parameters
    ----------
    column_label : str
        Name of the column to look up from the opponent's point of view.

    Returns
    -------
    callable
        Function taking a DataFrame with 'year', 'round_number', 'oppo_team' and
        `column_label` columns. It returns a one-column frame named
        'oppo_<column_label>', indexed by (team, year, round_number) and sorted by
        that index, where 'team' holds the values that were in 'oppo_team'.
    """
    selected_columns = ['year', 'round_number', 'oppo_team', column_label]
    # We switch out oppo_team for team in the index, then assign feature as oppo_{feature_column}
    column_renames = {'oppo_team': 'team', column_label: f'oppo_{column_label}'}

    def oppo_feature(x):
        relabelled = x.loc[:, selected_columns].rename(columns=column_renames)
        return relabelled.set_index(['team', 'year', 'round_number']).sort_index()

    return oppo_feature

# NOTE(review): the two commented-out lines below are unused leftovers
# home_cols = match_df.columns[match_df.columns.str.match(r'home_')]
# away_cols = match_df.columns[match_df.columns.str.match(r'away_')]

# Home-team perspective of every match: 'home_*' columns lose their prefix and
# 'away_*' columns become 'oppo_*'. at_home is 1.0 for all of these rows.
# The index columns are also kept as regular columns (drop=False).
home_df = (match_df.assign(at_home=np.ones(len(match_df)))
                   .rename(columns=lambda x: x.replace('home_', ''))
                   .rename(columns=lambda x: x.replace('away_', 'oppo_'))
                   .set_index(['team', 'year', 'round_number'], drop=False))
# Away-team perspective: the mirror image, with at_home set to 0.0
away_df = (match_df.assign(at_home=np.zeros(len(match_df)))
                   .rename(columns=lambda x: x.replace('away_', ''))
                   .rename(columns=lambda x: x.replace('home_', 'oppo_'))
                   .set_index(['team', 'year', 'round_number'], drop=False))

# Stack both perspectives so each match contributes two rows, one per team
team_df = (pd.concat([home_df, away_df], join='inner')
             .sort_index()
             # team_year_ladder_position recomputes cumulative percent & win points from the
             # score columns itself, so it does not rely on the cum_percent / cum_win_points
             # columns assigned alongside it here.
             # last_week_score is the score from the team's previous row (shifted per team).
             .assign(ladder_position=team_year_ladder_position,
                     cum_percent=team_year_percent,
                     cum_win_points=team_year_win_points,
                     last_week_score=lambda x: x.groupby(level=0)['score'].shift())
             # Each oppo_* feature reads the column created in the previous assign, so these
             # need to be assigned in a separate, later step
             .assign(oppo_ladder_position=team_year_oppo_feature('ladder_position'),
                     oppo_cum_percent=team_year_oppo_feature('cum_percent'),
                     oppo_cum_win_points=team_year_oppo_feature('cum_win_points'),
                     oppo_last_week_score=team_year_oppo_feature('last_week_score'))
             # Drop every row with a missing value. The shifted features are NaN for a team's
             # first row in the data, which the original note describes as dropping the noisy
             # first round. NOTE(review): confirm no other rows are lost here.
             .dropna())

team_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,team,oppo_team,score,oppo_score,win_odds,line_odds,oppo_win_odds,oppo_line_odds,score_diff,oppo_score_diff,...,year,at_home,ladder_position,cum_percent,cum_win_points,last_week_score,oppo_ladder_position,oppo_cum_percent,oppo_cum_win_points,oppo_last_week_score
team,year,round_number,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
Adelaide,2010,2,Adelaide,Sydney,75.0,118.0,1.54,-12.5,2.49,12.5,-43.0,43.0,...,2010,1.0,15,0.525424,0.0,62.0,9,0.916667,0.0,88.0
Adelaide,2010,3,Adelaide,Melbourne,41.0,57.0,1.52,-13.5,2.55,13.5,-16.0,16.0,...,2010,0.0,14,0.580508,0.0,75.0,12,0.719212,0.0,85.0
Adelaide,2010,4,Adelaide,Carlton,55.0,103.0,1.81,-2.5,2.01,2.5,-48.0,48.0,...,2010,1.0,15,0.607509,0.0,41.0,10,1.063910,4.0,75.0
Adelaide,2010,5,Adelaide,Western Bulldogs,72.0,121.0,7.50,40.5,1.09,-40.5,-49.0,49.0,...,2010,0.0,15,0.588384,0.0,55.0,8,1.082873,8.0,79.0
Adelaide,2010,6,Adelaide,Port Adelaide,74.0,97.0,2.55,12.5,1.52,-12.5,-23.0,23.0,...,2010,1.0,15,0.589942,0.0,72.0,10,0.801670,12.0,54.0
Adelaide,2010,7,Adelaide,Richmond,104.0,54.0,1.20,-29.5,4.66,29.5,50.0,-50.0,...,2010,1.0,15,0.617264,0.0,74.0,16,0.491413,0.0,53.0
Adelaide,2010,8,Adelaide,North Melbourne,75.0,84.0,2.38,9.5,1.59,-9.5,-9.0,9.0,...,2010,0.0,15,0.723054,4.0,104.0,11,0.780952,12.0,91.0
Adelaide,2010,9,Adelaide,Brisbane,93.0,81.0,2.14,5.5,1.72,-5.5,12.0,-12.0,...,2010,1.0,15,0.742021,4.0,75.0,9,0.920792,16.0,74.0
Adelaide,2010,10,Adelaide,St Kilda,76.0,123.0,4.25,26.5,1.23,-26.5,-47.0,47.0,...,2010,0.0,15,0.781513,8.0,93.0,5,1.172144,24.0,91.0
Adelaide,2010,11,Adelaide,Fremantle,105.0,82.0,2.89,17.5,1.42,-17.5,23.0,-23.0,...,2010,1.0,15,0.760460,8.0,76.0,2,1.278221,32.0,139.0


In [5]:
# Next we set up data for regressors

def regression_accuracy(y_true, y_pred, **kwargs):
    """Fraction of predictions that fall on the same side of zero as the true value.

    Parameters
    ----------
    y_true, y_pred : array-like supporting element-wise comparison
        True and predicted values of equal length.
    **kwargs
        Accepted and ignored.

    Returns
    -------
    float
        Mean of the per-element agreement flags. A value of exactly 0 in either
        input satisfies both comparisons, so that element always counts as correct.
    """
    both_non_negative = (y_true >= 0) & (y_pred >= 0)
    both_non_positive = (y_true <= 0) & (y_pred <= 0)
    same_side = both_non_negative | both_non_positive
    return np.mean(same_side.astype(int))

# Columns that describe the match result itself; excluded from the features so the
# label cannot leak into them
result_columns = ['score', 'oppo_score', 'score_diff', 'oppo_score_diff']
model_features = pd.get_dummies(team_df.drop(result_columns, axis=1))
model_labels = team_df['score_diff']

# NOTE(review): team_df holds two rows per match (one per team) and this split is by row,
# so both views of one match may land on opposite sides of the split — confirm this is acceptable
X_train, X_test, y_train, y_test = train_test_split(model_features, model_labels)

# Custom scorer wrapping regression_accuracy for use with cross_val_score
reg_acc = make_scorer(regression_accuracy)
# Use standard scaler, because many of these estimators are sensitive to scale of different features
scaler = StandardScaler()

In [6]:
# Just using basic linear models (plus a linear SVR) to validate some ideas around how to
# shape the data & optimise. Ensemble models such as RandomForest are evaluated in a separate cell.
estimators = (Lasso(), ElasticNet(), Ridge(), LinearSVR())

for estimator in estimators:
    # Scale features, then fit the estimator; the same scaler instance is passed to every pipeline
    pipeline = make_pipeline(scaler, estimator)
    # 5-fold cross-validation on the training split, scored with the custom accuracy scorer
    acc_score = cross_val_score(pipeline, X_train, y_train, scoring=reg_acc, cv=5)

    print(f'\n\n{type(estimator).__name__}')

    # Second 5-fold run scored by negated mean absolute error (closer to 0 is better)
    mae_score = cross_val_score(pipeline, X_train, y_train, scoring='neg_mean_absolute_error', cv=5)
    print(f'Mean neg MAE: {np.mean(mae_score)}')
    print(f'Mean accuracy: {np.mean(acc_score)}')

    # Refit on the whole training split and check against the held-out test split
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    print('\nTest MAE:', mean_absolute_error(y_test, y_pred))
    print('Test Accuracy:', regression_accuracy(y_test, y_pred))



Lasso
Mean neg MAE: -28.76235746421738
Mean accuracy: 0.719287185779739

Test MAE: 29.172300777367997
Test Accuracy: 0.7311522048364154


ElasticNet
Mean neg MAE: -28.843235054330684
Mean accuracy: 0.715022908669271

Test MAE: 29.604417075803838
Test Accuracy: 0.7268847795163584


Ridge
Mean neg MAE: -29.037059154918143
Mean accuracy: 0.7202373045445847

Test MAE: 29.53449827659189
Test Accuracy: 0.7311522048364154


LinearSVR
Mean neg MAE: -29.141206910532958
Mean accuracy: 0.717390325449449

Test MAE: 30.01986528782558
Test Accuracy: 0.7197724039829303


In [7]:
# ensemble regressors with stacked team df

# Ensemble regressors plus an RBF-kernel SVR, evaluated with the same cross-validation
# and test-set steps as the linear models to get comparable data points.
estimators = (AdaBoostRegressor(),
              BaggingRegressor(),
              ExtraTreesRegressor(),
              GradientBoostingRegressor(),
              RandomForestRegressor(),
              SVR(kernel='rbf'))

for estimator in estimators:
    # Scale features, then fit the estimator; the same scaler instance is passed to every pipeline
    pipeline = make_pipeline(scaler, estimator)
    # 5-fold cross-validation on the training split, scored with the custom accuracy scorer
    acc_score = cross_val_score(pipeline, X_train, y_train, scoring=reg_acc, cv=5)

    print(f'\n\n{type(estimator).__name__}')

    # Second 5-fold run scored by negated mean absolute error (closer to 0 is better)
    mae_score = cross_val_score(pipeline, X_train, y_train, scoring='neg_mean_absolute_error', cv=5)
    print(f'Mean neg MAE: {np.mean(mae_score)}')
    print(f'Mean accuracy: {np.mean(acc_score)}')
    
    # Refit on the whole training split and check against the held-out test split
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    print('\nTest MAE:', mean_absolute_error(y_test, y_pred))
    print('Test Accuracy:', regression_accuracy(y_test, y_pred))



AdaBoostRegressor
Mean neg MAE: -29.318346745867945
Mean accuracy: 0.7230876608391215

Test MAE: 30.063071941296805
Test Accuracy: 0.7311522048364154


BaggingRegressor
Mean neg MAE: -31.19816528013869
Mean accuracy: 0.6984194706802807

Test MAE: 31.52788051209104
Test Accuracy: 0.701280227596017


ExtraTreesRegressor
Mean neg MAE: -31.941905978768677
Mean accuracy: 0.6884623611126746

Test MAE: 33.366998577524896
Test Accuracy: 0.6970128022759602


GradientBoostingRegressor
Mean neg MAE: -29.55178422345724
Mean accuracy: 0.7088561425628439

Test MAE: 30.136838152640895
Test Accuracy: 0.7311522048364154


RandomForestRegressor
Mean neg MAE: -30.72724105323592
Mean accuracy: 0.695575868784546

Test MAE: 31.564864864864862
Test Accuracy: 0.6970128022759602


SVR
Mean neg MAE: -31.797987492300614
Mean accuracy: 0.7226058470578965

Test MAE: 32.79401305997477
Test Accuracy: 0.7396870554765291
