# Testing assumptions about data shape and model choice

## Things we want to test

1. Are classifiers or regressors more accurate?
    * Regressors are more convenient due to margin-picking tie-breaker


2. Does it make a difference organising data by match (each match = 1 row) or team match (each match = 2 rows, 1 per participating team)?
    * Team match is more convenient for calculating secondary features and reduces number of columns while doubling number of rows, which can help with Curse of Dimensionality, but splits the key observation (the match) into two that have to be combined later.


3. Which metric is better for model evaluation?
    * [Matter of Stats](http://www.matterofstats.com/mafl-stats-journal/2018/5/17/accuracy-versus-mae-for-assessing-forecaster-ability-in-a-finite-season) suggests that evaluating on, and optimising to, mean absolute error is better than accuracy (perhaps using log loss instead of accuracy for a classifier?), because it is more consistent in identifying the best model.

In [1]:
# Python modules
import dateutil
from datetime import datetime
import re

# Data science packages
import pandas as pd
import numpy as np

# Scikit Learn utility classes & functions
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import make_scorer, mean_absolute_error

# Scikit Learn models
from sklearn.linear_model import LogisticRegression, RidgeClassifier, Lasso, ElasticNet, Ridge
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC, LinearSVR
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

np.random.seed(42)

  from numpy.core.umath_tests import inner1d


In [2]:
# Merge match data & betting data

# Load the match list. The parsed 'date' column is kept as 'datetime', and a
# date-only copy (named 'date') is derived from it so that matches can be
# indexed by (date, venue, home_team, away_team).
match_df = (pd.read_csv('../data/ft_match_list.csv', parse_dates=['date'])
              .rename(columns={'date': 'datetime'})
              .assign(date=lambda x: x['datetime'].map(lambda y: y.date()))
              .set_index(['date', 'venue', 'home_team', 'away_team'], drop=True))

# Load the betting data indexed by (date, venue), then split it on its 'home'
# flag: rows with home == 1 get a 'home_' column prefix, rows with home == 0
# get an 'away_' prefix. Merging the two halves on (date, venue) puts both
# sides side by side; the result is re-indexed with the same four keys as
# match_df.
# NOTE(review): presumably the betting data holds one row per team per match
# with a 'team' column (which becomes home_team / away_team) - confirm against
# the CSV.
betting_df = pd.read_csv('../data/afl_betting.csv', index_col=('date', 'venue'), parse_dates=['date'])
home_df = betting_df[betting_df['home'] == 1].drop('home', axis=1).rename(columns=lambda x: f'home_{x}')
away_df = betting_df[betting_df['home'] == 0].drop('home', axis=1).rename(columns=lambda x: f'away_{x}')
home_away_betting_df = (home_df.merge(away_df, on=('date', 'venue'))
                               .reset_index()
                               .set_index(['date', 'venue', 'home_team', 'away_team']))

# Join matches after 2010-01-01 with the betting columns along the shared
# index, drop any row with a missing value, and flatten the index back into
# columns (minus the helper 'date' column, since 'datetime' is kept).
# NOTE(review): the join relies on the 'date' index values of both frames
# comparing equal - confirm their types agree on the pandas version in use.
df = (pd.concat([match_df[match_df['datetime'] > '2010-01-01'], home_away_betting_df], axis=1)
        # The 2017 Grand Final is missing from the betting data for some reason,
        # but that's the only row that should get dropped
        .dropna()
        .reset_index()
        .drop('date', axis=1))
df

Unnamed: 0,venue,home_team,away_team,datetime,crowd,season_round,home_score,away_score,home_win_odds,home_line_odds,away_win_odds,away_line_odds
0,MCG,Richmond,Carlton,2010-03-25 19:10:00,72010,Round 1,64.0,120.0,2.89,16.5,1.42,-16.5
1,MCG,Geelong,Essendon,2010-03-26 19:40:00,57772,Round 1,125.0,94.0,1.21,-28.5,4.50,28.5
2,ANZ Stadium,Sydney,St Kilda,2010-03-27 19:10:00,31330,Round 1,88.0,96.0,2.64,14.5,1.49,-14.5
3,Gabba,Brisbane,West Coast,2010-03-27 19:30:00,29201,Round 1,114.0,82.0,1.32,-21.5,3.42,21.5
4,MCG,Melbourne,Hawthorn,2010-03-27 14:10:00,45615,Round 1,61.0,117.0,4.95,31.5,1.18,-31.5
5,AAMI Stadium,Port Adelaide,North Melbourne,2010-03-28 12:40:00,21205,Round 1,96.0,82.0,1.42,-16.5,2.89,16.5
6,Domain Stadium,Fremantle,Adelaide,2010-03-28 17:10:00,30976,Round 1,118.0,62.0,1.96,1.5,1.85,-1.5
7,Etihad Stadium,Western Bulldogs,Collingwood,2010-03-28 14:10:00,49000,Round 1,93.0,129.0,1.62,-8.5,2.31,8.5
8,Gabba,Brisbane,Carlton,2010-04-01 19:40:00,36780,Round 2,107.0,88.0,1.34,-19.5,3.30,19.5
9,Domain Stadium,West Coast,Port Adelaide,2010-04-03 17:40:00,37010,Round 2,86.0,89.0,1.63,-10.5,2.30,10.5


In [3]:
# Clean & simplify data for basic scikit learn model

DIGITS = re.compile(r'round\s+(\d+)$', flags=re.I)
QUALIFYING = re.compile('qualifying', flags=re.I)
ELIMINATION = re.compile('elimination', flags=re.I)
SEMI = re.compile('semi', flags=re.I)
PRELIMINARY = re.compile('preliminary', flags=re.I)
GRAND = re.compile('grand', flags=re.I)

# Finals labels carry no number, so each kind of final is given a fixed
# pseudo-round number. Qualifying and elimination finals share a number.
# The patterns are tried in this order and the first match wins.
_FINALS_ROUND_NUMBERS = (
    (QUALIFYING, 25),
    (ELIMINATION, 25),
    (SEMI, 26),
    (PRELIMINARY, 27),
    (GRAND, 28),
)


def get_round_number(x):
    """Convert a round label into an integer round number.

    Labels ending in 'round <n>' (case-insensitive) return ``n``. Finals
    labels are recognised by keyword and mapped to fixed numbers:
    qualifying/elimination -> 25, semi -> 26, preliminary -> 27, grand -> 28.

    Args:
        x: The round label string, e.g. 'Round 1' or 'Grand Final'.

    Returns:
        The round number as an int.

    Raises:
        ValueError: If the label matches none of the known patterns.
    """
    digits = DIGITS.search(x)
    if digits is not None:
        return int(digits.group(1))

    for pattern, round_number in _FINALS_ROUND_NUMBERS:
        if pattern.search(x) is not None:
            return round_number

    # ValueError (rather than a bare Exception) lets callers handle a bad label
    # specifically; it is still caught by any existing `except Exception`.
    raise ValueError(f"Round label {x} doesn't match any known patterns")

# Filter out 2017 & 2018 seasons, because they will eventually serve as test sets
# The assigned columns are computed from the unfiltered df and are matched to
# the filtered rows by index label when assigned.
model_df = (df[df['datetime'] < '2017-01-01']
              # score_diff is the regression target (home score minus away score)
              .assign(score_diff=df['home_score'] - df['away_score'],
                      # home_win is the classification target; >= means a draw counts as a home win
                      home_win=(df['home_score'] >= df['away_score']).astype(int),
                      round_number=df['season_round'].map(get_round_number),
                      year=df['datetime'].map(lambda x: x.year))
              # Drop the raw columns that were only needed to derive the ones above,
              # plus venue and crowd, which are not used as features here
              .drop(['venue', 'datetime', 'crowd', 'season_round', 'home_score', 'away_score'], axis=1))
model_df

Unnamed: 0,home_team,away_team,home_win_odds,home_line_odds,away_win_odds,away_line_odds,score_diff,home_win,round_number,year
0,Richmond,Carlton,2.89,16.5,1.42,-16.5,-56.0,0,1,2010
1,Geelong,Essendon,1.21,-28.5,4.50,28.5,31.0,1,1,2010
2,Sydney,St Kilda,2.64,14.5,1.49,-14.5,-8.0,0,1,2010
3,Brisbane,West Coast,1.32,-21.5,3.42,21.5,32.0,1,1,2010
4,Melbourne,Hawthorn,4.95,31.5,1.18,-31.5,-56.0,0,1,2010
5,Port Adelaide,North Melbourne,1.42,-16.5,2.89,16.5,14.0,1,1,2010
6,Fremantle,Adelaide,1.96,1.5,1.85,-1.5,56.0,1,1,2010
7,Western Bulldogs,Collingwood,1.62,-8.5,2.31,8.5,-36.0,0,1,2010
8,Brisbane,Carlton,1.34,-19.5,3.30,19.5,19.0,1,2,2010
9,West Coast,Port Adelaide,1.63,-10.5,2.30,10.5,-3.0,0,2,2010


## Which is better for our task, classifiers or regressors?

In [4]:
# Features are every column except the two targets; get_dummies one-hot encodes
# the remaining string columns (home_team, away_team).
model_features = pd.get_dummies(model_df.drop(['score_diff', 'home_win'], axis=1))
# One target per model family: binary home_win for classifiers,
# signed score_diff for regressors.
classifier_labels = model_df['home_win']
regressor_labels = model_df['score_diff']

# Use standard scaler, because many of these estimators are sensitive to scale of different features
# This same scaler instance is placed at the front of every pipeline built in the cells below.
scaler = StandardScaler()

In [5]:
# First we try some basic classifiers

# Just using basic, mostly linear, models to validate some ideas around how to shape the data & optimise.
# Throwing in RandomForest to get a data point for ensemble models.
estimators = (LogisticRegression(),
              RidgeClassifier(),
              DecisionTreeClassifier(),
              LinearSVC(),
              KNeighborsClassifier(),
              RandomForestClassifier())
# Hold out a test split; no random_state is passed, so the split depends on
# the global numpy seed set at the top of the notebook.
X_train, X_test, y_train, y_test = train_test_split(model_features, classifier_labels)

for estimator in estimators:
    # Scale, then fit the estimator; scored with 5-fold cross-validation on the training split only
    pipeline = make_pipeline(scaler, estimator)
    acc_score = cross_val_score(pipeline, X_train, y_train, scoring='accuracy', cv=5)

    print(f'\n\n{type(estimator).__name__}')
    print(f'Mean accuracy: {np.mean(acc_score)}')

    # Log loss is reported as 'NA' when scoring raises AttributeError
    # (in the recorded output this happens for RidgeClassifier and LinearSVC).
    try:
        log_score = cross_val_score(pipeline, X_train, y_train, scoring='neg_log_loss', cv=5)
        print(f'Mean neg log loss: {np.mean(log_score)}')
    except AttributeError:
        print(f'Mean neg log loss: NA')

    # Refit on the whole training split and score once on the held-out test split
    pipeline.fit(X_train, y_train)
    print('\nTest Accuracy:', pipeline.score(X_test, y_test))



LogisticRegression
Mean accuracy: 0.6864397838939806
Mean neg log loss: -0.57188686631313

Test Accuracy: 0.7146892655367232


RidgeClassifier
Mean accuracy: 0.698717473999561
Mean neg log loss: NA

Test Accuracy: 0.7090395480225988


DecisionTreeClassifier
Mean accuracy: 0.6261548485524011
Mean neg log loss: -12.976247867287213

Test Accuracy: 0.652542372881356


LinearSVC
Mean accuracy: 0.6930703839118648
Mean neg log loss: NA

Test Accuracy: 0.711864406779661


KNeighborsClassifier
Mean accuracy: 0.6780334545350736
Mean neg log loss: -1.7130456043623656

Test Accuracy: 0.6384180790960452


RandomForestClassifier
Mean accuracy: 0.6695206171003187
Mean neg log loss: -1.5227335462575389

Test Accuracy: 0.6836158192090396


In [6]:
# Next we try some basic regressors

def regression_accuracy(y, y_pred, **kwargs):
    """Return the fraction of predictions whose sign agrees with the target.

    A prediction counts as correct when target and prediction are both
    non-negative or both non-positive, so a zero on either side always
    counts as correct. Extra keyword arguments are accepted and ignored.
    """
    both_non_negative = (y >= 0) & (y_pred >= 0)
    both_non_positive = (y <= 0) & (y_pred <= 0)
    sign_agrees = both_non_negative | both_non_positive

    return np.mean(sign_agrees.astype(int))

# Just using basic, mostly linear, models to validate some ideas around how to shape the data & optimise.
# Throwing in RandomForest to get a data point for ensemble models
estimators = (Lasso(), ElasticNet(), Ridge(), LinearSVR(), RandomForestRegressor())
# Same features as the classifier run, but with score_diff as the target
X_train, X_test, y_train, y_test = train_test_split(model_features, regressor_labels)
# Wrap regression_accuracy as a scorer so cross_val_score can report win/loss
# accuracy for models that predict a margin
reg_acc = make_scorer(regression_accuracy)

for estimator in estimators:
    # Scale, then fit the estimator; scored with 5-fold cross-validation on the training split only
    pipeline = make_pipeline(scaler, estimator)
    acc_score = cross_val_score(pipeline, X_train, y_train, scoring=reg_acc, cv=5)

    print(f'\n\n{type(estimator).__name__}')
    print(f'Mean accuracy: {np.mean(acc_score)}')

    # Negated MAE: values closer to zero are better
    mae_score = cross_val_score(pipeline, X_train, y_train, scoring='neg_mean_absolute_error', cv=5)
    print(f'Mean neg MAE: {np.mean(mae_score)}')

    # Refit on the whole training split and measure win/loss accuracy once on the held-out test split
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    print('\nTest accuracy:', regression_accuracy(y_test, y_pred))



Lasso
Mean accuracy: 0.7344450349898131
Mean neg MAE: -28.045425259356854

Test accuracy: 0.7033898305084746


ElasticNet
Mean accuracy: 0.7287846576313226
Mean neg MAE: -28.28748548482355

Test accuracy: 0.711864406779661


Ridge
Mean accuracy: 0.720316237044911
Mean neg MAE: -28.61787175710581

Test accuracy: 0.711864406779661


LinearSVR
Mean accuracy: 0.7071264062361591
Mean neg MAE: -28.759420715595883

Test accuracy: 0.692090395480226


RandomForestRegressor
Mean accuracy: 0.6949021171051466
Mean neg MAE: -31.18454601824785

Test accuracy: 0.6977401129943502


### Linear classifiers perform about as well as linear regressors on test data

The best regressors are better on cross validation, but the best classifiers are about equal on the test data. Given the issues with over-fitting, this indicates that the classifiers might generalise better. However, regressors are more convenient to use for prediction, because I'll need to predict score differentials for the tipping competition. Also, running the code multiple times (thus, getting different random data splits), shows that whether one is better than the other varies, so they're probably more or less equal over the long run.

## Use match rows or team-match rows?

In [7]:
# Rebuild the match-level frame from df so that each match can be split into 2 rows: 1 per participating team
# Note that this rebinds match_df, which earlier held the raw match list.

# Filter out 2017 & 2018 seasons, because they will eventually serve as test sets
match_df = (df[df['datetime'] < '2017-01-01']
              # Store the margin from each side's point of view, so that after the
              # home_/away_ prefixes are stripped both halves have a 'score_diff' column
              .assign(home_score_diff=df['home_score'] - df['away_score'],
                      away_score_diff=df['away_score'] - df['home_score'],
                      round_number=df['season_round'].map(get_round_number),
                      year=df['datetime'].map(lambda x: x.year))
              .drop(['venue', 'datetime', 'crowd', 'season_round', 'home_score', 'away_score'], axis=1))
match_df

Unnamed: 0,home_team,away_team,home_win_odds,home_line_odds,away_win_odds,away_line_odds,home_score_diff,away_score_diff,round_number,year
0,Richmond,Carlton,2.89,16.5,1.42,-16.5,-56.0,56.0,1,2010
1,Geelong,Essendon,1.21,-28.5,4.50,28.5,31.0,-31.0,1,2010
2,Sydney,St Kilda,2.64,14.5,1.49,-14.5,-8.0,8.0,1,2010
3,Brisbane,West Coast,1.32,-21.5,3.42,21.5,32.0,-32.0,1,2010
4,Melbourne,Hawthorn,4.95,31.5,1.18,-31.5,-56.0,56.0,1,2010
5,Port Adelaide,North Melbourne,1.42,-16.5,2.89,16.5,14.0,-14.0,1,2010
6,Fremantle,Adelaide,1.96,1.5,1.85,-1.5,56.0,-56.0,1,2010
7,Western Bulldogs,Collingwood,1.62,-8.5,2.31,8.5,-36.0,36.0,1,2010
8,Brisbane,Carlton,1.34,-19.5,3.30,19.5,19.0,-19.0,2,2010
9,West Coast,Port Adelaide,1.63,-10.5,2.30,10.5,-3.0,3.0,2,2010


In [8]:
# Column names that start with 'home_' / 'away_' (str.match anchors at the start of the name)
home_cols = match_df.columns[match_df.columns.str.match(r'home_')]
away_cols = match_df.columns[match_df.columns.str.match(r'away_')]
# Home-team view of each match: drop the away columns, record the opponent and
# an at_home flag of 1, then strip the 'home_' prefix so the columns become
# team, win_odds, line_odds, score_diff. drop=False keeps the index keys as
# ordinary columns too.
# Note that home_df / away_df rebind the names used earlier for the betting split.
home_df = (match_df.drop(away_cols, axis=1)
                   .assign(oppo_team=match_df['away_team'], at_home=np.ones(len(match_df)))
                   .rename(columns=lambda x: x.replace('home_', ''))
                   .set_index(['team', 'year', 'round_number'], drop=False))
# Away-team view: the mirror image, with an at_home flag of 0
away_df = (match_df.drop(home_cols, axis=1)
                   .assign(oppo_team=match_df['home_team'], at_home=np.zeros(len(match_df)))
                   .rename(columns=lambda x: x.replace('away_', ''))
                   .set_index(['team', 'year', 'round_number'], drop=False))

# Stack both views (two rows per match) and sort by (team, year, round_number)
team_df = pd.concat([home_df, away_df]).sort_index()
team_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,team,win_odds,line_odds,score_diff,round_number,year,oppo_team,at_home
team,year,round_number,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Adelaide,2010,1,Adelaide,1.85,-1.5,-56.0,1,2010,Fremantle,0.0
Adelaide,2010,2,Adelaide,1.54,-12.5,-43.0,2,2010,Sydney,1.0
Adelaide,2010,3,Adelaide,1.52,-13.5,-16.0,3,2010,Melbourne,0.0
Adelaide,2010,4,Adelaide,1.81,-2.5,-48.0,4,2010,Carlton,1.0
Adelaide,2010,5,Adelaide,7.50,40.5,-49.0,5,2010,Western Bulldogs,0.0
Adelaide,2010,6,Adelaide,2.55,12.5,-23.0,6,2010,Port Adelaide,1.0
Adelaide,2010,7,Adelaide,1.20,-29.5,50.0,7,2010,Richmond,1.0
Adelaide,2010,8,Adelaide,2.38,9.5,-9.0,8,2010,North Melbourne,0.0
Adelaide,2010,9,Adelaide,2.14,5.5,12.0,9,2010,Brisbane,1.0
Adelaide,2010,10,Adelaide,4.25,26.5,-47.0,10,2010,St Kilda,0.0


In [9]:
# Features are every team-match column except the target; get_dummies one-hot
# encodes the string columns (team, oppo_team).
team_features = pd.get_dummies(team_df.drop('score_diff', axis=1))
team_reg_labels = team_df['score_diff']
# Binary target for the classifier run in the next cell: 1 when the team's
# margin is >= 0, so a draw counts as a win for both teams
team_class_labels = (team_df['score_diff'] >= 0).astype(int)

# Use the same group of regressors to get a variety of results
estimators = (Lasso(), ElasticNet(), Ridge(), LinearSVR(), RandomForestRegressor())
X_train, X_test, y_train, y_test = train_test_split(team_features, team_reg_labels)

for estimator in estimators:
    # Scale, then fit the estimator; reg_acc is the scorer built from
    # regression_accuracy in the earlier regressor cell
    pipeline = make_pipeline(scaler, estimator)
    acc_score = cross_val_score(pipeline, X_train, y_train, scoring=reg_acc, cv=5)

    print(f'\n\n{type(estimator).__name__}')
    print(f'Mean accuracy: {np.mean(acc_score)}')

    # Negated MAE: values closer to zero are better
    mae_score = cross_val_score(pipeline, X_train, y_train, scoring='neg_mean_absolute_error', cv=5)
    print(f'Mean neg MAE: {np.mean(mae_score)}')

    # Refit on the whole training split and measure win/loss accuracy once on the held-out test split
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    print('\nTest accuracy:', regression_accuracy(y_test, y_pred))



Lasso
Mean accuracy: 0.7306981132075472
Mean neg MAE: -28.39219706811837

Test accuracy: 0.7161016949152542


ElasticNet
Mean accuracy: 0.7236403995560488
Mean neg MAE: -29.17909263468879

Test accuracy: 0.7019774011299436


Ridge
Mean accuracy: 0.7283518312985573
Mean neg MAE: -28.59410758457969

Test accuracy: 0.7048022598870056


LinearSVR
Mean accuracy: 0.7250577136514983
Mean neg MAE: -28.741987507372887

Test accuracy: 0.6977401129943502


RandomForestRegressor
Mean accuracy: 0.7000910099889012
Mean neg MAE: -30.640973917869037

Test accuracy: 0.7033898305084746


In [10]:
# Just using basic, mostly linear, models to validate some ideas around how to shape the data & optimise.
# Throwing in RandomForest to get a data point for ensemble models.

# Same team-match features as the regressor run, but with the binary win label as the target
X_train, X_test, y_train, y_test = train_test_split(team_features, team_class_labels)

estimators = (LogisticRegression(),
              RidgeClassifier(),
              DecisionTreeClassifier(),
              LinearSVC(),
              KNeighborsClassifier(),
              RandomForestClassifier())

for estimator in estimators:
    # Scale, then fit the estimator; scored with 5-fold cross-validation on the training split only
    pipeline = make_pipeline(scaler, estimator)
    acc_score = cross_val_score(pipeline, X_train, y_train, scoring='accuracy', cv=5)

    print(f'\n\n{type(estimator).__name__}')
    print(f'Mean accuracy: {np.mean(acc_score)}')

    # Log loss is reported as 'NA' when scoring raises AttributeError
    # (in the recorded output this happens for RidgeClassifier and LinearSVC).
    try:
        log_score = cross_val_score(pipeline, X_train, y_train, scoring='neg_log_loss', cv=5)
        print(f'Mean neg log loss: {np.mean(log_score)}')
    except AttributeError:
        print(f'Mean neg log loss: NA')

    # Refit on the whole training split and score once on the held-out test split
    pipeline.fit(X_train, y_train)
    print('\nTest Accuracy:', pipeline.score(X_test, y_test))



LogisticRegression
Mean accuracy: 0.7099694392771725
Mean neg log loss: -0.5559377712607876

Test Accuracy: 0.731638418079096


RidgeClassifier
Mean accuracy: 0.7137396945490926
Mean neg log loss: NA

Test Accuracy: 0.7274011299435028


DecisionTreeClassifier
Mean accuracy: 0.626636262264672
Mean neg log loss: -12.89468528713797

Test Accuracy: 0.6878531073446328


LinearSVC
Mean accuracy: 0.7142124973295191
Mean neg log loss: NA

Test Accuracy: 0.730225988700565


KNeighborsClassifier
Mean accuracy: 0.6087273347819064
Mean neg log loss: -1.8212978968889364

Test Accuracy: 0.635593220338983


RandomForestClassifier
Mean accuracy: 0.6713566772443763
Mean neg log loss: -1.245490665052046

Test Accuracy: 0.6850282485875706


### Organising data by team-matches has slightly positive impact on performance

Since this organisation is necessary for cumulative features, this is potentially more convenient, but it may require an extra step when measuring performance to make sure two opponents aren't both predicted to win, which would result in 50% accuracy rather than 0% or 100% for that match.