# AFL Modelling
Predicting the winner of games, with the probability of winning and the margin of win. <br>
Using data from 2015 till 2023 with 2023 to be used as testing data.

In [89]:
#import packages
import polars as pl
import numpy as np
pl.Config.set_fmt_str_lengths(150);
pl.Config.set_tbl_rows(1000);

In [2]:
#set up R extension
%load_ext rpy2.ipython



Get the fixtures from 2015-2023, taking into account the bye infomation 2019 onwards. 

In [3]:
%%R -o seasons,seasonBye
library('fitzRoy')
library('dplyr')
library('tidyr')

seasonList <- c(2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023)
first <- TRUE
for(s in seasonList) {
    
    df <- fetch_fixture(season = s, comp = "AFLM")

    if (first) {
        seasons <- df
        first <- FALSE
    }else{
        seasons <- seasons %>% 
                      bind_rows(df)
    }
    
}

seasonBye <- seasons %>% 
                select(compSeason.id, compSeason.year, round.roundNumber, round.byes) %>%
                filter(compSeason.year >= 2019) %>%
                unnest(round.byes) %>%
                distinct() %>%
                mutate(round.nextRound = round.roundNumber + 1,
                      ByeFlag = 1) %>%
                select(compSeason.id, round.nextRound, club.id, club.name, ByeFlag)

seasons <- seasons %>% select(-round.byes)
    

R[write to console]: 
Attaching package: 'dplyr'


R[write to console]: The following objects are masked from 'package:stats':

    filter, lag


R[write to console]: The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union




v Returning data for "All Rounds, 2015" ... done
v Returning data for "All Rounds, 2016" ... done
v Returning data for "All Rounds, 2017" ... done
v Returning data for "All Rounds, 2018" ... done
v Returning data for "All Rounds, 2019" ... done
v Returning data for "All Rounds, 2020" ... done
v Returning data for "All Rounds, 2021" ... done
v Returning data for "All Rounds, 2022" ... done
v Returning data for "All Rounds, 2023" ... done


Assign -1 as the winner id for a draw, since 2015 there have been 15 draws. We also have a cancelled game, Cats vs. Crows 2015 R14 which we are just going to remove.

In [4]:
#Clean and prep the season bye info
seasonBye = pl.from_pandas(seasonBye).drop('club.name').with_columns(pl.col('round.nextRound').cast(pl.Int32))

In [40]:
#The inital cleaning working with the gneral match details. We can use this as our base for creating the desired attributes
seasonsBase = (pl.from_pandas(seasons)
                .sort('round.id')
                #Keep only the columns we want
                .select('id', 'status', 'compSeason.id', 'compSeason.year', 'round.roundNumber', 'home.team.club.id', 'home.team.name',
                        'away.team.club.id', 'away.team.name', 'home.score.goals', 'home.score.behinds', 'home.score.totalScore',
                        'away.score.goals', 'away.score.behinds', 'away.score.totalScore', 'venue.id', 'venue.name', 'venue.state')
                #Label the id winner (-1 for a draw)
                .with_columns(pl.when(pl.col('home.score.totalScore') > pl.col('away.score.totalScore')).then(pl.col('home.team.club.id'))
                                .when(pl.col('home.score.totalScore') < pl.col('away.score.totalScore')).then(pl.col('away.team.club.id'))
                                .otherwise(pl.lit(-1)).alias('winner'))
                #Remove the Crows vs. Cats 2015 R14 game which was cancelled
                .filter(pl.col('id') != 847)
                #Label Previous comp year
                .with_columns((pl.col('compSeason.year') - 1).alias('prevComp.year'))
                #Label in sequential order (currently missing numbers in existing round.id) to get the previous games 
     .join((pl.from_pandas(seasons)
                .select('compSeason.year', 'round.roundNumber')
                .unique()
                .sort('compSeason.year', 'round.roundNumber')
                .with_columns(pl.col('round.roundNumber').cum_count().alias('round.id'))), how='left', on=['compSeason.year', 'round.roundNumber'])
     #create the margin
     .with_columns((pl.col('home.score.totalScore') - pl.col('away.score.totalScore')).abs().alias('win.margin'))   
     #Add a bye flag
     .join(seasonBye, how='left', left_on=['compSeason.id', 'round.roundNumber', 'home.team.club.id'], right_on=['compSeason.id', 'round.nextRound', 'club.id'])
     .join(seasonBye, how='left', left_on=['compSeason.id', 'round.roundNumber', 'away.team.club.id'], right_on=['compSeason.id', 'round.nextRound', 'club.id'])
     #Rename bye flag
     .rename({'ByeFlag' : 'home.bye.flag',
              'ByeFlag_right' : 'away.bye.flag'})
     #Fill in bye flag nulls
     .with_columns(pl.col('home.bye.flag').fill_null(0).cast(pl.Int32),
                   pl.col('away.bye.flag').fill_null(0).cast(pl.Int32))
    #Fill venue blank states with china (1 is not china but NZ but is in 2015 so we wont worry about that)
    .with_columns(pl.col('venue.state').fill_null('China'))
    
)

In [147]:
#Create a dataframe with all the match details (repeated so that boths teams get to be team A and team B)
allMatchesBothSides = (seasonsBase
     .select('compSeason.year', 'round.roundNumber', 'round.id', 'home.team.club.id', 'home.team.name', 'home.score.totalScore', 'away.team.club.id', 'away.team.name', 'away.score.totalScore')
     .rename({'home.team.name': 'team.A.name',
              'home.team.club.id' : 'team.A.club.id',
              'home.score.totalScore' : 'team.A.score.totalScore',
              'away.team.name': 'team.B.name',
              'away.team.club.id' : 'team.B.club.id',
              'away.score.totalScore' : 'team.B.score.totalScore'})
     .vstack(seasonsBase
     .select('compSeason.year', 'round.roundNumber', 'round.id', 'away.team.club.id', 'away.team.name', 'away.score.totalScore', 'home.team.club.id', 'home.team.name', 'home.score.totalScore')
     .rename({'home.team.name': 'team.B.name',
              'home.team.club.id' : 'team.B.club.id',
              'home.score.totalScore' : 'team.B.score.totalScore',
              'away.team.name': 'team.A.name',
              'away.team.club.id' : 'team.A.club.id',
              'away.score.totalScore' : 'team.A.score.totalScore'}))
     .sort('team.A.club.id', 'round.id')
)

## Get the ladder poistions at beginning round and last seasons final ladder position.

In [7]:
%%R -o ladder

season <- c(2015,2016,2017,2018,2019,2020,2021, 2022, 2023)
rounds <- c(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23)
first <- TRUE

for(s in season){
    for(r in rounds){

        if(s==2020 && r <= 18){
            df <- fetch_ladder_afl(season=s, round_number=r)
        }else if(s!=2020){
            df <- fetch_ladder_afl(season=s, round_number=r)
        }

        if(first){
            ladder <- df
            first <- FALSE
        }else{
            ladder <- ladder %>% bind_rows(df)
        }
    }

    if(s== 2023){
        ladder <- ladder %>% bind_rows(fetch_ladder_afl(season=s, round_number=24))
    }
}

ladder = ladder %>% select(season, round_number, position, team.club.id, team.club.name)
        

In [14]:
ladderNormal = (pl.from_pandas(ladder)
     .with_columns((pl.col('round_number') + 1).alias('nextRound'))
     .drop('team.club.name', 'round_number')
                  )

In [8]:
#get the final ladder of each season
ladderFinal = (pl.from_pandas(ladder)
                   #Get the final ladder at the end of the season
                   .filter(pl.col('round_number') == pl.col('round_number').max().over('season'))
                   #get the next season (for assigning to the nest round 1)
                   .with_columns((pl.col('season') + 1).alias('Nextseason'),
                                 pl.lit(1).alias('first_round'))
              )

In [18]:
#Add home and away ladder positions before 
ladderPosition = (seasonsBase
     .select('compSeason.year', 'round.roundNumber')
     .unique()
     .sort('compSeason.year', 'round.roundNumber')
     #from round 2 till finals
     .join(ladderNormal, how='left', left_on=['compSeason.year', 'round.roundNumber'], right_on=['season', 'nextRound'])
     #for round 1 the final ladder of previous season
     .join(ladderFinal.drop('season', 'round_number'), how='left', left_on=['compSeason.year', 'round.roundNumber'], right_on=['Nextseason', 'first_round'])
     .rename({'position_right' : 'position1',
              'team.club.id_right' : 'team.club.id1'})
     #for the finals
     .join(ladderFinal.drop('PrevSeason'), how='left', left_on='compSeason.year', right_on='season')
     #put all the parts together
     .with_columns(pl.when(~pl.col('position').is_null()).then(pl.col('position'))
                     .when(~pl.col('position1').is_null()).then(pl.col('position1')).otherwise(pl.col('position_right')).alias('position'),
                   pl.when(~pl.col('team.club.id').is_null()).then(pl.col('team.club.id'))
                     .when(~pl.col('team.club.id1').is_null()).then(pl.col('team.club.id1')).otherwise(pl.col('team.club.id_right')).alias('team.club.id'))
     #select only what we want
     .select('compSeason.year', 'round.roundNumber', 'team.club.id', 'position')
     .unique()
     .sort('compSeason.year', 'round.roundNumber', 'position')
     #get the fianl positioning of the previous year
     .join(ladderFinal.drop('season', 'round_number', 'first_round', 'team.club.name'), how='left', left_on=['compSeason.year', 'team.club.id'], right_on=['Nextseason', 'team.club.id'])
     .rename({'position_right': 'last.season.position'})
                           
)           

In [29]:
seasonsStep2 = (seasonsBase
                 #join the ladderPosition to get the home teams ladder info
                 .join(ladderPosition.rename({'position':'home.ladder.position', 'last.season.position': 'home.last.season.position'}),
                       how='left',
                       left_on=['compSeason.year', 'round.roundNumber', 'home.team.club.id'],
                       right_on=['compSeason.year', 'round.roundNumber', 'team.club.id'])
                 #Clean up any duplicates
                 .unique()
                 #join the ladderPosition to get the away teams ladder info
                 .join(ladderPosition.rename({'position':'away.ladder.position', 'last.season.position': 'away.last.season.position'}),
                       how='left',
                       left_on=['compSeason.year', 'round.roundNumber', 'away.team.club.id'],
                       right_on=['compSeason.year', 'round.roundNumber', 'team.club.id'])
                 #clean up any duplicates
                 .unique()
                 #sort into order
                 .sort('compSeason.year', 'round.roundNumber')
     
                )

## Get the distnace travelled

In [58]:
#Create a table for distance between states and where the team is based (based on capital to capital)
vic = pl.Series([0, 654, 714, 2727, 1373, 0, 0, 1373, 0, 0, 0, 654, 0, 0, 0, 2727, 714, 0])
nsw = pl.Series([714, 1165, 0, 3297, 730, 714, 714, 730, 714, 714, 714, 1165, 714, 714, 714, 3297, 0, 714])
qld = pl.Series([1373, 1602, 730, 3613, 0, 1373, 1373, 0, 1373, 1373, 1373, 1602, 1373, 1373, 1373, 3613, 730, 1373])
nt = pl.Series([3140, 2609, 3144, 2647, 2846, 3140, 3140, 2846, 3140, 3140, 3140, 2609, 3140, 3140, 3140, 2647, 3144, 3140])
wa = pl.Series([2727, 2135, 3297, 0, 3613, 2727, 2727, 3613, 2727,2727, 2727, 2135, 2727, 2727, 2727, 0, 3297, 2727])
sa = pl.Series([654, 0, 1165, 2135, 1602, 654, 654, 1602, 654, 654, 654, 0, 654, 654, 654, 2135, 1165, 654])
tas = pl.Series([597, 1161, 1056, 3015, 1786, 597, 597, 1786, 597, 597, 597, 1161, 597, 597, 597, 3015, 1161, 597])
act = pl.Series([467, 960, 942, 3095, 942, 467, 467, 247, 467, 467, 467, 960, 467, 467, 467, 3095, 247, 467])
china = pl.Series([8036, 6956, 7592, 7050, 6225, 8036, 8036, 6225, 8036, 8036, 8036, 6956, 8036, 8036, 8036, 7050, 7592, 8036])
distanceTravelled = (seasonsBase
                         .select('home.team.name', 'home.team.club.id').unique()
                         .sort('home.team.club.id')
                         .with_columns(vic.alias('VIC'),
                                       nsw.alias('NSW'),
                                       qld.alias('QLD'),
                                       nt.alias('NT'),
                                       wa.alias('WA'),
                                       sa.alias('SA'),
                                       tas.alias('TAS'),
                                       act.alias('ACT'),
                                       china.alias('China'))
                         .rename({'home.team.club.id' : 'team.club.id'})
                         .drop('home.team.name')
                         .melt(id_vars='team.club.id', variable_name='State', value_name='Distance')
                        )

In [67]:
seasonStep3 = (seasonsStep2
                 #Home team distance travelled to venue
                 .join(distanceTravelled.rename({'Distance' : 'home.distance'}), how='left', left_on=['home.team.club.id', 'venue.state'], right_on=['team.club.id', 'State'])
                 #Away team distance travelled to venue
                 .join(distanceTravelled.rename({'Distance' : 'away.distance'}), how='left', left_on=['away.team.club.id', 'venue.state'], right_on=['team.club.id', 'State'])
                )

## Team ELO Rating

In [155]:
#Create a link between teams club id and the elo index 0-17 
eloIndexTeams = allMatchesBothSides.select('team.A.name', 'team.A.club.id').unique().sort('team.A.club.id').with_columns((pl.col('team.A.club.id').cum_count() - 1).alias('EloIndex'))

In [83]:
#create a function to find a teams prediction
def eloTeamPrediction(ratingTeamA, ratingTeamB):
    exp = (-1*(ratingTeamA - ratingTeamB))/400
    b = 1 + 10 ** exp
    result = 1/b
    return result

In [90]:
#create a function to find the teams result, inital B = 0.004 per Multifactorial analysis of factors influencing elite australian football match outcomes: a machine learning approach
def eloTeamResult(scoreTeamA, scoreTeamB):
    exp = -0.004*(scoreTeamA - scoreTeamB)
    b = 1 + np.exp(exp)
    result = 1/b
    return result

In [249]:
#Create a function to get change in teams elo, set K=67.559 per Multifactorial analysis of factors influencing elite australian football match outcomes: a machine learning approach
def eloChange(ratingTeamA, ratingTeamB, scoreTeamA, scoreTeamB):
    #get Team prediction
    prediction = eloTeamPrediction(ratingTeamA, ratingTeamB)
    
    #get actual result
    actual = eloTeamResult(scoreTeamA , scoreTeamB)
    
    #get elo change
    result = 67.559*(actual - prediction)
    
    #get new elo
    newElo = ratingTeamA + result
    
    return newElo

In [131]:
#Create a function to get the elo for the start of the season, set Carry over at 0.70628 per Multifactorial analysis of factors influencing elite australian football match outcomes: a machine learning approach
def eloNewSeason(lastElo):
    carryOver = 0.70628
    result = carryOver * lastElo + 1500 * (1 - carryOver)
    return result

In [291]:
#Function to rule them all! assign elo values
def eloFucntions(roundNumber, teamAScore, teamBScore, prevEloA, prevEloB):
    #If first round of the season get the carry over elo value
    if roundNumber == 1:
        elo = eloNewSeason(prevEloA)
    #If normal round then find new score
    else:
        elo = eloChange(prevEloA, prevEloB, teamAScore, teamBScore)
    #return current elo 
    return elo   
   
    

In [354]:
#Elo list of lists to record elo's and access past elo values, with index = elo index translation found in eloIndexTeams table
#Starting elo as default 1500
eloScores = [[1500] * 18]
#starting roundIndex 

In [355]:
def eloNumber(roundId, roundNumber, firstTeam, teamAScore, teamBScore, indexA, indexB):
    #set roundId for indexing
    roundId = roundId - 1
    #create default elo
    elo=1500.0
    #Check if new round by checking a new list has been added for the round
    if roundId >= np.shape(eloScores)[0]:
        #Append new list for new round
        eloScores.append(eloScores[roundId - 1])
        
    #Deal with very first round we consider, simply create base for next round
    if roundId == 0:
        elo = 1500.0
    #For all other rounds get the elo
    else: 
        #Get the new elo score
        elo = eloFucntions(roundNumber, teamAScore, teamBScore, eloScores[roundId-1][indexA], eloScores[roundId-1][indexB])
        #Change list of list to reflect the new score
        eloScores[roundId][indexA] = elo

    #Return Elo
    return elo
        

In [359]:
eloScoreDF = (allMatchesBothSides
                 #Get the cleaned indexes for A teams
                 .join(eloIndexTeams.rename({'EloIndex': 'elo.index.A'}).drop('team.A.name'), how='left', on='team.A.club.id')
                 #Get the cleaned indexes for B teams
                 .join(eloIndexTeams.rename({'EloIndex': 'elo.index.B'}).drop('team.A.name'), how='left', left_on='team.B.club.id', right_on='team.A.club.id')
                 #sort so the assigning of elo scores are pasted on past club info
                 .sort('round.id', 'team.A.club.id')
                 #Get the elo scores
                 .with_columns(pl.struct(['round.roundNumber', 'round.id', 'team.A.club.id', 'team.A.score.totalScore', 'team.B.score.totalScore', 'elo.index.A', 'elo.index.B'])
                                 .map_elements(lambda x : eloNumber(x['round.id'], x['round.roundNumber'], x['team.A.club.id'], x['team.A.score.totalScore'], x['team.B.score.totalScore'], x['elo.index.A'], x['elo.index.B']))
                                 .alias('elo.score'))
                 #Keep only the info we actually want
                 .select('compSeason.year', 'round.roundNumber', 'round.id', 'team.A.club.id', 'elo.score')
                )

In [366]:
#Join the elo scores up into the main dataframe
seasonStep3 = (seasonsStep2
     #elo scores for the home teams
     .join(eloScoreDF.rename({'elo.score' : 'home.elo.score'}),
           how='left',
           left_on=['compSeason.year', 'round.roundNumber', 'round.id', 'home.team.club.id'],
           right_on=['compSeason.year', 'round.roundNumber', 'round.id', 'team.A.club.id'])
     #elo scores for the away teams
    .join(eloScoreDF.rename({'elo.score' : 'away.elo.score'}),
           how='left',
           left_on=['compSeason.year', 'round.roundNumber', 'round.id', 'away.team.club.id'],
           right_on=['compSeason.year', 'round.roundNumber', 'round.id', 'team.A.club.id'])
)

In [367]:
seasonStep3

id,status,compSeason.id,compSeason.year,round.roundNumber,home.team.club.id,home.team.name,away.team.club.id,away.team.name,home.score.goals,home.score.behinds,home.score.totalScore,away.score.goals,away.score.behinds,away.score.totalScore,venue.id,venue.name,venue.state,winner,prevComp.year,round.id,win.margin,home.bye.flag,away.bye.flag,home.ladder.position,home.last.season.position,away.ladder.position,away.last.season.position,home.elo.score,away.elo.score
i32,str,i32,f64,i32,i32,str,i32,str,i32,i32,i32,i32,i32,i32,i32,str,str,i32,f64,u32,i32,i32,i32,i32,i32,i32,i32,f64,f64
723,"""CONCLUDED""",7,2015.0,1,24,"""Sydney Swans""",10,"""Essendon""",10,12,72,9,6,60,10,"""Accor Stadium""","""NSW""",24,2014.0,1,12,0,0,4,,15,,1500.0,1500.0
726,"""CONCLUDED""",7,2015.0,1,25,"""Western Bulldogs""",23,"""West Coast Eagles""",14,13,97,14,3,87,3,"""Marvel Stadium""","""VIC""",25,2014.0,1,10,0,0,6,,2,,1500.0,1500.0
727,"""CONCLUDED""",7,2015.0,1,1,"""Carlton""",22,"""Richmond""",11,12,78,15,15,105,13,"""MCG""","""VIC""",22,2014.0,1,27,0,0,18,,5,,1500.0,1500.0
731,"""CONCLUDED""",7,2015.0,1,6,"""Fremantle""",19,"""Port Adelaide""",11,9,75,10,8,68,14,"""Domain Stadium""","""WA""",6,2014.0,1,7,0,0,1,,9,,1500.0,1500.0
732,"""CONCLUDED""",7,2015.0,1,14,"""Melbourne""",9,"""Gold Coast Suns""",17,13,115,13,11,89,13,"""MCG""","""VIC""",14,2014.0,1,26,0,0,13,,16,,1500.0,1500.0
741,"""CONCLUDED""",7,2015.0,1,13,"""Brisbane Lions""",12,"""Collingwood""",11,8,74,12,14,86,22,"""Gabba""","""QLD""",12,2014.0,1,12,0,0,17,,12,,1500.0,1500.0
725,"""CONCLUDED""",7,2015.0,1,21,"""St Kilda""",5,"""GWS Giants""",11,12,78,12,15,87,3,"""Marvel Stadium""","""VIC""",5,2014.0,1,9,0,0,14,,11,,1500.0,1500.0
729,"""CONCLUDED""",7,2015.0,1,16,"""Hawthorn""",15,"""Geelong Cats""",17,21,123,8,13,61,13,"""MCG""","""VIC""",16,2014.0,1,62,0,0,3,,10,,1500.0,1500.0
742,"""CONCLUDED""",7,2015.0,1,3,"""Adelaide Crows""",20,"""North Melbourne""",21,14,140,9,9,63,31,"""Adelaide Oval""","""SA""",3,2014.0,1,77,0,0,7,,8,,1500.0,1500.0
739,"""CONCLUDED""",7,2015.0,2,22,"""Richmond""",25,"""Western Bulldogs""",9,12,66,12,13,85,13,"""MCG""","""VIC""",25,2014.0,2,19,0,0,3,,8,,1493.109549,1505.207216
