# AFL Modelling
Predicting the winner of games, with the probability of winning and the margin of win. <br>
Using data from 2015 till 2023 with 2023 to be used as testing data.

In [1]:
#import packages
import polars as pl
pl.Config.set_fmt_str_lengths(150);
pl.Config.set_tbl_rows(1000);

In [2]:
#set up R extension
%load_ext rpy2.ipython



Get the fixtures from 2015-2023, taking into account the bye infomation 2019 onwards. 

In [16]:
%%R -o seasons,seasonBye
library('fitzRoy')
library('dplyr')
library('tidyr')

seasonList <- c(2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023)
first <- TRUE
for(s in seasonList) {
    
    df <- fetch_fixture(season = s, comp = "AFLM")

    if (first) {
        seasons <- df
        first <- FALSE
    }else{
        seasons <- seasons %>% 
                      bind_rows(df)
    }
    
}

seasonBye <- seasons %>% 
                select(compSeason.id, compSeason.year, round.roundNumber, round.byes) %>%
                filter(compSeason.year >= 2019) %>%
                unnest(round.byes) %>%
                distinct() %>%
                mutate(round.nextRound = round.roundNumber + 1,
                      ByeFlag = 1) %>%
                select(compSeason.id, round.nextRound, club.id, club.name, ByeFlag)

seasons <- seasons %>% select(-round.byes)
    

i Returning data for "All Rounds, 2015"
v Returning data for "All Rounds, 2015" ... done

i Returning data for "All Rounds, 2016"
v Returning data for "All Rounds, 2016" ... done

i Returning data for "All Rounds, 2017"
v Returning data for "All Rounds, 2017" ... done

i Returning data for "All Rounds, 2018"
v Returning data for "All Rounds, 2018" ... done

i Returning data for "All Rounds, 2019"
v Returning data for "All Rounds, 2019" ... done

i Returning data for "All Rounds, 2020"
v Returning data for "All Rounds, 2020" ... done

i Returning data for "All Rounds, 2021"
v Returning data for "All Rounds, 2021" ... done

i Returning data for "All Rounds, 2022"
v Returning data for "All Rounds, 2022" ... done

i Returning data for "All Rounds, 2023"
v Returning data for "All Rounds, 2023" ... done




Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union



Assign -1 as the winner id for a draw, since 2015 there have been 15 draws. We also have a cancelled game, Cats vs. Crows 2015 R14 which we are just going to remove.

In [24]:
#Clean and prep the season bye info
seasonBye = pl.from_pandas(seasonBye).drop('club.name').with_columns(pl.col('round.nextRound').cast(pl.Int32))

In [33]:
#The inital cleaning working with the gneral match details. We can use this as our base for creating the desired attributes
seasonsBase = (pl.from_pandas(seasons)
                .sort('round.id')
                #Keep only the columns we want
                .select('id', 'status', 'compSeason.id', 'compSeason.year', 'round.roundNumber', 'home.team.club.id', 'home.team.name',
                        'away.team.club.id', 'away.team.name', 'home.score.goals', 'home.score.behinds', 'home.score.totalScore',
                        'away.score.goals', 'away.score.behinds', 'away.score.totalScore', 'venue.id', 'venue.name', 'venue.state')
                #Label the id winner (-1 for a draw)
                .with_columns(pl.when(pl.col('home.score.totalScore') > pl.col('away.score.totalScore')).then(pl.col('home.team.club.id'))
                                .when(pl.col('home.score.totalScore') < pl.col('away.score.totalScore')).then(pl.col('away.team.club.id'))
                                .otherwise(pl.lit(-1)).alias('winner'))
                #Remove the Crows vs. Cats 2015 R14 game which was cancelled
                .filter(pl.col('id') != 847)
                #Label Previous comp year
                .with_columns((pl.col('compSeason.year') - 1).alias('prevComp.year'))
                #Label in sequential order (currently missing numbers in existing round.id) to get the previous games 
     .join((pl.from_pandas(seasons)
                .select('compSeason.year', 'round.roundNumber')
                .unique()
                .sort('compSeason.year', 'round.roundNumber')
                .with_columns(pl.col('round.roundNumber').cumcount().alias('round.id'))), how='left', on=['compSeason.year', 'round.roundNumber'])
     #create the margin
     .with_columns((pl.col('home.score.totalScore') - pl.col('away.score.totalScore')).abs().alias('win.margin'))   
     #Add a bye flag
     .join(seasonBye, how='left', left_on=['compSeason.id', 'round.roundNumber', 'home.team.club.id'], right_on=['compSeason.id', 'round.nextRound', 'club.id'])
     .join(seasonBye, how='left', left_on=['compSeason.id', 'round.roundNumber', 'away.team.club.id'], right_on=['compSeason.id', 'round.nextRound', 'club.id'])
     #Rename bye flag
     .rename({'ByeFlag' : 'home.bye.flag',
              'ByeFlag_right' : 'away.bye.flag'})
     #Fill in bye flag nulls
     .with_columns(pl.col('home.bye.flag').fill_null(0).cast(pl.Int32),
                   pl.col('away.bye.flag').fill_null(0).cast(pl.Int32))
    
)

Get the ladder poistions at beginning round.

In [101]:
%%R -o ladder

season <- c(2015,2016,2017,2018,2019,2020,2021, 2022, 2023)
rounds <- c(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23)
first <- TRUE

for(s in season){
    for(r in rounds){

        if(s==2020 && r <= 18){
            df <- fetch_ladder_afl(season=s, round_number=r)
        }else if(s!=2020){
            df <- fetch_ladder_afl(season=s, round_number=r)
        }

        if(first){
            ladder <- df
            first <- FALSE
        }else{
            ladder <- ladder %>% bind_rows(df)
        }
    }

    if(s== 2023){
        ladder <- ladder %>% bind_rows(fetch_ladder_afl(season=s, round_number=24))
    }
}

ladder = ladder %>% select(season, round_number, position, team.club.id, team.club.name)
        

In [116]:
#get the final ladder of each season
ladderFinal = (pl.from_pandas(ladder)
                   #Get the final ladder at the end of the season
                   .filter(pl.col('round_number') == pl.col('round_number').max().over('season'))
                   #get the next season (for assigning to the nest round 1)
                   .with_columns((pl.col('season') + 1).alias('Nextseason'),
                                 pl.lit(1).alias('first_round'))
              )

In [121]:
#Add home and away ladder positions before 
ladderPosition = (seasonsBase
     .select('compSeason.year', 'round.roundNumber')
     .unique()
     .sort('compSeason.year', 'round.roundNumber')
     #from round 2 till finals
     .join(ladderClean, how='left', left_on=['compSeason.year', 'round.roundNumber'], right_on=['season', 'nextRound'])
     #for round 1 the final ladder of previous season
     .join(ladderFinal.drop('season', 'round_number'), how='left', left_on=['compSeason.year', 'round.roundNumber'], right_on=['Nextseason', 'first_round'])
     .rename({'position_right' : 'position1',
              'team.club.id_right' : 'team.club.id1'})
     #for the finals
     .join(ladderFinal.drop('PrevSeason'), how='left', left_on='compSeason.year', right_on='season')
     #put all the parts together
     .with_columns(pl.when(~pl.col('position').is_null()).then(pl.col('position'))
                     .when(~pl.col('position1').is_null()).then(pl.col('position1')).otherwise(pl.col('position_right')).alias('position'),
                   pl.when(~pl.col('team.club.id').is_null()).then(pl.col('team.club.id'))
                     .when(~pl.col('team.club.id1').is_null()).then(pl.col('team.club.id1')).otherwise(pl.col('team.club.id_right')).alias('team.club.id'))
     #select only what we want
     .select('compSeason.year', 'round.roundNumber', 'team.club.id', 'position')
     .unique()
     .sort('compSeason.year', 'round.roundNumber', 'position')
     #get the fianl positioning of the previous year
     .join(ladderFinal.drop('season', 'round_number', 'first_round', 'team.club.name'), how='left', left_on=['compSeason.year', 'team.club.id'], right_on=['Nextseason', 'team.club.id'])
     .rename({'position_right': 'last.season.position'})
                           
)           

In [133]:
(seasonsBase
     .join(ladderPosition.rename({'position':'home.ladder.position', 'last.season.position': 'home.last.season.position'}), how='left', left_on=['compSeason.year', 'round.roundNumber', 'home.team.club.id'], right_on=['compSeason.year', 'round.roundNumber', 'team.club.id'])
     #.filter(pl.col('id') == 1050)
)

id,status,compSeason.id,compSeason.year,round.roundNumber,home.team.club.id,home.team.name,away.team.club.id,away.team.name,home.score.goals,home.score.behinds,home.score.totalScore,away.score.goals,away.score.behinds,away.score.totalScore,venue.id,venue.name,venue.state,winner,prevComp.year,round.id,win.margin,home.bye.flag,away.bye.flag,home.ladder.position,home.last.season.position
i32,str,i32,f64,i32,i32,str,i32,str,i32,i32,i32,i32,i32,i32,i32,str,str,i32,f64,u32,i32,i32,i32,i32,i32
727,"""CONCLUDED""",7,2015.0,1,1,"""Carlton""",22,"""Richmond""",11,12,78,15,15,105,13,"""MCG""","""VIC""",22,2014.0,0,27,0,0,18,
732,"""CONCLUDED""",7,2015.0,1,14,"""Melbourne""",9,"""Gold Coast Suns""",17,13,115,13,11,89,13,"""MCG""","""VIC""",14,2014.0,0,26,0,0,13,
723,"""CONCLUDED""",7,2015.0,1,24,"""Sydney Swans""",10,"""Essendon""",10,12,72,9,6,60,10,"""Accor Stadium""","""NSW""",24,2014.0,0,12,0,0,4,
741,"""CONCLUDED""",7,2015.0,1,13,"""Brisbane Lions""",12,"""Collingwood""",11,8,74,12,14,86,22,"""Gabba""","""QLD""",12,2014.0,0,12,0,0,17,
726,"""CONCLUDED""",7,2015.0,1,25,"""Western Bulldogs""",23,"""West Coast Eagles""",14,13,97,14,3,87,3,"""Marvel Stadium""","""VIC""",25,2014.0,0,10,0,0,6,
725,"""CONCLUDED""",7,2015.0,1,21,"""St Kilda""",5,"""GWS Giants""",11,12,78,12,15,87,3,"""Marvel Stadium""","""VIC""",5,2014.0,0,9,0,0,14,
742,"""CONCLUDED""",7,2015.0,1,3,"""Adelaide Crows""",20,"""North Melbourne""",21,14,140,9,9,63,31,"""Adelaide Oval""","""SA""",3,2014.0,0,77,0,0,7,
731,"""CONCLUDED""",7,2015.0,1,6,"""Fremantle""",19,"""Port Adelaide""",11,9,75,10,8,68,14,"""Domain Stadium""","""WA""",6,2014.0,0,7,0,0,1,
729,"""CONCLUDED""",7,2015.0,1,16,"""Hawthorn""",15,"""Geelong Cats""",17,21,123,8,13,61,13,"""MCG""","""VIC""",16,2014.0,0,62,0,0,3,
724,"""CONCLUDED""",7,2015.0,2,23,"""West Coast Eagles""",1,"""Carlton""",20,11,131,9,8,62,14,"""Domain Stadium""","""WA""",23,2014.0,1,69,0,0,11,


# Added an extra 1000approx. records What??