In [1]:
#packages
import polars as pl
import pickle
import numpy as np
import pandas as pd
import xlsx2csv
from datetime import datetime
#Import replaceTeamNames and the other shared helpers from the AFLupdate notebook so they only have to be updated in one place
from ipynb.fs.defs.AFLupdate import replaceTeamNames, getCurrentRoundNumbers, getSeason, getNextFixture, getTeamIndex, getDistance, getElo, getMatchDays, getMatchesBothSides, getLadder, getPlayerRanking
#set up R extension
import rpy2.robjects as robjects
from rpy2.robjects.packages import importr
from rpy2.robjects.lib.dplyr import DataFrame
from rpy2.robjects import pandas2ri
from rpy2.robjects import rl

#Widen the polars display limits (string length, table columns, table rows) for notebook output
#The trailing semicolons keep the notebook from echoing the return value of each call
pl.Config.set_fmt_str_lengths(150);
pl.Config.set_tbl_cols(40);
pl.Config.set_tbl_rows(50);




In [2]:
#Step 1: Get basic fixture information
def getFixture():
    """Build the base fixture table for the upcoming matches.

    Pulls the fixture and bye frames from getNextFixture(), normalises the team
    names with replaceTeamNames(), and derives per match: the match Date (taken
    from the first 10 characters of utcStartTime), a home/away bye flag, and the
    home/away team index from getTeamIndex().

    The bye frame is treated as "no bye information" when it has one column or
    fewer; in that case both bye flags are set to 0.

    Returns:
        polars DataFrame sorted by round.id before column selection, holding
        round.roundNumber, home.team.name, away.team.name, venue.id, venue.name,
        venue.state, Date, home.bye.flag, away.bye.flag, home.team.id and
        away.team.id (plus any other columns getTeamIndex() carries).
    """
    fixture, bye = getNextFixture()

    #A bye frame with more than one column means there is bye information to use
    hasByes = len(bye.columns) > 1

    #replace the team names for fixture
    fixture = replaceTeamNames(fixture, 'home.team.name')
    fixture = replaceTeamNames(fixture, 'away.team.name')

    #Shared first half of the pipeline (identical whether or not byes exist)
    fixtureBase = (fixture
            .sort('round.id')
            #Keep only the columns we want
            .select('round.roundNumber', 'utcStartTime', 'home.team.name',
                    'away.team.name', pl.col('venue.providerId').alias('venue.id'), 'venue.name', 'venue.state')
            #Create the date in format wanted
            .with_columns(pl.col('utcStartTime').str.slice(0, length=10).str.to_date().alias('Date'))
            .drop('utcStartTime')
            )

    if hasByes:
        #Clean and prep the season bye info
        bye = (bye
                .with_columns(pl.col('round.currentRound').cast(pl.Int32),
                              pl.lit(1).alias('ByeFlag'))
            )
        #rename clubs names
        bye = replaceTeamNames(bye, 'name')
        bye = replaceTeamNames(bye, 'club.name')
        byeFlags = bye.select('round.currentRound', 'club.name', 'ByeFlag')

        fixtureBase = (fixtureBase
                #Flag the home team, then the away team (second join yields 'ByeFlag_right')
                .join(byeFlags, how='left', left_on=['round.roundNumber', 'home.team.name'], right_on=['round.currentRound', 'club.name'])
                .join(byeFlags, how='left', left_on=['round.roundNumber', 'away.team.name'], right_on=['round.currentRound', 'club.name'])
                #Rename bye flag
                .rename({'ByeFlag' : 'home.bye.flag',
                         'ByeFlag_right' : 'away.bye.flag'})
                #Teams with no matching bye row get 0
                .with_columns(pl.col('home.bye.flag').fill_null(0).cast(pl.Int32),
                              pl.col('away.bye.flag').fill_null(0).cast(pl.Int32))
                )
    else:
        #No bye information: nobody is flagged. Cast so both paths yield the same dtype
        fixtureBase = fixtureBase.with_columns(pl.lit(0).cast(pl.Int32).alias('home.bye.flag'),
                                               pl.lit(0).cast(pl.Int32).alias('away.bye.flag'))

    #Fetch the team index once and reuse it for both sides
    teamIndex = getTeamIndex().drop('team.A.club.id')

    #Shared second half of the pipeline
    fixtureBase = (fixtureBase
            #Fill venue blank states with china (1 is not china but NZ but is in 2015 so we wont worry about that)
            .with_columns(pl.col('venue.state').fill_null('China'))
            #Add team index
            .join(teamIndex, how='left', left_on='home.team.name', right_on='team.A.name')
            .rename({'EloIndex' : 'home.team.id'})
            .join(teamIndex, how='left', left_on='away.team.name', right_on='team.A.name')
            .rename({'EloIndex' : 'away.team.id'})
            )
    return fixtureBase


In [3]:
#Step 2: Add current ladder positions
def addLadderPosition(fixture):
    """Attach the ladder position of the home and the away team.

    The ladder comes from getLadder() with its 'team.name' column normalised by
    replaceTeamNames(). It is left-joined once per side and its 'position'
    column becomes 'home.ladder.position' / 'away.ladder.position'.

    Args:
        fixture: polars DataFrame with 'home.team.name' and 'away.team.name'.

    Returns:
        The fixture with the two ladder position columns added.
    """
    ladderTable = getLadder()
    ladderTable = replaceTeamNames(ladderTable, 'team.name')

    #Home team ladder
    withHome = fixture.join(ladderTable, how='left', left_on='home.team.name', right_on='team.name')
    withHome = withHome.rename({'position' : 'home.ladder.position'})

    #Away team ladder
    withBoth = withHome.join(ladderTable, how='left', left_on='away.team.name', right_on='team.name')
    withBoth = withBoth.rename({'position' : 'away.ladder.position'})

    return withBoth

In [4]:
#Step 3: How far did each team have to travel (capital to capital)
def addDistance(fixture):
    """Attach the distance each team travels to the venue's state.

    The lookup from getDistance() is matched on (team name, venue state) and
    its 'Distance' column is added as 'home.distance' and 'away.distance'.

    Args:
        fixture: polars DataFrame with 'home.team.name', 'away.team.name'
            and 'venue.state'.

    Returns:
        The fixture with the two distance columns added.
    """
    lookupKeys = ['team.name', 'State']
    distances = getDistance()

    #One copy of the lookup per side, with the value column already named for that side
    homeDistances = distances.rename({'Distance' : 'home.distance'})
    awayDistances = distances.rename({'Distance' : 'away.distance'})

    #Home team distance travelled to venue
    withHome = fixture.join(homeDistances, how='left', left_on=['home.team.name', 'venue.state'], right_on=lookupKeys)
    #Away team distance travelled to venue
    withBoth = withHome.join(awayDistances, how='left', left_on=['away.team.name', 'venue.state'], right_on=lookupKeys)

    return withBoth

In [5]:
#Step 4: Get the elo rating
def addElo(fixture):
    """Attach the current elo score of the home and the away team.

    Each team id is used to index into the object returned by getElo(); the
    result is stored as a Float64 in 'home.elo.score' / 'away.elo.score'.

    Args:
        fixture: polars DataFrame with 'home.team.id' and 'away.team.id'.

    Returns:
        The fixture with the two elo score columns added.
    """
    ratings = getElo()

    def lookupElo(teamId):
        #Index the elo container with the team's id
        return ratings[teamId]

    homeElo = pl.col('home.team.id').map_elements(lookupElo, return_dtype=pl.Float64).alias('home.elo.score')
    awayElo = pl.col('away.team.id').map_elements(lookupElo, return_dtype=pl.Float64).alias('away.elo.score')

    return fixture.with_columns(homeElo, awayElo)

In [6]:
#Step 5: add in time and wins at venue
def addVenueAttributes(fixture):
    """Attach each team's venue familiarity/success score for the match venue.

    For every (team, venue) pair the matches from getMatchesBothSides() are
    rolled over a 26-position window of the team's match index (window closed
    on the left). Within the window the number of matches at the venue
    ('VenuePlayed') and the number of those with team A's score >= team B's
    ('wins') are counted. The stored value is
    (VenuePlayed / 26) * (wins / VenuePlayed), and only the row with the
    highest round.id per (team, venue) is kept.

    Args:
        fixture: polars DataFrame with 'home.team.name', 'away.team.name'
            and 'venue.id'.

    Returns:
        The fixture with 'home.venue.portion' and 'away.venue.portion' added;
        missing (null) and NaN values are replaced by 0.
    """
    #1 when team A scored at least as much as team B, otherwise 0
    teamAWon = (pl.when(pl.col('team.A.score.totalScore') >= pl.col('team.B.score.totalScore'))
                  .then(pl.lit(1))
                  .otherwise(pl.lit(0))
                  .alias('team.A.winner'))

    #Number each team's matches in season/round order so they can be rolled over
    matches = getMatchesBothSides().sort('team.A.name', 'compSeason.year', 'round.id')
    matches = matches.with_columns(pl.col('team.A.name').cum_count().over('team.A.name').alias('team.A.index'))
    matches = matches.with_columns(teamAWon)
    matches = matches.sort('team.A.name', 'round.id')

    #Used to recover season and round for every (team, match index) after the rolling aggregation
    matchKeys = matches.select('compSeason.year', 'round.id', 'team.A.name', 'team.A.index')

    #Venue appearances and wins over the rolling window, approx. one season of games
    rolled = (matches
                .rolling(index_column='team.A.index', period='26i', closed='left', group_by=['team.A.name', 'venue.id'])
                .agg(pl.col('round.id').len().alias('VenuePlayed'),
                     pl.col('team.A.winner').sum().alias('wins')))
    rolled = rolled.join(matchKeys, how='left', on=['team.A.name', 'team.A.index']).unique()

    #Share of the window spent at the venue, weighted by the win rate there
    scored = rolled.with_columns((pl.col('VenuePlayed')/26).alias('VenuePortion'))
    scored = scored.with_columns((pl.col('wins')/pl.col('VenuePlayed')).alias('VenueWinnerPortion'))
    scored = scored.with_columns((pl.col('VenuePortion')*pl.col('VenueWinnerPortion')).alias('VenuePortion'))
    scored = scored.sort('compSeason.year', 'team.A.name', 'VenuePlayed')

    #Only keep the most recent value per team and venue
    scored = scored.with_columns(pl.col('round.id').max().over('team.A.name', 'venue.id').alias('MostRecentRound'))
    venueValues = (scored
                     .filter(pl.col('round.id') == pl.col('MostRecentRound'))
                     .select('team.A.name', 'venue.id', 'VenuePortion'))

    #Add the new attribute to the fixture, home side first and then away side
    sides = (('home.team.name', 'home.venue.portion'),
             ('away.team.name', 'away.venue.portion'))
    for teamColumn, portionColumn in sides:
        fixture = (fixture
                    .join(venueValues, how='left', left_on=[teamColumn, 'venue.id'], right_on=['team.A.name', 'venue.id'])
                    .rename({'VenuePortion' : portionColumn})
                    #No matching row (null) or a 0/0 division (NaN) both become 0
                    .with_columns(pl.col(portionColumn).fill_null(0))
                    .with_columns(pl.col(portionColumn).fill_nan(0)))
    return fixture


In [7]:
#Step 6: Add in days since last match
def addDaysSinceLastMatch(fixture):
    """Attach the number of days since each team last played.

    The latest 'Date' per 'Team' is taken from getMatchDays() and subtracted
    from the fixture's 'Date'; the difference in whole days is stored in
    'home.day.last.match' / 'away.day.last.match'.

    Args:
        fixture: polars DataFrame with 'home.team.name', 'away.team.name'
            and 'Date'.

    Returns:
        The fixture with the two day-count columns added.
    """
    #Get the most recent date that a team played
    lastPlayed = getMatchDays().group_by('Team').agg(pl.col('Date').max().alias('last.date'))

    #(team column, temporary last-date column, output column) for home and then away
    sides = (('home.team.name', 'home.last.date', 'home.day.last.match'),
             ('away.team.name', 'away.last.date', 'away.day.last.match'))

    for teamColumn, lastDateColumn, daysColumn in sides:
        daysSince = (pl.col('Date') - pl.col(lastDateColumn)).dt.total_days().alias(daysColumn)
        fixture = (fixture
                    .join(lastPlayed, how='left', left_on=teamColumn, right_on='Team')
                    .rename({'last.date' : lastDateColumn})
                    .with_columns(daysSince)
                    #The helper date column is only needed for the subtraction
                    .drop(lastDateColumn))
    return fixture


In [8]:
#Step 7: Add in attacking/defending/opposition numbers
def _laggedStrength(column):
    """Return the five expressions '<column>.t-1' ... '<column>.t-5' per team."""
    #t-1 is the row's own value; t-k takes the value k-1 rows earlier within the same team
    lagged = [pl.col(column).over('team.A.name').alias(f'{column}.t-1')]
    lagged += [pl.col(column).shift(lag - 1).over('team.A.name').alias(f'{column}.t-{lag}')
               for lag in range(2, 6)]
    return lagged


def _teamStrengthRenames(side, opponentSide):
    """Map the generic strength columns onto fixture names for one side."""
    renames = {f'strength.{metric}.t-{lag}' : f'{side}.{metric}.t-{lag}'
               for metric in ('attack', 'defense')
               for lag in range(1, 6)}
    #A team's own rolling margin is the strength of the opposition its opponent faces
    renames['rolling.margin.avg'] = f'{opponentSide}.opposition.strength'
    return renames


def addTeamAttributes(fixture):
    """Attach recent attacking/defending strength and opposition strength.

    From getMatchesBothSides() this derives, per team:
      * strength.attack  = team A goals + 0.5 * team A behinds
      * strength.defense = team B goals + 0.5 * team B behinds
      * rolling.margin.avg = (sum of team A scores - sum of team B scores) / 5
        over a rolling '5i' window on round.id
    and keeps, for the team's highest round.id, the five most recent
    attack/defense values (t-1 being the latest match).

    Changes from the previous version: the base stats frame is built once and
    reused (it was computed twice from two getMatchesBothSides() calls), a
    descending sort that the following sort overrode was removed, and the
    duplicated rename dictionaries are generated by a helper.

    NOTE(review): the '5i' window spans round.id values, not a count of
    matches, while the average always divides by 5 - confirm this is intended
    for teams with a gap in their round.id sequence.

    Args:
        fixture: polars DataFrame with 'home.team.name' and 'away.team.name'.

    Returns:
        The fixture with '<side>.attack.t-1..5', '<side>.defense.t-1..5' and
        '<side>.opposition.strength' added for both home and away.
    """
    #Goals scored / given away per match, restricted to the columns needed below
    matchesBothSidesStats = (getMatchesBothSides()
                                #Goals scored
                                .with_columns((pl.col('team.A.score.goals') + (pl.col('team.A.score.behinds')*0.5)).alias('strength.attack'))
                                #Goals given away
                                .with_columns((pl.col('team.B.score.goals') + (pl.col('team.B.score.behinds')*0.5)).alias('strength.defense'))
                                #keep only what I want
                                .select('compSeason.year', 'round.roundNumber', 'round.id', 'team.A.club.id', 'team.A.name', 'team.B.club.id', 'team.B.name',
                                        'strength.attack', 'strength.defense', 'team.A.score.totalScore', 'team.B.score.totalScore')
                                #Cast round.id to int rather an unsigned int
                                .with_columns(pl.col('round.id').cast(pl.Int32))
                            )

    matchStats = (matchesBothSidesStats
                    #rolling() needs the index sorted within each group
                    .sort('team.A.name', 'round.id')
                    #A rolling previous avg margin
                    .rolling(index_column='round.id', period='5i', group_by='team.A.name')
                    .agg(pl.col('team.A.score.totalScore').sum().alias('rolling.team.A.totalScore'),
                         pl.col('team.B.score.totalScore').sum().alias('rolling.team.B.totalScore'))
                    #get the avg margin
                    .with_columns(((pl.col('rolling.team.A.totalScore') - pl.col('rolling.team.B.totalScore'))/5).alias('rolling.margin.avg'))
                    #join back up to df
                    .join(matchesBothSidesStats, how='left', on=['team.A.name', 'round.id'])
                    #arrange columns as desired
                    .select('compSeason.year', 'round.id', 'team.A.name', 'strength.attack',
                            'strength.defense', 'rolling.margin.avg')
                    #Get the previous 5 matches attacking strength
                    .with_columns(_laggedStrength('strength.attack'))
                    #Get the previous 5 matches defending strength
                    .with_columns(_laggedStrength('strength.defense'))
                    #drop the original strength metrics
                    .drop('strength.attack', 'strength.defense')
                    #Will need to grab only the most recent results
                    .sort('compSeason.year', 'round.id')
                    .with_columns(pl.col('round.id').max().over(pl.col('team.A.name')).alias('max.round'))
                    .filter(pl.col('max.round') == pl.col('round.id'))
                    #remove unwanted columns
                    .drop('compSeason.year', 'round.id', 'max.round')
                )

    #Add the new attributes to the fixture df
    fixture = (fixture
                #home team: its rolling margin becomes the away team's opposition strength
                .join(matchStats, how='left', left_on='home.team.name', right_on='team.A.name')
                .rename(_teamStrengthRenames('home', 'away'))
                #away team: its rolling margin becomes the home team's opposition strength
                .join(matchStats, how='left', left_on='away.team.name', right_on='team.A.name')
                .rename(_teamStrengthRenames('away', 'home'))
                )
    return fixture

In [9]:
#step 8: summed player ratings
def addPlayerRatings(fixture):
    """Attach each team's summed player rating points over recent rounds.

    Player rankings come from getPlayerRanking() with 'team.name' normalised
    by replaceTeamNames(). Every distinct (season, round number) gets a running
    'round.id'; rating points are summed per team and round, then summed again
    over a rolling '10i' window on that round.id. Only the value at each team's
    highest round.id is kept.

    Args:
        fixture: polars DataFrame with 'home.team.name' and 'away.team.name'.

    Returns:
        The fixture with 'home.team.playerPoints' and 'away.team.playerPoints'
        added.
    """
    hasTeam = ~pl.col('team.name').is_null()
    roundKeys = ['compSeason.year', 'round.roundNumber']

    playerRankings = getPlayerRanking()
    playerRankings = replaceTeamNames(playerRankings, 'team.name')

    #Give every season/round combination a running index
    roundIndex = playerRankings.filter(hasTeam).select('compSeason.year', 'round.roundNumber').unique()
    roundIndex = roundIndex.sort(roundKeys)
    roundIndex = roundIndex.with_columns(pl.col('round.roundNumber').cum_count().cast(pl.Int32).alias('round.id'))

    #Total rating points per team and round
    teamRoundPoints = (playerRankings
            .join(roundIndex, how='left', on=roundKeys)
            .filter(hasTeam)
            .group_by('team.name', 'round.id')
            .agg(pl.col('ratingPoints').sum())
            .sort('team.name', 'round.id'))

    #Rolling sum of those totals, keeping only each team's latest round
    rollingPoints = (teamRoundPoints
            .rolling(index_column='round.id', period='10i', group_by=['team.name'])
            .agg(pl.col('ratingPoints').sum().alias('rolling.ratingPoints'))
            .with_columns(pl.col('round.id').max().over('team.name').alias('lastRound')))
    latestPoints = (rollingPoints
            .filter(pl.col('round.id') == pl.col('lastRound'))
            .select('team.name', 'rolling.ratingPoints'))

    #Add to fixture: home team's ratings first, then the away team's
    withHome = (fixture
            .join(latestPoints, how='left', left_on=['home.team.name'], right_on=['team.name'])
            .rename({'rolling.ratingPoints' : 'home.team.playerPoints'}))
    withBoth = (withHome
            .join(latestPoints, how='left', left_on=['away.team.name'], right_on=['team.name'])
            .rename({'rolling.ratingPoints' : 'away.team.playerPoints'}))

    return withBoth

In [10]:
#Check for any missing data
def checkFinalNull(fixture):
    """Report whether the fixture contains any null values.

    When at least one column has a non-zero null count, a warning and
    fixture.describe() are printed so the affected columns can be found.

    Args:
        fixture: frame exposing null_count() and describe().

    Returns:
        True if any null is present, otherwise False.
    """
    #Per-column null counts flattened to one dimension
    nullCounts = fixture.null_count().to_numpy().ravel()

    if nullCounts.max().item() == 0:
        return False

    #See where and how many nulls there are
    print('There Are Nulls!')
    print(fixture.describe())
    return True


In [None]:
#Put it all together!
def createFixture():
    """Run every fixture-building step in order and validate the result.

    Starts from getFixture() and applies the enrichment steps one after the
    other. The finished frame is checked with checkFinalNull().

    Returns:
        The enriched fixture, or None when it contains any nulls.
    """
    #in order
    enrichmentSteps = (
        addLadderPosition,
        addDistance,
        addElo,
        addVenueAttributes,
        addDaysSinceLastMatch,
        addTeamAttributes,
        addPlayerRatings,
    )

    fixture = getFixture()
    for applyStep in enrichmentSteps:
        fixture = applyStep(fixture)

    #check for any nulls, if any don't produce a df
    if checkFinalNull(fixture):
        return None
    return fixture
