# Version 4:

## About:
Notebook is used to generate the inputs for the ML model requirements. <br>
The output files for this are:
<ul>
    <li>team_stats.csv - aggregated stats to team level by round</li>
    <li>team_score_results.csv - stacked scored to team level by round</li>
    <li>merged_stat_score_data_clean.csv - combined dataset of aggregated team stats and team scores by round</li>
</ul>

In [6]:
import pandas as pd

In [7]:
from featuresv2 import features, targets, metadata

In [8]:
# read raw data
game_results = pd.read_csv("data/game_results.csv")
player_stats = pd.read_csv("data/player_stats.csv")

# add season field
player_stats['season'] = [i.strftime('%Y') for i in pd.to_datetime(player_stats['utcStartTime']).to_list()]

  player_stats = pd.read_csv("data/player_stats.csv")


In [11]:
player_stats = player_stats[:-1]
player_stats.tail()

Unnamed: 0,providerId,utcStartTime,status,compSeason.shortName,round.name,round.roundNumber,venue.name,home.team.name,home.team.club.name,away.team.name,...,extendedStats.kickins,extendedStats.kickinsPlayon,player.playerId,player.captain,player.playerJumperNumber,player.givenName,player.surname,teamStatus,team.name,season
73322,CD_M20240142702,2024-09-21T07:15:00.000+0000,CONCLUDED,Premiership,Preliminary Finals,27,MCG,Geelong Cats,Geelong Cats,Brisbane Lions,...,0.0,0.0,CD_I1002347,False,40.0,Jack,Payne,away,Brisbane Lions,2024
73323,CD_M20240142702,2024-09-21T07:15:00.000+0000,CONCLUDED,Premiership,Preliminary Finals,27,MCG,Geelong Cats,Geelong Cats,Brisbane Lions,...,0.0,0.0,CD_I1002235,False,16.0,Cam,Rayner,away,Brisbane Lions,2024
73324,CD_M20240142702,2024-09-21T07:15:00.000+0000,CONCLUDED,Premiership,Preliminary Finals,27,MCG,Geelong Cats,Geelong Cats,Brisbane Lions,...,0.0,0.0,CD_I1002401,False,37.0,Brandon,Starcevich,away,Brisbane Lions,2024
73325,CD_M20240142702,2024-09-21T07:15:00.000+0000,CONCLUDED,Premiership,Preliminary Finals,27,MCG,Geelong Cats,Geelong Cats,Brisbane Lions,...,3.0,3.0,CD_I1017067,False,44.0,Darcy,Wilmot,away,Brisbane Lions,2024
73326,CD_M20240142702,2024-09-21T07:15:00.000+0000,CONCLUDED,Premiership,Preliminary Finals,27,MCG,Geelong Cats,Geelong Cats,Brisbane Lions,...,9.0,9.0,CD_I261224,False,15.0,Dayne,Zorko,away,Brisbane Lions,2024


In [12]:
# define aggregation columns and grouping levels
key_stats = ['kicks', 'handballs', 'disposals', 'marks', 'bounces',
       'tackles', 'contestedPossessions', 'uncontestedPossessions',
       'totalPossessions', 'inside50s', 'marksInside50', 'contestedMarks',
       'hitouts', 'onePercenters', 'disposalEfficiency', 'clangers',
       'freesFor', 'freesAgainst', 'rebound50s',
       'goalAssists', 'turnovers', 'intercepts', 'tacklesInside50', 'shotsAtGoal',
       'scoreInvolvements', 'metresGained', 'clearances.centreClearances',
       'clearances.stoppageClearances', 'clearances.totalClearances',
       'extendedStats.effectiveKicks', 'extendedStats.kickToHandballRatio', 'extendedStats.effectiveDisposals',
       'extendedStats.marksOnLead', 'extendedStats.interceptMarks',
       'extendedStats.hitoutsToAdvantage', 'extendedStats.groundBallGets',
       'extendedStats.f50GroundBallGets', 'extendedStats.scoreLaunches',
       'extendedStats.pressureActs', 'extendedStats.defHalfPressureActs',
       'extendedStats.spoils', 'extendedStats.ruckContests',
       'extendedStats.contestDefOneOnOnes', 'extendedStats.contestDefLosses',
       'extendedStats.contestOffOneOnOnes', 'extendedStats.contestOffWins',
       'extendedStats.centreBounceAttendances', 'extendedStats.kickins',
       'extendedStats.kickinsPlayon']
group_by = ['season','round.roundNumber','team.name','player.field.position']
df_cols = key_stats + group_by

In [18]:
position_map = {'CHF':"FWD",'INT':"Other",
                'FF':"FWD",'HBFL':"DEF",
                'CHB':"DEF",'FPL':"FWD",
                'WR':"MID",'HFFR':"FWD",
                'BPR':"DEF",'FPR':"FWD",
                'HBFR':"DEF",'RR':"MID",
                'HFFL':"FWD",'BPL':"DEF",
                'C':"MID",'RK':"MID",
                'FB':"DEF",'R':"MID",
                'WL':"MID",'SUB':"Other",
                'EMERG':"Other"
               }

In [19]:
player_stats['player.field.position'] = [position_map[i] for i in player_stats['player.player.position']]

In [20]:
field_pos = player_stats['player.field.position'].unique()

In [21]:
# subset to only aggregation + grouping columns
subset_player_stats = player_stats[df_cols]

# apply aggregation and output file
team_stats = subset_player_stats.groupby(group_by).sum()
team_stats.reset_index(inplace=True)
team_stats.to_csv('outputs/team_stats.csv')

In [22]:
# pivot stats out and sum by position for grouped positions
team_stats['idx'] = team_stats.groupby(['season','round.roundNumber','team.name']).cumcount()+1
team_stats_position = team_stats.pivot_table(index=['season','round.roundNumber','team.name','player.field.position'], columns='idx', 
                    values=key_stats, aggfunc='sum')

# set columns to new_names with unstacked columns
df = team_stats_position.sort_index(axis=1, level=1)
df.columns = [f'{x}_{field_pos[y-1]}' for x,y in df.columns]
df = df.reset_index()

# drop the categrorical positions out
df.drop('player.field.position', axis=1, inplace=True)

# group stats data to team level for all new features
df = df.groupby(['season','round.roundNumber','team.name']).sum()
df.reset_index(inplace=True)

In [23]:
df.to_csv('outputs/team_stats_field_position.csv')

In [24]:
# get game results to merge into player stats
score_df = game_results[['round.year', 'round.roundNumber', 'match.homeTeam.name', 'match.awayTeam.name', 'homeTeamScore.matchScore.totalScore','awayTeamScore.matchScore.totalScore']]
score_df['round.year'] = score_df['round.year'].astype(str)
score_df['score_diff'] = score_df['homeTeamScore.matchScore.totalScore'] - score_df['awayTeamScore.matchScore.totalScore']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  score_df['round.year'] = score_df['round.year'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  score_df['score_diff'] = score_df['homeTeamScore.matchScore.totalScore'] - score_df['awayTeamScore.matchScore.totalScore']


In [25]:
# merge home team stats for round into score dataframe
home_stats = score_df.merge(df, left_on=['round.year', 'round.roundNumber', 'match.homeTeam.name'],
              right_on=['season', 'round.roundNumber', 'team.name'])
# merge away team stats for round into score+home dataframe
homeaway_stats = home_stats.merge(df, left_on=['round.year', 'round.roundNumber', 'match.awayTeam.name'],
              right_on=['season', 'round.roundNumber', 'team.name'], suffixes=('|HOME','|AWAY'))

# dump to csv
homeaway_stats.to_csv('outputs/model_training_data.csv', index=False)

In [26]:
# collect names of all features with position suffix added to them
n_features = []

for i in range(0,4):
    for feature in key_stats:
        n_features.append(feature+'_'+str(field_pos[i]))

In [27]:
# combine 'HOME' & 'AWAY' suffixed features from combined df to use for subsetting training data
away_n_feat = [feat + '|AWAY' for feat in n_features]
home_n_feat = [feat + '|HOME' for feat in n_features]
tot_n_feat = home_n_feat + away_n_feat

In [28]:
# write list to .py file to be used in model notebook
with open('position_group_features.py','w+') as file:
    file.write('features = ' + str(tot_n_feat))

In [30]:
homeaway_stats[metadata + tot_n_feat + targets].head()

Unnamed: 0,round.year,round.roundNumber,match.homeTeam.name,match.awayTeam.name,kicks_DEF|HOME,handballs_DEF|HOME,disposals_DEF|HOME,marks_DEF|HOME,bounces_DEF|HOME,tackles_DEF|HOME,...,extendedStats.contestDefOneOnOnes_Other|AWAY,extendedStats.contestDefLosses_Other|AWAY,extendedStats.contestOffOneOnOnes_Other|AWAY,extendedStats.contestOffWins_Other|AWAY,extendedStats.centreBounceAttendances_Other|AWAY,extendedStats.kickins_Other|AWAY,extendedStats.kickinsPlayon_Other|AWAY,homeTeamScore.matchScore.totalScore,awayTeamScore.matchScore.totalScore,score_diff
1317,2024,25,Brisbane Lions,Carlton,74.0,19.0,93.0,27.0,2.0,11.0,...,0.0,0.0,4.0,1.0,35.0,0.0,0.0,99,71,28
1318,2024,26,Port Adelaide,Hawthorn,60.0,31.0,91.0,30.0,4.0,8.0,...,3.0,1.0,1.0,1.0,15.0,3.0,3.0,75,72,3
1319,2024,26,GWS GIANTS,Brisbane Lions,61.0,35.0,96.0,33.0,3.0,16.0,...,0.0,0.0,3.0,0.0,0.0,3.0,3.0,100,105,-5
1320,2024,27,Sydney Swans,Port Adelaide,73.0,24.0,97.0,48.0,2.0,10.0,...,0.0,0.0,1.0,0.0,27.0,0.0,0.0,95,59,36
1321,2024,27,Geelong Cats,Brisbane Lions,48.0,21.0,69.0,19.0,0.0,10.0,...,0.0,0.0,1.0,0.0,1.0,3.0,3.0,85,95,-10
