# Version 2:

## About:
Notebook is used to generate the inputs for the ML model requirements. <br>
The output files for this are:
<ul>
    <li>team_stats.csv - aggregated stats to team level by round</li>
    <li>team_score_results.csv - stacked scored to team level by round</li>
    <li>merged_stat_score_data_clean.csv - combined dataset of aggregated team stats and team scores by round</li>
</ul>

In [1]:
import pandas as pd

In [41]:
from featuresv2 import features, targets, metadata

In [23]:
# read raw data
game_results = pd.read_csv("data/game_results.csv")
player_stats = pd.read_csv("data/player_stats.csv")
ladder_stats = pd.read_csv("../ladder_stats.csv")

# add season field
player_stats['season'] = [i.strftime('%Y') for i in pd.to_datetime(player_stats['utcStartTime']).to_list()]

  player_stats = pd.read_csv("data/player_stats.csv")


In [25]:
team_map = {'Sydney' : "Sydney Swans",
            'Gold Coast' : 'Gold Coast Suns',
            'West Coast' : 'West Coast Eagles',
            'GWS' : 'GWS Giants',
            'Footscray' : 'Western Bulldogs',
            'Geelong' : 'Geelong Cats',
            'Adelaide' : 'Adelaide Crows'}

In [26]:
ladder_stats.replace({'Team': team_map}, inplace=True)
ladder_stats['Season'] = ladder_stats['Season'].astype(str)

In [34]:
# define aggregation columns and grouping levels
cols_to_sum = ['goals', 'behinds', 'kicks', 'handballs', 'disposals', 'marks', 'bounces',
       'tackles', 'contestedPossessions', 'uncontestedPossessions',
       'totalPossessions', 'inside50s', 'marksInside50', 'contestedMarks',
       'hitouts', 'onePercenters', 'disposalEfficiency', 'clangers',
       'freesFor', 'freesAgainst', 'rebound50s',
       'goalAssists', 'turnovers', 'intercepts', 'tacklesInside50', 'shotsAtGoal',
       'scoreInvolvements', 'metresGained', 'clearances.centreClearances',
       'clearances.stoppageClearances', 'clearances.totalClearances',
       'extendedStats.effectiveKicks', 'extendedStats.kickToHandballRatio', 'extendedStats.effectiveDisposals',
       'extendedStats.marksOnLead', 'extendedStats.interceptMarks',
       'extendedStats.hitoutsToAdvantage', 'extendedStats.groundBallGets',
       'extendedStats.f50GroundBallGets', 'extendedStats.scoreLaunches',
       'extendedStats.pressureActs', 'extendedStats.defHalfPressureActs',
       'extendedStats.spoils', 'extendedStats.ruckContests',
       'extendedStats.contestDefOneOnOnes', 'extendedStats.contestDefLosses',
       'extendedStats.contestOffOneOnOnes', 'extendedStats.contestOffWins',
       'extendedStats.centreBounceAttendances', 'extendedStats.kickins',
       'extendedStats.kickinsPlayon']
group_by = ['season','round.roundNumber','team.name']
df_cols = cols_to_sum + group_by

In [35]:
# subset to only aggregation + grouping columns
subset_player_stats = player_stats[df_cols]

# apply aggregation and output file
team_stats = subset_player_stats.groupby(group_by).sum()
team_stats.reset_index(inplace=True)
team_stats.to_csv('outputs/team_stats.csv')

In [14]:
score_df = game_results[['round.year', 'round.roundNumber', 'match.homeTeam.name', 'match.awayTeam.name', 'homeTeamScore.matchScore.totalScore','awayTeamScore.matchScore.totalScore']]
score_df['round.year'] = score_df['round.year'].astype(str)
score_df['score_diff'] = score_df['homeTeamScore.matchScore.totalScore'] - score_df['awayTeamScore.matchScore.totalScore']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  score_df['round.year'] = score_df['round.year'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  score_df['score_diff'] = score_df['homeTeamScore.matchScore.totalScore'] - score_df['awayTeamScore.matchScore.totalScore']


In [27]:
ladder_stats.drop(['Unnamed: 0', 'Season.Points', 'Score.For', 'Score.Against'], axis=1, inplace=True)
ladder_stats.head()

Unnamed: 0,Season,Team,Round.Number,Percentage,Ladder.Position
0,2017,Adelaide Crows,1,1.615385,1
1,2017,Geelong Cats,1,1.575342,2
2,2017,Richmond,1,1.483146,3
3,2017,West Coast Eagles,1,1.462366,4
4,2017,Port Adelaide,1,1.341463,5


In [31]:
home = score_df.merge(ladder_stats, left_on=['round.year', 'round.roundNumber', 'match.homeTeam.name'],
              right_on=['Season', 'Round.Number', 'Team'])
home_away = home.merge(ladder_stats, left_on=['round.year', 'round.roundNumber', 'match.awayTeam.name'],
              right_on=['Season', 'Round.Number', 'Team'], suffixes=('|HOME','|AWAY'))
home_away.drop(['Round.Number|HOME', 'Season|HOME', 'Team|HOME', 'Round.Number|HOME',
               'Round.Number|AWAY', 'Season|AWAY', 'Team|AWAY', 'Round.Number|AWAY'], axis=1, inplace=True)
home_away.head()

Unnamed: 0,round.year,round.roundNumber,match.homeTeam.name,match.awayTeam.name,homeTeamScore.matchScore.totalScore,awayTeamScore.matchScore.totalScore,score_diff,Percentage|HOME,Ladder.Position|HOME,Percentage|AWAY,Ladder.Position|AWAY
0,2017,1,Carlton,Richmond,89,132,-43,0.674242,16,1.483146,3
1,2017,1,Collingwood,Western Bulldogs,86,100,-14,0.86,11,1.162791,8
2,2017,1,St Kilda,Melbourne,90,120,-30,0.75,13,1.333333,6
3,2017,1,Sydney Swans,Port Adelaide,82,110,-28,0.745455,14,1.341463,5
4,2017,1,Essendon,Hawthorn,116,91,25,1.274725,7,0.784483,12


In [36]:
home_stats = home_away.merge(team_stats, left_on=['round.year', 'round.roundNumber', 'match.homeTeam.name'],
              right_on=['season', 'round.roundNumber', 'team.name'])
homeaway_stats = home_stats.merge(team_stats, left_on=['round.year', 'round.roundNumber', 'match.awayTeam.name'],
              right_on=['season', 'round.roundNumber', 'team.name'], suffixes=('|HOME','|AWAY'))
homeaway_stats.to_csv('outputs/model_training_data_v2.csv')

In [43]:
homeaway_stats[metadata + features + targets]

Unnamed: 0,round.year,round.roundNumber,match.homeTeam.name,match.awayTeam.name,kicks|HOME,handballs|HOME,disposals|HOME,marks|HOME,bounces|HOME,tackles|HOME,...,extendedStats.contestDefOneOnOnes|AWAY,extendedStats.contestDefLosses|AWAY,extendedStats.contestOffOneOnOnes|AWAY,extendedStats.contestOffWins|AWAY,extendedStats.centreBounceAttendances|AWAY,extendedStats.kickins|AWAY,extendedStats.kickinsPlayon|AWAY,homeTeamScore.matchScore.totalScore,awayTeamScore.matchScore.totalScore,score_diff
0,2017,1,Carlton,Richmond,215,133,348,99,8,67,...,17,5,26,6,0.0,0.0,0.0,89,132,-43
1,2017,1,Collingwood,Western Bulldogs,246,185,431,118,2,67,...,18,5,13,4,0.0,0.0,0.0,86,100,-14
2,2017,1,St Kilda,Melbourne,179,158,337,64,6,53,...,10,5,15,5,0.0,0.0,0.0,90,120,-30
3,2017,1,Sydney Swans,Port Adelaide,208,150,358,88,2,70,...,22,8,31,13,0.0,0.0,0.0,82,110,-28
4,2017,1,Essendon,Hawthorn,256,172,428,135,8,65,...,19,7,23,9,0.0,0.0,0.0,116,91,25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1345,2023,24,Geelong Cats,Western Bulldogs,224,162,386,107,0,85,...,10,3,11,3,120.0,13.0,12.0,79,104,-25
1346,2023,24,West Coast Eagles,Adelaide Crows,216,142,358,96,6,47,...,13,2,11,3,132.0,5.0,5.0,78,123,-45
1347,2023,24,Port Adelaide,Richmond,229,134,363,108,22,51,...,18,7,17,4,96.0,16.0,12.0,94,63,31
1348,2023,24,Sydney Swans,Melbourne,215,146,361,84,2,76,...,16,2,11,4,88.0,13.0,13.0,56,77,-21
