# Version 4:

## About:
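This notebook generates the input datasets required by the ML model. <br>
It writes the following files:
<ul>
    <li>outputs/team_stats.csv - player stats summed by season, round, team and position group</li>
    <li>outputs/team_stats_field_position.csv - one row per team per round, with one column per stat and position group</li>
    <li>outputs/model_training_data.csv - match scores combined with the home and away teams' position-group stats for that round</li>
    <li>position_group_features.py - list of the home/away feature column names, for use in the model notebook</li>
</ul>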

In [1]:
import pandas as pd

In [2]:
from featuresv2 import features, targets, metadata

In [3]:
# read raw data
game_results = pd.read_csv("data/game_results.csv")
player_stats = pd.read_csv("data/player_stats.csv")

# add season field: the four-digit year of each match's start time, kept as a
# string because it is later joined against the stringified 'round.year'.
# The vectorised .dt accessor replaces a per-row Python loop; a missing start
# time now yields a missing season instead of raising inside the loop.
player_stats['season'] = pd.to_datetime(player_stats['utcStartTime']).dt.strftime('%Y')

  player_stats = pd.read_csv("data/player_stats.csv")


In [4]:
# preview the first rows to sanity-check the load and the derived 'season' column
player_stats.head()

Unnamed: 0,providerId,utcStartTime,status,compSeason.shortName,round.name,round.roundNumber,venue.name,home.team.name,home.team.club.name,away.team.name,...,extendedStats.kickins,extendedStats.kickinsPlayon,player.playerId,player.captain,player.playerJumperNumber,player.givenName,player.surname,teamStatus,team.name,season
0,CD_M20170140101,2017-03-23T08:20:00.000+0000,CONCLUDED,Premiership,Round 1,1,MCG,Carlton,Carlton,Richmond,...,,,CD_I260278,False,17,Sam,Rowe,home,Carlton,2017
1,CD_M20170140101,2017-03-23T08:20:00.000+0000,CONCLUDED,Premiership,Round 1,1,MCG,Carlton,Carlton,Richmond,...,,,CD_I990704,False,9,Patrick,Cripps,home,Carlton,2017
2,CD_M20170140101,2017-03-23T08:20:00.000+0000,CONCLUDED,Premiership,Round 1,1,MCG,Carlton,Carlton,Richmond,...,,,CD_I240359,False,39,Dale,Thomas,home,Carlton,2017
3,CD_M20170140101,2017-03-23T08:20:00.000+0000,CONCLUDED,Premiership,Round 1,1,MCG,Carlton,Carlton,Richmond,...,,,CD_I270637,False,43,Simon,White,home,Carlton,2017
4,CD_M20170140101,2017-03-23T08:20:00.000+0000,CONCLUDED,Premiership,Round 1,1,MCG,Carlton,Carlton,Richmond,...,,,CD_I1001028,False,1,Jack,Silvagni,home,Carlton,2017


In [5]:
# Columns that get aggregated (key_stats) and the levels they are aggregated to (group_by).
# NOTE(review): 'disposalEfficiency' and 'extendedStats.kickToHandballRatio' look like
# ratio-type stats, yet every key stat is summed downstream - confirm that is intended.
key_stats = [
    # stats stored directly on the player row
    'kicks',
    'handballs',
    'disposals',
    'marks',
    'bounces',
    'tackles',
    'contestedPossessions',
    'uncontestedPossessions',
    'totalPossessions',
    'inside50s',
    'marksInside50',
    'contestedMarks',
    'hitouts',
    'onePercenters',
    'disposalEfficiency',
    'clangers',
    'freesFor',
    'freesAgainst',
    'rebound50s',
    'goalAssists',
    'turnovers',
    'intercepts',
    'tacklesInside50',
    'shotsAtGoal',
    'scoreInvolvements',
    'metresGained',
    # 'clearances.*' columns
    'clearances.centreClearances',
    'clearances.stoppageClearances',
    'clearances.totalClearances',
    # 'extendedStats.*' columns
    'extendedStats.effectiveKicks',
    'extendedStats.kickToHandballRatio',
    'extendedStats.effectiveDisposals',
    'extendedStats.marksOnLead',
    'extendedStats.interceptMarks',
    'extendedStats.hitoutsToAdvantage',
    'extendedStats.groundBallGets',
    'extendedStats.f50GroundBallGets',
    'extendedStats.scoreLaunches',
    'extendedStats.pressureActs',
    'extendedStats.defHalfPressureActs',
    'extendedStats.spoils',
    'extendedStats.ruckContests',
    'extendedStats.contestDefOneOnOnes',
    'extendedStats.contestDefLosses',
    'extendedStats.contestOffOneOnOnes',
    'extendedStats.contestOffWins',
    'extendedStats.centreBounceAttendances',
    'extendedStats.kickins',
    'extendedStats.kickinsPlayon',
]

# one aggregated row per season / round / team / position group
group_by = [
    'season',
    'round.roundNumber',
    'team.name',
    'player.field.position',
]

# every column the aggregation needs: the stats first, then the grouping keys
df_cols = [*key_stats, *group_by]

In [8]:
# Lookup from a player's detailed position code to the position group used for
# aggregation. Entries are listed group by group; lookups do not depend on the order.
position_map = {
    # codes rolled up into "FWD"
    'CHF': "FWD",
    'FF': "FWD",
    'FPL': "FWD",
    'FPR': "FWD",
    'HFFL': "FWD",
    'HFFR': "FWD",
    # codes rolled up into "DEF"
    'CHB': "DEF",
    'FB': "DEF",
    'BPL': "DEF",
    'BPR': "DEF",
    'HBFL': "DEF",
    'HBFR': "DEF",
    # codes rolled up into "MID"
    'C': "MID",
    'R': "MID",
    'RK': "MID",
    'RR': "MID",
    'WL': "MID",
    'WR': "MID",
    # everything else is rolled up into "Other"
    'INT': "Other",
    'SUB': "Other",
    'EMERG': "Other",
}

In [9]:
# translate each player's detailed position code into its position group.
# Series.map does the lookup in one vectorised step; codes that are missing from
# position_map (including missing values) come back as NaN, so they are reported
# together in a single KeyError rather than failing on the first one encountered.
position_group = player_stats['player.player.position'].map(position_map)
if position_group.isna().any():
    unmapped_codes = player_stats.loc[position_group.isna(), 'player.player.position'].unique().tolist()
    raise KeyError(f"position codes missing from position_map: {unmapped_codes}")
player_stats['player.field.position'] = position_group

In [10]:
# distinct position groups present in the data. unique() returns them in order of
# first appearance (not sorted), so the order of field_pos depends on the row order
# of player_stats.
field_pos = player_stats['player.field.position'].unique()

In [11]:
# keep only the stats to aggregate plus the grouping keys
subset_player_stats = player_stats.loc[:, df_cols]

# total every stat per season / round / team / position group, then move the
# grouping keys out of the index and back into ordinary columns
team_stats = subset_player_stats.groupby(group_by).sum().reset_index()

# save the aggregated stats (the row index is written as the first, unnamed column)
team_stats.to_csv('outputs/team_stats.csv')

In [12]:
# pivot the position groups out into columns, so each season / round / team row
# carries one column per (stat, position group) pair.
# The position label comes straight from 'player.field.position'. Previously a running
# counter over the (sorted) rows of team_stats was translated through field_pos, which
# is in first-appearance order, so a label could end up on another group's numbers and
# would shift whenever a team had no players in one of the groups.
team_keys = ['season', 'round.roundNumber', 'team.name']
team_stats_position = team_stats.pivot_table(index=team_keys, columns='player.field.position',
                    values=key_stats, aggfunc='sum')

# order the columns by position group first, and treat a group with no players for a
# team as contributing 0 to every stat
df = team_stats_position.sort_index(axis=1, level=1).fillna(0)

# flatten the (stat, position) column pairs into '<stat>_<position>' names
df.columns = [f'{stat}_{position}' for stat, position in df.columns]

# one row per season / round / team, with the keys as ordinary columns
df = df.reset_index()

In [13]:
# save the team-level position-group stats; with to_csv's default index=True the
# row index is written as the first, unnamed column
df.to_csv('outputs/team_stats_field_position.csv')

In [14]:
# get game results to merge into player stats
score_df = game_results[['round.year', 'round.roundNumber', 'match.homeTeam.name', 'match.awayTeam.name', 'homeTeamScore.matchScore.totalScore','awayTeamScore.matchScore.totalScore']]
score_df['round.year'] = score_df['round.year'].astype(str)
score_df['score_diff'] = score_df['homeTeamScore.matchScore.totalScore'] - score_df['awayTeamScore.matchScore.totalScore']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  score_df['round.year'] = score_df['round.year'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  score_df['score_diff'] = score_df['homeTeamScore.matchScore.totalScore'] - score_df['awayTeamScore.matchScore.totalScore']


In [15]:
# join keys: the team-stats side, and the home / away side of the score frame
stats_keys = ['season', 'round.roundNumber', 'team.name']
home_keys = ['round.year', 'round.roundNumber', 'match.homeTeam.name']
away_keys = ['round.year', 'round.roundNumber', 'match.awayTeam.name']

# attach the home team's stats for the round to each match
# (merge defaults to an inner join, so matches without a stats row are dropped)
home_stats = score_df.merge(df, left_on=home_keys, right_on=stats_keys)

# attach the away team's stats; columns present on both sides are told apart by
# the '|HOME' (already attached) and '|AWAY' (newly attached) suffixes
homeaway_stats = home_stats.merge(df, left_on=away_keys, right_on=stats_keys,
                                  suffixes=('|HOME','|AWAY'))

# dump to csv, without the row index
homeaway_stats.to_csv('outputs/model_training_data.csv', index=False)

In [16]:
# collect names of all features with position suffix added to them.
# Every position group in field_pos is covered, rather than assuming there are exactly
# four; names are ordered position group first, then stat (in key_stats order).
n_features = [feature + '_' + str(position) for position in field_pos for feature in key_stats]

In [17]:
# build the '|HOME' and '|AWAY' variants of every position-group feature name;
# together (home first, then away) they select the feature columns of the merged frame
home_n_feat = [f'{name}|HOME' for name in n_features]
away_n_feat = [f'{name}|AWAY' for name in n_features]
tot_n_feat = [*home_n_feat, *away_n_feat]

In [18]:
# write list to .py file to be used in model notebook:
# the file holds a single assignment of the form `features = [...]`
with open('position_group_features.py','w+') as out_file:
    out_file.write('features = ' + repr(tot_n_feat))

In [19]:
# preview the modelling frame: metadata columns, then the home/away position-group
# features, then the targets (metadata and targets are imported from featuresv2)
homeaway_stats[metadata + tot_n_feat + targets].head()

Unnamed: 0,round.year,round.roundNumber,match.homeTeam.name,match.awayTeam.name,kicks_DEF|HOME,handballs_DEF|HOME,disposals_DEF|HOME,marks_DEF|HOME,bounces_DEF|HOME,tackles_DEF|HOME,...,extendedStats.contestDefOneOnOnes_Other|AWAY,extendedStats.contestDefLosses_Other|AWAY,extendedStats.contestOffOneOnOnes_Other|AWAY,extendedStats.contestOffWins_Other|AWAY,extendedStats.centreBounceAttendances_Other|AWAY,extendedStats.kickins_Other|AWAY,extendedStats.kickinsPlayon_Other|AWAY,homeTeamScore.matchScore.totalScore,awayTeamScore.matchScore.totalScore,score_diff
0,2017,1,Carlton,Richmond,64.0,37.0,101.0,34.0,1.0,11.0,...,5.0,1.0,6.0,2.0,0.0,0.0,0.0,89,132,-43
1,2017,1,Collingwood,Western Bulldogs,64.0,45.0,109.0,40.0,0.0,12.0,...,2.0,1.0,0.0,0.0,0.0,0.0,0.0,86,100,-14
2,2017,1,St Kilda,Melbourne,49.0,44.0,93.0,18.0,2.0,9.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,90,120,-30
3,2017,1,Sydney Swans,Port Adelaide,61.0,39.0,100.0,30.0,2.0,13.0,...,3.0,1.0,5.0,2.0,0.0,0.0,0.0,82,110,-28
4,2017,1,Essendon,Hawthorn,68.0,38.0,106.0,40.0,2.0,16.0,...,2.0,1.0,1.0,0.0,0.0,0.0,0.0,116,91,25
