## About:
This notebook generates the input datasets required by the ML model. <br>
The output files for this are:
<ul>
    <li>team_stats.csv - aggregated stats to team level by round</li>
    <li>team_score_results.csv - stacked scores at team level by round</li>
    <li>merged_stat_score_data_clean.csv - combined dataset of aggregated team stats and team scores by round</li>
</ul>

In [None]:
import pandas as pd

In [None]:
# Load the raw game-level and player-level exports.
game_results = pd.read_csv("game_results.csv")
player_stats = pd.read_csv("player_stats.csv")

# Add the season as a four-digit year string taken from each row's start time.
# The vectorised .dt accessor replaces a per-row Python loop; rows whose
# timestamp is missing (NaT) get a missing season instead of raising.
player_stats['season'] = pd.to_datetime(player_stats['utcStartTime']).dt.strftime('%Y')

In [None]:
# Columns that get summed per team, split by where they sit in the raw export:
# top-level stats, the "clearances." group and the "extendedStats." group.
basic_stat_cols = [
    'goals', 'behinds', 'kicks', 'handballs', 'disposals', 'marks', 'bounces',
    'tackles', 'contestedPossessions', 'uncontestedPossessions',
    'totalPossessions', 'inside50s', 'marksInside50', 'contestedMarks',
    'hitouts', 'onePercenters', 'disposalEfficiency', 'clangers',
    'freesFor', 'freesAgainst', 'rebound50s',
    'goalAssists', 'turnovers', 'intercepts', 'tacklesInside50', 'shotsAtGoal',
    'scoreInvolvements', 'metresGained',
]
clearance_cols = [
    'clearances.centreClearances',
    'clearances.stoppageClearances',
    'clearances.totalClearances',
]
extended_stat_cols = [
    'extendedStats.effectiveKicks',
    'extendedStats.kickToHandballRatio',
    'extendedStats.effectiveDisposals',
    'extendedStats.marksOnLead',
    'extendedStats.interceptMarks',
    'extendedStats.hitoutsToAdvantage',
    'extendedStats.groundBallGets',
    'extendedStats.f50GroundBallGets',
    'extendedStats.scoreLaunches',
    'extendedStats.pressureActs',
    'extendedStats.defHalfPressureActs',
    'extendedStats.spoils',
    'extendedStats.ruckContests',
    'extendedStats.contestDefOneOnOnes',
    'extendedStats.contestDefLosses',
    'extendedStats.contestOffOneOnOnes',
    'extendedStats.contestOffWins',
    'extendedStats.centreBounceAttendances',
    'extendedStats.kickins',
    'extendedStats.kickinsPlayon',
]
cols_to_sum = [*basic_stat_cols, *clearance_cols, *extended_stat_cols]

# Grouping level for the aggregation: one row per season, round and team.
group_by = ['season', 'round.roundNumber', 'team.name']

# Every column needed from the player table: stats first, grouping keys last.
df_cols = [*cols_to_sum, *group_by]

In [None]:
# keep only the stat columns to be summed plus the grouping keys
subset_player_stats = player_stats.loc[:, df_cols]

In [None]:
# sum the player rows within each grouping-key combination, then write the
# team-level table (grouping keys are saved as the leading csv columns)
grouped_players = subset_player_stats.groupby(by=group_by)
team_stats = grouped_players.sum()
team_stats.to_csv(path_or_buf='team_stats.csv')

In [None]:
# Stack the home-side and away-side score columns into one table with a common
# (year, round, team, score) layout, matching the team stats, and save it.
cols = ['year', 'round', 'team', 'score']
home_team = game_results[
    ['round.year', 'round.roundNumber', 'match.homeTeam.name', 'homeTeamScore.matchScore.totalScore']
].set_axis(cols, axis=1)
away_team = game_results[
    ['round.year', 'round.roundNumber', 'match.awayTeam.name', 'awayTeamScore.matchScore.totalScore']
].set_axis(cols, axis=1)

# home rows first, then away rows, renumbered 0..n-1
scores_df = pd.concat([home_team, away_team], ignore_index=True)

# the row index is written too; it comes back as 'Unnamed: 0' when reloaded
scores_df.to_csv('team_score_results.csv')

In [None]:
# reload both intermediate csv files from disk (scores first, then stats)
team_scores, team_stats = (
    pd.read_csv(csv_path)
    for csv_path in ('team_score_results.csv', 'team_stats.csv')
)

In [None]:
# join team stats to team scores on (season, round, team); only rows present
# in both tables are kept
stats_keys = ['season', 'round.roundNumber', 'team.name']
score_keys = ['year', 'round', 'team']
combined_data_raw = pd.merge(
    team_stats,
    team_scores,
    how='inner',
    left_on=stats_keys,
    right_on=score_keys,
)

# remove the saved row-index column and the score-side copies of the join keys
combined_data = combined_data_raw.drop(columns=['Unnamed: 0', *score_keys])

In [None]:
# write the merged stats + score table to disk, leaving out the row index
combined_data.to_csv(path_or_buf='merged_stat_score_data_clean.csv', index=False)

## Version 2:

In [None]:
from featuresv2 import features, targets, metadata

In [None]:
# Load the raw exports plus the team-level stats table saved as team_stats.csv.
game_results = pd.read_csv("game_results.csv")
player_stats = pd.read_csv("player_stats.csv")
team_stats = pd.read_csv('team_stats.csv')

# Add the season as a four-digit year string taken from each row's start time.
# The vectorised .dt accessor replaces a per-row Python loop; rows whose
# timestamp is missing (NaT) get a missing season instead of raising.
player_stats['season'] = pd.to_datetime(player_stats['utcStartTime']).dt.strftime('%Y')

In [None]:
# Per-game score table: year, round, both team names and both total scores.
score_columns = [
    'round.year',
    'round.roundNumber',
    'match.homeTeam.name',
    'match.awayTeam.name',
    'homeTeamScore.matchScore.totalScore',
    'awayTeamScore.matchScore.totalScore',
]
# Take an explicit copy so that adding a column below writes to an independent
# frame and does not raise pandas' SettingWithCopyWarning.
score_df = game_results[score_columns].copy()

# Margin from the home side's point of view (home total minus away total).
score_df['score_diff'] = (
    score_df['homeTeamScore.matchScore.totalScore']
    - score_df['awayTeamScore.matchScore.totalScore']
)

In [None]:
# Attach team stats to every game twice: once for the home side, once for the
# away side.
home_keys = ['round.year', 'round.roundNumber', 'match.homeTeam.name']
away_keys = ['round.year', 'round.roundNumber', 'match.awayTeam.name']
stats_keys = ['season', 'round.roundNumber', 'team.name']

# First pass: home team's stats, columns still unsuffixed.
home_stats = pd.merge(score_df, team_stats, left_on=home_keys, right_on=stats_keys)

# Second pass: away team's stats. The stat columns now exist on both sides, so
# the ones already present get '|HOME' and the newly joined ones get '|AWAY'.
homeaway_stats = pd.merge(
    home_stats,
    team_stats,
    left_on=away_keys,
    right_on=stats_keys,
    suffixes=('|HOME', '|AWAY'),
)
homeaway_stats.to_csv('model_training_data_v2.csv')

In [None]:
# show only the metadata, feature and target columns listed in featuresv2
model_columns = metadata + features + targets
homeaway_stats[model_columns]