In [19]:
import os
import sys
import math
from rpy2.robjects import packages, pandas2ri
import pandas as pd
import numpy as np

# Treat the parent of the current working directory as the project root and
# register it on sys.path (at most once) so the `server` imports that follow
# can be resolved.
PROJECT_PATH = os.path.join(os.getcwd(), '../')

if not sys.path.count(PROJECT_PATH):
    sys.path.append(PROJECT_PATH)

from server.data_processors.feature_functions import (
    add_last_week_result,
    add_last_week_score,
    add_cum_percent,
    add_cum_win_points,
    add_rolling_last_week_win_rate,
    add_ladder_position,
    add_win_streak
)
from server.data_processors import FeatureBuilder


# Explicit Python-side names for two R symbols in fitzRoy that differ only by
# '.' versus '_'.
# NOTE(review): presumably needed because rpy2's default name translation would
# map both symbols to the same Python identifier -- confirm against rpy2 docs.
d = {
    'package.dependencies': 'package_dot_dependencies',
    'package_dependencies': 'package_uscore_dependencies',
}
fitzroy = packages.importr('fitzRoy', robject_translations=d)

In [2]:
# Fetch match results through the fitzRoy R package.
# As the output below shows, `Date` comes back as a numeric day offset and
# `Game` / `Season` come back as floats, so they need converting downstream.
matches = fitzroy.get_match_results()
matches

Game,Date,Round,...,Season,Round.Type,Round.Number
1.000000,-26535.000000,'R1',...,1897.000000,'Regular',1
2.000000,-26535.000000,'R1',...,1897.000000,'Regular',1
3.000000,-26535.000000,'R1',...,1897.000000,'Regular',1
4.000000,-26535.000000,'R1',...,1897.000000,'Regular',1
...,...,...,...,...,...,...
15404.000000,17789.000000,'SF',...,2018.000000,'Finals',25
15405.000000,17795.000000,'PF',...,2018.000000,'Finals',26
15406.000000,17796.000000,'PF',...,2018.000000,'Finals',26
15407.000000,17803.000000,'GF',...,2018.000000,'Finals',27


In [3]:
# Rename map from the dot-separated column names returned by fitzRoy to
# snake_case names. Two entries are not literal translations:
# 'Margin' -> 'home_margin' (the displayed data has it as home points minus
# away points) and 'Season' -> 'year'.
columns = {
    'Game': 'game',
    'Date': 'date',
    'Round': 'round',
    'Home.Team': 'home_team',
    'Home.Goals': 'home_goals',
    'Home.Behinds': 'home_behinds',
    'Home.Points': 'home_points',
    'Away.Team': 'away_team',
    'Away.Goals': 'away_goals',
    'Away.Behinds': 'away_behinds',
    'Away.Points': 'away_points',
    'Venue': 'venue',
    'Margin': 'home_margin',
    'Season': 'year',
    'Round.Type': 'round_type',
    'Round.Number': 'round_number'
}
# Convert the R data frame to pandas, apply the snake_case column names, turn
# the numeric day offsets in `date` into timestamps, and discard the textual
# `round` label (the numeric `round_number` column is kept).
match_df = (
    pandas2ri.ri2py(matches)
    .rename(columns=columns)
    .assign(date=lambda frame: pd.to_datetime(frame['date'], unit='D'))
    .drop(columns='round')
)

match_df

  res = PandasDataFrame.from_items(items)


Unnamed: 0,game,date,home_team,home_goals,home_behinds,home_points,away_team,away_goals,away_behinds,away_points,venue,home_margin,year,round_type,round_number
0,1.0,1897-05-08,Fitzroy,6,13,49,Carlton,2,4,16,Brunswick St,33,1897.0,Regular,1
1,2.0,1897-05-08,Collingwood,5,11,41,St Kilda,2,4,16,Victoria Park,25,1897.0,Regular,1
2,3.0,1897-05-08,Geelong,3,6,24,Essendon,7,5,47,Corio Oval,-23,1897.0,Regular,1
3,4.0,1897-05-08,Sydney,3,9,27,Melbourne,6,8,44,Lake Oval,-17,1897.0,Regular,1
4,5.0,1897-05-15,Sydney,6,4,40,Carlton,5,6,36,Lake Oval,4,1897.0,Regular,2
5,6.0,1897-05-15,Essendon,4,6,30,Collingwood,8,2,50,East Melbourne,-20,1897.0,Regular,2
6,7.0,1897-05-15,St Kilda,3,8,26,Fitzroy,10,6,66,Junction Oval,-40,1897.0,Regular,2
7,8.0,1897-05-15,Melbourne,9,10,64,Geelong,3,1,19,M.C.G.,45,1897.0,Regular,2
8,9.0,1897-05-22,Collingwood,6,5,41,Geelong,5,7,37,Victoria Park,4,1897.0,Regular,3
9,10.0,1897-05-22,Fitzroy,5,9,39,Melbourne,7,8,50,Brunswick St,-11,1897.0,Regular,3


In [4]:
# Reshape from one row per match to one row per team per match: every column
# is built as "all home-side values followed by all away-side values", and the
# `oppo_*` columns use the opposite pairing.
def _home_then_away(home_column, away_column):
    """Concatenate two match_df columns: the first in full, then the second."""
    return np.append(match_df[home_column], match_df[away_column])


match_count = len(match_df)
match_dict = {
    'team': _home_then_away('home_team', 'away_team'),
    'year': _home_then_away('year', 'year').astype('int'),
    'round_number': _home_then_away('round_number', 'round_number'),
    # 1.0 for the home-side half of the rows, 0.0 for the away-side half
    'at_home': np.append(np.ones(match_count), np.zeros(match_count)),
    'goals': _home_then_away('home_goals', 'away_goals'),
    'behinds': _home_then_away('home_behinds', 'away_behinds'),
    'score': _home_then_away('home_points', 'away_points'),
    'oppo_team': _home_then_away('away_team', 'home_team'),
    'oppo_goals': _home_then_away('away_goals', 'home_goals'),
    'oppo_behinds': _home_then_away('away_behinds', 'home_behinds'),
    'oppo_score': _home_then_away('away_points', 'home_points'),
    'venue': _home_then_away('venue', 'venue'),
    'round_type': _home_then_away('round_type', 'round_type'),
    # The margin is stored from the home side's view, so negate it for away rows
    'margin': np.append(match_df['home_margin'], match_df['home_margin'] * -1)
}
stacked_df = (
    pd.DataFrame(match_dict)
    .set_index(['team', 'year', 'round_number'], drop=False)
    .sort_index()
    # Drawn finals were replayed under the same team/year/round_number key,
    # so only the last row for each key is kept.
    .drop_duplicates(subset=('team', 'year', 'round_number'), keep='last')
)

# Correct a data-entry error in the source data for Geelong, 1897, round 15
stacked_df.loc[('Geelong', 1897, 15), 'oppo_team'] = 'Collingwood'

stacked_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,team,year,round_number,at_home,goals,behinds,score,oppo_team,oppo_goals,oppo_behinds,oppo_score,venue,round_type,margin
team,year,round_number,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Adelaide,1991,1,Adelaide,1991,1,1.0,24,11,155,Hawthorn,9,15,69,Football Park,Regular,86
Adelaide,1991,2,Adelaide,1991,2,1.0,12,9,81,Carlton,15,14,104,Football Park,Regular,-23
Adelaide,1991,3,Adelaide,1991,3,0.0,19,18,132,Sydney,15,18,108,S.C.G.,Regular,24
Adelaide,1991,4,Adelaide,1991,4,0.0,6,11,47,Essendon,12,20,92,Windy Hill,Regular,-45
Adelaide,1991,5,Adelaide,1991,5,0.0,9,11,65,West Coast,19,16,130,Subiaco,Regular,-65
Adelaide,1991,6,Adelaide,1991,6,1.0,19,14,128,Footscray,14,13,97,Football Park,Regular,31
Adelaide,1991,7,Adelaide,1991,7,0.0,4,7,31,St Kilda,24,18,162,Moorabbin Oval,Regular,-131
Adelaide,1991,9,Adelaide,1991,9,0.0,18,10,118,North Melbourne,18,12,120,M.C.G.,Regular,-2
Adelaide,1991,10,Adelaide,1991,10,1.0,15,16,106,Melbourne,10,12,72,Football Park,Regular,34
Adelaide,1991,11,Adelaide,1991,11,0.0,14,9,93,Geelong,27,15,177,Kardinia Park,Regular,-84


In [69]:
# State code and coordinates (decimal degrees) for every city referenced by
# TEAM_CITIES / VENUE_CITIES. Southern-hemisphere latitudes are negative, so
# every entry except Shanghai must have a negative `lat`.
# Corrected here: Geelong and Albury were missing the minus sign, and Cairns
# was off by ten degrees (-6.9186 instead of -16.9186).
CITIES = {
    'Adelaide': {
        'state': 'SA',
        'lat': -34.9285,
        'long': 138.6007
    },
    'Sydney': {
        'state': 'NSW',
        'lat': -33.8688,
        'long': 151.2093
    },
    'Melbourne': {
        'state': 'VIC',
        'lat': -37.8136,
        'long': 144.9631
    },
    'Geelong': {
        'state': 'VIC',
        'lat': -38.1499,
        'long': 144.3617
    },
    'Perth': {
        'state': 'WA',
        'lat': -31.9505,
        'long': 115.8605
    },
    'Gold Coast': {
        'state': 'QLD',
        'lat': -28.0167,
        'long': 153.4000
    },
    'Brisbane': {
        'state': 'QLD',
        'lat': -27.4698,
        'long': 153.0251
    },
    'Launceston': {
        'state': 'TAS',
        'lat': -41.4332,
        'long': 147.1441
    },
    'Canberra': {
        'state': 'ACT',
        'lat': -35.2809,
        'long': 149.1300
    },
    'Hobart': {
        'state': 'TAS',
        'lat': -42.8821,
        'long': 147.3272
    },
    'Darwin': {
        'state': 'NT',
        'lat': -12.4634,
        'long': 130.8456
    },
    'Alice Springs': {
        'state': 'NT',
        'lat': -23.6980,
        'long': 133.8807
    },
    'Wellington': {
        'state': 'NZ',
        'lat': -41.2865,
        'long': 174.7762
    },
    'Euroa': {
        'state': 'VIC',
        'lat': -36.7500,
        'long': 145.5667
    },
    'Yallourn': {
        'state': 'VIC',
        'lat': -38.1803,
        'long': 146.3183
    },
    'Cairns': {
        'state': 'QLD',
        'lat': -16.9186,
        'long': 145.7781
    },
    'Ballarat': {
        'state': 'VIC',
        'lat': -37.5622,
        'long': 143.8503
    },
    'Shanghai': {
        'state': 'CHN',
        'lat': 31.2304,
        'long': 121.4737
    },
    'Albury': {
        'state': 'NSW',
        'lat': -36.0737,
        'long': 146.9135
    }
}

# Home city for each team name; every value is a key of CITIES.
# NOTE(review): the mapping is not year-aware. The 1897 rows displayed earlier
# in this notebook list 'Sydney' as the home team at Lake Oval, which
# VENUE_CITIES places in Melbourne, so features derived from this table treat
# those games as interstate trips -- confirm that is acceptable.
TEAM_CITIES = {
    'Adelaide': 'Adelaide',
    'Brisbane Lions': 'Brisbane',
    'Carlton': 'Melbourne',
    'Collingwood': 'Melbourne',
    'Essendon': 'Melbourne',
    'Fitzroy': 'Melbourne',
    'Footscray': 'Melbourne',
    'Fremantle': 'Perth',
    'GWS': 'Sydney',
    'Geelong': 'Geelong',
    'Gold Coast': 'Gold Coast',
    'Hawthorn': 'Melbourne',
    'Melbourne': 'Melbourne',
    'North Melbourne': 'Melbourne',
    'Port Adelaide': 'Adelaide',
    'Richmond': 'Melbourne',
    'St Kilda': 'Melbourne',
    'Sydney': 'Sydney',
    'University': 'Melbourne',
    'West Coast': 'Perth'
}

# City in which each venue is located; every value is a key of CITIES.
# Lookups against this table are plain dict indexing, so a venue name that is
# missing here raises KeyError in add_out_of_state / add_travel_distance.
VENUE_CITIES = {
    'Football Park': 'Adelaide',
    'S.C.G.': 'Sydney',
    'Windy Hill': 'Melbourne',
    'Subiaco': 'Perth',
    'Moorabbin Oval': 'Melbourne',
    'M.C.G.': 'Melbourne',
    'Kardinia Park': 'Geelong',
    'Victoria Park': 'Melbourne',
    'Waverley Park': 'Melbourne',
    'Princes Park': 'Melbourne',
    'Western Oval': 'Melbourne',
    'W.A.C.A.': 'Perth',
    'Carrara': 'Gold Coast',
    'Gabba': 'Brisbane',
    'Docklands': 'Melbourne',
    'York Park': 'Launceston',
    'Manuka Oval': 'Canberra',
    'Sydney Showground': 'Sydney',
    'Adelaide Oval': 'Adelaide',
    'Bellerive Oval': 'Hobart',
    'Marrara Oval': 'Darwin',
    'Traeger Park': 'Alice Springs',
    'Perth Stadium': 'Perth',
    'Stadium Australia': 'Sydney',
    'Wellington': 'Wellington',
    'Lake Oval': 'Melbourne',
    'East Melbourne': 'Melbourne',
    'Corio Oval': 'Geelong',
    'Junction Oval': 'Melbourne',
    'Brunswick St': 'Melbourne',
    'Punt Rd': 'Melbourne',
    'Glenferrie Oval': 'Melbourne',
    'Arden St': 'Melbourne',
    'Olympic Park': 'Melbourne',
    'Yarraville Oval': 'Melbourne',
    'Toorak Park': 'Melbourne',
    'Euroa': 'Euroa',
    'Coburg Oval': 'Melbourne',
    'Brisbane Exhibition': 'Brisbane',
    'North Hobart': 'Hobart',
    'Bruce Stadium': 'Canberra',
    'Yallourn': 'Yallourn',
    "Cazaly's Stadium": 'Cairns',
    'Eureka Stadium': 'Ballarat',
    'Blacktown': 'Sydney',
    'Jiangwan Stadium': 'Shanghai',
    'Albury': 'Albury'
}

def add_last_week_goals(data_frame):
    """Swap the `goals` column for `last_week_goals`.

    `last_week_goals` holds the `goals` value of the preceding row within each
    group of the first index level (the team, for `stacked_df`). The first row
    of every group has no predecessor and gets NaN. Grouping is by that level
    only, so the shift carries across the other index levels.

    Returns a new frame; `data_frame` itself is not modified.
    """
    previous_goals = data_frame['goals'].groupby(level=0).shift()
    with_previous = data_frame.assign(last_week_goals=previous_goals)

    return with_previous.drop(columns='goals')

def add_last_week_behinds(data_frame):
    """Swap the `behinds` column for `last_week_behinds`.

    `last_week_behinds` holds the `behinds` value of the preceding row within
    each group of the first index level (the team, for `stacked_df`). The first
    row of every group has no predecessor and gets NaN.

    Returns a new frame; `data_frame` itself is not modified.
    """
    previous_behinds = data_frame['behinds'].groupby(level=0).shift()
    with_previous = data_frame.assign(last_week_behinds=previous_behinds)

    return with_previous.drop(columns='behinds')

def add_out_of_state(data_frame):
    """Add an integer `out_of_state` flag to a copy of `data_frame`.

    The flag is 1 when the state of the venue's city differs from the state of
    the team's city (both resolved through VENUE_CITIES / TEAM_CITIES and then
    CITIES), otherwise 0. A venue or team missing from those tables raises
    KeyError.
    """
    def state_lookup(city_by_name):
        # Build a name -> state resolver for the given name -> city table
        return lambda name: CITIES[city_by_name[name]]['state']

    playing_in = data_frame['venue'].map(state_lookup(VENUE_CITIES))
    based_in = data_frame['team'].map(state_lookup(TEAM_CITIES))
    is_interstate = based_in != playing_in

    return data_frame.assign(out_of_state=is_interstate.astype(int))

# https://www.movable-type.co.uk/scripts/latlong.html
def haversine_formula(lats_longs):
    """Return the great-circle distance in kilometres between two points.

    Args:
        lats_longs: a pair ``((lat1, long1), (lat2, long2))`` with all four
            values in decimal degrees.

    Returns:
        The haversine distance as a float, using an Earth radius of 6371 km.
    """
    (lat1, long1), (lat2, long2) = lats_longs
    EARTH_RADIUS = 6371  # kilometres
    phi1 = math.radians(lat1)
    phi2 = math.radians(lat2)
    delta_phi = phi2 - phi1
    delta_lambda = math.radians(long2 - long1)
    a = math.sin(delta_phi / 2)**2 + (math.cos(phi1) * math.cos(phi2) * math.sin(delta_lambda / 2)**2)
    # Floating-point rounding can leave `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - a) raise ValueError.
    # Clamping is a no-op for every in-range value.
    a = min(1.0, max(0.0, a))
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    return EARTH_RADIUS * c

def add_travel_distance(data_frame):
    """Add a `travel_distance` column to a copy of `data_frame`.

    For every row, the venue and the team are resolved to city coordinates via
    VENUE_CITIES / TEAM_CITIES and CITIES, and the distance between the two
    points is computed with `haversine_formula`. A venue or team missing from
    those tables raises KeyError.
    """
    def coordinate_lookup(city_by_name):
        # Build a name -> (lat, long) resolver for the given name -> city table
        def lookup(name):
            city = CITIES[city_by_name[name]]
            return (city['lat'], city['long'])
        return lookup

    venue_coords = data_frame['venue'].map(coordinate_lookup(VENUE_CITIES))
    team_coords = data_frame['team'].map(coordinate_lookup(TEAM_CITIES))
    distances = list(map(haversine_formula, zip(venue_coords, team_coords)))

    return data_frame.assign(travel_distance=distances)

# Feature functions handed to FeatureBuilder: the ones defined in this notebook
# first, then those imported from server.data_processors.feature_functions.
# NOTE(review): whether FeatureBuilder applies them in list order is not
# visible here -- confirm before reordering.
FEATURE_FUNCS = [
    add_out_of_state,
    add_travel_distance,
    add_last_week_goals,
    add_last_week_behinds,
    add_last_week_result,
    add_last_week_score,
    add_cum_percent,
    add_cum_win_points,
    add_rolling_last_week_win_rate,
    add_ladder_position,
    add_win_streak
]

# Build the feature frame from the stacked per-team data and drop every row
# that contains a NaN.
# NOTE(review): the output below still shows current-match columns such as
# `oppo_goals`, `oppo_behinds` and `oppo_score`; confirm they are excluded
# before these rows are used as model inputs.
match_model_df = FeatureBuilder(feature_funcs=FEATURE_FUNCS).transform(stacked_df).dropna()
match_model_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,team,year,round_number,at_home,score,oppo_team,oppo_goals,oppo_behinds,oppo_score,venue,...,cum_percent,oppo_cum_percent,cum_win_points,oppo_cum_win_points,rolling_last_week_win_rate,oppo_rolling_last_week_win_rate,ladder_position,oppo_ladder_position,win_streak,oppo_win_streak
team,year,round_number,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
Adelaide,1991,2,Adelaide,1991,2,1.0,81,Carlton,15,14,104,Football Park,...,1.0,1.0,4.0,0.0,1.000000,0.478261,2,8,1.0,-2.0
Adelaide,1991,3,Adelaide,1991,3,0.0,132,Sydney,15,18,108,S.C.G.,...,1.0,1.0,4.0,0.0,0.500000,0.217391,2,14,-1.0,-2.0
Adelaide,1991,4,Adelaide,1991,4,0.0,47,Essendon,12,20,92,Windy Hill,...,1.0,1.0,8.0,8.0,0.666667,0.739130,2,4,1.0,2.0
Adelaide,1991,5,Adelaide,1991,5,0.0,65,West Coast,19,16,130,Subiaco,...,1.0,1.0,8.0,12.0,0.500000,0.695652,5,4,-1.0,3.0
Adelaide,1991,6,Adelaide,1991,6,1.0,128,Footscray,14,13,97,Football Park,...,1.0,1.0,8.0,12.0,0.400000,0.565217,8,5,-2.0,2.0
Adelaide,1991,7,Adelaide,1991,7,0.0,31,St Kilda,24,18,162,Moorabbin Oval,...,1.0,1.0,12.0,10.0,0.500000,0.369565,5,9,1.0,-1.0
Adelaide,1991,9,Adelaide,1991,9,0.0,118,North Melbourne,18,12,120,M.C.G.,...,1.0,1.0,12.0,16.0,0.428571,0.608696,9,8,-1.0,3.0
Adelaide,1991,10,Adelaide,1991,10,1.0,106,Melbourne,10,12,72,Football Park,...,1.0,1.0,12.0,24.0,0.375000,0.695652,10,3,-2.0,5.0
Adelaide,1991,11,Adelaide,1991,11,0.0,93,Geelong,27,15,177,Kardinia Park,...,1.0,1.0,16.0,20.0,0.444444,0.391304,10,8,1.0,-1.0
Adelaide,1991,12,Adelaide,1991,12,1.0,50,Fitzroy,7,5,47,Football Park,...,1.0,1.0,16.0,8.0,0.400000,0.260870,9,13,-1.0,1.0
