In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from tpot import TPOTRegressor
from sklearn.metrics import make_scorer
import matplotlib.pyplot as plt
from featuresv2 import features, targets, metadata
%matplotlib inline



## Version 2 Model

In [2]:
# Load the version-2 training table; every model and tear-off below reads from `data`.
# The path is relative, so the notebook must be run from the directory containing `outputs/`.
data = pd.read_csv('outputs/model_training_data_v2.csv')

In [3]:
# Ad-hoc list of (home team, away team) pairs.
# NOTE(review): no cell visible in this notebook reads this variable (the
# tear-offs are run on `round1`) - confirm whether it is still needed.
round_matchups = [
    ('Sydney Swans', 'Melbourne'),
    ('Brisbane Lions', 'Carlton'),
    ('Gold Coast Suns', 'Richmond'),
    ('GWS Giants', 'Collingwood'),
]

In [21]:
# Fixtures passed to the tear-off functions, one (home team, away team) tuple per match.
round1 = [
    ('Carlton', 'Richmond'),
    ('Collingwood', 'Sydney Swans'),
    ('Essendon', 'Hawthorn'),
    ('GWS Giants', 'North Melbourne'),
    ('Geelong Cats', 'St Kilda'),
    ('Gold Coast Suns', 'Adelaide Crows'),
    ('Melbourne', 'Western Bulldogs'),
    ('Port Adelaide', 'West Coast Eagles'),
    ('Fremantle', 'Brisbane Lions'),
]

In [7]:
# Display the target column names imported from `featuresv2`; the training
# cells below select them by position (targets[0], targets[1], targets[2]).
targets

['homeTeamScore.matchScore.totalScore',
 'awayTeamScore.matchScore.totalScore',
 'score_diff']

In [13]:
def generate_test_train_split(data, features, target, test_size=0.3, random_state=42):
    """Split ``data`` into train and test partitions for a single target.

    Parameters
    ----------
    data : column-indexable table
        Source table; ``data[features]`` and ``data[target]`` must both be valid
        selections (a pandas DataFrame in this notebook).
    features : list
        Column labels used as model inputs.
    target : str
        Column label of the value to predict.
    test_size : float, default 0.3
        Forwarded to ``train_test_split``.
    random_state : int or None, default 42
        Seed forwarded to ``train_test_split``. The default keeps the value
        that was previously hard-coded, so existing callers get the same split.

    Returns
    -------
    dict
        The four partitions under the keys ``"x_train"``, ``"x_test"``,
        ``"y_train"`` and ``"y_test"``.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        data[features],
        data[target],
        test_size=test_size,
        random_state=random_state,
    )

    return {"x_train": x_train, "x_test": x_test, "y_train": y_train, "y_test": y_test}

In [14]:
def tpot_pipeline_v2(data, features, target, size=20, scoring='r2', generations=5, random_state=None):
    """Fit a TPOT regression pipeline for ``target`` and print its hold-out score.

    Parameters
    ----------
    data : column-indexable table
        Training table handed to ``generate_test_train_split``.
    features : list
        Column labels used as model inputs.
    target : str
        Column label of the value to predict.
    size : int, default 20
        Passed to ``TPOTRegressor`` as ``population_size``.
    scoring : str, default 'r2'
        Passed to ``TPOTRegressor`` as ``scoring``.
    generations : int, default 5
        Passed to ``TPOTRegressor`` as ``generations``. The default is the
        value that was previously hard-coded.
    random_state : int or None, default None
        Passed to ``TPOTRegressor`` as ``random_state``. ``None`` keeps the
        previous unseeded behaviour; pass an integer to make the pipeline
        search repeatable between runs.

    Returns
    -------
    TPOTRegressor
        The fitted optimizer.
    """
    data_ = generate_test_train_split(data, features, target)

    pipeline_optimizer = TPOTRegressor(
        generations=generations,
        population_size=size,
        verbosity=2,
        scoring=scoring,
        random_state=random_state,
    )

    pipeline_optimizer.fit(data_['x_train'], data_['y_train'])

    # Report performance on the held-out partition that was not used for fitting.
    print(pipeline_optimizer.score(data_['x_test'], data_['y_test']))

    return pipeline_optimizer

In [68]:
def return_stats_input(data, home_team, away_team, model_inputs, games=5):
    """Build a one-row model input from previous meetings of two teams.

    Rows of ``data`` where ``match.homeTeam.name == home_team`` and
    ``match.awayTeam.name == away_team`` are selected, and each column in
    ``model_inputs`` is reduced to a weighted average over those rows. The
    i-th selected row (in ``data`` order, starting at 1) gets weight ``i``,
    so rows further down the table count more.
    NOTE(review): presumably ``data`` is sorted oldest-to-newest so that recent
    games weigh more - confirm against how the CSV is produced.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``match.homeTeam.name``, ``match.awayTeam.name`` and
        every column listed in ``model_inputs``.
    home_team, away_team : str
        Team names to match exactly.
    model_inputs : list
        Numeric feature columns to average.
    games : int, default 5
        Currently unused; accepted only for backward compatibility.
        NOTE(review): looks intended to cap how many past meetings are used -
        confirm before wiring it in, since that would change predictions.

    Returns
    -------
    pandas.DataFrame
        A single row (index ``0``) with one column per entry of ``model_inputs``.

    Raises
    ------
    ValueError
        If ``data`` has no row for this home/away pairing.
    """
    is_matchup = (data['match.homeTeam.name'] == home_team) & (data['match.awayTeam.name'] == away_team)
    match_ups = data[is_matchup]

    if match_ups.empty:
        raise ValueError(
            "No previous matches found for home team %r against away team %r" % (home_team, away_team)
        )

    weights = np.arange(1, len(match_ups) + 1)

    # Every selected row shares one home team, so grouping by it is redundant.
    # Averaging the feature columns directly gives the same numbers without
    # relying on groupby.apply dropping the string grouping column, or on
    # positional integer lookup into a label-indexed Series.
    feature_values = match_ups[list(model_inputs)].to_numpy(dtype=float)
    averaged = np.average(feature_values, weights=weights, axis=0)

    return pd.DataFrame(dict(zip(model_inputs, averaged)), index=[0])

In [69]:
def generate_margin_tear_off(model, round_matchups, team_stats, model_inputs):
    """Print a winner/margin tear-off for each matchup using a margin model.

    For every ``(home, away)`` pair the model predicts one value from the row
    built by ``return_stats_input``. A negative prediction is reported as an
    away win, anything else as a home win, and the rounded absolute value is
    printed as the margin.
    NOTE(review): this assumes the model's target is home score minus away
    score - confirm against how ``score_diff`` is computed.

    Parameters
    ----------
    model : fitted TPOT optimizer
        Must expose ``fitted_pipeline_`` and ``predict``.
    round_matchups : iterable of (str, str)
        ``(home team, away team)`` pairs.
    team_stats : pandas.DataFrame
        Historical match table forwarded to ``return_stats_input``.
    model_inputs : list
        Feature columns the model was trained on.

    Returns
    -------
    None
        Output is printed only.
    """
    # Label the banner with the final step of the pipeline (the regressor).
    # Indexing step 0 named the scaler whenever the pipeline started with a
    # preprocessing step, which mislabelled the tear-off.
    estimator_name = type(model.fitted_pipeline_[-1]).__name__
    print('-------------------' + estimator_name + ' TEAR-OFF-------------------')

    for tup in round_matchups:
        home_team, away_team = tup[0], tup[1]
        predict = model.predict(return_stats_input(team_stats, home_team, away_team, model_inputs))[0]
        winner = away_team if predict < 0 else home_team
        print("------------------------------------------------")
        print("Home: " + home_team)
        print("Away: " + away_team)
        print("Winner: " + winner + "\t       " + "Margin: " + str(abs(round(predict))))

In [70]:
def generate_score_tear_off(home_model, away_model, round_matchups, team_stats, model_inputs):
    """Print a score tear-off for each matchup using separate home/away models.

    For every ``(home, away)`` pair one feature row is built with
    ``return_stats_input`` and fed to both models. The side with the higher
    raw prediction is the winner (ties go to the home team) and the margin is
    the difference of the rounded scores.

    Parameters
    ----------
    home_model, away_model : fitted models
        Must expose ``predict``; they predict the home and away score.
    round_matchups : iterable of (str, str)
        ``(home team, away team)`` pairs.
    team_stats : pandas.DataFrame
        Historical match table forwarded to ``return_stats_input``.
    model_inputs : list
        Feature columns the models were trained on.

    Returns
    -------
    list of tuple
        One ``(home prediction, away prediction, margin)`` tuple per matchup;
        the predictions are unrounded, the margin uses rounded scores.
    """
    label_width = 25  # column at which the score is printed after the team label
    preds = []
    for tup in round_matchups:
        home_team, away_team = tup[0], tup[1]

        # Both models consume the same feature row, so build it once per matchup.
        stats_input = return_stats_input(team_stats, home_team, away_team, model_inputs)
        h_predict = home_model.predict(stats_input)[0]
        a_predict = away_model.predict(stats_input)[0]
        h_score = round(h_predict)
        a_score = round(a_predict)

        if h_predict < a_predict:
            winner = away_team
            margin = a_score - h_score
        else:
            winner = home_team
            margin = h_score - a_score

        home_label = "Home: " + home_team
        away_label = "Away: " + away_team
        print("------------------------------------------------")
        # Pad to the score column, but always leave at least one space so a
        # long team name cannot run into the score.
        print(home_label.ljust(max(label_width, len(home_label) + 1)) + str(abs(h_score)))
        print(away_label.ljust(max(label_width, len(away_label) + 1)) + str(abs(a_score)))
        print("Winner: " + winner + " by " + str(margin) + " points")
        print("Total Score: " + str(h_score + a_score))
        preds.append((h_predict, a_predict, margin))
    return preds

In [64]:
# Search for a pipeline predicting targets[0] (the home team's total score).
home_model = tpot_pipeline_v2(data, features, targets[0], size=10)

Optimization Progress:   0%|          | 0/60 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.8869561348324426

Generation 2 - Current best internal CV score: 0.8869561348324426

Generation 3 - Current best internal CV score: 0.9714292773697034

Generation 4 - Current best internal CV score: 0.9716509556394616

Generation 5 - Current best internal CV score: 0.971650955641367

Best pipeline: RidgeCV(StandardScaler(input_matrix))
0.9732472467442386


In [65]:
# Search for a pipeline predicting targets[1] (the away team's total score).
away_model = tpot_pipeline_v2(data, features, targets[1], size=10)

Optimization Progress:   0%|          | 0/60 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.8991087717317608

Generation 2 - Current best internal CV score: 0.9724996539690842

Generation 3 - Current best internal CV score: 0.9724996539690842

Generation 4 - Current best internal CV score: 0.9724996539690842

Generation 5 - Current best internal CV score: 0.9726279979357889

Best pipeline: ElasticNetCV(RobustScaler(input_matrix), l1_ratio=0.9, tol=0.01)
0.9705590440341471


In [19]:
# Search for a pipeline predicting targets[2] ('score_diff').
margin_model = tpot_pipeline_v2(data, features, targets[2], size=10)

Optimization Progress:   0%|          | 0/60 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.9731446552014216

Generation 2 - Current best internal CV score: 0.9731446552014216

Generation 3 - Current best internal CV score: 0.9731446552014216

Generation 4 - Current best internal CV score: 0.9733778941243735

Generation 5 - Current best internal CV score: 0.9733778941243735

Best pipeline: ElasticNetCV(RobustScaler(input_matrix), l1_ratio=1.0, tol=0.0001)
0.9720120903463568


In [74]:
# Print predicted scores for each fixture in `round1` using the separate home and away score models.
generate_score_tear_off(home_model, away_model, round1, data, features)

------------------------------------------------
Home: Carlton            86
Away: Richmond           81
Winner: Carlton by 5 points
Total Score: 167
------------------------------------------------
Home: Collingwood        65
Away: Sydney Swans       48
Winner: Collingwood by 17 points
Total Score: 113
------------------------------------------------
Home: Essendon           100
Away: Hawthorn           83
Winner: Essendon by 17 points
Total Score: 183
------------------------------------------------
Home: GWS Giants         57
Away: North Melbourne    84
Winner: North Melbourne by 27 points
Total Score: 141




------------------------------------------------
Home: Geelong Cats       98
Away: St Kilda           67
Winner: Geelong Cats by 31 points
Total Score: 165
------------------------------------------------
Home: Gold Coast Suns    99
Away: Adelaide Crows     79
Winner: Gold Coast Suns by 20 points
Total Score: 178
------------------------------------------------
Home: Melbourne          108
Away: Western Bulldogs   68
Winner: Melbourne by 40 points
Total Score: 176
------------------------------------------------
Home: Port Adelaide      109
Away: West Coast Eagles  50
Winner: Port Adelaide by 59 points
Total Score: 159
------------------------------------------------
Home: Fremantle          81
Away: Brisbane Lions     89
Winner: Brisbane Lions by 8 points
Total Score: 170


In [72]:
# Print predicted winner and margin for each fixture in `round1` using the score_diff model.
generate_margin_tear_off(margin_model, round1, data, features)

-------------------RobustScaler TEAR-OFF-------------------
------------------------------------------------
Home: Carlton
Away: Richmond
Winner: Carlton	       Margin: 6
------------------------------------------------
Home: Collingwood
Away: Sydney Swans
Winner: Collingwood	       Margin: 17
------------------------------------------------
Home: Essendon
Away: Hawthorn
Winner: Essendon	       Margin: 18
------------------------------------------------
Home: GWS Giants
Away: North Melbourne
Winner: North Melbourne	       Margin: 27
------------------------------------------------
Home: Geelong Cats
Away: St Kilda
Winner: Geelong Cats	       Margin: 31
------------------------------------------------
Home: Gold Coast Suns
Away: Adelaide Crows
Winner: Gold Coast Suns	       Margin: 19
------------------------------------------------
Home: Melbourne
Away: Western Bulldogs
Winner: Melbourne	       Margin: 40
------------------------------------------------
Home: Port Adelaide
Away: West C