In [102]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from tpot import TPOTRegressor
from sklearn.metrics import make_scorer
import xgboost as xg 
import matplotlib.pyplot as plt
#from featuresv1 import features, target
from featuresv2 import features, targets, metadata
%matplotlib inline

In [82]:
# Load the merged stat/score training table and the per-player statistics.
data = pd.read_csv('merged_stat_score_data_clean.csv')
player_stats = pd.read_csv("player_stats.csv")

# Derive the season as a four-digit year string from the match start time.
# The vectorised .dt accessor replaces a per-element Python loop and yields
# NaN for missing timestamps instead of raising.
player_stats['season'] = pd.to_datetime(player_stats['utcStartTime']).dt.strftime('%Y')

  player_stats = pd.read_csv("player_stats.csv")


In [83]:
def generate_test_train_split(data, features, target, test_size=0.3, random_state=42):
    """Split ``data`` into train/test feature and target partitions.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing both the feature and target columns.
    features : list
        Column labels used as model inputs.
    target : label or list of labels
        Column(s) to predict; selected with ``data[target]``.
    test_size : float, default 0.3
        Forwarded to ``train_test_split``.
    random_state : int or None, default 42
        Seed for the shuffle. The default keeps the previously hard-coded
        value so existing calls produce the same split.

    Returns
    -------
    dict
        Keys ``"x_train"``, ``"x_test"``, ``"y_train"`` and ``"y_test"``.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        data[features],
        data[target],
        test_size=test_size,
        random_state=random_state,
    )

    return {"x_train": x_train, "x_test": x_test, "y_train": y_train, "y_test": y_test}

In [84]:
def tpot_pipeline(data, features, target, size=20, scoring='r2', generations=5, random_state=None):
    """Run a TPOT regression search and print its score on the held-out split.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing the feature and target columns.
    features : list
        Column labels used as model inputs.
    target : label or list of labels
        Passed to ``generate_test_train_split``. The resulting target must
        expose a ``score`` attribute/column, which is what the model is
        trained on (presumably ``target`` is ``['score']`` — TODO confirm).
    size : int, default 20
        TPOT ``population_size``.
    scoring : str, default 'r2'
        TPOT scoring function.
    generations : int, default 5
        Number of TPOT generations (previously hard-coded to 5).
    random_state : int or None, default None
        Seed for the TPOT search; ``None`` keeps the previous unseeded
        behaviour.

    Returns
    -------
    TPOTRegressor
        The fitted optimizer.
    """
    data_ = generate_test_train_split(data, features, target)

    # Use the same 1-D `score` column for fitting and for scoring. Previously
    # the test target was passed as the raw split output while the training
    # target was flattened, so scoring received a different shape than fit.
    y_train = data_['y_train'].score.to_numpy()
    y_test = data_['y_test'].score.to_numpy()

    pipeline_optimizer = TPOTRegressor(
        generations=generations,
        population_size=size,
        verbosity=2,
        scoring=scoring,
        random_state=random_state,
    )

    pipeline_optimizer.fit(data_['x_train'], y_train)

    print(pipeline_optimizer.score(data_['x_test'], y_test))

    return pipeline_optimizer

In [88]:
# Fit the TPOT pipeline on the loaded training data.
# NOTE(review): `target` is not defined by any visible cell — the only import
# that names it (`from featuresv1 import features, target`) is commented out,
# so this cell appears to rely on leftover kernel state. Confirm before re-running.
model = tpot_pipeline(data, features, target)

Optimization Progress:   0%|          | 0/120 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.9068551112995064

Generation 2 - Current best internal CV score: 0.9068551112995064

Generation 3 - Current best internal CV score: 0.9068551112995064

Generation 4 - Current best internal CV score: 0.907100104890662

Generation 5 - Current best internal CV score: 0.907100104890662

Best pipeline: RidgeCV(MaxAbsScaler(ZeroCount(input_matrix)))
0.9156231010904672


  y = column_or_1d(y, warn=True)


In [None]:
def fit_lin_regression(x_train, y_train):
    """Fit a scikit-learn ``LinearRegression`` on the training data.

    Returns the fitted estimator.
    """
    # ``fit`` returns the estimator itself, so the fitted model can be
    # handed back directly.
    return LinearRegression().fit(x_train, y_train)

In [None]:
def eval_lin_regression(x_test, y_test, model=None):
    """Print the R-squared score and RMSE of a fitted regression model.

    Parameters
    ----------
    x_test, y_test
        Held-out features and targets.
    model : fitted estimator, optional
        Object exposing ``predict`` and ``score`` (e.g. the return value of
        ``fit_lin_regression``). When omitted, the function falls back to a
        notebook-level variable named ``regr``, as it did originally; a
        ``NameError`` is raised if that variable does not exist.
    """
    if model is None:
        # Legacy behaviour: look the model up as a global. Nothing in the
        # visible cells assigns `regr`, so pass `model` explicitly instead.
        model = regr

    # predict the values
    pred = model.predict(x_test)

    # Regression Score
    print("Regression Score (R-square): " + str(model.score(x_test, y_test)))

    # RMSE
    print("RMSE: " + str(mean_squared_error(y_test, pred)**0.5))

In [None]:
def fit_xgboost(x_train, y_train):
    """Fit an ``XGBRegressor`` (squared-error objective, 10 trees, seed 42).

    Returns the fitted regressor.
    """
    # Keep the hyper-parameters together so they are easy to spot.
    settings = dict(objective='reg:squarederror', n_estimators=10, seed=42)
    regressor = xg.XGBRegressor(**settings)

    regressor.fit(x_train, y_train)

    return regressor

In [None]:
def eval_xgboost(x_test, y_test, model=None):
    """Print the RMSE of a fitted XGBoost regressor on held-out data.

    Parameters
    ----------
    x_test, y_test
        Held-out features and targets.
    model : fitted estimator, optional
        Object exposing ``predict`` (e.g. the return value of
        ``fit_xgboost``). When omitted, the function falls back to a
        notebook-level variable named ``xgb_r``, as it did originally; a
        ``NameError`` is raised if that variable does not exist.
    """
    if model is None:
        # Legacy behaviour: `xgb_r` is only a local inside fit_xgboost in the
        # visible cells, so callers should pass `model` explicitly.
        model = xgb_r

    # Predict the model
    pred = model.predict(x_test)

    # RMSE Computation
    rmse = np.sqrt(mean_squared_error(y_test, pred))
    print("RMSE : % f" %(rmse))

In [90]:
def return_stats_input(data, home_team, away_team, model_inputs, games=5):
    '''Aggregate team stats over previous meetings of a home/away pairing.

    Rows of ``data`` where ``home_team`` hosted ``away_team`` are summed per
    (season, round, team) and then averaged per team.

    Parameters
    ----------
    data : pandas.DataFrame
        Player-level stats with ``home.team.name``, ``away.team.name``,
        ``season``, ``round.roundNumber`` and ``team.name`` columns.
    home_team, away_team : str
        Team names identifying the fixture.
    model_inputs : list
        Columns to return. The list is NOT modified; ``team.name`` is added to
        an internal copy when it is missing.
    games : int, default 5
        Currently unused; kept for interface compatibility.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(home_stats, away_stats)``, each indexed by ``team.name``.
    '''
    # Work on a copy: appending to the caller's list would silently change
    # the shared feature list used elsewhere in the notebook.
    columns = list(model_inputs)
    if 'team.name' not in columns:
        columns.append('team.name')

    group_by = ['season', 'round.roundNumber', 'team.name']
    is_match_up = (data['home.team.name'] == home_team) & (data['away.team.name'] == away_team)

    # Only aggregate the columns that are needed, so unrelated text columns
    # are not summed along the way.
    needed = list(dict.fromkeys(group_by + columns))
    team_stats = data.loc[is_match_up, needed].groupby(group_by).sum().reset_index()

    def _team_mean(team):
        # Average the per-round totals for one side of the fixture.
        rows = team_stats.loc[team_stats['team.name'] == team, columns]
        return rows.groupby('team.name').mean()

    return (_team_mean(home_team), _team_mean(away_team))

In [91]:
# Fixtures to predict, as (home team, away team) name pairs.
round_matchups = [
    ('Sydney Swans','Melbourne'),
    ('Brisbane Lions','Carlton'),
    ('Gold Coast Suns','Richmond'),
    ('GWS Giants','Collingwood')]

In [None]:
def generate_tear_off_lr(model, round_matchups, player_stats,model_inputs):
    """Print home/away predictions for each fixture under a fixed header.

    Parameters
    ----------
    model : fitted estimator
        Object exposing ``predict``.
    round_matchups : iterable of (home, away) tuples
        Fixtures to predict.
    player_stats : pandas.DataFrame
        Forwarded to ``return_stats_input``.
    model_inputs : list
        Feature columns forwarded to ``return_stats_input``.
    """
    print('-------------------LINEAR REGRESSION TEAR-OFF-------------------')
    for tup in round_matchups:
        print("Home: " + tup[0] + '\t' + "Away: " + tup[1])
        # Aggregate once per fixture; previously the same aggregation was
        # recomputed separately for the home and the away prediction.
        home_stats, away_stats = return_stats_input(player_stats, tup[0], tup[1], model_inputs)
        print(f"{model.predict(home_stats)}" + '\t\t\t' + f"{model.predict(away_stats)}")

In [95]:
def generate_tear_off(model, round_matchups, player_stats,model_inputs):
    """Print rounded home/away score predictions for each fixture.

    Parameters
    ----------
    model : fitted TPOT optimizer
        Must expose ``fitted_pipeline_`` and ``predict``.
    round_matchups : iterable of (home, away) tuples
        Fixtures to predict.
    player_stats : pandas.DataFrame
        Forwarded to ``return_stats_input``.
    model_inputs : list
        Feature columns forwarded to ``return_stats_input``.
    """
    # NOTE(review): index 0 is the FIRST pipeline step, so the header names
    # that step (the recorded run printed "ZeroCount" for a pipeline ending in
    # RidgeCV). Confirm whether the final estimator was intended.
    print('-------------------' + str(model.fitted_pipeline_[0]).replace('()','') + ' TEAR-OFF-------------------')
    for tup in round_matchups:
        print("Home: " + tup[0] + '\t' + "Away: " + tup[1])
        # Aggregate once per fixture; previously the same aggregation was
        # recomputed separately for the home and the away prediction.
        home_stats, away_stats = return_stats_input(player_stats, tup[0], tup[1], model_inputs)
        home_score = round(model.predict(home_stats)[0])
        away_score = round(model.predict(away_stats)[0])
        print(f"{home_score}" + '\t\t\t' + f"{away_score}")

In [96]:
# Print the predicted home and away scores for every fixture in `round_matchups`.
generate_tear_off(model, round_matchups, player_stats, features)

-------------------ZeroCount TEAR-OFF-------------------
Home: Sydney Swans	Away: Melbourne
68			72
Home: Brisbane Lions	Away: Carlton
94			67
Home: Gold Coast Suns	Away: Richmond
69			99
Home: GWS Giants	Away: Collingwood
87			72


In [97]:
# Round 1 fixtures as (home team, away team) name pairs.
round1 = [('Carlton', 'Richmond'),
('Collingwood','Sydney Swans'),
('Essendon','Hawthorn'),
('GWS Giants','North Melbourne'),
('Geelong Cats','St Kilda'),
('Gold Coast Suns','Adelaide Crows'),
('Melbourne','Western Bulldogs'),
('Port Adelaide','West Coast Eagles'),
('Fremantle','Brisbane Lions')]

## Version 2 Model

In [103]:
# Load the version-2 training data.
# NOTE(review): this rebinds `data`, which an earlier cell loaded from
# merged_stat_score_data_clean.csv; cells run after this see the v2 table.
data = pd.read_csv('model_training_data_v2.csv')

In [104]:
# Display the candidate target columns imported from featuresv2.
targets

['homeTeamScore.matchScore.totalScore',
 'awayTeamScore.matchScore.totalScore',
 'score_diff']

In [105]:
def tpot_pipeline_v2(data, features, target, size=20, scoring='r2', generations=5, random_state=None):
    """Run a TPOT regression search and print its score on the held-out split.

    Unlike ``tpot_pipeline``, the target returned by the split is passed to
    TPOT unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing the feature and target columns.
    features : list
        Column labels used as model inputs.
    target : label
        Column to predict.
    size : int, default 20
        TPOT ``population_size``.
    scoring : str, default 'r2'
        TPOT scoring function.
    generations : int, default 5
        Number of TPOT generations (previously hard-coded to 5).
    random_state : int or None, default None
        Seed for the TPOT search; ``None`` keeps the previous unseeded
        behaviour.

    Returns
    -------
    TPOTRegressor
        The fitted optimizer.
    """
    data_ = generate_test_train_split(data, features, target)

    pipeline_optimizer = TPOTRegressor(
        generations=generations,
        population_size=size,
        verbosity=2,
        scoring=scoring,
        random_state=random_state,
    )

    pipeline_optimizer.fit(data_['x_train'], data_['y_train'])

    print(pipeline_optimizer.score(data_['x_test'], data_['y_test']))

    return pipeline_optimizer

In [106]:
# Fit the v2 pipeline on targets[2], which the displayed `targets` list shows
# is 'score_diff'. NOTE(review): this rebinds `model` from the earlier cell.
model = tpot_pipeline_v2(data, features, targets[2])

Optimization Progress:   0%|          | 0/120 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.9731446552014216

Generation 2 - Current best internal CV score: 0.9732267451139534

Generation 3 - Current best internal CV score: 0.9732267451139534

Generation 4 - Current best internal CV score: 0.9732267451139534

Generation 5 - Current best internal CV score: 0.9732869385758363

Best pipeline: LinearSVR(RidgeCV(RobustScaler(input_matrix)), C=0.0001, dual=False, epsilon=0.01, loss=squared_epsilon_insensitive, tol=1e-05)
0.9712729972100358


In [107]:
def return_stats_inputv2(data, home_team, away_team, model_inputs, games=5):
    '''Average the model input columns over previous meetings of a fixture.

    Rows where ``home_team`` hosted ``away_team`` are selected and the
    ``model_inputs`` columns are averaged per home team. ``games`` is
    accepted but not used.
    '''
    home_col = 'match.homeTeam.name'

    is_fixture = (data[home_col] == home_team) & (data['match.awayTeam.name'] == away_team)
    selected = data.loc[is_fixture, model_inputs + [home_col]]
    averaged = selected.groupby(home_col).mean()

    return averaged[model_inputs]

In [108]:
def generate_tear_offv2(model, round_matchups, team_stats, model_inputs):
    """Print the predicted winner and margin for each fixture.

    The model's first prediction for the fixture is read as a signed margin:
    a negative value names the away side as winner, anything else the home
    side. The margin is printed rounded and unsigned.
    """
    header_step = str(model.fitted_pipeline_[0]).replace('()','')
    print('-------------------' + header_step + ' TEAR-OFF-------------------')

    for tup in round_matchups:
        home_name, away_name = tup[0], tup[1]
        fixture_inputs = return_stats_inputv2(team_stats, home_name, away_name, model_inputs)
        margin = model.predict(fixture_inputs)[0]

        winner = away_name if margin < 0 else home_name

        print("------------------------------------------------")
        print("Home: " + home_name)
        print("Away: " + away_name)
        print("Winner: " + winner + "\t       " + "Margin: " + str(abs(round(margin))))

In [60]:
# Ad-hoc check of a single prediction.
# NOTE(review): `stats` is not assigned in any visible cell (it only exists as
# a local inside return_stats_inputv2), so this cell depends on hidden kernel
# state and will not survive Restart & Run All.
model.predict(stats[features])[0]

-3.4166093584537105

In [110]:
# Print the predicted winner and margin for each fixture in `round1`.
generate_tear_offv2(model, round1, data, features)

-------------------RobustScaler TEAR-OFF-------------------
------------------------------------------------
Home: Carlton
Away: Richmond
Winner: Richmond	       Margin: 3
------------------------------------------------
Home: Collingwood
Away: Sydney Swans
Winner: Collingwood	       Margin: 15
------------------------------------------------
Home: Essendon
Away: Hawthorn
Winner: Essendon	       Margin: 17
------------------------------------------------
Home: GWS Giants
Away: North Melbourne
Winner: North Melbourne	       Margin: 27
------------------------------------------------
Home: Geelong Cats
Away: St Kilda
Winner: Geelong Cats	       Margin: 27
------------------------------------------------
Home: Gold Coast Suns
Away: Adelaide Crows
Winner: Gold Coast Suns	       Margin: 3
------------------------------------------------
Home: Melbourne
Away: Western Bulldogs
Winner: Melbourne	       Margin: 29
------------------------------------------------
Home: Port Adelaide
Away: West C

In [99]:
# NOTE(review): the recorded run of this cell raised
# KeyError: 'match.homeTeam.name'. Its execution count (99) is lower than that
# of the cell that loads model_training_data_v2.csv (103), so presumably it ran
# while `data` was still the earlier table — confirm, then re-run or remove.
generate_tear_offv2(model, round_matchups, data, features)

-------------------ZeroCount TEAR-OFF-------------------


KeyError: 'match.homeTeam.name'