In [1]:
#Final version of model 
#Models are trained on 100% of the training data, having previously been tested using cross-validation and then a hold-out test sample
#Final model is saved to disk for use in scoring notebook


In [2]:
#Importing basic packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from numpy import mean
from numpy import std
from matplotlib import pyplot
from numpy import set_printoptions
from pickle import dump
from pickle import load

In [3]:
# Load the training data.
# Data from the old 2019 model training; includes data from 2014 to 2019.
# NOTE(review): this is an absolute, machine-specific path, so the cell only
# runs where that exact file exists.
training_csv = 'C:\\Users\\D648007\\AFLdata2.csv'
afl = pd.read_csv(training_csv)
# `dataset` is a second name bound to the same DataFrame object as `afl` (not a copy).
dataset = afl

In [4]:
# Report every column that contains at least one NaN (Not a Number, i.e. a missing value).
nan_values = dataset.isna()            # boolean frame: True where a cell is missing
nan_columns = nan_values.any(axis=0)   # one True/False flag per column
columns_with_nan = [name for name, has_nan in nan_columns.items() if has_nan]
print(columns_with_nan)

# The columns printed above contain NaN values that need to be removed.

['hm_PWk_lad', 'aw_PWk_lad', 'PWk_lad_gap', 'hm_PWk_perc', 'aw_PWk_perc', 'PWk_perc_gap', 'hm_PWk_pts', 'aw_PWk_pts', 'PWk_pts_gap', 'hm_PWk_pts_stan', 'aw_PWk_pts_stan', 'PWk_pts_gap_stan', 'aw_PY', 'aw_PY_perc', 'aw_2PY', 'aw_2PY_perc', 'aw_3PY', 'aw_3PY_perc', 'aw_4PY', 'aw_4PY_perc', 'aw_PY_2PY', 'aw_PY_3PY', 'aw_PY_4PY', 'aw_Pypc_2Pypc', 'aw_Pypc_3Pypc', 'aw_Pypc_4Pypc', 'PY_gap', 'PY_%_gap', '2PY_gap', '2PY_%_gap', '3PY_gap', '3PY_%_gap', '4PY_gap', '4PY_%_gap', 'aw_Dr_PY', 'aw_Dr_2PY', 'aw_Dr_3py', 'aw_Dr_4py', 'aw_Dr_Tot', 'Dr_PY_gap', 'Dr_2PY_gap', 'Dr_3PY_gap', 'Dr_4PY_gap', 'Dr_Tot_gap', 'aw_Pwk_win', 'aw_2Pwk_win', 'aw_3Pwk_win', 'aw_4Pwk_win', 'aw_win_1', 'aw_win_2', 'aw_win_3', 'aw_win_4', 'h_win_1_gap', 'h_win_2_gap', 'h_win_3_gap', 'h_win_4_gap', 'a_win_1_gap', 'a_win_2_gap', 'a_win_3_gap', 'a_win_4_gap', 'Interstate']


In [5]:
# Collect the index labels of every row whose hm_score column equals 0
zero_score_rows = dataset['hm_score'].eq(0)
indexNames = dataset.loc[zero_score_rows].index

In [6]:
# Remove the collected rows from the DataFrame in place. No new frame is created,
# so the removal is also visible through `afl`, which names the same object.
dataset.drop(index=indexNames, inplace=True)

In [7]:
# Re-check which columns still contain NaN (missing values) after the row removal.
nan_values = dataset.isna()
nan_columns = nan_values.any(axis=0)
# Mask the per-column flags with themselves to keep only the flagged column names.
columns_with_nan = nan_columns[nan_columns].index.tolist()
print(columns_with_nan)

# Any column printed here still has NaN values that need removing; an empty list means none remain.

[]


In [8]:
# Build the training dataset from the features identified through feature importance
# testing, with the outcome column "home_win" kept as the final column.
model_columns = [
    "round",
    "hm_PWk_lad", "aw_PWk_lad", "PWk_lad_gap",
    "hm_PWk_perc", "aw_PWk_perc", "PWk_perc_gap",
    "hm_PWk_pts", "aw_PWk_pts", "PWk_pts_gap",
    "hm_PWk_pts_stan", "aw_PWk_pts_stan", "PWk_pts_gap_stan",
    "hm_PY", "hm_PY_perc",
    "aw_PY", "aw_PY_perc",
    "PY_gap", "PY_%_gap", "2PY_%_gap", "3PY_%_gap",
    "home_win",
]
dataset = dataset[model_columns]

In [9]:
# Show the zero-based position of the outcome column among the dataset's columns
list(dataset.columns).index("home_win")

21

In [10]:
# Before splitting into an array for model fitting, save the feature names for later use.
# The label 'home_win' is excluded by name rather than by a hard-coded position, so the
# list stays correct if the selected columns are ever reordered or extended.
feature_names = [column for column in dataset.columns if column != "home_win"]

In [11]:
# Split into X and Y for training.
# X holds the independent variables (the features); Y holds the dependent variable
# (the 'home_win' label). Start by converting the DataFrame to an array.
array = dataset.values

# Look the label column up by name instead of hard-coding its position, so the split
# stays correct if the column selection changes.
label_idx = dataset.columns.get_loc("home_win")

X = np.delete(array, label_idx, axis=1)   # every column except the label, original order kept
Y = array[:, label_idx]


In [12]:
# Recode the labels as plain ints: 1 where the value equals 1, otherwise 0
Y = [int(label == 1) for label in Y]

In [13]:
# Scale the training data: fit the scaler on X, then apply it to the same X
scaler = StandardScaler()
scaler.fit(X)
rescaledX = scaler.transform(X)

# Summarise the transformed data: first five rows, printed to 3 decimal places
np.set_printoptions(precision=3)
print(rescaledX[:5])

[[-1.627  1.634 -1.61  -2.322 -1.515  1.052 -1.845 -0.87   2.894  4.059
  -1.536  1.143  1.93   1.659 -1.666 -1.637  1.282  2.36  -2.106 -1.071
  -0.009]
 [-1.627 -1.285 -0.256  0.733  0.604  0.9   -0.229  2.217  1.71  -0.541
   0.721  0.294 -0.299 -1.25   0.679 -0.282  1.11  -0.692 -0.308 -0.491
  -1.235]
 [-1.627 -0.896  0.131  0.733  0.985  0.091  0.633  1.979  1.473 -0.541
   0.548  0.125 -0.299 -0.862  1.102  0.105  0.189 -0.692  0.651 -0.654
  -0.218]
 [-1.627  0.466 -1.029 -1.072  0.038  0.512 -0.347  1.505  2.183  0.737
   0.2    0.634  0.32   0.495  0.054 -1.057  0.669  1.112 -0.44   1.21
   0.5  ]
 [-1.627  0.661 -0.643 -0.933 -0.891  0.155 -0.746  0.555  1.947  1.503
  -0.494  0.464  0.691  0.689 -0.975 -0.669  0.262  0.973 -0.884 -0.793
  -0.918]]


In [14]:
# Confirm the dimensions of the scaled matrix as (rows, columns)
print(np.shape(rescaledX))

(1762, 21)


In [15]:
#Fit Random Forest
# Random Forest creates a lot of trees and then selects the most frequent class as the winning prediction.

# Tree construction is randomised; fixing the seed makes re-running this cell
# reproduce the same fitted model, so the file saved to disk can be regenerated.
RANDOM_SEED = 42

rf = RandomForestClassifier(
    n_estimators=1000,
    max_features='log2',
    max_depth=5,
    random_state=RANDOM_SEED,
)
rf.fit(rescaledX, Y)

RandomForestClassifier(max_depth=5, max_features='log2', n_estimators=1000)

In [16]:
#Save the model to disk
# NOTE(review): only `rf` is written here. The model was fitted on data transformed by
# `scaler`; if the scoring notebook must apply the same scaling, the fitted scaler
# needs to be persisted as well — confirm against the scoring notebook.

filename = 'afl2022modelRF.sav'
# The context manager guarantees the file handle is flushed and closed after the dump,
# instead of leaving an open handle for the garbage collector to clean up.
with open(filename, 'wb') as model_file:
    dump(rf, model_file)