In [48]:
# Brownlow Votes Prediction
# predict the votes that a player will recieve in a Season
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
import warnings

In [86]:
# read csv file
data = pd.read_csv('RawData.csv')
data.head()

Unnamed: 0,Date,Name,Team,Season,Round,Home Team,Away Team,Home Score,Away Score,Margin,...,Frees For,Frees Against,Contested Pos,Uncontested Pos,Contested Marks,Marks Inside 50,One Percenters,Goal Assists,Brownlow Votes,TOG
0,2019-09-28 00:00:00,Zac Williams,Greater Western Sydney,2019.0,GF,Richmond,Greater Western Sydney,114,25,89,...,0.0,0.0,7.0,13.0,0.0,1.0,0.0,0.0,0.0,68
1,2019-09-28 00:00:00,Lachie Whitfield,Greater Western Sydney,2019.0,GF,Richmond,Greater Western Sydney,114,25,89,...,0.0,1.0,4.0,9.0,0.0,0.0,1.0,0.0,0.0,91
2,2019-09-28 00:00:00,Adam Tomlinson,Greater Western Sydney,2019.0,GF,Richmond,Greater Western Sydney,114,25,89,...,0.0,1.0,6.0,11.0,0.0,0.0,1.0,0.0,0.0,83
3,2019-09-28 00:00:00,Sam Taylor,Greater Western Sydney,2019.0,GF,Richmond,Greater Western Sydney,114,25,89,...,1.0,0.0,6.0,8.0,1.0,0.0,9.0,0.0,0.0,91
4,2019-09-28 00:00:00,Tim Taranto,Greater Western Sydney,2019.0,GF,Richmond,Greater Western Sydney,114,25,89,...,3.0,1.0,15.0,18.0,0.0,0.0,1.0,0.0,0.0,80


In [87]:
data.columns

Index(['Date', 'Name', 'Team', 'Season', 'Round', 'Home Team', 'Away Team',
       'Home Score', 'Away Score', 'Margin', 'Disposals', 'Kicks', 'Marks',
       'Handballs', 'Goals', 'Behinds', 'Hitouts', 'Tackles', 'Rebounds',
       'Inside 50s', 'Clearances', 'Clangers', 'Frees For', 'Frees Against',
       'Contested Pos', 'Uncontested Pos', 'Contested Marks',
       'Marks Inside 50', 'One Percenters', 'Goal Assists', 'Brownlow Votes',
       'TOG'],
      dtype='object')

In [88]:
# split the dataset into training set and test set

training_data = data[data['Season']<2016]
test_data = data[data['Season']>=2016]
# training_data_y = training_data['Brownlow Votes']
# training_data_x = training_data.drop(columns=['Brownlow Votes'])
# test_data_y = test_data['Brownlow Votes']
# test_data_x = test_data.drop(columns=['Brownlow Votes'])

In [89]:
# find two attributes which mostly contribute to the votes
training_data.groupby('Marks Inside 50')['Brownlow Votes'].mean()

Marks Inside 50
0.0     0.092398
1.0     0.153933
2.0     0.201241
3.0     0.222519
4.0     0.294043
5.0     0.569880
6.0     0.813483
7.0     1.199005
8.0     1.505618
9.0     1.673469
10.0    1.954545
11.0    1.909091
12.0    2.500000
13.0    3.000000
Name: Brownlow Votes, dtype: float64

In [90]:
training_data.groupby('Goals')['Brownlow Votes'].mean()

Goals
0.0     0.069862
1.0     0.145658
2.0     0.228319
3.0     0.333848
4.0     0.597926
5.0     0.994126
6.0     1.542751
7.0     2.216495
8.0     2.551724
9.0     2.500000
10.0    2.750000
11.0    3.000000
12.0    3.000000
13.0    3.000000
Name: Brownlow Votes, dtype: float64

In [91]:
# number of data for different season
# each match has 44 players, the number for different season should be divided by 44, except 2003
# maybe there is a missing data in it. for convinence and efficiency, I will dismiss the data in 2003
for i in range(2003, 2020):
    print(data[data['Season']==i].shape)
training_data = training_data[training_data['Season']>2003]

# training_data_y = training_data['Brownlow Votes']
# training_data_x = training_data.drop(columns=['Brownlow Votes'])

(8139, 32)
(8140, 32)
(8140, 32)
(8140, 32)
(8140, 32)
(8140, 32)
(8140, 32)
(8184, 32)
(8624, 32)
(9108, 32)
(9108, 32)
(9108, 32)
(9064, 32)
(9108, 32)
(9108, 32)
(9108, 32)
(9108, 32)


In [92]:
# normalize the data 
def normalization(myData):
    max_min = myData.max(axis = 0) - myData.min(axis = 0) 
    max_min = (max_min==0) + max_min # avoid 0
    return (myData - myData.min(axis = 0)) \
           / max_min

In [93]:
def preprocess(data, start, end):
    # drop unnecessary columns
    drop_columns = ['Date', 'Team', 'Round', 'Home Team', 'Away Team',
       'Home Score', 'Away Score', 'Margin', 'Disposals', 'Kicks', 'Marks',
       'Handballs','Behinds', 'Hitouts', 'Tackles', 'Rebounds',
       'Inside 50s', 'Clearances', 'Clangers', 'Frees For', 'Frees Against',
       'Contested Pos', 'Uncontested Pos', 'Contested Marks','One Percenters', 'Goal Assists',
       'TOG']
    data = data.drop(columns=drop_columns)
    preprocess = np.array([0]*4)
    for i in range(start,end+1):
        match = int(data[data['Season']==i].shape[0]/44)
        # iterate each match and do the normalization
        for j in range(match):
            data_for_one_match = data.iloc[44*j:44*(j+1)]
            one_match = normalization(np.array(data_for_one_match)[:,2:4])
            one_match = np.concatenate((one_match,np.array(data_for_one_match)[:,-1].reshape(-1,1)), axis=1)
            one_match = np.concatenate((np.array(data_for_one_match)[:,0].reshape(-1,1),one_match), axis=1)
            preprocess = np.vstack((preprocess, one_match))
    return pd.DataFrame(preprocess[1:,:], columns=['Name','Goals','Marks Inside 50','Votes'])
    

In [94]:
train_preprocess = preprocess(training_data, 2004, 2015)

In [95]:
test_preprocess = preprocess(test_data, 2016, 2019)

In [96]:
train_preprocess

Unnamed: 0,Name,Goals,Marks Inside 50,Votes
0,Zac Williams,0,0,0
1,Lachie Whitfield,0,0,0
2,Callan Ward,0.5,0.4,1
3,Adam Treloar,0,0,0
4,Adam Tomlinson,0,0,0
...,...,...,...,...
102031,Raphael Clarke,0,0,0
102032,Jason Blake,0.2,0,0
102033,Steven Baker,0,0,0
102034,David Armitage,0.2,0,3


In [123]:
test_preprocess

Unnamed: 0,Name,Goals,Marks Inside 50,Votes,vote_1,vote_2,vote_3
0,Zac Williams,0,0.25,0,0,0,0
1,Lachie Whitfield,0,0,0,0,0,0
2,Adam Tomlinson,0,0,0,0,0,0
3,Sam Taylor,0,0,0,0,0,0
4,Tim Taranto,0,0,0,0,0,0
...,...,...,...,...,...,...,...
36427,Paddy Dow,0,0.333333,0,0,0,0
36428,Ed Curnow,0,0.666667,0,0,0,0
36429,Charlie Curnow,0.333333,0,0,0,0,0
36430,David Cuningham,0,0,0,0,0,0


In [97]:
# train_X = train_preprocess[:,:2]
# train_y = train_preprocess[:,-1]
# test_X = train_preprocess[:,:2]
# test_y = train_preprocess[:,-1]

In [98]:
# preprocess y, divide it in to 3 columns
def y_preprocess(data):
    data['vote_1'] = (data['Votes'].values==1)+0
    data['vote_2'] = (data['Votes'].values==2)+0
    data['vote_3'] = (data['Votes'].values==3)+0
    
    return data
    

In [99]:
train=y_preprocess(train_preprocess)
test=y_preprocess(test_preprocess)

In [129]:
test

Unnamed: 0,Name,Goals,Marks Inside 50,Votes,vote_1,vote_2,vote_3
0,Zac Williams,0,0.25,0,0,0,0
1,Lachie Whitfield,0,0,0,0,0,0
2,Adam Tomlinson,0,0,0,0,0,0
3,Sam Taylor,0,0,0,0,0,0
4,Tim Taranto,0,0,0,0,0,0
...,...,...,...,...,...,...,...
36427,Paddy Dow,0,0.333333,0,0,0,0
36428,Ed Curnow,0,0.666667,0,0,0,0
36429,Charlie Curnow,0.333333,0,0,0,0,0
36430,David Cuningham,0,0,0,0,0,0


In [144]:
# select the rows whose 'Goals' and 'Marks Inside 50' values are not 0
train_new=pd.concat([train[train['Goals'] != 0],train[train['Marks Inside 50'] != 0]])
test_new=pd.concat([test[test['Goals'] != 0],test[test['Marks Inside 50'] != 0]])

In [146]:
# drop duplicate
train_new = train_new[~train_new.index.duplicated(keep='first')]
test_new = test_new[~test_new.index.duplicated(keep='first')]

In [147]:
test_new

Unnamed: 0,Name,Goals,Marks Inside 50,Votes,vote_1,vote_2,vote_3
12,Jacob Hopper,0.2,0,0,0,0,0
13,Harrison Himmelberg,0.2,0.5,0,0,0,0
21,Jeremy Cameron,0.2,0,0,0,0,0
23,Ivan Soldo,0.2,0.25,0,0,0,0
25,Daniel Rioli,0.2,0,0,0,0,0
...,...,...,...,...,...,...,...
36403,Jack Graham,0,0.333333,0,0,0,0
36405,Trent Cotchin,0,0.333333,0,0,0,0
36414,Will Setterfield,0,0.333333,0,0,0,0
36427,Paddy Dow,0,0.333333,0,0,0,0


In [132]:
# LogisticRegression
def lr_(x_train, y_train):
    def decorator(x_train, y_train):
        model = LogisticRegression(solver='liblinear',\
                        penalty='l1',\
                        class_weight='balanced')
        model.fit(x_train, y_train)
        return model


    model = decorator(x_train,y_train)
#     y_pre = model.predict(x_train)
    score = cross_val_score(model,x_train,y_train,cv=5,scoring='roc_auc').mean()
    accuracy = cross_val_score(model,x_train,y_train,cv=5,scoring='accuracy').mean()
    precision = cross_val_score(model,x_train,y_train,cv=5,scoring='precision').mean()
    recall = cross_val_score(model,x_train,y_train,cv=5,scoring='recall').mean()
    f1_score = cross_val_score(model,x_train,y_train,cv=5,scoring='f1').mean()

    report = {'score':score,'accuracy': accuracy,'precision':precision,'recall':recall,'f1_score':f1_score}

    warnings.filterwarnings('ignore') 
    
    return report 


In [133]:
# extract players that are able to get votes (no matter 1,2,3) in training data using cross validation
# training_x is 'Goals' and 'Marks inside 50'
# training_y is 'Votes' (value of 'Votes' > 0)
lr_(np.array(train_new)[:,1:3],(np.array(train_new)[:,3]>0)+0)

{'score': 0.6769406847143423,
 'accuracy': 0.6877126381700019,
 'precision': 0.3322596937890399,
 'recall': 0.5360338701643699,
 'f1_score': 0.28604573652480614}

In [134]:
# players get 1 score in train data using cross validation
# training_x is 'Goals' and 'Marks inside 50'
# training_y is 'vote_1' (value of 'vote_1' > 0)
lr_(np.array(train_new)[:,1:3],(np.array(train_new)[:,4]>0)+0)

{'score': 0.6177824282801596,
 'accuracy': 0.670147075334007,
 'precision': 0.11528194794496642,
 'recall': 0.474914089347079,
 'f1_score': 0.1105306259760376}

In [135]:
# players get 2 score in train data using cross validation
# training_x is 'Goals' and 'Marks inside 50'
# training_y is 'vote_2' (value of 'vote_2' > 0)
lr_(np.array(train_new)[:,1:3],(np.array(train_new)[:,5]>0)+0)

{'score': 0.6384422419011939,
 'accuracy': 0.6297385764099068,
 'precision': 0.16145914413390958,
 'recall': 0.5262004115147203,
 'f1_score': 0.13633373968588045}

In [136]:
# players get 3 score in train data using cross validation
# training_x is 'Goals' and 'Marks inside 50'
# training_y is 'vote_3' (value of 'vote_3' > 0)
lr_(np.array(train_new)[:,1:3],(np.array(train_new)[:,6]>0)+0)

{'score': 0.7348390804995689,
 'accuracy': 0.7145166722802291,
 'precision': 0.21380934069074317,
 'recall': 0.6178041543026707,
 'f1_score': 0.19465835274369922}

In [148]:
# test data
# collect the probability
def prob(train_x, train_y, test_x):
    model = LogisticRegression(solver='liblinear',\
                        penalty='l1',\
                        class_weight='balanced')
    model.fit(train_x, train_y)
    return model.predict_proba(test_x)[:,1]

In [149]:
train_x = np.array(train_new)[:,1:3]
test_x = np.array(test_new)[:,1:3]

In [150]:
# predict the probability that a player can get votes (no matter the number)
test_new['Votes_prob'] = prob(train_x,(np.array(train_new)[:,3]>0)+0,test_x)

In [151]:
# predict the probability that a player can get 1, 2, 3 votes, respectively
test_new['vote1_prob'] = prob(train_x,(np.array(train_new)[:,4]>0)+0,test_x)
test_new['vote2_prob'] = prob(train_x,(np.array(train_new)[:,5]>0)+0,test_x)
test_new['vote3_prob'] = prob(train_x,(np.array(train_new)[:,6]>0)+0,test_x)

In [152]:
# collect the max votes a play can get according to the prob of getting votes
max_votes = []
for i,row in test_new.iterrows():
    # prob of getting votes is small than prob of not getting votes
    if row['Votes_prob'] < 0.5:
        max_votes.append(0)
        continue
        
    # if getting votes, select the biggest prob from vote1,vote2,vote3
    m = max(row['vote1_prob'],row['vote2_prob'],row['vote3_prob'])
    if m == row['vote1_prob']:
        max_votes.append(1)
    elif m == row['vote2_prob']:
        max_votes.append(2)
    else:
        max_votes.append(3)
    

In [153]:
test_new['predict_votes'] = max_votes

In [154]:
test_new

Unnamed: 0,Name,Goals,Marks Inside 50,Votes,vote_1,vote_2,vote_3,Votes_prob,vote1_prob,vote2_prob,vote3_prob,predict_votes
12,Jacob Hopper,0.2,0,0,0,0,0,0.402979,0.418876,0.437763,0.364018,0
13,Harrison Himmelberg,0.2,0.5,0,0,0,0,0.375404,0.445719,0.390281,0.314303,0
21,Jeremy Cameron,0.2,0,0,0,0,0,0.402979,0.418876,0.437763,0.364018,0
23,Ivan Soldo,0.2,0.25,0,0,0,0,0.389102,0.432248,0.413822,0.338716,0
25,Daniel Rioli,0.2,0,0,0,0,0,0.402979,0.418876,0.437763,0.364018,0
...,...,...,...,...,...,...,...,...,...,...,...,...
36403,Jack Graham,0,0.333333,0,0,0,0,0.289100,0.375425,0.325352,0.223341,0
36405,Trent Cotchin,0,0.333333,0,0,0,0,0.289100,0.375425,0.325352,0.223341,0
36414,Will Setterfield,0,0.333333,0,0,0,0,0.289100,0.375425,0.325352,0.223341,0
36427,Paddy Dow,0,0.333333,0,0,0,0,0.289100,0.375425,0.325352,0.223341,0


In [155]:
# concat the previous test values whose 'Goal' and 'Marks inside 50' are 0
test_0 = test[(test['Goals'] == 0) & (test['Marks Inside 50'] == 0)]
test_0['predict_votes'] = [0] * test_0.shape[0]

In [156]:
test_0

Unnamed: 0,Name,Goals,Marks Inside 50,Votes,vote_1,vote_2,vote_3,predict_votes
1,Lachie Whitfield,0,0,0,0,0,0,0
2,Adam Tomlinson,0,0,0,0,0,0,0
3,Sam Taylor,0,0,0,0,0,0,0
4,Tim Taranto,0,0,0,0,0,0,0
5,Heath Shaw,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
36422,Liam Jones,0,0,0,0,0,0,0
36423,Michael Gibbons,0,0,0,0,0,0,0
36424,Jarrod Garlett,0,0,0,0,0,0,0
36430,David Cuningham,0,0,0,0,0,0,0


In [157]:
test_new

Unnamed: 0,Name,Goals,Marks Inside 50,Votes,vote_1,vote_2,vote_3,Votes_prob,vote1_prob,vote2_prob,vote3_prob,predict_votes
12,Jacob Hopper,0.2,0,0,0,0,0,0.402979,0.418876,0.437763,0.364018,0
13,Harrison Himmelberg,0.2,0.5,0,0,0,0,0.375404,0.445719,0.390281,0.314303,0
21,Jeremy Cameron,0.2,0,0,0,0,0,0.402979,0.418876,0.437763,0.364018,0
23,Ivan Soldo,0.2,0.25,0,0,0,0,0.389102,0.432248,0.413822,0.338716,0
25,Daniel Rioli,0.2,0,0,0,0,0,0.402979,0.418876,0.437763,0.364018,0
...,...,...,...,...,...,...,...,...,...,...,...,...
36403,Jack Graham,0,0.333333,0,0,0,0,0.289100,0.375425,0.325352,0.223341,0
36405,Trent Cotchin,0,0.333333,0,0,0,0,0.289100,0.375425,0.325352,0.223341,0
36414,Will Setterfield,0,0.333333,0,0,0,0,0.289100,0.375425,0.325352,0.223341,0
36427,Paddy Dow,0,0.333333,0,0,0,0,0.289100,0.375425,0.325352,0.223341,0


In [158]:
# drop unnecessary columns
test_new_drop = test_new.drop(columns=['Votes_prob','vote1_prob','vote2_prob','vote3_prob'])

In [159]:
test_final = pd.concat((test_new_drop,test_0))

In [160]:
test_final = test_final[~test_final.index.duplicated(keep='first')]

In [161]:
test_final

Unnamed: 0,Name,Goals,Marks Inside 50,Votes,vote_1,vote_2,vote_3,predict_votes
12,Jacob Hopper,0.2,0,0,0,0,0,0
13,Harrison Himmelberg,0.2,0.5,0,0,0,0,0
21,Jeremy Cameron,0.2,0,0,0,0,0,0
23,Ivan Soldo,0.2,0.25,0,0,0,0,0
25,Daniel Rioli,0.2,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
36422,Liam Jones,0,0,0,0,0,0,0
36423,Michael Gibbons,0,0,0,0,0,0,0
36424,Jarrod Garlett,0,0,0,0,0,0,0
36430,David Cuningham,0,0,0,0,0,0,0


In [162]:
test_final['Votes'] = test_final['Votes'].astype(int)
test_final['predict_votes']=test_final['predict_votes'].astype(int)

In [177]:
# score = roc_auc_score(test_new['Votes'].values, test_new['predict_votes'].values)
accuracy = accuracy_score(test_final['Votes'].values, test_final['predict_votes'].values)
precision = precision_score(test_final['Votes'].values, test_final['predict_votes'].values, average = 'weighted')
recall = recall_score(test_final['Votes'].values, test_final['predict_votes'].values,average='weighted')
f1 = f1_score(test_final['Votes'].values, test_final['predict_votes'].values,average='weighted')
report = {'accuracy': accuracy,'precision':precision,'recall':recall,'f1_score':f1}

In [178]:
report

{'accuracy': 0.8410188844971453,
 'precision': 0.9066392638263775,
 'recall': 0.8410188844971453,
 'f1_score': 0.8712510153999892}