# Neural Networks for Goal Rush

First we need to fetch the data from football-data.

We define a function called load, this takes a url to fetch. This url is downloaded and the content is loaded into a data frame. The function assumes the fetched content is in CSV format.

We then use this function to load the data for each league.

In [0]:
import pandas as pd
import io
import requests

def load(url):
  content=requests.get(url).content
  return pd.read_csv(io.StringIO(content.decode('utf-8')))


prem = load("https://www.football-data.co.uk/mmz4281/1819/E0.csv")
champ = load("https://www.football-data.co.uk/mmz4281/1819/E1.csv")
l1 = load("https://www.football-data.co.uk/mmz4281/1819/E2.csv")
l2 = load("https://www.football-data.co.uk/mmz4281/1819/E3.csv")

We can look at the results of this.

I dont think we need to care much about the league separation, so we can merge these dataframes into one big one.

In [0]:
from datetime import datetime
allGames = pd.concat([prem, champ, l1, l2])
allGames["Date"] = allGames["Date"].map(lambda x: datetime.strptime(x, "%d/%m/%Y"))
allGames["expH"] = allGames["FTHG"] / allGames["HS"]
allGames["expA"] = allGames["FTAG"] / allGames["AS"]
allGames

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR,B365H,B365D,B365A,BWH,BWD,BWA,IWH,IWD,IWA,PSH,PSD,PSA,WHH,WHD,WHA,VCH,VCD,VCA,Bb1X2,BbMxH,BbAvH,BbMxD,BbAvD,BbMxA,BbAvA,BbOU,BbMx>2.5,BbAv>2.5,BbMx<2.5,BbAv<2.5,BbAH,BbAHh,BbMxAHH,BbAvAHH,BbMxAHA,BbAvAHA,PSCH,PSCD,PSCA,expH,expA
0,E0,2018-08-10,Man United,Leicester,2,1,H,1.0,0.0,H,A Marriner,8.0,13.0,6.0,4.0,11.0,8.0,2.0,5.0,2.0,1.0,0.0,0.0,1.57,3.90,7.50,1.53,4.00,7.50,1.55,3.80,7.00,1.58,3.93,7.50,1.57,3.80,6.00,1.57,4.00,7.00,39,1.60,1.56,4.20,3.92,8.05,7.06,38,2.12,2.03,1.85,1.79,17,-0.75,1.75,1.70,2.29,2.21,1.55,4.07,7.69,0.250000,0.076923
1,E0,2018-08-11,Bournemouth,Cardiff,2,0,H,1.0,0.0,H,K Friend,12.0,10.0,4.0,1.0,11.0,9.0,7.0,4.0,1.0,1.0,0.0,0.0,1.90,3.60,4.50,1.90,3.40,4.40,1.90,3.50,4.10,1.89,3.63,4.58,1.91,3.50,4.00,1.87,3.60,4.75,39,1.93,1.88,3.71,3.53,4.75,4.37,38,2.05,1.98,1.92,1.83,20,-0.75,2.20,2.13,1.80,1.75,1.88,3.61,4.70,0.166667,0.000000
2,E0,2018-08-11,Fulham,Crystal Palace,0,2,A,0.0,1.0,A,M Dean,15.0,10.0,6.0,9.0,9.0,11.0,5.0,5.0,1.0,2.0,0.0,0.0,2.50,3.40,3.00,2.45,3.30,2.95,2.40,3.30,2.95,2.50,3.46,3.00,2.45,3.30,2.80,2.50,3.40,3.00,39,2.60,2.47,3.49,3.35,3.05,2.92,38,2.00,1.95,1.96,1.87,22,-0.25,2.18,2.11,1.81,1.77,2.62,3.38,2.90,0.000000,0.200000
3,E0,2018-08-11,Huddersfield,Chelsea,0,3,A,0.0,2.0,A,C Kavanagh,6.0,13.0,1.0,4.0,9.0,8.0,2.0,5.0,2.0,1.0,0.0,0.0,6.50,4.00,1.61,6.25,3.90,1.57,6.20,4.00,1.55,6.41,4.02,1.62,5.80,3.90,1.57,6.50,4.00,1.62,38,6.85,6.09,4.07,3.90,1.66,1.61,37,2.05,1.98,1.90,1.84,23,1.00,1.84,1.80,2.13,2.06,7.24,3.95,1.58,0.000000,0.230769
4,E0,2018-08-11,Newcastle,Tottenham,1,2,A,1.0,2.0,A,M Atkinson,15.0,15.0,2.0,5.0,11.0,12.0,3.0,5.0,2.0,2.0,0.0,0.0,3.90,3.50,2.04,3.80,3.50,2.00,3.70,3.35,2.05,3.83,3.57,2.08,3.80,3.20,2.05,3.90,3.40,2.10,39,4.01,3.83,3.57,3.40,2.12,2.05,38,2.10,2.01,1.88,1.81,20,0.25,2.20,2.12,1.80,1.76,4.74,3.53,1.89,0.066667,0.133333
5,E0,2018-08-11,Watford,Brighton,2,0,H,1.0,0.0,H,J Moss,19.0,6.0,5.0,0.0,10.0,16.0,8.0,2.0,2.0,2.0,0.0,0.0,2.37,3.20,3.40,2.35,3.10,3.30,2.20,3.30,3.40,2.43,3.22,3.33,2.38,3.00,3.30,2.40,3.20,3.40,39,2.48,2.36,3.30,3.14,3.42,3.31,37,2.46,2.35,1.67,1.59,22,-0.25,2.07,2.01,1.90,1.86,2.58,3.08,3.22,0.105263,0.000000
6,E0,2018-08-11,Wolves,Everton,2,2,D,1.0,1.0,D,C Pawson,11.0,6.0,4.0,5.0,8.0,7.0,3.0,6.0,0.0,1.0,0.0,1.0,2.37,3.30,3.30,2.35,3.20,3.20,2.25,3.35,3.20,2.36,3.40,3.28,2.30,3.20,3.20,2.38,3.30,3.30,38,2.41,2.33,3.40,3.27,3.40,3.23,36,2.20,2.09,1.83,1.75,22,-0.25,2.04,1.98,1.92,1.88,2.44,3.23,3.32,0.181818,0.333333
7,E0,2018-08-12,Arsenal,Man City,0,2,A,0.0,1.0,A,M Oliver,9.0,17.0,3.0,8.0,11.0,14.0,2.0,9.0,2.0,2.0,0.0,0.0,4.00,3.80,1.95,3.70,3.75,1.95,3.60,3.60,2.00,4.00,3.97,1.93,3.80,3.80,1.91,3.90,4.00,1.91,39,4.15,3.83,4.00,3.80,2.00,1.92,36,1.60,1.55,2.55,2.42,20,0.75,1.78,1.74,2.21,2.15,4.43,4.13,1.81,0.000000,0.117647
8,E0,2018-08-12,Liverpool,West Ham,4,0,H,2.0,0.0,H,A Taylor,18.0,5.0,8.0,2.0,14.0,9.0,5.0,4.0,1.0,2.0,0.0,0.0,1.25,6.50,14.00,1.20,6.75,14.00,1.25,6.10,11.00,1.27,6.35,13.25,1.25,5.50,12.00,1.25,6.50,13.00,38,1.29,1.25,6.79,6.22,15.00,12.30,33,1.49,1.44,2.88,2.72,21,-1.75,1.95,1.90,2.06,1.97,1.25,6.95,12.00,0.222222,0.000000
9,E0,2018-08-12,Southampton,Burnley,0,0,D,0.0,0.0,D,G Scott,18.0,16.0,3.0,6.0,10.0,9.0,8.0,5.0,0.0,1.0,0.0,0.0,1.85,3.50,5.00,1.80,3.50,4.75,1.80,3.60,4.50,1.86,3.51,4.99,1.83,3.25,4.80,1.85,3.40,5.20,39,1.90,1.84,3.61,3.43,5.20,4.80,37,2.45,2.34,1.67,1.60,20,-0.75,2.19,2.11,1.82,1.76,2.03,3.19,4.65,0.000000,0.000000


Strings in the team names will likely be a problem for a learning algorithm. Lets make an integer ID for each team.



In [0]:
teams = set()

for team in allGames["HomeTeam"].values:
  teams.add(team)

teamDict = {}
count = 0
for team in sorted(teams):
  teamDict[team] = count
  count = count + 1

allGames["HomeTeamID"] = allGames["HomeTeam"].map(lambda x: teamDict[x])
allGames["AwayTeamID"] = allGames["AwayTeam"].map(lambda x: teamDict[x])

## Creating our feature set

We dont need all these columns, so lets select what we think matters and start building a feature dataframe.

In [0]:
features = allGames[["HomeTeamID", "AwayTeamID", "FTHG", "FTAG"]]
features

Unnamed: 0,HomeTeamID,AwayTeamID,FTHG,FTAG
0,48,42,2,1
1,9,19,2,0
2,35,28,0,2
3,38,22,0,3
4,54,82,1,2
5,85,12,2,0
6,89,31,2,2
7,2,47,0,2
8,44,87,4,0
9,75,15,0,0


Now we map these onto our feature dataframe

In [0]:
labels = pd.DataFrame()
labels["rush"] = ((features["FTHG"] > 0) & (features["FTAG"] > 0)).map(lambda x: 1 if x else 0)

features.drop(["FTHG", "FTAG"], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [0]:
def meanPrevHomeGoals(teamId, before):
  filtered = allGames[allGames["Date"] < before]
  if not filtered.empty:
    return filtered[filtered["HomeTeamID"] == teamId]["FTHG"].mean()
  return 0

def meanPrevHomeConceded(teamId, before):
  filtered = allGames[allGames["Date"] < before]
  if not filtered.empty:
    return filtered[filtered["HomeTeamID"] == teamId]["FTAG"].mean()
  return 0

def meanPrevAwayGoals(teamId, before):
  filtered = allGames[allGames["Date"] < before]
  if not filtered.empty:
    return filtered[filtered["AwayTeamID"] == teamId]["FTAG"].mean()
  return 0

def meanPrevAwayConceded(teamId, before):
  filtered = allGames[allGames["Date"] < before]
  if not filtered.empty:
    return filtered[filtered["AwayTeamID"] == teamId]["FTHG"].mean()
  return 0

def last6HomeGoalsAVG(teamId, before):
  filtered = allGames[allGames["Date"] < before]
  if not filtered.empty:
    return filtered[filtered["HomeTeamID"] == teamId].iloc[-6:,:]["FTHG"].mean()
  return 0

def last6AwayGoalsAVG(teamId, before):
  filtered = allGames[allGames["Date"] < before]
  if not filtered.empty:
    return filtered[filtered["AwayTeamID"] == teamId].iloc[-6:,:]["FTAG"].mean()
  return 0

def last6HomeStatAverage(teamId, before, stat):
  filtered = allGames[allGames["Date"] < before]
  if not filtered.empty:
    return filtered[filtered["HomeTeamID"] == teamId].iloc[-6:,:][stat].sum()
  return 0

def last6AwayStatAverage(teamId, before, stat):
  filtered = allGames[allGames["Date"] < before]
  if not filtered.empty:
    return filtered[filtered["AwayTeamID"] == teamId].iloc[-6:,:][stat].sum()
  return 0

features["date"] = allGames["Date"]
features["last6HomeTeamHomeExp"] = features.apply(lambda x: last6HomeStatAverage(x.HomeTeamID, x.date, "expH"), axis=1)
features["last6AwayTeamAwayExp"] = features.apply(lambda x: last6HomeStatAverage(x.HomeTeamID, x.date, "expA"), axis=1)
# features["homeTeamHomeGoalsAverage"] = features.apply(lambda x: meanPrevHomeGoals(x.HomeTeamID, x.date), axis=1)
# features["awayTeamAwayGoalsAverage"] = features.apply(lambda x: meanPrevAwayGoals(x.AwayTeamID, x.date), axis=1)
# features["homeTeamHomeConcededAverage"] = features.apply(lambda x: meanPrevHomeConceded(x.HomeTeamID, x.date), axis=1)
# features["homeTeamAwayConcededAverage"] = features.apply(lambda x: meanPrevAwayConceded(x.AwayTeamID, x.date), axis=1)
# features["homeTeamGoalsAverage"] = features.apply(lambda x: (meanPrevHomeGoals(x.HomeTeamID, x.date) + meanPrevAwayGoals(x.HomeTeamID, x.date)) / 2, axis=1)
# features["awayTeamGoalsAverage"] = features.apply(lambda x: (meanPrevHomeGoals(x.AwayTeamID, x.date) + meanPrevAwayGoals(x.AwayTeamID, x.date)) / 2, axis=1)
# features["homeTeamConcededAverage"] = features.apply(lambda x: (meanPrevHomeConceded(x.HomeTeamID, x.date) + meanPrevAwayConceded(x.HomeTeamID, x.date)) / 2, axis=1)
# features["awayTeamConcededAverage"] = features.apply(lambda x: (meanPrevHomeConceded(x.AwayTeamID, x.date) + meanPrevAwayConceded(x.AwayTeamID, x.date)) / 2, axis=1)
# features["last6HomeTeamHomeGoalsAvg"] = features.apply(lambda x: last6HomeGoalsAVG(x.HomeTeamID, x.date), axis=1)
# features["last6AwayTeamAwayGoalsAvg"] = features.apply(lambda x: last6AwayGoalsAVG(x.AwayTeamID, x.date), axis=1)
# features["homeTeamHomeGoalsAverage"].fillna(0, inplace=True)
# features["awayTeamAwayGoalsAverage"].fillna(0, inplace=True)
# features["homeTeamHomeConcededAverage"].fillna(0, inplace=True)
# features["homeTeamAwayConcededAverage"].fillna(0, inplace=True)
# features["homeTeamGoalsAverage"].fillna(0, inplace=True)
# features["awayTeamGoalsAverage"].fillna(0, inplace=True)
# features["homeTeamConcededAverage"].fillna(0, inplace=True)
# features["awayTeamConcededAverage"].fillna(0, inplace=True)
# features["last6HomeTeamHomeGoalsAvg"].fillna(0, inplace=True)
# features["last6AwayTeamAwayGoalsAvg"].fillna(0, inplace=True)

features.drop(["date", "HomeTeamID", "AwayTeamID"], axis=1, inplace=True)
features

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


Unnamed: 0,last6HomeTeamHomeExp,last6AwayTeamAwayExp
0,0.000000,0.000000
1,0.000000,0.000000
2,0.000000,0.000000
3,0.000000,0.000000
4,0.000000,0.000000
5,0.000000,0.000000
6,0.000000,0.000000
7,0.000000,0.000000
8,0.000000,0.000000
9,0.000000,0.000000


## Neural Network Training

Lets set up a basic neural network, to be improved with more features later on.

In [0]:
from keras import Sequential
from keras.layers import Dense, Dropout

Using TensorFlow backend.


In [0]:
from sklearn.model_selection import train_test_split

featuresTrain, featuresTest, labelsTrain, labelsTest = train_test_split(features.iloc[300:,:], labels.iloc[300:,:], train_size=0.8, random_state=90)

# from sklearn.preprocessing import MinMaxScaler

# scalar = MinMaxScaler()
# scalar.fit(featuresTrain)
# featuresTrainScaled = pd.DataFrame(scalar.transform(featuresTrain), columns=featuresTrain.columns)
# featuresTrainScaled

In [0]:
featuresTrain

Unnamed: 0,last6HomeTeamHomeExp,last6AwayTeamAwayExp
15,0.000000,0.000000
74,0.410714,0.301471
348,0.992954,0.266667
264,1.102020,0.342075
437,0.531506,0.594538
96,0.659341,0.000000
452,0.726263,0.550311
373,0.592380,0.831624
358,0.929915,0.617857
51,0.116883,0.333333


In [0]:
classifier = Sequential()
classifier.add(Dense(featuresTrain.shape[1], activation='relu', input_dim=featuresTrain.shape[1]))
classifier.add(Dense(1, activation='sigmoid'))

classifier.compile(optimizer ='adam',loss='binary_crossentropy', metrics =['accuracy'])
classifier.fit(featuresTrain, labelsTrain, epochs=50)
classifier.evaluate(featuresTrain, labelsTrain, batch_size=100)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


[0.6924364491223601, 0.5194524438992701]

In [0]:
results = pd.DataFrame()
predictions = pd.DataFrame(classifier.predict(featuresTest))
results['predictions'] = pd.Series(predictions[0].values)
results['labels'] = pd.Series(labelsTest['rush'].values)
results

Unnamed: 0,predictions,labels
0,0.518409,0
1,0.518409,1
2,0.518409,1
3,0.518409,1
4,0.518409,1
5,0.518409,1
6,0.518409,1
7,0.518409,1
8,0.518409,0
9,0.518409,0
