Script for predicting football scores in the German Bundesliga.

Required training data for each past match:

Predictor Variables:
- HT   = HomeTeam, 
- AT   = AwayTeam, 
- R    = Round, 
- HTP  = HomeTeamPos, 
- ATP  = AwayTeamPos, 
- HTGS = HomeTeamGoalsScored, 
- HTGA = HomeTeamGoalsAgainst, 
- ATGS = AwayTeamGoalsScored, 
- ATGA = AwayTeamGoalsAgainst

Response Variables (the values to predict):
- HG = HomeTeamGoals
- AG = AwayTeamGoals

Idea 1:
- Time-series approach: predict from exponentially smoothed results of the past n rounds, combined with a global strength value for each team derived from its overall points

Idea 2:
- Regression approach: take all data into account, identify the relevant predictor variables, and fit the regression function using an SVM

In [230]:
import pandas as pd
import numpy as np
from numpy import zeros
from sklearn import datasets, svm, metrics, linear_model
from sklearn.svm import SVR

In [231]:
# Load the data of the last 11 seasons (the first 10 are used as training set,
# the last one as test set).
# CSV columns: Round, Date, Team 1, Team 2, FT (full-time score), HT (half-time score)
# Each row represents one match of the season.
# Scores are in the format "n-m", e.g. "2-1", "1-0", etc.
#
# The date column is selected by name. With the column layout above, the
# positional index 2 (0-based) points at 'Team 1', so 'Date' was never parsed.
data_2003_2004 = pd.read_csv('data/de/2000s/2003-04/1-bundesliga.csv', parse_dates=['Date'])
data_2004_2005 = pd.read_csv('data/de/2000s/2004-05/1-bundesliga.csv', parse_dates=['Date'])
data_2005_2006 = pd.read_csv('data/de/2000s/2005-06/1-bundesliga.csv', parse_dates=['Date'])
data_2006_2007 = pd.read_csv('data/de/2000s/2006-07/1-bundesliga.csv', parse_dates=['Date'])
data_2007_2008 = pd.read_csv('data/de/2000s/2007-08/1-bundesliga.csv', parse_dates=['Date'])
data_2008_2009 = pd.read_csv('data/de/2000s/2008-09/1-bundesliga.csv', parse_dates=['Date'])
data_2009_2010 = pd.read_csv('data/de/2000s/2009-10/1-bundesliga.csv', parse_dates=['Date'])
data_2010_2011 = pd.read_csv('data/de/2010s/2010-11/1-bundesliga.csv', parse_dates=['Date'])
data_2011_2012 = pd.read_csv('data/de/2010s/2011-12/1-bundesliga.csv', parse_dates=['Date'])
data_2012_2013 = pd.read_csv('data/de/2010s/2012-13/1-bundesliga.csv', parse_dates=['Date'])
data_2013_2014 = pd.read_csv('data/de/2010s/2013-14/1-bundesliga.csv', parse_dates=['Date'])




In [232]:
class Trend:
    """Sliding window over the last five values recorded for a team.

    ``one`` holds the oldest value and ``five`` the most recent one.
    """

    def __init__(self, one, two, three, four, five):
        self.one, self.two, self.three, self.four, self.five = (
            one, two, three, four, five)

    def update(self, next):
        """Discard the oldest value and store ``next`` as the newest one."""
        # The right-hand side is evaluated completely before any attribute is
        # rebound, so this is the same shift as five sequential assignments.
        self.one, self.two, self.three, self.four, self.five = (
            self.two, self.three, self.four, self.five, next)

    def points(self):
        """Return the sum of the five stored values."""
        return sum((self.one, self.two, self.three, self.four, self.five))

# Initialize the temporary league table for one season.
def init_table(data):
    """Build the league table as it stands before the first match.

    The teams are taken from the home-team column (``'Team 1'``) of ``data``
    and listed in the sorted order returned by ``np.unique``. The starting
    positions 1..n therefore follow the team names; they do not carry over
    any standings from a previous season.

    Parameters
    ----------
    data : pandas.DataFrame
        Matches of one season; only the ``'Team 1'`` column is read.

    Returns
    -------
    pandas.DataFrame
        One row per team with the columns ``Pos``, ``Team``, ``Points``,
        ``Trend5`` (a fresh ``Trend`` of five zeros per team),
        ``GoalsScored``, ``GoalsAgainst`` and ``Diff``. All counters
        start at zero.
    """
    columns = ['Pos', 'Team', 'Points', 'Trend5', 'GoalsScored', 'GoalsAgainst', 'Diff']
    teams = np.unique(data['Team 1'])

    # Build the frame in one go instead of assigning rows through the removed
    # ``DataFrame.ix`` indexer. The size follows the number of teams found,
    # so leagues with more or fewer than 18 teams work as well.
    table = pd.DataFrame({
        'Pos': np.arange(len(teams)) + 1,  # 1..n
        'Team': teams,
        'Points': 0,
        # every team needs its own Trend instance because it is mutated in place
        'Trend5': [Trend(0, 0, 0, 0, 0) for _ in teams],
        'GoalsScored': 0,
        'GoalsAgainst': 0,
        'Diff': 0,
    }, columns=columns)

    return table

def _match_result(home_goals, away_goals):
    """Encode the outcome as 1 (home win), 0 (draw) or -1 (away win)."""
    if home_goals > away_goals:
        return 1
    if home_goals < away_goals:
        return -1
    return 0


def _team_features(table, label):
    """Read the pre-match features of one team from the running table.

    Returns ``(pos, trend_points, trend_values, goals_scored, goals_against)``
    where ``trend_values`` lists the five trend entries, oldest first.
    """
    trend = table.loc[label, 'Trend5']
    return (table.loc[label, 'Pos'],
            trend.points(),
            [trend.one, trend.two, trend.three, trend.four, trend.five],
            table.loc[label, 'GoalsScored'],
            table.loc[label, 'GoalsAgainst'])


def _book_result(table, label, goals_for, goals_against):
    """Add one finished match to the table row of a single team."""
    if goals_for > goals_against:
        earned = 3
    elif goals_for == goals_against:
        earned = 1
    else:
        earned = 0

    table.loc[label, 'Points'] += earned
    # the Trend object is mutated in place, so nothing has to be written back
    table.loc[label, 'Trend5'].update(earned)
    table.loc[label, 'GoalsScored'] += goals_for
    table.loc[label, 'GoalsAgainst'] += goals_against
    table.loc[label, 'Diff'] = (table.loc[label, 'GoalsScored']
                                - table.loc[label, 'GoalsAgainst'])


# We take the data from the CSV files and enhance it with information like
# which position the teams had in the table or how many goals they had scored
# in total before that particular game.
def enhance(data):
    """Attach pre-match table information to every match of one season.

    The season is replayed round by round. For every match the current table
    position, the last-five trend and the goal totals of both teams are
    recorded *before* the result of that match is booked. Points, trend and
    goals are updated right after each match; the positions are only
    recomputed once a round is complete (sorted by points, goal difference
    and goals scored, all descending).

    Parameters
    ----------
    data : pandas.DataFrame
        Matches of one season with the columns ``'Round'``, ``'Date'``,
        ``'Team 1'``, ``'Team 2'`` and ``'FT'`` (score formatted ``"n-m"``).

    Returns
    -------
    pandas.DataFrame
        One row per match, labelled and ordered by the index of ``data``.
        ``Result`` is 1 for a home win, 0 for a draw and -1 for an away win.
        Column dtypes are inferred from the collected values.

    Raises
    ------
    KeyError
        If a team of a match is missing from the table built by
        ``init_table`` (which is derived from the home-team column).
    """
    cols = ['Round', 'Date',
            'HomeTeam', 'AwayTeam',
            'HomeTeamPos', 'AwayTeamPos',
            'HomeTeamTrendPoints', 'AwayTeamTrendPoints',
            'HomeTeamTrend1', 'HomeTeamTrend2', 'HomeTeamTrend3', 'HomeTeamTrend4', 'HomeTeamTrend5',
            'AwayTeamTrend1', 'AwayTeamTrend2', 'AwayTeamTrend3', 'AwayTeamTrend4', 'AwayTeamTrend5',
            'HomeTeamGoalsScored', 'HomeTeamGoalsAgainst',
            'AwayTeamGoalsScored', 'AwayTeamGoalsAgainst',
            'ResultHomeTeamGoals', 'ResultAwayTeamGoals', 'Result']

    if len(data) == 0:
        # nothing to replay; min()/max() of an empty column would be NaN
        return pd.DataFrame(columns=cols)

    table = init_table(data)
    # Sorting the table keeps the row labels, so the label of each team can be
    # looked up once instead of filtering the table for every match.
    label_of = dict(zip(table['Team'], table.index))

    rows = []
    row_labels = []
    rounds = np.arange(data['Round'].min(), data['Round'].max() + 1)
    for i in rounds:
        # enhance data of current round with current values from table
        round_data = data[data['Round'] == i]
        for idx, match in round_data.iterrows():
            home_team = match['Team 1']
            away_team = match['Team 2']
            score = match['FT'].split('-')
            result_home_team = int(score[0])
            result_away_team = int(score[1])

            home_label = label_of[home_team]
            away_label = label_of[away_team]

            # snapshot both teams before either row is updated
            (home_pos, home_trend_points, home_trend,
             home_scored, home_against) = _team_features(table, home_label)
            (away_pos, away_trend_points, away_trend,
             away_scored, away_against) = _team_features(table, away_label)

            rows.append([i, match['Date'],
                         home_team, away_team,
                         home_pos, away_pos,
                         home_trend_points, away_trend_points]
                        + home_trend
                        + away_trend
                        + [home_scored, home_against,
                           away_scored, away_against,
                           result_home_team, result_away_team,
                           _match_result(result_home_team, result_away_team)])
            row_labels.append(idx)

            # update table with the result of this match
            _book_result(table, home_label, result_home_team, result_away_team)
            _book_result(table, away_label, result_away_team, result_home_team)

        # positions only change once the round is complete
        table = table.sort_values(by=['Points', 'Diff', 'GoalsScored'], ascending=False)
        table['Pos'] = np.arange(len(table)) + 1

    # Building the frame once is much cheaper than assigning row by row, and
    # its size follows the number of matches instead of a fixed 9*34.
    enhanced_data = pd.DataFrame(rows, index=row_labels, columns=cols)
    return enhanced_data.sort_index()


In [233]:
# Enhance every season on its own so that each one is replayed from a fresh
# table, then stack all seasons into a single frame in chronological order.
(data_2003_2004_enhanced,
 data_2004_2005_enhanced,
 data_2005_2006_enhanced,
 data_2006_2007_enhanced,
 data_2007_2008_enhanced,
 data_2008_2009_enhanced,
 data_2009_2010_enhanced,
 data_2010_2011_enhanced,
 data_2011_2012_enhanced,
 data_2012_2013_enhanced,
 data_2013_2014_enhanced) = [
    enhance(season) for season in (
        data_2003_2004,
        data_2004_2005,
        data_2005_2006,
        data_2006_2007,
        data_2007_2008,
        data_2008_2009,
        data_2009_2010,
        data_2010_2011,
        data_2011_2012,
        data_2012_2013,
        data_2013_2014,
    )
]

# show the first season as a sanity check
print(data_2003_2004_enhanced)

# ignore_index renumbers the rows 0..n-1 across all seasons
data = pd.concat(
    [
        data_2003_2004_enhanced,
        data_2004_2005_enhanced,
        data_2005_2006_enhanced,
        data_2006_2007_enhanced,
        data_2007_2008_enhanced,
        data_2008_2009_enhanced,
        data_2009_2010_enhanced,
        data_2010_2011_enhanced,
        data_2011_2012_enhanced,
        data_2012_2013_enhanced,
        data_2013_2014_enhanced,
    ],
    axis=0,
    ignore_index=True,
)


    Round        Date              HomeTeam              AwayTeam HomeTeamPos  \
0       1  2003-08-01        Bayern München   Eintracht Frankfurt           4   
1       1  2003-08-02         VfL Wolfsburg            VfL Bochum          17   
2       1  2003-08-02  1. FC Kaiserslautern      TSV 1860 München           1   
3       1  2003-08-02            Hertha BSC         Werder Bremen          12   
4       1  2003-08-02      Bayer Leverkusen           SC Freiburg           3   
5       1  2003-08-02          Hamburger SV           Hannover 96           9   
6       1  2003-08-02         FC Schalke 04     Borussia Dortmund           8   
7       1  2003-08-03  Bor. Mönchengladbach            1. FC Köln           5   
8       1  2003-08-03         Hansa Rostock         VfB Stuttgart          11   
9       2  2003-08-09           Hannover 96        Bayern München           2   
10      2  2003-08-09           SC Freiburg         Hansa Rostock          16   
11      2  2003-08-09       

In [234]:
# A season contributes 9 matches in each of its 34 rounds, so the last 9*34
# rows of the stacked frame are exactly the final season.
data_train = data[:-(9*34)] # all but last season
data_test = data[-(9*34):] # only last season

response = 'Result' # 1, 0, or -1
# alternative, richer feature set:
#predictors = ['Round', 'HomeTeamPos', 'AwayTeamPos', 'HomeTeamGoalsScored', 'HomeTeamGoalsAgainst', 'AwayTeamGoalsScored', 'AwayTeamGoalsAgainst']
predictors = ['Round', 'HomeTeamPos', 'AwayTeamPos']

# Series.get_values() was removed in pandas 1.0; tolist() yields the same
# plain Python list of outcomes.
expected = data_test[response].tolist()

def print_precision(expected, predicted):
    """Print the share of entries in ``expected`` matched by ``predicted``.

    Both sequences are compared position by position; the number of matches
    is divided by ``len(expected)``. The printed label says "precision",
    the value is the plain fraction of correct predictions.
    """
    hits = 0
    for pos, truth in enumerate(expected):
        if truth == predicted[pos]:
            hits += 1
    share = hits / float(len(expected))
    print('precision: ' + str(share))

In [236]:
# Baseline: always predict a win for the team that is placed better in the table.
def simple_predictor(dataset):
    """Return one prediction per row of ``dataset``.

    The prediction is 1 (home win) when ``HomeTeamPos`` is smaller than
    ``AwayTeamPos`` and -1 (away win) otherwise; a draw is never predicted.
    """
    predictions = []
    for home_pos, away_pos in zip(dataset['HomeTeamPos'], dataset['AwayTeamPos']):
        if home_pos < away_pos:
            predictions.append(1)
        else:
            predictions.append(-1)
    return predictions

# Evaluate the position-based baseline on the held-out last season.
predicted = simple_predictor(data_test[predictors])

# prints the fraction of test matches whose outcome was predicted correctly
print_precision(expected, predicted)

# Disabled detailed report.
# NOTE(review): scikit-learn presumably orders the labels ascending (-1, 0, 1),
# in which case target_names would have to read
# ['away wins', 'tie', 'home wins'] -- confirm before re-enabling.
#target_names = ['tie', 'home wins', 'away wins']

#print("Classification report for classifier %s:\n%s\n" % ("simple_predictor", metrics.classification_report(expected, predicted,target_names=target_names)))
#print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted))

precision: 0.5359477124183006


In [238]:
# Linear regression on the table positions and the last-five-match trend
# values of both teams.
predictors = ['HomeTeamPos', 'AwayTeamPos',
            'HomeTeamTrend1', 'HomeTeamTrend2', 'HomeTeamTrend3', 'HomeTeamTrend4', 'HomeTeamTrend5', 
            'AwayTeamTrend1', 'AwayTeamTrend2', 'AwayTeamTrend3', 'AwayTeamTrend4', 'AwayTeamTrend5']

# fit on all seasons but the last; the target is the numeric result (1 / 0 / -1)
regr = linear_model.LinearRegression()
regr.fit(data_train[predictors], data_train[response])
# one coefficient per entry of `predictors`, in the same order
print(regr.coef_)
predicted_lin = regr.predict(data_test[predictors])
# we need to convert the floating values to our response format
# np.sign maps positive values to 1 and negative values to -1; only an exact
# 0.0 would become 0, so a draw is practically never predicted here
predicted = [np.sign(v) for v in predicted_lin]
print_precision(expected, predicted)
print(predicted)

[-0.02207907  0.0196803   0.02216003  0.03356788  0.02697231  0.02591054
  0.00163103 -0.00431951 -0.03199717 -0.0010145  -0.00102393 -0.01221405]
precision: 0.5261437908496732
[1.0, 1.0, 1.0, 1.0, 1.0, -1.0, 1.0, 1.0, 1.0, -1.0, -1.0, 1.0, -1.0, 1.0, -1.0, -1.0, -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, -1.0, -1.0, -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, -1.0, 1.0, 1.0, -1.0, -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, -1.0, 1.0, 1.0, 1.0, -1.0, 1.0, 1.0, -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, -1.0, -1.0, -1.0, 1.0, -1.0, -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, -1.0, 1.0, -1.0, 1.0, 1.0, -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, -1.0, 1.0, 1.0, -1.0, 1.0, 1.0, 1.0, -1.0, -1.0, 1.0, 1.0, 1.0, -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, -1.0, 1.0, -1.0, 1.0, 1.0, -1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, 1.0, 

In [245]:
# SVM predictor: linear support vector classifier on positions, trend and
# goal totals of both teams.
predictors = ['HomeTeamPos', 'AwayTeamPos', 'HomeTeamTrendPoints', 'AwayTeamTrendPoints',
            'HomeTeamTrend1', 'HomeTeamTrend2', 'HomeTeamTrend3', 'HomeTeamTrend4', 'HomeTeamTrend5',
            'AwayTeamTrend1', 'AwayTeamTrend2', 'AwayTeamTrend3', 'AwayTeamTrend4', 'AwayTeamTrend5',
            'HomeTeamGoalsScored', 'HomeTeamGoalsAgainst',
            'AwayTeamGoalsScored', 'AwayTeamGoalsAgainst']

svc = svm.LinearSVC()
# DataFrame/Series.get_values() was removed in pandas 1.0; .values / tolist()
# yield the same plain lists.
svc.fit(data_train[predictors].values.tolist(), data_train[response].tolist())

# Query the model with the same kind of input it was fitted on (plain lists)
# instead of mixing lists for fit() with a DataFrame for predict().
predicted = svc.predict(data_test[predictors].values.tolist()).tolist()
expected = data_test[response].tolist()

print(predicted)
print_precision(expected, predicted)

# Disabled detailed report.
# NOTE(review): scikit-learn presumably orders the labels ascending (-1, 0, 1),
# in which case target_names would have to read
# ['away wins', 'tie', 'home wins'] -- confirm before re-enabling.
#target_names = ['tie', 'home wins', 'away wins']

#print("Classification report for classifier %s:\n%s\n" % (svc, metrics.classification_report(expected, predicted,target_names=target_names)))
#print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted))

[-1, 1, -1, 1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1