#CS579 Project

Predicting Football Results Using a Weight based System.

Vijay Bharrathi - A20356386
Sanker Narayanan Shanmugam	- A20358335


In [401]:
from collections import Counter
from collections import defaultdict
import glob
import hashlib
import io
import math
import matplotlib.pyplot as plt
import numpy as np
import os
import re
import httplib
import json
import sys
import time
import pickle
from sklearn.cross_validation import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from tabulate import tabulate
%matplotlib inline

In [402]:
"""Establishes a connection with the football API and gets the response for given URL.
   If the limit for API succeeds 50 per minute,the method retries after one minute 
   
   Arguments: URL Address
   Returns : Number of retries to be permitted  
   
   """
def robust_request(url1, max_tries=5):
    for i in range(max_tries):
        url = 'api.football-data.org'
        key = 'e1eb4a7bca584143a13b9c2db847117f'
        connection = httplib.HTTPConnection(url)
        headers = { 'X-Auth-Token': key, }
        connection.request('GET', url1 , None, headers )
        response = connection.getresponse();
        if response.status == 200:
            return json.loads(response.read())
        else:
            print >> sys.stderr, 'Exceeded traffic limit for api: sleeping for 1 min'
            response.read()
            sys.stderr.flush()
            time.sleep(61)

In [403]:
"""
Gets the Soccer seasons data between start and end years for given league code and constructs a table of fixture
data for all matches in this season with hometeamname,awayteamname,year,matchday,result and headtohead data
" if (season['league'] == u'CL') " this statement chooses the league.
for other league codes refer Table 1. League-Codes used in Soccerseason resource in 
http://api.football-data.org/docs/latest/index.html

Finally it writes the data to a file.

Fetching data from http://api.football-data.org/index requires much time.
So We have commented the following statements since we have fetched data already and attached along with the code.
#fetch_data(2010, 2014 , "training")
#fetch_data(2014,2015, "test")

If needed to verify run this program once uncommenting the above statements-It takes time

Arguments: Start year,End year,File to which data should be written
Returns: None
"""
def fetch_data(startYear, endYear, ouputFile):
    fixtureTab = []
    for year in range(startYear,endYear):
        seasonresponse = robust_request('/v1/soccerseasons?season=' + `year`)
        for season in seasonresponse:
            if (season['league'] == u'CL'):                 
                fixtureslink = season['_links']['fixtures']['href']
                fixturestable = robust_request(fixtureslink)
                if not ('error' in fixturestable.keys()):
                    for fixture in fixturestable['fixtures']:
                        list2 = {}
                        list2['year'] = year
                        list2['matchday'] = fixture['matchday']
                        list2['homeTeamName'] = fixture['homeTeamName']
                        list2['awayTeamName'] = fixture['awayTeamName']              
                        fixtureLink = fixture['_links']['self']['href'] 
                        fixtResponse = robust_request(fixtureLink)
                        list2['headToHeadHomeWin'] = fixtResponse['head2head']['homeTeamWins']
                        list2['headToHeadAwayWin'] = fixtResponse['head2head']['awayTeamWins']
                        list2['headToHeadDraw'] = fixtResponse['head2head']['draws']
                        if (int(fixture['result']['goalsHomeTeam']) > int(fixture['result']['goalsAwayTeam'])):
                            list2['Result'] = 1;
                        elif (int(fixture['result']['goalsHomeTeam']) < int(fixture['result']['goalsAwayTeam'])):
                            list2['Result'] = -1;
                        elif (int(fixture['result']['goalsHomeTeam']) == int(fixture['result']['goalsAwayTeam'])):
                            list2['Result'] = 0;
                        fixtureTab.append(list2)
    with open(ouputFile, 'wb') as f:
        pickle.dump(fixtureTab, f)
    print "Fetched data to file"
    
#fetch_data(2010, 2014 , "training")
#fetch_data(2014,2015, "test")  

In [404]:
"""
Reads data from file 

arguments: filename
returns: data from file requested

"""
def read_data(filename):
    with open(filename, 'rb') as f:
        data = pickle.load(f)
    return data

In [405]:
"""
Updates the total number of matches played and win/loss ratio after each match played for individual teams
"""

def updateTeamStats(TeamStats, homevalue, awayvalue, fixture):
    homevalue['hometotalgames'] += 1
    awayvalue['awaytotalgames'] += 1
   
    if (fixture['Result'] == 1):
        homevalue['homewins'] += 1
        awayvalue['awayloss'] += 1
    elif (fixture['Result'] == 0):
        homevalue['homedraws'] += 1
        awayvalue['awaydraws'] += 1
    elif (fixture['Result'] == -1):
        homevalue['homeloss'] += 1
        awayvalue['awaywins'] += 1

    TeamStats[fixture['homeTeamName']] = homevalue
    TeamStats[fixture['awayTeamName']] = awayvalue

In [406]:
"""
Creates a feature matrix containing information about the teams until the day of the match including:
1.Home Team's win/loss/draw ratios.
2.Away Team's win/loss/draw ratios.
3.Head to Head statistics between home and away team.

Arguments: Fixture table,Flag if it is training,Team statistics
Returns : Feature table
"""
def BuildFeature (fixtureTab, isTraining, TeamStats):
    FeatureTab = []
    for fixture in fixtureTab:
        if fixture['homeTeamName'] in TeamStats.keys():
            homevalue = TeamStats[fixture['homeTeamName']]
        else:
            homevalue = {'hometotalgames':0, 'homewins':0, 'homeloss':0, 'homedraws': 0, 'awaytotalgames':0, 'awaywins':0, 'awayloss': 0, 'awaydraws':0 };

        if fixture['awayTeamName'] in TeamStats.keys():
            awayvalue = TeamStats[fixture['awayTeamName']]
        else:
            awayvalue = {'hometotalgames':0, 'homewins':0, 'homeloss':0, 'homedraws': 0, 'awaytotalgames':0, 'awaywins':0, 'awayloss': 0, 'awaydraws':0 };

        Feature = {}
        if (homevalue['hometotalgames'] != 0):
            Feature['homeTeamhomewinRatio'] = float(homevalue['homewins']) / (float(homevalue['hometotalgames']))
            Feature['homeTeamhomelossRatio'] = float(homevalue['homeloss']) / (float(homevalue['hometotalgames']))
            Feature['homeTeamhomedrawRatio'] = float(homevalue['homedraws']) / (float(homevalue['hometotalgames']))
        else:
            Feature['homeTeamhomewinRatio'] = 0
            Feature['homeTeamhomelossRatio'] = 0
            Feature['homeTeamhomedrawRatio'] = 0
        if (awayvalue['awaytotalgames'] != 0):
            Feature['awayTeamawaywinRatio'] = float(awayvalue['awaywins']) / (float(awayvalue['awaytotalgames']))
            Feature['awayTeamawaylossRatio'] = float(awayvalue['awayloss']) / (float(awayvalue['awaytotalgames']))
            Feature['awayTeamawaydrawRatio'] = float(awayvalue['awaydraws']) / (float(awayvalue['awaytotalgames']))
        else:
            Feature['awayTeamawaywinRatio'] = 0
            Feature['awayTeamawaylossRatio'] = 0
            Feature['awayTeamawaydrawRatio'] = 0 

        Feature['headToHeadWinRatio'] = float(fixture['headToHeadHomeWin']) / (float(fixture['headToHeadHomeWin']) + float(fixture['headToHeadAwayWin']) + float(fixture['headToHeadDraw']))
        Feature['headToHeadLossRatio'] = float(fixture['headToHeadAwayWin']) / (float(fixture['headToHeadHomeWin']) + float(fixture['headToHeadAwayWin']) + float(fixture['headToHeadDraw']))                                    
        Feature['headToHeadDrawRatio'] = float(fixture['headToHeadDraw']) / (float(fixture['headToHeadHomeWin']) + float(fixture['headToHeadAwayWin']) + float(fixture['headToHeadDraw']))                                    
        FeatureTab.append(Feature)
        
        if (isTraining):
            updateTeamStats(TeamStats, homevalue, awayvalue, fixture)
    return FeatureTab


In [407]:
"""
Data binning : Technique used to bin values in our feature matrix
Used to improve accuracy of the model

Arguments: Matrix for which data discretization is to be done
Returns: Matrix after Discretization
"""

def Discretize(DisTab) :
    for rows in DisTab :
        for columns in rows.keys():
            if rows[columns] <=0.2 :
                rows[columns] = 0.2              
            elif rows[columns] > 0.2 and rows[columns] <=0.4 :
                rows[columns] = 0.4                
            elif rows[columns] > 0.4 and rows[columns] <=0.6 :
                rows[columns] = 0.6                
            elif rows[columns] > 0.6 and rows[columns] <=0.8 :
                rows[columns] = 0.8                
            elif rows[columns] > 0.8 :
                rows[columns] = 1

    return DisTab

In [408]:
"""
Gets the result data for each match

Arguments : Fixture Table
Returns: Labels

"""
def BuildLabel(fixtureTab):
    LableTab = []
    for fixture in fixtureTab:
        Lab = []
        Lab.append(fixture['Result'])
        LableTab.append(Lab)
    labels = np.array(LableTab)
    return labels

In [409]:
"""
Converts Feature matrix to np Array

"""

def covertFeatureToNpArray(FeatureTab):
    FeatureList = []
    for feat in FeatureTab:
        Feature = []
        Feature.append(feat['awayTeamawaywinRatio'])
        Feature.append(feat['awayTeamawaylossRatio'])
        Feature.append(feat['headToHeadDrawRatio'])
        Feature.append(feat['headToHeadWinRatio'])
        Feature.append(feat['headToHeadLossRatio'])
        Feature.append(feat['homeTeamhomelossRatio'])
        Feature.append(feat['homeTeamhomedrawRatio'])
        Feature.append(feat['awayTeamawaydrawRatio'])
        Feature.append(feat['homeTeamhomewinRatio'])
        FeatureList.append(Feature)
    mat = np.array(FeatureList)
    return mat

In [410]:
def train(feature, label):
    clf = get_clf()
    clf.fit(feature, label)
    return clf

In [411]:
def predict(clf, feature, label):
    predicted = clf.predict(feature)               
    print ('accuracy of the predicted test data = %.4f' %(accuracy_score(label, predicted)))

In [412]:
def get_clf():
    return LogisticRegression(random_state=42)

In [413]:
"""
Perform n-fold cross validation, calling get_clf() to train n different classifiers. 
"""
def do_cross_validation(X, y, n_folds, verbose=False):
    
    kf = KFold(len(y), n_folds=n_folds,shuffle=False )
    accuracy = []
    for train_idx, test_idx in kf:
        clf = get_clf()
        clf.fit(X[train_idx], y[train_idx])
        predicted = clf.predict(X[test_idx])               
        accuracy.append(accuracy_score(y[test_idx], predicted))
    avg = np.mean(accuracy)
    for i in range(n_folds) :
        if verbose == True :
            print ('fold %d accuracy = %.4f' %(i,accuracy[i])) 
    return avg

In [414]:
TrainingData = read_data("training")
TeamStats = {}
FeatureTab = BuildFeature(TrainingData, True, TeamStats)
FeatureDis = Discretize(FeatureTab)
FeatureMat = covertFeatureToNpArray(FeatureDis)
LabelMat = BuildLabel(TrainingData)

In [415]:
"""
Builds model for given data matrix and labels

Arguments: Feature Matrix,Labels,Number of folds to be done in cross validation
Returns: A Logestic Regression model
"""
def buildModel(FeatureMat,LabelMat,n_folds):
    print('average cross validation accuracy=%.4f' %
      do_cross_validation(FeatureMat, LabelMat.ravel(),n_folds = n_folds, verbose=False))
    clf = train(FeatureMat, LabelMat.ravel())
    Result = {'TeamStats': TeamStats, 'CLF': clf}
    return Result
Result = buildModel(FeatureMat, LabelMat.ravel(),5)

average cross validation accuracy=0.7520


In [416]:
"""
Builds Feature matrix for Test data

Arguments: Test Data
Returns: Feature matrix for test data

"""
def buildtest(TestFeature) :
    TestFeature = BuildFeature(TestData, False, Result['TeamStats'])
    TestDis = Discretize(TestFeature)
    TestFeatureMat = covertFeatureToNpArray(TestDis)
    return TestFeatureMat

In [417]:
"""
Runs one experiment for test data and returns predicted accuracy
"""
def expt_with_testdata(Result, TestData,TestLabelMat):
    predict(Result['CLF'], TestFeatureMat, TestLabelMat)    

TestData = read_data("test")
TestFeatureMat = buildtest(TestData)
TestLabelMat = BuildLabel(TestData)
expt_with_testdata(Result, TestData,TestLabelMat)

accuracy of the predicted test data = 0.8000


In [418]:
clf = get_clf()
clf.fit(TestFeatureMat,TestLabelMat.ravel())
predlist = clf.predict(TestFeatureMat)
def predictedList (matchday) :
    for matchday in range(len(predlist)):
        return predlist[matchday]

In [419]:
"""
Compares actual result with the result predicted by the model for each match

Arguments : Test Data
Returns : Dictionary of {'hometeamname':,'Awayteamname':,'Actualresult':,'PredictedResult':}

"""

def finalResult(TestData) :
    matchday = 0
    finalPredictionresult = []
    for i in TestData:
        f = {}
        f['HomeTeam'] = (i['homeTeamName'])
        f['AwayTeam'] = (i['awayTeamName'])        
        f['Predicted Result'] = (predictedList(matchday))    
        if i['Result'] == 1 :
            f['Actual Result'] = (i['homeTeamName'])
        elif i['Result'] == -1 :
            f['Actual Result'] = (i['awayTeamName'])
        elif i['Result'] == 0 :
            f['Actual Result'] = "Draw"
        matchday += 1
        finalPredictionresult.append(f)
    for i in finalPredictionresult :
        if i['Predicted Result'] == 1:
            i['Predicted Result'] = i['HomeTeam']
        if i['Predicted Result'] == -1:
            i['Predicted Result'] = i['AwayTeam']
        if i['Predicted Result'] == 0:
            i['Predicted Result'] = "Draw"
    return finalPredictionresult

print tabulate(finalResult(TestData),headers="keys",tablefmt="fancy_grid")


╒═════════════════════════╤═════════════════════════╤═════════════════════════╤═════════════════════════╕
│ AwayTeam                │ HomeTeam                │ Actual Result           │ Predicted Result        │
╞═════════════════════════╪═════════════════════════╪═════════════════════════╪═════════════════════════╡
│ Ludogorez Rasgrad       │ Liverpool FC            │ Liverpool FC            │ Liverpool FC            │
├─────────────────────────┼─────────────────────────┼─────────────────────────┼─────────────────────────┤
│ FC Basel                │ Real Madrid CF          │ Real Madrid CF          │ Real Madrid CF          │
├─────────────────────────┼─────────────────────────┼─────────────────────────┼─────────────────────────┤
│ FC Zenit St. Petersburg │ Benfica Lissabon        │ FC Zenit St. Petersburg │ Benfica Lissabon        │
├─────────────────────────┼─────────────────────────┼─────────────────────────┼─────────────────────────┤
│ Bayer Leverkusen        │ AS Monaco FC      

In [420]:
"""
Compares actual wins with the wins predicted by the model for each team

Arguments : Test Data
Returns : Dictionary of {'Actualwins':,'Predictedwins':,'Teamname':}

"""


def compare_wins(TestData) :
    Actualwins = []
    predList = []
    Totalwins = []
    actList = []
    PredictedData = finalResult(TestData)            
    for teams in PredictedData :
        predList.append(teams['Predicted Result'])
        actList.append(teams['Actual Result'])
    Actualwins = Counter(actList)
    Predictedwins = Counter(predList)
    for teams in Predictedwins:
        p = {}
        p['Teamname'] = teams
        p['Predictedwins'] = Predictedwins[teams]
        p['Actualwins']  = Actualwins[teams]
        Totalwins.append(p)
    return Totalwins

print tabulate(compare_wins(TestData),headers="keys",tablefmt="fancy_grid")

╒══════════════╤═════════════════╤═════════════════════════╕
│   Actualwins │   Predictedwins │ Teamname                │
╞══════════════╪═════════════════╪═════════════════════════╡
│            1 │               3 │ CSKA Moscow             │
├──────────────┼─────────────────┼─────────────────────────┤
│            2 │               4 │ FC Basel                │
├──────────────┼─────────────────┼─────────────────────────┤
│            1 │               3 │ FK BATE Baryssau        │
├──────────────┼─────────────────┼─────────────────────────┤
│            0 │               3 │ APOEL Nicosia           │
├──────────────┼─────────────────┼─────────────────────────┤
│            4 │               5 │ Paris Saint-Germain     │
├──────────────┼─────────────────┼─────────────────────────┤
│            5 │               4 │ Arsenal FC              │
├──────────────┼─────────────────┼─────────────────────────┤
│            1 │               3 │ Ajax Amsterdam          │
├──────────────┼────────