In [1]:
import numpy as np
import pandas as pd
import sqlite3
import matplotlib.pyplot as plt
import tensorflow as tf
import edward as ed
from edward.models import Normal, Poisson

## Data Selection

As a start, let's select the data from the Premier League in the 2008/2009 season

In [2]:
path = "./Input/"  #Insert path here
database = path + 'database.sqlite'
conn = sqlite3.connect(database)

tables = pd.read_sql("""SELECT *
                        FROM sqlite_master
                        WHERE type='table';""", conn)

In [3]:
detailed_matches = pd.read_sql("""SELECT Match.id, 
                                        Country.name AS country_name, 
                                        League.name AS league_name, 
                                        season, 
                                        stage, 
                                        date,
                                        HT.team_long_name AS  home_team,
                                        AT.team_long_name AS away_team,
                                        home_team_goal, 
                                        away_team_goal                                        
                                FROM Match
                                JOIN Country on Country.id = Match.country_id
                                JOIN League on League.id = Match.league_id
                                LEFT JOIN Team AS HT on HT.team_api_id = Match.home_team_api_id
                                LEFT JOIN Team AS AT on AT.team_api_id = Match.away_team_api_id
                                where country_name = 'England' and season = '2008/2009'
                                ORDER by date
                                ;""", conn)

Next let's create a matrix that evaluates how the teams performed against each other. If there are n teams, then there are nxn entries, with [a,b] being team a's score vs team b. we have [a,b] == -[b,a]

In [4]:
# Make a matrix of nxn, matchups of all the teams
teams = detailed_matches.home_team.unique()
teams_dict = dict(zip(teams, range(len(teams))))

n = len(teams)
matchup = np.zeros((n,n))

# Normalize inter-team scores by number of games played
games_count = np.ones((n,n))


for index, row in detailed_matches.iterrows():
    home = teams_dict[row.home_team]
    away = teams_dict[row.away_team]
    score = row.home_team_goal - row.away_team_goal
    matchup[home, away] += score
    matchup[away, home] -= score
    games_count[home, away] += 1
    games_count[away, home] += 1

matchup = np.divide(matchup,games_count)

## Model Building

Now we create an Edward model, with the team's skill represented as a Gaussian, initialized like 25, (25/3)^2

In [5]:
#Team Skill
#initialize 
initial_loc = tf.ones((n,1), dtype='float32') * 25
initial_scale = tf.ones((n,1),  dtype='float32') * (25/3)**2

team_skill = Normal(loc=initial_loc, scale=initial_scale)

#Team Performance
team_performance = Normal(loc=team_skill, scale=initial_scale)

perf_diff = tf.tile(tf.reduce_sum(team_performance, 1, keepdims=True), [1, n])
perf_diff = perf_diff - tf.transpose(perf_diff)


In [6]:
qz = Normal(loc=tf.get_variable("qz/loc", [n, 1]),
            scale=tf.nn.softplus(tf.get_variable("qz/scale", [n, 1])))

inference = ed.KLqp({team_skill: qz}, data={perf_diff: matchup*25})

  not np.issubdtype(value.dtype, np.float) and \


In [7]:
inference.run(n_iter=10000)

10000/10000 [100%] ██████████████████████████████ Elapsed: 9s | Loss: 3.966


## Evaluating Results