# English Premier League (EPL) Pythagorean Predictor

## Pythagorean Expectation

Expected Win% $\propto\frac{x^2}{x^2 + y^2}$, where

- x = quantity scored by the team (in this notebook: goals for, `GF`)
- y = quantity conceded by the team (in this notebook: goals against, `GA`)

In [1]:
# Importing Packages

import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns

# Custom
# Notebook-wide display configuration.
import warnings
# Silences every warning for the rest of the session. This also hides
# deprecation warnings from pandas/seaborn, so turn it off when debugging.
warnings.filterwarnings('ignore')
%config InlineBackend.figure_formats = ['svg'] # makes everything svg by default
# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline

In [2]:
# Read Data
# The workbook path is relative to the notebook's working directory.
dataset = pd.read_excel('../ds/EPL2017-18.xlsx')

# List the available columns, then preview the first rows.
column_names = dataset.columns.tolist()
print(column_names)

display(dataset.head())

FileNotFoundError: [Errno 2] No such file or directory: '../ds/EPL2017-18.xlsx'

In [None]:
# Cleanup
# Every row is one match, so summing 'count' per team gives matches played.
dataset['count'] = 1

# Win value of each match from the home / away side's point of view:
# 1 when FTR names that side ('H' / 'A'), 0.5 when FTR is 'D', 0 otherwise.
dataset['hwinvalue'] = np.where( dataset['FTR']=='H',1, np.where(dataset['FTR']=='D',.5,0) )
dataset['awinvalue'] = np.where( dataset['FTR']=='A',1, np.where(dataset['FTR']=='D',.5,0) )

# Threshold separating the two halves of the season.
# NOTE(review): assumes Date is stored as an integer of the form YYYYMMDD - confirm.
SPLIT_DATE = 20180000

# FTHG counts as goals for the home side and goals against the away side;
# FTAG is the mirror image, hence the swapped GF/GA labels for away games.
HOME_RENAMES = {'HomeTeam':'Team','count':'MPh','FTHG':'GFh', 'FTAG':'GAh'}
AWAY_RENAMES = {'AwayTeam':'Team','count':'MPa','FTHG':'GAa','FTAG':'GFa'}


def _team_totals(matches, team_col, win_col, renames):
    """Sum matches played, win value and goals per team for one venue.

    Parameters
    ----------
    matches : DataFrame of matches to aggregate.
    team_col : column to group by ('HomeTeam' or 'AwayTeam').
    win_col : win-value column to sum ('hwinvalue' or 'awinvalue').
    renames : mapping applied to the resulting column names.

    Returns
    -------
    DataFrame with one row per team and the renamed total columns.
    """
    # Columns are selected with a list: selecting several columns from a
    # groupby with a bare tuple is no longer accepted by current pandas.
    totals = (
        matches.groupby([team_col])[['count', win_col, 'FTHG', 'FTAG']]
        .sum()
        .reset_index()
    )
    return totals.rename(columns=renames)


first_half_matches = dataset[dataset.Date < SPLIT_DATE]
second_half_matches = dataset[dataset.Date > SPLIT_DATE]

home1 = _team_totals(first_half_matches, 'HomeTeam', 'hwinvalue', HOME_RENAMES)
away1 = _team_totals(first_half_matches, 'AwayTeam', 'awinvalue', AWAY_RENAMES)

home2 = _team_totals(second_half_matches, 'HomeTeam', 'hwinvalue', HOME_RENAMES)
away2 = _team_totals(second_half_matches, 'AwayTeam', 'awinvalue', AWAY_RENAMES)

# One row per team holding both its home and its away totals.
half1 = pd.merge(home1, away1, on="Team")
half2 = pd.merge(home2, away2, on="Team")

In [None]:
# Evaluations
halves = [half1, half2]

# For each half: combine the home and away totals, then derive the
# Pythagorean expectation and the actual win percentage. The frames are
# extended in place, so `halves`, `half1` and `half2` all see the new columns.
for frame, pyth_col, wpc_col in ((half1, "pyth1", "wpc1"), (half2, "pyth2", "wpc2")):
    frame["MP"] = frame["MPh"].add(frame["MPa"])
    frame["wValue"] = frame["hwinvalue"].add(frame["awinvalue"])
    frame["GF"] = frame["GFh"].add(frame["GFa"])
    frame["GA"] = frame["GAh"].add(frame["GAa"])

    scored_sq = frame["GF"].pow(2)
    conceded_sq = frame["GA"].pow(2)
    frame[pyth_col] = scored_sq / (scored_sq + conceded_sq)
    frame[wpc_col] = frame["wValue"].div(frame["MP"])

In [None]:
# Cleaned up Dataset
# Per-venue helper columns are hidden so that only the team name and the
# combined/derived columns are shown for each half.
dropCols = ["MPh", "hwinvalue", "GFh", "GAh", "MPa", "awinvalue", "GFa", "GAa"]

for frame in halves:
    preview = frame.head().drop(columns=dropCols)
    display(preview)

In [None]:
# Join both halves per team so that first-half columns (e.g. pyth1) can be
# compared with second-half columns (e.g. wpc2) on the same row.
predictor = half1.merge(half2, on = "Team")

In [None]:
# Scatter of second-half win percentage against first-half Pythagorean
# expectation, with both axes fixed to the [0, 1] range.
grid = sns.relplot(x="pyth1", y="wpc2", data = predictor)
grid.ax.set_title("2nd Half Win% vs 1st Half Pythagorean Expectation")
grid.ax.set_xlim(0, 1)
grid.ax.set_ylim(0, 1)
plt.show()

In [None]:
# Plotting
# NOTE(review): this cell draws the same scatter as the cell before it.
sns.relplot(x="pyth1", y="wpc2", data = predictor)
axes = plt.gca()
axes.set_title("2nd Half Win% vs 1st Half Pythagorean Expectation")
axes.set_xlim(0, 1)
axes.set_ylim(0, 1)
plt.show()

In [None]:
# Regression
# Ordinary least squares of second-half win percentage (wpc2) on the
# first-half Pythagorean expectation (pyth1).
model = smf.ols('wpc2 ~ pyth1', data=predictor)
regression = model.fit()

# Last expression of the cell, so the summary table is rendered as output.
regression.summary()

In [None]:
# correlation matrix

values = predictor[['Team', 'wpc1', 'wpc2', 'pyth1', 'pyth2']]

# 'Team' holds team names, which cannot be correlated. Current pandas raises
# on non-numeric columns instead of dropping them silently, so the numeric
# columns are selected explicitly before computing the matrix.
display( values.select_dtypes(include='number').corr() )

In [None]:
# Quiz Questions
# Each answer is derived from the frames built in the earlier cells.
# Team names are read through the 'Team' label rather than by position,
# because integer keys on a label-indexed row are deprecated in pandas.


print(
    "How many EPL games from this season were played in 2018?"
    + "\n" +
    str(dataset[dataset.Date > 20180000].shape[0])
)

print(
    "Which team scored the highest number of goals while playing at home in the first half of the season?"
    + "\n" +
    half1.sort_values("GFh", ascending=False).iloc[0]["Team"]
)

print(
    "Which team conceded the highest number of goals while playing away in the first half of the season?"
    + "\n" +
    half1.sort_values("GAa", ascending=False).iloc[0]["Team"]
)

# Absolute gap between actual win percentage and Pythagorean expectation.
half1['dev'] = abs(half1['wpc1'] - half1['pyth1'])
print(
    "Which of the following teams had the smallest difference between their win percentage and Pythagorean expectation in the first half of the season?"
)
display( half1.sort_values("dev", ascending=True).head() )
# NOTE(review): the answers below are hard-coded, not computed from the table.
print("Mancity")
print(
    "Which of the following teams had the smallest difference between their win percentage and Pythagorean expectation in the first half of the season?"
)
display( half1.sort_values("dev", ascending=True).head() )
print("Leicester")


print(
    "Which of the following teams had the highest value for away wins (awinvalue) for in the first half of the season?"
)
# Sorted in descending order, so the highest values are at the head of the
# frame (the tail would show the lowest ones).
display( half1.sort_values("awinvalue", ascending=False).head() )

half2['gap'] = abs(half2['hwinvalue'] - half2['awinvalue'])
print(
    "Which team had the largest gap between home points won (hwinvalue) and away points won (awinvalue) in the second half the season?"
    + "\n" +
    half2.sort_values("gap", ascending=False).iloc[0]["Team"]
)

# Correlations are computed once on the numeric columns only (the team name
# column cannot be correlated) and looked up by label, so the result does not
# depend on the column order of the matrix.
corr_matrix = values.select_dtypes(include="number").corr()

print(
    "What was the correlation between win percentage and the Pythagorean expectation in the first half of the season?"
)
display(
    round(corr_matrix.loc["wpc1", "pyth1"], 3)
)


print(
    "What was the correlation between win percentage in the first half of the season and the second half of the season?"
)
display(
    round(corr_matrix.loc["wpc1", "wpc2"], 3)
)

print(
    "What was the correlation between win percentage in the second half of the season and the Pythagorean expectation in the first half of the season?"
)
display(
    round(corr_matrix.loc["wpc2", "pyth1"], 3)
)