In [1]:
### This notebook is based on my kaggle notebook
### https://www.kaggle.com/abzalseitkaziyev/trueskill-ranking-exercise

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import trueskill # trueskill ranking

from scipy.stats import poisson


# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

# 1. TrueSkill Ranking on English Premier League Dataset

In this notebook, I apply the [TrueSkill](https://trueskill.org/) rating system to rank English Premier League teams. The data is sourced from [Kaggle datasets](https://www.kaggle.com/irkaal/english-premier-league-results#EPL.csv). 
My objective was to learn and implement TrueSkill ranking using real data.

TrueSkill is a rating system developed by Microsoft to rank and match players. It is based on the Elo rating system but is more flexible; for example, it can also rank multiplayer games.

The ranking starts at 0 and goes up to 50 points. The TrueSkill rank is calculated as Score = mu - 3*sigma and can be considered a conservative estimate of a player's skill: the system is more than 99% confident that the player's true skill is higher than the calculated Score. 

Starting values by default: mu = 25, sigma = 8.33. This exercise uses the default conservative score (mu - 3*sigma), so every team starts with a score of 0 (a score of mu - 2*sigma would instead start at 8.33).

### Read in data.

In [3]:
# df = pd.read_csv('../input/english-premier-league-results/EPL.csv')
df = pd.read_csv('data/results.csv')
# Preview the first and last three rows. DataFrame.append was removed in
# pandas 2.0, so the two slices are stacked with pd.concat instead.
pd.concat([df.head(3), df.tail(3)])

Unnamed: 0,Season,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,HST,AST,HC,AC,HF,AF,HY,AY,HR,AR
0,1993-94,1993-08-14 00:00:00,Arsenal,Coventry,0,3,A,,,,...,,,,,,,,,,
1,1993-94,1993-08-14 00:00:00,Aston Villa,QPR,4,1,H,,,,...,,,,,,,,,,
2,1993-94,1993-08-14 00:00:00,Chelsea,Blackburn,1,2,A,,,,...,,,,,,,,,,
10421,2019-20,2020-07-26 16:00:00,Newcastle,Liverpool,1,3,A,1.0,1.0,D,...,2.0,6.0,2.0,4.0,11.0,5.0,1.0,0.0,0.0,0.0
10422,2019-20,2020-07-26 16:00:00,Southampton,Sheffield United,3,1,H,0.0,1.0,A,...,4.0,3.0,9.0,1.0,9.0,16.0,0.0,1.0,0.0,0.0
10423,2019-20,2020-07-26 16:00:00,West Ham,Aston Villa,1,1,D,0.0,0.0,D,...,1.0,4.0,0.0,7.0,16.0,13.0,2.0,1.0,0.0,0.0


In [4]:
# Inspect the column labels of the loaded results table.
df.columns

Index(['Season', 'Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HTHG',
       'HTAG', 'HTR', 'Referee', 'HS', 'AS', 'HST', 'AST', 'HC', 'AC', 'HF',
       'AF', 'HY', 'AY', 'HR', 'AR'],
      dtype='object')

### Leave columns required for TrueSkill ranking implementation   

In [5]:
# Keep only the match identifiers, the full-time result and the full-time goals.
df = df.loc[:, ['Season', 'Date', 'HomeTeam', 'AwayTeam', 'FTR', 'FTHG', 'FTAG']].copy()
df.head()

Unnamed: 0,Season,Date,HomeTeam,AwayTeam,FTR,FTHG,FTAG
0,1993-94,1993-08-14 00:00:00,Arsenal,Coventry,A,0,3
1,1993-94,1993-08-14 00:00:00,Aston Villa,QPR,H,4,1
2,1993-94,1993-08-14 00:00:00,Chelsea,Blackburn,A,1,2
3,1993-94,1993-08-14 00:00:00,Liverpool,Sheffield Weds,H,2,0
4,1993-94,1993-08-14 00:00:00,Man City,Leeds,D,1,1


In [6]:
# Last five rows of the reduced frame.
df.tail()

Unnamed: 0,Season,Date,HomeTeam,AwayTeam,FTR,FTHG,FTAG
10419,2019-20,2020-07-26 16:00:00,Leicester,Man United,A,0,2
10420,2019-20,2020-07-26 16:00:00,Man City,Norwich,H,5,0
10421,2019-20,2020-07-26 16:00:00,Newcastle,Liverpool,A,1,3
10422,2019-20,2020-07-26 16:00:00,Southampton,Sheffield United,H,3,1
10423,2019-20,2020-07-26 16:00:00,West Ham,Aston Villa,D,1,1


In [7]:
# Sanity check: the teams that appear as away side are exactly the teams that appear as home side.
set(df['AwayTeam'].unique())==set(df['HomeTeam'].unique())

True

### 'FTR' - Full Time Result (H=Home Win, D=Draw, A=Away Win)
#### TrueSkill takes ranks where lower is better: Win=0, Loss=1; a draw gives both teams rank 0. Encode FTR accordingly.  

In [8]:
# Encode the full-time result as one TrueSkill rank per side:
# winner -> 0, loser -> 1, and a draw gives both sides rank 0.
# Series.map builds integer columns directly; this avoids the whole-frame
# in-place replace(), whose implicit object -> int downcast is deprecated.
# Note: an FTR value other than 'H', 'D' or 'A' maps to NaN.
df['Result_HomeTeam'] = df['FTR'].map({'H': 0, 'D': 0, 'A': 1})
df['Result_AwayTeam'] = df['FTR'].map({'H': 1, 'D': 0, 'A': 0})
df.head()

Unnamed: 0,Season,Date,HomeTeam,AwayTeam,FTR,FTHG,FTAG,Result_HomeTeam,Result_AwayTeam
0,1993-94,1993-08-14 00:00:00,Arsenal,Coventry,A,0,3,1,0
1,1993-94,1993-08-14 00:00:00,Aston Villa,QPR,H,4,1,0,1
2,1993-94,1993-08-14 00:00:00,Chelsea,Blackburn,A,1,2,1,0
3,1993-94,1993-08-14 00:00:00,Liverpool,Sheffield Weds,H,2,0,0,1
4,1993-94,1993-08-14 00:00:00,Man City,Leeds,D,1,1,0,0


In [9]:
# One frame per season: 2017-18 feeds the historical ratings and strengths,
# 2018-19 is the season the probabilities are computed for.
df_1 = df[df['Season'].eq('2017-18')]
df_2 = df[df['Season'].eq('2018-19')]

In [10]:
# df.isna().sum()

In [11]:
# Renumber both season frames from 0; the old row labels are discarded.
df_1 = df_1.reset_index(drop=True)
df_2 = df_2.reset_index(drop=True)

In [12]:
### use df_1 for historical probabilities and True_skill ratings
### use df_2 for calculating odds and current True_skill ratings

In [13]:
# Three team-set comparisons: away vs. home teams within each season,
# then the home teams of the two seasons against each other.
[
    set(left.unique()) == set(right.unique())
    for left, right in (
        (df_1['AwayTeam'], df_1['HomeTeam']),
        (df_2['AwayTeam'], df_2['HomeTeam']),
        (df_1['HomeTeam'], df_2['HomeTeam']),
    )
]

[True, True, False]

In [14]:
# print(set(df_1['HomeTeam'].unique()), set(df_2['HomeTeam'].unique()))
# Show the home-team names of both seasons side by side.
df_1['HomeTeam'].unique(), df_2['HomeTeam'].unique()

(array(['Arsenal', 'Brighton', 'Chelsea', 'Crystal Palace', 'Everton',
        'Southampton', 'Watford', 'West Brom', 'Man United', 'Newcastle',
        'Bournemouth', 'Burnley', 'Leicester', 'Liverpool', 'Stoke',
        'Swansea', 'Huddersfield', 'Tottenham', 'Man City', 'West Ham'],
       dtype=object),
 array(['Man United', 'Bournemouth', 'Fulham', 'Huddersfield', 'Newcastle',
        'Watford', 'Wolves', 'Arsenal', 'Liverpool', 'Southampton',
        'Cardiff', 'Chelsea', 'Everton', 'Leicester', 'Tottenham',
        'West Ham', 'Brighton', 'Burnley', 'Man City', 'Crystal Palace'],
       dtype=object))

In [15]:
# Boolean masks over each season's unique home teams:
# True where the team does NOT appear in the other season.
comparison_1 = ~np.isin(df_1['HomeTeam'].unique(), df_2['HomeTeam'].unique())
comparison_2 = ~np.isin(df_2['HomeTeam'].unique(), df_1['HomeTeam'].unique())

In [16]:
## list the teams that appear in only one of the two seasons (df_1 vs. df_2)
[
    df_1['HomeTeam'].unique()[comparison_1], # eliminated teams: in df_1 but not in df_2
    df_2['HomeTeam'].unique()[comparison_2] # new teams in the new season: in df_2 but not in df_1
]

[array(['West Brom', 'Stoke', 'Swansea'], dtype=object),
 array(['Fulham', 'Wolves', 'Cardiff'], dtype=object)]

In [17]:
ts = trueskill.TrueSkill()
teams = df_1['HomeTeam'].unique().tolist()

# One default rating (mu=25, sigma=8.33) per team
ranking = [ts.create_rating() for _ in teams]

# Team name -> its initial rating
all_ranks_dict = dict(zip(teams, ranking))
all_ranks_dict

{'Arsenal': trueskill.Rating(mu=25.000, sigma=8.333),
 'Brighton': trueskill.Rating(mu=25.000, sigma=8.333),
 'Chelsea': trueskill.Rating(mu=25.000, sigma=8.333),
 'Crystal Palace': trueskill.Rating(mu=25.000, sigma=8.333),
 'Everton': trueskill.Rating(mu=25.000, sigma=8.333),
 'Southampton': trueskill.Rating(mu=25.000, sigma=8.333),
 'Watford': trueskill.Rating(mu=25.000, sigma=8.333),
 'West Brom': trueskill.Rating(mu=25.000, sigma=8.333),
 'Man United': trueskill.Rating(mu=25.000, sigma=8.333),
 'Newcastle': trueskill.Rating(mu=25.000, sigma=8.333),
 'Bournemouth': trueskill.Rating(mu=25.000, sigma=8.333),
 'Burnley': trueskill.Rating(mu=25.000, sigma=8.333),
 'Leicester': trueskill.Rating(mu=25.000, sigma=8.333),
 'Liverpool': trueskill.Rating(mu=25.000, sigma=8.333),
 'Stoke': trueskill.Rating(mu=25.000, sigma=8.333),
 'Swansea': trueskill.Rating(mu=25.000, sigma=8.333),
 'Huddersfield': trueskill.Rating(mu=25.000, sigma=8.333),
 'Tottenham': trueskill.Rating(mu=25.000, sigma=8.33

In [18]:
# Two-column arrays with one row per match: column 0 = home side, column 1 = away side.
home_team_rank = df_1['Result_HomeTeam'].values
away_team_rank = df_1['Result_AwayTeam'].values
ts_ranks = np.column_stack((home_team_rank, away_team_rank))  # TrueSkill ranks per match

home_team = df_1['HomeTeam'].values
away_team = df_1['AwayTeam'].values
match_array = np.column_stack((home_team, away_team))  # team names per match

In [19]:
def rating(mu, sigma, k=3):
    """
    Conservative TrueSkill score: mu - k * sigma.

    mu and sigma from TrueSkill:
    mu = ts_rating.mu;
    sigma = ts_rating.sigma

    k is the number of standard deviations subtracted from mu. The default
    of 3 gives the usual TrueSkill score, which is 0 for a default rating
    (mu=25, sigma=25/3).

    For a normally distributed skill belief the 'real score' lies above the
    returned value with about 99.9% confidence when k=3 (about 97.7% when k=2).

    The arithmetic is elementwise, so numbers, numpy arrays and pandas
    Series are all accepted.
    """
    return mu - k * sigma

In [20]:
# Replay the season match by match, recording each side's score
# before the game (curr_ranks_list) and after it (new_ranks_list).

# Start from fresh default ratings so that re-running this cell does not
# stack a second pass over the season on top of already-updated ratings.
all_ranks_dict = {team: ts.create_rating() for team in teams}

curr_ranks_list = []
new_ranks_list = []
# Loop names are distinct from home_team_rank / away_team_rank so the
# result arrays built earlier are not overwritten with Rating objects.
for (home_name, away_name), match_ranks in zip(match_array, ts_ranks):

    # ratings before the game
    home_before = all_ranks_dict[home_name]
    away_before = all_ranks_dict[away_name]
    curr_ranks_list.append([rating(home_before.mu, home_before.sigma),
                            rating(away_before.mu, away_before.sigma)])

    # ratings after the game: each side is passed as a one-player team,
    # and match_ranks holds the (home, away) ranks for this match
    (home_after,), (away_after,) = ts.rate([(home_before,), (away_before,)],
                                           ranks=match_ranks)
    new_ranks_list.append([rating(home_after.mu, home_after.sigma),
                           rating(away_after.mu, away_after.sigma)])

    # carry the updated ratings forward to the teams' next matches
    all_ranks_dict[home_name] = home_after
    all_ranks_dict[away_name] = away_after

In [21]:
### Attach the before/after scores of every match and drop the helper rank columns
df_1 = pd.concat(
    [
        df_1,
        pd.DataFrame(curr_ranks_list, columns=['Rank_HT_Before', 'Rank_AT_Before']),
        pd.DataFrame(new_ranks_list, columns=['Rank_HT_After', 'Rank_AT_After']),
    ],
    axis=1,
).drop(columns=['Result_HomeTeam', 'Result_AwayTeam'])

In [22]:
# First 5 and last 10 matches; pd.concat replaces DataFrame.append,
# which was removed in pandas 2.0.
pd.concat([df_1.head(), df_1.tail(10)])

Unnamed: 0,Season,Date,HomeTeam,AwayTeam,FTR,FTHG,FTAG,Rank_HT_Before,Rank_AT_Before,Rank_HT_After,Rank_AT_After
0,2017-18,2017-08-11 00:00:00,Arsenal,Leicester,H,4,3,0.0,0.0,7.881405,-0.910259
1,2017-18,2017-08-12 00:00:00,Brighton,Man City,A,0,2,0.0,0.0,-0.910259,7.881405
2,2017-18,2017-08-12 00:00:00,Chelsea,Burnley,A,2,3,0.0,0.0,-0.910259,7.881405
3,2017-18,2017-08-12 00:00:00,Crystal Palace,Huddersfield,A,0,3,0.0,0.0,-0.910259,7.881405
4,2017-18,2017-08-12 00:00:00,Everton,Stoke,H,1,0,0.0,0.0,7.881405,-0.910259
370,2017-18,2018-05-13 00:00:00,Burnley,Bournemouth,A,1,2,21.690557,19.953549,21.455776,20.299764
371,2017-18,2018-05-13 00:00:00,Crystal Palace,West Brom,H,2,0,20.673873,18.991312,20.912038,18.848383
372,2017-18,2018-05-13 00:00:00,Huddersfield,Arsenal,A,0,1,19.26811,22.219324,19.150467,22.434173
373,2017-18,2018-05-13 00:00:00,Liverpool,Brighton,H,4,0,24.292871,20.104338,24.462425,20.01624
374,2017-18,2018-05-13 00:00:00,Man United,Watford,H,1,0,24.887357,19.018903,25.028712,18.952724


In [23]:
### Final rating of every team after the last replayed match, best score first.
latest = pd.DataFrame(all_ranks_dict).T
latest.columns = ['mu', 'sigma']
latest['rank'] = rating(latest['mu'], latest['sigma'])
latest.sort_values('rank', ascending=False)

Unnamed: 0,mu,sigma,rank
Man City,32.691867,1.764222,27.399202
Man United,29.147768,1.373019,25.028712
Liverpool,28.401757,1.313111,24.462425
Tottenham,28.423951,1.339461,24.405568
Chelsea,27.161311,1.332429,23.164023
Arsenal,26.438771,1.334866,22.434173
Burnley,25.264701,1.269642,21.455776
Crystal Palace,24.784914,1.290959,20.912038
Everton,24.366481,1.274795,20.542096
Leicester,24.313906,1.26756,20.511227


#  2. Poisson Probabilities in Soccer


### Prepare df_2 

In [24]:
## list the teams that appear in only one of the two seasons (df_1 vs. df_2)
[
    df_1['HomeTeam'].unique()[comparison_1], # eliminated teams: in df_1 but not in df_2
    df_2['HomeTeam'].unique()[comparison_2] # new teams in the new season: in df_2 but not in df_1
]

[array(['West Brom', 'Stoke', 'Swansea'], dtype=object),
 array(['Fulham', 'Wolves', 'Cardiff'], dtype=object)]

In [25]:
# df_2.head(25)

In [26]:
# Number of distinct home teams in df_2.
len(df_2['HomeTeam'].unique())

20

In [27]:
# Keep only the first 20 rows (matches) of df_2, as an independent copy.
# NOTE: this overwrites df_2, so re-running earlier df_2 cells after this one sees the truncated frame.
df_2 = df_2.head(20).copy()

In [28]:
# First and last three rows; pd.concat replaces DataFrame.append,
# which was removed in pandas 2.0.
pd.concat([df_2.head(3), df_2.tail(3)])

Unnamed: 0,Season,Date,HomeTeam,AwayTeam,FTR,FTHG,FTAG,Result_HomeTeam,Result_AwayTeam
0,2018-19,2018-08-10 00:00:00,Man United,Leicester,H,2,1,0,1
1,2018-19,2018-08-11 00:00:00,Bournemouth,Cardiff,H,2,0,0,1
2,2018-19,2018-08-11 00:00:00,Fulham,Crystal Palace,A,0,2,1,0
17,2018-19,2018-08-19 00:00:00,Burnley,Watford,A,1,3,1,0
18,2018-19,2018-08-19 00:00:00,Man City,Huddersfield,H,6,1,0,1
19,2018-19,2018-08-20 00:00:00,Crystal Palace,Liverpool,A,0,2,1,0


In [29]:
# Drop every match involving one of the teams that are new in 2018-19:
# they have no rows in df_1, so no strengths can be computed for them.
promoted_teams = ['Fulham', 'Wolves', 'Cardiff']
involves_promoted = (df_2['HomeTeam'].isin(promoted_teams)
                     | df_2['AwayTeam'].isin(promoted_teams))
# .copy() makes df_2 an independent frame, so the later in-place drop and
# the .loc column assignments do not raise SettingWithCopyWarning.
df_2 = df_2[~involves_promoted].copy()

In [30]:
# Renumber the remaining rows 0..n-1 (the old index is discarded); later cells index df_2 by these labels.
df_2.reset_index(drop=True, inplace=True)

In [31]:
# The TrueSkill rank helper columns are not needed for the Poisson part.
df_2 = df_2.drop(columns=['Result_HomeTeam', 'Result_AwayTeam'])
df_2

Unnamed: 0,Season,Date,HomeTeam,AwayTeam,FTR,FTHG,FTAG
0,2018-19,2018-08-10 00:00:00,Man United,Leicester,H,2,1
1,2018-19,2018-08-11 00:00:00,Huddersfield,Chelsea,A,0,3
2,2018-19,2018-08-11 00:00:00,Newcastle,Tottenham,A,1,2
3,2018-19,2018-08-11 00:00:00,Watford,Brighton,H,2,0
4,2018-19,2018-08-12 00:00:00,Arsenal,Man City,A,0,2
5,2018-19,2018-08-12 00:00:00,Liverpool,West Ham,H,4,0
6,2018-19,2018-08-12 00:00:00,Southampton,Burnley,D,0,0
7,2018-19,2018-08-18 00:00:00,Chelsea,Arsenal,H,3,2
8,2018-19,2018-08-18 00:00:00,Everton,Southampton,H,2,1
9,2018-19,2018-08-18 00:00:00,West Ham,Bournemouth,A,1,2


In [32]:
# Every team that appears in the remaining df_2 matches, home or away.
new_season_teams = set(df_2['HomeTeam']).union(df_2['AwayTeam'])

In [34]:
### Use df_1 (the 2017-18 season, now with the score columns) to calculate the goal statistics
df_1

Unnamed: 0,Season,Date,HomeTeam,AwayTeam,FTR,FTHG,FTAG,Rank_HT_Before,Rank_AT_Before,Rank_HT_After,Rank_AT_After
0,2017-18,2017-08-11 00:00:00,Arsenal,Leicester,H,4,3,0.000000,0.000000,7.881405,-0.910259
1,2017-18,2017-08-12 00:00:00,Brighton,Man City,A,0,2,0.000000,0.000000,-0.910259,7.881405
2,2017-18,2017-08-12 00:00:00,Chelsea,Burnley,A,2,3,0.000000,0.000000,-0.910259,7.881405
3,2017-18,2017-08-12 00:00:00,Crystal Palace,Huddersfield,A,0,3,0.000000,0.000000,-0.910259,7.881405
4,2017-18,2017-08-12 00:00:00,Everton,Stoke,H,1,0,0.000000,0.000000,7.881405,-0.910259
...,...,...,...,...,...,...,...,...,...,...,...
375,2017-18,2018-05-13 00:00:00,Newcastle,Chelsea,H,3,0,19.600322,23.494351,20.044268,23.164023
376,2017-18,2018-05-13 00:00:00,Southampton,Man City,A,0,1,19.801835,27.250417,19.767687,27.399202
377,2017-18,2018-05-13 00:00:00,Swansea,Stoke,A,1,2,18.458600,18.736212,18.261693,19.017639
378,2017-18,2018-05-13 00:00:00,Tottenham,Leicester,H,5,4,24.210816,20.612134,24.405568,20.511227


In [35]:
# len(df_1.FTHG)

### Poisson distribution applied to calculate the probabilities of a team's
### win/loss/draw and number of goals
### Attack Strength and Defense Strength

In [36]:
# League-wide average goals per match in df_1: home side first, then away side.
all_avg_h_scor = df_1['FTHG'].sum() / len(df_1)
all_avg_away_scor = df_1['FTAG'].sum() / len(df_1)

all_avg_h_scor, all_avg_away_scor

(1.5315789473684212, 1.1473684210526316)

In [37]:
# average number of goals conceded at home = average number of goals scored by away sides
all_avg_h_miss = all_avg_away_scor

# average number of goals conceded away = average number of goals scored by home sides
all_avg_away_miss = all_avg_h_scor

all_avg_h_miss,all_avg_away_miss

(1.1473684210526316, 1.5315789473684212)

In [38]:
# df_1[df_1['HomeTeam']=='Arsenal']

In [39]:
# df_1[df_1['AwayTeam']=='Arsenal']['FTAG'].sum()

In [40]:
# df_1['HomeTeam'].unique().tolist()

In [41]:
# Number of home matches Arsenal has in df_1.
len(df_1[df_1['HomeTeam']=='Arsenal'])

19

### HOME TEAM - ARSENAL

#### Attack & Defense Scores Home Team

In [42]:
### Home attack strength (Arsenal)

# average goals Arsenal scored per home match
team_avg_h_scor = (
    df_1.loc[df_1['HomeTeam'] == 'Arsenal', 'FTHG'].sum()
    / len(df_1[df_1['HomeTeam'] == 'Arsenal'])
)
# relative to the league-wide average of goals scored at home
attack_strength_h = team_avg_h_scor / all_avg_h_scor


### Home defense strength (Arsenal)

# average goals Arsenal conceded per home match (= goals scored by the away side)
team_avg_h_missed = (
    df_1.loc[df_1['HomeTeam'] == 'Arsenal', 'FTAG'].sum()
    / len(df_1[df_1['HomeTeam'] == 'Arsenal'])
)
# relative to the league-wide average of goals conceded at home
defense_strength_h = team_avg_h_missed / all_avg_h_miss

attack_strength_h, defense_strength_h

(1.8556701030927834, 0.9174311926605504)

### AWAY TEAM - ARSENAL

#### Attack & Defense Scores Away Team

In [43]:
# df_1[df_1['AwayTeam']=='Arsenal']

In [44]:

### Away attack strength (Arsenal)

# average goals Arsenal scored per away match
team_avg_away_scor = (
    df_1.loc[df_1['AwayTeam'] == 'Arsenal', 'FTAG'].sum()
    / len(df_1[df_1['AwayTeam'] == 'Arsenal'])
)
# relative to the league-wide average of goals scored away
attack_strength_away = team_avg_away_scor / all_avg_away_scor


### Away defense strength (Arsenal)

# average goals Arsenal conceded per away match (= goals scored by the home side)
team_avg_away_missed = (
    df_1.loc[df_1['AwayTeam'] == 'Arsenal', 'FTHG'].sum()
    / len(df_1[df_1['AwayTeam'] == 'Arsenal'])
)
# relative to the league-wide average of goals conceded away
defense_strength_away = team_avg_away_missed / all_avg_away_miss

attack_strength_away, defense_strength_away

(0.9174311926605504, 1.0652920962199313)

In [45]:
# df_1['HomeTeam'].unique(), df_2['HomeTeam'].unique()

#### Attack/Defense Strength for all teams

In [46]:
def strength():
    """
    Attack and defense strength of every team in new_season_teams,
    computed from the matches in df_1.

    Each strength is the team's average goals per match (scored for attack,
    conceded for defense; home and away matches handled separately) divided
    by the matching league-wide average (all_avg_h_scor, all_avg_h_miss,
    all_avg_away_scor, all_avg_away_miss).

    Reads the notebook globals df_1, new_season_teams and the four all_avg_*
    values. Returns a dict: team name -> dict with the keys
    'Attack_Strength_Home', 'Defense_Strength_Home',
    'Attack_Strength_Away' and 'Defense_Strength_Away'.

    A team with no home (or no away) rows in df_1 divides by zero.
    """
    results = {}

    for team in new_season_teams:
        # the team's matches in df_1, split by venue
        home_games = df_1[df_1['HomeTeam'] == team]
        away_games = df_1[df_1['AwayTeam'] == team]
        n_home = len(home_games)
        n_away = len(away_games)

        results[team] = {
            # goals scored at home per home match, vs. league home scoring
            'Attack_Strength_Home': (home_games['FTHG'].sum() / n_home) / all_avg_h_scor,
            # goals conceded at home per home match, vs. league home conceding
            'Defense_Strength_Home': (home_games['FTAG'].sum() / n_home) / all_avg_h_miss,
            # goals scored away per away match, vs. league away scoring
            'Attack_Strength_Away': (away_games['FTAG'].sum() / n_away) / all_avg_away_scor,
            # goals conceded away per away match, vs. league away conceding
            'Defense_Strength_Away': (away_games['FTHG'].sum() / n_away) / all_avg_away_miss,
        }

    return results

In [47]:
# One row per team with its name in a 'Team' column, followed by the four strength columns.
strength_df = (
    pd.DataFrame.from_dict(strength(), orient='index')
    .rename_axis('Team')
    .reset_index()
)
strength_df

Unnamed: 0,Team,Attack_Strength_Home,Defense_Strength_Home,Attack_Strength_Away,Defense_Strength_Away
0,Burnley,0.549828,0.779817,0.917431,0.756014
1,Man City,2.09622,0.642202,2.06422,0.446735
2,Leicester,0.859107,1.009174,1.422018,1.305842
3,Bournemouth,0.893471,1.376147,0.87156,1.065292
4,West Ham,0.824742,1.192661,1.100917,1.443299
5,Man United,1.305842,0.412844,1.376147,0.652921
6,Crystal Palace,0.996564,1.238532,0.733945,0.962199
7,Liverpool,1.546392,0.458716,1.788991,0.962199
8,Arsenal,1.85567,0.917431,0.917431,1.065292
9,Everton,0.962199,1.009174,0.733945,1.237113


In [48]:
# set(strength_df['Team']) == new_season_teams

In [49]:
def goals_probability(home, away, max_goals=11, strengths=None,
                      avg_home_goals=None, avg_away_goals=None):
    """
    Home-win, away-win and draw probabilities for one fixture, assuming the
    two teams' goal counts are independent Poisson variables.

    Parameters
    ----------
    home, away : team names as they appear in the 'Team' column.
    max_goals : largest goal count considered for each side (default 11).
        Score lines above this cap are ignored, so the three returned
        probabilities add up to slightly less than 1.
    strengths : table with the columns 'Team', 'Attack_Strength_Home',
        'Defense_Strength_Home', 'Attack_Strength_Away' and
        'Defense_Strength_Away'. Defaults to the notebook-level strength_df.
    avg_home_goals, avg_away_goals : league-wide average goals per match for
        home and away sides. Default to the notebook-level all_avg_h_scor
        and all_avg_away_scor.

    Returns
    -------
    (home_win_probability, away_win_probability, draw_probability)

    Raises
    ------
    KeyError if a team has no row in the strengths table.
    """
    # Fall back to the notebook globals so existing two-argument calls keep working.
    if strengths is None:
        strengths = strength_df
    if avg_home_goals is None:
        avg_home_goals = all_avg_h_scor
    if avg_away_goals is None:
        avg_away_goals = all_avg_away_scor

    def _team_row(team):
        # Strength row of one team; .iloc[0] avoids float() on a one-element Series.
        rows = strengths[strengths['Team'] == team]
        if rows.empty:
            raise KeyError('no strength data for team {!r}'.format(team))
        return rows.iloc[0]

    home_row = _team_row(home)
    away_row = _team_row(away)

    # Goal expectancy (the Poisson mean) of each side: own attack strength
    # times the opponent's defense strength times the league average.
    home_team_goal_exp = (float(home_row['Attack_Strength_Home'])
                          * float(away_row['Defense_Strength_Away'])
                          * avg_home_goals)
    away_team_goal_exp = (float(away_row['Attack_Strength_Away'])
                          * float(home_row['Defense_Strength_Home'])
                          * avg_away_goals)

    # Probability of each side scoring exactly 0..max_goals goals.
    goal_counts = np.arange(max_goals + 1)
    home_team_goal_prob = poisson.pmf(goal_counts, home_team_goal_exp)
    away_team_goal_prob = poisson.pmf(goal_counts, away_team_goal_exp)

    # joint[h, a] = P(home scores h goals and away scores a goals)
    joint = np.outer(home_team_goal_prob, away_team_goal_prob)

    draw = np.trace(joint)                      # h == a
    home_win_prob = np.tril(joint, k=-1).sum()  # h > a
    away_win_prob = np.triu(joint, k=1).sum()   # a > h

    return home_win_prob, away_win_prob, draw

In [50]:
# First three rows of the filtered df_2.
df_2.head(3)

Unnamed: 0,Season,Date,HomeTeam,AwayTeam,FTR,FTHG,FTAG
0,2018-19,2018-08-10 00:00:00,Man United,Leicester,H,2,1
1,2018-19,2018-08-11 00:00:00,Huddersfield,Chelsea,A,0,3
2,2018-19,2018-08-11 00:00:00,Newcastle,Tottenham,A,1,2


In [51]:
# Compute the three outcome probabilities for every df_2 fixture and store
# them in new columns (row labels are 0..n-1 after the earlier index reset).
for i, (home, away) in enumerate(zip(df_2['HomeTeam'], df_2['AwayTeam'])):
    p_home_win, p_away_win, p_draw = goals_probability(home, away)
    df_2.loc[i, 'Home_Win_Prob'] = p_home_win
    df_2.loc[i, 'Away_Win_Prob'] = p_away_win
    df_2.loc[i, 'Draw_Prob'] = p_draw

In [52]:
# df_2 with the Home_Win_Prob, Away_Win_Prob and Draw_Prob columns.
df_2

Unnamed: 0,Season,Date,HomeTeam,AwayTeam,FTR,FTHG,FTAG,Home_Win_Prob,Away_Win_Prob,Draw_Prob
0,2018-19,2018-08-10 00:00:00,Man United,Leicester,H,2,1,0.788351,0.073063,0.138567
1,2018-19,2018-08-11 00:00:00,Huddersfield,Chelsea,A,0,3,0.113835,0.681924,0.204241
2,2018-19,2018-08-11 00:00:00,Newcastle,Tottenham,A,1,2,0.203915,0.520686,0.275399
3,2018-19,2018-08-11 00:00:00,Watford,Brighton,H,2,0,0.529326,0.197906,0.272768
4,2018-19,2018-08-12 00:00:00,Arsenal,Man City,A,0,2,0.216995,0.579669,0.203333
5,2018-19,2018-08-12 00:00:00,Liverpool,West Ham,H,4,0,0.890482,0.032071,0.077213
6,2018-19,2018-08-12 00:00:00,Southampton,Burnley,D,0,0,0.235634,0.471455,0.292911
7,2018-19,2018-08-18 00:00:00,Chelsea,Arsenal,H,3,2,0.59096,0.169574,0.239466
8,2018-19,2018-08-18 00:00:00,Everton,Southampton,H,2,1,0.516855,0.224023,0.259122
9,2018-19,2018-08-18 00:00:00,West Ham,Bournemouth,A,1,2,0.402829,0.330471,0.266701


In [53]:
#[df_1['AwayTeam']=='Arsenal']
# https://www.dataquest.io/blog/settingwithcopywarning/
# df_2.loc[df_2['HomeTeam']=='Man United']['Attack_Strength_Home']

In [54]:
# book_odds = np.array([8.4,3,4.2,14,10,5.4,9.3,15])
# odds = (book_odds - 1)
# odds
# prob = 1/(odds+1)
# np.sum(prob)

In [55]:
# book_odds = np.array([8.4,3,4.2,14,10,5.4,9.3,15])
# prob = 1/(book_odds+1)
# np.sum(prob)

In [56]:
# 1.623 vs. 0.824
# poisson.pmf(0,1.623)

In [57]:
# poisson.pmf(0,0.824)*poisson.pmf(0,1.623)