In [1]:
### This notebook is based on my kaggle notebook
### https://www.kaggle.com/abzalseitkaziyev/trueskill-ranking-exercise

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import trueskill # trueskill ranking

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

/kaggle/input/english-premier-league-results/EPL.csv


# TrueSkill Ranking on English Premier League Dataset

In this notebook, I applied the [TrueSkill](https://trueskill.org/) rating system to calculate rankings of English Premier League teams. The data is sourced from [Kaggle datasets](https://www.kaggle.com/irkaal/english-premier-league-results#EPL.csv). 
My objective was to learn and implement TrueSkill ranking using real data.

TrueSkill is a rating system developed by Microsoft to rank and match players. It builds on the Elo rating system but is more flexible; for example, it can also rank multiplayer games.

The ranking scale starts at 0 and goes up to 50 points. The TrueSkill rank is calculated as Score = mu - 3*sigma and can be considered a conservative estimate of a player's skill, as the system is 99% confident that the player's true skill is higher than the calculated Score.

The default starting values are mu = 25 and sigma = 8.33. For this exercise I will use Score = mu - 2*sigma instead, so the starting score will be 8.33 rather than 0.

### Read in data.

In [2]:
# Load the full match history from the Kaggle EPL dataset.
df = pd.read_csv('../input/english-premier-league-results/EPL.csv')
# Preview the first and last five rows.
# DataFrame.append was removed in pandas 2.0, so stitch the two slices with pd.concat.
pd.concat([df.head(), df.tail()])

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,HST,AST,HC,AC,HF,AF,HY,AY,HR,AR
0,19/08/00,Charlton,Man City,4,0,H,2,0,H,Rob Harris,...,14,4,6,6,13,12,1,2,0,0
1,19/08/00,Chelsea,West Ham,4,2,H,1,0,H,Graham Barber,...,10,5,7,7,19,14,1,2,0,0
2,19/08/00,Coventry,Middlesbrough,1,3,A,1,1,D,Barry Knight,...,3,9,8,4,15,21,5,3,1,0
3,19/08/00,Derby,Southampton,2,2,D,1,2,A,Andy D'Urso,...,4,6,5,8,11,13,1,1,0,0
4,19/08/00,Leeds,Everton,2,0,H,2,0,H,Dermot Gallagher,...,8,6,6,4,21,20,1,3,0,0
7381,07/03/2020,Wolves,Brighton,0,0,D,0,0,D,A Marriner,...,1,1,2,0,4,7,1,3,0,0
7382,07/03/2020,Burnley,Tottenham,1,1,D,1,0,H,J Moss,...,8,2,3,5,16,11,5,4,0,0
7383,08/03/2020,Chelsea,Everton,4,0,H,2,0,H,K Friend,...,11,1,6,1,8,10,1,2,0,0
7384,08/03/2020,Man United,Man City,2,0,H,1,0,H,M Dean,...,6,2,2,11,11,9,2,4,0,0
7385,09/03/2020,Leicester,Aston Villa,4,0,H,1,0,H,M Oliver,...,7,1,9,0,15,12,2,1,0,0


In [3]:
# Inspect the column labels of the loaded frame.
df.columns

Index(['Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HTHG', 'HTAG',
       'HTR', 'Referee', 'HS', 'AS', 'HST', 'AST', 'HC', 'AC', 'HF', 'AF',
       'HY', 'AY', 'HR', 'AR'],
      dtype='object')

### Keep only the columns required for the TrueSkill ranking implementation

In [4]:
# Narrow the frame to the match date, the two teams and the full-time result.
keep_cols = ['Date', 'HomeTeam', 'AwayTeam', 'FTR']
df = df.loc[:, keep_cols].copy()
df.head()

Unnamed: 0,Date,HomeTeam,AwayTeam,FTR
0,19/08/00,Charlton,Man City,H
1,19/08/00,Chelsea,West Ham,H
2,19/08/00,Coventry,Middlesbrough,A
3,19/08/00,Derby,Southampton,D
4,19/08/00,Leeds,Everton,H


In [5]:
 # Sanity check: the set of away teams must equal the set of home teams.
 set(df['AwayTeam'].unique())==set(df['HomeTeam'].unique())

True

### 'FTR' - Full Time Result (H=Home Win, D=Draw, A=Away Win)
#### TrueSkill takes ranks where lower is better: Win=0, Loss=1, and in a draw both teams get 0. Implement this accordingly.

In [6]:
# Encode the full-time result ('H' home win, 'D' draw, 'A' away win) as a
# TrueSkill rank for each side: winner -> 0, loser -> 1, draw -> 0 for both.
HOME_RANK_BY_FTR = {'H': 0, 'D': 0, 'A': 1}
AWAY_RANK_BY_FTR = {'H': 1, 'D': 0, 'A': 0}

# Series.map builds the integer columns directly. The previous approach copied
# the string column and relied on an in-place replace() to downcast
# object -> int, an implicit conversion that recent pandas deprecates.
# Any unexpected result code becomes NaN here, so a null check will expose it.
df['Result_HomeTeam'] = df['FTR'].map(HOME_RANK_BY_FTR)
df['Result_AwayTeam'] = df['FTR'].map(AWAY_RANK_BY_FTR)
df.head()

Unnamed: 0,Date,HomeTeam,AwayTeam,FTR,Result_HomeTeam,Result_AwayTeam
0,19/08/00,Charlton,Man City,H,0,1
1,19/08/00,Chelsea,West Ham,H,0,1
2,19/08/00,Coventry,Middlesbrough,A,1,0
3,19/08/00,Derby,Southampton,D,0,0
4,19/08/00,Leeds,Everton,H,0,1


In [7]:
# Count missing values in each column.
df.isna().sum()

Date               0
HomeTeam           0
AwayTeam           0
FTR                0
Result_HomeTeam    0
Result_AwayTeam    0
dtype: int64

In [8]:
# A single TrueSkill environment built with the library's default settings.
ts = trueskill.TrueSkill()

# Every club that appears as a home side.
teams = df['HomeTeam'].unique().tolist()

# One fresh default rating (mu=25, sigma=8.33) per club ...
ranking = [ts.create_rating() for _ in teams]

# ... keyed by club name so ratings can be looked up and replaced match by match.
all_ranks_dict = {club: start for club, start in zip(teams, ranking)}
all_ranks_dict

{'Charlton': trueskill.Rating(mu=25.000, sigma=8.333),
 'Chelsea': trueskill.Rating(mu=25.000, sigma=8.333),
 'Coventry': trueskill.Rating(mu=25.000, sigma=8.333),
 'Derby': trueskill.Rating(mu=25.000, sigma=8.333),
 'Leeds': trueskill.Rating(mu=25.000, sigma=8.333),
 'Leicester': trueskill.Rating(mu=25.000, sigma=8.333),
 'Liverpool': trueskill.Rating(mu=25.000, sigma=8.333),
 'Sunderland': trueskill.Rating(mu=25.000, sigma=8.333),
 'Tottenham': trueskill.Rating(mu=25.000, sigma=8.333),
 'Man United': trueskill.Rating(mu=25.000, sigma=8.333),
 'Arsenal': trueskill.Rating(mu=25.000, sigma=8.333),
 'Bradford': trueskill.Rating(mu=25.000, sigma=8.333),
 'Ipswich': trueskill.Rating(mu=25.000, sigma=8.333),
 'Middlesbrough': trueskill.Rating(mu=25.000, sigma=8.333),
 'Everton': trueskill.Rating(mu=25.000, sigma=8.333),
 'Man City': trueskill.Rating(mu=25.000, sigma=8.333),
 'Newcastle': trueskill.Rating(mu=25.000, sigma=8.333),
 'Southampton': trueskill.Rating(mu=25.000, sigma=8.333),
 'We

In [9]:
# Fixtures, one row per match: [home_team, away_team].
home_team = df['HomeTeam'].values
away_team = df['AwayTeam'].values
match_array = np.column_stack((home_team, away_team))

# TrueSkill ranks in the same row order: [home_rank, away_rank].
home_team_rank = df['Result_HomeTeam'].values
away_team_rank = df['Result_AwayTeam'].values
ts_ranks = np.column_stack((home_team_rank, away_team_rank))

In [10]:
def rating(mu, sigma, n_sigma=2):
    """
    Conservative TrueSkill score: ``mu - n_sigma * sigma``.

    Parameters
    ----------
    mu : number or array-like
        Mean of the skill estimate (``ts_rating.mu``).
    sigma : number or array-like
        Standard deviation of the skill estimate (``ts_rating.sigma``).
    n_sigma : number, default 2
        How many standard deviations to subtract from the mean.

    Returns
    -------
    The lower bound ``mu - n_sigma * sigma``, computed element-wise when the
    inputs are arrays or Series.

    For a normally distributed skill, the true skill lies above this value
    with about 97.7% confidence for the default ``n_sigma=2``, and about
    99.9% for ``n_sigma=3``.
    """
    return mu - n_sigma * sigma

In [11]:
# Replay every match in row order, recording both sides' conservative scores
# immediately before and immediately after each match.
# NOTE: this cell mutates all_ranks_dict. Re-create the initial ratings before
# re-running it, otherwise the whole match history is applied a second time.
curr_ranks_list = []
new_ranks_list = []
for (home_name, away_name), match_ranks in zip(match_array, ts_ranks):

    # Ratings going into the match. Distinct names are used so the loop does
    # not overwrite the home_team_rank / away_team_rank result arrays.
    home_before = all_ranks_dict[home_name]
    away_before = all_ranks_dict[away_name]

    curr_ranks_list.append([rating(home_before.mu, home_before.sigma),
                            rating(away_before.mu, away_before.sigma)])

    # 1-vs-1 update: each club is passed as a one-member team, and the
    # per-match ranks (0 = win or draw, 1 = loss) decide the outcome.
    (home_after,), (away_after,) = ts.rate([(home_before,), (away_before,)],
                                           ranks=match_ranks)

    new_ranks_list.append([rating(home_after.mu, home_after.sigma),
                           rating(away_after.mu, away_after.sigma)])

    # Carry the updated ratings forward to each club's next match.
    all_ranks_dict[home_name] = home_after
    all_ranks_dict[away_name] = away_after

In [12]:
### Combine results in one dataframe
rank_before = pd.DataFrame(curr_ranks_list, columns=['Rank_HT_Before', 'Rank_AT_Before'])
rank_after = pd.DataFrame(new_ranks_list, columns=['Rank_HT_After', 'Rank_AT_After'])

# Attach the before/after scores column-wise, then drop the helper rank
# columns that were only needed as input to the TrueSkill update.
df = (
    pd.concat([df, rank_before, rank_after], axis=1)
    .drop(columns=['Result_HomeTeam', 'Result_AwayTeam'])
)

# Preview the first 5 and last 10 rows.
# DataFrame.append was removed in pandas 2.0, so use pd.concat instead.
pd.concat([df.head(), df.tail(10)])

Unnamed: 0,Date,HomeTeam,AwayTeam,FTR,Rank_HT_Before,Rank_AT_Before,Rank_HT_After,Rank_AT_After
0,19/08/00,Charlton,Man City,H,8.333333,8.333333,15.052881,6.261217
1,19/08/00,Chelsea,West Ham,H,8.333333,8.333333,15.052881,6.261217
2,19/08/00,Coventry,Middlesbrough,A,8.333333,8.333333,6.261217,15.052881
3,19/08/00,Derby,Southampton,D,8.333333,8.333333,12.084961,12.084961
4,19/08/00,Leeds,Everton,H,8.333333,8.333333,15.052881,6.261217
7376,07/03/2020,Liverpool,Bournemouth,H,28.65635,22.457023,28.688564,22.420844
7377,07/03/2020,Arsenal,West Ham,H,25.568732,23.071825,25.627533,23.008133
7378,07/03/2020,Crystal Palace,Watford,H,23.335966,22.566208,23.413565,22.485614
7379,07/03/2020,Sheffield United,Norwich,H,23.03139,21.665706,23.144271,21.59416
7380,07/03/2020,Southampton,Newcastle,A,23.094105,22.903115,23.004433,22.990751


In [13]:
### Latest TrueSkill state per team, highest conservative score first.
latest = pd.DataFrame(all_ranks_dict).T
latest.columns = ['mu', 'sigma']
latest = latest.assign(rank=rating(latest['mu'], latest['sigma']))
latest.sort_values('rank', ascending=False)

Unnamed: 0,mu,sigma,rank
Liverpool,30.305334,0.808385,28.688564
Man City,29.88498,0.809154,28.266673
Chelsea,27.706053,0.773685,26.158682
Man United,27.506316,0.76379,25.978736
Tottenham,27.342883,0.776718,25.789446
Arsenal,27.150419,0.761443,25.627533
Leicester,25.865696,0.770235,24.325226
Everton,25.630737,0.762115,24.106506
Wolves,25.022345,0.759681,23.502983
Crystal Palace,24.943656,0.765046,23.413565
