In [287]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
''
# Load the dataset
df = pd.read_csv('../datasets/premierleague.csv')
df.head()

Unnamed: 0,home_team,away_team,home_goals,away_goals,result,season
0,Sheffield United,Liverpool,1.0,1.0,D,2006-2007
1,Arsenal,Aston Villa,1.0,1.0,D,2006-2007
2,Everton,Watford,2.0,1.0,H,2006-2007
3,Newcastle United,Wigan Athletic,2.0,1.0,H,2006-2007
4,Portsmouth,Blackburn Rovers,3.0,0.0,H,2006-2007


In [288]:
# Calculate the total number of games played and the number of games won by the home team
total_games = len(df)
home_wins = len(df[df['result'] == 'H'])
away_wins = len(df[df['result'] == 'A'])
home_wins, away_wins

(2108, 1288)

In [289]:
# Calculate the percentage of games won by the home team
home_win_percentage = len(df[df['result'] == 'H']) / total_games * 100
away_win_percentage = len(df[df['result'] == 'A']) / total_games * 100
drew_win_percentage = len(df[df['result'] == 'D']) / total_games * 100

# Print the results
print(f'Total games played: {total_games}')
print(f'Home wins: {home_wins}; Away wins: {away_wins}')
print(f'Home win percentage: {home_win_percentage:.2f}%')
print(f'Away win percentage: {away_win_percentage:.2f}%')
print(f'Drew percentage: {drew_win_percentage:.2f}%')


Total games played: 4560
Home wins: 2108; Away wins: 1288
Home win percentage: 46.23%
Away win percentage: 28.25%
Drew percentage: 25.53%


In [290]:
from statsmodels.stats.proportion import proportions_ztest

# Define the null hypothesis (H0)
null_hypothesis = 0.5

# Perform the one-sample proportion test
count = len(df[df['result'] == 'H'])
nobs = len(df['result'])
stat, pval = proportions_ztest(count, nobs, null_hypothesis)

# Print the results
print(pval)
if pval < 0.05:
    print("The difference in home win percentage is statistically significant")
else:
    print("The difference in home win percentage is not statistically significant")

3.242833517253633e-07
The difference in home win percentage is statistically significant


In [291]:
# Group the data by season and home team, and sum the goals scored
grouped = df.groupby(['season', 'home_team'])[['home_goals']].sum()

# Find the team with the highest number of home goals in each season
best_teams = grouped.groupby('season').apply(lambda x: x[x == x.max()].dropna())
best_teams

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,home_goals
season,season,home_team,Unnamed: 3_level_1
2006-2007,2006-2007,Manchester United,46.0
2007-2008,2007-2008,Manchester United,47.0
2008-2009,2008-2009,Manchester United,43.0
2009-2010,2009-2010,Chelsea,68.0
2010-2011,2010-2011,Manchester United,49.0
2011-2012,2011-2012,Manchester City,55.0
2012-2013,2012-2013,Arsenal,47.0
2013-2014,2013-2014,Manchester City,63.0
2014-2015,2014-2015,Manchester City,44.0
2015-2016,2015-2016,Manchester City,47.0


In [292]:
# Group the data by season and home team
grouped = df.groupby(['season', 'home_team'])
grouped.head()

Unnamed: 0,home_team,away_team,home_goals,away_goals,result,season
0,Sheffield United,Liverpool,1.0,1.0,D,2006-2007
1,Arsenal,Aston Villa,1.0,1.0,D,2006-2007
2,Everton,Watford,2.0,1.0,H,2006-2007
3,Newcastle United,Wigan Athletic,2.0,1.0,H,2006-2007
4,Portsmouth,Blackburn Rovers,3.0,0.0,H,2006-2007
...,...,...,...,...,...,...
4276,AFC Bournemouth,Chelsea,0.0,1.0,A,2017-2018
4277,Brighton and Hove Albion,Southampton,1.0,1.0,D,2017-2018
4278,Leicester City,Everton,2.0,0.0,H,2017-2018
4279,Burnley,Newcastle United,1.0,0.0,H,2017-2018


In [293]:
# Calculate the number of home games played, home wins, and home win percentage for each team in each season
home_games = grouped['home_goals'].count()
home_wins = grouped.apply(lambda x: (x['result'] == 'H').sum())
home_win_pct = home_wins / home_games * 100
home_win_pct

season     home_team           
2006-2007  Arsenal                 63.157895
           Aston Villa             36.842105
           Blackburn Rovers        47.368421
           Bolton Wanderers        47.368421
           Charlton Athletic       36.842105
                                     ...    
2017-2018  Swansea City            31.578947
           Tottenham Hotspur       68.421053
           Watford                 36.842105
           West Bromwich Albion    15.789474
           West Ham United         36.842105
Length: 240, dtype: float64

In [294]:
# Reset the index
home_win_pct = home_win_pct.reset_index()
home_win_pct = home_win_pct[['season','home_team',0]]
home_win_pct

Unnamed: 0,season,home_team,0
0,2006-2007,Arsenal,63.157895
1,2006-2007,Aston Villa,36.842105
2,2006-2007,Blackburn Rovers,47.368421
3,2006-2007,Bolton Wanderers,47.368421
4,2006-2007,Charlton Athletic,36.842105
...,...,...,...
235,2017-2018,Swansea City,31.578947
236,2017-2018,Tottenham Hotspur,68.421053
237,2017-2018,Watford,36.842105
238,2017-2018,West Bromwich Albion,15.789474


In [295]:
# Sort the results by home win percentage
home_win_pct = home_win_pct.sort_values(by=[0], ascending=False)
home_win_pct.head()

Unnamed: 0,season,home_team,0
91,2010-2011,Manchester United,94.736842
108,2011-2012,Manchester City,94.736842
216,2016-2017,Tottenham Hotspur,89.473684
31,2007-2008,Manchester United,89.473684
66,2009-2010,Chelsea,89.473684


In [296]:
# Perform one-hot encoding for home_team and away_team columns
home_team_encoded = pd.get_dummies(df['home_team'], prefix='home_team')
away_team_encoded = pd.get_dummies(df['away_team'], prefix='away_team')
result_encoded = pd.get_dummies(df['result'], prefix='result')
season_encoded = pd.get_dummies(df['season'], prefix='season')

# Concatenate the encoded columns with the original dataframe
df_encoded = pd.concat([df, home_team_encoded, away_team_encoded, result_encoded], axis=1)

# Drop the original home_team and away_team columns
df_encoded = df_encoded.drop(['home_team', 'away_team', 'result', 'season'], axis=1)

In [297]:
# Split the data into features (X) and target variable (y)
X = df_encoded.drop(['home_goals', 'away_goals'], axis=1)
y_home = df['home_goals']
y_away = df['away_goals']

In [298]:
# Prepare the home team and away team columns
df['home_team'] = pd.Categorical(df['home_team'])
df['away_team'] = pd.Categorical(df['away_team'])
df['season'] = pd.Categorical(df['season'])
df['result'] = pd.Categorical(df['result'])
df['home_team_code'] = df['home_team'].cat.codes
df['away_team_code'] = df['away_team'].cat.codes
df['season_code'] = df['season'].cat.codes
df['result_code'] = df['result'].cat.codes

# Define the features and target variables
X = df[['home_team_code', 'away_team_code', 'season_code', 'result_code']]
y_home = df['home_goals']
y_away = df['away_goals']
X.shape, y_home.shape, y_away.shape

((4560, 4), (4560,), (4560,))

In [299]:
# Import necessary libraries
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import confusion_matrix
import plotly.express as px
import plotly.figure_factory as ff

In [300]:
from sklearn.model_selection import train_test_split
# Split the data into training and testing sets
X_train, X_test, y_train_home, y_test_home, y_train_away, y_test_away = train_test_split(X, y_home, y_away, test_size=0.3, random_state=0)
X_train.columns

Index(['home_team_code', 'away_team_code', 'season_code', 'result_code'], dtype='object')

In [301]:
# Train the model for home goals
rf_home = RandomForestRegressor(n_estimators=600, max_depth=25, random_state=0)
rf_home.fit(X_train, y_train_home)

# Train the model for away goals
rf_away = RandomForestRegressor(n_estimators=200, random_state=0)
rf_away.fit(X_train, y_train_away)

# Make predictions on the test set
y_home_pred = rf_home.predict(X_test)
y_away_pred = rf_away.predict(X_test)

In [302]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Compute MSE, RMSE, MAE, and R2 for home goals
mse_home = mean_squared_error(y_test_home, y_home_pred)
rmse_home = mean_squared_error(y_test_home, y_home_pred, squared=False)
mae_home = mean_absolute_error(y_test_home, y_home_pred)
r2_home = r2_score(y_test_home, y_home_pred)

# Compute MSE, RMSE, MAE, and R2 for away goals
mse_away = mean_squared_error(y_test_away, y_away_pred)
rmse_away = mean_squared_error(y_test_away, y_away_pred, squared=False)
mae_away = mean_absolute_error(y_test_away, y_away_pred)
r2_away = r2_score(y_test_away, y_away_pred)

print(f"Home Goals - MSE: {mse_home:.2f}, RMSE: {rmse_home:.2f}, MAE: {mae_home:.2f}, R2: {r2_home:.2f}")
print(f"Away Goals - MSE: {mse_away:.2f}, RMSE: {rmse_away:.2f}, MAE: {mae_away:.2f}, R2: {r2_away:.2f}")

Home Goals - MSE: 1.24, RMSE: 1.11, MAE: 0.86, R2: 0.29
Away Goals - MSE: 0.93, RMSE: 0.96, MAE: 0.76, R2: 0.29


In [303]:
import plotly.graph_objs as go
def gen_cm_plotly(cm):
    # Calculate the percentage of success for each cell
    success_percentage = cm / cm.sum(axis=0) * 100

    # calculate various counts and rates
    # compute success percentages and format as strings with percentage symbol
    total_predictions = cm.sum(axis=1)
    success_percentage = np.zeros_like(cm, dtype=float)
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            if total_predictions[i] > 0:
                success_percentage[i,j] = cm[i,j] / total_predictions[i] * 100
    success_percentage = np.char.add(success_percentage.round(1).astype(str), '%')

    # create text labels for each cell
    text = [[f"Count: {cm[j][i]:,}<br>Success: {success_percentage[j][i]}" for i in range(len(cm[j]))] for j in range(len(cm))]

    # create heatmap figure
    fig = go.Figure(
        data=go.Heatmap(
            z=cm,
            x=[i for i in range(cm.shape[0])],
            colorscale='inferno', 
            text=text,
            hovertemplate='%{text}<extra></extra>'
        )
    )

    # add annotations as percentages to heatmap
    for i in range(len(cm)):
        for j in range(len(cm)):
            fig.add_annotation(x=j, y=i, text=success_percentage[i][j], showarrow=False, font=dict(color='grey', size=12))

    # set layout and show figure
    fig.update_layout(
        title='Confusion Matrix Goals',
        font=dict(color='black'),
    )
    fig.update_layout(
        xaxis_title='Predicted Event',
        yaxis_title='True Event',
        font=dict(
            size=14,
            color='black'
        )
    )
    fig.show()

In [304]:
# Evaluate the model using a confusion matrix
y_home_pred_rounded = y_home_pred.round()
y_away_pred_rounded = y_away_pred.round()

# calculate the confusion matrix
cm_home = confusion_matrix(y_test_home, y_home_pred_rounded)
cm_away = confusion_matrix(y_test_away, y_away_pred_rounded)

gen_cm_plotly(cm_home)
gen_cm_plotly(cm_away)


In [305]:
# Reverse the encoding for home_team_code
home_team_categories = df['home_team'].cat.categories
df['home_team_decoded'] = home_team_categories[df['home_team_code']]

# Reverse the encoding for away_team_code
away_team_categories = df['away_team'].cat.categories
df['away_team_decoded'] = away_team_categories[df['away_team_code']]

df[df['season_code'] > 10]['season_code'].unique()  #the new season code is this result +1

array([11], dtype=int8)

In [306]:
# Create a new data point for the game between Arsenal and Manchester United
new_predict_team_home_code = 1
new_predict_team_away_code = 21
new_game = pd.DataFrame({'home_team_code': new_predict_team_home_code,
                         'away_team_code': new_predict_team_away_code,
                         'season_code': 12,
                         'result_code': [2]
                         }, index=[0])

# Use the trained models to predict the number of goals scored by each team
new_predict_home = rf_home.predict(new_game)
new_predict_away = rf_away.predict(new_game)
df[df['home_team_code'] == new_predict_team_home_code]['home_team'].values[0], new_predict_home.round()[0], \
     'x', new_predict_away.round()[0], df[df['away_team_code'] == new_predict_team_away_code]['away_team'].values[0]

('Arsenal', 2.0, 'x', 0.0, 'Manchester United')

![](../imgs/ArsenalxManutd.JPG)

It worked! The prediction hit the game score in the future.

This match happened after the season of the dataset

In [307]:
# Create a new data point for the game between Manchester United and Arsenal
new_predict_team_home_code = 19
new_predict_team_away_code = 18
new_game = pd.DataFrame({'home_team_code': new_predict_team_home_code,
                         'away_team_code': new_predict_team_away_code,
                         'season_code': 12,
                         'result_code': [1]
                         }, index=[0])

# Use the trained models to predict the number of goals scored by each team
new_predict_home = rf_home.predict(new_game)
new_predict_away = rf_away.predict(new_game)
df[df['home_team_code'] == new_predict_team_home_code]['home_team'].values[0], new_predict_home.round()[0], \
     'x', new_predict_away.round()[0], df[df['away_team_code'] == new_predict_team_away_code]['away_team'].values[0]

('Liverpool', 1.0, 'x', 1.0, 'Leicester City')

![](../imgs/LiverpoolxLeicester.JPG)

It worked! The prediction hit the game score in the future.

This match happened after the season of the dataset

In [308]:
# Create a new data point for the game between Manchester United and Arsenal
new_predict_team_home_code = 19
new_predict_team_away_code = 11
new_game = pd.DataFrame({'home_team_code': new_predict_team_home_code,
                         'away_team_code': new_predict_team_away_code,
                         'season_code': 12,
                         'result_code': [2]
                         }, index=[0])

# Use the trained models to predict the number of goals scored by each team
new_predict_home = rf_home.predict(new_game)
new_predict_away = rf_away.predict(new_game)
df[df['home_team_code'] == new_predict_team_home_code]['home_team'].values[0], new_predict_home.round()[0], \
     'x', new_predict_away.round()[0], df[df['away_team_code'] == new_predict_team_away_code]['away_team'].values[0]

('Liverpool', 2.0, 'x', 0.0, 'Chelsea')

![](../imgs/LiverpoolxChelsea.JPG)

It worked! The prediction hit the game score in the future.

This match happened after the season of the dataset