<a href="https://colab.research.google.com/github/yiyangjessieyu/Machine-Learning/blob/main/Predict_Football_Match_Winners.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [143]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Investigating Data

In [158]:
matches = pd.read_csv("matches.csv", index_col=0) # https://github.com/dataquestio/project-walkthroughs/blob/master/football_matches/matches.csv

# 38 matches per season, 20 teams in league each season, 2 seasons of data
print(38*20*2) # so expect to have this much data rows

print(matches.shape) # instead we have

1520
(1389, 27)


In [159]:
matches["team"].value_counts()

Southampton                 72
Brighton and Hove Albion    72
Manchester United           72
West Ham United             72
Newcastle United            72
Burnley                     71
Leeds United                71
Crystal Palace              71
Manchester City             71
Wolverhampton Wanderers     71
Tottenham Hotspur           71
Arsenal                     71
Leicester City              70
Chelsea                     70
Aston Villa                 70
Everton                     70
Liverpool                   38
Fulham                      38
West Bromwich Albion        38
Sheffield United            38
Brentford                   34
Watford                     33
Norwich City                33
Name: team, dtype: int64

In [146]:
matches[matches["team"] == "Liverpool"].sort_values("date")

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
1,2020-09-12,17:30,Premier League,Matchweek 1,Sat,Home,W,4.0,3.0,Leeds United,...,Match Report,,20.0,4.0,17.0,0.0,2.0,2.0,2021,Liverpool
2,2020-09-20,16:30,Premier League,Matchweek 2,Sun,Away,W,2.0,0.0,Chelsea,...,Match Report,,17.0,5.0,17.7,1.0,0.0,0.0,2021,Liverpool
4,2020-09-28,20:00,Premier League,Matchweek 3,Mon,Home,W,3.0,1.0,Arsenal,...,Match Report,,21.0,9.0,16.8,0.0,0.0,0.0,2021,Liverpool
6,2020-10-04,19:15,Premier League,Matchweek 4,Sun,Away,L,2.0,7.0,Aston Villa,...,Match Report,,14.0,8.0,15.8,1.0,0.0,0.0,2021,Liverpool
7,2020-10-17,12:30,Premier League,Matchweek 5,Sat,Away,D,2.0,2.0,Everton,...,Match Report,,22.0,8.0,15.0,1.0,0.0,0.0,2021,Liverpool
9,2020-10-24,20:00,Premier League,Matchweek 6,Sat,Home,W,2.0,1.0,Sheffield Utd,...,Match Report,,17.0,5.0,18.2,1.0,0.0,0.0,2021,Liverpool
11,2020-10-31,17:30,Premier League,Matchweek 7,Sat,Home,W,2.0,1.0,West Ham,...,Match Report,,8.0,2.0,18.6,1.0,1.0,1.0,2021,Liverpool
13,2020-11-08,16:30,Premier League,Matchweek 8,Sun,Away,D,1.0,1.0,Manchester City,...,Match Report,,9.0,2.0,21.5,0.0,1.0,1.0,2021,Liverpool
14,2020-11-22,19:15,Premier League,Matchweek 9,Sun,Home,W,3.0,0.0,Leicester City,...,Match Report,,24.0,12.0,11.9,0.0,0.0,0.0,2021,Liverpool
16,2020-11-28,12:30,Premier League,Matchweek 10,Sat,Away,D,1.0,1.0,Brighton,...,Match Report,,6.0,2.0,20.9,0.0,0.0,0.0,2021,Liverpool


In [147]:
matches.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
1,2021-08-15,16:30,Premier League,Matchweek 1,Sun,Away,L,0.0,1.0,Tottenham,...,Match Report,,18.0,4.0,16.9,1.0,0.0,0.0,2022,Manchester City
2,2021-08-21,15:00,Premier League,Matchweek 2,Sat,Home,W,5.0,0.0,Norwich City,...,Match Report,,16.0,4.0,17.3,1.0,0.0,0.0,2022,Manchester City
3,2021-08-28,12:30,Premier League,Matchweek 3,Sat,Home,W,5.0,0.0,Arsenal,...,Match Report,,25.0,10.0,14.3,0.0,0.0,0.0,2022,Manchester City
4,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Away,W,1.0,0.0,Leicester City,...,Match Report,,25.0,8.0,14.0,0.0,0.0,0.0,2022,Manchester City
6,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Home,D,0.0,0.0,Southampton,...,Match Report,,16.0,1.0,15.7,1.0,0.0,0.0,2022,Manchester City


# Data Set Up

In [160]:
matches.dtypes

date             object
time             object
comp             object
round            object
day              object
venue            object
result           object
gf              float64
ga              float64
opponent         object
xg              float64
xga             float64
poss            float64
attendance      float64
captain          object
formation        object
referee          object
match report     object
notes           float64
sh              float64
sot             float64
dist            float64
fk              float64
pk              float64
pkatt           float64
season            int64
team             object
dtype: object

In [161]:
del matches["comp"]
del matches["notes"]
matches["date"] = pd.to_datetime(matches["date"])
matches["venue_code"] = matches["venue"].astype("category").cat.codes
matches["opp_code"] = matches["opponent"].astype("category").cat.codes
matches["day_code"] = matches["date"].dt.dayofweek
matches["hour"] = matches["time"].str.replace(":.+", "", regex=True).astype("int")

matches["target"] = (matches["result"] == "W").astype("int")


In [162]:
matches["captain_code"] = matches["captain"].astype("category").cat.codes


In [194]:
matches["round_code"] = matches["round"].astype("category").cat.codes


In [218]:
predictors = ["round_code", "venue_code", "opp_code", "captain_code", "day_code", "hour"] # maybe round? maybe formation?

In [196]:
matches

Unnamed: 0,date,time,round,day,venue,result,gf,ga,opponent,xg,...,pkatt,season,team,venue_code,opp_code,day_code,hour,target,captain_code,round_code
1,2021-08-15,16:30,Matchweek 1,Sun,Away,L,0.0,1.0,Tottenham,1.9,...,0.0,2022,Manchester City,0,18,6,16,0,24,0
2,2021-08-21,15:00,Matchweek 2,Sat,Home,W,5.0,0.0,Norwich City,2.7,...,0.0,2022,Manchester City,1,15,5,15,1,90,11
3,2021-08-28,12:30,Matchweek 3,Sat,Home,W,5.0,0.0,Arsenal,3.8,...,0.0,2022,Manchester City,1,0,5,12,1,90,22
4,2021-09-11,15:00,Matchweek 4,Sat,Away,W,1.0,0.0,Leicester City,2.9,...,0.0,2022,Manchester City,0,10,5,15,1,90,32
6,2021-09-18,15:00,Matchweek 5,Sat,Home,D,0.0,0.0,Southampton,1.1,...,0.0,2022,Manchester City,1,17,5,15,0,24,33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38,2021-05-02,19:15,Matchweek 34,Sun,Away,L,0.0,4.0,Tottenham,0.5,...,0.0,2021,Sheffield United,0,18,6,19,0,43,27
39,2021-05-08,15:00,Matchweek 35,Sat,Home,L,0.0,2.0,Crystal Palace,0.7,...,0.0,2021,Sheffield United,1,6,5,15,0,43,28
40,2021-05-16,19:00,Matchweek 36,Sun,Away,W,1.0,0.0,Everton,1.6,...,0.0,2021,Sheffield United,0,7,6,19,1,43,29
41,2021-05-19,18:00,Matchweek 37,Wed,Away,L,0.0,1.0,Newcastle Utd,0.8,...,0.0,2021,Sheffield United,0,14,2,18,0,43,30


# Hyperparameter Tuning

In [235]:
date_split = '2022-01-01'
train_set = matches[matches["date"] < date_split]
test_set = matches[matches["date"] > date_split]

In [239]:
from sklearn.model_selection import GridSearchCV # can also use RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

# Define parameter grid
param_grid = {
    'n_estimators': [5, 10, 50, 100], # n_trees
    'min_samples_split': [2, 10, 50, 100, 200, 500], # n_samples_per_leaf
    'max_features': ['auto', 'sqrt', 'log2']
}

# Create RandomForestClassifier instance
rf = RandomForestClassifier()

# Instantiate GridSearchCV
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='accuracy')

# Fit GridSearchCV to training data
grid_search.fit(train_set[predictors], train_set["target"])

# Get best parameters
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Evaluate performance on test set
best_model = grid_search.best_estimator_
test_predictions = best_model.predict(test_set[predictors])

accuracy = accuracy_score(test_set["target"], test_predictions)
precision = precision_score(test_set["target"], test_predictions)
print("Accuracy:", accuracy)
print("Precision:", precision)


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Best Parameters: {'max_features': 'auto', 'min_samples_split': 200, 'n_estimators': 10}
Accuracy: 0.6304347826086957
Precision: 0.6666666666666666


  warn(


# Exploring Best Model

In [243]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

# Instantiate models
# Can experiment with these parameters
n_trees = 10 # higher n, potentially more accurate, longer to run,
n_samples_per_leaf = 200 # higher n, less likely to overfit, lower accuracy on training data
random_state = 1 # need to try get same results in random-ness
rf_model = RandomForestClassifier(n_estimators=n_trees,
                            min_samples_split=n_samples_per_leaf,
                            random_state=random_state,
                            max_features = 'auto')
svm_model = SVC()
mlp_model = MLPClassifier()

# Train models
rf_model.fit(train_set[predictors], train_set["target"])
svm_model.fit(train_set[predictors], train_set["target"])
mlp_model.fit(train_set[predictors], train_set["target"])

# Make predictions
rf_predictions = rf_model.predict(test_set[predictors])
svm_predictions = svm_model.predict(test_set[predictors])
mlp_predictions = mlp_model.predict(test_set[predictors])

# Evaluate performance
rf_accuracy = accuracy_score(test_set["target"], rf_predictions)
svm_accuracy = accuracy_score(test_set["target"], svm_predictions)
mlp_accuracy = accuracy_score(test_set["target"], mlp_predictions)

print("Random Forest Accuracy:", rf_accuracy)
print("SVM Accuracy:", svm_accuracy)
print("MLP Accuracy:", mlp_accuracy)


  warn(


Random Forest Accuracy: 0.6304347826086957
SVM Accuracy: 0.6231884057971014
MLP Accuracy: 0.5833333333333334


# RandomForestClassifier Set Up

In [149]:
from sklearn.ensemble import RandomForestClassifier # can pick up non-linearity in data
# eg. how opp_code are just values for diff opponents, numbers don't correlate to skillset increasing, so no linear relationship here

In [228]:
# Can experiment with these parameters
n_trees = 50 # higher n, potentially more accurate, longer to run,
n_samples_per_leaf = 100 # higher n, less likely to overfit, lower accuracy on training data
random_state = 1 # need to try get same results in random-ness
rf = RandomForestClassifier(n_estimators=n_trees,
                            min_samples_split=n_samples_per_leaf,
                            random_state=random_state,
                            max_features = 'sqrt')

rf.fit(train_set[predictors], train_set["target"])

In [244]:
test_predictions = rf_model.predict(test_set[predictors])

# Evaluate Performance

In [106]:
from sklearn.metrics import accuracy_score, precision_score

In [245]:
accuracy_score(test_set["target"], test_predictions)
acc

0.6413043478260869

In [246]:
precision_score(test_set["target"], test_predictions)

0.625

# Investigating Prediction Result

In [247]:
combined = pd.DataFrame(dict(ground_truth=test_set["target"], prediction=test_predictions))

In [248]:
pd.crosstab(index=combined['ground_truth'], columns=combined['prediction'])

prediction,0,1
ground_truth,Unnamed: 1_level_1,Unnamed: 2_level_1
0,169,3
1,99,5


# Improve Performance

In [249]:
# Want to add better predictors

grouped_matches = matches.groupby("team")
grouped_matches

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x781baccd91b0>

In [250]:
# Want to compute rolling averages

def rolling_averages(group, cols, new_cols):
  group = group.sort_values("date")
  rolling_stats = group[cols].rolling(10, closed='left').mean()
  group[new_cols] = rolling_stats
  group = group.dropna(subset=new_cols)
  return group

In [251]:
cols = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt"]
new_cols = [f"{c}_rolling" for c in cols]

group = grouped_matches.get_group("Manchester City").sort_values("date")

rolling_averages(group, cols, new_cols)

Unnamed: 0,date,time,round,day,venue,result,gf,ga,opponent,xg,...,captain_code,round_code,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
18,2020-12-12,17:30,Matchweek 12,Sat,Away,D,0.0,0.0,Manchester Utd,1.2,...,24,3,1.7,1.1,15.6,5.3,17.90,0.7,0.2,0.3
19,2020-12-15,20:00,Matchweek 13,Tue,Home,D,1.0,1.0,West Brom,1.9,...,50,4,1.4,1.0,15.2,4.7,17.61,0.6,0.1,0.2
20,2020-12-19,15:00,Matchweek 14,Sat,Away,W,1.0,0.0,Southampton,1.1,...,50,5,1.3,0.6,16.2,4.9,17.39,0.7,0.1,0.2
22,2020-12-26,20:00,Matchweek 15,Sat,Home,W,2.0,0.0,Newcastle Utd,1.8,...,50,6,1.3,0.5,15.0,5.3,17.54,0.6,0.1,0.2
23,2021-01-03,16:30,Matchweek 17,Sun,Away,W,3.0,1.0,Chelsea,2.4,...,50,8,1.4,0.5,14.8,5.4,17.09,0.6,0.1,0.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42,2022-03-14,20:00,Matchweek 29,Mon,Away,D,0.0,0.0,Crystal Palace,2.3,...,50,21,2.4,0.9,16.4,5.7,16.34,0.3,0.5,0.6
44,2022-04-02,15:00,Matchweek 31,Sat,Away,W,2.0,0.0,Burnley,1.8,...,90,24,1.8,0.6,16.7,5.5,16.43,0.2,0.3,0.4
46,2022-04-10,16:30,Matchweek 32,Sun,Home,D,2.0,2.0,Liverpool,2.0,...,50,25,1.9,0.6,17.3,5.7,16.17,0.2,0.3,0.4
49,2022-04-20,20:00,Matchweek 30,Wed,Home,W,3.0,0.0,Brighton,1.2,...,90,23,1.9,0.7,17.0,6.0,15.98,0.3,0.2,0.3


In [252]:
matches_rolling = matches.groupby("team").apply(lambda group: rolling_averages(group, cols, new_cols))

In [253]:
matches_rolling = matches_rolling.droplevel("team")
matches_rolling.index = range(matches_rolling.shape[0])
matches_rolling

Unnamed: 0,date,time,round,day,venue,result,gf,ga,opponent,xg,...,captain_code,round_code,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
0,2020-12-06,16:30,Matchweek 11,Sun,Away,L,0.0,2.0,Tottenham,0.8,...,70,2,1.0,1.2,9.3,3.0,16.39,0.9,0.1,0.1
1,2020-12-13,19:15,Matchweek 12,Sun,Home,L,0.0,1.0,Burnley,1.5,...,70,3,0.7,1.4,9.2,2.7,16.54,0.8,0.1,0.1
2,2020-12-16,18:00,Matchweek 13,Wed,Home,D,1.0,1.0,Southampton,0.7,...,70,4,0.5,1.4,10.4,3.0,16.59,0.8,0.1,0.1
3,2020-12-19,17:30,Matchweek 14,Sat,Away,L,1.0,2.0,Everton,1.4,...,73,5,0.5,1.2,10.9,3.1,17.17,0.8,0.1,0.1
4,2020-12-26,17:30,Matchweek 15,Sat,Home,W,3.0,1.0,Chelsea,2.1,...,33,6,0.4,1.3,11.6,2.8,17.27,0.9,0.2,0.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1149,2022-03-13,14:00,Matchweek 29,Sun,Away,W,1.0,0.0,Everton,0.8,...,14,21,1.5,0.9,11.5,3.9,17.99,0.4,0.1,0.1
1150,2022-03-18,20:00,Matchweek 30,Fri,Home,L,2.0,3.0,Leeds United,0.8,...,14,23,1.5,0.9,11.0,3.6,18.09,0.3,0.1,0.1
1151,2022-04-02,15:00,Matchweek 31,Sat,Home,W,2.0,1.0,Aston Villa,1.2,...,14,24,1.4,1.1,11.6,4.0,18.64,0.3,0.0,0.0
1152,2022-04-08,20:00,Matchweek 32,Fri,Away,L,0.0,1.0,Newcastle Utd,0.3,...,14,25,1.4,1.1,12.0,4.2,17.93,0.3,0.0,0.0


# Predict

In [254]:
def make_predictions(data, predictors):
    train = data[data["date"] < '2022-01-01']
    test = data[data["date"] > '2022-01-01']
    rf.fit(train[predictors], train["target"])
    preds = rf.predict(test[predictors])
    print("accuracy is", accuracy_score(test_set["target"], preds))
    print("accuracy is", precision_score(test_set["target"], preds))
    combined = pd.DataFrame(dict(actual=test["target"], predicted=preds), index=test.index)
    error = precision_score(test["target"], preds)
    return combined, error

In [255]:
combined, error = make_predictions(matches_rolling, predictors + new_cols)


accuracy is 0.5833333333333334
accuracy is 0.40350877192982454


In [256]:
error

0.543859649122807

In [257]:
combined = combined.merge(matches_rolling[["date", "team", "opponent", "result"]], left_index=True, right_index=True)

In [138]:
combined

Unnamed: 0,actual,predicted,date,team,opponent,result
55,0,1,2022-01-23,Arsenal,Burnley,D
56,1,1,2022-02-10,Arsenal,Wolves,W
57,1,0,2022-02-19,Arsenal,Brentford,W
58,1,1,2022-02-24,Arsenal,Wolves,W
59,1,1,2022-03-06,Arsenal,Watford,W
...,...,...,...,...,...,...
1312,1,0,2022-03-13,Wolverhampton Wanderers,Everton,W
1313,0,0,2022-03-18,Wolverhampton Wanderers,Leeds United,L
1314,1,0,2022-04-02,Wolverhampton Wanderers,Aston Villa,W
1315,0,1,2022-04-08,Wolverhampton Wanderers,Newcastle Utd,L


# Investigating Prediction Result 2.0



In [1]:
# custom dictionary class which inherits from the built-in dict class.
class MissingDict(dict):
  # Overriding the __missing__ method:
  # It's invoked when a key is not found in the dictionary.
  # In this custom dictionary class, it's overridden with a lambda function that simply returns the missing key itself.
  __missing__ = lambda self, key: key

map_values = {
    "Brighton and Hove Albion": "Brighton",
    "Manchester United": "Manchester Utd",
    "Newcastle United": "Newcastle Utd",
    "Tottenham Hotspur": "Tottenham",
    "West Ham United": "West Ham",
    "Wolverhampton Wanderers": "Wolves"
}

map_obj = MissingDict(**map_values)

In [40]:
combined["new_team"] = combined["team"].map(map_obj)

In [43]:
merged = combined.merge(combined, left_on=["date", "new_team"], right_on=["date", "opponent"])

In [44]:
merged

Unnamed: 0,actual_x,predicted_x,date,team_x,opponent_x,result_x,new_team_x,actual_y,predicted_y,team_y,opponent_y,result_y,new_team_y
0,0,1,2022-01-23,Arsenal,Burnley,D,Arsenal,0,0,Burnley,Arsenal,D,Burnley
1,1,1,2022-02-10,Arsenal,Wolves,W,Arsenal,0,0,Wolverhampton Wanderers,Arsenal,L,Wolves
2,1,0,2022-02-19,Arsenal,Brentford,W,Arsenal,0,0,Brentford,Arsenal,L,Brentford
3,1,1,2022-02-24,Arsenal,Wolves,W,Arsenal,0,0,Wolverhampton Wanderers,Arsenal,L,Wolves
4,1,1,2022-03-06,Arsenal,Watford,W,Arsenal,0,0,Watford,Arsenal,L,Watford
...,...,...,...,...,...,...,...,...,...,...,...,...,...
257,1,0,2022-03-13,Wolverhampton Wanderers,Everton,W,Wolves,0,0,Everton,Wolves,L,Everton
258,0,0,2022-03-18,Wolverhampton Wanderers,Leeds United,L,Wolves,1,0,Leeds United,Wolves,W,Leeds United
259,1,0,2022-04-02,Wolverhampton Wanderers,Aston Villa,W,Wolves,0,0,Aston Villa,Wolves,L,Aston Villa
260,0,0,2022-04-08,Wolverhampton Wanderers,Newcastle Utd,L,Wolves,1,0,Newcastle United,Wolves,W,Newcastle Utd


In [45]:
merged[(merged["predicted_x"] == 1) &
       (merged["predicted_y"] == 0)]["actual_x"].value_counts()

1    26
0    15
Name: actual_x, dtype: int64

In [46]:
# accuracy
26/41

0.6341463414634146