In [606]:
# I just used the data for 2020-2021 and 2021-2022 season (two season) for this machine learning
# Because the EPL is biggest soccer league in the World, so there are so many players move to other team every season
# so I think that to use the recent two season's data can make best prediction

# import pandas
import pandas as pd

In [607]:
# The match data for 2020-2021 season and 2021-2022 season.

matches = pd.read_csv("/Users/hyunjesung/Desktop/epl/matches.csv", index_col = 0) # first column has just index

In [608]:
matches.shape # (1389, 27) 

#In epl, there is 38 matches for each team in one season (19 opponenets and 2 games with each opponents). 
#There is 20 teams and 2 seasons.
# total 38 * 20 * 2 = 1520 matches.
#But this data have only 1389 rows

(1389, 27)

In [609]:
# To check the missing data, 

matches["team"].value_counts()

Southampton                 72
Brighton and Hove Albion    72
Manchester United           72
West Ham United             72
Newcastle United            72
Burnley                     71
Leeds United                71
Crystal Palace              71
Manchester City             71
Wolverhampton Wanderers     71
Tottenham Hotspur           71
Arsenal                     71
Leicester City              70
Chelsea                     70
Aston Villa                 70
Everton                     70
Liverpool                   38
Fulham                      38
West Bromwich Albion        38
Sheffield United            38
Brentford                   34
Watford                     33
Norwich City                33
Name: team, dtype: int64

In [610]:
# Each season, the bottom three teams are relegated to the second division (EFL Championship),
#and the top three teams in the second division are moved up to the first division (EPL).
# So there's total 23 teams in data
# But liverpool never relegated = liverpool's data is missing.
# and we also can find there's some matches missing for other teams becuase each team who did not 
# relegated must have 76(38 *2) matches for two seasons.

In [611]:
# so check the rounds(matchweek) to which data is missing.

matches["round"].value_counts()

# Through this data, we can find that there's some data is missing from Matchweek 30,27,22,etc.

Matchweek 1     39
Matchweek 16    39
Matchweek 34    39
Matchweek 32    39
Matchweek 31    39
Matchweek 29    39
Matchweek 28    39
Matchweek 26    39
Matchweek 25    39
Matchweek 24    39
Matchweek 23    39
Matchweek 2     39
Matchweek 19    39
Matchweek 17    39
Matchweek 20    39
Matchweek 15    39
Matchweek 5     39
Matchweek 3     39
Matchweek 13    39
Matchweek 12    39
Matchweek 4     39
Matchweek 11    39
Matchweek 10    39
Matchweek 9     39
Matchweek 8     39
Matchweek 14    39
Matchweek 7     39
Matchweek 6     39
Matchweek 30    37
Matchweek 27    37
Matchweek 22    37
Matchweek 21    37
Matchweek 18    37
Matchweek 33    32
Matchweek 35    20
Matchweek 36    20
Matchweek 37    20
Matchweek 38    20
Name: round, dtype: int64

In [612]:
matches.dtypes # To check data type because ML only work with numeric data (float or int)

date             object
time             object
comp             object
round            object
day              object
venue            object
result           object
gf              float64
ga              float64
opponent         object
xg              float64
xga             float64
poss            float64
attendance      float64
captain          object
formation        object
referee          object
match report     object
notes           float64
sh              float64
sot             float64
dist            float64
fk              float64
pk              float64
pkatt           float64
season            int64
team             object
dtype: object

In [613]:
# to replace type of 'date' with datetime
matches["date"]= pd.to_datetime(matches["date"])

In [614]:
# to set venue section as 0 and 1 (0 = away 1 = home)
matches["home/away"] = matches["venue"].astype("category").cat.codes 

In [615]:
#to set team code (each team will get numeric code) 
matches["teamcode"] = matches['opponent'].astype("category").cat.codes

In [616]:
# to replace : time section and set new colum 'match time' with integer type 
matches['matchtime'] = matches['time'].str.replace(":.+","",regex=True).astype("int")

In [617]:
# set the date code section with 0=monday to 6= sunday 
matches["datecode"] = matches["date"].dt.dayofweek

In [618]:
# Train ML model
matches["target"] = (matches["result"]=="W").astype("int") # if result is 'W', return 1. else 0

In [619]:
#delete unnecessary sections
del matches["comp"]
del matches["notes"]

In [621]:
# RandomForestClassifier is type of ML pick up non-linearity of data
from sklearn.ensemble import RandomForestClassifier

In [622]:
# random_state = Random seed value, which controls the randomness of the algorithm. 
# Using the same seed value results in the same result for each run.
# n_estimators = number of dicision tree, min_samples_split = minimum number of sample to split the node, 

rf = RandomForestClassifier(random_state=1, min_samples_split = 10,n_estimators=50)

In [623]:
# to set the train and test data
train = matches[matches["date"] < '2022-01-01'] # to train, use match data before 2022/01/01
test = matches[matches["date"] > '2022-01-01'] #to test my ML, use match data after 2022/01/01 to check accuracy
predictors = ["home/away", "teamcode", "matchtime", "datecode"]

In [624]:
# to train our rf model, with predictors, try to predict target 
rf.fit(train[predictors], train["target"])

RandomForestClassifier(min_samples_split=10, n_estimators=50, random_state=1)

In [625]:
preds = rf.predict(test[predictors])

In [626]:
from sklearn.metrics import accuracy_score #to predict win/lose by match time

In [627]:
accuracy = accuracy_score(test["target"], preds)

In [628]:
accuracy # accuracy = 61.2%

0.6123188405797102

In [629]:
combined = pd.DataFrame(dict(actual=test["target"], predicted=preds))

In [630]:
# to check the accuracy with acutal and predicted match
pd.crosstab(index=combined["actual"], columns=combined["predicted"]) #crosstab= two way table

predicted,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,141,31
1,76,28


In [631]:
from sklearn.metrics import precision_score # to check the precentage of match (prediction and acutal) 

In [632]:
precision_score(test["target"], preds) # when we predict as win, only 47 percent accuracy

0.4745762711864407

In [633]:
# to add some more predictors to improve ML
byteam = matches.groupby("team")

In [634]:
group = byteam.get_group("Chelsea").sort_values("date")
group

Unnamed: 0,date,time,round,day,venue,result,gf,ga,opponent,xg,...,fk,pk,pkatt,season,team,home/away,teamcode,matchtime,datecode,target
0,2020-09-14,20:15,Matchweek 1,Mon,Away,W,3.0,1.0,Brighton,1.2,...,0.0,1.0,1.0,2021,Chelsea,0,3,20,0,1
1,2020-09-20,16:30,Matchweek 2,Sun,Home,L,0.0,2.0,Liverpool,1.0,...,0.0,0.0,1.0,2021,Chelsea,1,11,16,6,0
3,2020-09-26,17:30,Matchweek 3,Sat,Away,D,3.0,3.0,West Brom,2.2,...,2.0,0.0,0.0,2021,Chelsea,0,20,17,5,0
5,2020-10-03,12:30,Matchweek 4,Sat,Home,W,4.0,0.0,Crystal Palace,2.5,...,0.0,2.0,2.0,2021,Chelsea,1,6,12,5,1
6,2020-10-17,15:00,Matchweek 5,Sat,Home,D,3.0,3.0,Southampton,2.0,...,0.0,0.0,0.0,2021,Chelsea,1,17,15,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44,2022-03-13,14:00,Matchweek 29,Sun,Home,W,1.0,0.0,Newcastle Utd,0.8,...,1.0,0.0,0.0,2022,Chelsea,1,14,14,6,1
47,2022-04-02,15:00,Matchweek 31,Sat,Home,L,1.0,4.0,Brentford,1.6,...,0.0,0.0,0.0,2022,Chelsea,1,2,15,5,0
49,2022-04-09,15:00,Matchweek 32,Sat,Away,W,6.0,0.0,Southampton,4.2,...,0.0,0.0,0.0,2022,Chelsea,0,17,15,5,1
52,2022-04-20,19:45,Matchweek 25,Wed,Home,L,2.0,4.0,Arsenal,0.7,...,0.0,0.0,0.0,2022,Chelsea,1,0,19,2,0


In [635]:
# to make better model, check the few previous games
def rolling_averages(group, cols, new_cols):
    group = group.sort_values("date")
    rolling_stats = group[cols].rolling(3, closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

In [636]:
cols = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt"] # gf = goals for ,ga = goals against, sot = shots on target
# dist = distance that shot taken, fk = free kick, pk= penalty kick, pkatt= pk attempts  
new_cols = [f"{c}_rolling" for c in cols]

rolling_averages(group, cols, new_cols)

Unnamed: 0,date,time,round,day,venue,result,gf,ga,opponent,xg,...,datecode,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
5,2020-10-03,12:30,Matchweek 4,Sat,Home,W,4.0,0.0,Crystal Palace,2.5,...,5,1,2.000000,2.000000,12.333333,5.666667,20.366667,0.666667,0.333333,0.666667
6,2020-10-17,15:00,Matchweek 5,Sat,Home,D,3.0,3.0,Southampton,2.0,...,5,0,2.333333,1.666667,14.333333,5.666667,18.933333,0.666667,0.666667,1.000000
8,2020-10-24,17:30,Matchweek 6,Sat,Away,D,0.0,0.0,Manchester Utd,0.2,...,5,0,3.333333,2.000000,17.000000,6.666667,15.300000,0.666667,0.666667,0.666667
10,2020-10-31,15:00,Matchweek 7,Sat,Away,W,3.0,0.0,Burnley,1.4,...,5,1,2.333333,1.000000,11.000000,3.333333,15.300000,0.000000,0.666667,0.666667
12,2020-11-07,17:30,Matchweek 8,Sat,Home,W,4.0,1.0,Sheffield Utd,2.7,...,5,1,2.000000,1.000000,10.666667,5.000000,15.733333,0.333333,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44,2022-03-13,14:00,Matchweek 29,Sun,Home,W,1.0,0.0,Newcastle Utd,0.8,...,6,1,2.666667,0.333333,12.000000,5.000000,15.600000,0.666667,0.000000,0.000000
47,2022-04-02,15:00,Matchweek 31,Sat,Home,L,1.0,4.0,Brentford,1.6,...,5,0,2.666667,0.333333,11.333333,5.000000,15.133333,0.666667,0.000000,0.000000
49,2022-04-09,15:00,Matchweek 32,Sat,Away,W,6.0,0.0,Southampton,4.2,...,5,1,1.666667,1.666667,14.666667,6.000000,16.100000,0.666667,0.000000,0.000000
52,2022-04-20,19:45,Matchweek 25,Wed,Home,L,2.0,4.0,Arsenal,0.7,...,2,0,2.666667,1.333333,17.666667,8.333333,17.000000,0.333333,0.000000,0.000000


In [637]:
new_cols

['gf_rolling',
 'ga_rolling',
 'sh_rolling',
 'sot_rolling',
 'dist_rolling',
 'fk_rolling',
 'pk_rolling',
 'pkatt_rolling']

In [638]:
rolling_averages(group,cols,new_cols)

Unnamed: 0,date,time,round,day,venue,result,gf,ga,opponent,xg,...,datecode,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
5,2020-10-03,12:30,Matchweek 4,Sat,Home,W,4.0,0.0,Crystal Palace,2.5,...,5,1,2.000000,2.000000,12.333333,5.666667,20.366667,0.666667,0.333333,0.666667
6,2020-10-17,15:00,Matchweek 5,Sat,Home,D,3.0,3.0,Southampton,2.0,...,5,0,2.333333,1.666667,14.333333,5.666667,18.933333,0.666667,0.666667,1.000000
8,2020-10-24,17:30,Matchweek 6,Sat,Away,D,0.0,0.0,Manchester Utd,0.2,...,5,0,3.333333,2.000000,17.000000,6.666667,15.300000,0.666667,0.666667,0.666667
10,2020-10-31,15:00,Matchweek 7,Sat,Away,W,3.0,0.0,Burnley,1.4,...,5,1,2.333333,1.000000,11.000000,3.333333,15.300000,0.000000,0.666667,0.666667
12,2020-11-07,17:30,Matchweek 8,Sat,Home,W,4.0,1.0,Sheffield Utd,2.7,...,5,1,2.000000,1.000000,10.666667,5.000000,15.733333,0.333333,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44,2022-03-13,14:00,Matchweek 29,Sun,Home,W,1.0,0.0,Newcastle Utd,0.8,...,6,1,2.666667,0.333333,12.000000,5.000000,15.600000,0.666667,0.000000,0.000000
47,2022-04-02,15:00,Matchweek 31,Sat,Home,L,1.0,4.0,Brentford,1.6,...,5,0,2.666667,0.333333,11.333333,5.000000,15.133333,0.666667,0.000000,0.000000
49,2022-04-09,15:00,Matchweek 32,Sat,Away,W,6.0,0.0,Southampton,4.2,...,5,1,1.666667,1.666667,14.666667,6.000000,16.100000,0.666667,0.000000,0.000000
52,2022-04-20,19:45,Matchweek 25,Wed,Home,L,2.0,4.0,Arsenal,0.7,...,2,0,2.666667,1.333333,17.666667,8.333333,17.000000,0.333333,0.000000,0.000000


In [639]:
#set group by team
matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))

In [640]:
matches_rolling

Unnamed: 0_level_0,Unnamed: 1_level_0,date,time,round,day,venue,result,gf,ga,opponent,xg,...,datecode,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Arsenal,6,2020-10-04,14:00,Matchweek 4,Sun,Home,W,2.0,1.0,Sheffield Utd,0.4,...,6,1,2.000000,1.333333,7.666667,3.666667,14.733333,0.666667,0.000000,0.000000
Arsenal,7,2020-10-17,17:30,Matchweek 5,Sat,Away,L,0.0,1.0,Manchester City,0.9,...,5,0,1.666667,1.666667,5.333333,3.666667,15.766667,0.000000,0.000000,0.000000
Arsenal,9,2020-10-25,19:15,Matchweek 6,Sun,Home,L,0.0,1.0,Leicester City,0.9,...,6,0,1.000000,1.666667,7.000000,3.666667,16.733333,0.666667,0.000000,0.000000
Arsenal,11,2020-11-01,16:30,Matchweek 7,Sun,Away,W,1.0,0.0,Manchester Utd,1.1,...,6,1,0.666667,1.000000,9.666667,4.000000,16.033333,1.000000,0.000000,0.000000
Arsenal,13,2020-11-08,19:15,Matchweek 8,Sun,Home,L,0.0,3.0,Aston Villa,1.5,...,6,0,0.333333,0.666667,9.666667,2.666667,18.033333,1.000000,0.333333,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wolverhampton Wanderers,32,2022-03-13,14:00,Matchweek 29,Sun,Away,W,1.0,0.0,Everton,0.8,...,6,1,1.333333,1.000000,12.333333,3.666667,19.300000,0.000000,0.000000,0.000000
Wolverhampton Wanderers,33,2022-03-18,20:00,Matchweek 30,Fri,Home,L,2.0,3.0,Leeds United,0.8,...,4,0,1.666667,0.666667,12.333333,4.333333,19.600000,0.000000,0.000000,0.000000
Wolverhampton Wanderers,34,2022-04-02,15:00,Matchweek 31,Sat,Home,W,2.0,1.0,Aston Villa,1.2,...,5,1,2.333333,1.000000,13.000000,5.333333,19.833333,0.000000,0.000000,0.000000
Wolverhampton Wanderers,35,2022-04-08,20:00,Matchweek 32,Fri,Away,L,0.0,1.0,Newcastle Utd,0.3,...,4,0,1.666667,1.333333,13.000000,5.000000,18.533333,0.000000,0.000000,0.000000


In [641]:
# we do not need team name -> drop
matches_rolling = matches_rolling.droplevel('team')

In [642]:
matches_rolling

Unnamed: 0,date,time,round,day,venue,result,gf,ga,opponent,xg,...,datecode,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
6,2020-10-04,14:00,Matchweek 4,Sun,Home,W,2.0,1.0,Sheffield Utd,0.4,...,6,1,2.000000,1.333333,7.666667,3.666667,14.733333,0.666667,0.000000,0.000000
7,2020-10-17,17:30,Matchweek 5,Sat,Away,L,0.0,1.0,Manchester City,0.9,...,5,0,1.666667,1.666667,5.333333,3.666667,15.766667,0.000000,0.000000,0.000000
9,2020-10-25,19:15,Matchweek 6,Sun,Home,L,0.0,1.0,Leicester City,0.9,...,6,0,1.000000,1.666667,7.000000,3.666667,16.733333,0.666667,0.000000,0.000000
11,2020-11-01,16:30,Matchweek 7,Sun,Away,W,1.0,0.0,Manchester Utd,1.1,...,6,1,0.666667,1.000000,9.666667,4.000000,16.033333,1.000000,0.000000,0.000000
13,2020-11-08,19:15,Matchweek 8,Sun,Home,L,0.0,3.0,Aston Villa,1.5,...,6,0,0.333333,0.666667,9.666667,2.666667,18.033333,1.000000,0.333333,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32,2022-03-13,14:00,Matchweek 29,Sun,Away,W,1.0,0.0,Everton,0.8,...,6,1,1.333333,1.000000,12.333333,3.666667,19.300000,0.000000,0.000000,0.000000
33,2022-03-18,20:00,Matchweek 30,Fri,Home,L,2.0,3.0,Leeds United,0.8,...,4,0,1.666667,0.666667,12.333333,4.333333,19.600000,0.000000,0.000000,0.000000
34,2022-04-02,15:00,Matchweek 31,Sat,Home,W,2.0,1.0,Aston Villa,1.2,...,5,1,2.333333,1.000000,13.000000,5.333333,19.833333,0.000000,0.000000,0.000000
35,2022-04-08,20:00,Matchweek 32,Fri,Away,L,0.0,1.0,Newcastle Utd,0.3,...,4,0,1.666667,1.333333,13.000000,5.000000,18.533333,0.000000,0.000000,0.000000


In [643]:
matches_rolling.index = range(matches_rolling.shape[0]) # to set unique index for each match

In [644]:
# re-train ML model
def make_predictions(data, predictors):
    train = data[data["date"] < '2022-01-01']
    test = data[data["date"] > '2022-01-01']
    rf.fit(train[predictors], train["target"])
    preds = rf.predict(test[predictors])
    combined = pd.DataFrame(dict(actual=test["target"], predicted=preds), index=test.index)
    accuracy = precision_score(test["target"], preds)
    return combined, error

In [645]:
combined, accuracy = make_predictions(matches_rolling, predictors + new_cols)

In [646]:
accuracy # my ML model's accuracy

0.625

In [647]:
#to add match date, team, opponent, and result in combined
combined = combined.merge(matches_rolling[["date","team","opponent","result"]],left_index=True, right_index= True)

In [648]:
combined

Unnamed: 0,actual,predicted,date,team,opponent,result
55,0,0,2022-01-23,Arsenal,Burnley,D
56,1,0,2022-02-10,Arsenal,Wolves,W
57,1,0,2022-02-19,Arsenal,Brentford,W
58,1,1,2022-02-24,Arsenal,Wolves,W
59,1,1,2022-03-06,Arsenal,Watford,W
...,...,...,...,...,...,...
1312,1,0,2022-03-13,Wolverhampton Wanderers,Everton,W
1313,0,0,2022-03-18,Wolverhampton Wanderers,Leeds United,L
1314,1,0,2022-04-02,Wolverhampton Wanderers,Aston Villa,W
1315,0,0,2022-04-08,Wolverhampton Wanderers,Newcastle Utd,L


In [649]:
# as in example above, the opponent name and team name is not same ( ex) Wolverhampton Wanderers and Wolves
class MissingDict(dict):
    __missing__= lambda self,key:key
    
map_values = {
    "Brighton and Hove Albion": "Brighton",
    "Manchester United": "Manchester Utd",
    "Newcastle United": "Newcastle Utd", 
    "Tottenham Hotspur": "Tottenham",
    "West Ham United": "West Ham",
    "Wolverhampton Wanderers": "Wolves"
}
mapping = MissingDict(**map_values)


In [650]:
combined["new_team"] = combined["team"].map(mapping)

In [651]:
merged = combined.merge(combined, left_on=["date", "new_team"], right_on=["date", "opponent"])

In [652]:
merged 

Unnamed: 0,actual_x,predicted_x,date,team_x,opponent_x,result_x,new_team_x,actual_y,predicted_y,team_y,opponent_y,result_y,new_team_y
0,0,0,2022-01-23,Arsenal,Burnley,D,Arsenal,0,0,Burnley,Arsenal,D,Burnley
1,1,0,2022-02-10,Arsenal,Wolves,W,Arsenal,0,0,Wolverhampton Wanderers,Arsenal,L,Wolves
2,1,0,2022-02-19,Arsenal,Brentford,W,Arsenal,0,0,Brentford,Arsenal,L,Brentford
3,1,1,2022-02-24,Arsenal,Wolves,W,Arsenal,0,0,Wolverhampton Wanderers,Arsenal,L,Wolves
4,1,1,2022-03-06,Arsenal,Watford,W,Arsenal,0,0,Watford,Arsenal,L,Watford
...,...,...,...,...,...,...,...,...,...,...,...,...,...
257,1,0,2022-03-13,Wolverhampton Wanderers,Everton,W,Wolves,0,0,Everton,Wolves,L,Everton
258,0,0,2022-03-18,Wolverhampton Wanderers,Leeds United,L,Wolves,1,0,Leeds United,Wolves,W,Leeds United
259,1,0,2022-04-02,Wolverhampton Wanderers,Aston Villa,W,Wolves,0,0,Aston Villa,Wolves,L,Aston Villa
260,0,0,2022-04-08,Wolverhampton Wanderers,Newcastle Utd,L,Wolves,1,0,Newcastle United,Wolves,W,Newcastle Utd


In [653]:
merged[(merged["predicted_x"] == 1) & (merged["predicted_y"] ==0)]["actual_x"].value_counts()

1    27
0    13
Name: actual_x, dtype: int64

In [655]:
 27/40 #67.5 accuracy

0.675