In [105]:
import pandas as pd
# accuracy_score tells us what percent of our predictions were actually right
from sklearn.metrics import accuracy_score, precision_score
# Random Forest is a non linear classifier --> Any other non linear classifier will work as well
from sklearn.ensemble import RandomForestClassifier

In [79]:
# Read the match data into a pandas DataFrame.
# The first CSV column holds row labels rather than match data, so we tell pandas to use it as the index column.
matches = pd.read_csv("all_matches.csv", index_col = 0)

In [80]:
matches.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
1,2023-08-11,20:00,Premier League,Matchweek 1,Fri,Away,W,3,0,Burnley,...,Match Report,,17.0,8.0,13.9,0.0,0,0,2023,Manchester City
3,2023-08-19,20:00,Premier League,Matchweek 2,Sat,Home,W,1,0,Newcastle Utd,...,Match Report,,14.0,4.0,17.9,0.0,0,0,2023,Manchester City
4,2023-08-27,14:00,Premier League,Matchweek 3,Sun,Away,W,2,1,Sheffield Utd,...,Match Report,,29.0,9.0,17.3,2.0,0,1,2023,Manchester City
5,2023-09-02,15:00,Premier League,Matchweek 4,Sat,Home,W,5,1,Fulham,...,Match Report,,6.0,4.0,14.8,0.0,1,1,2023,Manchester City
6,2023-09-16,15:00,Premier League,Matchweek 5,Sat,Away,W,3,1,West Ham,...,Match Report,,29.0,13.0,16.4,1.0,0,0,2023,Manchester City


In [81]:
# Next we investigate our data to check for missing data --> Check the number of matches played by all teams in the last 5 years --> Should be equal to 3800
matches.shape

(3800, 27)

In [82]:
# Parse the date column into datetime64 values --> the .dt accessor and the date comparisons used further down rely on this
# (ML models only work with numeric values, so the numeric features are derived from this column later on)
matches["date"] = pd.to_datetime(matches["date"])

In [83]:
matches.dtypes

date            datetime64[ns]
time                    object
comp                    object
round                   object
day                     object
venue                   object
result                  object
gf                       int64
ga                       int64
opponent                object
xg                     float64
xga                    float64
poss                   float64
attendance             float64
captain                 object
formation               object
referee                 object
match report            object
notes                  float64
sh                     float64
sot                    float64
dist                   float64
fk                     float64
pk                       int64
pkatt                    int64
season                   int64
team                    object
dtype: object

In [84]:
# Creating predictors for our ML model

# venue_code turns the venue text into an integer --> Away games become 0 and Home games become 1
# The column is cast to a categorical with 2 categories (Away and Home) and cat.codes returns the integer code of each category
matches["venue_code"] = matches["venue"].astype("category").cat.codes

# opp_code gives each opponent its own integer code in the same way
matches["opp_code"] = matches["opponent"].astype("category").cat.codes

# Keep only the hour of the kick-off time as a single integer --> Ex :- 16:30 will become 16
matches["hour"] = matches["time"].str.replace(":.+", "", regex = True).astype("int")

# Each day of the week is given a separate code --> Monday being 0 and Sunday being 6
matches["day_code"] = matches["date"].dt.dayofweek

In [85]:
# We set a target to train our model on --> Since we are trying to predict which team will WIN, wins are encoded as 1 and both losses and draws as 0.
matches["target"] = (matches["result"] == "W").astype("int")

In [86]:
matches.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,fk,pk,pkatt,season,team,venue_code,opp_code,hour,day_code,target
1,2023-08-11,20:00,Premier League,Matchweek 1,Fri,Away,W,3,0,Burnley,...,0.0,0,0,2023,Manchester City,0,5,20,4,1
3,2023-08-19,20:00,Premier League,Matchweek 2,Sat,Home,W,1,0,Newcastle Utd,...,0.0,0,0,2023,Manchester City,1,14,20,5,1
4,2023-08-27,14:00,Premier League,Matchweek 3,Sun,Away,W,2,1,Sheffield Utd,...,2.0,0,1,2023,Manchester City,0,16,14,6,1
5,2023-09-02,15:00,Premier League,Matchweek 4,Sat,Home,W,5,1,Fulham,...,0.0,1,1,2023,Manchester City,1,9,15,5,1
6,2023-09-16,15:00,Premier League,Matchweek 5,Sat,Away,W,3,1,West Ham,...,1.0,0,0,2023,Manchester City,0,18,15,5,1


In [88]:
# Initialize the random forest classifier
# n_estimators --> Number of trees in the forest; more trees make the algorithm slower but usually improve accuracy
# min_samples_split --> Minimum number of samples a node needs before it may be split; higher values reduce overfitting but fit the training data less closely
# random_state --> Fixed seed so that repeated runs build the same forest
rf = RandomForestClassifier(n_estimators = 50, min_samples_split = 10, random_state = 1)

In [89]:
# Base predictors for the RandomForest classifier: venue_code, opp_code, hour and day_code
# (the rolling-average columns are appended to this list when make_prediction is called)
predictors = ["venue_code", "opp_code", "hour", "day_code"]

In [90]:
# We use Rolling Averages because we want to utilize the data of previous matchweeks of a single season in order to predict the next matchweek's result --> The reason for this being that if a team has lost 3 games in a row, we'd expect them to lose the next game as well since they're not in-form.
def rolling_averages(group, cols, new_cols, window = 3):
    """Add trailing rolling means of ``cols`` to one group of matches.

    The rows are sorted by "date" and, for every row, the mean of each column
    in ``cols`` over the previous ``window`` rows is stored under the matching
    name in ``new_cols``. The current row is never part of its own window.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to process; must contain a "date" column and every column in ``cols``.
    cols : list of str
        Source columns to average.
    new_cols : list of str
        Names of the output columns, paired with ``cols`` by position.
    window : int, default 3
        Number of preceding rows that make up each average.

    Returns
    -------
    pandas.DataFrame
        A date-sorted copy of ``group`` with ``new_cols`` added and without the
        rows whose rolling values are NaN.
    """
    group = group.sort_values("date")
    # closed = 'left' leaves the current row out of its window, so each value is built only from earlier rows
    rolling_stats = group[cols].rolling(window, closed = 'left').mean()
    group[new_cols] = rolling_stats

    # Rows with NaN rolling values are dropped --> For example the earliest rows, which do not have `window` rows before them
    group = group.dropna(subset = new_cols)
    return group

In [91]:
# Per-match statistics that receive a rolling average
columns_for_ra = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt"]
# Each derived column is named after its source column plus a "_rolling" suffix
new_cols = [stat_name + "_rolling" for stat_name in columns_for_ra]

In [92]:
# Compute the rolling averages separately for every team so that one team's matches never feed another team's averages
# The result carries "team" as an extra outer index level, which is removed in the next cell
# NOTE(review): the captured output below shows a warning for this line --> presumably the pandas deprecation about apply operating on the grouping columns; confirm before upgrading pandas
matches_rolling = matches.groupby("team").apply(lambda x : rolling_averages(x, columns_for_ra, new_cols))

  matches_rolling = matches.groupby("team").apply(lambda x : rolling_averages(x, columns_for_ra, new_cols))


In [93]:
# Remove the "team" index level added by the groupby --> the team name is still available as a regular column
matches_rolling = matches_rolling.droplevel('team')

In [94]:
# Replace the leftover row labels with a fresh 0..n-1 index so that every row has a unique label
matches_rolling = matches_rolling.reset_index(drop = True)

In [95]:
matches_rolling

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
0,2023-08-12,12:30,Premier League,Matchweek 1,Sat,Home,W,2,1,Nott'ham Forest,...,5,1,2.000000,1.000000,15.000000,7.000000,19.100000,0.000000,0.000000,0.000000
1,2023-08-12,12:30,Premier League,Matchweek 1,Sat,Home,W,2,1,Nott'ham Forest,...,5,1,2.000000,1.000000,15.000000,7.000000,19.100000,0.000000,0.000000,0.000000
2,2023-08-21,20:00,Premier League,Matchweek 2,Mon,Away,W,1,0,Crystal Palace,...,0,1,2.000000,1.000000,15.000000,7.000000,19.100000,0.000000,0.000000,0.000000
3,2023-08-21,20:00,Premier League,Matchweek 2,Mon,Away,W,1,0,Crystal Palace,...,0,1,1.666667,0.666667,14.333333,5.333333,18.200000,0.000000,0.333333,0.333333
4,2023-08-21,20:00,Premier League,Matchweek 2,Mon,Away,W,1,0,Crystal Palace,...,0,1,1.333333,0.333333,13.666667,3.666667,17.300000,0.000000,0.666667,0.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3735,2024-05-19,16:00,Premier League,Matchweek 38,Sun,Away,L,0,2,Liverpool,...,6,0,1.000000,3.000000,14.000000,8.000000,14.600000,0.000000,0.000000,0.000000
3736,2024-05-19,16:00,Premier League,Matchweek 38,Sun,Away,L,0,2,Liverpool,...,6,0,0.666667,2.666667,10.666667,6.000000,16.066667,0.333333,0.000000,0.000000
3737,2024-05-19,16:00,Premier League,Matchweek 38,Sun,Away,L,0,2,Liverpool,...,6,0,0.333333,2.333333,7.333333,4.000000,17.533333,0.666667,0.000000,0.000000
3738,2024-05-19,16:00,Premier League,Matchweek 38,Sun,Away,L,0,2,Liverpool,...,6,0,0.000000,2.000000,4.000000,2.000000,19.000000,1.000000,0.000000,0.000000


In [96]:
def make_prediction(data, predictors, cutoff = '2024-04-05'):
    """Fit the notebook-level ``rf`` model on earlier matches and score it on later ones.

    Rows dated strictly before ``cutoff`` form the training set and rows dated
    on or after ``cutoff`` form the test set, so no dated row is left out of
    both splits. ``rf`` is refit on every call.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a "date" column, a 0/1 "target" column and every column
        named in ``predictors``.
    predictors : list of str
        Feature columns passed to the model.
    cutoff : date-like, default '2024-04-05'
        First date that belongs to the test set.

    Returns
    -------
    combined : pandas.DataFrame
        Indexed like the test rows, with columns "actual" and "prediction"
        --> A loss or draw is 0 and a win is 1.
    precision : float
        ``precision_score`` of the predictions against the actual targets.
    """
    # Training data: every match played before the cutoff date
    train = data[data["date"] < cutoff]

    # Test data: every match played on or after the cutoff date
    test = data[data["date"] >= cutoff]

    # Fitting our model on the training data with the predefined target column
    rf.fit(train[predictors], train["target"])

    preds = rf.predict(test[predictors])

    # Keeping the index of the test rows lets the caller join the match details back on
    combined = pd.DataFrame(dict(actual = test["target"], prediction = preds))

    precision = precision_score(test["target"], preds)

    return combined, precision

In [97]:
# Train and evaluate with the base predictors plus the rolling-average columns
combined, precision = make_prediction(matches_rolling, predictors + new_cols)

In [98]:
precision

0.8075601374570447

In [99]:
# Attach the match details to every prediction --> rows line up by index label because combined kept the index of the test rows of matches_rolling
# Note: running this cell a second time without re-running the previous cell merges again and produces suffixed duplicate columns
combined = combined.merge(matches_rolling[["date", "team", "opponent", "result"]], left_index = True, right_index = True)

In [100]:
combined

Unnamed: 0,actual,prediction,date,team,opponent,result
147,1,1,2024-04-06,Arsenal,Brighton,W
148,1,1,2024-04-06,Arsenal,Brighton,W
149,1,1,2024-04-06,Arsenal,Brighton,W
150,1,1,2024-04-06,Arsenal,Brighton,W
151,1,1,2024-04-06,Arsenal,Brighton,W
...,...,...,...,...,...,...
3735,0,0,2024-05-19,Wolverhampton Wanderers,Liverpool,L
3736,0,0,2024-05-19,Wolverhampton Wanderers,Liverpool,L
3737,0,0,2024-05-19,Wolverhampton Wanderers,Liverpool,L
3738,0,0,2024-05-19,Wolverhampton Wanderers,Liverpool,L


In [101]:
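# Some teams are named differently in the two columns --> e.g. Wolves is listed as "Wolves" in the opponent column but as "Wolverhampton Wanderers" in the team column, so the two sides of the same match would be treated as separate teams
# So we create a dict subclass that maps these long names to the short ones and returns every other name unchanged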

class Missing_Dictionary(dict):
    """A dict whose item lookup returns the key itself when the key is absent."""

    def __missing__(self, key):
        # Called by dict item lookup (d[key]) when `key` is not stored; nothing is inserted
        return key

# Long club names (as used in the "team" column) keyed to the short names used in the "opponent" column
map_values = {
    "Wolverhampton Wanderers": "Wolves",
    "Brighton and Hove Albion": "Brighton",
    "Newcastle United": "Newcastle Utd",
    "Manchester United": "Manchester Utd",
    "West Ham United": "West Ham",
    "Tottenham Hotspur": "Tottenham",
}

# Names that are not listed above are returned unchanged by Missing_Dictionary
mapping = Missing_Dictionary(map_values)

In [102]:
# Normalise the team names to the short form used in the opponent column so that both columns can be matched against each other
combined["new_team"] = combined["team"].map(mapping)

In [103]:
# This will essentially show both the away and home side of a single matchup in one row for analysis
# Self-merge: a row is paired with the row of the same date whose opponent equals this row's new_team --> columns of the left side get the _x suffix, columns of the right side get _y
# NOTE(review): the displayed result repeats identical rows, which suggests the input contains duplicated matches that multiply in this merge --> confirm
both_away_and_home = combined.merge(combined, left_on = ["date", "new_team"], right_on = ["date", "opponent"])

In [104]:
both_away_and_home

Unnamed: 0,actual_x,prediction_x,date,team_x,opponent_x,result_x,new_team_x,actual_y,prediction_y,team_y,opponent_y,result_y,new_team_y
0,1,1,2024-04-06,Arsenal,Brighton,W,Arsenal,0,0,Brighton and Hove Albion,Arsenal,L,Brighton
1,1,1,2024-04-06,Arsenal,Brighton,W,Arsenal,0,0,Brighton and Hove Albion,Arsenal,L,Brighton
2,1,1,2024-04-06,Arsenal,Brighton,W,Arsenal,0,0,Brighton and Hove Albion,Arsenal,L,Brighton
3,1,1,2024-04-06,Arsenal,Brighton,W,Arsenal,0,0,Brighton and Hove Albion,Arsenal,L,Brighton
4,1,1,2024-04-06,Arsenal,Brighton,W,Arsenal,0,0,Brighton and Hove Albion,Arsenal,L,Brighton
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3470,0,0,2024-05-19,Wolverhampton Wanderers,Liverpool,L,Wolves,1,1,Liverpool,Wolves,W,Liverpool
3471,0,0,2024-05-19,Wolverhampton Wanderers,Liverpool,L,Wolves,1,1,Liverpool,Wolves,W,Liverpool
3472,0,0,2024-05-19,Wolverhampton Wanderers,Liverpool,L,Wolves,1,1,Liverpool,Wolves,W,Liverpool
3473,0,0,2024-05-19,Wolverhampton Wanderers,Liverpool,L,Wolves,1,1,Liverpool,Wolves,W,Liverpool
