In [47]:
import pandas as pd


In [48]:
matches = pd.read_csv('matches.csv', index_col = 0)

In [49]:
matches.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
1,2024-08-18,16:30,Premier League,Matchweek 1,Sun,Away,W,2.0,0.0,Chelsea,...,Match Report,,11.0,5.0,19.1,0.0,0,0,2025,Manchester City
2,2024-08-24,15:00,Premier League,Matchweek 2,Sat,Home,W,4.0,1.0,Ipswich Town,...,Match Report,,13.0,4.0,17.8,1.0,1,1,2025,Manchester City
3,2024-08-31,17:30,Premier League,Matchweek 3,Sat,Away,W,3.0,1.0,West Ham,...,Match Report,,23.0,8.0,15.0,1.0,0,0,2025,Manchester City
0,2024-08-17,15:00,Premier League,Matchweek 1,Sat,Away,W,3.0,0.0,Everton,...,Match Report,,10.0,5.0,13.8,0.0,0,0,2025,Brighton and Hove Albion
1,2024-08-24,12:30,Premier League,Matchweek 2,Sat,Home,W,2.0,1.0,Manchester Utd,...,Match Report,,14.0,4.0,14.2,1.0,0,0,2025,Brighton and Hove Albion


In [50]:
matches.dtypes

date              object
time              object
comp              object
round             object
day               object
venue             object
result            object
gf               float64
ga               float64
opponent          object
xg               float64
xga              float64
poss             float64
attendance       float64
captain           object
formation         object
opp formation     object
referee           object
match report      object
notes            float64
sh               float64
sot              float64
dist             float64
fk               float64
pk                 int64
pkatt              int64
season             int64
team              object
dtype: object

In [51]:
matches['date'] = pd.to_datetime(matches['date'])

In [52]:
matches['venue_code'] = matches['venue'].astype('category').cat.codes

In [53]:
matches['opp_code'] = matches['opponent'].astype('category').cat.codes

In [54]:
matches['hour'] = matches['time'].str.replace(':.+','', regex=True).astype('int')

In [55]:
matches['day_code'] = matches['date'].dt.dayofweek

In [56]:
matches['target'] = (matches['result'] == 'W').astype('int')

In [57]:
%pip install scikit-learn
from sklearn.ensemble import RandomForestClassifier

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [58]:
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)

In [59]:
train = matches[matches['date'] < '2022-05-22']

In [60]:
test = matches[matches['date'] > '2022-05-22']

In [61]:
predictors = ['venue_code', 'opp_code', 'hour', 'day_code']

In [62]:
rf.fit(train[predictors], train['target'])

In [63]:
preds = rf.predict(test[predictors])

In [64]:
from sklearn.metrics import accuracy_score

In [65]:
acc = accuracy_score(test['target'], preds)

In [66]:
acc

0.5974683544303797

In [67]:
combined = pd.DataFrame(dict(actual=test['target'], prediction=preds))

In [68]:
combined

Unnamed: 0,actual,prediction
1,1,0
2,1,1
3,1,1
0,1,0
1,1,0
...,...,...
42,0,1
43,0,0
44,0,1
45,0,0


In [69]:
pd.crosstab(index=combined['actual'], columns=combined['prediction'])

prediction,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,751,215
1,421,193


In [70]:
from sklearn.metrics import precision_score

In [71]:
precision_score(test['target'], preds)

0.4730392156862745

In [72]:
grouped_matches = matches.groupby('team')

In [73]:
group = grouped_matches.get_group('Manchester United')

In [74]:
group

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,fk,pk,pkatt,season,team,venue_code,opp_code,hour,day_code,target
1,2024-08-16,20:00,Premier League,Matchweek 1,Fri,Home,W,1.0,0.0,Fulham,...,0.0,0,0,2025,Manchester United,1,9,20,4,1
2,2024-08-24,12:30,Premier League,Matchweek 2,Sat,Away,L,1.0,2.0,Brighton,...,0.0,0,0,2025,Manchester United,0,4,12,5,0
3,2024-09-01,16:00,Premier League,Matchweek 3,Sun,Home,L,0.0,3.0,Liverpool,...,0.0,0,0,2025,Manchester United,1,13,16,6,0
0,2023-08-14,20:00,Premier League,Matchweek 1,Mon,Home,W,1.0,0.0,Wolves,...,0.0,0,0,2024,Manchester United,1,26,20,0,1
1,2023-08-19,17:30,Premier League,Matchweek 2,Sat,Away,L,0.0,2.0,Tottenham,...,1.0,0,0,2024,Manchester United,0,22,17,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55,2021-05-09,14:05,Premier League,Matchweek 35,Sun,Away,W,3.0,1.0,Aston Villa,...,0.0,1,1,2021,Manchester United,0,1,14,6,1
56,2021-05-11,18:00,Premier League,Matchweek 36,Tue,Home,L,1.0,2.0,Leicester City,...,0.0,0,0,2021,Manchester United,1,12,18,1,0
57,2021-05-13,20:15,Premier League,Matchweek 34,Thu,Home,L,2.0,4.0,Liverpool,...,1.0,0,0,2021,Manchester United,1,13,20,3,0
58,2021-05-18,18:00,Premier League,Matchweek 37,Tue,Home,D,1.0,1.0,Fulham,...,2.0,0,0,2021,Manchester United,1,9,18,1,0


In [75]:
def rolling_averages(group, cols, new_cols):
    group = group.sort_values('date')
    rolling_stats = group[cols].rolling(3, closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

In [76]:
cols = ['gf','ga','sh', 'sot','dist', 'fk', 'pk', 'pkatt']
new_cols = [f'{c}_rolling' for c in cols]

In [77]:
rolling_averages(group, cols, new_cols)

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
5,2020-10-17,20:00,Premier League,Matchweek 5,Sat,Away,W,4.0,1.0,Newcastle Utd,...,5,1,1.666667,3.666667,9.000000,2.333333,21.266667,0.333333,0.666667,0.666667
7,2020-10-24,17:30,Premier League,Matchweek 6,Sat,Home,D,0.0,0.0,Chelsea,...,5,0,2.666667,3.000000,12.333333,4.666667,21.366667,0.000000,0.666667,1.000000
9,2020-11-01,16:30,Premier League,Matchweek 7,Sun,Home,L,0.0,1.0,Arsenal,...,6,0,1.666667,2.333333,15.000000,5.333333,21.266667,0.000000,0.333333,0.666667
11,2020-11-07,12:30,Premier League,Matchweek 8,Sat,Away,W,3.0,1.0,Everton,...,5,1,1.333333,0.666667,16.333333,5.666667,19.433333,0.000000,0.000000,0.333333
12,2020-11-21,20:00,Premier League,Matchweek 9,Sat,Home,W,1.0,0.0,West Brom,...,5,1,1.000000,0.666667,11.666667,3.666667,19.633333,0.666667,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49,2024-05-15,20:00,Premier League,Matchweek 34,Wed,Home,W,3.0,2.0,Newcastle Utd,...,2,1,0.333333,2.000000,16.000000,4.666667,19.200000,0.333333,0.000000,0.000000
50,2024-05-19,16:00,Premier League,Matchweek 38,Sun,Away,W,2.0,0.0,Brighton,...,6,1,1.000000,2.333333,12.666667,4.000000,18.266667,0.333333,0.000000,0.000000
1,2024-08-16,20:00,Premier League,Matchweek 1,Fri,Home,W,1.0,0.0,Fulham,...,4,1,1.666667,1.000000,14.000000,4.666667,18.366667,0.000000,0.000000,0.000000
2,2024-08-24,12:30,Premier League,Matchweek 2,Sat,Away,L,1.0,2.0,Brighton,...,5,0,2.000000,0.666667,14.000000,5.666667,16.600000,0.000000,0.000000,0.000000


In [78]:
matches_rolling = matches.groupby('team').apply(lambda x: rolling_averages(x, cols, new_cols))

  matches_rolling = matches.groupby('team').apply(lambda x: rolling_averages(x, cols, new_cols))


In [79]:
matches_rolling = matches_rolling.droplevel('team')

In [80]:
matches_rolling.index = range(matches_rolling.shape[0])

In [81]:
def make_predictions(data, predictors):
    train = data[data['date'] < '2022-05-22']
    test = data[data['date'] > '2022-05-22']
    rf.fit(train[predictors], train['target'])
    preds = rf.predict(test[predictors])
    combined = pd.DataFrame(dict(actual=test['target'], prediction=preds), index=test.index)
    precision = precision_score(test['target'], preds)
    return combined, precision

In [82]:
combined, precision, = make_predictions(matches_rolling, predictors + new_cols)

In [83]:
precision 

0.5583126550868487

In [84]:
combined = combined.merge(matches_rolling[['date', 'team', 'opponent', 'result']], left_index=True, right_index=True)

In [85]:
combined

Unnamed: 0,actual,prediction,date,team,opponent,result
73,1,0,2022-08-05,Arsenal,Crystal Palace,W
74,1,0,2022-08-13,Arsenal,Leicester City,W
75,1,1,2022-08-20,Arsenal,Bournemouth,W
76,1,0,2022-08-27,Arsenal,Fulham,W
77,1,1,2022-08-31,Arsenal,Aston Villa,W
...,...,...,...,...,...,...
3011,0,0,2024-05-11,Wolverhampton Wanderers,Crystal Palace,L
3012,0,0,2024-05-19,Wolverhampton Wanderers,Liverpool,L
3013,0,0,2024-08-17,Wolverhampton Wanderers,Arsenal,L
3014,0,0,2024-08-25,Wolverhampton Wanderers,Chelsea,L


In [86]:
class MissingDict(dict):
    __missing__ = lambda self, key: key

map_values = {
    'Brighton and Hove Albion': 'Brighton',
    'Manchester United' : 'Manchester Utd',
    'Newcastle United': 'Newcastle Utd',
    'Tottenham Hotspur': 'Tottenham',
    'West Ham United' : 'West Ham',
    'Wolverhampton Wanderers': 'Wolves'
}
mapping = MissingDict(**map_values)

In [87]:
combined['new_team'] = combined['team'].map(mapping)

In [88]:
merged =  combined.merge(combined, left_on=['date', 'new_team'], right_on=['date', 'opponent'])

In [89]:
merged

Unnamed: 0,actual_x,prediction_x,date,team_x,opponent_x,result_x,new_team_x,actual_y,prediction_y,team_y,opponent_y,result_y,new_team_y
0,1,0,2022-08-05,Arsenal,Crystal Palace,W,Arsenal,0,0,Crystal Palace,Arsenal,L,Crystal Palace
1,1,0,2022-08-13,Arsenal,Leicester City,W,Arsenal,0,0,Leicester City,Arsenal,L,Leicester City
2,1,0,2022-08-27,Arsenal,Fulham,W,Arsenal,0,0,Fulham,Arsenal,L,Fulham
3,1,1,2022-08-31,Arsenal,Aston Villa,W,Arsenal,0,0,Aston Villa,Arsenal,L,Aston Villa
4,0,1,2022-09-04,Arsenal,Manchester Utd,L,Arsenal,1,0,Manchester United,Arsenal,W,Manchester Utd
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1437,0,0,2024-05-11,Wolverhampton Wanderers,Crystal Palace,L,Wolves,1,0,Crystal Palace,Wolves,W,Crystal Palace
1438,0,0,2024-05-19,Wolverhampton Wanderers,Liverpool,L,Wolves,1,1,Liverpool,Wolves,W,Liverpool
1439,0,0,2024-08-17,Wolverhampton Wanderers,Arsenal,L,Wolves,1,1,Arsenal,Wolves,W,Arsenal
1440,0,0,2024-08-25,Wolverhampton Wanderers,Chelsea,L,Wolves,1,0,Chelsea,Wolves,W,Chelsea


In [90]:
merged[(merged['prediction_x'] == 1) & (merged['prediction_y'] == 0)]['actual_x'].value_counts()

actual_x
1    194
0    138
Name: count, dtype: int64

In [91]:
194 / (194 + 138)

0.5843373493975904