In [91]:
import pandas as pd

# using pandas to import csv
matches = pd.read_csv("matches.csv", index_col=0)

matches.shape

38 * 20 * 3 - 360

# checking data was scraped correctly
matches["team"].value_counts()
matches["round"].value_counts()

# checking data types are compatible for ML and converting date from object(string) to datetime64(int)
matches.dtypes
matches['date'] = pd.to_datetime(matches['date'])

matches.head()

# creating cols compatible with ML as it needs ints for the algorithm (cannot use strings/objects in calculation)
matches['venue_code'] = matches['venue'].astype('category').cat.codes
matches['opp_code'] = matches['opponent'].astype('category').cat.codes
matches['hour'] = matches['time'].str.replace(':.+', '', regex=True).astype('int')
matches['day_code'] = matches['date'].dt.dayofweek
matches['result_code'] = (matches['result'] == 'W').astype('int')

matches

from sklearn.ensemble import RandomForestClassifier

# init a random forest algorithm
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)

# separate data between what the random forest trains with (past data) and what it will test (future data)
train = matches[matches['date'] < '2023/01/01']
test = matches[matches['date'] > '2023/01/01']

# select the data that predictions will be made with
predictors = ['venue_code', 'opp_code', 'hour', 'day_code']

rf.fit(train[predictors], train['result_code'])

predictions = rf.predict(test[predictors])

from sklearn.metrics import accuracy_score

acc = accuracy_score(test['result_code'], predictions)

acc

combined = pd.DataFrame(dict(actual=test['result_code'], prediction=predictions))
pd.crosstab(index=combined['actual'], columns=combined['prediction'])

from sklearn.metrics import precision_score

precision_score(test['result_code'], predictions)


grouped_matches = matches.groupby('team')

group = grouped_matches.get_group('Newcastle United').sort_values('date')

group

def rolling_averages(group, cols, new_cols):
  group = group.sort_values('date')
  rolling_stats = group[cols].rolling(5, closed='left').mean()
  group[new_cols] = rolling_stats
  group = group.dropna(subset=new_cols)
  return group

cols = ['gf', 'ga', 'sh', 'sot', 'dist', 'fk', 'pk', 'pkatt']
new_cols = [f'{c}_rolling' for c in cols]

rolling_averages(group, cols, new_cols)

matches_rolling = matches.groupby('team').apply(lambda x: rolling_averages(x, cols, new_cols))

matches_rolling = matches_rolling.droplevel('team')

matches_rolling.index = range(matches_rolling.shape[0])

all_preds = predictors + new_cols

def make_predictions(data, predictors):
    train = data[data["date"] < '2022-01-01']
    test = data[data["date"] > '2022-01-01']
    rf.fit(train[predictors], train["result_code"])
    preds = rf.predict(test[predictors])
    combined = pd.DataFrame(dict(actual=test["result_code"], predicted=preds), index=test.index)
    error = precision_score(test["result_code"], preds)
    return combined, error

combined, error = make_predictions(matches_rolling, predictors + new_cols)

error

combined = combined.merge(matches_rolling[['date', 'team', 'opponent', 'result']], left_index=True, right_index=True)

combined

class MissingDict(dict):
  __missing__ = lambda self, key: key

map_values = {
    'Brighton and Hove Albion': 'Brighton',
    'Manchester United': 'Manchester Utd',
    'Newcastle United': 'Newcastle Utd',
    'Tottenham Hotspur': 'Tottenham',
    'West Ham United': 'West Ham',
    'Wolverhampton Wanderers': 'Wolves'
}

mapping = MissingDict(**map_values)

combined['new_team'] = combined['team'].map(mapping)

merged = combined.merge(combined, left_on=['date', 'new_team'], right_on=['date', 'opponent'])

merged[(merged['predicted_x'] == 1) & (merged['predicted_y'] == 0)]['actual_x'].value_counts()

161/289

0.5570934256055363