In [None]:
# fetching data from csv file

import pandas as pd

In [None]:
# Load the match log from "matches.csv" in the working directory.
matches = pd.read_csv("matches.csv")


In [None]:
# Preview the first rows to see which columns are available.
matches.head()

In [None]:
# (number of rows, number of columns) of the raw frame.
matches.shape

In [None]:
# Number of rows per team.
matches["team"].value_counts()


In [None]:

# Spot check: every row belonging to one team (Liverpool).
matches[matches["team"] == "Liverpool"]

In [None]:
# Number of rows per value of the "round" column.
matches["round"].value_counts()

In [None]:
# Column dtypes before any conversion.
matches.dtypes

In [None]:
# The model needs numeric inputs, so non-numeric columns are converted or encoded below.
matches['date'] = pd.to_datetime(matches['date']) # parse the date column into a datetime dtype
matches.dtypes

In [None]:
# Predictors.
# Venue: string -> category -> integer code.
matches['venue_code'] = matches['venue'].astype('category').cat.codes

In [None]:
# Opponent: same string -> category -> integer code encoding.
matches["opp_codes"] = matches['opponent'].astype('category').cat.codes


In [None]:
matches

In [None]:
# Kick-off hour: drop everything from the first ":" onward and cast what is left to int.
matches["hour"] = matches['time'].str.replace(":.+", "", regex=True).astype("int")

In [None]:
matches


In [None]:
# Day of the week as an integer (pandas convention: Monday=0 ... Sunday=6).
matches["day_code"] = matches['date'].dt.dayofweek

In [None]:
matches


In [None]:
# Binary target: 1 when the result is "W", 0 for anything else.
matches["target"] = (matches['result'] == "W").astype("int")

In [None]:
matches

In [None]:
# creating model
from sklearn.ensemble import RandomForestClassifier #Random Forest model can pickup non linear relationships


In [None]:
# A random forest is an ensemble of decision trees.
# n_estimators: number of trees; more trees generally give more stable predictions
#   but take longer to train.
# min_samples_split: minimum number of samples a node must hold before it may be split;
#   higher values reduce overfitting but can lower accuracy on the training data.
# random_state: fixed seed so that repeated runs give the same result.
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)

In [None]:
# Time-based split: rows dated before 2022-01-01 train the model, the rest test it.
train = matches[matches['date'] < '2022-01-01']
test = matches[matches['date'] >= '2022-01-01']

In [None]:
# Feature columns created by the encoding cells above.
predictors = ['venue_code', 'opp_codes', 'hour', 'day_code']

In [None]:
# Fit on the training rows only.
rf.fit(train[predictors], train['target'])

In [None]:
# Predict the target for the held-out rows.
preds = rf.predict(test[predictors])

In [None]:
from sklearn.metrics import accuracy_score, precision_score

In [None]:
# Accuracy: share of test rows whose prediction equals the actual target.
acc = accuracy_score(test['target'], preds)

In [None]:
acc

In [None]:
# Actual and predicted labels side by side, one row per test match.
combined = pd.DataFrame(dict(actual=test['target'], prediction=preds))

In [None]:
# Confusion table: rows are the actual label, columns the predicted label.
pd.crosstab(index=combined["actual"], columns=combined["prediction"])

In [None]:
# Precision: of the rows predicted as 1, the fraction whose actual target is 1.
precision_score(test['target'], preds)


In [None]:
# Group the rows by team so that each team's matches can be processed separately.
grouped_matches = matches.groupby("team")

In [None]:
# Pull out one team's rows to try the rolling computation on.
group = grouped_matches.get_group("Liverpool")

In [None]:
def rolling_averages(group, col, new_cols, window=3):
    """Add rolling means of past matches to one team's frame.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to process; must contain a "date" column and every column named
        in ``col``. The caller's frame is not modified.
    col : list of str (or a single str)
        Columns to average.
    new_cols : list of str (or a single str)
        Names of the columns that receive the averages, in the same order as
        ``col``.
    window : int, default 3
        Number of previous rows averaged for each row.

    Returns
    -------
    pandas.DataFrame
        A date-sorted copy of ``group`` with the ``new_cols`` columns added.
        Rows without a full window of earlier rows are dropped.
    """
    # sort_values returns a new frame, so the assignments below never touch
    # the caller's data.
    group = group.sort_values("date")
    # closed='left' leaves the current row out of its own window, so each
    # average uses only earlier rows. Without .mean() this would be a Rolling
    # object rather than numbers.
    rolling_stats = group[col].rolling(window, closed='left').mean()
    group[new_cols] = rolling_stats
    # The first `window` rows have no complete history and hold NaN.
    group = group.dropna(subset=new_cols)
    return group

In [None]:
# Per-match statistics to average, and the names of the columns that will
# hold those averages (same order, "_rolling" appended).
cols = [
    "gf", "ga",
    "sh", "sot", "dist",
    "fk", "pk", "pkatt",
]
new_cols = [stat + "_rolling" for stat in cols]



In [None]:
new_cols

In [None]:
# Try the helper on the single-team frame first.
rolling_averages(group, cols, new_cols)

In [None]:
# Apply the helper to every team's rows. The result carries a "team" index
# level, which is dropped in a later cell.
matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))

In [None]:
matches_rolling

In [None]:
# Remove the "team" index level.
matches_rolling = matches_rolling.droplevel("team")

In [None]:
matches_rolling

In [None]:
# Replace the index with 0..n-1.
matches_rolling.index = range(matches_rolling.shape[0])

In [None]:
matches_rolling

In [None]:
# retraining the model for rolling averages features
def make_predictions(data, predictors, model=rf, cutoff='2022-01-01'):
    """Fit ``model`` on matches before ``cutoff`` and score it on the rest.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a "date" column, a "target" column and every column
        named in ``predictors``.
    predictors : list of str
        Feature column names.
    model : estimator with ``fit(X, y)`` and ``predict(X)``
        Defaults to the notebook's ``rf``. The estimator is fitted in place.
    cutoff : str, default '2022-01-01'
        Rows with ``date < cutoff`` are used for training; rows with
        ``date >= cutoff`` are used for testing.

    Returns
    -------
    (combined, precision)
        ``combined`` is a DataFrame indexed like the test rows with columns
        "actual" and "prediction"; ``precision`` is the precision score of
        the predictions on the test rows.
    """
    train = data[data['date'] < cutoff]
    test = data[data['date'] >= cutoff]

    # Use the estimator that was passed in. Previously the global `rf` was
    # always used and the `model` argument was silently ignored.
    model.fit(train[predictors], train['target'])

    preds = model.predict(test[predictors])
    # Keep the test index so the result can be joined back to the match rows.
    combined = pd.DataFrame(dict(actual=test['target'], prediction=preds), index=test.index)
    precision = precision_score(test['target'], preds)
    return combined, precision

In [None]:
# Retrain and evaluate using the original predictors plus the rolling-average columns.
combined, precision = make_predictions(matches_rolling, predictors + new_cols)  


In [None]:
precision

In [None]:
combined

In [None]:
# Attach team, opponent, date and result to each prediction by joining on the row index.
combined = combined.merge(matches_rolling[["team", "opponent", "date", "result"]], left_index=True, right_index=True)

In [None]:
combined

In [None]:
# combining home and away predictions
class MissingDict(dict):
    """dict whose lookups fall back to the key itself when it is absent.

    ``d[key]`` returns ``key`` for unknown keys instead of raising KeyError.
    Nothing is inserted on a miss, and ``get`` / ``in`` behave like a plain
    dict.
    """

    def __missing__(self, key):
        return key


# Long team names -> short names. Presumably the short forms are the ones
# used in the "opponent" column; verify against the data.
maps_values = {
    "Brighton and Hove Albion": "Brighton",
    "Manchester United": "Manchester Utd",
    "Newcastle United": "Newcastle Utd",  # was misspelled "Newcaste Utd"
    "Tottenham Hotspur": "Tottenham",
    "West Ham United": "West Ham",
    "Wolverhampton Wanderers": "Wolves"
}

# Names without an entry map to themselves.
mapping = MissingDict(maps_values)

In [None]:
# A name without an entry comes back unchanged.
mapping["Arsenal"]


In [None]:
# A name with an entry is translated to its short form.
mapping["West Ham United"]

In [None]:
# Translate team names through the mapping. Series.map honours __missing__ on
# dict subclasses, so names without an entry are kept rather than becoming NaN.
combined["new_teams"] = combined["team"].map(mapping)