<a href="https://colab.research.google.com/github/whitehatjr1001/Football-analysis/blob/main/FootBallpredictions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [50]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np 
import requests
import warnings 
from sklearn.model_selection import train_test_split 
from sklearn import metrics 
from sklearn.ensemble import RandomForestClassifier
from tabulate import tabulate
from sklearn.model_selection import GridSearchCV

In [26]:
# Silence every library warning for the rest of the notebook.
warnings.filterwarnings('ignore')
url = "https://www.betexplorer.com/soccer/england/premier-league/results/"
# A timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors instead of silently parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

In [27]:
def get_odd_or_text(td):
    """Return the ``data-odd`` value carried by a cell, or its stripped text.

    Lookup order:
      1. the ``data-odd`` attribute on ``td`` itself;
      2. the ``data-odd`` attribute of the first descendant matching ``[data-odd]``;
      3. the cell's text with surrounding whitespace stripped.
    """
    attr = "data-odd"
    if attr not in td.attrs:
        nested = td.select_one("[data-odd]")
        if not nested:
            # No odds anywhere in this cell: fall back to its visible text.
            return td.get_text(strip=True)
        return nested[attr]
    return td[attr]


In [28]:
# Collect one record per table row that contains <td> cells.
all_data = []
for row in soup.select(".table-main tr:has(td)"):
    # Label taken from the first <th> of the <tr> found before the nearest earlier <th>.
    round_ = row.find_previous("th").find_previous("tr").th.text
    tds = [get_odd_or_text(td) for td in row.select("td")]
    all_data.append([round_] + tds)

column_names = ["Round", "Match", "Score", "1", "X", "2", "Date"]
df = pd.DataFrame(all_data, columns=column_names)

# "Home-Away" match label -> separate team columns.
df['Home'] = df['Match'].map(lambda match: match.split('-')[0])
df['Away'] = df['Match'].map(lambda match: match.split('-')[1])

In [None]:
# Sanity check: (rows, columns) of the scraped table.
df.shape

(281, 9)

In [29]:
# Reverse the row order; the original index labels are kept.
df = df.iloc[::-1]

# Odds columns: '1' feeds HomeWin_Prob, 'X' feeds DrawWin_Prob, '2' feeds AwayWin_Prob.
cols = ['1', 'X', '2']

# Values that cannot be parsed as numbers become NaN.
df[cols] = df[cols].apply(pd.to_numeric, errors='coerce')

# Reciprocal of each odds column, rounded to two decimals.
for odds_col, prob_col in zip(cols, ('HomeWin_Prob', 'DrawWin_Prob', 'AwayWin_Prob')):
    df[prob_col] = round(1 / df[odds_col], 2)

# Everything before the first ':' of the score string.
df['HomeGoals'] = [score.partition(':')[0] for score in df['Score']]



In [30]:
# Everything after the first ':' of the score string; '' when the score has no ':'.
df['AwayGoals'] = [score.partition(':')[2] for score in df['Score']]

In [31]:


# NOTE(review): this cell repeats the odds conversion and goal parsing that the two
# preceding cells already perform; re-running it recomputes the same columns.
cols = ['1', 'X', '2']
df[cols] = df[cols].apply(pd.to_numeric, errors='coerce')

# Reciprocal of each odds column, rounded to two decimals.
for odds_col, prob_col in zip(cols, ('HomeWin_Prob', 'DrawWin_Prob', 'AwayWin_Prob')):
    df[prob_col] = round(1 / df[odds_col], 2)

# Split "H:A" scores at the first ':'; a score without ':' gives '' for the away side.
df['HomeGoals'] = [score.partition(':')[0] for score in df['Score']]
df['AwayGoals'] = [score.partition(':')[2] for score in df['Score']]

In [32]:
def result(df):
    """Label one match row by comparing its goal counts numerically.

    Parameters
    ----------
    df : mapping or row
        Must provide 'HomeGoals' and 'AwayGoals'. The values may be strings
        (as parsed from the score text) or numbers.

    Returns
    -------
    1 for a home win, 0 for a draw, 2 for an away win.
    NaN when either goal count cannot be read as an integer (for example an
    empty or non-numeric score), so such rows can be removed with ``dropna``.
    Note that a NaN in the resulting column makes pandas store the labels as floats.
    """
    try:
        # Goals arrive as strings; comparing them as text would rank '10' below '2'.
        home_goals = int(df['HomeGoals'])
        away_goals = int(df['AwayGoals'])
    except (TypeError, ValueError):
        return float('nan')

    if home_goals > away_goals:
        return 1
    if home_goals == away_goals:
        return 0
    return 2
  





In [33]:
# Label every match by passing each row to result().
df['Result'] = df.apply(result, axis='columns')

In [None]:
# Display the frame, including the derived probability, goal and Result columns.
df

Unnamed: 0,Round,Match,Score,1,X,2,Date,Home,Away,HomeWin_Prob,DrawWin_Prob,AwayWin_Prob,HomeGoals,AwayGoals,Result
280,1. Round,Crystal Palace-Arsenal,0:2,4.58,3.57,1.84,05.08.2022,Crystal Palace,Arsenal,0.22,0.28,0.54,0,2,2
279,1. Round,Tottenham-Southampton,4:1,1.36,5.20,8.69,06.08.2022,Tottenham,Southampton,0.74,0.19,0.12,4,1,1
278,1. Round,Newcastle-Nottingham,2:0,1.58,4.04,6.19,06.08.2022,Newcastle,Nottingham,0.63,0.25,0.16,2,0,1
277,1. Round,Leeds-Wolves,2:1,2.43,3.34,3.00,06.08.2022,Leeds,Wolves,0.41,0.30,0.33,2,1,1
276,1. Round,Fulham-Liverpool,2:2,10.45,6.10,1.28,06.08.2022,Fulham,Liverpool,0.10,0.16,0.78,2,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,29. Round,Bournemouth-Fulham,2:1,2.56,3.26,2.89,01.04.,Bournemouth,Fulham,0.39,0.31,0.35,2,1,1
3,29. Round,Arsenal-Leeds,4:1,1.26,6.33,10.91,01.04.,Arsenal,Leeds,0.79,0.16,0.09,4,1,1
2,29. Round,West Ham-Southampton,1:0,1.76,3.67,4.99,02.04.,West Ham,Southampton,0.57,0.27,0.20,1,0,1
1,29. Round,Newcastle-Manchester Utd,2:0,2.32,3.48,3.06,02.04.,Newcastle,Manchester Utd,0.43,0.29,0.33,2,0,1


In [34]:
# Remove the raw scrape columns and the goal counts; team names, probabilities and Result remain.
columns_to_drop = ['Round', 'Score', 'Date', '1', 'X', '2', 'Match', 'HomeGoals', 'AwayGoals']
df.drop(columns=columns_to_drop, inplace=True)


In [None]:
# Count missing values per column before cleaning.
df.isna().sum()


Home            0
Away            0
HomeWin_Prob    1
DrawWin_Prob    1
AwayWin_Prob    1
Result          0
dtype: int64

In [35]:
# Discard every row that has at least one missing value.
df.dropna(axis=0, how='any', inplace=True)


In [None]:
# Re-count missing values per column after dropping incomplete rows.
df.isna().sum()

Home            0
Away            0
HomeWin_Prob    0
DrawWin_Prob    0
AwayWin_Prob    0
Result          0
dtype: int64

In [36]:
# Rows from position 250 onward form the hold-out set: no label, fresh 0-based index.
hold_out = df.iloc[250:].drop(columns=['Result']).reset_index(drop=True)
# The first 250 rows stay as the labelled modelling data.
df = df.iloc[:250]

In [37]:
# Team name -> integer code; a team's code is its position in this list.
model_recode = {
    team: code
    for code, team in enumerate([
        'Southampton',
        'Crystal Palace',
        'Fulham',
        'Liverpool',
        'Manchester Utd',
        'Newcastle',
        'Aston Villa',
        'Brentford',
        'Tottenham',
        'West Ham',
        'Chelsea',
        'Leicester',
        'Manchester City',
        'Arsenal',
        'Bournemouth',
        'Everton',
        'Wolves',
        'Nottingham',
        'Leeds',
        'Brighton',
    ])
}


In [38]:
# Replace team names with their integer codes in both the modelling frame and the
# hold-out frame. Each column is mapped exactly once: the previous per-row loop
# re-applied the whole-column mapping on every iteration and relied on
# Series.iteritems(), which pandas 2.0 removed.
# Names missing from model_recode are left unchanged.
for frame in (df, hold_out):
    for side in ('Home', 'Away'):
        frame[side] = frame[side].map(lambda team: model_recode.get(team, team))

# Features are every remaining column except the label.
X = df.drop('Result', axis=1)
y = df['Result']

In [None]:
# Hold back 20% of the labelled rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [39]:
def model_tuning_GS(model, parameter_dict, X=None, y=None, cv=10, scoring="accuracy"):
    """Tune a classifier's hyperparameters with an exhaustive grid search.

    Parameters
    ----------
    model : estimator
        The unfitted classifier whose parameters are searched.
    parameter_dict : dict
        Maps parameter names to the list of candidate values to try.
    X, y : optional
        Training features and labels. When omitted, the notebook-level
        ``X_train`` and ``y_train`` are used, as before.
    cv : int, default 10
        Number of cross-validation folds.
    scoring : str, default "accuracy"
        Metric used to rank parameter combinations.

    Returns
    -------
    The fitted ``GridSearchCV`` object, so callers can reuse ``best_estimator_``
    and ``best_params_`` instead of copying values from the printed report.
    When the call is the last line of a cell, end it with ``;`` to hide the repr.
    """
    if X is None:
        X = X_train
    if y is None:
        y = y_train

    Grid_model = GridSearchCV(estimator=model, param_grid=parameter_dict, cv=cv, scoring=scoring)
    Grid_model.fit(X, y)

    print("GridSearchCV results:", model.__class__.__name__)
    # Estimator refitted with the winning parameter combination.
    print("Best Estimator:\n", Grid_model.best_estimator_)
    # Mean cross-validated score of that estimator.
    print("\n Best Score:\n", Grid_model.best_score_)
    # Parameter setting that achieved the best mean cross-validated score.
    print("\n Best Hyperparameters:\n", Grid_model.best_params_)
    return Grid_model

In [None]:
# Search space for the random forest.
parameter_dict = {
    'n_estimators': list(range(1, 16)),  # 1 through 15 trees
    'max_depth': [5, 9, 10, 20],
    'min_samples_leaf': [0.1, 0.2, 0.3, 0.4, 0.5],
    'criterion': ['gini', 'entropy', 'log_loss'],
}

model_tuning_GS(RandomForestClassifier(random_state=42), parameter_dict)

GridSearchCV results: RandomForestClassifier
Best Estimator:
 RandomForestClassifier(max_depth=5, min_samples_leaf=0.1, n_estimators=10,
                       random_state=42)

 Best Score:
 0.5349999999999999

 Best Hyperparameters:
 {'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 0.1, 'n_estimators': 10}


In [40]:
# Hyperparameters copied from the grid-search report above.
best_params = {
    'criterion': 'gini',
    'max_depth': 5,
    'min_samples_leaf': 0.1,
    'n_estimators': 10,
}
clf = RandomForestClassifier(random_state=42, **best_params)

clf.fit(X_train, y_train)

In [46]:
# Predicted labels for the test split.
yhat = clf.predict(X_test)


In [42]:
# Accuracy on the training split (the model is scored on the data it was fitted on).
# Builtin round() works whether accuracy_score returns a NumPy scalar or a plain float.
print("Accuracy:", round(metrics.accuracy_score(y_train, clf.predict(X_train)), 4))


Accuracy: 0.6


In [48]:
# Accuracy on the test split.
# Builtin round() works whether accuracy_score returns a NumPy scalar or a plain float.
print("Accuracy:", round(metrics.accuracy_score(y_test, yhat), 4))

Accuracy: 0.44


In [51]:
# Predict a result for every hold-out match and place it next to the match features.
predictions = pd.DataFrame(clf.predict(hold_out), columns=['Predicted_Result'])
predictions = pd.concat([predictions, hold_out], axis=1)

# Invert the team-name encoding so the report shows names instead of integer codes.
inv_map = {v: k for k, v in model_recode.items()}

# Each column is decoded exactly once: the previous per-row loop re-applied the
# whole-column mapping on every iteration and relied on Series.iteritems(),
# which pandas 2.0 removed. Codes missing from inv_map are left unchanged.
for side in ('Home', 'Away'):
    predictions[side] = predictions[side].map(lambda code: inv_map.get(code, code))

print(tabulate(predictions, headers='keys'))

# Persist the predictions next to the notebook.
predictions.to_csv('pred.csv')

      Predicted_Result  Home             Away               HomeWin_Prob    DrawWin_Prob    AwayWin_Prob
--  ------------------  ---------------  ---------------  --------------  --------------  --------------
 0                   2  Brentford        Fulham                     0.49            0.28            0.27
 1                   1  Tottenham        Nottingham                 0.69            0.22            0.13
 2                   2  Leicester        Chelsea                    0.25            0.28            0.52
 3                   2  Leeds            Brighton                   0.23            0.27            0.54
 4                   2  Everton          Brentford                  0.37            0.33            0.34
 5                   2  Crystal Palace   Manchester City            0.1             0.2             0.75
 6                   2  Bournemouth      Liverpool                  0.14            0.2             0.71
 7                   2  West Ham         Aston Villa   