# Football Match Prediction

## Task
Scrape Premier League data from https://fbref.com/en/comps/9/Premier-League-Stats and predict match outcomes.

## Steps

1. Scrape data for the 2022/23 season and the season before it
2. Explore the data
3. Build an outcome predictor using the Poisson method
4. Build an ML model to predict outcomes
5. Compare the two methods

In [1]:
import re
import time
from io import StringIO

import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import requests
import seaborn as sns
import seaborn.objects as so
from bs4 import BeautifulSoup

# Raise pandas' display limits so wide/long frames are shown with less truncation.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

## 1. Web scrape data

In [2]:
# Season labels to scrape, newest first.
years = [2022, 2021]

In [11]:
# Entry page the scrape starts from.
stats_url = 'https://fbref.com/en/comps/9/Premier-League-Stats'
# Accumulator for the per-team DataFrames collected by the scraping loop.
all_matches = list()

In [18]:
# Scrape one DataFrame per team per season, starting at `stats_url` and
# following the page's "previous season" link once per entry in `years`.
#
# The accumulator and the season cursor are (re)initialised here so that
# re-running this cell neither appends duplicate rows to `all_matches` nor
# starts from an already-advanced season page (`stats_url` is left untouched).
all_matches = []
season_url = stats_url

for year in years:
    season_page = requests.get(season_url)
    # Fail loudly on HTTP errors (e.g. rate limiting) instead of hitting an
    # obscure IndexError on the table lookup below.
    season_page.raise_for_status()
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    season_soup = BeautifulSoup(season_page.text, "html.parser")
    standings_table = season_soup.select('table.stats_table')[0]

    squad_links = [a.get('href') for a in standings_table.find_all('a')]
    squad_links = [href for href in squad_links if href and '/squads/' in href]
    team_urls = [f"https://fbref.com{href}" for href in squad_links]

    # Advance the cursor to the previous season; strip a leading slash so the
    # joined URL never contains a doubled "//" after the host.
    previous_season = season_soup.select("a.prev")[0].get("href")
    season_url = f"https://fbref.com/{previous_season.lstrip('/')}"

    for team_url in team_urls:
        team_name = team_url.split('/')[-1].replace('-Stats', "").replace("-", " ")

        team_page = requests.get(team_url)
        team_page.raise_for_status()
        # read_html expects a file-like object; passing literal HTML is deprecated.
        matches = pd.read_html(StringIO(team_page.text), match='Scores & Fixtures')[0]

        team_soup = BeautifulSoup(team_page.text, "html.parser")
        shooting_links = [a.get('href') for a in team_soup.find_all('a')]
        shooting_links = [href for href in shooting_links if href and 'all_comps/shooting/' in href]
        time.sleep(1)
        shooting_page = requests.get(f"https://fbref.com{shooting_links[0]}")
        shooting_page.raise_for_status()
        shooting = pd.read_html(StringIO(shooting_page.text), match="Shooting")[0]
        # The shooting table has a two-level header; keep only the inner level.
        shooting.columns = shooting.columns.droplevel()

        try:
            team_data = matches.merge(shooting[["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]])
        except ValueError:
            # Skip teams whose tables cannot be merged, but still pause
            # before the next request.
            time.sleep(5)
            continue

        # .copy() so the column assignments below write to an independent
        # frame rather than a slice of the merge result.
        team_data = team_data[team_data["Comp"] == "Premier League"].copy()
        team_data["Season"] = year
        team_data["Team"] = team_name
        all_matches.append(team_data)
        time.sleep(5)

In [19]:
# Stack every per-team frame into a single DataFrame (rows appended, original indices kept).
match_df = pd.concat(objs=all_matches, axis=0)

In [None]:
# Display the concatenated frame.
match_df

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Match Report,Notes,Sh,SoT,Dist,FK,PK,PKatt,Season,Team
1,2021-08-15,16:30,Premier League,Matchweek 1,Sun,Away,L,0,1,Tottenham,...,Match Report,,18.0,4.0,17.3,1.0,0.0,0.0,2022,Manchester City
2,2021-08-21,15:00,Premier League,Matchweek 2,Sat,Home,W,5,0,Norwich City,...,Match Report,,16.0,4.0,18.5,1.0,0.0,0.0,2022,Manchester City
3,2021-08-28,12:30,Premier League,Matchweek 3,Sat,Home,W,5,0,Arsenal,...,Match Report,,25.0,10.0,14.8,0.0,0.0,0.0,2022,Manchester City
4,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Away,W,1,0,Leicester City,...,Match Report,,25.0,8.0,14.3,0.0,0.0,0.0,2022,Manchester City
6,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Home,D,0,0,Southampton,...,Match Report,,16.0,1.0,16.4,1.0,0.0,0.0,2022,Manchester City
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38,2021-05-02,19:15,Premier League,Matchweek 34,Sun,Away,L,0,4,Tottenham,...,Match Report,,8.0,1.0,18.2,0.0,0.0,0.0,2021,Sheffield United
39,2021-05-08,15:00,Premier League,Matchweek 35,Sat,Home,L,0,2,Crystal Palace,...,Match Report,,7.0,0.0,13.4,1.0,0.0,0.0,2021,Sheffield United
40,2021-05-16,19:00,Premier League,Matchweek 36,Sun,Away,W,1,0,Everton,...,Match Report,,10.0,3.0,18.5,0.0,0.0,0.0,2021,Sheffield United
41,2021-05-19,18:00,Premier League,Matchweek 37,Wed,Away,L,0,1,Newcastle Utd,...,Match Report,,11.0,1.0,18.3,1.0,0.0,0.0,2021,Sheffield United


In [20]:
# Normalise every column label to lower case.
match_df.columns = list(map(str.lower, match_df.columns))

In [25]:
# Persist the scraped matches to disk without writing the index column.
match_df.to_csv(path_or_buf="./matches.csv", index=False)

In [19]:
# Reload the matches from the CSV written to ./matches.csv.
match_df = pd.read_csv(filepath_or_buffer="./matches.csv")

In [20]:
# Display the frame as loaded from ./matches.csv.
match_df

Unnamed: 0.1,Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,xg,xga,poss,attendance,captain,formation,referee,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
0,1,2022-08-07,16:30,Premier League,Matchweek 1,Sun,Away,W,2,0,West Ham,2.2,0.5,75.0,62443.0,İlkay Gündoğan,4-3-3,Michael Oliver,Match Report,,13.0,1.0,18.7,1.0,1.0,1.0,2022,Manchester City
1,2,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4,0,Bournemouth,1.7,0.1,67.0,53453.0,İlkay Gündoğan,4-2-3-1,David Coote,Match Report,,19.0,7.0,17.5,0.0,0.0,0.0,2022,Manchester City
2,3,2022-08-21,16:30,Premier League,Matchweek 3,Sun,Away,D,3,3,Newcastle Utd,2.1,1.8,69.0,52258.0,İlkay Gündoğan,4-3-3,Jarred Gillett,Match Report,,21.0,10.0,16.2,1.0,0.0,0.0,2022,Manchester City
3,4,2022-08-27,15:00,Premier League,Matchweek 4,Sat,Home,W,4,2,Crystal Palace,2.2,0.1,74.0,53112.0,Kevin De Bruyne,4-2-3-1,Darren England,Match Report,,18.0,5.0,14.1,0.0,0.0,0.0,2022,Manchester City
4,5,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,6,0,Nott'ham Forest,3.3,0.7,74.0,53409.0,İlkay Gündoğan,4-2-3-1,Paul Tierney,Match Report,,17.0,9.0,14.8,0.0,0.0,0.0,2022,Manchester City
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2047,38,2021-05-02,19:15,Premier League,Matchweek 34,Sun,Away,L,0,4,Tottenham,0.5,2.2,34.0,,John Egan,3-4-1-2,Andre Marriner,Match Report,,8.0,1.0,18.2,0.0,0.0,0.0,2021,Sheffield United
2048,39,2021-05-08,15:00,Premier League,Matchweek 35,Sat,Home,L,0,2,Crystal Palace,0.7,2.0,50.0,,John Egan,3-5-2,Simon Hooper,Match Report,,7.0,0.0,13.4,1.0,0.0,0.0,2021,Sheffield United
2049,40,2021-05-16,19:00,Premier League,Matchweek 36,Sun,Away,W,1,0,Everton,1.2,1.4,38.0,,John Egan,3-4-3,Jonathan Moss,Match Report,,10.0,3.0,18.5,0.0,0.0,0.0,2021,Sheffield United
2050,41,2021-05-19,18:00,Premier League,Matchweek 37,Wed,Away,L,0,1,Newcastle Utd,0.9,1.6,49.0,10000.0,John Egan,3-4-3,Robert Jones,Match Report,,11.0,1.0,18.3,1.0,0.0,0.0,2021,Sheffield United


## 2. Explore data

In [21]:
# Remove a stray "Unnamed: 0" column (a saved index) if the CSV contains one.
# errors="ignore" keeps this cell working when the CSV was written with
# index=False and the column is therefore absent, and makes it safe to re-run.
match_df = match_df.drop(columns="Unnamed: 0", errors="ignore")
match_df

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,xg,xga,poss,attendance,captain,formation,referee,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
0,2022-08-07,16:30,Premier League,Matchweek 1,Sun,Away,W,2,0,West Ham,2.2,0.5,75.0,62443.0,İlkay Gündoğan,4-3-3,Michael Oliver,Match Report,,13.0,1.0,18.7,1.0,1.0,1.0,2022,Manchester City
1,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4,0,Bournemouth,1.7,0.1,67.0,53453.0,İlkay Gündoğan,4-2-3-1,David Coote,Match Report,,19.0,7.0,17.5,0.0,0.0,0.0,2022,Manchester City
2,2022-08-21,16:30,Premier League,Matchweek 3,Sun,Away,D,3,3,Newcastle Utd,2.1,1.8,69.0,52258.0,İlkay Gündoğan,4-3-3,Jarred Gillett,Match Report,,21.0,10.0,16.2,1.0,0.0,0.0,2022,Manchester City
3,2022-08-27,15:00,Premier League,Matchweek 4,Sat,Home,W,4,2,Crystal Palace,2.2,0.1,74.0,53112.0,Kevin De Bruyne,4-2-3-1,Darren England,Match Report,,18.0,5.0,14.1,0.0,0.0,0.0,2022,Manchester City
4,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,6,0,Nott'ham Forest,3.3,0.7,74.0,53409.0,İlkay Gündoğan,4-2-3-1,Paul Tierney,Match Report,,17.0,9.0,14.8,0.0,0.0,0.0,2022,Manchester City
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2047,2021-05-02,19:15,Premier League,Matchweek 34,Sun,Away,L,0,4,Tottenham,0.5,2.2,34.0,,John Egan,3-4-1-2,Andre Marriner,Match Report,,8.0,1.0,18.2,0.0,0.0,0.0,2021,Sheffield United
2048,2021-05-08,15:00,Premier League,Matchweek 35,Sat,Home,L,0,2,Crystal Palace,0.7,2.0,50.0,,John Egan,3-5-2,Simon Hooper,Match Report,,7.0,0.0,13.4,1.0,0.0,0.0,2021,Sheffield United
2049,2021-05-16,19:00,Premier League,Matchweek 36,Sun,Away,W,1,0,Everton,1.2,1.4,38.0,,John Egan,3-4-3,Jonathan Moss,Match Report,,10.0,3.0,18.5,0.0,0.0,0.0,2021,Sheffield United
2050,2021-05-19,18:00,Premier League,Matchweek 37,Wed,Away,L,0,1,Newcastle Utd,0.9,1.6,49.0,10000.0,John Egan,3-4-3,Robert Jones,Match Report,,11.0,1.0,18.3,1.0,0.0,0.0,2021,Sheffield United


In [22]:
# Summary statistics for the numeric columns.
match_df.describe()

Unnamed: 0,gf,ga,xg,xga,poss,attendance,notes,sh,sot,dist,fk,pk,pkatt,season
count,2052.0,2052.0,2052.0,2052.0,2052.0,1354.0,0.0,2052.0,2052.0,2051.0,2052.0,2052.0,2052.0,2052.0
mean,1.430312,1.346979,1.375926,1.314376,50.741715,39046.248892,,12.569688,4.202729,17.488932,0.444444,0.120858,0.148635,2021.62963
std,1.319884,1.277444,0.814116,0.795724,12.514548,17348.419084,,5.484299,2.428064,2.999707,0.659723,0.34775,0.383513,0.483022
min,0.0,0.0,0.0,0.0,18.0,2000.0,,0.0,0.0,5.3,0.0,0.0,0.0,2021.0
25%,0.0,0.0,0.7,0.7,41.0,26684.0,,8.0,2.0,15.6,0.0,0.0,0.0,2021.0
50%,1.0,1.0,1.3,1.2,51.0,36638.0,,12.0,4.0,17.3,0.0,0.0,0.0,2022.0
75%,2.0,2.0,1.825,1.8,61.0,53122.5,,16.0,6.0,19.3,1.0,0.0,0.0,2022.0
max,9.0,9.0,5.9,5.9,82.0,75546.0,,32.0,15.0,35.0,4.0,3.0,3.0,2022.0


In [23]:
# Summary (count / unique / top / freq) for the object-dtype columns.
match_df.describe(include=object)

Unnamed: 0,date,time,comp,round,day,venue,result,opponent,captain,formation,referee,match report,team
count,2052,2052,2052,2052,2052,2052,2052,2052,2052,2052,2052,2052,2052
unique,373,19,1,38,7,2,3,25,106,21,28,1,23
top,2022-05-22,15:00,Premier League,Matchweek 1,Sat,Away,W,Southampton,Hugo Lloris,4-2-3-1,Anthony Taylor,Match Report,Manchester City
freq,20,477,2052,54,839,1026,832,104,100,541,157,2052,114


In [26]:
# Number of missing values in each column.
match_df.isna().sum()

date            0
time            0
comp            0
round           0
day             0
venue           0
result          0
gf              0
ga              0
opponent        0
xg              0
xga             0
poss            0
attendance    698
captain         0
formation       0
referee         0
sh              0
sot             0
dist            1
fk              0
pk              0
pkatt           0
season          0
team            0
dtype: int64

In [25]:
# Remove the 'notes' and 'match report' columns.
# errors="ignore" makes the cell idempotent: re-running it after the columns
# are already gone no longer raises KeyError.
match_df = match_df.drop(columns=['notes', 'match report'], errors="ignore")

## 3. Match predictor using the Poisson method

In [27]:
# Display the frame that the Poisson section works from.
match_df

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,xg,xga,poss,attendance,captain,formation,referee,sh,sot,dist,fk,pk,pkatt,season,team
0,2022-08-07,16:30,Premier League,Matchweek 1,Sun,Away,W,2,0,West Ham,2.2,0.5,75.0,62443.0,İlkay Gündoğan,4-3-3,Michael Oliver,13.0,1.0,18.7,1.0,1.0,1.0,2022,Manchester City
1,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4,0,Bournemouth,1.7,0.1,67.0,53453.0,İlkay Gündoğan,4-2-3-1,David Coote,19.0,7.0,17.5,0.0,0.0,0.0,2022,Manchester City
2,2022-08-21,16:30,Premier League,Matchweek 3,Sun,Away,D,3,3,Newcastle Utd,2.1,1.8,69.0,52258.0,İlkay Gündoğan,4-3-3,Jarred Gillett,21.0,10.0,16.2,1.0,0.0,0.0,2022,Manchester City
3,2022-08-27,15:00,Premier League,Matchweek 4,Sat,Home,W,4,2,Crystal Palace,2.2,0.1,74.0,53112.0,Kevin De Bruyne,4-2-3-1,Darren England,18.0,5.0,14.1,0.0,0.0,0.0,2022,Manchester City
4,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,6,0,Nott'ham Forest,3.3,0.7,74.0,53409.0,İlkay Gündoğan,4-2-3-1,Paul Tierney,17.0,9.0,14.8,0.0,0.0,0.0,2022,Manchester City
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2047,2021-05-02,19:15,Premier League,Matchweek 34,Sun,Away,L,0,4,Tottenham,0.5,2.2,34.0,,John Egan,3-4-1-2,Andre Marriner,8.0,1.0,18.2,0.0,0.0,0.0,2021,Sheffield United
2048,2021-05-08,15:00,Premier League,Matchweek 35,Sat,Home,L,0,2,Crystal Palace,0.7,2.0,50.0,,John Egan,3-5-2,Simon Hooper,7.0,0.0,13.4,1.0,0.0,0.0,2021,Sheffield United
2049,2021-05-16,19:00,Premier League,Matchweek 36,Sun,Away,W,1,0,Everton,1.2,1.4,38.0,,John Egan,3-4-3,Jonathan Moss,10.0,3.0,18.5,0.0,0.0,0.0,2021,Sheffield United
2050,2021-05-19,18:00,Premier League,Matchweek 37,Wed,Away,L,0,1,Newcastle Utd,0.9,1.6,49.0,10000.0,John Egan,3-4-3,Robert Jones,11.0,1.0,18.3,1.0,0.0,0.0,2021,Sheffield United


## Attack and defence strength

In [28]:
# Non-null value count of every column, grouped by season label.
match_df.groupby('season').count()

Unnamed: 0_level_0,date,time,comp,round,day,venue,result,gf,ga,opponent,xg,xga,poss,attendance,captain,formation,referee,sh,sot,dist,fk,pk,pkatt,team
season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
2021,760,760,760,760,760,760,760,760,760,760,760,760,760,64,760,760,760,760,760,759,760,760,760,760
2022,1292,1292,1292,1292,1292,1292,1292,1292,1292,1292,1292,1292,1292,1290,1292,1292,1292,1292,1292,1292,1292,1292,1292,1292


In [None]:
# Keep only the rows whose season label is 2021 (used as the "last season" baseline).
season_21 = match_df[match_df['season'] == 2021]

In [50]:
# Average goals scored at home: total 'gf' over rows with venue == "Home",
# divided by the number of such rows.
home_rows = season_21[season_21['venue'] == "Home"]
t_home_goals = home_rows['gf'].sum()
total_home_games = len(home_rows)
avg_home_goals = t_home_goals / total_home_games

In [51]:
# Average goals scored away: total 'gf' over rows with venue == "Away",
# divided by the number of such rows.
away_rows = season_21[season_21['venue'] == "Away"]
t_away_goals = away_rows['gf'].sum()
total_away_games = len(away_rows)
avg_away_goals = t_away_goals / total_away_games

In [52]:
# Average goals conceded at home: total 'ga' over rows with venue == "Home",
# divided by the number of such rows.
# Note: t_home_goals is rebound here to hold goals *conceded*.
home_rows = season_21[season_21['venue'] == "Home"]
t_home_goals = home_rows['ga'].sum()
total_home_games = len(home_rows)
avg_home_conc = t_home_goals / total_home_games

In [53]:
# Average goals conceded away: total 'ga' over rows with venue == "Away",
# divided by the number of such rows.
# Note: t_away_goals is rebound here to hold goals *conceded*.
away_rows = season_21[season_21['venue'] == "Away"]
t_away_goals = away_rows['ga'].sum()
total_away_games = len(away_rows)
avg_away_conc = t_away_goals / total_away_games

In [54]:
# Show the four league averages, in order: away scored, away conceded, home conceded, home scored.
avg_away_goals,avg_away_conc,avg_home_conc,avg_home_goals

(1.3421052631578947,
 1.3526315789473684,
 1.3421052631578947,
 1.3526315789473684)