# Football Match Prediction

## Task
Scrape Premier League data from https://fbref.com/en/comps/9/Premier-League-Stats and predict match outcomes.

## Steps

1. Scrape data from the 2021/22 and 2022/23 seasons
2. Explore data
3. Build an outcome predictor using the Poisson method
4. Build an ML model to predict outcomes
5. Compare methods

In [1]:
import re
import time
from io import StringIO

import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import requests
import seaborn as sns
import seaborn.objects as so
from bs4 import BeautifulSoup

# Widen pandas' display limits so wide match tables render without being cut off.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

## 1. Web scrape data

In [2]:
# Load the cached scrape if ./matches.csv exists; otherwise scrape fbref.com
# team by team and write the cache so later runs skip the network entirely.
FBREF_BASE_URL = "https://fbref.com"
REQUEST_TIMEOUT_S = 30  # fail instead of hanging forever on a stalled connection

try:
    match_df = pd.read_csv('./matches.csv')
except FileNotFoundError:
    # Labels written to the "season" column, newest first: 2022, 2021.
    # NOTE(review): the first label is attached to whatever season the
    # "Premier-League-Stats" landing page currently shows - confirm that this
    # is still the season meant by 2022 when re-scraping.
    years = list(range(2022, 2020, -1))

    all_matches = []
    stats_url = f"{FBREF_BASE_URL}/en/comps/9/Premier-League-Stats"

    for year in years:
        season_page = requests.get(stats_url, timeout=REQUEST_TIMEOUT_S)
        season_page.raise_for_status()  # surface blocks / rate limits instead of an IndexError below
        season_soup = BeautifulSoup(season_page.text)
        standings_table = season_soup.select('table.stats_table')[0]

        # Anchors without an href yield None, so test for truthiness before using `in`.
        squad_hrefs = [a.get('href') for a in standings_table.find_all('a')]
        team_urls = [
            f"{FBREF_BASE_URL}{href}"
            for href in squad_hrefs
            if href and '/squads/' in href
        ]

        # The next loop iteration scrapes the season this link points to.
        # Strip the leading slash so the joined URL has no "//" in its path.
        previous_season = season_soup.select("a.prev")[0].get("href")
        stats_url = f"{FBREF_BASE_URL}/{previous_season.lstrip('/')}"

        for team_url in team_urls:
            team_name = team_url.split('/')[-1].replace('-Stats', "").replace("-", " ")

            team_page = requests.get(team_url, timeout=REQUEST_TIMEOUT_S)
            team_page.raise_for_status()
            # read_html expects a path or file-like object; wrap the HTML text.
            matches = pd.read_html(StringIO(team_page.text), match='Scores & Fixtures')[0]

            team_soup = BeautifulSoup(team_page.text)
            page_hrefs = [a.get('href') for a in team_soup.find_all('a')]
            shooting_hrefs = [href for href in page_hrefs if href and 'all_comps/shooting/' in href]
            time.sleep(1)
            shooting_page = requests.get(
                f"{FBREF_BASE_URL}{shooting_hrefs[0]}", timeout=REQUEST_TIMEOUT_S
            )
            shooting_page.raise_for_status()
            # Pause here, before any `continue`, so skipped teams are throttled too.
            time.sleep(5)

            shooting = pd.read_html(StringIO(shooting_page.text), match="Shooting")[0]
            # The shooting table has a two-level header; keep only the inner level.
            shooting.columns = shooting.columns.droplevel()

            try:
                team_data = matches.merge(shooting[["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]])
            except ValueError:
                # Best effort: skip a team whose tables cannot be merged.
                continue

            # .assign returns a new frame, avoiding SettingWithCopyWarning on the filtered slice.
            team_data = team_data[team_data["Comp"] == "Premier League"].assign(
                Season=year, Team=team_name
            )
            all_matches.append(team_data)

    # ignore_index gives the same 0..n-1 index that reading the cached CSV produces.
    match_df = pd.concat(all_matches, ignore_index=True)
    match_df.columns = [c.lower() for c in match_df.columns]
    match_df.to_csv("./matches.csv", index=False)

In [3]:
# Preview the combined match table (pandas truncates the display to the first and last rows).
match_df

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,xg,xga,poss,attendance,captain,formation,referee,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
0,2022-08-07,16:30,Premier League,Matchweek 1,Sun,Away,W,2,0,West Ham,2.2,0.5,75.0,62443.0,İlkay Gündoğan,4-3-3,Michael Oliver,Match Report,,13.0,1.0,18.7,1.0,1.0,1.0,2022,Manchester City
1,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4,0,Bournemouth,1.7,0.1,67.0,53453.0,İlkay Gündoğan,4-2-3-1,David Coote,Match Report,,19.0,7.0,17.5,0.0,0.0,0.0,2022,Manchester City
2,2022-08-21,16:30,Premier League,Matchweek 3,Sun,Away,D,3,3,Newcastle Utd,2.1,1.8,69.0,52258.0,İlkay Gündoğan,4-3-3,Jarred Gillett,Match Report,,21.0,10.0,16.2,1.0,0.0,0.0,2022,Manchester City
3,2022-08-27,15:00,Premier League,Matchweek 4,Sat,Home,W,4,2,Crystal Palace,2.2,0.1,74.0,53112.0,Kevin De Bruyne,4-2-3-1,Darren England,Match Report,,18.0,5.0,14.1,0.0,0.0,0.0,2022,Manchester City
4,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,6,0,Nott'ham Forest,3.3,0.7,74.0,53409.0,İlkay Gündoğan,4-2-3-1,Paul Tierney,Match Report,,17.0,9.0,14.8,0.0,0.0,0.0,2022,Manchester City
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1515,2022-04-30,15:00,Premier League,Matchweek 35,Sat,Away,L,0,2,Aston Villa,0.5,1.8,55.0,40290.0,Grant Hanley,4-2-3-1,John Brooks,Match Report,,9.0,3.0,21.6,0.0,0.0,0.0,2021,Norwich City
1516,2022-05-08,14:00,Premier League,Matchweek 36,Sun,Home,L,0,4,West Ham,0.8,3.0,37.0,26428.0,Grant Hanley,4-2-3-1,Robert Jones,Match Report,,8.0,2.0,22.2,1.0,0.0,0.0,2021,Norwich City
1517,2022-05-11,19:45,Premier League,Matchweek 21,Wed,Away,L,0,3,Leicester City,1.1,2.0,35.0,38092.0,Grant Hanley,4-1-4-1,Simon Hooper,Match Report,,9.0,5.0,17.0,0.0,0.0,0.0,2021,Norwich City
1518,2022-05-15,14:00,Premier League,Matchweek 37,Sun,Away,D,1,1,Wolves,1.1,0.9,36.0,31219.0,Grant Hanley,3-4-3,Tony Harrington,Match Report,,11.0,2.0,14.4,0.0,0.0,0.0,2021,Norwich City


## 2. Explore data

In [4]:
# Summary statistics for the numeric columns.
match_df.describe()

Unnamed: 0,gf,ga,xg,xga,poss,attendance,notes,sh,sot,dist,fk,pk,pkatt,season
count,1520.0,1520.0,1520.0,1520.0,1520.0,1518.0,0.0,1520.0,1520.0,1520.0,1520.0,1520.0,1520.0,1520.0
mean,1.417763,1.417763,1.380329,1.380329,50.000658,39883.109354,,12.601316,4.159211,17.254868,0.419737,0.103947,0.132895,2021.5
std,1.309237,1.309237,0.820829,0.820829,12.503666,15919.766005,,5.491174,2.420218,2.929295,0.650218,0.320032,0.360268,0.500165
min,0.0,0.0,0.0,0.0,18.0,9972.0,,1.0,0.0,6.8,0.0,0.0,0.0,2021.0
25%,0.0,0.0,0.8,0.8,40.0,29284.75,,8.0,2.0,15.3,0.0,0.0,0.0,2021.0
50%,1.0,1.0,1.3,1.3,50.0,36700.0,,12.0,4.0,17.0,0.0,0.0,0.0,2021.5
75%,2.0,2.0,1.8,1.8,60.0,53100.75,,16.0,6.0,19.0,1.0,0.0,0.0,2022.0
max,9.0,9.0,5.9,5.9,82.0,75546.0,,32.0,15.0,31.4,4.0,2.0,2.0,2022.0


In [5]:
# Count / unique / most-frequent value for the object-dtype (text) columns.
match_df.describe(include=object)

Unnamed: 0,date,time,comp,round,day,venue,result,opponent,captain,formation,referee,match report,team
count,1520,1520,1520,1520,1520,1520,1520,1520,1520,1520,1520,1520,1520
unique,240,16,1,38,7,2,3,23,100,20,27,1,23
top,2022-05-22,15:00,Premier League,Matchweek 1,Sat,Away,W,West Ham,James Ward-Prowse,4-2-3-1,Anthony Taylor,Match Report,Manchester City
freq,20,512,1520,40,724,760,585,76,74,424,116,1520,76


In [6]:
# Number of missing values in each column.
match_df.isna().sum()

date               0
time               0
comp               0
round              0
day                0
venue              0
result             0
gf                 0
ga                 0
opponent           0
xg                 0
xga                0
poss               0
attendance         2
captain            0
formation          0
referee            0
match report       0
notes           1520
sh                 0
sot                0
dist               0
fk                 0
pk                 0
pkatt              0
season             0
team               0
dtype: int64

In [7]:
# Drop the two columns that carry no information: 'notes' is entirely missing
# and 'match report' holds a single constant value.
# errors='ignore' keeps the cell re-runnable once the columns are already gone.
match_df = match_df.drop(columns=['notes','match report'], errors='ignore')

## 3. Match predictor using the Poisson method

In [8]:
# Match statistics that must be numeric; values that cannot be parsed become NaN.
columns = [
    'gf', 'ga', 'xg', 'xga', 'poss', 'attendance',
    'sh', 'sot', 'dist', 'fk', 'pk', 'pkatt',
]

match_df[columns] = match_df[columns].apply(
    lambda series: pd.to_numeric(series, errors='coerce')
)

### Using last season to predict this season

## Rolling window over the previous 5 games

In [13]:
# League-wide totals per venue and match date, plus rolling averages that use
# only earlier dates: the window sum includes the current row, which is then
# subtracted, leaving the GAMES_ROLL_WINDOW - 1 preceding dates.
GAMES_ROLL_WINDOW = 6

games_agg = match_df.groupby(['venue','date']).agg(
    t_gf = ('gf','sum'),
    t_ga = ('ga','sum'),
    total_games = ('comp','count')
).reset_index()

# groupby(...).agg(...).reset_index() leaves rows ordered by venue, then date,
# so each venue's rolling window runs through its dates in ascending order.
for stat in ['t_gf', 't_ga', 'total_games']:
    rolled = (
        games_agg.groupby(['venue'])[stat]
        .rolling(GAMES_ROLL_WINDOW)
        .sum()
        # Drop only the group level: the original row labels survive, so the
        # assignment aligns by label rather than relying on row position.
        .reset_index(level=0, drop=True)
    )
    games_agg[f'{stat}_roll'] = rolled.sub(games_agg[stat])

# Average goals per game over the preceding window (NaN until the window is full).
games_agg['avg_gf_roll'] = games_agg.t_gf_roll/games_agg.total_games_roll
games_agg['avg_ga_roll'] = games_agg.t_ga_roll/games_agg.total_games_roll

In [14]:
# Per-team totals by venue and match date, plus rolling averages that use only
# the team's earlier games at that venue: the window sum includes the current
# row, which is then subtracted, leaving the TEAM_ROLL_WINDOW - 1 previous games.
TEAM_ROLL_WINDOW = 6

team_agg = match_df.groupby(['team','venue','date']).agg(
    t_gf = ('gf','sum'),
    t_ga = ('ga','sum'),
    total_games = ('comp','count')
).reset_index()

# Sort once so every (team, venue) group is walked in date order.
team_agg_by_date = team_agg.sort_values(by = ['date','venue'])

for stat in ['t_gf', 't_ga', 'total_games']:
    rolled = (
        team_agg_by_date.groupby(['team','venue'])[stat]
        .rolling(TEAM_ROLL_WINDOW)
        .sum()
        # Drop only the two group levels: the original row labels survive, so
        # the assignment aligns by label rather than relying on row position.
        .reset_index(level=[0, 1], drop=True)
    )
    team_agg[f'team_{stat}_roll'] = rolled.sub(team_agg[stat])

# Average goals per game over the preceding window (NaN until the window is full).
team_agg['team_avg_gf_roll'] = team_agg.team_t_gf_roll/team_agg.team_total_games_roll
team_agg['team_avg_ga_roll'] = team_agg.team_t_ga_roll/team_agg.team_total_games_roll

In [15]:
# Attach the league-wide rolling averages for the same date and venue to every team row.
team_game_merge = pd.merge(
    team_agg,
    games_agg[['date','venue','avg_gf_roll','avg_ga_roll']],
    how='left',
    on=['date','venue'],
)

In [16]:
# Strength = the team's rolling average relative to the league's rolling average
# for the same venue and date.
team_game_merge['attack_strength'] = team_game_merge['team_avg_gf_roll'].div(
    team_game_merge['avg_gf_roll']
)
team_game_merge['defence_strength'] = team_game_merge['team_avg_ga_roll'].div(
    team_game_merge['avg_ga_roll']
)

In [17]:
# Inspect the team rolling averages next to the league averages and the derived strengths.
team_game_merge

Unnamed: 0,team,venue,date,t_gf,t_ga,total_games,team_t_gf_roll,team_t_ga_roll,team_total_games_roll,team_avg_gf_roll,team_avg_ga_roll,avg_gf_roll,avg_ga_roll,attack_strength,defence_strength
0,Arsenal,Away,2021-08-13,0,2,1,,,,,,,,,
1,Arsenal,Away,2021-08-28,0,5,1,,,,,,1.052632,1.947368,,
2,Arsenal,Away,2021-09-18,1,0,1,,,,,,0.857143,1.214286,,
3,Arsenal,Away,2021-10-02,0,0,1,,,,,,1.315789,1.210526,,
4,Arsenal,Away,2021-10-30,2,0,1,,,,,,2.076923,1.769231,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1515,Wolverhampton Wanderers,Home,2023-04-08,1,0,1,7.0,5.0,5.0,1.4,1.0,1.562500,1.312500,0.896000,0.761905
1516,Wolverhampton Wanderers,Home,2023-04-15,2,0,1,7.0,5.0,5.0,1.4,1.0,1.058824,1.588235,1.322222,0.629630
1517,Wolverhampton Wanderers,Home,2023-04-25,2,0,1,6.0,5.0,5.0,1.2,1.0,1.818182,2.090909,0.660000,0.478261
1518,Wolverhampton Wanderers,Home,2023-05-06,1,0,1,8.0,4.0,5.0,1.6,0.8,2.300000,1.000000,0.695652,0.800000


In [37]:
# Keep only the rows labelled season 2021 (last season's results).
is_season_2021 = match_df['season'] == 2021
season_21 = match_df[is_season_2021]

In [38]:
# First rows of the 2021-season subset.
season_21.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,xg,xga,poss,attendance,captain,formation,referee,sh,sot,dist,fk,pk,pkatt,season,team
1,2021-08-15,16:30,Premier League,Matchweek 1,Sun,Away,L,0,1,Tottenham,1.8,1.0,65.0,58262.0,Fernandinho,4-3-3,Anthony Taylor,18.0,4.0,17.3,1.0,0.0,0.0,2021,Manchester City
2,2021-08-21,15:00,Premier League,Matchweek 2,Sat,Home,W,5,0,Norwich City,2.6,0.1,67.0,51437.0,İlkay Gündoğan,4-3-3,Graham Scott,16.0,4.0,18.5,1.0,0.0,0.0,2021,Manchester City
3,2021-08-28,12:30,Premier League,Matchweek 3,Sat,Home,W,5,0,Arsenal,4.4,0.2,80.0,52276.0,İlkay Gündoğan,4-3-3,Martin Atkinson,25.0,10.0,14.8,0.0,0.0,0.0,2021,Manchester City
4,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Away,W,1,0,Leicester City,2.8,0.6,61.0,32087.0,İlkay Gündoğan,4-3-3,Paul Tierney,25.0,8.0,14.3,0.0,0.0,0.0,2021,Manchester City
6,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Home,D,0,0,Southampton,1.0,0.4,64.0,52698.0,Fernandinho,4-3-3,Jonathan Moss,16.0,1.0,16.4,1.0,0.0,0.0,2021,Manchester City


In [40]:
# average goals scored away
t_away_goals = season_21.query('venue=="Away"')['gf'].sum()
total_away_games = season_21.query('venue=="Away"').shape[0]
avg_away_goals = t_away_goals/total_away_games

# average goals scored at home
# The summary cell further down displays avg_home_goals, so it has to be
# defined here for the notebook to run top to bottom on a fresh kernel.
home_games_21 = season_21.query('venue=="Home"')
avg_home_goals = home_games_21['gf'].sum()/home_games_21.shape[0]

In [41]:
# League-wide goals conceded per home game in the 2021-season subset.
# Note: t_home_goals holds goals *conceded* ('ga'), despite its name.
t_home_goals = season_21.loc[season_21['venue'] == "Home", 'ga'].sum()
total_home_games = len(season_21[season_21['venue'] == "Home"])
avg_home_conc = t_home_goals / total_home_games

In [42]:
# League-wide goals conceded per away game in the 2021-season subset.
# Note: t_away_goals holds goals *conceded* ('ga') here, despite its name.
t_away_goals = season_21.loc[season_21['venue'] == "Away", 'ga'].sum()
total_away_games = len(season_21[season_21['venue'] == "Away"])
avg_away_conc = t_away_goals / total_away_games

In [43]:
# Display the four league-average figures as one tuple; every name must already
# have been defined by an earlier cell.
avg_away_goals,avg_away_conc,avg_home_conc,avg_home_goals

(1.305263157894737, 1.513157894736842, 1.305263157894737, 1.513157894736842)