<a href="https://colab.research.google.com/github/yebyyy/English-Premier-League-2023-2024-Prediction/blob/main/Scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### This code is for scraping the data for the current season of English Premier League

In [263]:
import requests

In [264]:
import time

In [265]:
import pandas as pd

In [266]:
standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

In [267]:
data = requests.get(standings_url)

In [268]:
from bs4 import BeautifulSoup

In [269]:
soup = BeautifulSoup(data.text)

#### Now we need give the soup object something to select.
Since everything is inside the table html object, we need to select it and then get everything anchor tags for the clubs.

In [270]:
data

<Response [200]>

In [271]:
# selecting the table using css selector
standings_table = soup.select('table.stats_table')[0] # .stats_table is the class name

In [272]:
# find all the <a>(the clubs) in the standings table
links = standings_table.find_all("a") # find_all only finds tags

In [273]:
# links should actually contain all the links
links = [l.get("href") for l in links]

In [274]:
# links should actually contain the links that have clubs
links = [l for l in links if "/squads/" in l]
links

['/en/squads/822bd0ba/Liverpool-Stats',
 '/en/squads/b8fd03ef/Manchester-City-Stats',
 '/en/squads/18bb7c10/Arsenal-Stats',
 '/en/squads/8602292d/Aston-Villa-Stats',
 '/en/squads/361ca564/Tottenham-Hotspur-Stats',
 '/en/squads/19538871/Manchester-United-Stats',
 '/en/squads/7c21e445/West-Ham-United-Stats',
 '/en/squads/b2b47a98/Newcastle-United-Stats',
 '/en/squads/d07537b9/Brighton-and-Hove-Albion-Stats',
 '/en/squads/8cec06e1/Wolverhampton-Wanderers-Stats',
 '/en/squads/cff3d9bb/Chelsea-Stats',
 '/en/squads/fd962109/Fulham-Stats',
 '/en/squads/4ba7cbea/Bournemouth-Stats',
 '/en/squads/47c64c55/Crystal-Palace-Stats',
 '/en/squads/cd051869/Brentford-Stats',
 '/en/squads/d3fd31cc/Everton-Stats',
 '/en/squads/e4a775cb/Nottingham-Forest-Stats',
 '/en/squads/e297cd13/Luton-Town-Stats',
 '/en/squads/943e8050/Burnley-Stats',
 '/en/squads/1df6b87e/Sheffield-United-Stats']

In [275]:
# since the links only have the subdomain, we should include the first half of the url as well
team_urls = [f"https://fbref.com{l}" for l in links]
team_urls

['https://fbref.com/en/squads/822bd0ba/Liverpool-Stats',
 'https://fbref.com/en/squads/b8fd03ef/Manchester-City-Stats',
 'https://fbref.com/en/squads/18bb7c10/Arsenal-Stats',
 'https://fbref.com/en/squads/8602292d/Aston-Villa-Stats',
 'https://fbref.com/en/squads/361ca564/Tottenham-Hotspur-Stats',
 'https://fbref.com/en/squads/19538871/Manchester-United-Stats',
 'https://fbref.com/en/squads/7c21e445/West-Ham-United-Stats',
 'https://fbref.com/en/squads/b2b47a98/Newcastle-United-Stats',
 'https://fbref.com/en/squads/d07537b9/Brighton-and-Hove-Albion-Stats',
 'https://fbref.com/en/squads/8cec06e1/Wolverhampton-Wanderers-Stats',
 'https://fbref.com/en/squads/cff3d9bb/Chelsea-Stats',
 'https://fbref.com/en/squads/fd962109/Fulham-Stats',
 'https://fbref.com/en/squads/4ba7cbea/Bournemouth-Stats',
 'https://fbref.com/en/squads/47c64c55/Crystal-Palace-Stats',
 'https://fbref.com/en/squads/cd051869/Brentford-Stats',
 'https://fbref.com/en/squads/d3fd31cc/Everton-Stats',
 'https://fbref.com/en/s

#### Using Pandas and Requests to extract match stats: Using a single team as an example

In [276]:
team_url_first = team_urls[0]
team_url_first

'https://fbref.com/en/squads/822bd0ba/Liverpool-Stats'

In [277]:
first_team_data = requests.get(team_url_first)

Now we use pandas for the parsing and read only the table called scores and fixtures since we only care about the data there

In [278]:
matches_first_team = pd.read_html(first_team_data.text, match="Scores & Fixtures")

Since this is a list, we want a pandas dataFrame, therefore we get the first element from this list

In [279]:
matches_first_team = matches_first_team[0]

#### Now we also care about the shooting data, so we continue using the same team for the example

Since the shooting page is now just a URL in an anchor in the webpage of the club page we were working on, we find all the links and keep the one that has shooting in it

In [280]:
soup_for_first_team = BeautifulSoup(first_team_data.text)

In [281]:
link_for_first_team = soup_for_first_team.find_all("a")

In [282]:
link_for_first_team = [l.get("href") for l in link_for_first_team]

In [283]:
link_for_first_team = [l for l in link_for_first_team if l and "all_comps/shooting/" in l]
link_for_first_team

['/en/squads/822bd0ba/2023-2024/matchlogs/all_comps/shooting/Liverpool-Match-Logs-All-Competitions',
 '/en/squads/822bd0ba/2023-2024/matchlogs/all_comps/shooting/Liverpool-Match-Logs-All-Competitions',
 '/en/squads/822bd0ba/2023-2024/matchlogs/all_comps/shooting/Liverpool-Match-Logs-All-Competitions',
 '/en/squads/822bd0ba/2023-2024/matchlogs/all_comps/shooting/Liverpool-Match-Logs-All-Competitions']

In [284]:
first_team_shooting_data = requests.get(f"https://fbref.com{link_for_first_team[0]}")

In [285]:
first_team_shooting_frame = pd.read_html(first_team_shooting_data.text, match='Shooting')
first_team_shooting_frame = first_team_shooting_frame[0]

#### Cleaning and merging data in pandas

We don't want the multilevel index, so just drop that

In [286]:
first_team_shooting_frame.columns = first_team_shooting_frame.columns.droplevel()

In [287]:
first_team_shooting_frame.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Dist,FK,PK,PKatt,xG,npxG,npxG/Sh,G-xG,np:G-xG,Match Report
0,2023-08-13,16:30,Premier League,Matchweek 1,Sun,Away,D,1.0,1.0,Chelsea,...,17.8,0.0,0,0,1.3,1.3,0.1,-0.3,-0.3,Match Report
1,2023-08-19,15:00,Premier League,Matchweek 2,Sat,Home,W,3.0,1.0,Bournemouth,...,16.8,1.0,0,1,3.0,2.1,0.09,0.0,0.9,Match Report
2,2023-08-27,16:30,Premier League,Matchweek 3,Sun,Away,W,2.0,1.0,Newcastle Utd,...,17.2,1.0,0,0,0.9,0.9,0.1,1.1,1.1,Match Report
3,2023-09-03,14:00,Premier League,Matchweek 4,Sun,Home,W,3.0,0.0,Aston Villa,...,14.7,0.0,0,0,2.5,2.5,0.15,-0.5,-0.5,Match Report
4,2023-09-16,12:30,Premier League,Matchweek 5,Sat,Away,W,3.0,1.0,Wolves,...,15.8,0.0,0,0,2.5,2.5,0.16,-0.5,-0.5,Match Report


In [288]:
first_team_frame = matches_first_team.merge(first_team_shooting_frame[["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]], on="Date")
first_team_frame.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Formation,Referee,Match Report,Notes,Sh,SoT,Dist,FK,PK,PKatt
0,2023-08-13,16:30,Premier League,Matchweek 1,Sun,Away,D,1.0,1.0,Chelsea,...,4-3-3,Anthony Taylor,Match Report,,13,1,17.8,0.0,0,0
1,2023-08-19,15:00,Premier League,Matchweek 2,Sat,Home,W,3.0,1.0,Bournemouth,...,4-3-3,Thomas Bramall,Match Report,,25,9,16.8,1.0,0,1
2,2023-08-27,16:30,Premier League,Matchweek 3,Sun,Away,W,2.0,1.0,Newcastle Utd,...,4-3-3,John Brooks,Match Report,,9,4,17.2,1.0,0,0
3,2023-09-03,14:00,Premier League,Matchweek 4,Sun,Home,W,3.0,0.0,Aston Villa,...,4-3-3,Simon Hooper,Match Report,,17,4,14.7,0.0,0,0
4,2023-09-16,12:30,Premier League,Matchweek 5,Sat,Away,W,3.0,1.0,Wolves,...,4-3-3,Michael Oliver,Match Report,,16,5,15.8,0.0,0,0


This just added additional columns in the end of the frame

In [289]:
matches_first_team.shape

(56, 19)

In [290]:
first_team_shooting_frame.shape

(44, 26)

### Now we should scrape the data for multiple seasons and for all of the teams existed in the English Premier League

In [291]:
years = list(range(2024, 2021, -1))
years

[2024, 2023, 2022]

In [292]:
# several DataFrames and each DataFrame contains the match log for a team
all_matches = []

In [293]:
for year in years:
  # get the links to the teams through the table.stats_table
  data = requests.get(standings_url)
  soup = BeautifulSoup(data.text)
  table = soup.select("table.stats_table")[0]
  links = standings_table.find_all("a")
  links = [l.get("href") for l in links]
  links = [l for l in links if "/squads/" in l]

  previous_season = soup.select("a.prev")[0].get("href")
  standings_url = f"https://fbref.com{previous_season}"

  team_urls = [f"https://fbref.com{l}" for l in links]
  for team in team_urls:
    team_name = team.split("/")[-1].replace("-Stats", "").replace("-", " ")
    data = requests.get(team)
    # read the table that has the Scores and Fixtures
    matches = pd.read_html(data.text, match="Scores & Fixtures")[0]
    # get the link to the scoring data
    soup = BeautifulSoup(data.text)
    links = [l.get("href") for l in soup.find_all('a')]
    links = [l for l in links if l and "all_comps/shooting/" in l]
    data = requests.get(f"https://fbref.com{links[0]}")
    shooting = pd.read_html(data.text, match='Shooting')[0]
    shooting.columns = shooting.columns.droplevel()

    try:
      team_data = matches.merge(shooting[["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]], on="Date")
    # for some teams the shooting table is not available
    except ValueError:
      continue

    # filter by competition and add column indicating years and team names
    team_data = team_data[team_data["Comp"] == "Premier League"]
    team_data["Season"] = year
    team_data["Team"] = team_name
    all_matches.append(team_data)
    time.sleep(3)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  team_data["Season"] = year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  team_data["Team"] = team_name


In [294]:
match_df = pd.concat(all_matches)
match_df.columns = [c.lower() for c in match_df.columns]

In [295]:
match_df.to_csv("matches.csv")