In [121]:
# SCRAPING OUR FIRST PAGE WITH REQUESTS:

# First we're going to figure out how to download the HTML of a page that shows the standings for the Italian Serie A.
# To do that we're going to use the requests library.

from io import StringIO

import requests

In [122]:
# Next we define the URL that we are going to start scraping from: the Serie A standings page on fbref.com.

standings_url = "https://fbref.com/en/comps/11/Serie-A-Stats"

In [123]:
# Next we download the page defined above using the get function of the requests library. This makes a
# request to the server and downloads the HTML of the page. The timeout stops the call from waiting forever
# if the server never answers, and raise_for_status() stops right here with a clear error if the server
# answered with an error status instead of the page we asked for.


data = requests.get(standings_url, timeout=30)
data.raise_for_status()

In [125]:
# PARSING THE HTML LINKS WITH BEAUTIFULSOUP:

# In order to actually parse our html we're going to use a library called beautiful soup:


from bs4 import BeautifulSoup

In [126]:
# Next we initialize it using our HTML. We name the parser explicitly ("html.parser" ships with Python) so
# the page is parsed the same way on every machine instead of with whichever parser happens to be installed.

soup = BeautifulSoup(data.text, "html.parser")

In [127]:
# Next we give our soup object something to select from the web page. To select the table we call
# soup.select('table.stats_table'). The argument is a CSS selector: the tag name "table", then a ".", then the
# class name, which on this page is "stats_table". This selects the table elements on the page that have the
# class stats_table; we take the first match ([0]) and assign it to a variable called standings_table. This
# removes the bulk of the HTML, leaving only the HTML for that table, which is still a lot but narrows things
# down. We can see the HTML for the table by running standings_table in its own cell.

standings_table = soup.select('table.stats_table')[0]

In [128]:
# Next we need to find all of the <a> tags (the links) inside our table:

links = standings_table.find_all('a')

In [129]:
# Next we want the href property of each link, so we write a list comprehension:

links = [anchor.get("href") for anchor in links]

# This goes through each of the <a> elements and looks up the value of its href property.

In [130]:
# Next we filter our links so we only keep the squad links, using another list comprehension.

links = [href for href in links if href and '/squads/' in href]

# This keeps a link only when it contains "/squads/". The "href and" check skips <a> tags that have no href
# (for those, .get("href") gave us None) instead of failing with a TypeError.
# Next, type links in another cell and run it to see what you have.

In [131]:
team_urls = [f"https://fbref.com{path}" for path in links]

# This takes each of our relative links and puts the site address above in front of it, giving an absolute URL.

In [132]:
# EXTRACT MATCH STATS USING PANDAS AND REQUESTS:

# Now we can start getting the stats we want from one of these team URLs; for now we'll just work with the
# first team URL.

team_url = team_urls[0]

In [133]:
# Now we again use requests to get the HTML from that URL. The timeout keeps the call from hanging forever and
# raise_for_status() raises an error if the server answered with an error status instead of the team page.

data = requests.get(team_url, timeout=30)
data.raise_for_status()

In [134]:
# Next we import pandas, which will read what we need ("Scores & Fixtures") from our team page and turn it into a dataset
# for us: we're going to turn that match table from fbref.com into a pandas DataFrame using the pandas read_html function.


import pandas as pd

In [None]:
# pd.read_html reads all of the tables on the page, and "match" looks for a specific string inside each table,
# so essentially we scan all of the table tags and keep the ones that contain the string "Scores & Fixtures".
# The HTML text is wrapped in StringIO because recent pandas versions deprecate passing a literal HTML
# string to read_html.

matches = pd.read_html(StringIO(data.text), match="Scores & Fixtures")

In [None]:
# Preview the first rows of the first table that matched "Scores & Fixtures".
matches[0].head()

In [137]:
# GET MATCH SHOOTING STATS WITH REQUESTS AND PANDAS

# The first thing we must do to get the shooting stats is find the URL of the shooting stats page.
# We use Beautiful Soup again: we initialize an instance and pass in the HTML of the team page. The parser is
# named explicitly ("html.parser" ships with Python) so the page is parsed the same way on every machine.


soup = BeautifulSoup(data.text, "html.parser")

In [138]:
# Next we find all the links (<a> tags) on the page.

links = soup.find_all('a')

In [139]:
# Next we use a list comprehension to get the actual URL (the href) of each link.

links = [anchor.get("href") for anchor in links]

In [140]:
# Then we use another list comprehension to filter our links and keep only the links to the shooting stats.

links = [href for href in links if href and 'all_comps/shooting/' in href]

# So any link that has an href and contains "all_comps/shooting/" is kept as a shooting link.


In [141]:
# Now we grab the HTML for this specific link, turning the relative link into an absolute URL first.

data = requests.get(f"https://fbref.com{links[0]}", timeout=30)
data.raise_for_status()

# This downloads the shooting page. The timeout keeps the call from hanging forever, and raise_for_status()
# raises an error if the server answered with an error status instead of the page.

In [None]:
# We can do something fairly similar to before, using pandas to read in our shooting stats. The HTML text is
# wrapped in StringIO because recent pandas versions deprecate passing a literal HTML string to read_html.

shooting = pd.read_html(StringIO(data.text), match="Shooting")[0]

In [None]:
# CLEANING AND MERGING SCRAPED DATA WITH PANDAS

# Preview the first rows of the shooting table.
shooting.head()

In [144]:
shooting.columns = shooting.columns.droplevel()

# This drops the top level of the column index for us; if we run shooting.head() again we see that level is gone.

In [None]:
# Preview the shooting table again, now that the top column level has been dropped.
shooting.head()

In [None]:
# Now we can do something like:

shooting["Date"]

# which gives us just the "Date" column.

In [None]:
# Or we could do something like:

shooting["Result"]

# which gives us just the "Result" column.

In [148]:
# We now have two separate data frames: the match data and the shooting data. We combine them with pandas'
# merge, joining on the "Date" column and taking only the listed shooting columns, and store the result in a
# variable called team_data.

team_data = pd.merge(
    matches[0],
    shooting[["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]],
    on="Date",
)

In [None]:
team_data.head()

# Now we can see that both of our data frames are merged together: we've essentially taken the matches data
# frame and added a few extra shooting columns to the end.

In [160]:
# So what we have accomplished here is we have gone ahead and scraped the standings and then we have downloaded the data for a 
# single team and we've combined the data for a single team in a single season into one data frame 

In [161]:
# SCRAPING DATA FOR MULTIPLE SEASONS AND TEAMS WITH A LOOP

In [None]:
# Seasons to scrape, newest first: 2025 is the 2024-2025 season and 2024 is the 2023-2024 season.
latest_season = 2025
season_count = 2
years = [latest_season - offset for offset in range(season_count)]

In [None]:
# Display the list of seasons that will be scraped.
years

In [153]:
# Next we initialize a list called all_matches. When our loop is finished this list will contain several data
# frames, each holding the match logs for one team in one season. So we'll end up with a bunch of little data
# frames that we then combine into one big data frame once the loop is finished.


all_matches = []

In [154]:
# Then we define the URL that we want to start on; this is the same URL we used initially.
# We add another layer on top of it: in the loop in the next cell we also go to previous seasons and scrape
# those seasons as well.

standings_url = "https://fbref.com/en/comps/11/Serie-A-Stats"

In [None]:
import time

# Pause (in seconds) after every request. A lot of websites, including fbref, allow scraping but don't want
# you to scrape too quickly, because it can slow their site down; pausing keeps us polite and helps us avoid
# getting blocked.
# NOTE(review): the original paused 1 second once per team; check the site's current crawl-rate policy and
# raise this value if requests start failing with HTTP 429.
REQUEST_DELAY_SECONDS = 1

# Give up on a request that has not answered after this many seconds instead of hanging forever.
REQUEST_TIMEOUT_SECONDS = 30

# Columns taken from the shooting table and added to each team's match log.
SHOOTING_COLUMNS = ["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]

for year in years:
    # 1st: download the standings page for this season. raise_for_status() stops with a clear error if the
    # server answered with an error status instead of the page.
    data = requests.get(standings_url, timeout=REQUEST_TIMEOUT_SECONDS)
    data.raise_for_status()
    time.sleep(REQUEST_DELAY_SECONDS)

    # 2nd: parse that HTML with Beautiful Soup (parser named explicitly so results don't depend on what is
    # installed), and 3rd: grab the stats table, which contains the links to every individual team page.
    soup = BeautifulSoup(data.text, "html.parser")
    standings_table = soup.select("table.stats_table")[0]

    # 4th: get the href of every link in the table, 5th: keep only the squad links (skipping <a> tags that
    # have no href at all), and 6th: turn them from relative links into absolute links.
    links = [anchor.get("href") for anchor in standings_table.find_all("a")]
    links = [href for href in links if href and "/squads/" in href]
    team_urls = [f"https://fbref.com{href}" for href in links]

    # Grab the link to the previous season and make it absolute, so the next pass of the outer loop scrapes
    # the season before this one.
    previous_season = soup.select("a.prev")[0].get("href")
    standings_url = f"https://fbref.com{previous_season}"

    # 7th: loop through each team URL and scrape the match logs for that team.
    for team_url in team_urls:
        # 8th: take everything after the last "/" and strip "-Stats" and the dashes, leaving the team name.
        team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")

        # 9th: download the team page and read its "Scores & Fixtures" table into matches. The HTML is wrapped
        # in StringIO because recent pandas versions deprecate passing a literal HTML string to read_html.
        data = requests.get(team_url, timeout=REQUEST_TIMEOUT_SECONDS)
        data.raise_for_status()
        time.sleep(REQUEST_DELAY_SECONDS)
        matches = pd.read_html(StringIO(data.text), match="Scores & Fixtures")[0]

        # Next we parse the team page and pull out the "all_comps/shooting/" link, because that page holds
        # the shooting stats.
        soup = BeautifulSoup(data.text, "html.parser")
        links = [anchor.get("href") for anchor in soup.find_all("a")]
        links = [href for href in links if href and "all_comps/shooting/" in href]
        if not links:
            # No shooting page is linked for this team, so there is nothing to merge; skip the team.
            print(f"Skipping {team_name} ({year}): no shooting stats link found")
            continue

        # Then we convert the link to an absolute URL and download the shooting page.
        data = requests.get(f"https://fbref.com{links[0]}", timeout=REQUEST_TIMEOUT_SECONDS)
        data.raise_for_status()
        time.sleep(REQUEST_DELAY_SECONDS)

        # Then we read in the shooting stats with pandas, drop the top level of the column index, and merge
        # the shooting stats with the match stats on "Date".
        #
        # For some teams the shooting stats aren't available: pandas then raises a ValueError (no matching
        # table, nothing to drop, or a failed merge) or a KeyError (expected columns missing). In those cases
        # we skip the team and carry on with the loop, printing a note so the skip is visible.
        try:
            shooting = pd.read_html(StringIO(data.text), match="Shooting")[0]
            shooting.columns = shooting.columns.droplevel()
            team_data = matches.merge(shooting[SHOOTING_COLUMNS], on="Date")
        except (KeyError, ValueError):
            print(f"Skipping {team_name} ({year}): shooting stats not available")
            continue

        # Next we filter so only league games remain (ONLY Serie A games; no Champions League, domestic cup
        # games or friendlies). .copy() gives us an independent data frame, so adding columns below does not
        # trigger pandas' SettingWithCopyWarning.
        team_data = team_data[team_data["Comp"] == "Serie A"].copy()

        # Next we add Season and Team columns. We're going to combine this table with a lot of other tables
        # for other teams and seasons, so we need a way to tell which team and which season each row is for.
        # This is really something to be aware of when you're web scraping.
        team_data["Season"] = year
        team_data["Team"] = team_name

        # Finally we add this team's data frame to all_matches, our list of data frames.
        all_matches.append(team_data)

# This is the loop; there are a couple of things we must do after the loop.

In [None]:
# Show how many team-season data frames the loop collected.
len(all_matches)

In [157]:
# The first thing we need to do is combine all of our individual data frames into one data frame. We use the
# concat function in pandas to do this: it takes a list of data frames as input and returns a single data frame.

match_df = pd.concat(all_matches)

In [158]:
# Another thing we can do, though it is not necessary, is make all the column names lowercase; this is purely
# a matter of preference.

match_df.columns = [column.lower() for column in match_df.columns]

# This goes through all the column names in the match data frame, lowercases each one, and assigns the
# result back, replacing all the column names.

In [None]:
# Display the combined data frame.
match_df

In [160]:
# The final thing we want to do is write this to CSV using the pandas to_csv method, which writes all of our
# data to a CSV file called matches.csv.

match_df.to_csv("matches.csv")