# Web Scraping Project - Football Data

In this project I walk through how to scrape data from websites using BeautifulSoup and Requests, and how to automatically update a time series of data in a CSV file that can be used as a dataset for further analysis. For this project we scrape Serie A stats from the FBref website.


In [1]:
# Import libraries 

from bs4 import BeautifulSoup
import requests
import time
import datetime
import csv
import pandas as pd

In [2]:
# Now I create a function to check Serie A Stats data from FBref Website

def check_serieA():
    """Scrape the Serie A table from FBref and write it to a CSV file.

    Downloads the FBref Serie A page, keeps the first 13 ``data-stat``
    columns found on elements with class ``right``, collects up to 20 club
    names from elements with class ``left`` whose ``data-stat`` is ``team``,
    and writes one row per club to ``FBrefWebScraperDataset.csv`` in the
    current working directory. Each row is the club name, its stats, and
    today's date.

    Raises:
        requests.RequestException: if the page cannot be downloaded
            (network error, timeout, or HTTP error status).
        ValueError: if no stat columns or club names are found in the page;
            the CSV file is not touched in that case.
    """
    n_stats = 13  # number of leading stat columns kept per club
    n_clubs = 20  # maximum number of clubs read from the page

    # Connect to Website and pull in data
    url = 'https://fbref.com/it/comp/11/Statistiche-di-Serie-A'
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36 Edg/105.0.1343.27",
        "Accept-Encoding": "gzip, deflate",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "DNT": "1",
        "Connection": "close",
        "Upgrade-Insecure-Requests": "1",
    }

    # A timeout keeps a stalled connection from hanging forever, and the
    # status check stops an error page from being parsed as if it were data.
    page = requests.get(url, headers=headers, timeout=30)
    page.raise_for_status()

    soup = BeautifulSoup(page.content, "html.parser")
    # The markup is re-parsed from its prettified form; every cell text is
    # stripped below, so the extra surrounding whitespace is harmless.
    soup = BeautifulSoup(soup.prettify(), "html.parser")

    # Only elements that actually carry a data-stat attribute are usable.
    stat_cells = [
        cell for cell in soup.find_all(class_='right')
        if cell.has_attr('data-stat')
    ]
    stat_names = [cell['data-stat'] for cell in stat_cells[:n_stats]]

    # Create the first column for the club name
    clubs = [
        cell.text.strip()
        for cell in soup.find_all(class_='left')
        if cell.get('data-stat') == 'team'
    ][:n_clubs]

    if not stat_names or not clubs:
        raise ValueError(
            "no stat columns or club names found in the FBref page"
        )

    # Keep, in document order, the text of every cell that belongs to one of
    # the selected stat columns.
    wanted = set(stat_names)
    values = [
        cell.text.strip()
        for cell in stat_cells
        if cell['data-stat'] in wanted
    ]

    # Timestamp to track when the data was collected
    today = str(datetime.date.today())

    # Consecutive chunks of `width` values belong to consecutive clubs.
    width = len(stat_names)
    rows = [
        [club, *values[pos * width:(pos + 1) * width], today]
        for pos, club in enumerate(clubs)
    ]

    # NOTE(review): 'w' truncates the file on every call, so only the latest
    # snapshot is kept; switch to append mode if a history across runs is
    # wanted.
    with open('FBrefWebScraperDataset.csv', 'w', newline='', encoding='UTF8') as f:
        writer = csv.writer(f)
        writer.writerow(['team', *stat_names, 'date'])
        writer.writerows(rows)
    

In [5]:
# Run the scraper once, then load the CSV into a DataFrame and display it.
# NOTE(review): check_serieA writes to the current working directory, while
# the path below goes through the parent directory -- confirm both resolve
# to the same file.
check_serieA()

csv_path = r'..\web_scraping_project\FBrefWebScraperDataset.csv'
df = pd.read_csv(csv_path)
df

Unnamed: 0,team,rank,games,wins,ties,losses,goals_for,goals_against,goal_diff,points,points_avg,xg_for,xg_against,xg_diff,date
0,Napoli,1,5,3,2,0,12,4,8,11,220,10.5,3.2,7.3,2022-09-05
1,Milan,2,5,3,2,0,10,5,5,11,220,7.5,4.0,3.5,2022-09-05
2,Udinese,3,5,3,1,1,9,5,4,10,200,5.8,6.4,-0.6,2022-09-05
3,Atalanta,4,4,3,1,0,7,2,5,10,250,6.5,2.6,3.9,2022-09-05
4,Roma,5,5,3,1,1,6,5,1,10,200,10.0,3.6,6.4,2022-09-05
5,Juventus,6,5,2,3,0,7,2,5,9,180,5.2,5.0,0.2,2022-09-05
6,Inter,7,5,3,0,2,11,8,3,9,180,10.5,4.4,6.0,2022-09-05
7,Lazio,8,5,2,2,1,7,5,2,8,160,3.9,7.2,-3.3,2022-09-05
8,Torino,9,4,2,1,1,5,5,0,7,175,6.3,6.4,-0.1,2022-09-05
9,Fiorentina,10,5,1,3,1,4,4,0,6,120,6.0,5.2,0.8,2022-09-05


In [None]:
# Runs check_serieA every 100 seconds so the CSV is rewritten periodically.
# A failed download is reported and retried on the next cycle instead of
# stopping the loop.

while True:
    try:
        check_serieA()
    except requests.RequestException as err:
        print(f"check_serieA failed, will retry: {err}")
    time.sleep(100)