<a href="https://colab.research.google.com/github/superpanditas/web-scraping-toolkit/blob/main/Global_Soccer_Data_Scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Step 0 - Getting Started
- Global Soccer Data Scraper: In this exercise, I developed a web scraper to extract soccer outcomes from various leagues around the world. The task involves selecting from a range of options in a dropdown menu, with the ultimate goal of building a predictive model for match outcomes.

In [1]:
# @title Download Libraries Required

!pip install --upgrade selenium
# !pip install --upgrade requests
# !pip install --upgrade pandas

Collecting selenium
  Downloading selenium-4.25.0-py3-none-any.whl.metadata (7.1 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.27.0-py3-none-any.whl.metadata (8.6 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting sortedcontainers (from trio~=0.17->selenium)
  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Collecting h11<1,>=0.9.0 (from wsproto>=0.14->trio-websocket~=0.9->selenium)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading selenium-4.25.0-py3-none-any.whl (9.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownload

In [2]:
# @title Download Google-Chrome & Chromedriver

'''
!google-chrome --version # version 130.0
!chromedriver --version # version 130.0
!pip show selenium # version 4.25.0
'''

## Download the Google-Chrome (Unix)
!wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
!dpkg -i google-chrome-stable_current_amd64.deb
!apt-get -f install
print('Google-Chrome Installed')
## --------------------------------------------
## Download the ChromeDriver
!wget https://storage.googleapis.com/chrome-for-testing-public/130.0.6723.58/linux64/chromedriver-linux64.zip
!unzip -o chromedriver-linux64.zip
!mv chromedriver-linux64/chromedriver /usr/local/bin/chromedriver
!chmod +x /usr/local/bin/chromedriver
print('Chromedriver Installed')

--2024-10-24 01:55:14--  https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
Resolving dl.google.com (dl.google.com)... 74.125.135.136, 74.125.135.91, 74.125.135.93, ...
Connecting to dl.google.com (dl.google.com)|74.125.135.136|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 111353576 (106M) [application/x-debian-package]
Saving to: ‘google-chrome-stable_current_amd64.deb’


2024-10-24 01:55:14 (184 MB/s) - ‘google-chrome-stable_current_amd64.deb’ saved [111353576/111353576]

Selecting previously unselected package google-chrome-stable.
(Reading database ... 123622 files and directories currently installed.)
Preparing to unpack google-chrome-stable_current_amd64.deb ...
Unpacking google-chrome-stable (130.0.6723.69-1) ...
[1mdpkg:[0m dependency problems prevent configuration of google-chrome-stable:
 google-chrome-stable depends on libvulkan1; however:
  Package libvulkan1 is not installed.

[1mdpkg:[0m error processing package googl

In [3]:
# @title Import Libraries
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.ui import Select
import pandas as pd
import time

# Step 1 - Global Soccer Data Scraper

In [4]:
# @title Global Soccer Data Class
class SoccerDataScraper:
    def __init__(self, driver_path, binary_location):
        self.driver_path = driver_path
        self.binary_location = binary_location
        self.driver = self._initialize_driver()

    def _initialize_driver(self):
        options = Options()
        options.binary_location = self.binary_location
        options.add_argument('--headless')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        options.add_argument('--disable-gpu')
        options.add_argument("--window-size=1920,1200")

        service = Service(self.driver_path)
        driver = webdriver.Chrome(options=options, service=service)
        return driver

    def open_website(self, url):
        self.driver.get(url)
        self.wait = WebDriverWait(self.driver, 10)

    def select_all_matches(self):
        button_all_matches = self.wait.until(EC.element_to_be_clickable((By.XPATH, '//label[@analytics-event="All matches"]')))
        button_all_matches.click()

    def select_country(self, country_name):
        dropdown = Select(self.driver.find_element(By.ID, 'country'))
        dropdown.select_by_visible_text(country_name)
        time.sleep(10)

    def select_season(self, season_name):
        dropdown = Select(self.driver.find_element(By.ID, 'season'))
        dropdown.select_by_visible_text(season_name)
        time.sleep(10)

    def get_league(self, xpath="//select[@id='league']/option[@selected='selected']"):
        league = self.driver.find_element(By.XPATH, xpath).text
        return league

    def scrape_matches(self, country_name, season_name, league_name):
        #matches = self.wait.until(EC.presence_of_all_elements_located((By.TAG_NAME, 'tr')))
        matches = self.driver.find_elements(By.TAG_NAME, 'tr')
        time.sleep(10)
        date, home_team, score, home_away = [], [], [], []

        if not matches:
          print('Not data available ...')
          return

        for row in matches:
            try:
                tds = row.find_elements(By.TAG_NAME, 'td')
                if len(tds) < 4:
                    print(f'Incomplete row for {country_name} - {league_name} - {season_name}')
                    continue

                # Extract data from each column
                date_element = tds[0].text
                home_team_element = tds[1].text
                score_element = tds[2].text
                home_away_element = tds[3].text

                date.append(date_element)
                home_team.append(home_team_element)
                score.append(score_element)
                home_away.append(home_away_element)

            except Exception as e:
                print(f'Exception Raised: {row} - {e}')

        return pd.DataFrame({
            'date': date,
            'home_team': home_team,
            'score': score,
            'home_away': home_away,
            'country': [country_name] * len(date),
            'league': [league_name] * len(date),
            'season': [season_name] * len(date)
        })

    def scrape_multiple_countries_and_seasons(self, countries_and_seasons):
        all_data = pd.DataFrame()

        for country, seasons in countries_and_seasons.items():
            print(f'Scraping data for {country}')
            self.select_country(country)
            league = self.get_league()

            for season in seasons:
                print(f'Scraping data for {country} - {season}')
                self.select_season(season)

                time.sleep(5)

                data_scraped = self.scrape_matches(country, season, league)

                all_data = pd.concat([all_data, data_scraped], ignore_index=True)

        return all_data


    def save_data(self, data, file_name):
        data.to_csv(file_name, index=False)
        print(f'Data saved to {file_name}')

    def close_driver(self):
        self.driver.quit()

if __name__ == "__main__":
    DRIVER_PATH = '/usr/local/bin/chromedriver'
    BINARY_LOCATION = '/bin/google-chrome'
    WEBSITE_URL = 'https://www.adamchoi.co.uk/overs/detailed'

    COUNTRIES_AND_SEASONS = {
        'Mexico': ['2024/2025', '23/24', '22/23'],
        'England':['2024/2025', '23/24', '22/23']
    }

    # initialize soccer data scraper class
    scraper = SoccerDataScraper(driver_path=DRIVER_PATH, binary_location=BINARY_LOCATION)

    # open WEBSITE
    scraper.open_website(WEBSITE_URL)

    # select 'All matches'
    scraper.select_all_matches()

    # scrape data
    match_data = scraper.scrape_multiple_countries_and_seasons(COUNTRIES_AND_SEASONS)

    # save data to CSV
    scraper.save_data(match_data, 'all_soccer_data.csv')

    # close the WEBDRIVER
    scraper.close_driver()


Scraping data for Mexico
Scraping data for Mexico - 2024/2025
Incomplete row for Mexico - Liga MX - 2024/2025
Incomplete row for Mexico - Liga MX - 2024/2025
Incomplete row for Mexico - Liga MX - 2024/2025
Incomplete row for Mexico - Liga MX - 2024/2025
Incomplete row for Mexico - Liga MX - 2024/2025
Incomplete row for Mexico - Liga MX - 2024/2025
Incomplete row for Mexico - Liga MX - 2024/2025
Incomplete row for Mexico - Liga MX - 2024/2025
Incomplete row for Mexico - Liga MX - 2024/2025
Incomplete row for Mexico - Liga MX - 2024/2025
Incomplete row for Mexico - Liga MX - 2024/2025
Incomplete row for Mexico - Liga MX - 2024/2025
Incomplete row for Mexico - Liga MX - 2024/2025
Incomplete row for Mexico - Liga MX - 2024/2025
Incomplete row for Mexico - Liga MX - 2024/2025
Incomplete row for Mexico - Liga MX - 2024/2025
Incomplete row for Mexico - Liga MX - 2024/2025
Incomplete row for Mexico - Liga MX - 2024/2025
Incomplete row for Mexico - Liga MX - 2024/2025
Incomplete row for Mexico 

In [5]:
# @title Load Data
all_soccer_data_path = '/content/all_soccer_data.csv'
all_soccer_data = pd.read_csv(all_soccer_data_path)
print(all_soccer_data.shape)

(2586, 7)


In [6]:
# @title Output
all_soccer_data.head(15)

Unnamed: 0,date,home_team,score,home_away,country,league,season
0,06-07-2024,Atlético San Luis,2 - 1,América,Mexico,Liga MX,2024/2025
1,13-07-2024,América,3 - 1,Querétaro,Mexico,Liga MX,2024/2025
2,18-07-2024,Tigres UANL,1 - 0,América,Mexico,Liga MX,2024/2025
3,21-07-2024,Juárez,1 - 2,América,Mexico,Liga MX,2024/2025
4,25-08-2024,América,0 - 1,Puebla,Mexico,Liga MX,2024/2025
5,01-09-2024,Cruz Azul,4 - 1,América,Mexico,Liga MX,2024/2025
6,15-09-2024,América,1 - 0,Guadalajara,Mexico,Liga MX,2024/2025
7,18-09-2024,América,3 - 0,Atlas,Mexico,Liga MX,2024/2025
8,22-09-2024,Necaxa,1 - 1,América,Mexico,Liga MX,2024/2025
9,30-09-2024,América,0 - 1,Pumas UNAM,Mexico,Liga MX,2024/2025
