<a href="https://colab.research.google.com/github/superpanditas/web-scraping-toolkit/blob/main/Soccer_Data_Scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Step 0 - Getting Started
- Business Problem: A company has requested the scraping of all Premier League matches to build a predictive model for match outcomes, aimed at increasing betting success and profitability.

In [None]:
# @title Download Libraries Required

!pip install --upgrade selenium
# !pip install --upgrade requests
# !pip install --upgrade pandas

Collecting selenium
  Downloading selenium-4.25.0-py3-none-any.whl.metadata (7.1 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.27.0-py3-none-any.whl.metadata (8.6 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting sortedcontainers (from trio~=0.17->selenium)
  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Collecting h11<1,>=0.9.0 (from wsproto>=0.14->trio-websocket~=0.9->selenium)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading selenium-4.25.0-py3-none-any.whl (9.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m42.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownload

In [None]:
# @title Download Google-Chrome & Chromedriver

'''
!google-chrome --version # version 130.0
!chromedriver --version # version 130.0
!pip show selenium # version 4.25.0
'''

## Download the Google-Chrome (Unix)
!wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
!dpkg -i google-chrome-stable_current_amd64.deb
!apt-get -f install
print('Google-Chrome Installed')
## --------------------------------------------
## Download the ChromeDriver
!wget https://storage.googleapis.com/chrome-for-testing-public/130.0.6723.58/linux64/chromedriver-linux64.zip
!unzip -o chromedriver-linux64.zip
!mv chromedriver-linux64/chromedriver /usr/local/bin/chromedriver
!chmod +x /usr/local/bin/chromedriver
print('Chromedriver Installed')

--2024-10-21 15:34:09--  https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
Resolving dl.google.com (dl.google.com)... 74.125.132.136, 74.125.132.93, 74.125.132.190, ...
Connecting to dl.google.com (dl.google.com)|74.125.132.136|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 111386844 (106M) [application/x-debian-package]
Saving to: ‘google-chrome-stable_current_amd64.deb’


2024-10-21 15:34:10 (238 MB/s) - ‘google-chrome-stable_current_amd64.deb’ saved [111386844/111386844]

Selecting previously unselected package google-chrome-stable.
(Reading database ... 123629 files and directories currently installed.)
Preparing to unpack google-chrome-stable_current_amd64.deb ...
Unpacking google-chrome-stable (130.0.6723.58-1) ...
[1mdpkg:[0m dependency problems prevent configuration of google-chrome-stable:
 google-chrome-stable depends on libvulkan1; however:
  Package libvulkan1 is not installed.

[1mdpkg:[0m error processing package goog

In [None]:
# @title Import Libraries
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.ui import Select
import pandas as pd
import time

# Step 1 - Scraping Soccer Data

In [None]:
# @title Soccer Data Scraper Class
class SoccerDataScraper:
    """Scrape match rows from a results page with headless Chrome.

    The selectors used here (the "All matches" label and the ``country``
    dropdown) match the page this notebook opens,
    https://www.adamchoi.co.uk/overs/detailed.

    The scraper owns a browser process. Either call ``close_driver()`` when
    done or use the instance as a context manager so the browser is shut down
    even when a step fails::

        with SoccerDataScraper(driver_path, binary_location) as scraper:
            scraper.open_website(url)
            ...

    Args:
        driver_path: Filesystem path of the chromedriver executable.
        binary_location: Filesystem path of the Chrome binary.
    """

    # Output columns, in the order of the first four <td> cells of a row.
    COLUMNS = ('date', 'home_team', 'score', 'home_away')

    def __init__(self, driver_path, binary_location):
        self.driver_path = driver_path
        self.binary_location = binary_location
        self.driver = self._initialize_driver()
        # Created here (not in open_website) so that every method can rely on
        # self.wait existing, whatever order the methods are called in.
        self.wait = WebDriverWait(self.driver, 10)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Always release the browser; returning False lets any error propagate.
        self.close_driver()
        return False

    def _initialize_driver(self):
        """Build and return a headless Chrome WebDriver."""
        options = Options()
        options.binary_location = self.binary_location
        options.add_argument('--headless')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        options.add_argument('--disable-gpu')
        options.add_argument("--window-size=1920,1200")

        service = Service(self.driver_path)
        return webdriver.Chrome(options=options, service=service)

    def open_website(self, url):
        """Navigate the browser to ``url``."""
        self.driver.get(url)

    def select_all_matches(self):
        """Click the "All matches" toggle once it becomes clickable."""
        button_all_matches = self.wait.until(
            EC.element_to_be_clickable((By.XPATH, '//label[@analytics-event="All matches"]'))
        )
        button_all_matches.click()

    def select_country(self, country_name, settle_seconds=10):
        """Choose ``country_name`` in the ``country`` dropdown.

        Args:
            country_name: Visible text of the dropdown option to select.
            settle_seconds: Fixed pause after selecting, before returning.
                NOTE(review): presumably this gives the page time to reload
                its tables; confirm whether an explicit wait condition can
                replace it.
        """
        # Wait for the dropdown instead of assuming it is already in the DOM.
        dropdown_element = self.wait.until(EC.presence_of_element_located((By.ID, 'country')))
        Select(dropdown_element).select_by_visible_text(country_name)
        time.sleep(settle_seconds)

    def scrape_matches(self, drop_duplicates=False):
        """Read every ``<tr>`` on the page into a DataFrame.

        Rows with fewer than four ``<td>`` cells are skipped; a single summary
        line reports how many were skipped. A row that raises while being read
        is reported and skipped so one bad row does not abort the scrape.

        Args:
            drop_duplicates: When True, identical rows are removed. The sample
                output of this notebook shows the same match listed twice
                (once under each team), so this is useful before modelling.
                Defaults to False, which returns every scraped row.

        Returns:
            DataFrame with columns ``date``, ``home_team``, ``score`` and
            ``home_away`` (the text of the 1st-4th cell of each row).
        """
        rows = self.wait.until(EC.presence_of_all_elements_located((By.TAG_NAME, 'tr')))
        n_columns = len(self.COLUMNS)
        records = []
        skipped = 0

        for row in rows:
            try:
                tds = row.find_elements(By.TAG_NAME, 'td')
                if len(tds) < n_columns:
                    skipped += 1
                    continue
                # Read all cells before appending so a failure part-way
                # through a row never leaves a partial record behind.
                records.append([td.text for td in tds[:n_columns]])
            except Exception as e:
                print(f'Exception Raised: {row} - {e}')

        if skipped:
            print(f'Skipped {skipped} rows with fewer than {n_columns} cells')

        return self._build_frame(records, drop_duplicates=drop_duplicates)

    @classmethod
    def _build_frame(cls, records, drop_duplicates=False):
        """Turn a list of 4-item text records into the output DataFrame."""
        frame = pd.DataFrame(list(records), columns=list(cls.COLUMNS))
        if drop_duplicates:
            frame = frame.drop_duplicates().reset_index(drop=True)
        return frame

    def save_data(self, data, file_name):
        """Write ``data`` to ``file_name`` as CSV, without the index column."""
        data.to_csv(file_name, index=False)
        print(f'Data saved to {file_name}')

    def close_driver(self):
        """Quit the browser. Safe to call more than once."""
        driver = getattr(self, 'driver', None)
        if driver is not None:
            driver.quit()
            self.driver = None

if __name__ == "__main__":
    # End-to-end run: open the site, show all matches for England, scrape the
    # rows and save them to soccer_data.csv.
    DRIVER_PATH = '/usr/local/bin/chromedriver'
    BINARY_LOCATION = '/bin/google-chrome'
    WEBSITE_URL = 'https://www.adamchoi.co.uk/overs/detailed'

    # initialize soccer data scraper class (this starts the browser)
    scraper = SoccerDataScraper(driver_path=DRIVER_PATH, binary_location=BINARY_LOCATION)

    # try/finally guarantees the browser is shut down even if a step below
    # raises (for example a wait timing out).
    try:
        # open WEBSITE
        scraper.open_website(WEBSITE_URL)

        # select 'All matches' and 'country'
        scraper.select_all_matches()
        scraper.select_country('England')

        # scrape data
        match_data = scraper.scrape_matches()

        # save data to CSV
        scraper.save_data(match_data, 'soccer_data.csv')
    finally:
        # close the WEBDRIVER
        scraper.close_driver()


Incomplete row
Incomplete row
Incomplete row
Incomplete row
Incomplete row
Incomplete row
Incomplete row
Incomplete row
Incomplete row
Incomplete row
Incomplete row
Incomplete row
Incomplete row
Incomplete row
Incomplete row
Incomplete row
Incomplete row
Incomplete row
Incomplete row
Incomplete row
Data saved to soccer_data.csv


In [None]:
# @title Load Data
# Read the CSV from the same relative path the scraper cell writes to
# ('soccer_data.csv'), so this cell does not depend on the working directory
# being /content.
soccer_data_path = 'soccer_data.csv'
soccer_data = pd.read_csv(soccer_data_path)

In [None]:
# Preview the first 15 rows of the loaded CSV.
soccer_data.head(15)

Unnamed: 0,date,home_team,score,home_away
0,17-08-2024,Arsenal,2 - 0,Wolves
1,24-08-2024,Aston Villa,0 - 2,Arsenal
2,31-08-2024,Arsenal,1 - 1,Brighton
3,15-09-2024,Tottenham,0 - 1,Arsenal
4,22-09-2024,Man City,2 - 2,Arsenal
5,28-09-2024,Arsenal,4 - 2,Leicester
6,05-10-2024,Arsenal,3 - 1,Southampton
7,19-10-2024,Bournemouth,2 - 0,Arsenal
8,17-08-2024,West Ham,1 - 2,Aston Villa
9,24-08-2024,Aston Villa,0 - 2,Arsenal


# Behind the Scenes

In [None]:
# Step-by-step version of the scraper, without the class wrapper.
# NOTE(review): this cell selects 'Spain' and writes to 'soccer_data.csv',
# the same file the England scrape above writes to, and it rebinds the
# `soccer_data` variable. Confirm that overwriting both is intended.

# configure chrome options
options = Options()
options.binary_location = '/bin/google-chrome'
options.add_argument('--headless')  # Run in headless mode (no UI)
options.add_argument('--no-sandbox')  # Bypass OS security model
options.add_argument('--disable-dev-shm-usage')  # Overcome limited resource problems
options.add_argument('--disable-gpu')  # Disable GPU acceleration
options.add_argument("--window-size=1920,1200")

# set the path to the chromedriver
DRIVER_PATH = '/usr/local/bin/chromedriver'
service = Service(DRIVER_PATH)
# initialize chrome driver
driver = webdriver.Chrome(options=options, service=service)

WEBSITE = 'https://www.adamchoi.co.uk/overs/detailed'

date = []
home_team = []
score = []
home_away = []

# try/finally guarantees the browser is shut down even if a wait times out
# or a selector is not found.
try:
    driver.get(WEBSITE)

    wait = WebDriverWait(driver, 10)

    button_all_matches = wait.until(EC.element_to_be_clickable((By.XPATH, '//label[@analytics-event="All matches"]')))
    button_all_matches.click()

    dropdown = Select(driver.find_element(By.ID, 'country'))
    dropdown.select_by_visible_text('Spain')
    time.sleep(10)  # fixed pause after changing the country, before reading rows

    matches = wait.until(EC.presence_of_all_elements_located((By.TAG_NAME, 'tr')))

    for row in matches:
        try:
            tds = row.find_elements(By.TAG_NAME, 'td')
            if len(tds) < 4:
                print('Incomplete rows')
                continue

            # Read all four cells first so a failure part-way through a row
            # never leaves the four lists with different lengths.
            date_element = row.find_element(By.XPATH, './td[1]').text
            home_team_element = row.find_element(By.XPATH, './td[2]').text
            score_element = row.find_element(By.XPATH, './td[3]').text
            home_away_element = row.find_element(By.XPATH, './td[4]').text

            date.append(date_element)
            home_team.append(home_team_element)
            score.append(score_element)
            home_away.append(home_away_element)

        except Exception as e:
            # Include the error itself; the row repr alone does not say what failed.
            print(f'Error scraping row: {row} - {e}')
finally:
    driver.quit()

soccer_data = pd.DataFrame({'date': date,
                            'home_team': home_team,
                            'score': score,
                            'home_away': home_away})

soccer_data.to_csv('soccer_data.csv', index=False)

Incomplete rows
Incomplete rows
Incomplete rows
Incomplete rows
Incomplete rows
Incomplete rows
Incomplete rows
Incomplete rows
Incomplete rows
Incomplete rows
Incomplete rows
Incomplete rows
Incomplete rows
Incomplete rows
