<a href="https://colab.research.google.com/github/superpanditas/web-scraping-toolkit/blob/main/Hockey_Teams_Data_Scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Step 0 - Getting Started

Browse through a database of NHL team stats since 1990. Practice building a scraper that handles common website interface components. Take a look at how pagination and search elements change the URL as you browse. Build a web scraper that can conduct searches and paginate through the results.

In [None]:
# @title Download Libraries
!pip install selenium==4.25.0
print('-'*15)
print('Selenium Installed')
print('-'*15)

Collecting selenium==4.25.0
  Downloading selenium-4.25.0-py3-none-any.whl.metadata (7.1 kB)
Collecting trio~=0.17 (from selenium==4.25.0)
  Downloading trio-0.27.0-py3-none-any.whl.metadata (8.6 kB)
Collecting trio-websocket~=0.9 (from selenium==4.25.0)
  Downloading trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting sortedcontainers (from trio~=0.17->selenium==4.25.0)
  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio~=0.17->selenium==4.25.0)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium==4.25.0)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Collecting h11<1,>=0.9.0 (from wsproto>=0.14->trio-websocket~=0.9->selenium==4.25.0)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading selenium-4.25.0-py3-none-any.whl (9.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0

In [None]:
# @title Download Chrome Driver and Google Chrome
'''
!google-chrome --version # version 130.0
!chromedriver --version # version 130.0
!pip show selenium # version 4.25.0
'''
# downlowd selenium
!pip install selenium==4.25.0
print('-'*30)
print('Selenium Installed')
print('-'*30)
## Download the Google-Chrome (Unix)
!wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
!dpkg -i google-chrome-stable_current_amd64.deb
!apt-get -f install
print('-'*30)
print('Google-Chrome Installed')
print('-'*30)
## ----------------------------------
## Download the ChromeDriver
!wget https://storage.googleapis.com/chrome-for-testing-public/130.0.6723.58/linux64/chromedriver-linux64.zip
!unzip -o chromedriver-linux64.zip
!mv chromedriver-linux64/chromedriver /usr/local/bin/chromedriver
!chmod +x /usr/local/bin/chromedriver
print('-'*30)
print('Chromedriver Installed')
print('-'*30)

------------------------------
Selenium Installed
------------------------------
--2024-10-24 18:21:37--  https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
Resolving dl.google.com (dl.google.com)... 74.125.202.93, 74.125.202.136, 74.125.202.91, ...
Connecting to dl.google.com (dl.google.com)|74.125.202.93|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 111353576 (106M) [application/x-debian-package]
Saving to: ‘google-chrome-stable_current_amd64.deb’


2024-10-24 18:21:37 (275 MB/s) - ‘google-chrome-stable_current_amd64.deb’ saved [111353576/111353576]

Selecting previously unselected package google-chrome-stable.
(Reading database ... 123622 files and directories currently installed.)
Preparing to unpack google-chrome-stable_current_amd64.deb ...
Unpacking google-chrome-stable (130.0.6723.69-1) ...
[1mdpkg:[0m dependency problems prevent configuration of google-chrome-stable:
 google-chrome-stable depends on libvulkan1; however:
  Pa

In [None]:
# @title Import Libraries
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import pandas as pd
import time

# Step 1 - Hockey Teams Data Scraper Class


In [10]:

class hockeyDataScraper:
    """Selenium scraper for a paginated HTML table of hockey team statistics.

    Typical usage::

        scraper = hockeyDataScraper(driver_path, binary_location)
        try:
            scraper.open_website(url)
            scraper.rows_displayed_per_page()
            data = scraper.scrape_data_multiple_pages()
            scraper.save_data(data, 'teams.csv')
        finally:
            scraper.close_driver()

    The instance also works as a context manager (``with hockeyDataScraper(...)
    as scraper:``), which quits the browser when the block exits.
    """

    # Output column name -> index of the <td> cell inside a ``tr.team`` row.
    # Cell index 4 is not collected (same column selection as the original code).
    _CELL_INDEX_BY_COLUMN = (
        ('team', 0),
        ('year', 1),
        ('wins', 2),
        ('losses', 3),
        ('pct_success', 5),
        ('gf', 6),
        ('ga', 7),
        ('diff_success', 8),
    )

    def __init__(self, driver_path, binary_location, page_load_delay=15):
        """Store the configuration and start the browser.

        Args:
            driver_path: Path to the chromedriver executable.
            binary_location: Path to the Chrome binary.
            page_load_delay: Seconds to sleep after an action that changes the
                page (dropdown selection, pagination click). Defaults to the
                15 seconds the scraper has always used.
        """
        self.driver_path = driver_path
        self.binary_location = binary_location
        self.page_load_delay = page_load_delay
        self.driver = self._initialize_driver()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Returning None lets any exception from the with-block propagate.
        self.close_driver()

    def _initialize_driver(self):
        """Create a headless Chrome WebDriver from the stored paths."""
        options = Options()
        options.binary_location = self.binary_location
        for flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage', '--disable-gpu'):
            options.add_argument(flag)

        service = Service(self.driver_path)
        return webdriver.Chrome(service=service, options=options)

    def open_website(self, website):
        """Load ``website`` and keep a 10-second ``WebDriverWait`` on ``self.wait``."""
        self.driver.get(website)
        self.wait = WebDriverWait(self.driver, 10)

    # num of rows displayed by page [25, 50, 100]
    def rows_displayed_per_page(self, rows='100'):
        """Pick an entry of the ``per_page`` dropdown by its visible text.

        Args:
            rows: Visible text of the option to select; non-string values
                (e.g. ``100``) are converted with ``str``.
        """
        dropdown_teams_per_page = Select(self.driver.find_element(By.ID, 'per_page'))
        dropdown_teams_per_page.select_by_visible_text(str(rows))
        # Presumably the selection reloads the table; give it time to settle.
        time.sleep(self.page_load_delay)

    # scrape data
    def scrape_current_page(self, current_page):
        """Read every ``tr.team`` row of the page currently shown.

        Each row is collected as a whole: if any of its cells cannot be read
        the row is reported and skipped, so the columns can never end up with
        different lengths.

        Args:
            current_page: Page number recorded (as a string) in the ``page``
                column of every returned row.

        Returns:
            pandas.DataFrame with columns team, year, wins, losses,
            pct_success, gf, ga, diff_success, page (all cell texts as
            strings). Empty, but with these columns, when no row was read.
        """
        page_label = str(current_page)
        records = []
        for team in self.driver.find_elements(By.CSS_SELECTOR, 'tr.team'):
            try:
                cells = team.find_elements(By.TAG_NAME, 'td')
                record = {
                    column: cells[index].text
                    for column, index in self._CELL_INDEX_BY_COLUMN
                }
            except Exception as e:
                print(f'Exception Raised: {e!r}')
                continue
            record['page'] = page_label
            records.append(record)

        columns = [column for column, _ in self._CELL_INDEX_BY_COLUMN] + ['page']
        return pd.DataFrame(records, columns=columns)

    def _last_page_number(self):
        """Return the highest numeric label in ``ul.pagination``, or None if there is none."""
        pagination = self.driver.find_element(By.CSS_SELECTOR, 'ul.pagination')
        labels = [link.text.strip() for link in pagination.find_elements(By.TAG_NAME, 'a')]
        numbers = [int(label) for label in labels if label.isdecimal()]
        return max(numbers) if numbers else None

    def scrape_data_multiple_pages(self, max_attempts=3):
        """Visit every page listed in the pagination bar and scrape it.

        A page whose link cannot be found/clicked, or whose scrape raises, is
        retried up to ``max_attempts`` times and then skipped, so a persistent
        failure can no longer stall the run forever.

        Args:
            max_attempts: Maximum tries per page before it is skipped.

        Returns:
            pandas.DataFrame with the rows of all successfully scraped pages
            (empty when nothing could be scraped).

        Raises:
            Whatever ``find_element`` raises when the page has no
            ``ul.pagination`` element.
        """
        end_page = self._last_page_number()
        if end_page is None:
            # No numbered links to follow: scrape what is currently displayed.
            return self.scrape_current_page(1)

        frames = []
        for current_page in range(1, end_page + 1):
            for attempt in range(1, max_attempts + 1):
                try:
                    next_page = self.driver.find_element(By.PARTIAL_LINK_TEXT, str(current_page))
                    next_page.click()
                    time.sleep(self.page_load_delay)
                    frames.append(self.scrape_current_page(current_page))
                except Exception as e:
                    print(f'Exception Raised on page {current_page} '
                          f'(attempt {attempt}/{max_attempts}): {e!r}')
                else:
                    break
            else:
                print(f'Skipping page {current_page} after {max_attempts} failed attempts')

        if not frames:
            return pd.DataFrame()
        # Concatenate once instead of growing a frame inside the loop.
        return pd.concat(frames, ignore_index=True)

    def save_data(self, dataframe, file_name):
        """Write ``dataframe`` to ``file_name`` as CSV without the index column."""
        dataframe.to_csv(file_name, index=False)
        print(f'Data saved to {file_name}')

    def close_driver(self):
        """Quit the WebDriver session."""
        self.driver.quit()

if __name__=='__main__':
    DRIVER_PATH = '/usr/local/bin/chromedriver'
    BINARY_LOCATION = '/bin/google-chrome'
    WEBSITE_URL = 'https://www.scrapethissite.com/pages/forms/'
    OUTPUT_FILE = 'hockey_data_scraped.csv'

    # Start the browser, then run the scrape: open the site, set the page
    # size (default '100'), walk the pagination, and write the CSV.
    scraper = hockeyDataScraper(DRIVER_PATH, BINARY_LOCATION)
    try:
        scraper.open_website(WEBSITE_URL)
        scraper.rows_displayed_per_page()
        dataframe_scraped = scraper.scrape_data_multiple_pages()
        scraper.save_data(dataframe_scraped, OUTPUT_FILE)
    finally:
        # Always quit the browser, even when a step above raises, so no
        # Chrome/chromedriver process is left running.
        scraper.close_driver()


Data saved to hockey_data_scraped.csv


In [11]:
# Load the CSV produced by the scraper. The relative path matches the file name
# the scraper saved to, so this works from any working directory (not only
# Colab's /content).
hockey_data = pd.read_csv('hockey_data_scraped.csv')
hockey_data.head(10)

Unnamed: 0,team,year,wins,losses,pct_success,gf,ga,diff_success,page
0,Boston Bruins,1990,44,24,0.55,299,264,35,1
1,Buffalo Sabres,1990,31,30,0.388,292,278,14,1
2,Calgary Flames,1990,46,26,0.575,344,263,81,1
3,Chicago Blackhawks,1990,49,23,0.613,284,211,73,1
4,Detroit Red Wings,1990,34,38,0.425,273,298,-25,1
5,Edmonton Oilers,1990,37,37,0.463,272,272,0,1
6,Hartford Whalers,1990,31,38,0.388,238,276,-38,1
7,Los Angeles Kings,1990,46,24,0.575,340,254,86,1
8,Minnesota North Stars,1990,27,39,0.338,256,266,-10,1
9,Montreal Canadiens,1990,39,30,0.487,273,249,24,1


In [12]:
# Print a summary of the loaded frame: index range, columns, non-null counts,
# dtypes and memory usage.
hockey_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 582 entries, 0 to 581
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   team          582 non-null    object 
 1   year          582 non-null    int64  
 2   wins          582 non-null    int64  
 3   losses        582 non-null    int64  
 4   pct_success   582 non-null    float64
 5   gf            582 non-null    int64  
 6   ga            582 non-null    int64  
 7   diff_success  582 non-null    int64  
 8   page          582 non-null    int64  
dtypes: float64(1), int64(7), object(1)
memory usage: 41.0+ KB
