<a href="https://colab.research.google.com/github/superpanditas/web-scraping-toolkit/blob/main/Countries_Of_The_World_Scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Step 0 - Getting Started

A simple web scraping exercise: a single page that lists information about all the countries in the world. Good for those just getting started with web scraping. Practice looking for patterns in the HTML that will allow you to extract information about each country. Then, build a simple web scraper that makes a request to this page, parses the HTML and prints out each country's name.

In [1]:
# @title Download Libraries
!pip install selenium==4.25.0

Collecting selenium==4.25.0
  Downloading selenium-4.25.0-py3-none-any.whl.metadata (7.1 kB)
Collecting trio~=0.17 (from selenium==4.25.0)
  Downloading trio-0.27.0-py3-none-any.whl.metadata (8.6 kB)
Collecting trio-websocket~=0.9 (from selenium==4.25.0)
  Downloading trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting sortedcontainers (from trio~=0.17->selenium==4.25.0)
  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio~=0.17->selenium==4.25.0)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium==4.25.0)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Collecting h11<1,>=0.9.0 (from wsproto>=0.14->trio-websocket~=0.9->selenium==4.25.0)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading selenium-4.25.0-py3-none-any.whl (9.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0

In [2]:
# @title Download Google-Chrome & Chromedriver

'''
!google-chrome --version # version 130.0
!chromedriver --version # version 130.0
!pip show selenium # version 4.25.0
'''

## Download the Google-Chrome (Unix)
!wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
!dpkg -i google-chrome-stable_current_amd64.deb
!apt-get -f install
print('Google-Chrome Installed')
## --------------------------------------------
## Download the ChromeDriver
!wget https://storage.googleapis.com/chrome-for-testing-public/130.0.6723.58/linux64/chromedriver-linux64.zip
!unzip -o chromedriver-linux64.zip
!mv chromedriver-linux64/chromedriver /usr/local/bin/chromedriver
!chmod +x /usr/local/bin/chromedriver
print('Chromedriver Installed')

--2024-10-24 01:25:27--  https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
Resolving dl.google.com (dl.google.com)... 74.125.141.93, 74.125.141.190, 74.125.141.91, ...
Connecting to dl.google.com (dl.google.com)|74.125.141.93|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 111353576 (106M) [application/x-debian-package]
Saving to: ‘google-chrome-stable_current_amd64.deb’


2024-10-24 01:25:28 (199 MB/s) - ‘google-chrome-stable_current_amd64.deb’ saved [111353576/111353576]

Selecting previously unselected package google-chrome-stable.
(Reading database ... 123622 files and directories currently installed.)
Preparing to unpack google-chrome-stable_current_amd64.deb ...
Unpacking google-chrome-stable (130.0.6723.69-1) ...
[1mdpkg:[0m dependency problems prevent configuration of google-chrome-stable:
 google-chrome-stable depends on libvulkan1; however:
  Package libvulkan1 is not installed.

[1mdpkg:[0m error processing package google

In [3]:
# @title Import Libraries
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import time
import pandas as pd

# Step 1 - Scraping Countries Of The World
Web Page: https://www.scrapethissite.com/pages/simple/

In [4]:
# @title Scraper Class
class countryDataScraper():
    """Scrape country name, capital, population and area with headless Chrome.

    The scraper drives a Selenium Chrome session, collects one record per
    country card found on the page and can write the result to CSV.

    Args:
        binary_location: Path to the Chrome executable.
        driver_path: Path to the ChromeDriver executable.
    """

    # XPath matching every country card; used both to wait for the page to be
    # populated and to collect the cards.
    COUNTRY_CARD_XPATH = "//div[contains(@class, 'col-md-4')]"

    # Output column name -> CSS class of the element holding that value
    # inside a country card. Insertion order defines the column order.
    FIELD_CLASSES = {
        'country_name': 'country-name',
        'capital': 'country-capital',
        'population': 'country-population',
        'area_km2': 'country-area',
    }

    # Maximum number of seconds to wait for the country cards to appear.
    PAGE_LOAD_TIMEOUT = 30

    def __init__(self, binary_location, driver_path):
        self.driver_path = driver_path
        self.binary_location = binary_location
        self.driver = self._initialize_driver()

    def _initialize_driver(self):
        """Create and return a headless Chrome WebDriver."""
        options = Options()
        options.binary_location = self.binary_location
        options.add_argument('--headless')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        options.add_argument('--disable-gpu')
        # Chrome switches are hyphenated: '--window_size' is not a recognised
        # switch, so the requested size was never applied.
        options.add_argument('--window-size=1920,1200')
        # chrome driver path
        service = Service(self.driver_path)
        return webdriver.Chrome(options=options, service=service)

    def open_website(self, url):
        """Navigate to ``url`` and block until the country cards are present.

        Args:
            url: Address of the page to scrape.

        Raises:
            selenium.common.exceptions.TimeoutException: If no country card
                appears within ``PAGE_LOAD_TIMEOUT`` seconds.
        """
        self.driver.get(url)
        self.wait = WebDriverWait(self.driver, self.PAGE_LOAD_TIMEOUT)
        # Explicit wait instead of a fixed sleep: returns as soon as the data
        # is there and fails loudly if it never shows up.
        self.wait.until(
            EC.presence_of_all_elements_located((By.XPATH, self.COUNTRY_CARD_XPATH))
        )

    def scraper_data(self):
        """Extract one row per country card from the currently loaded page.

        A card that lacks any of the expected fields is skipped as a whole,
        so every returned row is complete and the columns always have equal
        length.

        Returns:
            pandas.DataFrame with columns ``country_name``, ``capital``,
            ``population`` and ``area_km2`` holding the elements' text.
            The frame is empty (but keeps its columns) when no card matches.
        """
        cards = self.driver.find_elements(By.XPATH, self.COUNTRY_CARD_XPATH)
        records = []

        for card in cards:
            try:
                # Build the whole record before storing it: a failure on any
                # field must not leave a partially appended row behind.
                record = {
                    column: card.find_element(By.CLASS_NAME, css_class).text
                    for column, css_class in self.FIELD_CLASSES.items()
                }
            except Exception as exc:
                print(f'Exception Raised: {type(exc).__name__}')
                continue
            records.append(record)

        return pd.DataFrame(records, columns=list(self.FIELD_CLASSES))

    def save_csv(self, file, file_name):
        """Write the DataFrame ``file`` to ``file_name`` without the index."""
        file.to_csv(file_name, index=False)
        print(f'CSV saved as {file_name}')

    def close_driver(self):
        """Quit the browser session and release the driver."""
        self.driver.quit()

if __name__ == '__main__':
    DRIVER_PATH = '/usr/local/bin/chromedriver'
    BINARY_LOCATION = '/bin/google-chrome'
    WEBSITE_URL = 'https://www.scrapethissite.com/pages/simple/'
    OUTPUT_CSV = 'countries_data.csv'

    # initialize countries of the world data scraper class
    scraper = countryDataScraper(binary_location=BINARY_LOCATION,
                                 driver_path=DRIVER_PATH)

    # try/finally guarantees the Chrome process is shut down even when
    # loading, scraping or saving raises; otherwise every failed run leaves
    # a browser process behind in the runtime.
    try:
        # open website
        scraper.open_website(url=WEBSITE_URL)

        # scraper
        df_csv = scraper.scraper_data()

        # file saved
        scraper.save_csv(df_csv, OUTPUT_CSV)
    finally:
        # close driver
        scraper.close_driver()

CSV saved as countries_data.csv


In [5]:
#@title Output
# Read the CSV back from the same relative path it was saved to. The previous
# absolute '/content/...' path only resolves when the working directory is
# Colab's default.
country_data = pd.read_csv('countries_data.csv')
country_data.head(10)

Unnamed: 0,country_name,capital,population,area_km2
0,Andorra,Andorra la Vella,84000,468.0
1,United Arab Emirates,Abu Dhabi,4975593,82880.0
2,Afghanistan,Kabul,29121286,647500.0
3,Antigua and Barbuda,St. John's,86754,443.0
4,Anguilla,The Valley,13254,102.0
5,Albania,Tirana,2986952,28748.0
6,Armenia,Yerevan,2968000,29800.0
7,Angola,Luanda,13068161,1246700.0
8,Antarctica,,0,14000000.0
9,Argentina,Buenos Aires,41343201,2766890.0


In [6]:
# Summarize the reloaded CSV: column dtypes and non-null counts per column.
country_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   country_name  250 non-null    object 
 1   capital       242 non-null    object 
 2   population    250 non-null    int64  
 3   area_km2      250 non-null    float64
dtypes: float64(1), int64(1), object(2)
memory usage: 7.9+ KB
