<a href="https://colab.research.google.com/github/superpanditas/web-scraping-toolkit/blob/main/Oscar_Winning_Films_Data_Scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Step 0 - Getting Started

Click through a bunch of great films. Learn how content is added to the page asynchronously with JavaScript and how you can scrape it. Look for ways that you can tell visually when a site is loading content with AJAX. Then, browse through your network tab to see those AJAX requests and scrape them.

In [2]:
# @title Download Libraries
!pip install selenium==4.25.0
print('-'*15)
print('Selenium Installed')
print('-'*15)

Collecting selenium==4.25.0
  Downloading selenium-4.25.0-py3-none-any.whl.metadata (7.1 kB)
Collecting trio~=0.17 (from selenium==4.25.0)
  Downloading trio-0.27.0-py3-none-any.whl.metadata (8.6 kB)
Collecting trio-websocket~=0.9 (from selenium==4.25.0)
  Downloading trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting sortedcontainers (from trio~=0.17->selenium==4.25.0)
  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio~=0.17->selenium==4.25.0)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium==4.25.0)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Collecting h11<1,>=0.9.0 (from wsproto>=0.14->trio-websocket~=0.9->selenium==4.25.0)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading selenium-4.25.0-py3-none-any.whl (9.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0

In [3]:
# @title Download Chrome Driver and Google Chrome
'''
!google-chrome --version # version 130.0
!chromedriver --version # version 130.0
!pip show selenium # version 4.25.0
'''
# downlowd selenium
!pip install selenium==4.25.0
print('-'*30)
print('Selenium Installed')
print('-'*30)
## Download the Google-Chrome (Unix)
!wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
!dpkg -i google-chrome-stable_current_amd64.deb
!apt-get -f install
print('-'*30)
print('Google-Chrome Installed')
print('-'*30)
## ----------------------------------
## Download the ChromeDriver
!wget https://storage.googleapis.com/chrome-for-testing-public/130.0.6723.58/linux64/chromedriver-linux64.zip
!unzip -o chromedriver-linux64.zip
!mv chromedriver-linux64/chromedriver /usr/local/bin/chromedriver
!chmod +x /usr/local/bin/chromedriver
print('-'*30)
print('Chromedriver Installed')
print('-'*30)

------------------------------
Selenium Installed
------------------------------
--2024-10-25 16:18:02--  https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
Resolving dl.google.com (dl.google.com)... 142.251.179.190, 142.251.179.136, 142.251.179.93, ...
Connecting to dl.google.com (dl.google.com)|142.251.179.190|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 111353576 (106M) [application/x-debian-package]
Saving to: ‘google-chrome-stable_current_amd64.deb’


2024-10-25 16:18:03 (305 MB/s) - ‘google-chrome-stable_current_amd64.deb’ saved [111353576/111353576]

Selecting previously unselected package google-chrome-stable.
(Reading database ... 123622 files and directories currently installed.)
Preparing to unpack google-chrome-stable_current_amd64.deb ...
Unpacking google-chrome-stable (130.0.6723.69-1) ...
[1mdpkg:[0m dependency problems prevent configuration of google-chrome-stable:
 google-chrome-stable depends on libvulkan1; however

In [4]:
# @title Load Libraries
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd

# Step 1 - Oscar Winning Films Data Class
Web Page: https://www.scrapethissite.com/pages/ajax-javascript/

In [13]:
# oscarWinningFilmsScraper Class
class oscarWinningFilmsScraper:
    """Selenium scraper for the "Oscar Winning Films" AJAX demo page.

    The scraper drives a headless Chrome session, clicks every year link
    (``a.year-link``) found on the page and collects the film rows
    (``tr.film``) that are present after each click.
    """

    def __init__(self, driver_path, binary_location):
        """Store the paths and start the browser.

        Args:
            driver_path: Path to the chromedriver executable.
            binary_location: Path to the Chrome binary to launch.
        """
        self.driver_path = driver_path
        self.binary_location = binary_location
        self.driver = self._initialize_driver()

    def _initialize_driver(self):
        """Create and return a headless Chrome WebDriver."""
        options = Options()
        options.binary_location = self.binary_location
        options.add_argument('--headless')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-gpu')

        service = Service(self.driver_path)
        return webdriver.Chrome(options=options, service=service)

    def open_website(self, website):
        """Navigate to ``website`` and create ``self.wait`` (10 s WebDriverWait)."""
        self.driver.get(website)
        self.wait = WebDriverWait(self.driver, 10)

    def scrape_current_year(self, current_year):
        """Read the film rows currently shown on the page.

        Args:
            current_year: Label stored in the ``year`` column of every row.

        Returns:
            pandas.DataFrame with the columns ``film_name``, ``nominations``,
            ``awards`` and ``year``; one row per film that could be read.
            Rows that raise while being read are reported and skipped.
        """
        films = self.driver.find_elements(By.CSS_SELECTOR, 'tr.film')
        records = []

        for film in films:
            try:
                # attributes/features of each film nominated
                film_attr = film.find_elements(By.TAG_NAME, 'td')

                # Read all three cells before recording anything: a failure on
                # a later cell must not leave the columns with unequal lengths
                # (which would make the DataFrame construction below fail).
                records.append(
                    (film_attr[0].text, film_attr[1].text, film_attr[2].text)
                )
            except Exception as e:
                print(f'Exception Raised: {e!r}')

        return pd.DataFrame({
            'film_name': [record[0] for record in records],
            'nominations': [record[1] for record in records],
            'awards': [record[2] for record in records],
            'year': [current_year] * len(records)
        })

    def scrape_data_multiple_years(self, wait_seconds=15):
        """Click every year link and gather the films listed for each year.

        Args:
            wait_seconds: Pause after each click before the rows are read, so
                the page has time to load them. Defaults to 15 seconds.

        Returns:
            pandas.DataFrame with the rows of all years that were scraped
            successfully (empty if there were none). A year whose scraping
            raises is reported and skipped.
        """
        yearly_frames = []

        all_years = self.driver.find_elements(By.CSS_SELECTOR, 'a.year-link')
        for current_year in all_years:
            try:
                current_year.click()
                time.sleep(wait_seconds)

                yearly_frames.append(self.scrape_current_year(current_year.text))
            except Exception as e:
                print(f'Exception Raised: {e!r}')

        if not yearly_frames:
            return pd.DataFrame()
        # One concat at the end instead of re-copying the result per year.
        return pd.concat(yearly_frames, ignore_index=True)

    def save_data(self, file, file_name):
        """Write the DataFrame ``file`` to ``file_name`` as CSV without the index."""
        file.to_csv(file_name, index=False)
        print(f'Data saved to {file_name}')

    def close_driver(self):
        """Quit the browser session."""
        self.driver.quit()
        print('Driver was closed')


if __name__ == '__main__':
    # Locations of the chromedriver executable and the Chrome binary,
    # and the page to scrape.
    DRIVER_PATH = '/usr/local/bin/chromedriver'
    BINARY_LOCATION = '/bin/google-chrome'
    WEBSITE_URL = 'https://www.scrapethissite.com/pages/ajax-javascript/'

    # initialize class (starts the browser)
    scraper = oscarWinningFilmsScraper(DRIVER_PATH, BINARY_LOCATION)
    try:
        # opening website
        scraper.open_website(WEBSITE_URL)
        # scraping data
        print('Scraping Data ...')
        dataframe_scraped = scraper.scrape_data_multiple_years()
        # save data scraped
        scraper.save_data(dataframe_scraped, 'oscar_winning_films_data.csv')
    finally:
        # close driver even if opening, scraping or saving raised, so no
        # browser process is left running
        scraper.close_driver()

Scraping Data ...
Data saved to oscar_winning_films_data.csv
Driver was closed


In [14]:
# Reload the scraped data from disk. The relative path matches the file name
# the scraper wrote, so the cell does not depend on an absolute /content path.
oscar_winning_films = pd.read_csv('oscar_winning_films_data.csv')
oscar_winning_films.head(15)

Unnamed: 0,film_name,nominations,awards,year
0,Spotlight,6,2,2015
1,Mad Max: Fury Road,10,6,2015
2,The Revenant,12,3,2015
3,Bridge of Spies,6,1,2015
4,The Big Short,5,1,2015
5,The Danish Girl,4,1,2015
6,Room,4,1,2015
7,Ex Machina,2,1,2015
8,The Hateful Eight,2,1,2015
9,Inside Out,2,1,2015


In [15]:
# Print a summary of the reloaded data: row count, columns, non-null counts and dtypes.
oscar_winning_films.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87 entries, 0 to 86
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   film_name    87 non-null     object
 1   nominations  87 non-null     int64 
 2   awards       87 non-null     int64 
 3   year         87 non-null     int64 
dtypes: int64(3), object(1)
memory usage: 2.8+ KB
