<a href="https://colab.research.google.com/github/zahrafiroz/Geo458Lab2/blob/main/youtubecrawler.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Created in January 2024
# @author:          Zahra Firoz
# @email:           zahraf@uw.edu
# @website:         https://
# @organization:    Geo 458a Lab number 2, University of Washington, Seattle
# @description:     A demo of collecting data from YouTube.


In [None]:
%%shell
# Refresh the apt package index before installing anything.
sudo apt -y update
# Command-line tools; wget is used for the two downloads below.
sudo apt install -y wget curl unzip
# Download the libu2f-udev package from the Ubuntu archive and install it.
# NOTE(review): presumably a dependency of the Chrome package installed next -- confirm.
wget http://archive.ubuntu.com/ubuntu/pool/main/libu/libu2f-host/libu2f-udev_1.1.4-1_all.deb
dpkg -i libu2f-udev_1.1.4-1_all.deb
# Download the current stable Google Chrome package and install it.
wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
dpkg -i google-chrome-stable_current_amd64.deb

# Python packages: selenium is imported by the browser-setup cell.
# NOTE(review): chromedriver_autoinstaller is installed here but is not imported anywhere in the visible cells.
pip install selenium chromedriver_autoinstaller

In [None]:
from bs4 import BeautifulSoup # Import BeautifulSoup to parse the HTML.
import time, datetime # Import time and datetime to record the time.
import pandas as pd # Import pandas to create a dataframe, and it can save the dataframe as a csv file.

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Build the Chrome options. Each switch below is passed to Chrome as a command-line argument:
#   --headless               run Chrome in headless mode
#   --no-sandbox             run Chrome in no-sandbox mode
#   --disable-dev-shm-usage  run Chrome in disable-dev-shm-usage mode
chrome_options = Options()
for chrome_flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
    chrome_options.add_argument(chrome_flag)

# Create the Chrome instance that the rest of the notebook drives through `bot`,
# handing the options to the constructor.
bot = webdriver.Chrome(options=chrome_options)

In [None]:
# The url where the data will be collected from: a YouTube search for "afghanistan".
url = "https://www.youtube.com/results?search_query=afghanistan"
# Hand the target url to the bot; the bot loads the page, and the later cells read it back through `bot`.
bot.get(url)

In [None]:
# video_urls: the urls of the videos collected so far; a video whose url is already in
#             this list is not stored again.
# results:    the retrieved video details, one entry per video.
video_urls, results = [], []

In [None]:
# Scroll down the web page a fixed number of times (the range below sets how many).
# In practice, you might want to develop a different interaction approach to load and
# view the web pages.

# JavaScript that scrolls the window to the vertical offset given by the height of the
# "content" element. Most of the time a bot scrolls to the bottom of the whole page
# instead, with a statement like:
#   window.scrollTo(0, document.body.scrollHeight);
scroll_script = 'window.scrollTo(0,  document.getElementById("content").scrollHeight);'

for scroll_round in range(5):
    # It is very important to let the bot rest for a few seconds before it resumes work.
    time.sleep(5)
    bot.execute_script(scroll_script)

In [None]:
# Build a document object model (DOM) from the raw source of the page the bot has loaded.
# The source is an html page, so 'html.parser' is the parser of choice.
page_html = bot.page_source
soup = BeautifulSoup(page_html, 'html.parser')

# Capture all the video items. To find out which tag and class names to search for,
# inspect the raw source of the page with Chrome Inspector.
video_items = soup.find_all('ytd-video-renderer', class_="style-scope ytd-item-section-renderer")

# Keep only the last 20 items, i.e. the most recently acquired entries.
videos = video_items[-20:]


In [None]:
# Walk through the captured video items, pull the details out of each one and store
# every video that has not been collected before.
for video in videos:

    # The try-except statement lets the loop carry on with the next item when one item
    # does not have the expected layout, instead of stopping the whole crawl.
    try:
        video_url = video.find("a", class_="yt-simple-endpoint style-scope ytd-video-renderer").attrs["href"]

        # The same link element carries both the channel url and the channel name.
        channel_link = video.find("a", class_="yt-simple-endpoint style-scope yt-formatted-string")
        user_url = channel_link.attrs["href"]
        username = channel_link.text

        title = video.find("yt-formatted-string", class_="style-scope ytd-video-renderer").text

        # The first metadata item holds the view count, the second one the upload age.
        metadata_items = video.find_all("span", class_="inline-metadata-item style-scope ytd-video-meta-block")
        view_num = metadata_items[0].text.replace(" views", "")
        created_at = metadata_items[1].text.replace(" ago", "")

        shortdesc = video.find("yt-formatted-string", class_="metadata-snippet-text style-scope ytd-video-renderer").text
        collected_at = datetime.datetime.now()
    except (AttributeError, IndexError, KeyError) as error:
        # AttributeError: find() returned None because an element is missing.
        # IndexError:     fewer than two metadata items were found.
        # KeyError:       a link element has no "href" attribute.
        print("skipping a video that could not be parsed:", repr(error))
        continue

    row = {'video_url': video_url,
           'user_url': user_url,
           'username': username,
           'title': title,
           'view_num': view_num,
           'created_at': created_at,
           'shortdesc': shortdesc,
           'collected_at': collected_at}

    if video_url in video_urls:
        print("this video has already been added.")
    else:
        print(row)
        # Remember the url so that the same video is not stored a second time.
        video_urls.append(video_url)
        results.append(row)

In [None]:
# Terminate the bot object. quit() ends the whole WebDriver session (every window and the
# driver process), whereas close() only closes the current window.
# NOTE(review): the search-term and location cells further down call bot.get() again; they
# need to run before this cell, or a new webdriver.Chrome instance has to be created first.
bot.quit()

In [None]:
# Store the results as a pandas dataframe: each dictionary in `results` becomes one row,
# and the dictionary keys become the column names.
df = pd.DataFrame(results)

# Notify the completion of the crawling in the console.
print("the crawling task is finished.")

In [None]:
# Save the collected data to Google Drive.
from google.colab import drive
# Mount your Drive to the Colab VM at /gdrive.
drive.mount('/gdrive')

# The file path on the mounted Drive where the output csv is stored.
output_file = '/gdrive/My Drive/videos.csv'

# Save the dataframe as a csv file; index=False leaves the row index out of the file.
df.to_csv(output_file, index=False)

In [None]:
# Download the csv from the Colab VM to your local computer.
from google.colab import files
files.download(output_file)
# NOTE(review): this message is printed as soon as download() returns; presumably the
# browser may still be saving the file at that point -- confirm.
print("the csv has been downloaded to your local computer. The program has been completed successfully.")

In [None]:
# Load the YouTube search-results page for each search term in turn.
# NOTE: this cell only loads the pages; nothing is parsed or stored here.
from urllib.parse import quote_plus

search_terms = ["Geography", "GIS", "Maps"]
for search_term in search_terms:
    # quote_plus() turns spaces into "+" and also percent-encodes characters such as
    # "&" or "#" that would otherwise break the query string.
    url = "https://www.youtube.com/results?search_query=" + quote_plus(search_term)
    # Input the targeting url to the bot, and the bot will load data from the url.
    bot.get(url)

In [None]:
# Load the YouTube search-results page for each location name in turn.
# NOTE: only the "name" of each location goes into the search; the "lat" and "lng"
# values are not used in this cell, and nothing is parsed or stored here.
from urllib.parse import quote_plus

locations = [
    {"name": "Seattle", "lat": 47.6062, "lng": -122.3321},
    {"name": "Tacoma", "lat": 47.2529, "lng": -122.4443},
    {"name": "Olympia", "lat": 47.0379, "lng": -122.9007}
]
for location in locations:
    # quote_plus() turns spaces into "+" and also percent-encodes characters such as
    # "&" or "#" that would otherwise break the query string.
    url = "https://www.youtube.com/results?search_query=" + quote_plus(location["name"])
    # Input the targeting url to the bot, and the bot will load data from the url.
    bot.get(url)
