## **Installation**

Install `selenium` to:

*  scrape the YouTube channel's videos page

*  scrape the comments and likes of each YouTube video published one month ago

Install `youtube-dl` to:

*   download each YouTube video published one month ago

In [None]:
# Python bindings used below to drive a Chrome browser for the scraping steps.
!pip install selenium
!apt-get update # to update ubuntu to correctly run apt install
# Install the chromedriver package and copy its binary into /usr/bin.
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
# Download the youtube-dl executable to /usr/local/bin and make it readable and
# executable for every user.
!sudo curl -L https://yt-dl.org/downloads/latest/youtube-dl -o /usr/local/bin/youtube-dl
!sudo chmod a+rx /usr/local/bin/youtube-dl

# **import**

In [2]:
import sys
# Prepend the chromedriver location to Python's module search path before
# selenium is imported.
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')
from selenium import webdriver

# Options shared by every Chrome instance started in this notebook:
# no visible window, no sandbox, and no reliance on /dev/shm.
chrome_options = webdriver.ChromeOptions()
for chrome_flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
  chrome_options.add_argument(chrome_flag)

In [3]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common import exceptions

from bs4 import BeautifulSoup
import os
import csv
import sys
from time import sleep
import pandas as pd
import warnings
from tqdm import tqdm
from concurrent import futures
from concurrent.futures import ThreadPoolExecutor, as_completed


Define where the downloaded videos (and the exported CSV files) are stored

In [4]:
# Root folder under which this notebook writes videos.csv, comments.csv and the
# downloaded video files.
# NOTE(review): looks like a Google Drive mount on Colab — confirm the drive is
# mounted before running this cell.
path_to_videos = '/content/drive/MyDrive/Brut/'
# Create the `videos` sub-folder up front; exist_ok=True makes re-running the
# cell harmless when the folder already exists.
os.makedirs(f'{path_to_videos}/videos', exist_ok=True)

# **Scrape videos from Youtube channel**

Define some utility functions

In [5]:
def clean_views(views):
  '''Convert a raw view-count label into a number.

    Examples: "1.4K views" -> 1400.0, "1.8M views" -> 1800000.0,
    "532 views" -> 532.0, "1 view" -> 1.0, "No views" -> 0.0.

    Args:
        views (str): raw text videos views as scraped from Youtube channel videos page
                     (English formatting: "," is treated as a thousands separator)
    Returns:
        (float) videos views as a number; NaN when the label cannot be parsed,
        so that the resulting dataframe column stays numeric
  '''
  views_pp = views.strip()
  # Drop the trailing unit, singular ("1 view") as well as plural.
  for unit in (' views', ' view'):
    if views_pp.endswith(unit):
      views_pp = views_pp[:-len(unit)]
      break
  if views_pp == 'No':
    return 0.0
  views_pp = views_pp.replace(',', '')

  # Abbreviated counts carry a one-letter magnitude suffix.
  factor = 1
  for suffix, scale in (('K', 1000), ('M', 1000000), ('B', 1000000000)):
    if views_pp.endswith(suffix):
      views_pp = views_pp[:-len(suffix)]
      factor = scale
      break

  try:
    return float(views_pp)*factor
  except ValueError:
    # Not a view count at all: report "unknown" rather than aborting a
    # whole scraping run on one odd label.
    return float('nan')

def get_video_hash(video_url):
  """function that isolates {hash} from url 'https://www.youtube.com/watch?v={hash}' or
     'https://www.youtube.com/shorts/{hash}'

     Anything following the hash (extra query parameters such as '&t=10s' or
     '?feature=share', a fragment, a trailing slash) is discarded.

     Args:
         video_url (str): url of the Youtube video
     Returns
         (str): hash of the Youtube video
     Raises:
         IndexError: if the url matches neither of the two supported forms
  """
  for prefix in ('https://www.youtube.com/watch?v=',
                 'https://www.youtube.com/shorts/'):
    pieces = video_url.split(prefix)
    if len(pieces) > 1:
      video_hash = pieces[1]
      break
  else:
    # Same exception type as before (it used to come from a failed `[1]`),
    # now with a message that names the offending url.
    raise IndexError(f'not a recognised Youtube video url: {video_url!r}')

  # Keep only the id itself: cut at the first character that starts another
  # part of the url.
  for separator in ('&', '?', '#', '/'):
    video_hash = video_hash.split(separator, 1)[0]
  return video_hash

def get_thumbnail(video_url):
  """Build the thumbnail url that belongs to a Youtube video url.

     Args:
         video_url (str): url of the Youtube video
     Returns:
         (str): url to the high-quality ("hqdefault") thumbnail of the video
  """
  # The thumbnail address is derived from the video hash alone.
  return f'https://i.ytimg.com/vi/{get_video_hash(video_url)}/hqdefault.jpg'

def scrape_videos_channel(channel):
  """function that scrapes videos url from a Youtube channel.

    Args:
        channel (str): Youtube channel id, i.e. the part that follows
                       '/channel/' in the channel url
    Returns:
        (pd.DataFrame): a dataframe with following attributes for each video:
                        *  title : title of the video
                        *  link : url to the video
                        *  views : number of views (float)
                        *  published : how many days/month/year ago the video was published
                                       (None when the page does not show it)
                        *  thumbnail : url to the video thumbnail
                        The dataframe is empty when the page lists no video.
  """
  # Local import: `By` locators work on Selenium 3 and 4, whereas the
  # find_elements_by_* helpers were removed in Selenium 4.3.
  from selenium.webdriver.common.by import By

  # chromedriver is resolved from the PATH, as it was with the former
  # 'chromedriver' argument.
  driver = webdriver.Chrome(options=chrome_options)
  rows = []
  try:
    driver.get(f'https://www.youtube.com/channel/{channel}/videos')

    # Youtube loads videos lazily: keep sending a key to the last video of the
    # page (which scrolls to the bottom and forces more videos to load) until
    # the last video no longer changes, meaning there is nothing left to load.
    while True:
      loaded = driver.find_elements(By.ID, "video-title")
      if not loaded:
        break
      lastElement = loaded[-1]
      lastElement.send_keys(Keys.NULL)
      #wait while youtube loads more data
      sleep(1)
      lastElementNow = driver.find_elements(By.ID, "video-title")[-1]
      if lastElement == lastElementNow:
        break

    # Read everything out of the page while the browser is still alive.
    for video_title, video_metadata in zip(driver.find_elements(By.ID, "video-title"),
                                          driver.find_elements(By.ID, "metadata-line")):
      # First metadata line is the view count, second one the publication label.
      metadata_lines = video_metadata.text.split("\n")
      rows.append({
          'title': video_title.text,
          'link': video_title.get_attribute('href'),
          'views': metadata_lines[0],
          'published': metadata_lines[1] if len(metadata_lines) > 1 else None,
      })
  finally:
    # Always release the browser, otherwise every call leaks a Chrome process.
    driver.quit()

  # Explicit columns keep the frame well-formed even when no video was found.
  df = pd.DataFrame(rows, columns=['title', 'link', 'views', 'published'])
  df['views'] = df['views'].apply(clean_views)
  df['thumbnail'] = df['link'].apply(get_thumbnail)
  return df

Launch Brut channel videos page scraping and save the videos dataframe

In [None]:
# Scrape the videos page of the channel with this id, then persist the result
# as videos.csv (without the dataframe index) so later cells can reload it.
df_videos = scrape_videos_channel('UCSKdvgqdnj72_SLggp7BDTg')
df_videos.to_csv(f'{path_to_videos}/videos.csv', index=False)

In [6]:
# Preview the first rows of the scraped videos table.
df_videos.head()

Unnamed: 0,title,link,views,published,thumbnail
0,Le mystérieux naufrage du Titanic a-t-il enfin...,https://www.youtube.com/shorts/qdb9sLJq9X4,2600.0,3 hours ago,https://i.ytimg.com/vi/qdb9sLJq9X4/hqdefault.jpg
1,"L'histoire de Joëlle Aubron, bourgeoise devenu...",https://www.youtube.com/watch?v=K1oUbHbMjWw,21000.0,9 hours ago,https://i.ytimg.com/vi/K1oUbHbMjWw/hqdefault.jpg
2,Dépression : pendant un mois Louma filme son q...,https://www.youtube.com/watch?v=F7hhT_nvDpo,53000.0,1 day ago,https://i.ytimg.com/vi/F7hhT_nvDpo/hqdefault.jpg
3,Zelensky s’exprime sur les déclarations d’Emma...,https://www.youtube.com/shorts/qMTEMNIUC68,35000.0,2 days ago,https://i.ytimg.com/vi/qMTEMNIUC68/hqdefault.jpg
4,"Un an après ""Je ne suis pas une salope, je sui...",https://www.youtube.com/watch?v=T5-b95ugQT0,26000.0,2 days ago,https://i.ytimg.com/vi/T5-b95ugQT0/hqdefault.jpg


# **Scrape comments and likes from each 1-month-ago video**

Load the videos dataframe and keep only the videos published one month ago

In [7]:
# Reload the videos table saved as videos.csv, so this section does not depend
# on the scraping cell having run in the current session.
df_videos = pd.read_csv(f'{path_to_videos}/videos.csv')

In [8]:
# Keep only the rows whose `published` label is exactly '1 month ago'.
df_videos = df_videos.loc[df_videos['published'] == '1 month ago']

In [9]:
# (rows, columns) of the filtered table, i.e. how many videos will be scraped.
df_videos.shape

(62, 5)

Define some utility functions

In [10]:
def clean_likes(likes):
  '''Convert a raw like-count label into a number.

    Examples: "1.4K" -> 1400.0, "1.8M" -> 1800000.0, "321" -> 321.0,
    "1,234" -> 1234.0.

    Args:
        likes (str): raw text of the like counter as scraped from a video page
                     (English formatting: "," is treated as a thousands separator)
    Returns:
        (float) number of likes
    Raises:
        ValueError: if the label is not a (possibly abbreviated) number
  '''
  likes_pp = likes.strip().replace(',', '')
  # Abbreviated counts carry a one-letter magnitude suffix.
  for suffix, scale in (('K', 1000), ('M', 1000000), ('B', 1000000000)):
    if suffix in likes_pp:
      return float(likes_pp.replace(suffix, ''))*scale
  return float(likes_pp)

def _scroll_to_page_bottom(driver, pause):
    """Scroll down until the page height stops growing (all lazy content loaded)."""
    last_height = driver.execute_script("return document.documentElement.scrollHeight")
    while True:
        # Scroll down 'til "next load", then wait for it to finish.
        driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
        sleep(pause)

        # An unchanged height means nothing new was loaded.
        new_height = driver.execute_script("return document.documentElement.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    # One last scroll just in case.
    driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")


def scrape_comments_from_video(url):
    """
    Extracts the comments from the Youtube video given by the URL.
    Args:
        url (str): The URL to the Youtube video
    Returns:
        (pd.DataFrame) dataframe that gathers video comments, one row per
        comment, with columns:
            *  comment : text of the comment
            *  link : the video url (same value on every row)
            *  likes : number of likes of the video (same value on every row)
        The dataframe is empty when the expected page elements cannot be
        found (an error is printed) or when the video has no comment.
    Raises:
        ValueError: if the like counter text is not a number (see clean_likes)
    """
    # Local import: `By` locators work on Selenium 3 and 4, whereas the
    # find_element_by_* helpers were removed in Selenium 4.3.
    from selenium.webdriver.common.by import By

    sleep_time = 5
    is_short = 'shorts' in url
    # chromedriver is resolved from the PATH, as it was with the former
    # 'chromedriver' argument.
    driver = webdriver.Chrome(options=chrome_options)
    try:
        # Navigates to the URL, maximizes the current window, and
        # then suspends execution for (at least) 5 seconds (this
        # gives time for the page to load).
        driver.get(url)
        driver.maximize_window()
        sleep(sleep_time)

        try:
            if is_short:
                # On a short the comment panel must be opened explicitly.
                comments_button = driver.find_element(By.XPATH, '/html/body/ytd-app/div[1]/ytd-page-manager/ytd-shorts/div[1]/ytd-reel-video-renderer[1]/div[2]/ytd-reel-player-overlay-renderer/div[2]/div[3]/ytd-button-renderer/a/tp-yt-paper-button/yt-icon')
                driver.execute_script("arguments[0].click();", comments_button)
                sleep(sleep_time)
                likes = driver.find_element(By.XPATH, '/html/body/ytd-app/div[1]/ytd-page-manager/ytd-shorts/div[1]/ytd-reel-video-renderer[1]/div[2]/ytd-reel-player-overlay-renderer/div[2]/div[2]/ytd-like-button-renderer/ytd-toggle-button-renderer[1]/a/tp-yt-paper-button/yt-formatted-string').text
            else:
                likes = driver.find_element(By.XPATH, '/html/body/ytd-app/div[1]/ytd-page-manager/ytd-watch-flexy/div[5]/div[1]/div/div[8]/div[2]/ytd-video-primary-info-renderer/div/div/div[3]/div/ytd-menu-renderer/div[1]/ytd-toggle-button-renderer[1]/a/yt-formatted-string').text
            comment_section = driver.find_element(By.XPATH, '//*[@id="comments"]')
        except exceptions.NoSuchElementException:
            # Note: Youtube may have changed their HTML layouts for
            # videos. Report it and give up on this video: carrying on
            # would only fail later on the variables left unset above.
            error = "Error: Double check selector OR "
            error += "element may not yet be on the screen at the time of the find operation"
            print(error)
            return pd.DataFrame(columns=['comment', 'link', 'likes'])

        likes = clean_likes(likes)

        # Scroll into view the comment section, then allow some time
        # for everything to be loaded as necessary.
        driver.execute_script("arguments[0].scrollIntoView();", comment_section)
        sleep(sleep_time)

        # Scroll all the way down to the bottom in order to get all the
        # elements loaded (since Youtube dynamically loads them).
        _scroll_to_page_bottom(driver, sleep_time)

        # find_elements returns an empty list when nothing matches, so no
        # exception handling is needed here. The texts are read now, while
        # the browser is still alive.
        comment_elems = driver.find_elements(By.XPATH, '//*[@id="content-text"]')
        comment_texts = [c.text for c in comment_elems]
    finally:
        # Always release the browser, otherwise every call leaks a Chrome process.
        driver.quit()

    df_comments = pd.DataFrame({'comment':comment_texts,
                                'link':[url]*len(comment_texts),
                                'likes':[likes]*len(comment_texts)}
                              )
    return df_comments

Launch comments scraping for each video in parallel

In [None]:
# Scrape the comments of every selected video with 4 worker threads and
# collect one dataframe per video, in completion order.
comments=[]
with warnings.catch_warnings():
  # Silence warnings for the duration of the scraping.
  warnings.simplefilter("ignore")
  with tqdm(total=len(df_videos['link']), desc='scrape comments on videos') as pbar:
      with ThreadPoolExecutor(max_workers=4) as executor:
          video_urls = list(df_videos['link'])
          compute_futures = [executor.submit(scrape_comments_from_video, url)
                             for url in video_urls]
          # Remember which url each future belongs to, for error reporting.
          url_by_future = dict(zip(compute_futures, video_urls))
          for future in as_completed(compute_futures):
              try:
                  comments.append(future.result())
              except Exception as error:
                  # One broken page must not discard the comments already
                  # collected for the other videos: report it and carry on.
                  print(f'scraping failed for {url_by_future[future]}: {error!r}')
              pbar.update(1)

Concatenate the per-video comment dataframes into a single comments dataframe

In [None]:
# Stack the per-video dataframes row-wise; each frame keeps its own index,
# so index values repeat across videos.
df_comments = pd.concat(comments, axis=0)

Save the video comments dataframe

In [None]:
# Persist all comments as comments.csv, without the dataframe index.
df_comments.to_csv(f'{path_to_videos}/comments.csv', index=False)

In [16]:
# Number of scraped comments per video. A video without any scraped comment
# contributes no row and is therefore absent from these counts.
df_comments['link'].value_counts() # one video seems to have no comments at all

https://www.youtube.com/watch?v=Ik6Xl9CLQFY    745
https://www.youtube.com/watch?v=y5IBXayQtk4    542
https://www.youtube.com/watch?v=daAJ4LgLW1k    376
https://www.youtube.com/watch?v=mKHBNhbA4yg    275
https://www.youtube.com/watch?v=oq1WKm7q3Lk    170
                                              ... 
https://www.youtube.com/shorts/UJk1h90UisU       9
https://www.youtube.com/shorts/7U6zEvI42gA       9
https://www.youtube.com/shorts/JrP3XbC25J4       8
https://www.youtube.com/shorts/fK6aW_U0E4M       7
https://www.youtube.com/shorts/-hZtpDwkoo0       6
Name: link, Length: 61, dtype: int64

## **Download the 1-month-ago videos**

Load the videos dataframe and keep only the videos published one month ago

In [None]:
# Reload the videos table saved as videos.csv and keep only the rows whose
# `published` label is exactly '1 month ago'.
df_videos = pd.read_csv(f'{path_to_videos}/videos.csv')
df_videos = df_videos.loc[df_videos['published'] == '1 month ago']

In [None]:
import subprocess

# Download every selected video with youtube-dl into the `videos` folder,
# named after its video hash; youtube-dl fills in the file extension.
for video_url in df_videos['link']:
  video_hash = get_video_hash(video_url)
  # The command is passed as an argument list (no shell involved), so a
  # scraped value can never be interpreted as shell syntax.
  download = subprocess.run(['youtube-dl',
                             f'https://youtube.com/watch?v={video_hash}',
                             '-o', f'{path_to_videos}/videos/{video_hash}.%(ext)s'])
  if download.returncode != 0:
    # Report failed downloads instead of silently skipping them.
    print(f'youtube-dl failed for {video_url} (exit code {download.returncode})')