<a href="https://colab.research.google.com/github/thecatbaron/testing/blob/master/YouTube_search_archival_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **YouTube search/archival notebook** 
Notebook for querying YouTube by topic/tags and archiving video metadata/engagement, comments, and videos/thumbnails. Retrieves data from the YouTube Data API. Requires Python 3.

*Currently this is a work in progress!*

## **Setup**

### Imports and defines


In [1]:
!pip install youtube-dl
!apt-get -qq install -y atomicparsley ffmpeg
!pip install dprint
!pip install pytchat

from googleapiclient.discovery import build
from google.colab import drive
import pandas as pd
import json
import re
import requests
import datetime
import os
import youtube_dl
import pytchat
from requests.exceptions import ConnectionError
from requests.packages.urllib3.exceptions import ProtocolError
from urllib.parse import parse_qs
from ipywidgets import IntProgress
from dprint import dprint
from __future__ import unicode_literals

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting youtube-dl
  Downloading youtube_dl-2021.12.17-py2.py3-none-any.whl (1.9 MB)
[K     |████████████████████████████████| 1.9 MB 3.6 MB/s 
[?25hInstalling collected packages: youtube-dl
Successfully installed youtube-dl-2021.12.17
Selecting previously unselected package atomicparsley.
(Reading database ... 155639 files and directories currently installed.)
Preparing to unpack .../atomicparsley_0.9.6-1_amd64.deb ...
Unpacking atomicparsley (0.9.6-1) ...
Setting up atomicparsley (0.9.6-1) ...
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting dprint
  Downloading dprint-0.1.0-py2.py3-none-any.whl (7.4 kB)
Installing collected packages: dprint
Successfully installed dprint-0.1.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytchat
  Downloading pytchat-0.5.5-p

In [4]:
def search(query_keyword, output_name, max_results=25, geocode=None, event_type=None, order="date", lang=None):
  """
    Query the YouTube Data API v3 search endpoint for videos and write the
    results to a JSON Lines file (one search result per line).

    Args:
      query_keyword: search term(s) sent as the `q` parameter.
      output_name: output file name (e.g. "kamala_biden"); ".jsonl" is appended
        unless the name already ends with it.
      max_results: number of results to request (the API allows at most 50).
      geocode: optional "lat,long,radius" string (e.g. "38.7442,-90.3054,1mi");
        it is split into the API's `location` and `locationRadius` parameters.
      event_type: optional broadcast filter: "live", "completed" or "upcoming".
      order: "date", "title" (alphabetical), "viewCount" (popularity),
        "rating" or "relevance". Defaults to "date" (most recent first).
      lang: optional language code sent as `relevanceLanguage`
        (the API documents ISO 639-1 two-letter codes).

    Raises:
      ValueError: if `geocode` is not of the form "lat,long,radius".
      requests.HTTPError: if the API answers with an error status.
  """
  url = "https://www.googleapis.com/youtube/v3/search"
  params = {
            "part": "snippet",
            "q": query_keyword,
            "maxResults": max_results,
            "type": "video",  # eventType/location filters require type=video
            "order": order,
            "key": API_key
        }
  if lang is not None:
    params["relevanceLanguage"] = lang
  if event_type is not None:
    params["eventType"] = event_type
  if geocode is not None:
    geo_parts = [part.strip() for part in geocode.split(",")]
    if len(geo_parts) != 3:
      raise ValueError('geocode must look like "lat,long,radius", got %r' % (geocode,))
    params["location"] = geo_parts[0] + "," + geo_parts[1]
    params["locationRadius"] = geo_parts[2]

  resp = requests.get(url, params=params)
  # Fail loudly on quota/auth errors instead of writing an empty or bogus file
  resp.raise_for_status()

  if output_name.endswith('.jsonl'):
    output_name_jsonl = output_name
  else:
    output_name_jsonl = output_name + '.jsonl'

  with open(output_name_jsonl, 'w', encoding='utf-8') as outfile:
    for video in resp.json().get("items", []):
      # each item is already a dict, so serialise it exactly once per line
      outfile.write(json.dumps(video) + '\n')



def get_df(youtube_jsonl):
  """
    Pass in youtube_jsonl (e.g. 'output.jsonl') and get a dataframe of 8 columns:
    'id.kind': string, The type of the API resource (e.g. "video", "channel")
    'id.videoId': string, The unique video ID
    'snippet.publishedAt': datetime (timezone-naive, UTC), The creation date and time of the resource
    'snippet.channelId': string, The unique channel ID
    'snippet.title': string, The title of the search result
    'snippet.description': string, A description of the search result.
    'snippet.thumbnails.default.url': string, URL of the default thumbnail image
    'snippet.channelTitle': string, The title of the channel that published the resource that the search result identifies

    Blank lines in the file are skipped. A file without any records yields an
    empty dataframe that still has the 8 columns.
  """
  columns = ['id.kind', 'id.videoId', 'snippet.publishedAt', 'snippet.channelId',
             'snippet.title', 'snippet.description',
             'snippet.thumbnails.default.url', 'snippet.channelTitle']

  # context manager so the file handle is closed deterministically
  with open(youtube_jsonl) as infile:
    records = [json.loads(line) for line in infile if line.strip()]
  if not records:
    return pd.DataFrame(columns=columns)

  # only keep interesting columns; .copy() so the assignments below work on an
  # independent frame rather than on a slice of the normalized data
  df = pd.json_normalize(records)[columns].copy()

  # clean data
  df['id.kind'] = df['id.kind'].str.replace('youtube#', '', regex=False)
  # parse the ISO-8601 timestamps as UTC, then drop the tz to keep datetime64[ns]
  df['snippet.publishedAt'] = pd.to_datetime(df['snippet.publishedAt'], utc=True).dt.tz_localize(None)
  return df



def is_yt_video_live(video_link_or_id):
  """
    Pass in Youtube video url or video ID string and get True if video is live, False if not.
    Also returns False when no metadata could be extracted for the video.
  """
  # Passing "is_live": True in the options has no effect; the flag has to be
  # read from the extracted metadata instead.
  # Only the `youtube_dl` module is imported by this notebook, so the class is
  # referenced through the module.
  with youtube_dl.YoutubeDL({'ignoreerrors': True, "quiet": True}) as ydl:
    info_dict = ydl.extract_info(video_link_or_id, download=False)
  # with 'ignoreerrors' set, a failed extraction yields None instead of raising
  if not info_dict:
    return False
  # 'is_live' is None/absent when the video is not live, True when it is
  return bool(info_dict.get('is_live'))
    

def download_video(list_of_video_links, include_thumbnail_download=True):
  """
    Pass in a list of Youtube URLs (a single URL string is also accepted) and
    include_thumbnail_download bool (default True) to download
    thumbnails along with videos, or to download videos only.

    When thumbnails are requested they are embedded into the video file and
    the thumbnail image is kept on disk next to it.
    Download errors are reported but not raised. Returns None.
  """
  # A bare string would otherwise be iterated character by character
  if isinstance(list_of_video_links, str):
    list_of_video_links = [list_of_video_links]

  postprocessors = [
      # add this entry to download audio only
      # {'key': 'FFmpegExtractAudio', 'preferredcodec': 'mp3'},
  ]
  if include_thumbnail_download:
    # Only embed when a thumbnail is actually written. 'already_have_thumbnail'
    # tells the postprocessor not to delete the image file after embedding.
    postprocessors.append({'key': 'EmbedThumbnail', 'already_have_thumbnail': True})
  postprocessors.append({'key': 'FFmpegMetadata'})

  ydl_opts = {
    'writethumbnail': include_thumbnail_download,
    'postprocessors': postprocessors,
    'retries': 4,
    'ignoreerrors': True,
    'format': 'mp4',
    'subtitleslangs': ['en'],
    'writeautomaticsub': True,
    'convertsubtitles': 'srt',
    'restrictfilenames': True
  }
  try:
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
      ydl.download(list_of_video_links)
  except youtube_dl.utils.DownloadError as e:
    dprint(e)
  return


def download(link):
  """
    Python equivalent of a given wget command,
    best applied to download thumbnail urls.

    The link is handed to wget as a single argument without going through a
    shell, so characters such as '&' or ';' in the URL are passed verbatim.
    A failed download does not raise. Returns None.
  """
  import subprocess  # local import: not part of the notebook's import cell

  try:
    # '--' ends option parsing so a link starting with '-' is not read as a flag
    subprocess.run(['wget', '--', link], check=False)
  except FileNotFoundError:
    # keep the best-effort behaviour of the shell call when wget is not installed
    print('wget is not available; could not download %s' % link)
  return

def get_more_video_info(video_id):
  """
    Given a video ID, returns a dictionary that contains the counters the
    API reports under `statistics` (e.g. views, likes, favorites, comments)
    plus:
    - 'cateogoryId': the category ID (key spelling kept as-is so existing
      callers/columns keep working), None if absent
    - 'tags': the list of tags/hashtags, [] if the video has none

    Returns an empty dict when the API returns no item for the ID
    (e.g. the video was deleted or is private).
  """
  url = "https://www.googleapis.com/youtube/v3/videos"
  params = {
            "part": "snippet,statistics",
            "id": video_id,
            "key": API_key
        }
  r = requests.get(url, params=params)
  # parse the body once and tolerate a missing/empty "items" list
  items = r.json().get('items') or []
  if not items:
    return {}
  item = items[0]
  snippet = item.get('snippet', {})
  video_dict = dict(item.get('statistics', {}))
  video_dict['cateogoryId'] = snippet.get('categoryId')
  # 'tags' is omitted from the response for videos without tags
  video_dict['tags'] = snippet.get('tags', [])
  return video_dict


def merge_df_with_more_video_info(df):
  """
    Looks up get_more_video_info for every value in df['id.videoId'],
    places the resulting columns next to the columns of df,
    and returns the final df. The input df is not modified.
  """
  extra_rows = [get_more_video_info(video_id) for video_id in df['id.videoId']]
  # Reuse df's index: concat(axis=1) aligns on index labels, so a filtered or
  # re-indexed df would otherwise be paired with the wrong rows (or NaNs).
  extra_df = pd.DataFrame(extra_rows, index=df.index)
  return pd.concat([df, extra_df], axis=1)


def get_yt_livestream_comments(video_url_or_id, output_name="output"):
  """
     Given a livestream video ID or URL to livestream video (where the livestream chat is
     currently active, or has ended BUT the Live Chat Replay is enabled),
     writes .jsonl file of livestream comments (one comment per line).
     ".jsonl" is appended to output_name unless it already ends with it.
     Example use:
     >>> url = "https://www.youtube.com/watch?v=fJKBM6WGR7s"
     >>> output_name = "youtube_video_election" # your file will be saved as "youtube_video_election.jsonl"
     >>> get_yt_livestream_comments(url, output_name)
  """
  # raw string: '\?', '\&' and '\w' are invalid escapes in a normal string literal
  match = re.search(r'((?<=(v|V)/)|(?<=be/)|(?<=(\?|\&)v=)|(?<=embed/))([\w-]+)', video_url_or_id)
  if match: # Extract video ID from YouTube link
    video_url_or_id = match.group(0)
  if output_name.endswith('.jsonl'):
    output_name_jsonl = output_name
  else:
    output_name_jsonl = output_name + '.jsonl'
  chat = pytchat.create(video_url_or_id)
  # chat messages routinely contain emoji / non-ASCII text, so pin the encoding
  with open(output_name_jsonl, 'w', encoding='utf-8') as outfile:
    while chat.is_alive():
      for c in chat.get().items:
        outfile.write(c.json() + '\n')

### YouTube keys

In [5]:
import os
from getpass import getpass

# Never store a real API key in the notebook: it ends up in version control and
# in shared copies. Any key that was ever committed should be revoked and rotated.
# The key is read from the YOUTUBE_API_KEY environment variable when set;
# otherwise you are prompted for it (the input is not echoed or saved).
API_key = os.environ.get("YOUTUBE_API_KEY") or getpass("YouTube Data API key: ")

# Video download

In [6]:
# Download a single video (and its thumbnail, by default) into the working directory.
url = "https://www.youtube.com/watch?v=y8Kyi0WNg40" # insert your link here
download_video([url]) # takes a list of URL strings, so several links can be passed in one call

[youtube] y8Kyi0WNg40: Downloading webpage
[youtube] y8Kyi0WNg40: Downloading player 0e7373c2
[youtube] y8Kyi0WNg40: Downloading thumbnail ...
[youtube] y8Kyi0WNg40: Writing thumbnail to: Dramatic_Look-y8Kyi0WNg40.jpg
[download] Destination: Dramatic_Look-y8Kyi0WNg40.mp4
[download] 100% of 194.28KiB in 00:03
[ffmpeg] Correcting extension to webp and escaping path for thumbnail "Dramatic_Look-y8Kyi0WNg40.jpg"
[ffmpeg] Converting thumbnail "Dramatic_Look-y8Kyi0WNg40.webp" to JPEG
[atomicparsley] Adding thumbnail to "Dramatic_Look-y8Kyi0WNg40.mp4"
[ffmpeg] Adding metadata to 'Dramatic_Look-y8Kyi0WNg40.mp4'


# YouTube Live Comments

In [None]:
# Archive the live chat of a stream; the call keeps running while the chat is alive.
url = "https://www.youtube.com/watch?v=fJKBM6WGR7s"
output_name = "youtube_video_election" # your file will be saved as "youtube_video_election.jsonl"
get_yt_livestream_comments(url, output_name)

# Example Uses

Let's say you want to get information about videos that match the keyword "election" and are currently "live" on YouTube.

In [None]:
# pass the bare output name: `search` adds the ".jsonl" extension itself
search("election", "output", event_type="live") # this writes the results to output.jsonl
# to view your json inline as a df
yt_results = get_df("output.jsonl")
yt_results

Great, now you want to download the videos along with their thumbnails.

In [None]:
# to form the video url you have to prefix the video id with the youtube domain
# download_video returns None, so build the list of links first and then pass it in
video_links = ("https://www.youtube.com/watch?v=" + yt_results["id.videoId"].astype(str)).tolist()
download_video(video_links)

If you only wanted to download 1 video, without the thumbnail, it would look like: 

In [None]:
download_video([video_links[0]], False) # first video only; False means its thumbnail is NOT downloaded

# **Storage**

In [None]:
# Mount Google Drive so the archived files can be copied out of the Colab VM
drive.mount('/content/drive')

# The path to copy our data to
OUT_PATH = '/content/drive/Shared drives/Election Integrity Partnership/Raw Data/YouTube/'

# Set this to the current ticket
ticket = "EIP-xxx"

# Create the per-ticket folder, then copy every artifact type from the working directory.
# For the media copies, stderr is discarded so a missing file type does not print an error.
!mkdir -p "{OUT_PATH}/{ticket}"
!cp *.jsonl "{OUT_PATH}/{ticket}/"
!cp *.jpg "{OUT_PATH}/{ticket}/" 2>/dev/null
!cp *.mp3 "{OUT_PATH}/{ticket}/" 2>/dev/null
!cp *.mp4 "{OUT_PATH}/{ticket}/" 2>/dev/null
!cp *.mkv "{OUT_PATH}/{ticket}/" 2>/dev/null