# 01_youtube_200_transcripts.ipynb
Collect metadata for up to 200 YouTube videos (5 search queries × 40 results each) and download their English auto-generated subtitles.
Topic: Canadian far-right / extremism policy.

In [1]:
import os, yt_dlp, pandas as pd
from googleapiclient.discovery import build
from pathlib import Path

# --- Configuration ---------------------------------------------------------
# The API key is read from the environment so it never lives in the notebook.
# API_KEY is None when YOUTUBE_API_KEY is not set.
API_KEY = os.environ.get('YOUTUBE_API_KEY')

# Folder for raw downloads; created up front (parents included, no error if
# it already exists).
RAW_DIR = Path("data") / "raw"
RAW_DIR.mkdir(parents=True, exist_ok=True)

# Search phrases used to find candidate videos.
QUERIES = [
    "canada far-right policy 2024",
    "canadian violent extremism research",
    "deradicalization canada policy",
    "online radicalization youth canada",
    "extremist movements canada policy",
]

In [2]:
# Run each search phrase against the YouTube Data API v3 and collect basic
# metadata (id, title, channel, publish date, originating query) per video.
youtube = build('youtube', 'v3', developerKey=API_KEY)
videos = []
seen_ids = set()  # a video can match several queries; keep only its first hit

for query in QUERIES:
    request = youtube.search().list(q=query, part='snippet', maxResults=40, type='video')
    response = request.execute()
    # .get() so a response without an 'items' key is treated as "no results"
    for item in response.get('items', []):
        video_id = item['id']['videoId']
        if video_id in seen_ids:
            # Already recorded under an earlier query; a second row would only
            # duplicate the dataset entry (subtitle files are keyed by video id).
            continue
        seen_ids.add(video_id)
        snippet = item['snippet']
        videos.append({
            'video_id': video_id,
            'title': snippet['title'],
            'channel': snippet['channelTitle'],
            'published': snippet['publishedAt'],
            'query_used': query
        })

print(f"Found {len(videos)} videos ready for subtitle download")
# Explicit columns keep the CSV header stable even when no videos were found.
df = pd.DataFrame(
    videos,
    columns=['video_id', 'title', 'channel', 'published', 'query_used'],
)
df.to_csv("master_dataset.csv", index=False)

Found 200 videos ready for subtitle download


In [9]:
# Download English auto-generated subtitles (WebVTT) for every collected video.
# No media is downloaded; files land in RAW_DIR as "<video_id>.en.vtt".
ydl_opts = {
    'writeautomaticsub': True,   # auto-generated captions
    'skip_download': True,       # subtitles only, skip the video/audio itself
    'subtitleslangs': ['en'],
    'subtitlesformat': 'vtt',
    'quiet': False,
    'retries': 10,
    'sleep_interval': 3,
    # Through the Python API yt-dlp stops at the first failing video unless
    # told otherwise; one video without subtitles must not abort the batch.
    'ignoreerrors': True,
    'extractor_args': {'youtube': {'player_client': ['android']}},
    'outtmpl': str(RAW_DIR / '%(id)s.%(ext)s')
}

# Only request videos whose subtitle file is not on disk yet, so an
# interrupted run can be resumed by simply re-running this cell.
urls = [
    f"https://www.youtube.com/watch?v={video['video_id']}"
    for video in videos
    if not (RAW_DIR / f"{video['video_id']}.en.vtt").exists()
]

with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    if urls:
        ydl.download(urls)

print("All subtitle downloads finished!")

[youtube] Extracting URL: https://www.youtube.com/watch?v=9LHCC4TteGU
[youtube] 9LHCC4TteGU: Downloading webpage
[youtube] 9LHCC4TteGU: Downloading android player API JSON




[info] 9LHCC4TteGU: Downloading subtitles: en
[info] 9LHCC4TteGU: Downloading 1 format(s): 18
[info] Writing video subtitles to: data/raw/9LHCC4TteGU.en.vtt
[download] Destination: data/raw/9LHCC4TteGU.en.vtt
[download] 100% of    7.09KiB in 00:00:00 at 31.68KiB/s
[youtube] Extracting URL: https://www.youtube.com/watch?v=7EHVS1Su5Po
[youtube] 7EHVS1Su5Po: Downloading webpage
[youtube] 7EHVS1Su5Po: Downloading android player API JSON




[info] 7EHVS1Su5Po: Downloading subtitles: en
[info] 7EHVS1Su5Po: Downloading 1 format(s): 18
[info] Writing video subtitles to: data/raw/7EHVS1Su5Po.en.vtt
[download] Destination: data/raw/7EHVS1Su5Po.en.vtt
[download] 100% of   13.25KiB in 00:00:00 at 115.17KiB/s
[youtube] Extracting URL: https://www.youtube.com/watch?v=QThBwP2KV3k
[youtube] QThBwP2KV3k: Downloading webpage
[youtube] QThBwP2KV3k: Downloading android player API JSON




[info] QThBwP2KV3k: Downloading subtitles: en
[info] QThBwP2KV3k: Downloading 1 format(s): 18
[info] Writing video subtitles to: data/raw/QThBwP2KV3k.en.vtt
[download] Destination: data/raw/QThBwP2KV3k.en.vtt
[download] 100% of   58.71KiB in 00:00:00 at 319.43KiB/s
[youtube] Extracting URL: https://www.youtube.com/watch?v=56w54X-gLig
[youtube] 56w54X-gLig: Downloading webpage
[youtube] 56w54X-gLig: Downloading android player API JSON




[info] 56w54X-gLig: Downloading subtitles: en
[info] 56w54X-gLig: Downloading 1 format(s): 18
[info] Writing video subtitles to: data/raw/56w54X-gLig.en.vtt
[download] Destination: data/raw/56w54X-gLig.en.vtt
[download] 100% of  405.98KiB in 00:00:00 at 1.21MiB/s
[youtube] Extracting URL: https://www.youtube.com/watch?v=3XcN3DCDqFs
[youtube] 3XcN3DCDqFs: Downloading webpage
[youtube] 3XcN3DCDqFs: Downloading android player API JSON




[info] 3XcN3DCDqFs: Downloading subtitles: en
[info] 3XcN3DCDqFs: Downloading 1 format(s): 18
[info] Writing video subtitles to: data/raw/3XcN3DCDqFs.en.vtt
[download] Destination: data/raw/3XcN3DCDqFs.en.vtt
[download] 100% of    5.24KiB in 00:00:00 at 33.40KiB/s
[youtube] Extracting URL: https://www.youtube.com/watch?v=dZxvVo1pd0o
[youtube] dZxvVo1pd0o: Downloading webpage
[youtube] dZxvVo1pd0o: Downloading android player API JSON




[info] dZxvVo1pd0o: Downloading subtitles: en
[info] dZxvVo1pd0o: Downloading 1 format(s): 18
[info] Writing video subtitles to: data/raw/dZxvVo1pd0o.en.vtt
[download] Destination: data/raw/dZxvVo1pd0o.en.vtt
[download] 100% of    9.61KiB in 00:00:00 at 70.01KiB/s
[youtube] Extracting URL: https://www.youtube.com/watch?v=_m1FBOIALus
[youtube] _m1FBOIALus: Downloading webpage


KeyboardInterrupt: 

In [11]:
import glob
# Count the subtitle files actually present on disk. The pattern is derived
# from RAW_DIR so it cannot drift from the download location; sorting gives a
# stable order (glob itself returns matches in arbitrary order).
vtt_files = sorted(glob.glob(str(RAW_DIR / "*.en.vtt")))
print(f"Successfully downloaded {len(vtt_files)} English subtitle files")

Successfully downloaded 6 English subtitle files
