# 01_youtube_200_transcripts.ipynb
Scrape 200 YouTube videos + English auto-subtitles
Canadian far-right / extremism policy

In [1]:
import os, yt_dlp, pandas as pd
from googleapiclient.discovery import build
from pathlib import Path

API_KEY = os.getenv('YOUTUBE_API_KEY')
RAW_DIR = Path("data/raw")
RAW_DIR.mkdir(parents=True, exist_ok=True)

QUERIES = [
    "canada far-right policy 2024",
    "canadian violent extremism research",
    "deradicalization canada policy",
    "online radicalization youth canada",
    "extremist movements canada policy"
]

In [2]:
youtube = build('youtube', 'v3', developerKey=API_KEY)
videos = []

for query in QUERIES:
    request = youtube.search().list(q=query, part='snippet', maxResults=40, type='video')
    response = request.execute()
    for item in response['items']:
        videos.append({
            'video_id': item['id']['videoId'],
            'title': item['snippet']['title'],
            'channel': item['snippet']['channelTitle'],
            'published': item['snippet']['publishedAt'],
            'query_used': query
        })

print(f"Found {len(videos)} videos ready for subtitle download")
df = pd.DataFrame(videos)
df.to_csv("master_dataset.csv", index=False)

Found 200 videos ready for subtitle download


In [3]:
ydl_opts = {
    'writeautomaticsub': True,
    'skip_download': True,
    'subtitleslangs': ['en'],
    'subtitlesformat': 'vtt',
    'quiet': False,
    'outtmpl': str(RAW_DIR / '%(id)s.%(ext)s')
}

with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    urls = [f"https://www.youtube.com/watch?v={v}" for v in videos['video_id']]
    ydl.download(urls)

print("Subtitle download complete!")

TypeError: list indices must be integers or slices, not str

In [4]:
import glob
vtt_files = glob.glob("data/raw/*.en.vtt")
print(f"Successfully downloaded {len(vtt_files)} English subtitle files")

Successfully downloaded 0 English subtitle files
