# GRII YouTube Transcript Extractor

Extracts transcripts in batches. If your IP gets blocked, re-run with a fresh runtime.
Upload your previous `grii_transcripts.json` before re-running to resume where you left off.

In [None]:
!pip install -q youtube-transcript-api yt-dlp

In [None]:
import json, os, time
import yt_dlp
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api._errors import TranscriptsDisabled, NoTranscriptFound, VideoUnavailable, IpBlocked

# Load previous results if uploaded.
#
# Produces three notebook-level names used by the later cells:
#   existing_ids - video ids that already have a saved transcript
#   prev_results - transcript records from the earlier run(s)
#   prev_errors  - error strings from the earlier run(s)
# Only videos with a saved transcript end up in existing_ids, so videos that
# previously failed are attempted again on the next run.
existing_ids = set()
prev_results = []
prev_errors = []
if os.path.exists('grii_transcripts.json'):
    # The save cell writes this file as UTF-8 with ensure_ascii=False, so read
    # it back with the same explicit encoding instead of the platform default.
    with open('grii_transcripts.json', encoding='utf-8') as f:
        prev = json.load(f)
    # `or []` also covers keys that are present but null in the JSON.
    prev_results = prev.get('transcripts') or []
    prev_errors = prev.get('errors') or []
    existing_ids = {t['video_id'] for t in prev_results}
    print(f'Loaded {len(prev_results)} previous transcripts, will skip those.')
else:
    print('No previous file found, starting fresh.')

# Fetch channel video list
#
# Builds `sermons` (every channel video whose title matches none of
# SKIP_KEYWORDS) and `remaining` (the sermons without a saved transcript yet).
print('Fetching channel videos...')
ydl_opts = {'quiet': True, 'no_warnings': True, 'extract_flat': True, 'skip_download': True}
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    info = ydl.extract_info('https://www.youtube.com/@GRIIPusatSore/videos', download=False)
    # Guard against a missing result or a null 'entries' value, and materialise
    # the entries while the downloader is still open.
    entries = list((info or {}).get('entries') or [])

SKIP_KEYWORDS = ['Sermon Clips', 'Sermon Clip', 'Koor ', 'Virtual Ensemble',
                 'Virtual Choir', 'Sekolah Minggu', 'Jendela Anak', 'Perkenalan']
# Lower-case the keywords once instead of once per title.
skip_terms = [kw.lower() for kw in SKIP_KEYWORDS]

sermons = []
for e in entries:
    # Skip placeholder entries and entries without an id; they cannot be fetched.
    if not e or not e.get('id'):
        continue
    # A title key that is present but None must not crash the .lower() call.
    title = e.get('title') or ''
    lowered_title = title.lower()
    if not any(term in lowered_title for term in skip_terms):
        sermons.append({'id': e['id'], 'title': title})

remaining = [s for s in sermons if s['id'] not in existing_ids]
print(f'Total: {len(sermons)} sermons, {len(existing_ids)} already done, {len(remaining)} remaining')

In [None]:
# Extract transcripts with delays to avoid IP ban
#
# Reads `remaining`, `prev_results` and `prev_errors` from the setup cell and
# leaves `results`, `errors` and `ip_blocked` behind for the cells that follow.
from youtube_transcript_api._errors import RequestBlocked

results = list(prev_results)
errors = list(prev_errors)
ytt = YouTubeTranscriptApi()
ip_blocked = False


def _forget_old_errors(error_list, video_id):
    """Remove earlier error entries for *video_id* from *error_list* in place.

    Error entries are strings that start with '<video_id> ('. Clearing them
    once a retry reaches a definite outcome keeps the list free of duplicates
    and of errors for videos that have since succeeded.
    """
    prefix = f'{video_id} ('
    error_list[:] = [msg for msg in error_list if not msg.startswith(prefix)]


for i, sermon in enumerate(remaining):
    vid = sermon['id']
    title = sermon['title']
    print(f'[{i+1}/{len(remaining)}] {title}...', end=' ')

    try:
        # List the available transcripts once and pick from that listing,
        # instead of issuing a separate fetch attempt per language.
        tlist = ytt.list(vid)
        try:
            # Same preference order as before: Indonesian first, then English.
            track = tlist.find_transcript(['id', 'en'])
        except NoTranscriptFound:
            # Neither preferred language exists: fall back to whatever is listed first.
            available = list(tlist)
            if not available:
                raise
            track = available[0]
        transcript = track.fetch()
        detected_lang = track.language_code

        segments = [{'text': s.text, 'start': s.start, 'duration': s.duration} for s in transcript.snippets]
        full_text = ' '.join(s['text'] for s in segments)

        try:
            with yt_dlp.YoutubeDL({'quiet': True, 'skip_download': True}) as ydl:
                meta = ydl.extract_info(f'https://www.youtube.com/watch?v={vid}', download=False)
                yt_title = (meta or {}).get('title') or title
        except Exception:
            # Best-effort lookup: keep the title from the channel listing.
            yt_title = title

        _forget_old_errors(errors, vid)
        results.append({
            'video_id': vid, 'title': yt_title, 'full_text': full_text,
            'segments': segments, 'language': detected_lang,
            'source_url': f'https://www.youtube.com/watch?v={vid}',
        })
        print(f'OK ({len(segments)} segments)')

    except (RequestBlocked, IpBlocked):
        # Stop on either block-related error. Carrying on would only send more
        # requests and record every remaining video as a failure.
        print('IP BLOCKED - stopping.')
        ip_blocked = True
        break

    except Exception as e:
        # Any other failure is recorded and the loop moves on to the next video.
        _forget_old_errors(errors, vid)
        errors.append(f'{vid} ({title}): {type(e).__name__}: {str(e)[:100]}')
        print(f'FAILED: {type(e).__name__}')

    # Delay: 3s per video, extra 10s every 5 videos
    time.sleep(3)
    if (i + 1) % 5 == 0:
        time.sleep(10)

new_count = len(results) - len(prev_results)
print(f'\nExtracted {new_count} new transcripts this run. Total: {len(results)}')
if ip_blocked:
    print('\n>>> IP was blocked. Save the file, disconnect runtime, reconnect, upload file, and re-run. <<<')

In [None]:
# Save results
#
# Writes every transcript and error collected so far to grii_transcripts.json,
# then reports how many sermons are covered and how large the file is.
import os

output = {
    'transcripts': results,
    'errors': errors,
}
with open('grii_transcripts.json', 'w', encoding='utf-8') as out_file:
    json.dump(output, out_file, ensure_ascii=False, indent=2)

total, done = len(sermons), len(results)
size_mb = os.path.getsize("grii_transcripts.json") / 1024 / 1024
print(f'Saved {done}/{total} transcripts to grii_transcripts.json')
print(f'File size: {size_mb:.1f} MB')
if total > done:
    print(f'\n{total - done} remaining. Disconnect runtime, reconnect, upload this file, and re-run.')

In [None]:
# Download the file
# Passes grii_transcripts.json to Colab's `files.download` helper so the saved
# results can be kept and uploaded again on a later run.
# NOTE(review): `google.colab` is presumably only importable inside a Colab
# runtime - confirm before running this cell elsewhere.
from google.colab import files
files.download('grii_transcripts.json')