In [24]:
from youtube_transcript_api import YouTubeTranscriptApi
from elasticsearch import Elasticsearch

## Youtube video Transcription API

In [25]:
video_id = 'rMq63r3zi4c' 
ytt_api  = YouTubeTranscriptApi()

In [26]:
transcript = ytt_api.fetch(video_id)

In [27]:
transcript[11]

FetchedTranscriptSnippet(text='think I should already update this', start=26.84, duration=4.2)

In [28]:
def format_timestamp(seconds: float) -> str:
    total_seconds = int(seconds)
    hours, remainder = divmod(total_seconds, 3600)
    minutes, secs = divmod(remainder, 60)

    if hours == 0:
        return f"{minutes}:{secs:02}"
    return f"{hours}:{minutes:02}:{secs:02}"

def make_subtitles(transcript) -> str:
    lines = []

    for entry in transcript:
        ts = format_timestamp(entry.start)
        text = entry.text.replace('\n', ' ')
        lines.append(ts + ' ' + text)

    return '\n'.join(lines)

In [29]:
subtitles = make_subtitles(transcript)
print(subtitles[:500])

0:00 hi everyone Welcome to our event this
0:03 event is brought to you by data do club
0:05 which is a community of people who love
0:07 data we have weekly events well lately
0:10 they're not so weekly but we're getting
0:13 back on track and this week we will
0:15 actually have two events anyways so if
0:18 you want to find out about all the
0:20 events we have in our pipeline which is
0:22 currently I think two there's a link in
0:24 the description go check it out and I
0:26 think I should 


## Alternate solution for transcripts (local data/ folder)

In [11]:
import requests

def fetch_transcript_cached(video_id):
    url_prefix = 'https://raw.githubusercontent.com/alexeygrigorev/workshops/refs/heads/main/temporal.io/data'
    url = f'{url_prefix}/{video_id}.txt'
    
    raw_text = requests.get(url).content.decode('utf8')
    
    lines = raw_text.split('\n')
    
    video_title = lines[0]
    subtitles = '\n'.join(lines[2:]).strip()
    
    return {
        "video_id": video_id,
        "title": video_title,
        "subtitles": subtitles
    }

In [13]:
fetch_transcript_cached('-Gj7SaI-QW4')

{'video_id': '-Gj7SaI-QW4',
 'title': 'Lessons Learned from Freelancing and Working in a Start-up',
 'subtitles': "0:00 everyone Welcome to our event this event\n0:02 is brought to you by datadox club which\n0:04 is a community of people who love data\n0:05 we have weekly events and today is one\n0:07 of such events if you want to find out\n0:09 more about the events we have there is a\n0:11 link in the description go there check\n0:13 it out and see what you like do not\n0:16 forget to subscribe to our YouTube\n0:17 channel this way you will get notified\n0:20 about amazing live streams like we have\n0:23 today and we have a very cool slack\n0:26 Community where you can hang out with\n0:27 other data enthusiasts and one of the\n0:31 things we have in slack is an amazing\n0:33 initiative Adonis is one of those who\n0:37 organize a lot of things there called\n0:39 project of the week and we're just\n0:41 finishing wrapping up\n0:43 a week about learning rust so check it\n0:46 out it's a

### Proxy for YouTube

In [36]:
import os
from youtube_transcript_api.proxies import GenericProxyConfig

proxy_user = os.environ['PROXY_USER']
proxy_password = os.environ['PROXY_PASSWORD']
proxy_base_url = os.environ['PROXY_BASE_URL']

proxy_url = f'http://{proxy_user}:{proxy_password}@{proxy_base_url}'

proxy = GenericProxyConfig(
    http_url=proxy_url,
    https_url=proxy_url,
)

def fetch_transcript(video_id):
    ytt_api = YouTubeTranscriptApi(proxy_config=proxy)
    transcript = ytt_api.fetch(video_id)
    return transcript

KeyError: 'PROXY_USER'

## Elastic Search 

In [10]:
es = Elasticsearch("http://localhost:9200")

In [14]:
stopwords = [
    "a","about","above","after","again","against","all","am","an","and","any",
    "are","aren","aren't","as","at","be","because","been","before","being",
    "below","between","both","but","by","can","can","can't","cannot","could",
    "couldn't","did","didn't","do","does","doesn't","doing","don't","down",
    "during","each","few","for","from","further","had","hadn't","has","hasn't",
    "have","haven't","having","he","he'd","he'll","he's","her","here","here's",
    "hers","herself","him","himself","his","how","how's","i","i'd","i'll",
    "i'm","i've","if","in","into","is","isn't","it","it's","its","itself",
    "let's","me","more","most","mustn't","my","myself","no","nor","not","of",
    "off","on","once","only","or","other","ought","our","ours","ourselves",
    "out","over","own","same","shan't","she","she'd","she'll","she's","should",
    "shouldn't","so","some","such","than","that","that's","the","their",
    "theirs","them","themselves","then","there","there's","these","they",
    "they'd","they'll","they're","they've","this","those","through","to",
    "too","under","until","up","very","was","wasn't","we","we'd","we'll",
    "we're","we've","were","weren't","what","what's","when","when's","where",
    "where's","which","while","who","who's","whom","why","why's","with",
    "won't","would","wouldn't","you","you'd","you'll","you're","you've",
    "your","yours","yourself","yourselves",
    "get"
]

index_settings = {
    "settings": {
        "analysis": {
            "filter": {
                "english_stop": {
                    "type": "stop",
                    "stopwords": stopwords
                },
                "english_stemmer": {
                    "type": "stemmer",
                    "language": "english"
                },
                "english_possessive_stemmer": {
                    "type": "stemmer",
                    "language": "possessive_english"
                }
            },
            "analyzer": {
                "english_with_stop_and_stem": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": [
                        "lowercase",
                        "english_possessive_stemmer",
                        "english_stop",
                        "english_stemmer"
                    ]
                }
            }
        }
    },
    "mappings": {
        "properties": {
            "title": {
                "type": "text",
                "analyzer": "english_with_stop_and_stem",
                "search_analyzer": "english_with_stop_and_stem"
            },
            "subtitles": {
                "type": "text",
                "analyzer": "english_with_stop_and_stem",
                "search_analyzer": "english_with_stop_and_stem"
            }
        }
    }
}

index_name = "podcasts"
    
es.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'podcasts'})

In [15]:
doc = {
    "video_id": video_id,
    "title": "Reinventing a Career in Tech",
    "subtitles": subtitles
}

es.index(index="podcasts", id=video_id, document=doc)

ObjectApiResponse({'_index': 'podcasts', '_id': 'rMq63r3zi4c', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 0, '_primary_term': 1})

## Search ElasticSearch

In [37]:
def search_videos(query: str, size: int = 5):
    body = {
        "size": size,
        "query": {
            "multi_match": {
                "query": query,
                "fields": ["title^3", "subtitles"],
                "type": "best_fields",
                "analyzer": "english_with_stop_and_stem"
            }
        },
        "highlight": {
            "pre_tags": ["*"],
            "post_tags": ["*"],
            "fields": {
                "title": {
                    "fragment_size": 150,
                    "number_of_fragments": 1
                },
                "subtitles": {
                    "fragment_size": 150,
                    "number_of_fragments": 1
                }
            }
        }
    }
    
    response = es.search(index="podcasts", body=body)
    hits = response.body['hits']['hits']
    
    results = []
    for hit in hits:
        highlight = hit['highlight']
        highlight['video_id'] = hit['_id']
        results.append(highlight)

    return results

In [40]:
results = search_videos("Agents")
results

[{'subtitles': ['AI and *agents*\n44:19 OpenAI *agents* SDK.'],
  'title': ['Building reliable AI products in the era of Gen AI and *Agents*'],
  'video_id': 'x2AAjqz2XmM'},
 {'subtitles': ['*agent*.'], 'video_id': 'DSxqUlumM3A'},
 {'subtitles': ['Okay.\n37:11 >> The the other thing so I think proactive\n37:13 *agents* and I think multiplayer *agents* I\n37:15 mentioned um AI in in Slack and Discord\n37:'],
  'video_id': 'eC3RNuI6ow0'},
 {'subtitles': ["Corp can do\n57:15 it.\n57:16 >> But yeah, I asked recently um a tool\n57:21 like Corser, it's a coding *agent*."],
  'video_id': 'vK_SxyqIfwk'},
 {'subtitles': ["now what the data is there how the what\n15:31 the skills of the people are you don't\n15:33 know all of this we can say for the\n15:35 recommendation *agent*"],
  'video_id': 'jGbfeYdlCiQ'}]

## Get podcast videso and process for video_id

In [30]:
import requests
import yaml

events_url = 'https://raw.githubusercontent.com/DataTalksClub/datatalksclub.github.io/187b7d056a36d5af6ac33e4c8096c52d13a078a7/_data/events.yaml'

raw_yaml = requests.get(events_url).content
events_data = yaml.load(raw_yaml, yaml.CSafeLoader)

podcasts = [d for d in events_data if (d.get('type') == 'podcast') and (d.get('youtube'))]

print(f"Found {len(podcasts)} podcasts")

Found 194 podcasts


In [31]:
videos = []

for podcast in podcasts:
    _, video_id = podcast['youtube'].split('watch?v=')

    # Skip problematic videos
    if video_id in ['FRi0SUtxdMw', 's8kyzy8V5b8']:
        continue

    videos.append({
        'title': podcast['title'],
        'video_id': video_id
    })

print(f"Will process {len(videos)} videos")

Will process 192 videos


In [35]:
from tqdm.auto import tqdm

for video in tqdm(videos):
    video_id = video['video_id']
    video_title = video['title']
    print(video_id, video_title)

    if es.exists(index='podcasts', id=video_id):
        print(f'already processed {video_id}')
        continue

    # Real-Time
    # transcript = fetch_transcript(video_id)
    # subtitles = make_subtitles(transcript)
    # Batch (pre-processed) to avoid YouTube ratelimit
    subtitles = fetch_transcript_cached(video_id)['subtitles'] 
    
    doc = {
        "video_id": video_id,
        "title": video_title,
        "subtitles": subtitles
    }
    
    es.index(index="podcasts", id=video_id, document=doc)

  0%|          | 0/192 [00:00<?, ?it/s]

D2rw52SOFfM Reinventing a Career in Tech
eC3RNuI6ow0 How to Build and Evaluate AI systems in the Age of LLMs
ZFrcrTtnB1Q From Biotechnology to Bioinformatics Software
x2AAjqz2XmM Building reliable AI products in the era of Gen AI and Agents
vK_SxyqIfwk Lessons from Applied AI: Tesla, Waymo, and Beyond
B2tzuUg5uZs From Semiconductors to Machine Learning: A Career in Data and Teaching
DSxqUlumM3A Lessons from Two Decades of AI
gXvVMvhfrIY From Theme Parks to Tesla: Building Data Products That Work
b92gwrsVQtg From Astronomy to Applied ML
5km62e4nDaw From Medicine to Machine Learning: How Public Learning Turned into a Career
B76J4QkZPWs Mindful Data Strategy: From Pipelines to Business Impact
S93V8RgwBig Taking your Freelance Career to the Next Level
pkcpH5N-GP8 From Simulation Algorithms to Production-Grade Data Systems
vXbMUfHE1OE From Hackathons To Developer Advocacy
ekG5zJioyFs Build a Strong Career in Data
7ePp6wuxM5s From Supply Chain Management to Digital Warehousing and FinOps
PxA

## Run `workflow.py`

In [45]:
!python workflow.py

Found 194 podcasts
Will process 193 videos
already processed: D2rw52SOFfM
already processed: eC3RNuI6ow0
already processed: ZFrcrTtnB1Q
already processed: x2AAjqz2XmM
already processed: vK_SxyqIfwk
already processed: B2tzuUg5uZs
already processed: DSxqUlumM3A
already processed: gXvVMvhfrIY
already processed: b92gwrsVQtg
already processed: s8kyzy8V5b8
already processed: 5km62e4nDaw
already processed: B76J4QkZPWs
already processed: S93V8RgwBig
already processed: pkcpH5N-GP8
already processed: vXbMUfHE1OE
already processed: ekG5zJioyFs
already processed: 7ePp6wuxM5s
already processed: PxAh08Pcmj4
already processed: BP6w_vKySN0
already processed: DX9c__a4jzg
already processed: AlCFKbFIEM8
already processed: NfAJAr7FvyY
already processed: 1aMuynlLM3o
already processed: QKWu5-6_6TE
already processed: GifY8Zn-pnU
already processed: sXU9vMDBjmk
already processed: GHbeXIKnkLQ
already processed: kV0ZDy2UtJA
already processed: bT7-HRNCltk
already processed: yTZ4cddD7DU
already processed: VXQIGHUW

In [54]:
!uv run python worker.py

[2m2025-12-17T20:41:05.067160Z[0m [33m WARN[0m [2mtemporalio_sdk_core::worker::heartbeat[0m[2m:[0m Worker heartbeating configured for runtime, but server version does not support it.
Found 194 podcasts
Will process 192 videos
^C
Traceback (most recent call last):
  File [35m"/usr/lib/python3.13/asyncio/runners.py"[0m, line [35m118[0m, in [35mrun[0m
    return [31mself._loop.run_until_complete[0m[1;31m(task)[0m
           [31m~~~~~~~~~~~~~~~~~~~~~~~~~~~~~[0m[1;31m^^^^^^[0m
  File [35m"/usr/lib/python3.13/asyncio/base_events.py"[0m, line [35m725[0m, in [35mrun_until_complete[0m
    return [31mfuture.result[0m[1;31m()[0m
           [31m~~~~~~~~~~~~~[0m[1;31m^^[0m
  File [35m"/home/ht/Documents/GitHub_HT/project_deep_research_agent/src/flow/worker.py"[0m, line [35m37[0m, in [35mrun_worker[0m
    await worker.run()
  File [35m"/home/ht/Documents/GitHub_HT/project_deep_research_agent/src/flow/.venv/lib/python3.13/site-packages/temporalio/worker/_wor