In [8]:
import os, re, math, time, textwrap
from typing import List, Tuple
import psycopg2
import psycopg2.extras
import ollama
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
import requests
from bs4 import BeautifulSoup

In [21]:
def extract_video_id(url: str) -> str:
    """Pull the video ID out of a YouTube URL.

    Recognises the ``v=<id>``, ``youtu.be/<id>`` and ``shorts/<id>`` forms,
    where ``<id>`` is a run of at least six characters from
    ``[A-Za-z0-9_-]``.

    Args:
        url: A YouTube URL, or a string that is already a bare video ID.

    Returns:
        The captured ID, or ``url`` unchanged when none of the recognised
        forms occurs in it.
    """
    found = re.search(r"(?:v=|youtu\.be/|shorts/)([A-Za-z0-9_-]{6,})", url)
    if found is None:
        # Nothing URL-shaped matched: hand the input back as-is.
        return url
    return found.group(1)

def extract_video_title(url: str, timeout: float = 10.0) -> str:
    """Download a page and return its sanitised ``<title>`` text.

    The page is fetched over HTTP, the `` - YouTube`` suffix is removed from
    the title, and every character that is not an ASCII digit, an ASCII
    letter, a character in the Thai Unicode block (U+0E00-U+0E7F) or a dot
    is stripped out. Spaces are therefore removed as well.

    Args:
        url: Full URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        The sanitised title string (possibly empty).

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        RuntimeError: If the page has no usable ``<title>`` element.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing the title of an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    title_tag = soup.title
    # ``soup.title`` is None when the page has no <title>; ``.string`` is
    # None when the tag does not hold exactly one text node.
    raw_title = title_tag.string if title_tag is not None else None
    if raw_title is None:
        raise RuntimeError("No title found for this video page.")

    title = raw_title.replace(" - YouTube", "").strip()
    return re.sub(r'[^0-9a-zA-Z\u0E00-\u0E7F\.]', '', title)

def get_youtube_transcript(video_id: str) -> str:
    """Fetch a video's transcript and flatten it into one line of text.

    The transcript is requested with the language list ``['th', 'en']``.
    Each segment's text is stripped, empty segments are dropped, the rest
    are joined with single spaces, and every run of whitespace is collapsed
    to one space.

    Args:
        video_id: The YouTube video ID.

    Returns:
        The whitespace-normalised transcript text.

    Raises:
        RuntimeError: If transcripts are disabled for the video or no
            transcript could be found.
    """
    try:
        segments = YouTubeTranscriptApi.get_transcript(video_id, languages=['th', 'en'])
    except TranscriptsDisabled:
        raise RuntimeError("Transcripts are disabled for this video.")
    except NoTranscriptFound:
        raise RuntimeError("No transcript found for this video.")

    # Collect the non-empty segment texts, stripping each one only once.
    pieces = []
    for segment in segments:
        cleaned = segment["text"].strip()
        if cleaned:
            pieces.append(cleaned)

    # Segments may contain internal newlines or tabs; squash them to spaces.
    merged = " ".join(pieces)
    return re.sub(r"\s+", " ", merged).strip()

In [54]:
# Demo driver: resolve one hard-coded video into its ID, title and transcript.
# The names assigned here (`url`, `video_id`, `video_title`, `text`) are
# module-level; `text` is the input of the chunking cell further down.
url = "https://youtu.be/KSbXyRZQbUY?si=gGv553ps5TI-FDQa"

# extract_video_id only runs a regex; the next two calls go over the network
# (an HTTP GET for the title, then the transcript API).
video_id = extract_video_id(url=url)
video_title = extract_video_title(url=url)
text = get_youtube_transcript(video_id=video_id)

# Echo what was resolved so the cell output documents the run.
print(f"url: {url}")
print(f"video_id: {video_id}")
print(f"video_title: {video_title}")

url: https://youtu.be/KSbXyRZQbUY?si=gGv553ps5TI-FDQa
video_id: KSbXyRZQbUY
video_title: ยังมีโอกาสสร้างความมั่งคั่งได้อยู่ไหมในยุคแบบนี้


In [51]:
from pythainlp.tokenize import word_tokenize

def chunk_thai_text(text, max_tokens, overlap, tokenizer=None):
    """
    Split Thai text into overlapping chunks measured in tokenizer words.

    The text is tokenized into words, then a window of up to ``max_tokens``
    words slides forward ``max_tokens - overlap`` words at a time, so that
    consecutive chunks share ``overlap`` words. Each chunk is the window's
    words concatenated with no separator.

    Args:
        text: The text to split.
        max_tokens: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by consecutive chunks; must satisfy
            ``0 <= overlap < max_tokens``.
        tokenizer: Optional callable taking the text and returning its words.
            When omitted, ``word_tokenize(text, keep_whitespace=False)`` is
            used.

    Returns:
        A list of chunk strings in document order. It is empty when
        tokenization yields no words.

    Raises:
        ValueError: If ``max_tokens`` is not positive or ``overlap`` is
            outside ``[0, max_tokens)``.
    """
    # A stride of zero or less would never advance `start` and the loop
    # below would append chunks forever; a negative overlap would silently
    # skip words between chunks.
    if max_tokens <= 0:
        raise ValueError("max_tokens must be a positive integer")
    if not 0 <= overlap < max_tokens:
        raise ValueError("overlap must satisfy 0 <= overlap < max_tokens")

    if tokenizer is None:
        words = word_tokenize(text, keep_whitespace=False)
    else:
        words = list(tokenizer(text))

    step = max_tokens - overlap
    chunks = []
    start = 0

    while start < len(words):
        end = min(start + max_tokens, len(words))
        chunks.append(''.join(words[start:end]))  # Join with no spaces for Thai
        if end == len(words):
            # The window already reached the last word. Sliding again would
            # only emit a tail that is fully contained in this chunk.
            break
        start += step

    return chunks

# Example usage: split the transcript held in `text` into windows of at most
# 1000 tokenizer words; the window advances 800 words per step, so consecutive
# windows share 200 words.
chunks = chunk_thai_text(text=text, max_tokens=1000, overlap=200)

In [None]:
def ingest_video(url_or_id: str) -> Tuple[str, int]:
    """Fetch a video's transcript, chunk it, embed the chunks and store them.

    Steps, in order: resolve the video ID, download the transcript, split it
    with ``smart_split``, embed the chunks with ``embed_texts``, then call
    ``init_db(dim)``, ``clear_video(video_id)`` and
    ``upsert_chunks(video_id, chunks, chunk_embs)``. Progress is printed
    along the way.

    NOTE(review): ``smart_split``, ``embed_texts``, ``init_db``,
    ``clear_video`` and ``upsert_chunks`` are not defined in the cells shown
    here; confirm they are defined before this function is called.

    Args:
        url_or_id: A YouTube URL or a bare video ID.

    Returns:
        A ``(video_id, number_of_chunks)`` tuple.

    Raises:
        RuntimeError: If the transcript cannot be fetched, or if chunking or
            embedding produces nothing to store.
    """
    video_id = extract_video_id(url_or_id)
    print(f"Fetching transcript for: {video_id}")
    transcript = get_youtube_transcript(video_id)
    chunks = smart_split(transcript)
    print(f"Transcript length: {len(transcript):,} chars; chunks: {len(chunks)}")

    # An empty transcript can yield no chunks. Stop here with a clear message
    # rather than failing later on `chunk_embs[0]`.
    if len(chunks) == 0:
        raise RuntimeError("Transcript produced no chunks; nothing to ingest.")

    print("Embedding chunks via Ollama...")
    chunk_embs = embed_texts(chunks)
    if len(chunk_embs) == 0:
        raise RuntimeError("Embedding returned no vectors; nothing to ingest.")
    # The embedding width is taken from the first vector.
    dim = len(chunk_embs[0])

    print(f"Initializing DB (dim={dim}) and storing chunks...")
    init_db(dim)
    # presumably removes rows previously stored for this video so a re-run
    # does not duplicate them; confirm against clear_video's definition.
    clear_video(video_id)
    upsert_chunks(video_id, chunks, chunk_embs)

    return video_id, len(chunks)