In [33]:
import os
import re
from enum import Enum
from urllib.parse import urlparse, parse_qs
from tqdm import tqdm

from dotenv import load_dotenv
from langchain_core.documents.base import Document
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pydantic import BaseModel, Field
from pytube import Playlist, YouTube
from youtube_transcript_api import YouTubeTranscriptApi
import yt_dlp

# Load settings from a .env file before anything reads the environment.
# NOTE(review): override=True presumably lets .env values replace variables that are
# already set in the process environment -- confirm against the python-dotenv docs.
load_dotenv(override=True)


def get_videos_links(url_playlist: str) -> list[str]:
    """Collect the URL of every video in a playlist.

    Args:
        url_playlist: URL of the playlist to expand.

    Returns:
        The playlist's ``video_urls`` materialised as a list, in the order
        the ``Playlist`` object yields them.
    """
    return [video_url for video_url in Playlist(url_playlist).video_urls]


def _extract_video_id(url_video: str) -> str:
    """Return the video ID carried by a YouTube URL (or the bare ID itself)."""
    parsed = urlparse(url_video)
    ids = parse_qs(parsed.query).get("v")
    if ids:
        # Reading the query parameter ignores trailing "&list=...&index=..." noise.
        return ids[0]
    # No "v" parameter (e.g. youtu.be/<id>, /embed/<id>, /shorts/<id>, bare ID):
    # fall back to the last path segment.
    return parsed.path.rstrip("/").rsplit("/", 1)[-1]


def _snippet_text(snippet) -> str:
    """Read the text of one transcript snippet, whether it is a dict or an object."""
    if isinstance(snippet, dict):
        return snippet["text"]
    return snippet.text


def get_video_transcription(urls_videos: list[str], language: str="pt") -> list[str]:
    """Fetch one plain-text transcript per video URL.

    Args:
        urls_videos: Video URLs to transcribe.
        language: Preferred transcript language code; ``'en'`` is always
            offered as the second choice.

    Returns:
        A list aligned with ``urls_videos``. Each item is the transcript text
        with snippets joined by single spaces, or the placeholder
        ``"Transcrição indisponível"`` when anything goes wrong for that video.

    Failures are reported with ``print`` and never raised, so a single blocked
    or caption-less video does not abort the whole batch.
    """
    # One client is enough for the whole batch.
    ytt_api = YouTubeTranscriptApi()
    transcriptions = []
    for url_video in urls_videos:
        try:
            video_id = _extract_video_id(url_video)
            transcript = ytt_api.list(video_id).find_transcript([language, 'en']).fetch()
            # Snippets may be attribute-style objects (as in this notebook's recorded
            # output) or dicts, so _snippet_text accepts both.
            transcriptions.append(" ".join(_snippet_text(snippet) for snippet in transcript))
        except Exception as e:
            print(f"Erro ao obter transcrição do vídeo {url_video}: {e}")
            transcriptions.append("Transcrição indisponível")

    return transcriptions


def _normalize_upload_date(raw_date):
    """Turn a ``YYYYMMDD`` string into ``YYYY-MM-DD``; return anything else unchanged."""
    if isinstance(raw_date, str) and len(raw_date) == 8 and raw_date.isdigit():
        return f"{raw_date[:4]}-{raw_date[4:6]}-{raw_date[6:]}"
    return raw_date


def _metadata_from_pytube(url_video: str) -> dict:
    """Build the metadata record for one video using pytube's ``YouTube`` object."""
    video = YouTube(url_video)
    return {
        "Url Vídeo":                url_video,
        "Título":                   video.title,
        "Descrição":                video.description,
        "Data de Publicação":       video.publish_date.strftime('%Y-%m-%d'),
        "Duração (segundos)":       video.length,
        "Canal":                    video.author,
        "URL do Canal":             video.channel_url,
        "Visualizações":            video.views,
        "Palavras-chave":           video.keywords,
        "Thumbnail":                video.thumbnail_url,
    }


def _metadata_from_ytdlp(url_video: str) -> dict:
    """Build the metadata record for one video from yt-dlp's info dict (no download)."""
    ydl_opts = {
        'quiet': True,
        'skip_download': True,
        'forcejson': True
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url_video, download=False)
    return {
        "Url Vídeo":                url_video,
        "Título":                   info.get("title"),
        "Descrição":                info.get("description"),
        # Reformatted so both back-ends store the date as YYYY-MM-DD.
        "Data de Publicação":       _normalize_upload_date(info.get("upload_date")),
        "Duração (segundos)":       info.get("duration"),
        "Canal":                    info.get("uploader"),
        "URL do Canal":             info.get("channel_url"),
        "Visualizações":            info.get("view_count"),
        "Palavras-chave":           info.get("tags"),
        "Thumbnail":                info.get("thumbnail"),
    }


def _metadata_unavailable(url_video: str) -> dict:
    """Build the placeholder record used when every metadata back-end failed."""
    return {
        "Url Vídeo":                url_video,
        "Título":                   "Indisponível",
        "Descrição":                "Indisponível",
        "Data de Publicação":       "Indisponível",
        "Duração (segundos)":       "Indisponível",
        "Canal":                    "Indisponível",
        "URL do Canal":             "Indisponível",
        "Visualizações":            "Indisponível",
        "Palavras-chave":           "Indisponível",
        "Thumbnail":                "Indisponível",
    }


def get_videos_metadata(urls_videos: list[str], language: str='pt') -> list[dict]:
    """Gather metadata and transcript for every video URL.

    For each video the metadata is read with pytube; if that raises, yt-dlp is
    tried; if that also raises, a record filled with ``"Indisponível"`` is used.
    The transcript obtained from ``get_video_transcription`` is stored under
    ``"Transcrição"`` as the last key of every record.

    Args:
        urls_videos: Video URLs to describe.
        language: Preferred transcript language, forwarded to
            ``get_video_transcription``.

    Returns:
        One dict per input URL, in input order.
    """
    transcriptions = get_video_transcription(urls_videos, language)
    videos_metadata = []
    # Wrap the list itself (not enumerate) so tqdm knows the total and can show progress.
    for url_video, transcription in zip(tqdm(urls_videos), transcriptions):
        try:
            metadata_video = _metadata_from_pytube(url_video)
        except Exception as pytube_error:
            print(f"Tentando com yt-dlp. {pytube_error}")
            try:
                metadata_video = _metadata_from_ytdlp(url_video)
                print("Conseguimos com o yt-dlp")
            except Exception as ytdlp_error:
                print(f"yt-dlp também falhou. {ytdlp_error}")
                metadata_video = _metadata_unavailable(url_video)
        metadata_video["Transcrição"] = transcription
        videos_metadata.append(metadata_video)
    return videos_metadata


def sanitize_index_name(url: str) -> str:
    """Derive a lowercase, hyphen-separated name from a URL's ``list`` parameter.

    The first ``list`` query value is lower-cased, every character outside
    ``a-z``, ``0-9`` and ``-`` becomes ``-``, runs of ``-`` are collapsed, and
    leading/trailing ``-`` are removed.

    Args:
        url: URL expected to carry a ``list=<playlist id>`` query parameter.

    Returns:
        The cleaned name, or ``""`` when the URL has no ``list`` value.
    """
    list_values = parse_qs(urlparse(url).query).get("list", [""])
    slug = list_values[0].lower()
    # Applied in order: replace disallowed characters, then squeeze repeated hyphens.
    for pattern, replacement in ((r'[^a-z0-9\-]', '-'), (r'-+', '-')):
        slug = re.sub(pattern, replacement, slug)
    return slug.strip('-')


def get_data_from_playlist(url_playlist: str, language: str = "pt"):
    """Expand a playlist into per-video metadata records.

    Args:
        url_playlist: URL of the playlist to process.
        language: Preferred transcript language, forwarded to
            ``get_videos_metadata``.

    Returns:
        Whatever ``get_videos_metadata`` returns for the playlist's video URLs.
    """
    return get_videos_metadata(get_videos_links(url_playlist), language)




In [40]:
# Smoke-run the whole pipeline on a single playlist, asking for Spanish ("es") transcripts.
# NOTE(review): the recorded output shows the transcript request being rejected (IP blocked)
# and the run ending in KeyboardInterrupt, so `response` may not have been assigned.
response = get_data_from_playlist("https://www.youtube.com/playlist?list=PLrIEi9PL7EuHt05NBceRFFv5o2m_Bj40d", "es")

Erro ao obter transcrição do vídeo https://www.youtube.com/watch?v=mbsCUmJNheM: 
Could not retrieve a transcript for the video https://www.youtube.com/watch?v=mbsCUmJNheM! This is most likely caused by:

YouTube is blocking requests from your IP. This usually is due to one of the following reasons:
- You have done too many requests and your IP has been blocked by YouTube
- You are doing requests from an IP belonging to a cloud provider (like AWS, Google Cloud Platform, Azure, etc.). Unfortunately, most IPs from cloud providers are blocked by YouTube.

Ways to work around this are explained in the "Working around IP bans" section of the README (https://github.com/jdepoix/youtube-transcript-api?tab=readme-ov-file#working-around-ip-bans-requestblocked-or-ipblocked-exception).


If you are sure that the described cause is not responsible for this error and that a transcript should be retrievable, please create an issue at https://github.com/jdepoix/youtube-transcript-api/issues. Please add

KeyboardInterrupt: 

In [12]:
# Scratch cell: reproduce the ID extraction on one known video and create an API client.
url_video = example = "https://www.youtube.com/watch?v=mbsCUmJNheM"
# Everything after the "v=" marker is taken as the video ID.
video_id = url_video[url_video.find("v=") + 2:]
ytt_api = YouTubeTranscriptApi()

In [14]:
# Ask the API client for the transcripts of `video_id`.
# Depends on `ytt_api` and `video_id` left in the kernel by the scratch cell In [12].
transcript_list = ytt_api.list(video_id)

In [19]:
# iterate over all available transcripts
# Walk through every transcript listed for the video and show what each one offers.
for transcript in transcript_list:

    # One line per transcript: identifying and capability properties, in this order.
    print(*(
        getattr(transcript, attr)
        for attr in (
            "video_id",
            "language",
            "language_code",
            "is_generated",           # auto-generated vs. manually created
            "is_translatable",        # can this transcript be translated?
            "translation_languages",  # languages it can be translated to
        )
    ))

    # The transcript content itself.
    print(transcript.fetch())

    # translate() yields another transcript object, which is fetched the same way.
    print(transcript.translate('en').fetch())

mbsCUmJNheM Spanish (auto-generated) es True True [_TranslationLanguage(language='English', language_code='en')]
FetchedTranscript(snippets=[FetchedTranscriptSnippet(text='Hola chicos Cómo están les saludo con', start=0.9, duration=4.02), FetchedTranscriptSnippet(text='muchísimo gusto ya saben que mi nombre', start=3.179, duration=3.541), FetchedTranscriptSnippet(text='es Cristian que soy el profe Cristian y', start=4.92, duration=3.48), FetchedTranscriptSnippet(text='que soy el profe que más conoce de exani', start=6.72, duration=4.499), FetchedTranscriptSnippet(text='en todo el país chicos Les traigo una', start=8.4, duration=4.56), FetchedTranscriptSnippet(text='buena noticia quiero que sepan que les', start=11.219, duration=3.96), FetchedTranscriptSnippet(text='voy a compartir un curso completito de', start=12.96, duration=3.96), FetchedTranscriptSnippet(text='redacción directa para todas aquellas', start=15.179, duration=3.901), FetchedTranscriptSnippet(text='personas que van a ha

In [None]:
# ENEM playlist URLs. Trailing labels are kept verbatim and appear to give
# channel, playlist title and video count.
playlists_enem = [
    "https://www.youtube.com/playlist?list=PL8vXuI6zmpdiy2Xhd1Tn4_q974HxDUzox",  # Kultivi, ENEM | Português, 61 videos
    "https://www.youtube.com/playlist?list=PLQVUQftDIJQErJmPAGyHrdtxrgLklj6a_",  # Curso Enem Gratuito, REDAÇÂO ENEM, 74 videos
    "https://www.youtube.com/playlist?list=PLdkZd7NEhSen1qvUgdM56njgqWyUHBbVT",  # Kaylanne Roberta, GEOGRAFIA PARA O ENEM, 100 videos
    "https://www.youtube.com/playlist?list=PLMra4G0-Z7pMYLE-D-ptnHt1IW_Y1hn8H",  # Parabólica, História para o ENEM, 106 videos
    "https://www.youtube.com/playlist?list=PLMra4G0-Z7pOBZjKCxen5cRa6iUMAdYKh",  # Parabólica, Filosofia para o ENEM, 134 videos
    "https://www.youtube.com/playlist?list=PLPNLvl90MqKT06er408AFhGbSHt7YuCZS",  # Partiu Universidade, Sociologia para o ENEM, 65 videos
    "https://www.youtube.com/playlist?list=PLGyv8aUrOlzB76mu-Em4JaCFaR2dnccKH",  # Gis com Giz Matemática, ENEM MATEMÁTICA, 155 videos
    "https://www.youtube.com/playlist?list=PLruc42XZPq_beOi5MBhnvnNXuqPuubwWx",  # Não Perca a Cabeça, Matemática | Extensivo Enem 2020 NPAC, 26 videos
    "https://www.youtube.com/playlist?list=PLruc42XZPq_ZOg6er3bnrCwc_SuE0MiD5",  # Não Perca a Cabeça, Química | Extensivo Enem 2020 NPAC, 32 videos
    "https://www.youtube.com/playlist?list=PLruc42XZPq_Z3eTDHPQ6WS2SQdlFGYMFZ",  # Não Perca a Cabeça, Biologia | Extensivo Enem 2020 NPAC, 29 videos
    "https://www.youtube.com/playlist?list=PLruc42XZPq_ZDymwPY1ZM3gNMg53dqTM-",  # Não Perca a Cabeça, Física | Extensivo Enem 2020 NPAC, 32 videos
    "https://www.youtube.com/playlist?list=PL0LfmDSptaT3Lvnf-YsUI18pzihaLPARU",  # Marcelão da Química, EXTENSIVO QUÍMICA COMPLETO, 98 videos
    "https://www.youtube.com/playlist?list=PL8vXuI6zmpdiG6QR-LpXXbUYzPz5rOhF2",  # Kultivi, Física, 67 videos
    "https://www.youtube.com/playlist?list=PL8vXuI6zmpdgu7TOyarRBU42MXOrUJnOS",  # Kultivi, Biologia, 121 videos
]
# Language code paired with the ENEM playlists.
lang_enem = "pt"

# SAT playlist URLs. Trailing labels are kept verbatim and appear to give
# channel and playlist title.
playlists_sat = [
    "https://www.youtube.com/playlist?list=PLXmJvdMwHZJ-UdQx7Y79LluJpYblQK9yV",  # Hayden Rhodea SAT, Free 90-Days of SAT Prep Lessons
    "https://www.youtube.com/playlist?list=PL0o_zxa4K1BWLRgHW2i8diQ359U5FbKnI",  # The Organic Chemistry Tutor, SAT Math
    "https://www.youtube.com/playlist?list=PLXmJvdMwHZJ98xsCj2NWsdyCaNac4xX6J",  # Hayden Rhodea SAT, Digital SAT Prep
    "https://www.youtube.com/playlist?list=PLtxyMEBEejjngM0SdF4IJERmllvugNwC1",  # MathPrep with Ahmad Bilal, Digital SAT Math
    "https://www.youtube.com/playlist?list=PLfCCnEv2UFBGhA3X0GhD6PNOs-_V5w8IL",  # Prepworks Education, 2025 DIGITAL SAT READING & WRITING
    "https://www.youtube.com/playlist?list=PLfCCnEv2UFBErzkHx6EC2DWzJO43p3Umv",  # Prepworks Education, 2025 DIGITAL SAT MATH
]
# Language code paired with the SAT playlists.
lang_sat = "en"

# ICFES playlist URLs. Trailing labels are kept verbatim and appear to give
# channel and playlist title/description.
playlists_icfes = [
    "https://www.youtube.com/playlist?list=PLc3AJ6veKFxwRmzseIcfbiP0PCau2xpYL",  # Dalton Academy, 2024-1 Matemáticas
    "https://www.youtube.com/playlist?list=PLc3AJ6veKFxxMtIfqaNLOPBBhFGjfa1p_",  # Dalton Academy, Sociales Examen 2025
    "https://www.youtube.com/playlist?list=PLfgpI424HxeV9j_saqo8mZNsNmvxWok-P",  # Andres Sandoval Acero, Videos preparación de ICFES
    "https://www.youtube.com/playlist?list=PLrG8f_FO9cA1dSoVpiKLaGSqQoQuYRrs5",  # Ángela, Preparación ICFES
    "https://www.youtube.com/playlist?list=PLCDYXrbSVBJEc1qeqL5fDluLmy9xurEj0",  # Las Mates Fáciles, ICFES Matemática
    "https://www.youtube.com/playlist?list=PLJxMpbL3S1R5jcWCtMe-_7shXos03QmFn",  # Praxis Thinks, Prática Geral ICFES
    "https://www.youtube.com/playlist?list=PLMY6ovNEwSVPd0q7RzsV8pCmutQMxjLhz",  # Andrea :3, Icfes Geral
    "https://www.youtube.com/playlist?list=PLJxMpbL3S1R51_T3hRbBuP0kv9DbF9JRZ",  # Praxis Thinks, Leitura Crítica
    "https://www.youtube.com/playlist?list=PLJxMpbL3S1R4HgIuHX1ionNrD4R-tH899",  # Praxis Thinks, Biologia
    "https://www.youtube.com/playlist?list=PLJxMpbL3S1R6OjrYLlTob7UbSK21a7-__",  # Praxis Thinks, Química sem fórmulas
    "https://www.youtube.com/playlist?list=PLJxMpbL3S1R4dPMe8WQa5ecaEhyyKRPCE",  # Praxis Thinks, Sociales y Ciudadanas
    "https://www.youtube.com/playlist?list=PLJxMpbL3S1R4e2yL-BcUeV-ZuUdoALJcH",  # Praxis Thinks, Física sem fórmulas
]
# Language code paired with the ICFES playlists.
lang_icfes = "es"

# CUET playlist URLs. Trailing labels are kept verbatim and appear to give
# channel and playlist title.
playlists_cuet = [
    "https://www.youtube.com/playlist?list=PLhrnHPBcOqMmeWwrMIPT7x94vqG-CKioM",  # Dear Sir, CUET 2022 Preparation
    "https://www.youtube.com/playlist?list=PLhrnHPBcOqMmXznFFIzL3BxrOtSNLPYhr",  # Dear Sir, CUET GAT 2025
    "https://www.youtube.com/playlist?list=PLhrnHPBcOqMnMjpIc8DgpLVuEKrHv3WJK",  # Dear Sir, CUET 2025 English
]
# Language code paired with the CUET playlists.
lang_cuet = "en"

# Portugal playlist URLs. Trailing labels are kept verbatim and appear to give
# channel and playlist title.
playlists_portugal = [
    "https://www.youtube.com/playlist?list=PLptnVRHa5SDKrsEj_zotfxAMh4a2ux1Yk",  # EmPortuguês, Gramática
    "https://www.youtube.com/playlist?list=PLOnAU1tbviBVcOL2iPwLINvubS0r1CDTA",  # Clara, Exames Nacionais PT
    "https://www.youtube.com/playlist?list=PLJjXYrAIDW5KSpq-1BPYeuJ8frBKQqB8g",  # Prof. Bruno Amorim, Matemática A
]
# Language code paired with the Portugal playlists.
lang_portugal = "pt"

# EXANI-II playlist URLs. Trailing labels are kept verbatim and appear to give
# channel and playlist title.
playlists_exanii = [
    "https://www.youtube.com/playlist?list=PLrIEi9PL7EuFVHdc8qXihzilsc0cqPSs_",  # Profe Cristian, Curso Integral Plus 2025
    "https://www.youtube.com/playlist?list=PLrIEi9PL7EuElFoiP5I2SS7ltgAkg1OAp",  # Profe Cristian, Pensamiento Matemático
    "https://www.youtube.com/playlist?list=PLrIEi9PL7EuHt05NBceRFFv5o2m_Bj40d",  # Profe Cristian, Redacción Indirecta
    "https://www.youtube.com/playlist?list=PL6diNeAKW2DNuwoS24vpFVumVUpCG2Tsx",  # Tony, Historia de México
    "https://www.youtube.com/playlist?list=PLPqecczCAWWX2oJq9NnG4fUJOHjNU_qa3",  # Economia Desde Casa, Administración
]
# Language code paired with the EXANI-II playlists.
lang_exanii = "es"



In [21]:
# Display the extracted video ID as a sanity check.
video_id

'mbsCUmJNheM'

In [17]:
# Select one transcript from `transcript_list`, passing Spanish then English as language codes
# (presumably tried in that priority order -- confirm in the youtube-transcript-api docs).
transcript = transcript_list.find_transcript(['es', 'en'])

In [18]:
# Show the repr of the selected Transcript object; nothing has been fetched from it here.
transcript

<youtube_transcript_api._transcripts.Transcript at 0x18c0d907cd0>

In [35]:
from langchain_community.document_loaders import YoutubeLoader

In [38]:
# Try LangChain's YoutubeLoader as an alternative way to load one video's transcript.
# NOTE(review): the recorded output is an AttributeError -- `YouTubeTranscriptApi` has no
# attribute `list_transcripts` -- which suggests the loader expects a different
# youtube-transcript-api interface than the one installed; TODO confirm package versions.
# This also rebinds `transcript`, which an earlier cell set to a Transcript object.
loader = YoutubeLoader.from_youtube_url(
    "https://www.youtube.com/watch?v=QsYGlZkevEg", add_video_info=False
)
transcript = loader.load()

AttributeError: type object 'YouTubeTranscriptApi' has no attribute 'list_transcripts'