In [3]:
from langchain_community.document_loaders import YoutubeLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

from youtube_transcript_api import YouTubeTranscriptApi
from pytube import Playlist
from tqdm import tqdm

from langchain_community.document_loaders import TextLoader
from langchain.schema import Document
from dotenv import load_dotenv

load_dotenv()

import time
import os

def insert_metadata_and_type(documents, reference):
    """Prefix every chunk with a header describing the video it came from.

    Each element of ``documents`` is replaced, in place, by a new ``Document``
    whose ``page_content`` is a header built from ``reference.metadata``
    (title, view count, publish date, author, description, video link),
    followed by a blank line and the chunk's original text.

    Args:
        documents: Mutable list of chunk objects exposing ``page_content``.
        reference: Object exposing ``type`` and ``metadata``; the latter must
            be a mapping with the keys ``title``, ``view_count``,
            ``publish_date``, ``author``, ``description`` and ``source``.

    Returns:
        The same ``documents`` list object, with every element replaced.

    Raises:
        KeyError: If ``documents`` is non-empty and ``reference.metadata``
            lacks one of the keys listed above.
    """
    # Nothing to rewrite: return before reading reference.metadata, so an
    # empty chunk list never fails on a missing metadata key.
    if not documents:
        return documents

    meta = reference.metadata
    # The header depends only on the reference, so it is built once instead
    # of once per chunk.
    # NOTE(review): the link assumes metadata['source'] holds the bare video
    # id rather than a full URL - confirm against the loader that fills it.
    header = (
        f"Title: {meta['title']}\n"
        f"Views count: {meta['view_count']}\n"
        f"Publish date: {meta['publish_date']}\n"
        f"Author: {meta['author']}\n"
        f"Description: {meta['description']}\n"
        f"Link video: https://www.youtube.com/watch?v={meta['source']}"
    )

    for idx, doc in enumerate(documents):
        documents[idx] = Document(
            page_content=f'{header}\n\n{doc.page_content}',
            # Shallow copy: each chunk gets its own dict, so editing one
            # chunk's metadata cannot leak into its siblings or the reference.
            metadata=dict(meta),
            type=reference.type,
        )
    return documents

# YouTube playlist URLs; the trailing comment on each entry names its topic.
playlist_link_list = [
    'https://www.youtube.com/playlist?list=PLg0ijj_Zz1a4r6JNAIcKI3io8PKdmdMYD',  # Vector calculus
    'https://www.youtube.com/playlist?list=PLg0ijj_Zz1a5m-lHE3KnqBBBwjrcHqhym',  # Keras
    'https://www.youtube.com/playlist?list=PLg0ijj_Zz1a68r2P9yENIqm3M-YvOlgYe',  # Digital signal processing
    'https://www.youtube.com/playlist?list=PLg0ijj_Zz1a7-MDI1XO2ovJ0uuBe1xCb7',  # Operating systems
    'https://www.youtube.com/playlist?list=PLg0ijj_Zz1a4EQOpxQXIb9pPHu6w9herV',  # Compilers
    'https://www.youtube.com/playlist?list=PLg0ijj_Zz1a6qd__yo0mBPdlFav6BzYoT',  # Data structures in C
    'https://www.youtube.com/playlist?list=PLg0ijj_Zz1a6Q3uvjhv1xLfAflNfqSFv-',  # P-value
    'https://www.youtube.com/playlist?list=PLg0ijj_Zz1a4Lwe4V2LeAe7u7vxb-p416',  # Embedded systems
    'https://www.youtube.com/playlist?list=PLg0ijj_Zz1a5ts7T1AlmYPIgWJHig3PZj',  # Synthesizing sounds with Python
    'https://www.youtube.com/playlist?list=PLg0ijj_Zz1a6x31a2sZ69r9zD_J-abKjN',  # Arduino
    'https://www.youtube.com/playlist?list=PLg0ijj_Zz1a5Th6rb0N7ELizobNIB1PvF',  # Additive synthesis
    'https://www.youtube.com/playlist?list=PLg0ijj_Zz1a5tKtNvv92N-mJnQKjcApMI',  # MIDI and OSC
    'https://www.youtube.com/playlist?list=PLg0ijj_Zz1a5c2HiUHHYo7EGfKmSAZHyP',  # Digital effects
    'https://www.youtube.com/playlist?list=PLg0ijj_Zz1a4xma5FVf0LbMrfUNCFhU9j',  # Granular synthesis
    'https://www.youtube.com/playlist?list=PLg0ijj_Zz1a5ZJpE85xKTUYY3TYTCi9e-',  # Basic software lab
    'https://www.youtube.com/playlist?list=PLg0ijj_Zz1a7n0fhJGSqC1cntApRYIGFH',  # Talks
]

In [4]:
# Flatten the video URLs of every playlist into a single list, keeping the
# order of playlist_link_list.
all_urls = []
for playlist_link in playlist_link_list:
    all_urls.extend(Playlist(playlist_link).video_urls)

print('All Urls in playlist: ', len(all_urls))

All Urls in playlist:  210


In [6]:
# Load one transcript document per video URL, requesting the 'pt' language and
# the extra video info (add_video_info=True). A video that fails is reported
# and skipped so the rest of the run continues.
all_transcripts = []
for url in tqdm(all_urls):
    try:
        loader = YoutubeLoader.from_youtube_url(
            url, add_video_info=True, language=['pt']
        )
        transcript = loader.load()
        # Only the first returned document is kept.
        all_transcripts.append(transcript[0])
    except Exception as exc:
        # `Exception` instead of a bare `except:` so a kernel interrupt
        # (KeyboardInterrupt) or SystemExit still stops this long loop.
        # The cause is printed so a skipped video can be diagnosed later.
        print('Erro no vídeo: ', url, '-', repr(exc))

 29%|██▊       | 60/210 [02:00<04:48,  1.93s/it]

Erro no vídeo:  https://www.youtube.com/watch?v=r8hIT0leJwE


100%|██████████| 210/210 [06:54<00:00,  1.98s/it]


In [7]:
# Splitter settings: chunk_size 4,000 and chunk_overlap 2,000, both measured
# with len(); separators are not treated as regular expressions.
text_splitter = RecursiveCharacterTextSplitter(
    is_separator_regex=False,
    length_function=len,
    chunk_overlap=2_000,
    chunk_size=4_000,
)

# Rebuild each transcript as a new Document from its attribute dict.
all_transcripts_json_format = list(map(vars, all_transcripts))
data_loaded = []
for fields in all_transcripts_json_format:
    data_loaded.append(
        Document(
            page_content=fields['page_content'],
            metadata=fields['metadata'],
            type=fields['type'],
        )
    )

# Split every transcript, then pass the chunks through
# insert_metadata_and_type together with the transcript they came from.
documents_splitted = []
for source_doc in data_loaded:
    chunks = text_splitter.create_documents([source_doc.page_content])
    documents_splitted += insert_metadata_and_type(chunks, source_doc)

print('Total documents: ', len(documents_splitted))

Total documents:  406


In [8]:
import numpy as np
import pandas as pd

# One (n, 1) column array per field, joined side by side into an (n, 2) table.
titles = np.array([chunk.metadata['title'] for chunk in documents_splitted])[:, np.newaxis]
content_splitted = np.array([chunk.page_content for chunk in documents_splitted])[:, np.newaxis]

table_values = np.concatenate((titles, content_splitted), axis=1)
df = pd.DataFrame(table_values, columns=['titles', 'content'])

In [14]:
# One attribute dict (as returned by vars()) per chunk in documents_splitted.
json_content_splitted = list(map(vars, documents_splitted))

In [16]:
import json

# Save the list to a JSON file: UTF-8 encoded, non-ASCII characters written
# as-is (ensure_ascii=False), indented by 4 spaces.
with open('../data/json_content_splitted_canal_tavares.json', mode='w', encoding='utf-8') as json_file:
    json.dump(
        obj=json_content_splitted,
        fp=json_file,
        indent=4,
        ensure_ascii=False,
    )

In [10]:
# Write df as CSV without the index column.
df.to_csv(path_or_buf='../data/content_splitted_canal_tavares.csv', index=False)