In [1]:
from langchain_community.document_loaders import YoutubeLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

from youtube_transcript_api import YouTubeTranscriptApi
from pytube import Playlist
from tqdm import tqdm

from langchain_community.document_loaders import TextLoader
from langchain.schema import Document
from dotenv import load_dotenv

# Read key/value pairs from a local .env file (if any) into the process environment.
load_dotenv()

import time
import os

def insert_metadata_and_type(documents, reference):
    """Prefix every chunk with a header describing the video it came from.

    Each element of ``documents`` is replaced, in place, by a new ``Document``
    whose ``page_content`` is a header (title, view count, publish date, author,
    description and video link, all read from ``reference.metadata``) followed by
    a blank line and the chunk's original text.

    Args:
        documents: Mutable list of objects exposing ``page_content``. The list is
            modified in place.
        reference: Object exposing ``type`` and ``metadata``; ``metadata`` must
            contain the keys ``title``, ``view_count``, ``publish_date``,
            ``author``, ``description`` and ``source``.

    Returns:
        The same ``documents`` list that was passed in.

    Raises:
        KeyError: If ``documents`` is non-empty and ``reference.metadata`` lacks
            one of the keys listed above.
    """
    # Nothing to rewrite; return before touching the metadata keys.
    if not documents:
        return documents

    metadata = reference.metadata
    link_video = f"https://www.youtube.com/watch?v={metadata['source']}"
    # The header is the same for every chunk of one video, so build it once.
    header = (
        f"Title: {metadata['title']}\n"
        f"Views count: {metadata['view_count']}\n"
        f"Publish date: {metadata['publish_date']}\n"
        f"Author: {metadata['author']}\n"
        f"Description: {metadata['description']}\n"
        f"Link video: {link_video}\n\n"
    )

    for idx, doc in enumerate(documents):
        documents[idx] = Document(
            page_content=f'{header}{doc.page_content}',
            # Pass a shallow copy so the chunks do not all share one mutable
            # dict with each other and with ``reference``.
            metadata=dict(metadata),
            type=reference.type
        )
    return documents

# Playlists to scrape; each entry below is the id that follows `list=` in the URL.
playlist_link_list = [
    f'https://www.youtube.com/playlist?list={playlist_id}'
    for playlist_id in (
        'PLMra4G0-Z7pMYLE-D-ptnHt1IW_Y1hn8H',  # Parabolica
        'PL8vXuI6zmpdj_YFEHTaBDccdSCC1LVNH0',  # Chemistry (Kultivi)
        'PL8vXuI6zmpdiG6QR-LpXXbUYzPz5rOhF2',  # Physics (Kultivi)
        'PLufADJj3qLe9PlyOZVXTV-URSDEmKgHPy',  # Biology (Thaisefinish)
    )
]


In [2]:

# Flatten the video URLs of every playlist into one list, in playlist order.
all_urls = [
    video_url
    for playlist_link in playlist_link_list
    for video_url in Playlist(playlist_link).video_urls
]

print('All Urls in playlist: ', len(all_urls))


All Urls in playlist:  357


In [3]:

# Splitter later used (via `create_documents`) to cut each transcript into chunks.
text_splitter = RecursiveCharacterTextSplitter(
    # Target chunk size, measured with `length_function` below.
    chunk_size=5_000,
    # Overlap between chunks, in the same unit as `chunk_size`.
    chunk_overlap=1_000,
    # Lengths are computed with the built-in len(), i.e. counted in characters.
    length_function=len,
    # NOTE(review): presumably means separators are literal strings, not regexes — confirm.
    is_separator_regex=False,
)

# Load the 'pt' transcript (with video info) for every URL. A video that fails
# is reported and skipped so that one bad video does not abort the whole run.
all_transcripts = []
for url in tqdm(all_urls):
    try:
        loader = YoutubeLoader.from_youtube_url(
            url, add_video_info=True, language=['pt']
        )
        transcript = loader.load()
        all_transcripts.append(transcript[0])
    # Catch `Exception` rather than using a bare `except:` so that
    # KeyboardInterrupt / SystemExit can still stop this long-running loop;
    # the reason is printed so failures can be diagnosed afterwards.
    except Exception as exc:
        print('Erro no vídeo: ', url, '-', repr(exc))


  1%|▏         | 5/357 [00:10<11:30,  1.96s/it]

Erro no vídeo:  https://www.youtube.com/watch?v=n-gxIqg5xwI


  3%|▎         | 9/357 [00:20<13:45,  2.37s/it]

Erro no vídeo:  https://www.youtube.com/watch?v=jTuT8iiSF3g


  3%|▎         | 11/357 [00:24<11:57,  2.07s/it]

Erro no vídeo:  https://www.youtube.com/watch?v=QoSWUeL3fW0


  4%|▎         | 13/357 [00:28<12:44,  2.22s/it]

Erro no vídeo:  https://www.youtube.com/watch?v=vOQf4k5-T98


 11%|█         | 38/357 [01:24<11:37,  2.19s/it]

In [6]:
# Dump each loaded transcript to its attribute dict, then rebuild a Document
# from the three fields used downstream.
all_transcripts_json_format = list(map(vars, all_transcripts))
data_loaded = []
for record in all_transcripts_json_format:
    data_loaded.append(
        Document(
            page_content=record['page_content'],
            metadata=record['metadata'],
            type=record['type']
        )
    )

# Split every transcript into chunks and prefix each chunk with its video header.
documents_splitted = []
for rec_doc in data_loaded:
    chunks = text_splitter.create_documents([rec_doc.page_content])
    docs = insert_metadata_and_type(chunks, rec_doc)
    documents_splitted += docs

print('Total documents: ', len(documents_splitted))

Total documents:  234


In [26]:
import numpy as np
import pandas as pd

# One row per chunk: the video title and the chunk text (header included).
# The columns are built as plain Python lists and handed straight to pandas;
# going through fixed-width NumPy string arrays and np.concatenate would pad
# every cell (titles included) to the length of the longest chunk.
titles = [x.metadata['title'] for x in documents_splitted]
content_splitted = [x.page_content for x in documents_splitted]
df = pd.DataFrame({'titles': titles, 'content': content_splitted})

In [29]:
# Create the output folder first: `to_csv` does not create missing directories.
os.makedirs('../data', exist_ok=True)
df.to_csv('../data/data_playlists_enem.csv', index=False)