In [1]:
!pip install pytube

Collecting pytube
  Using cached pytube-15.0.0-py3-none-any.whl.metadata (5.0 kB)
Using cached pytube-15.0.0-py3-none-any.whl (57 kB)
Installing collected packages: pytube
Successfully installed pytube-15.0.0


In [1]:
from indexify import IndexifyClient, ExtractionGraph
# Shared client used by every later cell (graph creation, upload, retrieval).
# NOTE(review): constructed with no arguments, so it presumably connects to the
# library's default Indexify server address — confirm a server is running there.
client = IndexifyClient()

In [3]:
# YAML spec for the 'video-knowledgebase' extraction graph. The four policies
# form a linear pipeline; each policy names its upstream policy in
# `content_source`:
#   audio_clips -> transcription -> transcription_chunks -> transcription-embedding
# Only the chunking policy carries input_params (chunk_size=1000, overlap=250).
# NOTE(review): 'audio_clips' has no content_source; presumably it runs on
# content uploaded directly to the graph — confirm.
extraction_graph_spec = """
name: 'video-knowledgebase'
extraction_policies:
   - extractor: 'tensorlake/audio-extractor'
     name: 'audio_clips'
   - extractor: 'tensorlake/whisper-asr'
     name: 'transcription'
     content_source: 'audio_clips'
   - extractor: 'tensorlake/chunk-extractor'
     name: 'transcription_chunks'
     input_params:
        chunk_size: 1000
        overlap: 250
     content_source: 'transcription'
   - extractor: 'tensorlake/minilm-l6'
     name: 'transcription-embedding'
     content_source: 'transcription_chunks'
"""

# Parse the YAML into an ExtractionGraph object and hand it to the client to
# create the graph.
extraction_graph = ExtractionGraph.from_yaml(extraction_graph_spec)
client.create_extraction_graph(extraction_graph)

In [2]:
from pytube import YouTube
import os

# Video to index and the local file it is saved to.
yt = YouTube("https://www.youtube.com/watch?v=cplSUhU2avc")
file_name = "state_of_the_union_2024.mp4"

# Skip the download when an earlier run already saved the file, so the cell is
# cheap to re-run.
if not os.path.exists(file_name):
    # Progressive streams carry audio and video in a single file; sort by
    # resolution descending and take the first (highest) one.
    stream = (
        yt.streams
        .filter(progressive=True, file_extension='mp4')
        .order_by('resolution')
        .desc()
        .first()
    )
    # first() returns None when nothing matches the filter; fail with a clear
    # message instead of an AttributeError on None.download.
    if stream is None:
        raise RuntimeError(
            f"No progressive mp4 stream available for {yt.watch_url}"
        )
    stream.download(filename=file_name)

In [6]:
# Upload the downloaded video into the 'video-knowledgebase' graph. Use the
# `file_name` variable set by the download cell so the uploaded path always
# matches the file that was actually written to disk (the previous hard-coded
# "state_of_the_union.mp4" did not match "state_of_the_union_2024.mp4").
# The call's return value is shown as the cell output.
client.upload_file(extraction_graphs="video-knowledgebase", path=file_name)

'c151c4c8d936a6bf'

In [4]:
# List the extractors known to the server; the repr of each one shows its
# accepted input_params and input MIME types, which is useful for checking
# which options a policy may pass.
client.extractors()

[Extractor(name=tensorlake/audio-extractor, description=Extract audio from video, input_params={'properties': {}, 'title': 'AudioExtractorConfig', 'type': 'object'}, input_mime_types=['video', 'video/mp4', 'video/mov', 'video/avi'], outputs={}),
 Extractor(name=tensorlake/chunk-extractor, description=Text Chunk Extractor, input_params={'properties': {'chunk_size': {'default': 100, 'title': 'Chunk Size', 'type': 'integer'}, 'headers_to_split_on': {'default': [], 'items': {'type': 'string'}, 'title': 'Headers To Split On', 'type': 'array'}, 'overlap': {'default': 0, 'title': 'Overlap', 'type': 'integer'}, 'text_splitter': {'default': 'recursive', 'enum': ['char', 'recursive', 'markdown', 'html'], 'title': 'Text Splitter', 'type': 'string'}}, 'title': 'ChunkExtractionInputParams', 'type': 'object'}, input_mime_types=['text/plain'], outputs={}),
 Extractor(name=tensorlake/minilm-l6, description=MiniLM-L6 Sentence Transformer, input_params=None, input_mime_types=['text/plain'], outputs={'em

In [12]:
# First stage of the standalone policy chain: run the audio extractor under the
# policy name "audio_clips_of_videos". No content_source is given.
# NOTE(review): presumably this applies to directly ingested content — confirm.
client.add_extraction_policy(extractor='tensorlake/audio-extractor', name="audio_clips_of_videos")

In [14]:
# Second stage: run whisper-asr on the output of the "audio_clips_of_videos"
# policy (named via content_source) under the policy name "audio-transcription".
client.add_extraction_policy(extractor='tensorlake/whisper-asr', name="audio-transcription", content_source='audio_clips_of_videos')

In [15]:
client.add_extraction_policy(extractor='tensorlake/chunk-extractor', name="transcription-chunks", content_source='audio-transcription')

In [16]:
client.add_extraction_policy(extractor='tensorlake/minilm-l6', name="transcription-embedding", content_source='transcription-chunks', input_params={'chunk_size': 2000, 'overlap': 200})

In [32]:
from indexify_langchain import IndexifyRetriever

# Retriever settings: the index to query and how many results to return.
params = dict(name="transcription-embedding.embedding", top_k=50)

# LangChain-compatible retriever that queries through the shared Indexify client.
retriever = IndexifyRetriever(params=params, client=client)

In [33]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI

In [35]:
# Prompt text with two placeholders: the retrieved context and the user question.
template = (
    "Answer the question based only on the following context:\n"
    "{context}\n"
    "\n"
    "Question: {question}\n"
)
prompt = ChatPromptTemplate.from_template(template)

# Chat model created with the library defaults.
model = ChatOpenAI()

# The invoked string is sent both to the retriever (to fill "context") and,
# unchanged, into "question".
chain_inputs = {"context": retriever, "question": RunnablePassthrough()}

# Pipeline: gather inputs -> render prompt -> call the model -> plain string.
chain = chain_inputs | prompt | model | StrOutputParser()


In [38]:
# Run the full retrieval + generation chain for one question; the chain ends in
# StrOutputParser, and the resulting string is displayed as the cell output.
chain.invoke("Whats President Biden doing to save climate and the evidences he provides?")

'Biden is taking significant action on climate by cutting carbon emissions in half by 2030, creating clean energy jobs, launching the Climate Corps, and working towards environmental justice. He mentions that the world is facing a climate crisis and that all Americans deserve the freedom to be safe. Biden also mentions that America is safer today than when he took office and provides statistics on murder rates and violent crime decreasing.'