In [None]:
import os
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFaceEndpoint, ChatHuggingFace
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnableParallel, RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser
from urllib.parse import urlparse, parse_qs


In [None]:
os.environ["HUGGINGFACEHUB_API_TOKEN"] = "your_hf_key"

In [4]:
def get_youtube_video_id(url: str) -> str:
    """
    Extract the video ID from a YouTube URL.
    Works for standard, short, and URLs with extra parameters.
    """
    parsed_url = urlparse(url)

    # Case 1: Standard URL (https://www.youtube.com/watch?v=VIDEO_ID)
    if parsed_url.hostname in ["www.youtube.com", "youtube.com"]:
        query_params = parse_qs(parsed_url.query)
        return query_params.get("v", [None])[0]

    # Case 2: Short URL (https://youtu.be/VIDEO_ID)
    elif parsed_url.hostname in ["youtu.be"]:
        return parsed_url.path.lstrip("/")

    return None


In [8]:
url = "https://youtu.be/TrExLi4Rbqo"

video_id = get_youtube_video_id(url)

print("Video ID:", video_id)

Video ID: TrExLi4Rbqo


In [9]:
ytt_api = YouTubeTranscriptApi()
fetched_transcript = ytt_api.fetch(video_id,languages=["en"])


In [10]:
fetched_transcript

FetchedTranscript(snippets=[FetchedTranscriptSnippet(text="is India's history just a story of the past or\xa0\nis it the key to understanding one of the world's\xa0\xa0", start=4.0, duration=5.6), FetchedTranscriptSnippet(text='greatest and most enduring civilizations journey\xa0\nwith us through thousands of years of remarkable\xa0\xa0', start=9.6, duration=5.76), FetchedTranscriptSnippet(text='diversity and unity revolutionary ideas and\xa0\nimperial power struggles for freedom and modern\xa0\xa0', start=15.36, duration=5.92), FetchedTranscriptSnippet(text="aspirations what makes India's story so unique and\xa0\nhow does this ancient legacy shape our present and\xa0\xa0", start=21.28, duration=6.72), FetchedTranscriptSnippet(text='future india is not just a country it is a vast\xa0\nliving tapestry woven from thousands of years\xa0\xa0', start=28.0, duration=6.0), FetchedTranscriptSnippet(text='of human experience from the snowcapped Himalayas\xa0\nin the north to the tropical shores

In [11]:
transcript = " ".join(chunk.text for chunk in fetched_transcript)
print(transcript)

is India's history just a story of the past or 
is it the key to understanding one of the world's   greatest and most enduring civilizations journey 
with us through thousands of years of remarkable   diversity and unity revolutionary ideas and 
imperial power struggles for freedom and modern   aspirations what makes India's story so unique and 
how does this ancient legacy shape our present and   future india is not just a country it is a vast 
living tapestry woven from thousands of years   of human experience from the snowcapped Himalayas 
in the north to the tropical shores of the Indian   Ocean in the south the Indian subcontinent 
has been home to some of the world's oldest   and most influential civilizations its story is 
one of extraordinary depth and diversity marked   by spiritual discovery imperial grandeur foreign 
invasions cultural synthesis and revolutionary   resistance geographically varied and culturally 
rich India has nurtured a multitude of languages   faiths phil

In [13]:
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = splitter.create_documents([transcript])

In [14]:
len(chunks)

47

In [15]:
chunks[8]

Document(metadata={}, page_content="noble truths and the eight-fold path a practical\xa0\xa0 guide to end suffering and attain liberation\xa0\nbuddhism's egalitarian ethics simplicity and\xa0\xa0 monastic institutions attracted followers across\xa0\nsocial classes it was patronized by Emperor Ashoka\xa0\xa0 of the Maria Empire 3rd century B.CE CE who\xa0\nhelped spread the religion across Asia from\xa0\xa0 Sri Lanka to China Korea and Japan through\xa0\nmissionaries and inscriptions while Janism\xa0\xa0 remained influential primarily in western India\xa0\nBuddhism declined in the subcontinent by the 10th\xa0\xa0 century CE partly due to the revival of devotional\xa0\nHinduism and the decline of monastic institutions\xa0\xa0 after invasions however it continued to thrive in\xa0\nEast and Southeast Asia these traditions although\xa0\xa0 numerically smaller today played a critical\xa0\nrole in shaping Indian ethical thought art and\xa0\xa0 religious pluralism their emphasis on non-violenc

In [17]:
# Create embeddings using Hugging Face model
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


# Build the FAISS vector store
vector_store = FAISS.from_documents(chunks, embeddings)

In [18]:
vector_store.index_to_docstore_id

{0: 'e3c8eabc-c8e0-4f9c-a5b2-065f65f2b001',
 1: 'eb2886c2-df5b-4991-ada5-cb167b88358e',
 2: '54f3878f-da20-4225-ba3d-748a8374802b',
 3: '27dec55b-fcd1-4d1e-9f9c-0f6428d0b0b3',
 4: '171a95b1-7862-467d-ba80-064f03a03fe9',
 5: 'de89ea7c-8aed-40c9-9468-ea7aad5da80a',
 6: 'ab4f24b5-59e8-4c44-8966-4ae47305ca3e',
 7: 'e8b85be6-c91f-4745-a811-e0d0660e1d4d',
 8: '18ca8dd6-af9b-476a-a68c-c4fd1909b4f6',
 9: '7eb4ac53-f9a2-4d14-a345-68b8a8f5c012',
 10: '53f8861e-89b9-4a4c-a7d8-cc576219ad51',
 11: '5edc1bd4-d277-42fb-bf8e-abdc015fac5e',
 12: '44ed6091-aabe-4b91-8a3b-4db77d533d28',
 13: 'f37c3864-7dd2-4f5b-b486-196c9f3a96be',
 14: 'f67a85d8-8df0-47c1-a05c-a364301f396b',
 15: '37cda24c-0338-4496-acb5-a73d23a65f2f',
 16: '65529b76-76a6-4f56-8998-adc3d706f9ee',
 17: '9c0bc8c2-f4d6-43c2-8922-0a342edfda04',
 18: '4c5d376b-d74e-4351-bb61-00aee547ff1a',
 19: '05e2b5c7-86f5-40c1-868c-73d9f3ce8989',
 20: '6057732c-2655-467c-adfe-889dc27409b7',
 21: '315ce996-2b62-4bb3-9a94-ffa8367ba22e',
 22: 'f6655fb2-6029-

In [18]:
vector_store.get_by_ids(['753f5749-628e-4851-8275-7962d496f726'])

[]

In [20]:
retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 4})

In [21]:
retriever

VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x00000129863907A0>, search_kwargs={'k': 4})

In [22]:
retriever.invoke('What is the main topic of the video?')

[Document(id='1c54460b-d08d-4a8d-a0ea-f6844ebf20fc', metadata={}, page_content="under finance minister Manohan Singh reforms\xa0\xa0 dismantled the license Raj reduced tariffs and\xa0\nopened markets to foreign investment triggering\xa0\xa0 rapid economic growth and integration into the\xa0\nglobal economy this period witnessed the IT boom\xa0\xa0 with cities like Bangalore and Hyderabbad emerging\xa0\nas global technology hubs fueling urbanization and\xa0\xa0 the rise of a new middle class with increased\xa0\nconsumption and aspirations politically India\xa0\xa0 saw significant shifts the Paratia Janata\xa0\nparty BJP rose to prominence with its Hindutva\xa0\xa0 ideology emphasizing Hindu cultural nationalism\xa0\nreshaping electoral politics and social discourse\xa0\xa0 simultaneously regional coalitions became key to\xa0\ngovernance reflecting India's complex diversity\xa0\xa0 socially India grappled with issues of dite\xa0\nrights castbased discrimination and affirmative\xa0\xa0 ac

In [23]:
# Hugging Face LLM
llm_x = HuggingFaceEndpoint(
    repo_id="openai/gpt-oss-120b",
    task="text-generation",
    temperature=0.2,
    max_new_tokens=512 
)
llm = ChatHuggingFace(llm=llm_x)

In [24]:
prompt = PromptTemplate(
    template="""
      You are a helpful assistant.
      Answer ONLY from the provided transcript context.
      If the context is insufficient, just say you don't know.

      {context}
      Question: {question}
    """,
    input_variables = ['context', 'question']
)

In [25]:
question          = "What is the main topic of the video?"
retrieved_docs    = retriever.invoke(question)

In [26]:
retrieved_docs

[Document(id='1c54460b-d08d-4a8d-a0ea-f6844ebf20fc', metadata={}, page_content="under finance minister Manohan Singh reforms\xa0\xa0 dismantled the license Raj reduced tariffs and\xa0\nopened markets to foreign investment triggering\xa0\xa0 rapid economic growth and integration into the\xa0\nglobal economy this period witnessed the IT boom\xa0\xa0 with cities like Bangalore and Hyderabbad emerging\xa0\nas global technology hubs fueling urbanization and\xa0\xa0 the rise of a new middle class with increased\xa0\nconsumption and aspirations politically India\xa0\xa0 saw significant shifts the Paratia Janata\xa0\nparty BJP rose to prominence with its Hindutva\xa0\xa0 ideology emphasizing Hindu cultural nationalism\xa0\nreshaping electoral politics and social discourse\xa0\xa0 simultaneously regional coalitions became key to\xa0\ngovernance reflecting India's complex diversity\xa0\xa0 socially India grappled with issues of dite\xa0\nrights castbased discrimination and affirmative\xa0\xa0 ac

In [27]:
context_text = "\n\n".join(doc.page_content for doc in retrieved_docs)
context_text

"under finance minister Manohan Singh reforms\xa0\xa0 dismantled the license Raj reduced tariffs and\xa0\nopened markets to foreign investment triggering\xa0\xa0 rapid economic growth and integration into the\xa0\nglobal economy this period witnessed the IT boom\xa0\xa0 with cities like Bangalore and Hyderabbad emerging\xa0\nas global technology hubs fueling urbanization and\xa0\xa0 the rise of a new middle class with increased\xa0\nconsumption and aspirations politically India\xa0\xa0 saw significant shifts the Paratia Janata\xa0\nparty BJP rose to prominence with its Hindutva\xa0\xa0 ideology emphasizing Hindu cultural nationalism\xa0\nreshaping electoral politics and social discourse\xa0\xa0 simultaneously regional coalitions became key to\xa0\ngovernance reflecting India's complex diversity\xa0\xa0 socially India grappled with issues of dite\xa0\nrights castbased discrimination and affirmative\xa0\xa0 action debates environmental protests emerged\xa0\nin response to industrializati

In [28]:
final_prompt = prompt.invoke({"context": context_text, "question": question})

In [29]:
final_prompt

StringPromptValue(text="\n      You are a helpful assistant.\n      Answer ONLY from the provided transcript context.\n      If the context is insufficient, just say you don't know.\n\n      under finance minister Manohan Singh reforms\xa0\xa0 dismantled the license Raj reduced tariffs and\xa0\nopened markets to foreign investment triggering\xa0\xa0 rapid economic growth and integration into the\xa0\nglobal economy this period witnessed the IT boom\xa0\xa0 with cities like Bangalore and Hyderabbad emerging\xa0\nas global technology hubs fueling urbanization and\xa0\xa0 the rise of a new middle class with increased\xa0\nconsumption and aspirations politically India\xa0\xa0 saw significant shifts the Paratia Janata\xa0\nparty BJP rose to prominence with its Hindutva\xa0\xa0 ideology emphasizing Hindu cultural nationalism\xa0\nreshaping electoral politics and social discourse\xa0\xa0 simultaneously regional coalitions became key to\xa0\ngovernance reflecting India's complex diversity\xa0\

In [30]:
answer = llm.invoke(final_prompt)
print(answer.content)

The video’s main topic is the historical and contemporary evolution of India—covering its economic reforms, political shifts, social changes, and cultural developments that have shaped modern Indian society.


In [31]:
def format_docs(retrieved_docs):
  context_text = "\n\n".join(doc.page_content for doc in retrieved_docs)
  return context_text

In [32]:
parallel_chain = RunnableParallel({
    'context': retriever | RunnableLambda(format_docs),
    'question': RunnablePassthrough()
})

In [33]:
parallel_chain.invoke('Explain ancient civilization')

{'context': "away technology unrivaled in the ancient world\xa0\xa0 graneries dockyards and public baths reflected\xa0\nnot only architectural prowess but also a\xa0\xa0 strong administrative structure the uniformity in\xa0\nbrick sizes and town layouts suggest centralized\xa0\xa0 planning and control possibly by a ruling elite or\xa0\ncouncil the economy was based on agriculture craft\xa0\xa0 production and extensive trade networks seals\xa0\nbearing animal motifs and a still undeciphered\xa0\xa0 script point to commercial and possibly religious\xa0\nactivities goods such as beads ivory and ceramics\xa0\xa0 were traded with Mesopotamia and other distant\xa0\nregions indicating a robust international\xa0\xa0 exchange system yet despite its achievements the\xa0\nIndis Valley civilization mysteriously declined\xa0\xa0 around 1900 B.CE scholars have debated the causes\xa0\nclimate change river shifts tectonic activity and\xa0\xa0 the Aryan migration or invasion theory each\xa0\nremains pl

In [34]:
parser = StrOutputParser()

In [35]:
main_chain = parallel_chain | prompt | llm | parser

In [36]:
main_chain.invoke('Can you summarize the video')

'The video presents a sweeping overview of India’s historical journey—from ancient civilizations through colonial rule to independence and its aftermath. It highlights key figures such as Mahatma\u202fGandhi, Jawaharlal\u202fNehru, Sardar\u202fPatel, Dr.\u202fB.R.\u202fAmbedkar, Subhas\u202fChandra\u202fBose, and Muhammad\u202fAli\u202fJinnah, illustrating their differing visions of a secular, socialist, and democratic nation.  \n\nIt explains how the Indian National Congress and the Muslim League’s demand for a separate Muslim homeland led to the 1947 partition of British India into India and Pakistan, triggering massive migrations, communal violence, and lasting trauma.  \n\nThe narrative emphasizes India’s rich cultural and geographic diversity, its legacy of spiritual, imperial, and revolutionary movements, and how this deep‑rooted past continues to shape contemporary Indian policies on secularism, minority rights, and nation‑building. The video concludes by noting the ongoing chal