## Youtube Simple Rag chatbot

In [1]:
from youtube_transcript_api import YouTubeTranscriptApi,TranscriptsDisabled
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai  import ChatGoogleGenerativeAI,GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import PromptTemplate
from dotenv import load_dotenv 

load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

## Step 1a - Indexing (Document Ingestion)

In [2]:
video_id = "Gfr50f6ZBvo" # only the ID, not full URL
try:
    transcript_list = YouTubeTranscriptApi().fetch(video_id,languages=["en"]).to_raw_data()

    transcript = " ".join(chunk["text"] for chunk in transcript_list)
    print(transcript)
    
except TranscriptsDisabled:
    print("No captions avaible for this video")

the following is a conversation with demus hasabis ceo and co-founder of deepmind a company that has published and builds some of the most incredible artificial intelligence systems in the history of computing including alfred zero that learned all by itself to play the game of gold better than any human in the world and alpha fold two that solved protein folding both tasks considered nearly impossible for a very long time demus is widely considered to be one of the most brilliant and impactful humans in the history of artificial intelligence and science and engineering in general this was truly an honor and a pleasure for me to finally sit down with him for this conversation and i'm sure we will talk many times again in the future this is the lex friedman podcast to support it please check out our sponsors in the description and now dear friends here's demis hassabis let's start with a bit of a personal question am i an ai program you wrote to interview people until i get good enough 

## Step 1b - Indexing (Text Splitting)

In [3]:
splitter = RecursiveCharacterTextSplitter(chunk_size=1000,chunk_overlap=200)
chunks = splitter.create_documents([transcript])

In [4]:
print(len(chunks))
chunks

168


[Document(metadata={}, page_content="the following is a conversation with demus hasabis ceo and co-founder of deepmind a company that has published and builds some of the most incredible artificial intelligence systems in the history of computing including alfred zero that learned all by itself to play the game of gold better than any human in the world and alpha fold two that solved protein folding both tasks considered nearly impossible for a very long time demus is widely considered to be one of the most brilliant and impactful humans in the history of artificial intelligence and science and engineering in general this was truly an honor and a pleasure for me to finally sit down with him for this conversation and i'm sure we will talk many times again in the future this is the lex friedman podcast to support it please check out our sponsors in the description and now dear friends here's demis hassabis let's start with a bit of a personal question am i an ai program you wrote to inte

## Step 1c & 1d - Indexing (Embedding Generation and Storing in Vector Store)

In [5]:
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001") 
vector_store = FAISS.from_documents(chunks,embedding=embeddings)

In [6]:
vector_store.index_to_docstore_id

{0: '903975f0-8f33-4c8c-9167-dec016e29266',
 1: 'd6a2f50d-6692-4c1f-98f0-e6b57276f3c1',
 2: '87a2eec9-eac2-4e2d-add4-6727ba4eeb07',
 3: '4656cc4e-593d-4818-8d2b-d7db214830bf',
 4: '5c1ab2c1-b4d1-4fd5-9bfa-5bad291ea9e5',
 5: '39dcde61-00f5-4eef-b318-0b7aa8ca12c7',
 6: 'ec7d3db7-2f56-4871-b896-7ea7cd4cbd53',
 7: 'b8b434c1-39a7-46d7-87ef-425185e282ce',
 8: '18f255c4-ed82-4f3f-ab09-05d694da64fb',
 9: 'cf3af9dd-8731-4a42-9c77-bb304832a38a',
 10: '3b91bdc9-ef50-409c-8cdd-03f939df7b9e',
 11: '1a68fa53-9a10-49ff-acf4-fc44cc4f4db6',
 12: '50af1019-cf45-491c-85cf-114ac25a8794',
 13: '8c3addb3-7b78-4d4b-b2ee-1242a1d838f0',
 14: 'f2cfe44b-7e47-4630-861d-911e46116e9e',
 15: 'f57e8f67-ba2c-437e-83fe-baa4709715ea',
 16: '8e817cb9-e125-415b-bcc0-43a4b343b1c2',
 17: 'c7ede644-570e-41d9-ab86-d063ac7822dc',
 18: 'f57a9801-9cd5-466f-81f1-9c529644265b',
 19: '16785e05-ef93-4085-8116-1983df7270d7',
 20: '762e0748-ec1f-45f9-a76f-34aafbdb1dbc',
 21: 'ae0df83d-2eeb-4806-93a0-09f8af8faab4',
 22: 'bda92326-a6a8-

In [7]:
vector_store.get_by_ids(["ff64ffd3-eec1-414d-a404-f6b386e5c155","ce73aabe-1c8b-4e75-af38-457429370a74"])

[]

## Step 2 - Retrieval

In [8]:
retriever = vector_store.as_retriever(search_type="similarity",search_kwargs={"k":4})

In [9]:
retriever

VectorStoreRetriever(tags=['FAISS', 'GoogleGenerativeAIEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x00000200B99AECF0>, search_kwargs={'k': 4})

In [10]:
retriever.invoke("What is deepmind")

[Document(id='478316eb-e887-4d71-a1f0-7f2e62d5fa26', metadata={}, page_content="that are amazingly smart at certain things like maybe playing go and chess and other things but they don't feel at all in any shape or form conscious in the way that you know you do to me or i do to you and um and i think actually building ai is uh these intelligent constructs uh is one of the best ways to explore the mystery of consciousness to break it down because um we're going to have devices that are pretty smart at certain things or capable of certain things but potentially won't have any semblance of self-awareness or other things and in fact i would advocate if there's a choice building systems in the first place ai systems that are not conscious to begin with uh are just tools um until we understand them better and the capabilities better so on that topic just not as the ceo of deep mind just as a human being let me ask you about this one particular anecdotal evidence of the google engineer who ma

In [11]:
llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash")

In [12]:
prompt = PromptTemplate(
    template="""
 You are a helpfull assistant 
 Answer only  from the provided transcipt context.
 If the context is insufficient, just say you don't know.
 {context}
 Question: {question}
  """,
  input_variables=["context","question"]
)

In [14]:
question = "is the topic of nuclear fusion discussed in this video? if yes then what was discussed"
retriever_docs = retriever.invoke(question)

In [15]:
retriever_docs

[Document(id='ffde3bc7-8126-4567-849d-a5015925c67a', metadata={}, page_content="in this case in fusion we we collaborated with epfl in switzerland the swiss technical institute who are amazing they have a test reactor that they were willing to let us use which you know i double checked with the team we were going to use carefully and safely i was impressed they managed to persuade them to let us use it and um and it's a it's an amazing test reactor they have there and they try all sorts of pretty crazy experiments on it and um the the the what we tend to look at is if we go into a new domain like fusion what are all the bottleneck problems uh like thinking from first principles you know what are all the bottleneck problems that are still stopping fusion working today and then we look at we you know we get a fusion expert to tell us and then we look at those bottlenecks and we look at the ones which ones are amenable to our ai methods today yes right and and and then and would be intere

In [16]:
context_text = "\n\n".join(doc.page_content for doc in retriever_docs)
context_text

"in this case in fusion we we collaborated with epfl in switzerland the swiss technical institute who are amazing they have a test reactor that they were willing to let us use which you know i double checked with the team we were going to use carefully and safely i was impressed they managed to persuade them to let us use it and um and it's a it's an amazing test reactor they have there and they try all sorts of pretty crazy experiments on it and um the the the what we tend to look at is if we go into a new domain like fusion what are all the bottleneck problems uh like thinking from first principles you know what are all the bottleneck problems that are still stopping fusion working today and then we look at we you know we get a fusion expert to tell us and then we look at those bottlenecks and we look at the ones which ones are amenable to our ai methods today yes right and and and then and would be interesting from a research perspective from our point of view from an ai point of\n\

In [17]:
final_prompt = prompt.invoke({"context":context_text,"question":question})

In [18]:
final_prompt

StringPromptValue(text="\n You are a helpfull assistant \n Answer only  from the provided transcipt context.\n If the context is insufficient, just say you don't know.\n in this case in fusion we we collaborated with epfl in switzerland the swiss technical institute who are amazing they have a test reactor that they were willing to let us use which you know i double checked with the team we were going to use carefully and safely i was impressed they managed to persuade them to let us use it and um and it's a it's an amazing test reactor they have there and they try all sorts of pretty crazy experiments on it and um the the the what we tend to look at is if we go into a new domain like fusion what are all the bottleneck problems uh like thinking from first principles you know what are all the bottleneck problems that are still stopping fusion working today and then we look at we you know we get a fusion expert to tell us and then we look at those bottlenecks and we look at the ones whic

## Step 4 - Generation

In [19]:
answer = llm.invoke(final_prompt)
print(answer.content)

Yes, nuclear fusion is discussed in the video. Here's what was discussed:

*   Collaboration with EPFL: The speaker mentions collaborating with EPFL (the Swiss Federal Institute of Technology in Lausanne) and using their test reactor for fusion experiments.
*   Bottleneck Problems: They focus on identifying and addressing the bottleneck problems that are preventing fusion from working effectively, using AI methods.
*   Plasma Control: They developed an AI controller using deep reinforcement learning that can hold plasma in specific shapes for record amounts of time. This was published in a Nature paper.
*   Future Research: They are now in discussion with fusion startups to identify the next problem they can tackle in the fusion area using AI.
*   Traditional Controllers vs. AI: Traditional controllers for plasma containment are handcrafted and cannot react in real-time to the plasma's behavior. The AI controller learns to react instead.
*   Reinforcement Learning: Controlling the magn

## Building a Chain

In [20]:
from langchain_core.runnables import RunnableParallel,RunnablePassthrough,RunnableLambda
from langchain_core.output_parsers import StrOutputParser

In [21]:
def format_docs( retriever_docs):
    context_text = "\n\n".join( doc.page_content for doc in retriever_docs)
    return context_text

In [22]:
parallel_chain = RunnableParallel({
      "context":retriever | RunnableLambda(format_docs),
      "question":RunnablePassthrough()
})

In [23]:
parallel_chain.invoke('who is Demis')

{'context': "the following is a conversation with demus hasabis ceo and co-founder of deepmind a company that has published and builds some of the most incredible artificial intelligence systems in the history of computing including alfred zero that learned all by itself to play the game of gold better than any human in the world and alpha fold two that solved protein folding both tasks considered nearly impossible for a very long time demus is widely considered to be one of the most brilliant and impactful humans in the history of artificial intelligence and science and engineering in general this was truly an honor and a pleasure for me to finally sit down with him for this conversation and i'm sure we will talk many times again in the future this is the lex friedman podcast to support it please check out our sponsors in the description and now dear friends here's demis hassabis let's start with a bit of a personal question am i an ai program you wrote to interview people until i get

In [24]:
parser = StrOutputParser()

In [26]:
main_chain = parallel_chain  | prompt | llm | parser

In [27]:
main_chain.invoke('Can you summarize the video')

"Here's a summary of the provided transcript snippets:\n\n*   The discussion involves using simulations and learning functionals to describe chemistry, specifically how electron clouds interact when elements are combined. The goal is to simulate larger materials by approximating Schrodinger's equation.\n*   The process involves running molecular dynamics simulations on compute clusters to generate data. This data is then used to learn the functional that maps initial conditions and simulation parameters.\n*   There is also a discussion about finding passions that intersect with unique strong skills.\n*   The transcript also touches on the topic of physics, with the idea of finding a much simpler or deeper explanation than the standard model.\n*   The speaker is asked about his perfect productive day and habits."