<a href="https://colab.research.google.com/github/vkrisvasan/llamaKV/blob/main/llamaindexYouTubeVideoCommentResponse.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# This notebook fetches comments from a YouTube video,
# loads the video's transcript, creates a vector index from the transcript,
# and then uses a large language model to respond to selected comments.
# To create a YOUTUBE_API_KEY: in the Google Developers Console, create a new project > Explore & Enable APIs >
# navigate to YouTube Data API v3 under YouTube APIs > Enable the API >
# Create a credential > note the API key.

# Install required packages
!pip install llama-index llama-index-llms-groq groq llama-index-embeddings-huggingface llama-index-readers-youtube-transcript -q

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/106.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.3/106.3 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/76.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.4/76.4 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m26.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m53.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m55.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m

In [14]:
import os
from googleapiclient.discovery import build
from llama_index.llms.groq import Groq
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.readers.youtube_transcript import YoutubeTranscriptReader
from llama_index.core import (VectorStoreIndex, StorageContext, load_index_from_storage,Settings)


In [5]:
# Import os and getpass for handling credentials
import os
import getpass
# Ask interactively for every API key that is not already set in the environment
credential_names = ["GROQ_API_KEY", "YOUTUBE_API_KEY"]
for credential in credential_names:
  if credential in os.environ:
    continue  # already provided, nothing to ask for
  os.environ[credential] = getpass.getpass(f"Provide your...{credential}")

Provide your...GROQ_API_KEY··········
Provide your...YOUTUBE_API_KEY··········


In [7]:
# Configuration: the video to analyse and the API key taken from the environment
VIDEO_ID = "Kbk9BiPhm7o"  # Replace with your actual video ID
YOUTUBE_API_KEY = os.environ['YOUTUBE_API_KEY']

# Step 1: build the YouTube Data API v3 client that get_comments() calls
youtube = build(serviceName='youtube', version='v3', developerKey=YOUTUBE_API_KEY)

In [8]:
def get_comments(video_id, max_results=100):
    """Return the text of up to ``max_results`` top-level comments of a video.

    Uses the module-level ``youtube`` API client. The commentThreads.list
    endpoint accepts at most 100 results per request, so larger values of
    ``max_results`` are served by following ``nextPageToken`` across pages.

    Args:
        video_id: ID of the YouTube video whose comment threads are listed.
        max_results: Maximum number of comments to return. Values <= 0
            return an empty list without calling the API.

    Returns:
        A list of plain-text comment strings, in the order the API returned
        them, containing at most ``max_results`` entries.
    """
    page_limit = 100  # largest maxResults value accepted per request
    comments = []
    page_token = None

    while len(comments) < max_results:
        request_args = {
            "part": "snippet",
            "videoId": video_id,
            # Never ask for more than is still needed, nor more than one page allows
            "maxResults": min(page_limit, max_results - len(comments)),
            "textFormat": "plainText",
        }
        if page_token:
            request_args["pageToken"] = page_token

        response = youtube.commentThreads().list(**request_args).execute()
        items = response.get('items', [])
        comments.extend(
            item['snippet']['topLevelComment']['snippet']['textDisplay']
            for item in items
        )

        # Stop when the API reports no further page (or returned an empty page)
        page_token = response.get('nextPageToken')
        if not page_token or not items:
            break

    return comments[:max_results]

In [9]:
# Fetch the top-level comments of VIDEO_ID (get_comments defaults to max_results=100)
# and report how many were returned.
comments = get_comments(VIDEO_ID)
print(f"Fetched {len(comments)} comments")

Fetched 100 comments


In [18]:
# --- Language model: Groq "llama-3.1-8b-instant", registered as the default LLM ---
llm = Groq(model="llama-3.1-8b-instant", api_key=os.environ["GROQ_API_KEY"])
Settings.llm = llm
print("LLM initialized")

# --- Embeddings: HuggingFace all-MiniLM-L6-v2, registered as the default embedder ---
embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
Settings.embed_model = embed_model
print("Embedding model initialized")

# --- Transcript: load the video's transcript as documents ---
youtube_link = f"https://www.youtube.com/watch?v={VIDEO_ID}"
loader = YoutubeTranscriptReader()
documents = loader.load_data(ytlinks=[youtube_link])
print("Documents loaded")

# --- Index: embed the transcript documents into a vector index ---
index = VectorStoreIndex.from_documents(documents, embed_model=embed_model)
print("Index created")



LLM initialized
Embedding model initialized
Documents loaded
Index created


In [19]:
# Persist the index to disk so it can be reloaded without re-embedding the transcript
persist_dir = "./youtube_index_storage"
index.storage_context.persist(persist_dir=persist_dir)
print("Index persisted")

# Reload the index from the persisted files
storage_context = StorageContext.from_defaults(persist_dir=persist_dir)
index = load_index_from_storage(storage_context=storage_context)
print("Index loaded from storage")

# Create a query engine over the reloaded index, answering with the configured LLM
query_engine = index.as_query_engine(llm=llm)
print("Query engine created")

Index persisted
Index loaded from storage
Query engine created


In [27]:

# Respond to a window of the fetched comments, selected by position in `comments`.
# NOTE(review): the original note said position 0 should be excluded because it is
# metadata about the video rather than a viewer comment — confirm against the data.
FIRST_COMMENT_INDEX = 27       # position of the first comment to answer
NUM_COMMENTS_TO_ANSWER = 5     # how many consecutive comments to answer
top_comments = comments[FIRST_COMMENT_INDEX:FIRST_COMMENT_INDEX + NUM_COMMENTS_TO_ANSWER]

# Use each comment as the query against the transcript index and print the answer
for comment in top_comments:
    response = query_engine.query(comment)
    print(f"**YouTube Comment: {comment}\n")
    print(f"**Language Model Response: {response}\n")


**YouTube Comment: 1:21:52 Musk claiming to be an alien is pure troll comment that an unreasonable amount of people will now believe

**Langauge Model Response: It's interesting to consider how some statements can be perceived as having a significant impact on people's beliefs, even if they're not meant to be taken literally. The idea that a well-known figure might make a comment that seems outlandish, only to have some people take it seriously, raises questions about the nature of trust and the spread of information.

**YouTube Comment: It's so much better of the camera didn't keep on panning to each subject. Both of you should be in the scene at the same time. Way better. And a tv in the middle.

**Langauge Model Response: It would be more engaging to have a conversation with both participants in the same frame, allowing for a more dynamic and interactive discussion. A TV in the middle could also be a great way to display visual aids or demonstrations, making the conversation more en