<a href="https://colab.research.google.com/github/xjdeng/youtube-ai-query/blob/main/youtube_video_and_comment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install google-api-python-client youtube-transcript-api

In [None]:
import json
import pprint
from urllib.parse import parse_qs, urlparse

import google.generativeai as genai
from google.colab import files
from google.colab import userdata
from googleapiclient.discovery import build
from youtube_transcript_api import YouTubeTranscriptApi
# YouTube Data API client; the key is read from the Colab secret "YOUTUBE_API_KEY".
# Setup walkthrough: https://www.youtube.com/watch?v=c2niAOP82h4
api_key = userdata.get("YOUTUBE_API_KEY")
youtube = build("youtube", "v3", developerKey=api_key)

# Gemini client; the key is read from the Colab secret "GOOGLE_API_KEY".
# Setup walkthrough: https://www.youtube.com/watch?v=S1elvCs1gyI
GOOGLE_API_KEY = userdata.get("GOOGLE_API_KEY")
genai.configure(api_key=GOOGLE_API_KEY)
model = genai.GenerativeModel("gemini-1.5-flash-latest")

In [None]:
# Largest page size requested from commentThreads.list; the YouTube Data API
# documents the acceptable range of `maxResults` as 1 to 100.
_COMMENTS_PAGE_LIMIT = 100


def _extract_video_id(video_url):
    """Return the video ID contained in a YouTube URL.

    Handles watch URLs with the ``v`` parameter anywhere in the query string,
    ``youtu.be/<id>`` short links, and ``/shorts/<id>``, ``/embed/<id>`` and
    ``/live/<id>`` paths. Any other string containing ``v=`` falls back to a
    plain text split.

    Raises:
        ValueError: if no video ID can be found in ``video_url``.
    """
    parsed = urlparse(video_url.strip())

    # Standard watch URL: read `v` from the query string regardless of its
    # position among the other parameters.
    from_query = parse_qs(parsed.query).get("v")
    if from_query:
        return from_query[0]

    path_parts = [part for part in parsed.path.split("/") if part]
    host = (parsed.hostname or "").lower()
    if host == "youtu.be" and path_parts:
        return path_parts[0]
    if len(path_parts) >= 2 and path_parts[0] in ("shorts", "embed", "live"):
        return path_parts[1]

    # Fallback for strings that are not well-formed URLs (e.g. a bare "v=<id>").
    if "v=" in video_url:
        candidate = video_url.split("v=", 1)[1].split("&")[0]
        if candidate:
            return candidate

    raise ValueError(f"Could not find a video ID in URL: {video_url!r}")


def get_comments(video_url, maxresults=1000):
    """Fetch top-level comments of a video, ordered by relevance.

    Results are requested page by page (at most ``_COMMENTS_PAGE_LIMIT`` per
    request) and ``nextPageToken`` is followed until ``maxresults`` comments
    have been collected or no further page is reported.

    Args:
        video_url: URL of the YouTube video.
        maxresults: Maximum number of comments to return.

    Returns:
        A list of ``{"comment": <textDisplay>, "rating": <likeCount>}`` dicts.
        ``rating`` is 0 when the API response has no ``likeCount``.

    Raises:
        ValueError: if no video ID can be found in ``video_url``.
    """
    video_id = _extract_video_id(video_url)

    comments = []
    page_token = None
    while len(comments) < maxresults:
        params = {
            "part": "snippet",
            "videoId": video_id,
            # Never ask for more than is still needed, nor more than one page holds.
            "maxResults": min(_COMMENTS_PAGE_LIMIT, maxresults - len(comments)),
            "order": "relevance",
        }
        if page_token:
            params["pageToken"] = page_token

        response = youtube.commentThreads().list(**params).execute()
        items = response.get("items", [])
        for item in items:
            snippet = item["snippet"]["topLevelComment"]["snippet"]
            comments.append(
                {
                    "comment": snippet["textDisplay"],
                    "rating": snippet.get("likeCount", 0),
                }
            )

        page_token = response.get("nextPageToken")
        # Stop on the last page; the empty-page check guards against looping
        # forever if a token is returned without any items.
        if not items or not page_token:
            break

    return comments[:maxresults]


def get_transcript(video_url):
    """Fetch the transcript entries of a video.

    Uses the static ``YouTubeTranscriptApi.get_transcript`` when the installed
    youtube-transcript-api exposes it; otherwise uses the instance API
    (``YouTubeTranscriptApi().fetch(...).to_raw_data()``).

    Args:
        video_url: URL of the YouTube video.

    Returns:
        A list of transcript entries. If fetching fails, the error is printed
        and an empty list is returned (best effort).

    Raises:
        ValueError: if no video ID can be found in ``video_url``.
    """
    video_id = _extract_video_id(video_url)

    entries = []
    try:
        if hasattr(YouTubeTranscriptApi, "get_transcript"):
            entries = list(YouTubeTranscriptApi.get_transcript(video_id))
        else:
            entries = list(YouTubeTranscriptApi().fetch(video_id).to_raw_data())
    except Exception as e:
        # Deliberate best effort: a missing transcript must not abort the caller.
        print("Error:", e)
    return entries

def pipeline(video_url, maxresults=1000):
    """Gather the transcript and the comments of one video.

    Args:
        video_url: URL of the YouTube video.
        maxresults: Forwarded to ``get_comments``.

    Returns:
        A dict with the keys ``"transcript"`` (result of ``get_transcript``)
        and ``"comments"`` (result of ``get_comments``).
    """
    # Dict values are evaluated top to bottom, so the transcript is fetched
    # before the comments.
    return {
        "transcript": get_transcript(video_url),
        "comments": get_comments(video_url, maxresults),
    }

def query_video(query, video_url, maxresults=1000):
    """Ask the Gemini model a question about a video.

    The prompt contains the question together with the dict returned by
    ``pipeline`` (transcript and comments) rendered through string formatting.

    Args:
        query: The question to ask.
        video_url: URL of the YouTube video.
        maxresults: Forwarded to ``pipeline``.

    Returns:
        The ``text`` attribute of the model's response.
    """
    video_context = pipeline(video_url, maxresults)
    # The prompt text below is sent as written, including its leading blank
    # lines and two-space indentation.
    prompt = f"""

  I'd like to ask the following question on the following Youtube video, given its transcript and top comments.  Please consider the perspective of both the video and the comments and give more weight to comments with higher ratings.

  Here's the question:
  ---
  {query}
  ---

  Do not download data from Internet, instead, formulate your answer using the following Youtube transcript and top comments:
  ---
  {video_context}
  ---

  """
    return model.generate_content(prompt).text

In [None]:
# Example run: the question to ask and the video to ask it about.
query = "Compare and contrast the perspective of the video with its top comments."
url = "https://www.youtube.com/watch?v=iGFqfTCL2fs"

# Keep the model's answer in a variable, then pretty-print it.
answer = query_video(query, url)
pprint.pprint(answer)

In [None]:
# Fetch the data before opening the file so a failed fetch does not leave an
# empty JSON file behind.
video_data = pipeline(url)

with open("youtube_video_and_comments.json", "w") as f:
    json.dump(video_data, f)

# Start the download only after the `with` block has closed the file, so that
# everything json.dump wrote has been flushed to disk.
files.download("youtube_video_and_comments.json")