In [30]:
import httpx
import pandas as pd
import os
from dotenv import load_dotenv

BASE_URL = "https://www.googleapis.com/youtube/v3/commentThreads"


async def fetch_all_comments(video_id):
     # Load environment variables from .env file
    load_dotenv()

    # Now you can access your environment variables using os.getenv()
    YT_API_KEY = os.getenv("YOUTUBE_API_KEY")

    comments = []
    next_page_token = None

    async with httpx.AsyncClient() as client:
        while True:
            params = {
                "part": "snippet",
                "videoId": video_id,
                "maxResults": 100,
                "key": YT_API_KEY,
                "pageToken": next_page_token
            }
            response = await client.get(BASE_URL, params=params)
            if response.status_code == 200:
                data = response.json()
                for item in data.get("items", []):
                    top_comment = item["snippet"]["topLevelComment"]["snippet"]
                    comments.append({
                        "author": top_comment["authorDisplayName"],
                        "text": top_comment["textOriginal"],
                        "likeCount": top_comment.get("likeCount", 0),
                        "publishedAt": top_comment["publishedAt"]
                    })
                next_page_token = data.get("nextPageToken")
                if not next_page_token:
                    break
            else:
                raise Exception(f"Failed to fetch comments: {response.status_code} - {response.text}")

    return comments


In [2]:
async def test_fetch_all_comments():
    video_id = "jAoIOIjMZM0"  # Replace with an actual YouTube video ID

    # Fetch the comments
    comments = await fetch_all_comments(video_id)

    # Sort comments by number of likes (in descending order)
    comments_sorted = sorted(comments, key=lambda x: x['likeCount'], reverse=True)

    # Print the number of comments
    print(f"Total number of comments fetched: {len(comments)}")

    # Print some of the fetched comments (for example, the first 5 most liked ones)
    print("First 5 most liked comments:")
    for i, comment in enumerate(comments_sorted[:5]):
        print(f"{i+1}. Author: {comment['author']}")
        print(f"   Comment: {comment['text']}")
        print(f"   Likes: {comment['likeCount']}")
        print(f"   Published at: {comment['publishedAt']}")
        print("-" * 40)


load_dotenv()  # Make sure to load environment variables
await test_fetch_all_comments() 


Total number of comments fetched: 500
First 5 most liked comments:
1. Author: @DedicatedSpirit8
   Comment: I love Charlie&#39;s passion<br>Life is purposeless.<br>To die sith a full bank account and regreets is waaay worse than dying penniless but fulfilled.
   Likes: 121
   Published at: 2025-01-19T10:06:30Z
----------------------------------------
2. Author: @IcyPandazzz
   Comment: My friend, flying is BY FAR the safest mode of transportation that humans have. You&#39;re more likely to get eaten by a shark than suffer a single injury during flight. Being eaten by a shark is so rare that you&#39;re more likely to be struck by lightning... TWICE in your lifetime. To add a cherry on top, there is a 63% chance that you will get into at least 3 car accidents by the age of 55. Flight technology is also so advanced that the planes literally fly themselves PERFECTLY! The only reason to have a pilot anymore for the bigger travel companies is to make sure that the plane&#39;s computers are a

In [None]:
from openai import OpenAI


# Summarize comments function
async def summarize_comments(video_id):
    # Fetch comments
    comments = await fetch_all_comments(video_id)

    # Sort comments by likes
    comments_sorted = sorted(comments, key=lambda x: x['likeCount'], reverse=True)

    # Prepare prompt with weighted comments
    prompt = "Summarize the following YouTube comments. Comments with more likes are more important:\n\n"
    for comment in comments_sorted[:50]:  # Include top 50 comments for the prompt
        prompt += f"- [{comment['likeCount']} likes] {comment['text']}\n"

    # Initialize OpenAI API
    load_dotenv()
    client = OpenAI(api_key=os.getenv("THREAD_OPENAI_API_KEY"))

    # Summarize using GPT
    response = client.chat.completions.create(
        model="gpt-4.1-mini",
        messages=[
            {"role": "system", "content": "You are an expert summarizer."},
            {"role": "user", "content": prompt}
        ]
    )

    # Print the summary
    summary = response.choices[0].message.content
    return summary, comments_sorted

summary, comments = await summarize_comments("jAoIOIjMZM0")
print("\nSummary of Comments:")
print(summary)


Summary of Comments:
The YouTube comments mainly focus on the phrase "talk this" and the $4.2 million that the YouTuber, Charlie, spent on his e-sports team. Viewers found the fact that Charlie could finally "talk this" amusing, and it became a running joke. Additionally, they admired Charlie's financial commitment to e-sports, considering his massive expenditure a casual show of wealth. They also appreciate his passionate dedication to his team, despite facing economic losses. Lastly, comments indicate some excitement over a typo on the video title and Charlie's video content related to the e-sports scene. Some viewers also expressed their appreciation for the improved lighting in the display case and the performance of Charlie's team in different games. Overall, the general sentiment is supportive and amused by Charlie's antics.


In [None]:
from openai import OpenAI
import numpy as np
import re
import tiktoken
import os

client = OpenAI(api_key=os.getenv("THREAD_OPENAI_API_KEY"))

def clean_text(text):
    return re.sub(r"[^\x00-\x7F]+", "", text)  # Strip non-ASCII characters

def is_valid_comment(text, encoder, max_tokens=8192, max_chars=10000):
    if not isinstance(text, str):
        return False
    text = text.strip()
    if not text:
        return False
    if len(text) > max_chars:
        return False
    try:
        text.encode("utf-8")
    except UnicodeEncodeError:
        return False
    if len(encoder.encode(text)) > max_tokens:
        return False
    return True

def vectorize_comments(comments):
    encoder = tiktoken.encoding_for_model("text-embedding-3-small")

    # Preprocess and validate
    texts = []
    for comment in comments:
        raw = comment.get("text", "")
        cleaned = clean_text(str(raw)).strip()
        if is_valid_comment(cleaned, encoder):
            texts.append(cleaned)
        else:
            print(f"‚ùå Skipped invalid comment: {repr(cleaned)[:100]}")

    if not texts:
        raise ValueError("No valid comment text to embed.")

    print("Total valid texts:", len(texts))

    # Batch constraints
    MAX_TOKENS_PER_TEXT = 8192
    MAX_TOKENS_PER_BATCH = 300_000
    MAX_TEXTS_PER_BATCH = 2048

    current_batch = []
    current_token_count = 0
    embeddings = []

    for text in texts:
        tokens = len(encoder.encode(text))

        if (current_token_count + tokens > MAX_TOKENS_PER_BATCH) or (len(current_batch) >= MAX_TEXTS_PER_BATCH):
            print(f"üì§ Sending batch of {len(current_batch)} texts ({current_token_count} tokens)")
            try:
                response = client.embeddings.create(
                    input=current_batch,
                    model="text-embedding-3-small"
                )
                embeddings.extend([np.array(item.embedding) for item in response.data])
            except Exception as e:
                print("‚ùå Batch failed. Dumping inputs:")
                for i, item in enumerate(current_batch):
                    print(f"[{i}] ({len(item)} chars): {repr(item[:80])}")
                raise e
            current_batch = []
            current_token_count = 0

        current_batch.append(text)
        current_token_count += tokens

    # Final batch
    if current_batch:
        print(f"üì§ Sending final batch of {len(current_batch)} texts ({current_token_count} tokens)")
        response = client.embeddings.create(
            input=current_batch,
            model="text-embedding-3-small"
        )
        embeddings.extend([np.array(item.embedding) for item in response.data])

    print(f"‚úÖ Total embeddings generated: {len(embeddings)}")
    return embeddings


def search_similar_comments(question, embeddings, comments, top_k=5):
    # Embed the question
    response = client.embeddings.create(
        input=[question],
        model="text-embedding-3-small"
    )
    question_vector = np.array(response.data[0].embedding)

    # Normalize vectors for cosine similarity
    q_norm = question_vector / np.linalg.norm(question_vector)
    comment_vectors = [e / np.linalg.norm(e) for e in embeddings]

    # Compute cosine similarities
    similarities = [(i, np.dot(q_norm, e)) for i, e in enumerate(comment_vectors)]
    similarities.sort(key=lambda x: x[1], reverse=True)

    # Return top_k matching comments
    top_indices = [i for i, _ in similarities[:top_k]]
    return [comments[i] for i in top_indices]


summary, comments = await summarize_comments("jAoIOIjMZM0")

embeddings = vectorize_comments(comments)



‚ùå Skipped invalid comment: ''
‚ùå Skipped invalid comment: ''
‚ùå Skipped invalid comment: ''
‚ùå Skipped invalid comment: ''
‚ùå Skipped invalid comment: ''
‚ùå Skipped invalid comment: ''
‚ùå Skipped invalid comment: ''
‚ùå Skipped invalid comment: ''
‚ùå Skipped invalid comment: ''
‚ùå Skipped invalid comment: ''
‚ùå Skipped invalid comment: ''
‚ùå Skipped invalid comment: ''
‚ùå Skipped invalid comment: ''
‚ùå Skipped invalid comment: ''
‚ùå Skipped invalid comment: ''
‚ùå Skipped invalid comment: ''
‚ùå Skipped invalid comment: ''
‚ùå Skipped invalid comment: ''
‚ùå Skipped invalid comment: ''
‚ùå Skipped invalid comment: ''
‚ùå Skipped invalid comment: ''
‚ùå Skipped invalid comment: ''
Total valid texts: 6857
üì§ Sending batch of 2048 texts (43208 tokens)
üì§ Sending batch of 2048 texts (37447 tokens)
üì§ Sending batch of 2048 texts (30087 tokens)
üì§ Sending final batch of 713 texts (5567 tokens)
‚úÖ Total embeddings generated: 6857


In [None]:
summary, comments = await summarize_comments("jAoIOIjMZM0")

top_comments = comments[:500]

embeddings = vectorize_comments(top_comments)

question = "How much money did he spend or lose on Moist Esports?"

similar_comments = search_similar_comments(question, embeddings, top_comments)

# Print out the similar comments
print("\nTop 5 Similar Comments:")
for i, comment in enumerate(similar_comments):
    print(f"{i+1}. Author: {comment['author']}")
    print(f"   Comment: {comment['text']}")
    print(f"   Likes: {comment['likeCount']}")
    print(f"   Published at: {comment['publishedAt']}")
    print("-" * 40)

‚ùå Skipped invalid comment: ''
Total valid texts: 499
üì§ Sending final batch of 499 texts (9483 tokens)
‚úÖ Total embeddings generated: 499

Top 5 Similar Comments:
1. Author: @sadganonkiller
   Comment: the money is in the venues. think having your own stadium, for your team, like football teams do, would really open more doors
   Likes: 0
   Published at: 2025-01-19T02:39:40Z
----------------------------------------
2. Author: @xxsodomaruxx
   Comment: "The eSports organization formally known as Moist"
   Likes: 0
   Published at: 2025-01-19T23:34:34Z
----------------------------------------
3. Author: @boostedn
   Comment: Start a CS team üí™
   Likes: 1
   Published at: 2025-01-19T16:17:01Z
----------------------------------------
4. Author: @srdjan455
   Comment: Ok but why is eSports such a big money sink?
   Likes: 0
   Published at: 2025-01-21T15:33:27Z
----------------------------------------
5. Author: @dokgohyuk3753
   Comment: Genuine question, how much money is the com

In [None]:
from openai import OpenAI
client = OpenAI(api_key=os.getenv("THREAD_OPENAI_API_KEY"))

def generate_answer(question, relevant_comments, summary):
    relevant_text = "\n".join([f"- {comment['text']}" for comment in relevant_comments])

    prompt = f"""
    Video Summary:
    {summary}

    Related Comments:
    {relevant_text}

    Question: {question}

    Based on the summary and related comments, please provide an answer.
    """

    response = client.chat.completions.create(
        model="gpt-4.1-mini",
        messages=[{"role": "user", "content": prompt}]
    )
    return response.choices[0].message.content

summary, comments = await summarize_comments("jAoIOIjMZM0")

top_comments = comments[:500]

embeddings = vectorize_comments(top_comments)

question = "How much money did he spend or lose on Moist Esports?"

similar_comments = search_similar_comments(question, embeddings, top_comments)

response = generate_answer(question, similar_comments, summary)

print(response)

‚ùå Skipped invalid comment: ''
‚ùå Skipped invalid comment: ''
‚ùå Skipped invalid comment: ''
‚ùå Skipped invalid comment: ''
‚ùå Skipped invalid comment: ''
‚ùå Skipped invalid comment: ''
‚ùå Skipped invalid comment: ''
‚ùå Skipped invalid comment: ''
‚ùå Skipped invalid comment: ''
‚ùå Skipped invalid comment: ''
‚ùå Skipped invalid comment: ''
‚ùå Skipped invalid comment: ''
‚ùå Skipped invalid comment: ''
‚ùå Skipped invalid comment: ''
‚ùå Skipped invalid comment: ''
‚ùå Skipped invalid comment: ''
‚ùå Skipped invalid comment: ''
‚ùå Skipped invalid comment: ''
‚ùå Skipped invalid comment: ''
‚ùå Skipped invalid comment: ''
‚ùå Skipped invalid comment: ''
‚ùå Skipped invalid comment: ''
‚ùå Skipped invalid comment: ''
‚ùå Skipped invalid comment: ''
Total valid texts: 6855
üì§ Sending batch of 2048 texts (39570 tokens)
üì§ Sending batch of 2048 texts (34009 tokens)
üì§ Sending batch of 2048 texts (27824 tokens)
üì§ Sending final batch of 711 texts (4885 tokens)
‚úÖ Total em