# YouTube Comments Processing Notebook

This notebook processes YouTube comments with the [text-embedding-004](https://ai.google.dev/gemini-api/docs/embeddings) model by:
- Encoding each comment as an embedding vector
- Reducing the comment encodings to a 2-dimensional space (the embeddings are requested with `output_dimensionality=2`)

The processed data will be used for clustering analysis.

## Package Installation
Run this cell to install required packages.

In [None]:
%%capture
# %%capture suppresses the installer output of this cell.
# Data handling, progress bars, and .env loading.
%pip install pandas numpy tqdm python-dotenv
# Google Gen AI SDK (provides `from google import genai`); -q quiet, -U upgrade to latest.
%pip install -q -U google-genai

## Imports and Setup


In [None]:
import pandas as pd
import numpy as np
from dotenv import load_dotenv
from google import genai
import os
from tqdm import tqdm
import time
import ast


## Google API Client Setup
Initialize the Google Gemini API client with your API key, which is read from the `GEMINI_API_KEY` variable (loaded from `.env.local`).

In [None]:
# NOTE(review): env_path is not referenced by any visible cell; it is kept so
# the name stays defined for anything that may rely on it.
env_path = os.path.abspath("sm-insights-next/.env.local")
# Relative path: resolved against the kernel's current working directory.
local_path = "../../sm-insights-next/.env.local"
load_dotenv(dotenv_path=local_path)
gemini_api_key = os.getenv("GEMINI_API_KEY")

# Surface a missing key right here instead of letting it show up later as an
# opaque client/API error. Only warn (do not raise) so any alternative
# credential configuration the SDK supports keeps working.
if not gemini_api_key:
    print(
        f"Warning: GEMINI_API_KEY is not set (checked the environment and {local_path}); "
        "embedding requests may fail."
    )

client = genai.Client(api_key=gemini_api_key)

## Utility Functions

In [None]:
def add_embedding_dimensions(df, all_embeddings):
    """Add one ``embed_dim_<i>`` column per embedding dimension to ``df`` in place.

    Args:
        df: DataFrame to extend; it is modified in place. ``all_embeddings``
            must have one entry per row of ``df``.
        all_embeddings: Sequence whose entries are either an indexable
            embedding vector or ``None`` (for rows whose embedding failed).

    Returns:
        The same ``df`` object. Rows whose embedding is ``None`` receive
        ``None`` in every ``embed_dim_<i>`` column. If there is no non-None
        embedding at all, ``df`` is returned unchanged.
    """
    # Take the dimensionality from the first *successful* embedding. Looking
    # only at index 0 would drop every column whenever the first entry failed,
    # even if later entries succeeded.
    first_valid = next((emb for emb in all_embeddings if emb is not None), None)
    if first_valid is None:
        return df

    for dim in range(len(first_valid)):
        df[f'embed_dim_{dim}'] = [emb[dim] if emb is not None else None for emb in all_embeddings]

    return df

In [None]:
def add_embeddings_to_dataframe(df, model_name="models/text-embedding-004", task_type="CLUSTERING", output_dimensionality=2, batch_size=100):
    """Embed ``df['text']`` in batches and add the result as ``embed_dim_<i>`` columns.

    Uses the module-level ``client`` to call ``client.models.embed_content``
    once per batch. A batch that fails is reported with ``print`` and its rows
    get ``None`` embeddings, so one bad batch does not abort the whole run.

    Args:
        df: DataFrame with a ``text`` column; modified in place.
        model_name: Embedding model identifier passed to the API.
        task_type: ``task_type`` value passed in the request config.
        output_dimensionality: ``output_dimensionality`` value passed in the
            request config.
        batch_size: Number of texts sent per request.

    Returns:
        The same ``df`` object, extended via ``add_embedding_dimensions``.
    """
    print("Generating embeddings...")
    texts = df['text'].tolist()
    all_embeddings = []

    for i in tqdm(range(0, len(texts), batch_size), desc="Processing embeddings batches", unit="batch"):
        batch_texts = texts[i:i + batch_size]
        try:
            embeddings_response = client.models.embed_content(
                model=model_name,
                contents=batch_texts,
                config={
                    'task_type': task_type,
                    'output_dimensionality': output_dimensionality,
                },
            )
            batch_embeddings = [item.values for item in (embeddings_response.embeddings or [])]
            # Each text must get exactly one embedding. Otherwise every later
            # row would be paired with the wrong vector, or the final column
            # assignment would fail only after all batches have been processed.
            if len(batch_embeddings) != len(batch_texts):
                raise ValueError(
                    f"expected {len(batch_texts)} embeddings, got {len(batch_embeddings)}"
                )
            all_embeddings.extend(batch_embeddings)

        except Exception as e:
            # Best effort: keep row alignment by padding the failed batch with None.
            print(f"Error during embedding batch {i // batch_size}: {e}")
            all_embeddings.extend([None] * len(batch_texts))

    add_embedding_dimensions(df, all_embeddings)

    return df

In [None]:
def validate_dataframe(df):
    """Ensure ``df`` has every column the notebook relies on.

    Checks for ``text``, ``author``, ``likes`` and ``replyCount`` in
    ``df.columns``.

    Raises:
        ValueError: If one or more of those columns are absent; the message
            lists them in the order given above.
    """
    missing_columns = []
    for column in ('text', 'author', 'likes', 'replyCount'):
        if column not in df.columns:
            missing_columns.append(column)

    if not missing_columns:
        return

    raise ValueError(f"Missing required columns: {', '.join(missing_columns)}")

In [None]:
def load_and_validate_dataset(input_file):
    """Read a CSV of comments and check that it has the required columns.

    Args:
        input_file: Path (or anything ``pd.read_csv`` accepts) of the CSV file.

    Returns:
        The loaded DataFrame.

    Raises:
        ValueError: Propagated from ``validate_dataframe`` when required
            columns are missing.
    """
    print(f"Loading data from {input_file}...")
    comments = pd.read_csv(input_file)
    # Fail fast before any embedding work is started on a malformed file.
    validate_dataframe(comments)
    return comments

## Main Process

In [None]:
# Base name of the dataset; reused below to build the output file name.
dataset_name = "honey_scam_500"
# Relative path: resolved against the kernel's current working directory.
input_file = f"../datasets/youtube-comments/{dataset_name}.csv"
# Raises ValueError if the CSV lacks any of the required columns.
df = load_and_validate_dataset(input_file)

In [None]:
# Calls the embedding API once per batch; the function also mutates `df` in place,
# so the reassignment keeps the same object bound to `df`.
df =  add_embeddings_to_dataframe(df)
# Preview the first rows, including the new embed_dim_<i> columns.
display(df.head())

In [None]:
# NOTE(review): the input is read from "../datasets/..." while the output goes
# under "./datasets/..." — confirm this location (and the "with-assumptions"
# folder name) is intended.
output_file= f"./datasets/with-assumptions/{dataset_name}.csv"
# Create the target directory if it does not exist, so the save does not fail
# after the embedding run has already completed.
os.makedirs(os.path.dirname(output_file), exist_ok=True)
df.to_csv(output_file, index=False)
