In [244]:
from openai import OpenAI
from sentence_transformers import SentenceTransformer
import json
import os
import pickle
import numpy as np
from collections import defaultdict
import faiss
import regex as re
import pandas as pd

In [245]:
# Load the sentence-embedding model once for the whole notebook. It is the
# object handed to process_video_comments / compute_and_store_embeddings,
# which call `.encode(comment)` on it.
embed_model = SentenceTransformer("Snowflake/snowflake-arctic-embed-m-v1.5")
print("Setup complete!")

Setup complete!


In [None]:
# Read the secret from the OPENAI_API_KEY environment variable so it never has
# to be written into (and committed with) the notebook. The placeholder is
# kept only as a fallback for backward compatibility: either export the
# variable or replace the placeholder before running any classification cell.
key = os.environ.get("OPENAI_API_KEY", "*insert your key here*")

# Module-level client used by classify_clusters and
# classify_comments_individually.
client = OpenAI(
    api_key=key
)

In [247]:
def filter_comments(video_comments):
    """Return a new mapping that keeps only the valid comments of each video.

    Args:
        video_comments: Mapping of video id -> iterable of comments.

    Returns:
        A dict with the same keys (in the same order), each mapped to the
        list of comments for which ``is_valid_comment`` returned a truthy
        value. Videos whose comments are all rejected map to an empty list.
    """
    return {
        vid: [text for text in texts if is_valid_comment(text)]
        for vid, texts in video_comments.items()
    }

def is_valid_comment(comment):
    """Decide whether a comment is worth keeping for analysis.

    A comment is rejected when any of the following holds:

    1. It is not a string (e.g. ``None`` or a NaN from a missing cell).
    2. It has two or fewer whitespace-separated words (this also covers
       empty / whitespace-only text).
    3. It contains no letters at all, i.e. it is made up only of digits,
       punctuation, symbols, underscores and whitespace.
    4. It contains a run of five or more identical consecutive characters
       (e.g. "sooooo").
    5. It contains a link ("http://", "https://", "www.") or one of the
       common spam phrases, matched case-insensitively.

    Args:
        comment: The raw comment text.

    Returns:
        True if the comment passes every check, False otherwise.
    """
    # Non-string values cannot be stripped/split; treat them as invalid
    # instead of raising.
    if not isinstance(comment, str):
        return False

    comment = comment.strip()

    # Condition 1: Non-empty and >2 words
    if len(comment.split()) <= 2:
        return False

    # Condition 2: Not purely numbers or special characters.
    # Whitespace has to be allowed inside the pattern: condition 1 already
    # guarantees the text contains at least two separators, so a pattern
    # that excludes whitespace could never match here.
    if re.fullmatch(r"[\W\d_]+", comment):
        return False

    # Condition 3: No excessive repeated characters. `(.)\1{4,}` is one
    # character followed by at least four copies of itself: five or more in
    # a row.
    if re.search(r"(.)\1{4,}", comment):
        return False

    # Condition 4: Not spam-like (links or common spam phrases)
    if re.search(r"(https?:\/\/|www\.)", comment):  # Links
        return False
    if re.search(r"(buy now|subscribe|click here|free money)", comment, re.IGNORECASE):  # Common spam phrases
        return False

    return True


In [248]:
def compute_and_store_embeddings(comment_df, embed_model, save_path="embeddings_progress.pkl"):
    """Embed every comment in ``comment_df`` and cache the result on disk.

    Previously computed embeddings are loaded from ``save_path`` (if it
    exists) and reused, so only comments that are not yet in the cache are
    sent to the model. The cache is written back whenever new embeddings
    were produced, including when the loop is cut short (for example by a
    keyboard interrupt), so partial progress is never lost.

    Args:
        comment_df: DataFrame with a "comment" column.
        embed_model: Object exposing ``encode(text)``; its return value is
            stored as the embedding.
        save_path: Pickle file used as the embedding cache.

    Returns:
        Dict mapping comment text -> embedding, containing both the cached
        and the newly computed entries.
    """
    # Load existing progress if available.
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file. Only point save_path at a file this notebook produced itself.
    if os.path.exists(save_path):
        with open(save_path, "rb") as f:
            embeddings = pickle.load(f)
        print(f"Loaded progress from {save_path}")
    else:
        embeddings = {}

    total_comments = len(comment_df)
    processed_comments = len(embeddings)
    print(f"Total comments to process: {total_comments}, already processed: {processed_comments}")

    # Flag to track if any new embeddings were added
    new_embeddings_computed = False

    try:
        for comment in comment_df["comment"]:
            if comment in embeddings:
                continue
            try:
                embeddings[comment] = embed_model.encode(comment)
            except Exception as e:
                # Best effort: report the failure and move on to the next comment.
                print(f"Error processing '{comment}': {e}")
                continue

            processed_comments += 1
            new_embeddings_computed = True
            if processed_comments % 2000 == 0:
                print(f"Processed {processed_comments}/{total_comments} comments")
    finally:
        # Runs on normal completion and on interruption alike, so whatever
        # has been embedded so far is persisted.
        if new_embeddings_computed:
            # Write to a sibling file first and swap it in, so a crash while
            # dumping cannot leave a truncated cache behind.
            tmp_path = f"{save_path}.tmp"
            with open(tmp_path, "wb") as f:
                pickle.dump(embeddings, f)
            os.replace(tmp_path, save_path)
            print(f"Saved updated embeddings to {save_path}")
        else:
            print("No new embeddings were computed. Skipping save.")

    print(f"Embedding computation complete. Total embeddings: {len(embeddings)}")
    return embeddings


In [249]:
# Step 5: Updated clustering function for exact clustering with cosine similarity
def cluster_comments(video_comments, embeddings):
    """Group comments by the index entry nearest to each comment's embedding.

    Args:
        video_comments: Sequence of comments, indexable by position; entry
            ``i`` is expected to correspond to ``embeddings[i]``.
        embeddings: Array-like of embedding vectors (converted to a 2-D
            float32 array below).

    Returns:
        ``defaultdict(list)`` mapping the id returned by the index search to
        the comments assigned to it.
    """

    embeddings = np.array(embeddings).astype("float32")
    
    faiss.normalize_L2(embeddings)  # Normalizes each vector to unit length

    dim = embeddings.shape[1]
    flat_index = faiss.IndexFlatIP(dim)
    # NOTE(review): the raw vectors are added here, yet the same index is then
    # handed to clustering.train below. The grouping only makes sense if train
    # resets the index and leaves the centroids in it — confirm against the
    # faiss Clustering documentation.
    flat_index.add(embeddings)

    # Perform clustering
    # At most 377 clusters, fewer when there are fewer vectors.
    # NOTE(review): 377 is an unexplained constant, and `n_clusters=` is passed
    # by keyword here whereas cluster_all_comments passes it positionally —
    # confirm the faiss binding accepts the keyword form.
    clustering = faiss.Clustering(dim, n_clusters=min(len(embeddings), 377))
    clustering.train(embeddings, flat_index)

    # Assign comments to clusters
    # Top-1 search: for each vector, the id of its best inner-product match in
    # flat_index. `distances` is not used afterwards.
    distances, cluster_indices = flat_index.search(embeddings, 1)
    clustered_comments = defaultdict(list)
    for i, cluster_idx in enumerate(cluster_indices.flatten()):
        clustered_comments[cluster_idx].append(video_comments[i])
    
    return clustered_comments


In [250]:
def cluster_all_comments(embeddings_dict, n_clusters=25):
    """Cluster every cached comment embedding in one global pass.

    The embeddings are stacked into a float32 matrix, L2-normalised in place,
    added to a ``faiss.IndexFlatIP`` and passed together with that index to
    ``faiss.Clustering.train``. Each vector is then looked up in the index
    (top-1 search) and the comment is filed under the returned id.

    Args:
        embeddings_dict: Mapping of comment text -> embedding vector.
        n_clusters: Number of clusters requested from faiss.

    Returns:
        ``defaultdict(list)`` mapping the id returned by the search to the
        list of comment texts assigned to it.
    """
    texts = list(embeddings_dict.keys())
    vectors = np.array([embeddings_dict[text] for text in texts]).astype("float32")
    faiss.normalize_L2(vectors)

    # Build the inner-product index and run the global clustering on it.
    n_dims = vectors.shape[1]
    index = faiss.IndexFlatIP(n_dims)
    index.add(vectors)

    kmeans = faiss.Clustering(n_dims, n_clusters)
    kmeans.train(vectors, index)

    # File each comment under the id of its nearest index entry.
    _, nearest = index.search(vectors, 1)
    global_clusters = defaultdict(list)
    for text, cluster_idx in zip(texts, nearest.flatten()):
        global_clusters[cluster_idx].append(text)

    print(f"Total clusters formed: {len(global_clusters)}")
    return global_clusters


In [251]:
def save_classification(classification, save_path="classification.json"):
    """Write ``classification`` to ``save_path`` as JSON indented by 4 spaces.

    Args:
        classification: JSON-serialisable object (typically comment -> label).
        save_path: Destination file; overwritten if it already exists.
    """
    with open(save_path, "w") as out:
        out.write(json.dumps(classification, indent=4))

In [252]:
def find_centroid_representative(cluster_embeddings, comments):
    """Pick the comment whose embedding lies closest to the cluster mean.

    Args:
        cluster_embeddings: 2-D array with one embedding per row.
        comments: Sequence of comments aligned row-for-row with
            ``cluster_embeddings``.

    Returns:
        The element of ``comments`` whose embedding has the smallest
        Euclidean distance to the mean of all rows (first one on ties).
    """
    center = np.mean(cluster_embeddings, axis=0)
    offsets = cluster_embeddings - center
    return comments[np.argmin(np.linalg.norm(offsets, axis=1))]

def classify_clusters(global_clusters, embeddings_dict):
    """Label every cluster by classifying its most central comment.

    For each cluster the comment closest to the cluster centroid is sent to
    the chat-completions API together with the categorisation prompt; the
    returned label is then assigned to every comment of that cluster.

    A cluster whose API call fails is reported and skipped: its comments are
    simply absent from the result, and the remaining clusters are still
    processed.

    Args:
        global_clusters: Mapping of cluster id -> list of comment texts.
        embeddings_dict: Mapping of comment text -> embedding vector; must
            contain every comment that appears in ``global_clusters``.

    Returns:
        Dict mapping comment text -> label for all successfully classified
        clusters.
    """
    classification_results = {}
    prompt_template = """You are tasked with categorizing a given music description into one of the following categories. Each description will only belong to one category. Read the description carefully and assign it to the most appropriate category based on the following guidelines:

Music: The description focuses on anything related to the music itself. This includes:
- How the music sounds (e.g., melody, tune, beat, harmony, or rhythm).
- The vibe, energy, or mood of the music (e.g., relaxing, upbeat, groovy).
- The quality or listener’s opinion of the music (e.g., “the music is great,” “amazing beat”).

Examples:
- “It was awesome.”
- “Always bring tears to my eyes beautiful track.”
- “GODDDD!!!! What a sooothinggg beautifulll banger.”

Artist: The description mentions the artist or creator of the music and appreciates their singing quality. This includes:
- Mentions of the artist’s name or pronouns (e.g., “he,” “she,” “they”).
- Opinions, praise, or criticism directed toward the artist (e.g., “this artist is brilliant”).
- Comments directly addressed to the artist or the creator of the audio.

Examples:
- “love u sir.”
- “The depth of voice and emotion is so HEART TOUCHING. Beautifully sung by Aima Begh.”
- “️such a magical voice.”

Ambiguous: The description contains phrases, words, or content that are unclear, incomplete, or don’t make sense in the given context. This includes:
- Random letters, symbols, or gibberish.
- Words or phrases that do not convey a clear meaning.
- Non-standard terms, slang, or ambiguous references.

Examples:
- “@lm bohemia Dz 🔥.”
- “what my waat rat fat gaiaaaa🇧🇩”
- “3 like ha Bhi.”

Others: Any description that does not clearly fit into the Music, Artist, or Ambiguous categories. This includes:
- General observations not specifically about the music, artist, or ambiguous phrases.

Examples:
- “Old days are gone.”
- “Is that earbud in his ear?”
- “Only 277 views in 2 days.”

Instructions:
For each music description:
1. Read the description carefully.
2. Compare it against the definitions and examples for each category.
3. Assign the description to the most suitable category (Music, Artist, Ambiguous, Others).

Response Format:
“Category: [Insert category name]”"""

    for cluster_id, comments in global_clusters.items():
        cluster_embeddings = np.array([embeddings_dict[c] for c in comments])
        representative = find_centroid_representative(cluster_embeddings, comments)

        try:
            # Generate classification via GPT-4
            response = client.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {"role": "system", "content": prompt_template},
                    {"role": "user", "content": f"Comment: {representative}"}
                ],
                max_tokens=100
            )

            # Extract the category from the response
            label = response.choices[0].message.content.strip().replace("Category: ", "")
        except Exception as e:
            print(f"Error classifying cluster {cluster_id}: {e}")
            # No label was produced for this cluster. Skip it rather than
            # falling through to code that reads `label`, which would be
            # unbound on the first cluster or stale from a previous one.
            continue

        # Assign the category to all comments in the cluster
        for comment in comments:
            classification_results[comment] = label

        print(f"Cluster {cluster_id} classified as {label}")

    return classification_results


In [253]:
def classify_comments_individually(comment_df):
    """Classify every comment on its own via the chat-completions API.

    Each distinct comment text is sent once; repeated occurrences of the same
    text reuse the first label instead of triggering another request. A
    comment whose request fails is reported and left without a label.

    Side effect: ``comment_df`` gains (or has overwritten) an
    "individual_label" column holding the label of each row's comment, with
    missing values for comments that could not be classified. The column is
    written once after the loop, and also when the loop is interrupted, so
    labels gathered so far are not lost.

    Args:
        comment_df: DataFrame with a "comment" column. Mutated in place.

    Returns:
        Dict mapping comment text -> label for all successfully classified
        comments.
    """
    classification_results = {}
    prompt_template = """You are tasked with categorizing a given music description into one of the following categories. Each description will only belong to one category. Read the description carefully and assign it to the most appropriate category based on the following guidelines:

Music: The description focuses on anything related to the music itself. This includes:
- How the music sounds (e.g., melody, tune, beat, harmony, or rhythm).
- The vibe, energy, or mood of the music (e.g., relaxing, upbeat, groovy).
- The quality or listener’s opinion of the music (e.g., “the music is great,” “amazing beat”).

Examples:
- “It was awesome.”
- “Always bring tears to my eyes beautiful track.”
- “GODDDD!!!! What a sooothinggg beautifulll banger.”

Artist: The description mentions the artist or creator of the music and appreciates their singing quality. This includes:
- Mentions of the artist’s name or pronouns (e.g., “he,” “she,” “they”).
- Opinions, praise, or criticism directed toward the artist (e.g., “this artist is brilliant”).
- Comments directly addressed to the artist or the creator of the audio.

Examples:
- “love u sir.”
- “The depth of voice and emotion is so HEART TOUCHING. Beautifully sung by Aima Begh.”
- “️such a magical voice.”

Ambiguous: The description contains phrases, words, or content that are unclear, incomplete, or don’t make sense in the given context. This includes:
- Random letters, symbols, or gibberish.
- Words or phrases that do not convey a clear meaning.
- Non-standard terms, slang, or ambiguous references.

Examples:
- “@lm bohemia Dz 🔥.”
- “what my waat rat fat gaiaaaa🇧🇩”
- “3 like ha Bhi.”

Others: Any description that does not clearly fit into the Music, Artist, or Ambiguous categories. This includes:
- General observations not specifically about the music, artist, or ambiguous phrases.

Examples:
- “Old days are gone.”
- “Is that earbud in his ear?”
- “Only 277 views in 2 days.”

Instructions:
For each music description:
1. Read the description carefully.
2. Compare it against the definitions and examples for each category.
3. Assign the description to the most suitable category (Music, Artist, Ambiguous, Others).

Response Format:
“Category: [Insert category name]”"""

    try:
        for comment in comment_df["comment"]:
            # The result is keyed by comment text, so a repeated text can
            # only ever hold one label; asking again is a wasted request.
            if comment in classification_results:
                continue

            try:
                # Generate classification via GPT-4
                response = client.chat.completions.create(
                    model="gpt-4o",
                    messages=[
                        {"role": "system", "content": prompt_template},
                        {"role": "user", "content": f"Comment: {comment}"}
                    ],
                    max_tokens=100
                )

                # Extract the category from the response
                label = response.choices[0].message.content.strip().replace("Category: ", "")
            except Exception as e:
                print(f"Error classifying comment '{comment}': {e}")
                continue

            classification_results[comment] = label
    finally:
        # Map the labels onto the frame once, instead of re-mapping the whole
        # column after every single comment.
        comment_df["individual_label"] = comment_df["comment"].map(classification_results)

    return classification_results

In [254]:
def process_video_comments(video_df, embed_model, output_file="classified_comments5.json",
                           csv_file="classified_comments.csv", label_column="cluster_label5"):
    """Run the cluster-based pipeline: embed, cluster, classify, save.

    Steps:
        1. Compute (or load cached) embeddings for every comment.
        2. Cluster all embeddings globally.
        3. Classify each cluster through its representative comment.
        4. Write the resulting label of every comment into ``video_df``.
        5. Save the comment -> label mapping as JSON and the frame as CSV.

    Args:
        video_df: DataFrame with a "comment" column. Mutated in place: the
            ``label_column`` column is added or overwritten.
        embed_model: Embedding model forwarded to
            ``compute_and_store_embeddings``.
        output_file: Path of the JSON file receiving the comment -> label
            mapping.
        csv_file: Path of the CSV file receiving the labelled frame. The
            default is the same file the notebook's driver cell reads its
            input from, so that file is overwritten unless another path is
            given.
        label_column: Name of the column that receives the cluster labels.

    Returns:
        Dict mapping comment text -> label.
    """
    # Step 0 (filtering with filter_comments) is intentionally not applied:
    # filter_comments expects a video_id -> comments mapping, not a DataFrame.

    # Step 1: Compute embeddings and save progress
    print("Computing embeddings...")
    embeddings_dict = compute_and_store_embeddings(video_df, embed_model)

    # Step 2: Cluster all comments globally
    print("Clustering comments...")
    global_clusters = cluster_all_comments(embeddings_dict)

    # Step 3: Classify clusters
    print("Classifying clusters...")
    comment_labels = classify_clusters(global_clusters, embeddings_dict)

    # Step 4: Map cluster labels to comment dataframe
    print("Mapping cluster labels to comments...")
    video_df[label_column] = video_df["comment"].map(comment_labels)

    # Step 5: Save final results
    print("Saving results...")
    with open(output_file, "w") as f:
        json.dump(comment_labels, f, indent=2)

    video_df.to_csv(csv_file, index=False)

    print(f"Final results saved to {output_file}")

    return comment_labels


In [255]:
def process_video_comments_individually(video_df, output_file="classified_comments_individually.json",
                                        csv_file="classified_comments_new.csv"):
    """Classify each comment on its own and persist the results.

    Delegates the labelling to ``classify_comments_individually`` (which also
    writes its labels into ``video_df``), then saves the frame as CSV and the
    comment -> label mapping as JSON.

    Args:
        video_df: DataFrame with a "comment" column; passed to, and mutated
            by, ``classify_comments_individually``.
        output_file: Path of the JSON file receiving the comment -> label
            mapping.
        csv_file: Path of the CSV file receiving the labelled frame.

    Returns:
        Dict mapping comment text -> label.
    """
    comment_labels = classify_comments_individually(video_df)

    # Save the labelled frame next to the JSON mapping.
    video_df.to_csv(csv_file, index=False)

    with open(output_file, "w") as f:
        json.dump(comment_labels, f, indent=2)

    print(f"Final results saved to {output_file}")

    return comment_labels

In [256]:
# Load the comments table from disk. "classified_comments.csv" is also the
# file process_video_comments writes, so this presumably picks up the output
# of an earlier cluster-based run — confirm the file exists before running.
df = pd.read_csv("classified_comments.csv")


# Cluster-based pipeline, currently disabled:
# classified_clusters = process_video_comments(df, embed_model)
# Active path: one API request per comment. Despite its name,
# `classified_clusters` here holds the comment -> label dict returned by the
# individual classifier.
classified_clusters = process_video_comments_individually(df)

Final results saved to classified_comments_individually.json
