In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
import nltk
import re

In [2]:
def cluster_cohesion(embeddings, labels, include_self=True):
    """Average intra-cluster cosine similarity across clusters.

    For every distinct label that has at least two members, the pairwise
    cosine-similarity matrix of that cluster's rows is computed and reduced
    to a single mean. The function returns the unweighted mean of those
    per-cluster values (every cluster counts equally, regardless of size).

    Parameters
    ----------
    embeddings : array-like, shape (n_samples, n_features)
        Row vectors to compare. Lists/tuples are converted to an ndarray so
        that boolean-mask indexing works.
    labels : array-like, shape (n_samples,)
        Cluster label per row. Converted to an ndarray so that plain Python
        lists (and pandas Series) can be compared element-wise.
    include_self : bool, default True
        When True (the original behaviour) the diagonal of the similarity
        matrix -- each row compared with itself -- is part of the mean.
        That inflates small clusters: a 2-member cluster whose pair has
        similarity ``s`` scores ``0.5 + s / 2``. Pass False to average
        only the off-diagonal (distinct-pair) similarities.

    Returns
    -------
    float
        Mean cohesion, or ``nan`` when no cluster has two or more members.
    """
    if isinstance(embeddings, (list, tuple)):
        embeddings = np.asarray(embeddings)
    # A plain list compared with `==` yields a single bool, not a mask.
    labels = np.asarray(labels)

    scores = []
    for c in np.unique(labels):
        members = labels == c
        size = int(members.sum())
        if size < 2:
            # Singletons have no pairwise similarity to measure.
            continue
        sims = cosine_similarity(embeddings[members])
        if include_self:
            scores.append(sims.mean())
        else:
            off_diagonal_total = sims.sum() - np.trace(sims)
            scores.append(off_diagonal_total / (size * (size - 1)))

    if not scores:
        # Return nan explicitly instead of np.mean([]) and its RuntimeWarning.
        return float("nan")
    return float(np.mean(scores))

In [3]:
# Fetch the NLTK data packages this notebook asks for (only needed once per
# machine; packages that are already present are reported as up to date).
for _resource in ('averaged_perceptron_tagger', 'punkt', 'stopwords'):
    nltk.download(_resource)

def get_top_keywords_filtered(texts, n=10):
    """Return the ``n`` most frequent noun-tagged keywords across ``texts``.

    Steps:
      1. Lower-case each text and replace known character names with a
         generic word (whole-word match, optional plural "s").
      2. Count tokens with ``CountVectorizer``, dropping NLTK's English stop
         words plus the slang and abstract-concept lists below.
      3. Walk the vocabulary by descending count and keep words longer than
         two characters that ``nltk.pos_tag`` tags as a noun (tag starts
         with "NN").

    Parameters
    ----------
    texts : iterable of str
        Documents (e.g. the lyric segments of one cluster).
    n : int, default 10
        Maximum number of keywords to return.

    Returns
    -------
    list of (str, count) tuples
        Up to ``n`` keywords, most frequent first. An empty list is returned
        when the vectorizer raises ``ValueError`` (e.g. nothing is left to
        count after stop-word removal, or ``texts`` is empty).
    """
    # 1. SETUP LISTS
    custom_stops = list(nltk.corpus.stopwords.words('english'))
    pop_slang = [
        "lil", "gon", "bout", "em", "ayy", "uh", "huh", "ha", "vi", "doo", "wee",
        "yeah", "oh", "baby", "know", "like", "got", "just", "don", "ve", "ll",
        "want", "need", "love", "time", "way", "make", "say", "come", "go", "right",
        "look", "good", "feel", "really", "cause", "wanna", "gonna", "gotta", "ain",
        "girl", "boy", "man", "woman", "hey", "ooh", "whoa", "shit", "fuck", "bitch",
        "nigga", "niggas", "damn", "ass", "tell", "think", "never", "back", "let",
        "swag", "yuh", "hum", "who", "what", "where", "why", "top", "call", "put",
        "gang", "thug", "bro", "pussy", "tryna", "chick", "girls", "slatt", "mmh"
    ]

    abstract_concepts = [
        "life", "day", "night", "heart", "mind", "world", "everything", "nothing",
        "things", "nothin", "songs", "song", "name", "eyes", "face", "voice",
        "head", "hand", "hands", "god", "soul", "mind", "pain", "hope", "wish",
        "fame", "lie", "lies", "truth", "word", "words", "end", "reason", "part",
        "told", "saw", "knew", "met", "said", "made", "found", "came", "went",
        "die", "born", "live", "dead", "death", "control", "move", "wait", "hold",
        "stop", "start", "change", "keep", "leave", "stay", "believe", "remember"
    ]

    NAME_REPLACEMENTS = {
        "regina": "queen", "veronica": "girl", "heather": "student",
        "hamilton": "soldier", "burr": "man", "jefferson": "politician",
        "elphaba": "witch", "glinda": "friend", "evan": "boy",
        "connor": "friend", "usnavi": "guy", "vanessa": "girl"
    }

    # 2. APPLY NAME REPLACEMENTS
    # Match whole words only, so that e.g. "evan" inside "relevant" or "burr"
    # inside "burrito" is left alone. A trailing plural "s" is still covered
    # ("heathers" -> "students").
    name_pattern = re.compile(
        r"\b(" + "|".join(re.escape(name) for name in NAME_REPLACEMENTS) + r")(s?)\b"
    )

    def _swap_name(match):
        return NAME_REPLACEMENTS[match.group(1)] + match.group(2)

    processed_texts = [name_pattern.sub(_swap_name, t.lower()) for t in texts]

    # 3. VECTORIZE the name-normalised texts
    all_stops = custom_stops + pop_slang + abstract_concepts
    try:
        vec = CountVectorizer(stop_words=all_stops)
        bag_of_words = vec.fit_transform(processed_texts)
    except ValueError:
        # Raised by the vectorizer when there is nothing to count
        # (empty cluster, or every token was a stop word).
        return []

    totals = np.asarray(bag_of_words.sum(axis=0)).ravel()
    words_freq = [(word, totals[idx]) for word, idx in vec.vocabulary_.items()]

    # 4. FINAL CLEANUP (noun check), most frequent words first
    cleaned_list = []
    for word, freq in sorted(words_freq, key=lambda x: x[1], reverse=True):
        # Stop as soon as n keywords are collected; tagging the rest of the
        # vocabulary cannot change the result.
        if n is not None and 0 <= n <= len(cleaned_list):
            break
        if len(word) <= 2:
            continue
        # pos_tag expects a list of tokens, hence the single-item list.
        if nltk.pos_tag([word])[0][1].startswith('NN'):
            cleaned_list.append((word, freq))

    return cleaned_list[:n]

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/Bay_Techatham/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/Bay_Techatham/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Bay_Techatham/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# --- Segmentation settings ---
SEGMENT_WINDOW = 2       # number of consecutive lyric lines joined per segment
SEGMENT_MIN_CHARS = 40   # a joined segment must be longer than this to be kept


def segment_lyrics(lyric_text, window=SEGMENT_WINDOW, min_chars=SEGMENT_MIN_CHARS):
    """Split one song's lyrics into overlapping multi-line segments.

    Bracketed tags (``[...]``, possibly spanning lines) are replaced by a
    space, the text is split on newlines, and blank lines are dropped. A
    sliding window then joins every run of ``window`` consecutive lines with
    a single space (lines 1+2, 2+3, 3+4, ... for the default window of 2).

    Parameters
    ----------
    lyric_text : str
        Raw lyrics of a single song.
    window : int, default SEGMENT_WINDOW
        How many consecutive lines form one segment.
    min_chars : int, default SEGMENT_MIN_CHARS
        Segments whose length is not greater than this are discarded.

    Returns
    -------
    list of str
        The retained segments, in order. Empty when the song has fewer than
        ``window`` non-blank lines.
    """
    text_no_tags = re.sub(r'\[.*?\]', ' ', lyric_text, flags=re.DOTALL)
    clean_lines = [line.strip() for line in text_no_tags.split('\n') if line.strip()]

    segments = []
    for start in range(len(clean_lines) - window + 1):
        combined_segment = " ".join(clean_lines[start:start + window])
        if len(combined_segment) > min_chars:
            segments.append(combined_segment)
    return segments


data = pd.read_csv("../output/songs_with_lyrics_cleaned.csv")
lyrics1 = data['lyrics'].dropna()

musical_data = pd.read_csv("../dataset/musical.csv")
lyrics2 = musical_data['Lyrics'].dropna()

# ignore_index=True renumbers the combined Series 0..N-1, so the indexes of
# the two sources do not collide.
separated_lyrics_series = pd.concat([lyrics1, lyrics2], ignore_index=True)

# Downstream code works on a plain Python list of whole-song lyric strings.
separated_lyrics = separated_lyrics_series.tolist()

print("🧹 Cleaning and Segmenting Lyrics...")

corpus = []
for lyric_text in separated_lyrics:
    # Skip anything that is not text.
    if isinstance(lyric_text, str):
        corpus.extend(segment_lyrics(lyric_text))

print(f"✅ Processing complete. Total segments: {len(corpus)}")

üßπ Cleaning and Segmenting Lyrics...
‚úÖ Processing complete. Total segments: 22824


In [5]:
# Whole-song lyric lists, one per source dataset.
pop_song_lyrics_only = lyrics1.tolist()
musical_lyrics_only = lyrics2.tolist()

# Aliases (same list objects, no copies): both sources combined as whole
# songs, and the sliding-window segments built from them.
mixture_lyrics, corpus_lyrics = separated_lyrics, corpus

In [6]:
import optuna
import umap
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

selected_data = pop_song_lyrics_only

# --- 2. GENERATE EMBEDDINGS ---
# Recomputed on every run because `selected_data` differs between cells.
print(f"⏳ Generating Embeddings for {len(selected_data)} items...")
print(f"   (Data Source: {len(selected_data)} text segments)")

model = SentenceTransformer('all-MiniLM-L6-v2')  # alternative: 'all-mpnet-base-v2'
embeddings = model.encode(selected_data)

# --- 3. UMAP REDUCTION ---
print("📉 Running UMAP Reduction...")
reducer = umap.UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)
reduced_embeddings = reducer.fit_transform(embeddings)

# --- 4. DEFINE OBJECTIVE ---
def objective(trial):
    """Optuna objective: cohesion of a KMeans clustering with k clusters.

    KMeans is fitted on the UMAP-reduced vectors, while the cohesion score
    is computed on the original embeddings.
    """
    # Search range: 20 to 80 clusters
    k = trial.suggest_int('n_clusters', 20, 80)
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = kmeans.fit_predict(reduced_embeddings)
    return cluster_cohesion(embeddings, labels)

# --- 5. RUN OPTIMIZATION ---
print("🤖 Starting Optuna Optimization...")
# Seed the sampler so the sequence of trials (and therefore the reported
# best K) is reproducible, like the seeded UMAP and KMeans steps.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=20)

# --- 6. RESULTS ---
# NOTE(review): the recorded run picked K = 80, the upper bound of the search
# range -- the score may simply grow with K, so treat this "best" K with care.
print("-" * 40)
print(f"🏆 Best K found: {study.best_params['n_clusters']}")
print(f"📈 Best Cohesion Score: {study.best_value:.4f}")
print("-" * 40)

‚è≥ Generating Embeddings for 257 items...
   (Data Source: 257 text segments)
üìâ Running UMAP Reduction...


  warn(
[I 2026-02-12 13:30:55,697] A new study created in memory with name: no-name-61856734-df6c-4da2-9e29-30e439cfb983


ü§ñ Starting Optuna Optimization...


[I 2026-02-12 13:30:56,044] Trial 0 finished with value: 0.667854905128479 and parameters: {'n_clusters': 61}. Best is trial 0 with value: 0.667854905128479.
[I 2026-02-12 13:30:56,090] Trial 1 finished with value: 0.639173686504364 and parameters: {'n_clusters': 50}. Best is trial 0 with value: 0.667854905128479.
[I 2026-02-12 13:30:56,132] Trial 2 finished with value: 0.6214252710342407 and parameters: {'n_clusters': 48}. Best is trial 0 with value: 0.667854905128479.
[I 2026-02-12 13:30:56,165] Trial 3 finished with value: 0.5985437631607056 and parameters: {'n_clusters': 34}. Best is trial 0 with value: 0.667854905128479.
[I 2026-02-12 13:30:56,226] Trial 4 finished with value: 0.6895936727523804 and parameters: {'n_clusters': 75}. Best is trial 4 with value: 0.6895936727523804.
[I 2026-02-12 13:30:56,269] Trial 5 finished with value: 0.639173686504364 and parameters: {'n_clusters': 50}. Best is trial 4 with value: 0.6895936727523804.
[I 2026-02-12 13:30:56,312] Trial 6 finished wi

----------------------------------------
üèÜ Best K found: 80
üìà Best Cohesion Score: 0.7003
----------------------------------------


In [7]:
selected_data = musical_lyrics_only

# --- 2. GENERATE EMBEDDINGS ---
# Recomputed on every run because `selected_data` differs between cells.
print(f"⏳ Generating Embeddings for {len(selected_data)} items...")
print(f"   (Data Source: {len(selected_data)} text segments)")

model = SentenceTransformer('all-MiniLM-L6-v2')  # alternative: 'all-mpnet-base-v2'
embeddings = model.encode(selected_data)

# --- 3. UMAP REDUCTION ---
print("📉 Running UMAP Reduction...")
reducer = umap.UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)
reduced_embeddings = reducer.fit_transform(embeddings)

# --- 4. DEFINE OBJECTIVE ---
def objective(trial):
    """Optuna objective: cohesion of a KMeans clustering with k clusters.

    KMeans is fitted on the UMAP-reduced vectors, while the cohesion score
    is computed on the original embeddings.
    """
    # Search range: 20 to 80 clusters
    k = trial.suggest_int('n_clusters', 20, 80)
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = kmeans.fit_predict(reduced_embeddings)
    return cluster_cohesion(embeddings, labels)

# --- 5. RUN OPTIMIZATION ---
print("🤖 Starting Optuna Optimization...")
# Seed the sampler so the sequence of trials (and therefore the reported
# best K) is reproducible, like the seeded UMAP and KMeans steps.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=20)

# --- 6. RESULTS ---
# NOTE(review): the recorded run picked K = 80, the upper bound of the search
# range -- the score may simply grow with K, so treat this "best" K with care.
print("-" * 40)
print(f"🏆 Best K found: {study.best_params['n_clusters']}")
print(f"📈 Best Cohesion Score: {study.best_value:.4f}")
print("-" * 40)

‚è≥ Generating Embeddings for 273 items...
   (Data Source: 273 text segments)
üìâ Running UMAP Reduction...


  warn(
[I 2026-02-12 13:31:02,798] A new study created in memory with name: no-name-79ea20b6-04d7-431c-aab6-27a88051635a
[I 2026-02-12 13:31:02,829] Trial 0 finished with value: 0.5395725965499878 and parameters: {'n_clusters': 30}. Best is trial 0 with value: 0.5395725965499878.
[I 2026-02-12 13:31:02,893] Trial 1 finished with value: 0.6624217629432678 and parameters: {'n_clusters': 76}. Best is trial 1 with value: 0.6624217629432678.
[I 2026-02-12 13:31:02,924] Trial 2 finished with value: 0.5460042953491211 and parameters: {'n_clusters': 32}. Best is trial 1 with value: 0.6624217629432678.
[I 2026-02-12 13:31:02,964] Trial 3 finished with value: 0.5883476138114929 and parameters: {'n_clusters': 43}. Best is trial 1 with value: 0.6624217629432678.
[I 2026-02-12 13:31:02,991] Trial 4 finished with value: 0.5264215469360352 and parameters: {'n_clusters': 27}. Best is trial 1 with value: 0.6624217629432678.


ü§ñ Starting Optuna Optimization...


[I 2026-02-12 13:31:03,053] Trial 5 finished with value: 0.6647193431854248 and parameters: {'n_clusters': 73}. Best is trial 5 with value: 0.6647193431854248.
[I 2026-02-12 13:31:03,084] Trial 6 finished with value: 0.5579906105995178 and parameters: {'n_clusters': 33}. Best is trial 5 with value: 0.6647193431854248.
[I 2026-02-12 13:31:03,126] Trial 7 finished with value: 0.5954551100730896 and parameters: {'n_clusters': 47}. Best is trial 5 with value: 0.6647193431854248.
[I 2026-02-12 13:31:03,151] Trial 8 finished with value: 0.5187195539474487 and parameters: {'n_clusters': 25}. Best is trial 5 with value: 0.6647193431854248.
[I 2026-02-12 13:31:03,186] Trial 9 finished with value: 0.5525541305541992 and parameters: {'n_clusters': 35}. Best is trial 5 with value: 0.6647193431854248.
[I 2026-02-12 13:31:03,247] Trial 10 finished with value: 0.6641873121261597 and parameters: {'n_clusters': 72}. Best is trial 5 with value: 0.6647193431854248.
[I 2026-02-12 13:31:03,310] Trial 11 fi

----------------------------------------
üèÜ Best K found: 80
üìà Best Cohesion Score: 0.6797
----------------------------------------


In [8]:
selected_data = mixture_lyrics

# --- 2. GENERATE EMBEDDINGS ---
# Recomputed on every run because `selected_data` differs between cells.
print(f"⏳ Generating Embeddings for {len(selected_data)} items...")
print(f"   (Data Source: {len(selected_data)} text segments)")

model = SentenceTransformer('all-MiniLM-L6-v2')  # alternative: 'all-mpnet-base-v2'
embeddings = model.encode(selected_data)

# --- 3. UMAP REDUCTION ---
print("📉 Running UMAP Reduction...")
reducer = umap.UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)
reduced_embeddings = reducer.fit_transform(embeddings)

# --- 4. DEFINE OBJECTIVE ---
def objective(trial):
    """Optuna objective: cohesion of a KMeans clustering with k clusters.

    KMeans is fitted on the UMAP-reduced vectors, while the cohesion score
    is computed on the original embeddings.
    """
    # Search range: 20 to 80 clusters
    k = trial.suggest_int('n_clusters', 20, 80)
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = kmeans.fit_predict(reduced_embeddings)
    return cluster_cohesion(embeddings, labels)

# --- 5. RUN OPTIMIZATION ---
print("🤖 Starting Optuna Optimization...")
# Seed the sampler so the sequence of trials (and therefore the reported
# best K) is reproducible, like the seeded UMAP and KMeans steps.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=20)

# --- 6. RESULTS ---
# NOTE(review): the recorded run picked K = 78, close to the upper bound (80)
# of the search range -- the score may simply grow with K, so treat this
# "best" K with care.
print("-" * 40)
print(f"🏆 Best K found: {study.best_params['n_clusters']}")
print(f"📈 Best Cohesion Score: {study.best_value:.4f}")
print("-" * 40)

‚è≥ Generating Embeddings for 530 items...
   (Data Source: 530 text segments)
üìâ Running UMAP Reduction...


  warn(
[I 2026-02-12 13:31:12,669] A new study created in memory with name: no-name-221ed78e-7c67-4b11-9570-6b0878cb5c68
[I 2026-02-12 13:31:12,717] Trial 0 finished with value: 0.5265852212905884 and parameters: {'n_clusters': 44}. Best is trial 0 with value: 0.5265852212905884.
[I 2026-02-12 13:31:12,779] Trial 1 finished with value: 0.5696064233779907 and parameters: {'n_clusters': 65}. Best is trial 1 with value: 0.5696064233779907.
[I 2026-02-12 13:31:12,824] Trial 2 finished with value: 0.5332485437393188 and parameters: {'n_clusters': 45}. Best is trial 1 with value: 0.5696064233779907.
[I 2026-02-12 13:31:12,860] Trial 3 finished with value: 0.5053288340568542 and parameters: {'n_clusters': 33}. Best is trial 1 with value: 0.5696064233779907.


ü§ñ Starting Optuna Optimization...


[I 2026-02-12 13:31:12,936] Trial 4 finished with value: 0.5873841643333435 and parameters: {'n_clusters': 73}. Best is trial 4 with value: 0.5873841643333435.
[I 2026-02-12 13:31:12,970] Trial 5 finished with value: 0.47654733061790466 and parameters: {'n_clusters': 22}. Best is trial 4 with value: 0.5873841643333435.
[I 2026-02-12 13:31:13,015] Trial 6 finished with value: 0.5088688135147095 and parameters: {'n_clusters': 36}. Best is trial 4 with value: 0.5873841643333435.
[I 2026-02-12 13:31:13,090] Trial 7 finished with value: 0.5799791812896729 and parameters: {'n_clusters': 69}. Best is trial 4 with value: 0.5873841643333435.
[I 2026-02-12 13:31:13,156] Trial 8 finished with value: 0.5566902160644531 and parameters: {'n_clusters': 59}. Best is trial 4 with value: 0.5873841643333435.
[I 2026-02-12 13:31:13,213] Trial 9 finished with value: 0.5439338684082031 and parameters: {'n_clusters': 50}. Best is trial 4 with value: 0.5873841643333435.
[I 2026-02-12 13:31:13,294] Trial 10 fi

----------------------------------------
üèÜ Best K found: 78
üìà Best Cohesion Score: 0.5959
----------------------------------------


In [9]:
selected_data = corpus_lyrics

# --- 2. GENERATE EMBEDDINGS ---
# Recomputed on every run because `selected_data` differs between cells.
print(f"⏳ Generating Embeddings for {len(selected_data)} items...")
print(f"   (Data Source: {len(selected_data)} text segments)")

model = SentenceTransformer('all-MiniLM-L6-v2')  # alternative: 'all-mpnet-base-v2'
embeddings = model.encode(selected_data)

# --- 3. UMAP REDUCTION ---
print("📉 Running UMAP Reduction...")
reducer = umap.UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)
reduced_embeddings = reducer.fit_transform(embeddings)

# --- 4. DEFINE OBJECTIVE ---
def objective(trial):
    """Optuna objective: cohesion of a KMeans clustering with k clusters.

    KMeans is fitted on the UMAP-reduced vectors, while the cohesion score
    is computed on the original embeddings.
    """
    # Search range: 20 to 80 clusters
    k = trial.suggest_int('n_clusters', 20, 80)
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = kmeans.fit_predict(reduced_embeddings)
    return cluster_cohesion(embeddings, labels)

# --- 5. RUN OPTIMIZATION ---
print("🤖 Starting Optuna Optimization...")
# Seed the sampler so the sequence of trials (and therefore the reported
# best K) is reproducible, like the seeded UMAP and KMeans steps.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=20)

# --- 6. RESULTS ---
# NOTE(review): the recorded run picked K = 73, in the upper part of the
# 20-80 search range -- the score may simply grow with K, so treat this
# "best" K with care.
print("-" * 40)
print(f"🏆 Best K found: {study.best_params['n_clusters']}")
print(f"📈 Best Cohesion Score: {study.best_value:.4f}")
print("-" * 40)

‚è≥ Generating Embeddings for 22824 items...
   (Data Source: 22824 text segments)
üìâ Running UMAP Reduction...


  warn(
[I 2026-02-12 13:31:53,403] A new study created in memory with name: no-name-858fef32-9fbb-432a-a5f8-eb653b0fd40a


ü§ñ Starting Optuna Optimization...


[I 2026-02-12 13:31:54,423] Trial 0 finished with value: 0.3531357944011688 and parameters: {'n_clusters': 72}. Best is trial 0 with value: 0.3531357944011688.
[I 2026-02-12 13:31:55,189] Trial 1 finished with value: 0.32317131757736206 and parameters: {'n_clusters': 37}. Best is trial 0 with value: 0.3531357944011688.
[I 2026-02-12 13:31:55,757] Trial 2 finished with value: 0.2569257616996765 and parameters: {'n_clusters': 20}. Best is trial 0 with value: 0.3531357944011688.
[I 2026-02-12 13:31:56,686] Trial 3 finished with value: 0.3361996114253998 and parameters: {'n_clusters': 54}. Best is trial 0 with value: 0.3531357944011688.
[I 2026-02-12 13:31:57,375] Trial 4 finished with value: 0.3462966978549957 and parameters: {'n_clusters': 33}. Best is trial 0 with value: 0.3531357944011688.
[I 2026-02-12 13:31:58,348] Trial 5 finished with value: 0.3405047655105591 and parameters: {'n_clusters': 55}. Best is trial 0 with value: 0.3531357944011688.
[I 2026-02-12 13:31:59,027] Trial 6 fin

----------------------------------------
üèÜ Best K found: 73
üìà Best Cohesion Score: 0.3574
----------------------------------------
