In [None]:
# from tfidf_songs import get_top_tfidf_words, get_global_top_words

from typing import List, Tuple, Optional
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

"""
tfidf_songs.py

Simple TF-IDF utilities to extract important words from a list of song lyrics (or any documents).

Usage:

    songs = [
        "Hello from the other side I must have called a thousand times",
        "You are the sunshine of my life that's why I'll always be around",
        "Is this the real life? Is this just fantasy?"
    ]

    per_song = get_top_tfidf_words(songs, top_n=5)
    global_top = get_global_top_words(songs, top_n=10)
"""



def get_top_tfidf_words(
    docs: List[str],
    top_n: int = 10,
    max_features: Optional[int] = None,
    stop_words: Optional[str] = "english",
) -> List[List[Tuple[str, float]]]:
    """
    Compute TF-IDF on the provided documents and return top_n words (token, score)
    for each document sorted by descending TF-IDF score.

    Parameters:
        docs: list of strings (songs / lyrics)
        top_n: number of top words to return per document
        max_features: if set, limits the vocabulary to the top max_features by term frequency
        stop_words: stop word strategy passed to TfidfVectorizer (e.g., 'english' or None)

    Returns:
        List (per document) of lists of (term, score) tuples.
    """
    if not docs:
        return []

    vectorizer = TfidfVectorizer(
        stop_words=stop_words,
        max_features=max_features,
        lowercase=True,
        token_pattern=r"(?u)\b\w+\b",
    )
    X = vectorizer.fit_transform(docs)  # shape (n_docs, n_terms)
    feature_names = np.array(vectorizer.get_feature_names_out())

    results: List[List[Tuple[str, float]]] = []
    for row in X:
        if row.nnz == 0:
            results.append([])
            continue
        # convert to dense array of scores for this doc
        scores = row.toarray().ravel()
        # get indices of top_n scores
        top_indices = np.argsort(scores)[::-1][:top_n]
        top_terms_scores = [(feature_names[i], float(scores[i])) for i in top_indices if scores[i] > 0]
        results.append(top_terms_scores)
    return results


def get_global_top_words(
    docs: List[str],
    top_n: int = 20,
    max_features: Optional[int] = None,
    stop_words: Optional[str] = "english",
) -> List[Tuple[str, float]]:
    """
    Compute TF-IDF across the corpus and return the top_n terms by summed TF-IDF score
    across all documents.

    Returns:
        List of (term, summed_score) tuples sorted by descending score.
    """
    if not docs:
        return []

    def _preprocessor(text: str) -> str:
        # normalize typographic apostrophes to ASCII so contractions like "don't" are preserved
        return text.replace("â€™", "'")

    vectorizer = TfidfVectorizer(
        preprocessor=_preprocessor,
        stop_words=stop_words,
        max_features=max_features,
        lowercase=True,
        token_pattern=r"(?u)\b\w[\w']*\b",
    )
    X = vectorizer.fit_transform(docs)  # shape (n_docs, n_terms)
    feature_names = np.array(vectorizer.get_feature_names_out())

    # sum TF-IDF scores across documents for each term
    summed = np.array(X.sum(axis=0)).ravel()
    top_indices = np.argsort(summed)[::-1][:top_n]
    return [(feature_names[i], float(summed[i])) for i in top_indices if summed[i] > 0]


if __name__ == "__main__":
    # quick demo
    sample_songs = [
        "Hello from the other side I must have called a thousand times",
        "You are the sunshine of my life that's why I'll always be around",
        "Is this the real life? Is this just fantasy? Caught in a landslide, no escape from reality",
    ]

    per_song = get_top_tfidf_words(sample_songs, top_n=5)
    for i, terms in enumerate(per_song):
        print(f"Song {i} top terms:", terms)

    print("Global top terms:", get_global_top_words(sample_songs, top_n=10))

Song 0 top terms: [('times', 0.5), ('thousand', 0.5), ('hello', 0.5), ('called', 0.5)]
Song 1 top terms: [('sunshine', 0.5286346066596935), ('ll', 0.5286346066596935), ('s', 0.5286346066596935), ('life', 0.4020402441612698)]
Song 2 top terms: [('reality', 0.3632547094545769), ('real', 0.3632547094545769), ('caught', 0.3632547094545769), ('fantasy', 0.3632547094545769), ('escape', 0.3632547094545769)]
Global top terms: [('Is', 0.5877677203306594), ('life', 0.5789387184958397), ('You', 0.4673509818107163), ("I'll", 0.4673509818107163), ('sunshine', 0.4673509818107163), ("that's", 0.4673509818107163), ('thousand', 0.4472135954999579), ('times', 0.4472135954999579), ('called', 0.4472135954999579), ('Hello', 0.4472135954999579)]


In [26]:
import pandas as pd

df = pd.read_csv('billboard_24years_lyrics_spotify_with_genres.csv')  # Assuming a CSV file with a 'lyrics' column

# get hiphop genre

hiphop_lyrics = df[df['genre'] == 'dancepop']['lyrics'].dropna().str.lower().tolist()

top_hiphop_words = get_global_top_words(hiphop_lyrics, top_n=30)

print("Top hiphop words:", top_hiphop_words)

Top hiphop words: [('ooh', 3.8692211355713853), ('oh', 2.972589957940446), ("can't", 2.8746559441823303), ('beautiful', 2.627572537105913), ('rockabye', 2.4464611375147522), ('just', 1.9607577069057711), ('wanna', 1.8179535649443688), ("i'm", 1.8095185009984873), ('shake', 1.7197039580162896), ('lost', 1.6446797270120272), ('love', 1.574973621906904), ("won't", 1.5480378373948653), ("you're", 1.5011621006405946), ('minutes', 1.4824172321817546), ('fikki', 1.4824172321817546), ('baby', 1.4360396943880591), ('middle', 1.3424825824669857), ('vs', 1.3387822050670577), ("don't", 1.3320349553919508), ('gonna', 1.2783677301700447), ('yeah', 1.2334279388881662), ('mind', 1.1449007055834177), ('got', 1.1431915334593077), ('time', 1.1313452762964327), ('moment', 1.1255709693704612), ('tick', 1.1118129241363162), ('tock', 1.1118129241363162), ('like', 1.0748043695927918), ('say', 1.0362694899623635), ('feel', 1.0348910315143796)]


In [27]:
top_hiphop_words

[('ooh', 3.8692211355713853),
 ('oh', 2.972589957940446),
 ("can't", 2.8746559441823303),
 ('beautiful', 2.627572537105913),
 ('rockabye', 2.4464611375147522),
 ('just', 1.9607577069057711),
 ('wanna', 1.8179535649443688),
 ("i'm", 1.8095185009984873),
 ('shake', 1.7197039580162896),
 ('lost', 1.6446797270120272),
 ('love', 1.574973621906904),
 ("won't", 1.5480378373948653),
 ("you're", 1.5011621006405946),
 ('minutes', 1.4824172321817546),
 ('fikki', 1.4824172321817546),
 ('baby', 1.4360396943880591),
 ('middle', 1.3424825824669857),
 ('vs', 1.3387822050670577),
 ("don't", 1.3320349553919508),
 ('gonna', 1.2783677301700447),
 ('yeah', 1.2334279388881662),
 ('mind', 1.1449007055834177),
 ('got', 1.1431915334593077),
 ('time', 1.1313452762964327),
 ('moment', 1.1255709693704612),
 ('tick', 1.1118129241363162),
 ('tock', 1.1118129241363162),
 ('like', 1.0748043695927918),
 ('say', 1.0362694899623635),
 ('feel', 1.0348910315143796)]