In [3]:
# from tfidf_songs import get_top_tfidf_words, get_global_top_words

from typing import List, Tuple, Optional
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

"""
tfidf_songs.py

Simple TF-IDF utilities to extract important words from a list of song lyrics (or any documents).

Usage:

    songs = [
        "Hello from the other side I must have called a thousand times",
        "You are the sunshine of my life that's why I'll always be around",
        "Is this the real life? Is this just fantasy?"
    ]

    per_song = get_top_tfidf_words(songs, top_n=5)
    global_top = get_global_top_words(songs, top_n=10)
"""



def get_top_tfidf_words(
    docs: List[str],
    top_n: int = 10,
    max_features: Optional[int] = None,
    stop_words: Optional[str] = "english",
) -> List[List[Tuple[str, float]]]:
    """
    Compute TF-IDF on the provided documents and return top_n words (token, score)
    for each document sorted by descending TF-IDF score.

    Parameters:
        docs: list of strings (songs / lyrics)
        top_n: number of top words to return per document
        max_features: if set, limits the vocabulary to the top max_features by term frequency
        stop_words: stop word strategy passed to TfidfVectorizer (e.g., 'english' or None)

    Returns:
        List (per document) of lists of (term, score) tuples.
    """
    if not docs:
        return []

    vectorizer = TfidfVectorizer(
        stop_words=stop_words,
        max_features=max_features,
        lowercase=True,
        token_pattern=r"(?u)\b\w+\b",
    )
    X = vectorizer.fit_transform(docs)  # shape (n_docs, n_terms)
    feature_names = np.array(vectorizer.get_feature_names_out())

    results: List[List[Tuple[str, float]]] = []
    for row in X:
        if row.nnz == 0:
            results.append([])
            continue
        # convert to dense array of scores for this doc
        scores = row.toarray().ravel()
        # get indices of top_n scores
        top_indices = np.argsort(scores)[::-1][:top_n]
        top_terms_scores = [(feature_names[i], float(scores[i])) for i in top_indices if scores[i] > 0]
        results.append(top_terms_scores)
    return results


def get_global_top_words(
    docs: List[str],
    top_n: int = 20,
    max_features: Optional[int] = None,
    stop_words: Optional[str] = "english",
) -> List[Tuple[str, float]]:
    """
    Compute TF-IDF across the corpus and return the top_n terms by summed TF-IDF score
    across all documents.

    Returns:
        List of (term, summed_score) tuples sorted by descending score.
    """
    if not docs:
        return []

    def _preprocessor(text: str) -> str:
        # normalize typographic apostrophes to ASCII so contractions like "don't" are preserved
        return text.replace("’", "'")

    vectorizer = TfidfVectorizer(
        preprocessor=_preprocessor,
        stop_words=stop_words,
        max_features=max_features,
        lowercase=True,
        token_pattern=r"(?u)\b\w[\w']*\b",
    )
    X = vectorizer.fit_transform(docs)  # shape (n_docs, n_terms)
    feature_names = np.array(vectorizer.get_feature_names_out())

    # sum TF-IDF scores across documents for each term
    summed = np.array(X.sum(axis=0)).ravel()
    top_indices = np.argsort(summed)[::-1][:top_n]
    return [(feature_names[i], float(summed[i])) for i in top_indices if summed[i] > 0]


if __name__ == "__main__":
    # quick demo
    sample_songs = [
        "Hello from the other side I must have called a thousand times",
        "You are the sunshine of my life that's why I'll always be around",
        "Is this the real life? Is this just fantasy? Caught in a landslide, no escape from reality",
    ]

    per_song = get_top_tfidf_words(sample_songs, top_n=5)
    for i, terms in enumerate(per_song):
        print(f"Song {i} top terms:", terms)

    print("Global top terms:", get_global_top_words(sample_songs, top_n=10))

Song 0 top terms: [('times', 0.5), ('thousand', 0.5), ('hello', 0.5), ('called', 0.5)]
Song 1 top terms: [('sunshine', 0.5286346066596935), ('ll', 0.5286346066596935), ('s', 0.5286346066596935), ('life', 0.4020402441612698)]
Song 2 top terms: [('reality', 0.3632547094545769), ('real', 0.3632547094545769), ('caught', 0.3632547094545769), ('fantasy', 0.3632547094545769), ('escape', 0.3632547094545769)]
Global top terms: [('Is', 0.5877677203306594), ('life', 0.5789387184958397), ('You', 0.4673509818107163), ("I'll", 0.4673509818107163), ('sunshine', 0.4673509818107163), ("that's", 0.4673509818107163), ('thousand', 0.4472135954999579), ('times', 0.4472135954999579), ('called', 0.4472135954999579), ('Hello', 0.4472135954999579)]


In [None]:
import pandas as pd

# the most distinct words within the dancepop genre

df = pd.read_csv('billboard_24years_lyrics_spotify_with_genres.csv')  # Assuming a CSV file with a 'lyrics' column

# get hiphop genre

hiphop_lyrics = df[df['genre'] == 'dancepop']['lyrics'].dropna().str.lower().tolist()

top_hiphop_words = get_global_top_words(hiphop_lyrics, top_n=30)

print("Top hiphop words:", top_hiphop_words)

Top hiphop words: [('ooh', 3.8692211355713857), ('oh', 2.9725899579404467), ("can't", 2.87465594418233), ('beautiful', 2.6275725371059133), ('rockabye', 2.4464611375147527), ('just', 1.9607577069057704), ('wanna', 1.8179535649443692), ("i'm", 1.809518500998487), ('shake', 1.7197039580162896), ('lost', 1.6446797270120272), ('love', 1.574973621906904), ("won't", 1.5480378373948658), ("you're", 1.501162100640595), ('minutes', 1.4824172321817546), ('fikki', 1.4824172321817546), ('baby', 1.4360396943880591), ('middle', 1.3424825824669855), ('vs', 1.3387822050670577), ("don't", 1.3320349553919502), ('gonna', 1.278367730170045), ('yeah', 1.2334279388881664), ('mind', 1.1449007055834177), ('got', 1.1431915334593075), ('time', 1.1313452762964327), ('moment', 1.1255709693704612), ('tick', 1.111812924136316), ('tock', 1.111812924136316), ('like', 1.0748043695927918), ('say', 1.036269489962364), ('feel', 1.0348910315143796)]


In [86]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from typing import List, Tuple

def get_genre_distinctive_emotion_words(
    df: pd.DataFrame,
    df_emo: pd.DataFrame,
    genre: str,
    top_n: int = 30,
    stop_words: str = "english"
) -> List[Tuple[str, float]]:
    """
    Compute words that are distinctive for a given genre compared to all other genres,
    restricting to words in the NRC Emotion Lexicon.

    Parameters:
        df: pandas DataFrame with 'lyrics' and 'genre' columns
        df_emo: pandas DataFrame with 'word' column from NRC Emotion Lexicon
        genre: target genre
        top_n: number of top words to return
        stop_words: stop word strategy for TfidfVectorizer

    Returns:
        List of (word, score) tuples sorted by descending distinctiveness
    """

    # Convert NRC words to a set for fast lookup
    emo_words_set = set(df_emo['word'])

    # Filter lyrics
    genre_lyrics = df[df['genre'] == genre]['lyrics'].dropna().str.lower().tolist()
    other_lyrics = df[df['genre'] != genre]['lyrics'].dropna().str.lower().tolist()

    all_lyrics = genre_lyrics + other_lyrics

    # Fit TF-IDF
    vectorizer = TfidfVectorizer(
        stop_words=stop_words,
        lowercase=True,
        token_pattern=r"(?u)\b\w[\w']*\b"
    )
    X = vectorizer.fit_transform(all_lyrics)
    feature_names = np.array(vectorizer.get_feature_names_out())

    # Split TF-IDF matrix
    X_genre = X[:len(genre_lyrics), :]
    X_other = X[len(genre_lyrics):, :]

    # Sum TF-IDF scores
    tfidf_genre_sum = np.array(X_genre.sum(axis=0)).ravel()
    tfidf_other_sum = np.array(X_other.sum(axis=0)).ravel()

    # Distinctiveness score
    distinctive_score = tfidf_genre_sum - tfidf_other_sum

    # Keep only words that are in the NRC lexicon
    top_indices = np.argsort(distinctive_score)[::-1]
    top_words = []
    for i in top_indices:
        word = feature_names[i]
        score = float(distinctive_score[i])
        if score > 0 and word in emo_words_set:
            top_words.append((word, score))
        if len(top_words) >= top_n:
            break

    return top_words


In [87]:
# the most distinct words within the hiphop genre compared to all other genres, restricted to emotion words
df_songs = pd.read_csv('billboard_24years_lyrics_spotify_with_genres.csv')
df_emo = pd.read_csv(
    'NRC-Emotion-Lexicon-Wordlevel-v0.92.txt',
    sep='\t',
    header=None,
    names=['word', 'emotion', 'association']
)

# Get top distinctive emotion words for a genre
top_dancepop_emotion_words = get_genre_distinctive_emotion_words(
    df_songs,
    df_emo,
    genre='trap',
    top_n=30
)

print("Top distinctive emotion words for hip-hop:", top_dancepop_emotion_words)

Top distinctive emotion words for hip-hop: [('tumble', 0.531447436039241), ('fumble', 0.531447436039241), ('frost', 0.41950263964362866), ('awaken', 0.4103245144802319), ('villain', 0.4002840656021327), ('toby', 0.2679213290691669), ('nickel', 0.26695195872800437), ('kris', 0.26529471046284614), ('pow', 0.2511493187479352), ('convertible', 0.22792690744888117), ('snack', 0.22377851133050092), ('noisy', 0.21575951138728724), ('revive', 0.19367663799300366), ('repent', 0.19282259893886605), ('mogul', 0.19261285800528402), ('bowls', 0.19261285800528402), ('multiple', 0.19261285800528402), ('fender', 0.17714914534641366), ('vertical', 0.17714914534641366), ('nebula', 0.17714914534641366), ('casket', 0.17360414571353844), ('dab', 0.17182115163914397), ('deacon', 0.171716323976964), ('nodding', 0.171716323976964), ('craps', 0.171716323976964), ('niece', 0.171716323976964), ('adobe', 0.168714882240808), ('fountain', 0.16482433809656005), ('shaken', 0.16370259785417882), ('tart', 0.16344934745

In [88]:
from collections import defaultdict, Counter

def map_song_to_emotions_count(song_lyrics: str, df_emo: pd.DataFrame) -> dict:
    """
    Map words in a song's lyrics to emotions and count frequency.
    Returns a dictionary: emotion -> count of words
    """
    # Prepare mapping: word -> list of emotions
    word_to_emotions = defaultdict(list)
    for _, row in df_emo.iterrows():
        word_to_emotions[row['word']].append(row['emotion'])

    # Tokenize lyrics
    words = song_lyrics.lower().split()

    # Count occurrences of each emotion
    emotion_counts = Counter()
    for word in words:
        if word in word_to_emotions:
            for emotion in word_to_emotions[word]:
                emotion_counts[emotion] += 1

    return dict(emotion_counts)

In [89]:
df_emo_ones = df_emo[df_emo['association'] == 1]
example_lyrics = df_songs['lyrics'][1]  # first song
emotion_counts = map_song_to_emotions_count(example_lyrics, df_emo_ones)

print("Emotion counts for the song:", emotion_counts)

Emotion counts for the song: {'anger': 3, 'anticipation': 3, 'joy': 8, 'positive': 13, 'surprise': 4, 'trust': 9, 'negative': 16, 'fear': 3, 'sadness': 3, 'disgust': 2}


In [90]:
# # Apply your mapping function to each song's lyrics
# emotion_dicts = df_songs['lyrics'].apply(lambda lyrics: map_song_to_emotions_count(lyrics, df_emo_ones))

# # Convert list of dicts into a DataFrame
# df_emotions = pd.DataFrame(list(emotion_dicts))

# # Fill missing values with 0 (if some emotions are absent in a song)
# df_emotions = df_emotions.fillna(0)

# # Add emotion columns to the original songs DataFrame
# df_songs_with_emotions = pd.concat([df_songs, df_emotions], axis=1)


In [91]:
import pandas as pd
from collections import defaultdict, Counter
from multiprocessing import Pool, cpu_count

# --- Your function, modified to accept pre-built word_to_emotions dict ---
def map_song_to_emotions_count(song_lyrics: str, word_to_emotions: dict) -> dict:
    """
    Map words in a song's lyrics to emotions and count frequency.
    Returns a dictionary: emotion -> count of words
    """
    words = song_lyrics.lower().split()
    emotion_counts = Counter()
    for word in words:
        if word in word_to_emotions:
            for emotion in word_to_emotions[word]:
                emotion_counts[emotion] += 1
    return dict(emotion_counts)

# --- Main execution ---
if __name__ == "__main__":
    # Step 1: Filter emotions with association == 1
    df_emo_ones = df_emo[df_emo['association'] == 1]

    # Step 2: Pre-build word -> emotion mapping
    word_to_emotions = defaultdict(list)
    for _, row in df_emo_ones.iterrows():
        word_to_emotions[row['word']].append(row['emotion'])

    # Step 3: Prepare song lyrics list
    songs_lyrics = df_songs['lyrics'].tolist()

    # Step 4: Define worker for multiprocessing
    def worker_map_song(lyrics):
        return map_song_to_emotions_count(lyrics, word_to_emotions)

    # Step 5: Parallel MapReduce-style processing
    n_workers = cpu_count()
    with Pool(n_workers) as pool:
        emotion_dicts = pool.map(worker_map_song, songs_lyrics)

    # Step 6: Convert list of dicts to DataFrame and combine
    df_emotions = pd.DataFrame(emotion_dicts).fillna(0)
    df_songs_with_emotions = pd.concat([df_songs.reset_index(drop=True), df_emotions.reset_index(drop=True)], axis=1)



In [92]:
# List of emotion columns
emotion_cols = ['anticipation','fear','joy','positive','trust',
                'anger','disgust','negative','sadness','surprise']

# Include 'genre' column
cols_to_use = ['genre'] + emotion_cols
df_emotions_genre = df_songs_with_emotions[cols_to_use]

# Group by genre and compute mean for each emotion
df_genre_emotions_mean = df_emotions_genre.groupby('genre').mean().reset_index()

df_genre_emotions_mean

Unnamed: 0,genre,anticipation,fear,joy,positive,trust,anger,disgust,negative,sadness,surprise
0,"2step garage, r&b",10.000000,2.000000,6.000000,14.000000,19.000000,2.000000,0.000000,10.000000,4.000000,3.000000
1,acoustic hiphop,0.000000,2.000000,1.000000,1.000000,1.000000,2.000000,1.000000,6.000000,1.000000,0.000000
2,afrobeats,8.333333,7.666667,11.333333,21.333333,9.333333,5.666667,5.666667,13.333333,9.000000,5.666667
3,"afrobeats, pop, r&b, reggae",7.000000,5.000000,10.000000,20.000000,12.000000,3.000000,3.000000,4.000000,5.000000,1.000000
4,"afrobeats, r&b",5.000000,7.333333,11.666667,15.000000,5.666667,6.333333,5.666667,13.333333,8.333333,6.333333
...,...,...,...,...,...,...,...,...,...,...,...
939,worldbeat,9.000000,10.000000,12.000000,13.000000,11.000000,9.000000,9.000000,10.000000,10.000000,6.000000
940,"worldbeat, latin pop, pop rock",3.000000,2.000000,10.000000,20.000000,4.000000,2.000000,1.000000,10.000000,4.000000,6.000000
941,| recorded =,7.666667,2.000000,4.333333,7.666667,6.000000,2.000000,2.000000,6.333333,7.000000,4.333333
942,|length =,2.000000,6.000000,1.000000,1.000000,3.000000,2.000000,0.000000,10.000000,6.000000,0.000000
