# Lyrics PCA BERTopic Analysis

**Date:** June 13th, 2024

**Author:** Harris Zheng

**Description:** Analyze Correlations Between Song Lyric Features and Popularity

Some output cell visualizations may not show up in this notebook locally, 
but will show up on Google Colab through this link 
here: https://drive.google.com/file/d/1CEkhWTW7rLcfpULfmUUFAA3iLnvdpEly/view?usp=sharing.


# 0. Imports

In [None]:
## Check GPU
# !nvidia-smi

In [None]:
!python -m spacy download en_core_web_sm

In [None]:
# !git clone https://github.com/rapidsai/rapidsai-csp-utils.git
# !python rapidsai-csp-utils/colab/pip-install.py

Cloning into 'rapidsai-csp-utils'...
remote: Enumerating objects: 485, done.[K
remote: Counting objects: 100% (216/216), done.[K
remote: Compressing objects: 100% (125/125), done.[K
^C
python3: can't open file '/content/rapidsai-csp-utils/colab/pip-install.py': [Errno 2] No such file or directory


In [None]:
# !pip install bertopic

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import polars as pl
import pprint
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import spacy
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import cudf
import cupy as cp

In [None]:
# Load clustering libraries
from bertopic import BERTopic
# from umap import UMAP
# from hdbscan import HDBSCAN
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from bertopic.representation import MaximalMarginalRelevance

In [None]:
# Load GPU clustering algorithms
from cuml.cluster import HDBSCAN
from cuml.manifold import UMAP

In [None]:
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel

In [None]:
import requests

## Download the stopwords-iso English stop-word list (plain text, one word per line).
STOPWORDS_URL = "https://raw.githubusercontent.com/stopwords-iso/stopwords-en/master/stopwords-en.txt"
## A timeout keeps "Run All" from hanging forever if the host is unreachable.
stopwords_response = requests.get(STOPWORDS_URL, timeout=30)
## Fail loudly on HTTP errors; otherwise the body of an error page would
## silently become the stop-word set.
stopwords_response.raise_for_status()
stopwords_list = stopwords_response.content
STOP_WORDS = set(stopwords_list.decode().splitlines())

# 1. Load Joined Dataset

Load dataset joined from Spotify and Genius dataset

In [None]:
# dataset_path = "../assets/actually_merged_data_v1_title_artist.csv"
dataset_path = (
  "./data/actually_merged_data_v1_title_artist.csv"
)

In [None]:
df_songs = pl.read_csv(dataset_path)

In [None]:
df_songs.null_count()

title_hash,artist_hash,title,artist,median_rank,highest_rank,min_date,max_date,url,album,release_date,duration_ms,median_popularity,highest_popularity,median_streams,total_streams,num_days_on_chart,is_explicit,af_danceability,af_energy,af_key,af_loudness,af_mode,af_speechiness,af_acousticness,af_instrumentalness,af_liveness,af_valence,af_tempo,af_time_signature,title_genius,tag,artist_genius,year,views,features,lyrics,id,language_cld3,language_ft,language
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0,0,4139,4108,4108,4108,4108,23230,0,0,4108,4109,4109,4109,4109,4109,4109,4109,4109,4109,4109,4109,4109,0,0,0,0,0,0,0,0,1422,1679,3287


In [None]:
## Filter for only english songs
df_songs_en = df_songs.filter(
    (pl.col("language_cld3") == "en")  |
    (pl.col("language_ft") == "en")
)
df_songs_en.shape

(31680, 41)

In [None]:
df_songs_en.head()

title_hash,artist_hash,title,artist,median_rank,highest_rank,min_date,max_date,url,album,release_date,duration_ms,median_popularity,highest_popularity,median_streams,total_streams,num_days_on_chart,is_explicit,af_danceability,af_energy,af_key,af_loudness,af_mode,af_speechiness,af_acousticness,af_instrumentalness,af_liveness,af_valence,af_tempo,af_time_signature,title_genius,tag,artist_genius,year,views,features,lyrics,id,language_cld3,language_ft,language
str,str,str,str,f64,i64,str,str,str,str,str,f64,f64,f64,f64,f64,i64,bool,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,str,i64,i64,str,str,i64,str,str,str
"""rapapapa""","""richbrian""","""Rapapapa (feat…","""Rich Brian""",163.0,66,"""2019-07-26""","""2019-08-19""","""https://open.s…","""The Sailor""","""2019-07-26""",247165.0,18.0,18.0,27026.5,527996.0,17,True,0.782,0.752,3.0,-4.622,0.0,0.214,0.501,0.0,0.114,0.504,126.936,4.0,"""Rapapapa""","""rap""","""Rich Brian""",2019,131654,"""{RZA}""","""[Verse 1: Rich…",4707498,"""en""","""en""","""en"""
"""let my baby st…","""amandlastenber…","""Let My Baby St…","""Amandla Stenbe…",15.5,3,"""2017-05-25""","""2017-06-11""","""https://open.s…","""Let My Baby St…","""2017-05-11""",144933.0,34.0,34.0,,0.0,17,False,0.542,0.357,10.0,-6.938,0.0,0.0365,0.334,0.258,0.0724,0.197,102.251,4.0,"""Let My Baby St…","""rb""","""Amandla Stenbe…",2017,17418,"""{}""","""[Verse 1] I wa…",3084505,"""en""","""en""","""en"""
"""in my room""","""yellowclawmust…","""In My Room (fe…","""Yellow Claw, M…",40.5,32,"""2017-01-01""","""2017-10-03""","""https://open.s…","""Blood For Merc…","""2015-12-04""",168641.0,30.0,30.0,3214.0,3214.0,4,False,0.745,0.869,6.0,-4.375,1.0,0.0444,0.00193,0.0,0.0799,0.814,104.015,4.0,"""In My Room""","""rap""","""Yellow Claw & …",2015,90578,"""{Tyga,""Ty Doll…","""[Intro: Ty Dol…",2330489,"""en""","""en""","""en"""
"""california""","""charlottecardi…","""California""","""Charlotte Card…",28.0,17,"""2018-04-16""","""2018-12-06""","""https://open.s…","""California""","""2018-04-09""",208760.0,31.0,31.0,,0.0,9,False,0.677,0.557,4.0,-6.118,0.0,0.0365,0.0321,4e-06,0.186,0.293,92.03,4.0,"""California""","""pop""","""Charlotte Card…",2018,4214,"""{}""","""[Chorus] Miss …",3655696,"""en""","""en""","""en"""
"""you i""","""twinsmith""","""You & I""","""Twinsmith""",32.0,25,"""2018-08-26""","""2018-09-15""","""https://open.s…","""Stay Cool""","""2017-07-14""",210089.0,0.0,0.0,,0.0,10,False,0.624,0.68,7.0,-7.478,1.0,0.027,0.04,0.000215,0.16,0.204,102.014,4.0,"""You I""","""rock""","""Twinsmith""",2017,2925,"""{}""","""You & I both k…",3146967,"""en""","""en""","""en"""


In [None]:
## No duplicates
df_songs_en.filter(
    ~df_songs_en.select("title_hash", "artist_hash").is_unique()
)

title_hash,artist_hash,title,artist,median_rank,highest_rank,min_date,max_date,url,album,release_date,duration_ms,median_popularity,highest_popularity,median_streams,total_streams,num_days_on_chart,is_explicit,af_danceability,af_energy,af_key,af_loudness,af_mode,af_speechiness,af_acousticness,af_instrumentalness,af_liveness,af_valence,af_tempo,af_time_signature,title_genius,tag,artist_genius,year,views,features,lyrics,id,language_cld3,language_ft,language
str,str,str,str,f64,i64,str,str,str,str,str,f64,f64,f64,f64,f64,i64,bool,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,str,i64,i64,str,str,i64,str,str,str


In [None]:
## Per artist: number of distinct (title, artist) pairs, and that number as a
## percentage of all English songs.
songs_per_artist = pl.struct(["title_hash", "artist_hash"]).n_unique()
total_english_songs = df_songs_en.shape[0]

df_lyrics_artist_distribution = (
    df_songs_en
    .group_by("artist_hash")
    .agg(
        songs_per_artist.alias("len"),
        (songs_per_artist / total_english_songs * 100).round(2).alias("Percent"),
    )
    .sort("len", descending=True)
    .rename({"len": "Number of Songs"})
)

In [None]:
df_songs_en["artist"].n_unique()

11494

In [None]:
df_lyrics_artist_distribution.head(10)

artist_hash,Number of Songs,Percent
str,u32,f64
"""taylorswift""",127,0.4
"""drake""",108,0.34
"""future""",102,0.32
"""edsheeran""",79,0.25
"""youngboyneverb…",77,0.24
"""torylanez""",76,0.24
"""logic""",72,0.23
"""trippieredd""",71,0.22
"""juicewrld""",69,0.22
"""eminem""",68,0.21


# 2. Extract features from songs

# Data Cleaning

## Explore If Lyrics Need Cleaning

- We know square brackets "[]" are section markers that don't contribute to the lyrical themes, let's remove those.
- (Not sure if I want to do this yet) We know that the same phrases shouldn't be included more than a few times. Let's split sentences by commas/separators and remove similar phrases.

## Explore All Types of Open Brackets

In [None]:
## Songs whose lyrics contain any Unicode opening (\p{Ps}) or closing (\p{Pe}) bracket.
## Raw string: "\p" is not a valid Python escape sequence, so a plain literal
## triggers an invalid-escape warning.
df_songs_parenthesis = df_songs_en.filter(
  pl.col("lyrics").str.contains(r"\p{Ps}|\p{Pe}")
)

In [None]:
df_songs_parenthesis.shape

(29303, 41)

In [None]:
## Collect every individual bracket character found in each song's lyrics.
## Raw string: "\p" is not a valid Python escape sequence.
df_songs_parenthesis = df_songs_parenthesis.with_columns(
    pl.col("lyrics").str.extract_all(r"[\p{Ps}\p{Pe}]{1}").alias("all_brackets")
)

## One row per (song, bracket occurrence)
df_songs_parenthesis_all = df_songs_parenthesis.explode(
    "all_brackets"
)

## For each bracket character, count the distinct songs it appears in
df_songs_parenthesis_count = df_songs_parenthesis_all.group_by("all_brackets").agg(
    pl.struct(["title_hash", "artist_hash"]).n_unique().alias("num_song_occurrences")
).sort("num_song_occurrences", descending=True)

In [None]:
df_songs_parenthesis_count

all_brackets,num_song_occurrences
str,u32
"""[""",28366
"""]""",28363
""")""",16447
"""(""",16444
"""}""",63
"""{""",60
"""‚""",50
"""」""",30
"""「""",30
"""„""",23


In [None]:
## Songs whose lyrics contain a round bracket.
## Raw string: "\(" and "\)" are not valid Python escape sequences.
df_songs_round_bracket = df_songs_en.filter(
  pl.col("lyrics").str.contains(r"\(|\)")
)
df_songs_round_bracket.shape

(16447, 41)

In [None]:
pprint.pprint(
  df_songs_round_bracket[5, "lyrics"]
)

('[Intro]\n'
 'Ocean Wisdom\n'
 'Big up Dirty Dike on the beat\n'
 'Shoutouts to RF, Black Ink\n'
 'Shoutout to Macdot, Brighton Town\n'
 'This is how we do this\n'
 "Yeah, let's go in\n"
 'Yo, yo\n'
 '\n'
 '[Verse 1]\n'
 "See me I'm walking, why cause I don't run fam\n"
 'Make a film about me find a proper fucking stuntman\n'
 "I'm Captain Wiz and I'm actually shoving dirt in a dustpan\n"
 "And blowin' it back in faces of wastemen who try and brush man\n"
 'I get a wheel in the club on the microphone\n'
 "They didn't know I handle bars, it's a minor though\n"
 "They offered me some gear but I'm over tired\n"
 "All this talkin' make a bredda wanna cycle home, yo\n"
 'I flabbergast that bredda my stamina can be coming in handy\n'
 "Flippin' the script, I'm doing a what?\n"
 "I'm telling a story, doing a Plan B\n"
 'How the fuck on earth did we become so angry?\n'
 'Couple years ago, man was just reading the dandy\n'
 "So how'd I go from reading the beano to beatin' up emos\n"
 "Swimmin'

In [None]:
## Songs whose lyrics contain a curly bracket.
## Raw string: "\{" and "\}" are not valid Python escape sequences.
df_songs_curly_bracket = df_songs_en.filter(
  pl.col("lyrics").str.contains(r"\{|\}")
)
df_songs_curly_bracket.shape

(72, 41)

In [None]:
pprint.pprint(
  df_songs_curly_bracket[1, "lyrics"]
)

("I ain't looking for trouble\n"
 "I'm just staying for a few drinks\n"
 'Think about a few things\n'
 'Actually, make it double\n'
 "'Cause now I feel like dancin'\n"
 "Boy just take my hands and we'll go\n"
 '\n'
 'Making all the bad decisions (Oh yeah, oh yeah, oh yeah)\n'
 'Liquid courage, ammunition  (Oh yeah, oh yeah, oh yeah)\n'
 'Pretend that we just never ended  (Oh yeah, oh yeah, oh yeah)\n'
 'Pass like all our scars are mended\n'
 '\n'
 'So I show you the cards in my hand and say that I want you, say that I want '
 'you\n'
 "This wasn't part of my plan but now that I told you, baby I told you\n"
 '\n'
 "I ain't looking for trouble\n"
 "I'm just staying for a few drinks\n"
 'Think about a few things\n'
 'Actually, make it double\n'
 "'Cause now I feel like dancin'\n"
 "Boy just take my hands and we'll go\n"
 '{ Break }\n'
 '\n'
 "I wasn't looking for trouble\n"
 'But there you were in my way\n'
 'Looking like the old days\n'
 'Shocked me with your stubble\n'
 'Caught you kiss

Round parentheses `()` typically mark repetition such as ad-libs, which carry little meaningful information. Most other bracket types enclose metadata about song form (e.g. Verse 1, Verse 2, Break, Intro), so they don't add to the lyrical content of the song either.

## Execute Bracket Data Cleaning

Remove any type of Open and Closed Brackets. Use \p{Ps} to indicate open bracket (Rust Regex) and \p{Pe} to indicate closed bracket

In [None]:
def clean_lyrics_data(df: pl.DataFrame, replace_separator_with_token = False,
                      remove_punctuation = False):
    """Return ``df`` with an extra ``lyrics_cleaned`` column built from ``lyrics``.

    Processing steps, in order:
      1. Collapse every run of spaces/tabs into a single space.
      2. Split on newlines and drop every line that contains an opening
         bracket followed by a closing bracket (any Unicode bracket type),
         as well as lines of at most one character. Note that the WHOLE line
         is dropped, including lyric lines that merely contain a
         parenthesised ad-lib.
      3. Optionally strip punctuation.
      4. Lower-case the text.
      5. Optionally replace the newline between lines with `` [SEP] ``.

    Parameters
    ----------
    df : pl.DataFrame
        Frame with a string column named ``lyrics``.
    replace_separator_with_token : bool
        When truthy, lines are joined with `` [SEP] `` instead of a newline.
    remove_punctuation : bool
        When truthy, every Unicode punctuation character is removed
        (apostrophes included, so "don't" becomes "dont").

    Returns
    -------
    pl.DataFrame
        ``df`` plus the ``lyrics_cleaned`` column.
    """
    ## Normalize whitespace, remove lines with open and closed brackets.
    ## - replace_all (not replace) so that every run is normalized, not just the first
    ## - the replacement is a real space (a "\s" replacement is inserted literally)
    ## - newlines are deliberately NOT matched: they delimit the lines filtered below
    df_cleaned = df.with_columns(
        pl.col("lyrics")
        .str.replace_all(r"[\p{Zs}\t]+", " ")
        .str.split("\n")
        .list.eval(
            pl.element().filter(
                (~pl.element().str.contains(r"\p{Ps}.*\p{Pe}")) &
                (pl.element().str.len_chars() > 1)
            )
        )
        .list.join("\n")
        .alias("lyrics_cleaned")
    )

    if remove_punctuation:
        ## Only punctuation is stripped. Separators (spaces) are kept on
        ## purpose: deleting them would glue neighbouring words together.
        df_cleaned = df_cleaned.with_columns(
            pl.col("lyrics_cleaned").str.replace_all(r"\p{P}", "")
        )

    ## Lower case all lyrics
    df_cleaned = df_cleaned.with_columns(
        pl.col("lyrics_cleaned").str.to_lowercase()
    )

    if replace_separator_with_token:
        df_cleaned = df_cleaned.with_columns(
            pl.col("lyrics_cleaned").str.replace_all("\n", " [SEP] ")
        )

    return df_cleaned


In [None]:
END_PUNCTUATIONS = ",.?!;…]+"

In [None]:
df_songs_en_compare_lyrics = clean_lyrics_data(df_songs_en, replace_separator_with_token=False)

In [None]:
df_songs_en_compare_lyrics.select("lyrics", "lyrics_cleaned").head()

lyrics,lyrics_cleaned
str,str
"""[Verse 1: Rich…","""two-faced bitc…"
"""[Verse 1] I wa…","""i was made to …"
"""[Intro: Ty Dol…","""yellow claw an…"
"""[Chorus] Miss …","""an ocean made …"
"""You & I both k…","""you\s& i both …"


In [None]:
pprint.pprint(
    df_songs_en_compare_lyrics[1, "lyrics"]
)

('[Verse 1]\n'
 'I was made to love her, been working at it\n'
 'Half of my life, I’ve been an addict\n'
 'And she’s been good to me\n'
 'Far as I can tell she’s happy, livin’ with her Macky\n'
 '\n'
 '[Chorus]\n'
 'So please don’t take my love away\n'
 "Please don’t take , please don't take\n"
 '\n'
 '[Verse 2]\n'
 'And where would I be, feeling lonely\n'
 'Separated from my one and only\n'
 'And what’s there left to say\n'
 'Far as I can tell that day could be on its way\n'
 '\n'
 '[Chorus]\n'
 'So please don’t take my love away\n'
 'Let my baby stay, let my baby stay\n'
 'Let my baby stay')


In [None]:
pprint.pprint(
    df_songs_en_compare_lyrics[1, "lyrics_cleaned"]
)

('i was made to love her, been working at it\n'
 'half of my life, i’ve been an addict\n'
 'and she’s been good to me\n'
 'far as i can tell she’s happy, livin’ with her macky\n'
 'so please don’t take my love away\n'
 "please don’t take , please don't take\n"
 'and where would i be, feeling lonely\n'
 'separated from my one and only\n'
 'and what’s there left to say\n'
 'far as i can tell that day could be on its way\n'
 'so please don’t take my love away\n'
 'let my baby stay, let my baby stay\n'
 'let my baby stay')


In [None]:
df_songs_en_cleaned = clean_lyrics_data(df_songs_en, replace_separator_with_token=False)

## Look for Duplicates/Remove Duplicates

Previously, we used the join to filter out duplicates based on title and artist as much as possible. However, multiple artists can cover the same song, so to make sure no duplicates remain we compute the cosine distance between the TF-IDF vectors of the lyrics, group near-identical songs into clusters, and remove the duplicates.

In [None]:
## Get Song Documents
docs = df_songs_en["lyrics"].to_pandas()

In [None]:
from cuml.feature_extraction.text import TfidfVectorizer
from cuml.metrics import pairwise_distances
import cupy as cp

In [None]:
def get_duplication_clusters(docs: pd.Series, max_distance: float = 0.09,
                             min_df: int = 4):
  """Assign a duplicate-cluster number to every document.

  Each document is turned into a unigram TF-IDF vector and the full pairwise
  cosine-distance matrix is computed on the GPU. For every document we collect
  the indices of all documents closer than ``max_distance`` (the document
  itself included, as long as its self-distance is below the threshold).
  Documents with an IDENTICAL neighbour list share a cluster number; no
  transitive merging of overlapping neighbour lists is performed.

  Parameters
  ----------
  docs : pd.Series
      One lyrics string per document.
  max_distance : float
      Cosine-distance threshold below which two documents count as related.
  min_df : int
      Forwarded to ``TfidfVectorizer(min_df=...)``.

  Returns
  -------
  cudf DataFrame with columns ``cluster`` (list of related row indices),
  ``row_number`` (position of the document in ``docs``) and
  ``cluster_number``, sorted by ``row_number``.
  """
  ## Convert Documents to TfIdf Vectors
  cu_docs = cudf.Series(docs)
  tfidf_vectorizer = TfidfVectorizer(min_df=min_df, ngram_range=(1,1))
  X = tfidf_vectorizer.fit_transform(cu_docs)

  ## Calculate distance between documents in terms of TfIdf Vectors.
  ## Idea is that any possible duplicate document should have very low distance
  X_dist = pairwise_distances(X, X, metric="cosine")

  ## Find all documents related to document in current row
  df_related_vectors = cudf.Series(
      [(row < max_distance).nonzero()[0] for row in X_dist]
  ).to_frame(name="cluster")
  df_related_vectors["row_number"] = (
      range(len(df_related_vectors))
  )

  ## Every distinct neighbour list becomes one cluster
  df_vector_clusters = (
      df_related_vectors["cluster"].drop_duplicates()
  ).to_frame(name="cluster")
  df_vector_clusters["cluster_number"] = (
      range(len(df_vector_clusters))
  )
  df_related_vectors_with_cluster_numbers = (
      cudf.merge(
          df_related_vectors,
          df_vector_clusters,
          how="left",
          on="cluster"
      ).sort_values("row_number")
      # Sort by row_number because merge messes up
      # original table order
  )
  return df_related_vectors_with_cluster_numbers


In [None]:
## Convert Documents to TfIdf Vectors
cu_docs = cudf.Series(docs)
tfidf_vectorizer = TfidfVectorizer(min_df=4, ngram_range=(1,1))
X = tfidf_vectorizer.fit_transform(cu_docs)

## Calculate similarity between documents in terms of TfIdf Vectors.
## Idea is that any possible duplicate document should have very low distance
X_sim = pairwise_distances(X, X, metric="cosine")

In [None]:
min_tfidf_dist_distrib = cp.where(X_sim == 0, 1, X_sim).min(axis=0).get()

In [None]:
cudf.Series(min_tfidf_dist_distrib).quantile(
    [0.05, 0.1, 0.25, 0.75, 0.9, 0.95]
)

0.05    0.142648
0.10    0.266677
0.25    0.430293
0.75    0.625533
0.90    0.682379
0.95    0.713379
dtype: float64

In [None]:
fig = px.histogram(x=min_tfidf_dist_distrib)
fig.update_layout(title="Distribution of Min Cosine Distance from TF-IDF Vectors")
fig.update_xaxes(title="Min Cosine distance Per Song Lyric")

Let's try removing duplicate songs that are less than 0.09 cosine distance away from each other (in other words, songs whose TF-IDF cosine similarity is above 0.91)

In [None]:
## Same clustering steps as get_duplication_clusters, applied to the
## distance matrix X_sim computed above.
## For every row of X_sim, collect the indices of all documents whose
## cosine distance to that row's document is below 0.09.
df_related_vectors = cudf.Series(
    [(row < 0.09).nonzero()[0] for row in X_sim]
).to_frame(name="cluster")
df_related_vectors["row_number"] = (
    range(len(df_related_vectors))
)
# df_related_vectors.head()

## Each distinct neighbour list gets its own cluster number
df_vector_clusters = (
    df_related_vectors["cluster"].drop_duplicates()
).to_frame(name="cluster")
df_vector_clusters["cluster_number"] = (
    range(len(df_vector_clusters))
)
## Attach the cluster number to every document; re-sort by row_number so the
## rows line up with the order of the documents.
df_related_vectors_with_cluster_numbers = (
    cudf.merge(
        df_related_vectors,
        df_vector_clusters,
        how="left",
        on="cluster"
    ).sort_values("row_number")
)

In [None]:
df_related_vectors_with_cluster_numbers

Unnamed: 0,cluster,row_number,cluster_number
448,[0],0,0
449,[1],1,1
450,[2],2,2
451,[3],3,3
452,[4],4,4
...,...,...,...
30793,[31675],31675,31198
30794,[31676],31676,31199
30795,[31677],31677,31200
30796,[31678],31678,31201


In [None]:
#### We are going to use cluster_number to drop duplicates later
df_songs_en = df_songs_en.with_columns(
  cluster_number=pl.Series(
      df_related_vectors_with_cluster_numbers["cluster_number"].to_pandas()
  ),
  row_number=pl.Series(
      df_related_vectors_with_cluster_numbers["row_number"].to_pandas()
  ),
)

In [None]:
df_songs_en_duplicates = df_songs_en.filter(
    ~pl.col("cluster_number").is_unique()
).sort("cluster_number").select(
    "title", "artist", "lyrics", "row_number", "cluster_number"
)

In [None]:
df_songs_en_duplicates.shape

(870, 5)

In [None]:
df_songs_en_duplicates.head(10)

title,artist,lyrics,row_number,cluster_number
str,str,str,i64,i64
"""Lay It Down""","""Lloyd""","""[Hook] Lay you…",36,36
"""Lay It Down""","""Steelix""","""[Chorus] Lay y…",4200,36
"""Dive - Recorde…","""Luke Combs""","""[Verse 1] Oh, …",59,59
"""Dive""","""Ed Sheeran""","""[Verse 1] Oh, …",19451,59
"""Santa Claus Is…","""The Crystals""","""[Intro (spoken…",131,131
"""Santa Claus Is…","""Bruce Springst…","""[Spoken Intro]…",13729,131
"""Santa Claus Is…","""Justin Bieber""","""[Intro] Santa'…",24858,131
"""Can’t Help Fal…","""Kina Grannis""","""[Verse 1] Wise…",142,142
"""Can't Help Fal…","""Michael Bublé""","""[Verse 1] Wise…",490,142
"""Can't Help Fal…","""Beck""","""[Verse 1] Wise…",2357,142


In [None]:
duplicate_distribution = df_songs_en_duplicates.group_by("cluster_number").agg(
    pl.count().alias("number_of_duplicates")
).sort("number_of_duplicates", descending=True)

In [None]:
fig = px.histogram(
    duplicate_distribution,
    x="number_of_duplicates"
)
fig.update_layout(title="Number of Duplicate Groups by Number of Duplicates")
fig.update_yaxes(title="Number of Groups")
fig.update_xaxes(title="Number of Duplicates")

## Sample Duplicates

Michael Jackson's "ABC" and "I Want You Back" are not duplicates, but they were flagged as duplicates at a threshold of < 0.3.

In [None]:
# X_sim[42, 27384]

In [None]:
# pprint.pprint(
#     df_songs_en_duplicates.filter(
#         pl.col("row_number") == 42
#     )[0, "lyrics"]
# )

In [None]:
# pprint.pprint(
#     df_songs_en_duplicates.filter(
#         pl.col("row_number") == 27384
#     )[0, "lyrics"]
# )

In [None]:
# pprint.pprint(
#   df_songs_36[1, "lyrics"]
# )

In [None]:
# X.shape

# Features to Extract
- Lyrical Consistency
- Lyrical Care
- Rhymes
- Themes
- Metaphors
- Number of Adjectives


### Extract Themes (Using Default BERTopic Parameters)

In [None]:
def preprocess_df(df : pl.DataFrame) -> pl.DataFrame:
  """Clean the lyrics of ``df`` and keep one row per ``cluster_number``.

  ``df`` must already carry the ``cluster_number`` column that marks
  duplicate songs.
  """
  cleaned = clean_lyrics_data(df, replace_separator_with_token=False)
  ## maintain_order=True keeps the surviving rows in their original order
  return cleaned.unique("cluster_number", maintain_order=True)

In [None]:
def get_coherence_score(topic_model : BERTopic, docs) -> CoherenceModel:
    """Build a gensim ``u_mass`` CoherenceModel for a fitted BERTopic model.

    The documents are tokenised with the analyzer of the model's own
    vectorizer so that the gensim dictionary and corpus use the same
    vocabulary as the topic representations. The outlier topic ``-1`` is
    excluded from the evaluation.

    Parameters
    ----------
    topic_model : BERTopic
        A fitted topic model.
    docs
        The documents the model was fitted on.

    Returns
    -------
    CoherenceModel
        The (not yet evaluated) coherence model; call ``get_coherence()`` or
        ``get_coherence_per_topic()`` on it.
    """
    cleaned_docs = topic_model._preprocess_text(docs)
    analyzer = topic_model.vectorizer_model.build_analyzer()
    tokens = [analyzer(doc) for doc in cleaned_docs]
    dictionary = corpora.Dictionary(tokens)
    corpus = [dictionary.doc2bow(doc_tokens) for doc_tokens in tokens]

    # Work on a shallow copy: the mapping returned by get_topics() may be the
    # model's own internal dictionary, and deleting the outlier topic from it
    # would then silently remove topic -1 from the model itself.
    topics = dict(topic_model.get_topics())
    topics.pop(-1, None)
    print(topics.keys())
    topic_words = [
        [word for word, _ in topic_model.get_topic(topic) if word != ""]
        for topic in topics
    ]

    # Evaluate
    coherence_model = CoherenceModel(topics=topic_words,
                            texts=tokens,
                            corpus=corpus,
                            dictionary=dictionary,
                            coherence='u_mass')
    return coherence_model





In [None]:
def train_bertopic_model(docs, number_of_topics = "auto", diversity=0.6,
                         seed=42, dr="umap", cluster="hdbscan", n_neighbors=15,
                         min_cluster_size=10, min_samples=10, min_df=1):
    '''
        Fit a BERTopic model on ``docs`` and return the fitted model.

        IMPORTANT HYPERPARAMETERS:
        =============================================================
        number_of_topics : decides number of topics in BERTopic model, but ONLY reduces topics, not increase.
                           Must be an integer when cluster is not "hdbscan" (it is used as n_clusters for KMeans).
        diversity : decides how diverse the topics are (removes similar words from topic representation).
        seed : random state forwarded to UMAP, PCA and KMeans.
        dr : "umap" selects UMAP for dimensionality reduction, any other value selects PCA.
        cluster : "hdbscan" selects HDBSCAN for clustering, any other value selects KMeans.
        min_df : minimum document frequency for the CountVectorizer vocabulary.

        ABOUT UMAP:
        ========================================================
        higher the n_neighbors, better the global structure of vectors are preserved.
        lower the n_neighbors, local structures are preserved.

        min_dist is how tightly points can be packed together (fixed to 0 here).

        ABOUT HDBSCAN:
        =============================================
        min_cluster_size: n documents required for each cluster
        min_samples: measure of how conservative clustering is

    '''
    # Instantiate the vectorizer model
    vectorizer_model = CountVectorizer(
        ngram_range=(1,3),
        min_df=min_df,
        stop_words=list(STOP_WORDS | {"wanna", "hey", "yea", "yeah"})
    )

    # Instantiate only the dimensionality reduction model that was asked for
    if dr == "umap":
        dim_reduction_model = UMAP(n_neighbors=n_neighbors, n_components=300, min_dist=0,
                                   metric='cosine', random_state=seed)
    else:
        dim_reduction_model = PCA(n_components=15, random_state=seed)

    # Instantiate only the clustering model that was asked for
    if cluster == "hdbscan":
        cluster_model = HDBSCAN(min_cluster_size=min_cluster_size, min_samples = min_samples,
                                metric='euclidean', prediction_data=True)
    else:
        # KMeans needs a concrete number of clusters; fail early with a
        # clear message instead of deep inside BERTopic.fit.
        if number_of_topics is None or isinstance(number_of_topics, str):
            raise ValueError(
                "number_of_topics must be an integer when KMeans clustering is used, "
                f"got {number_of_topics!r}"
            )
        # Forward the seed so that KMeans runs are reproducible too
        cluster_model = KMeans(n_clusters=number_of_topics, random_state=seed)

    representation_model = MaximalMarginalRelevance(diversity=diversity)
    bert_topic_model = BERTopic(
        nr_topics=number_of_topics,
        umap_model=dim_reduction_model,
        hdbscan_model=cluster_model,
        vectorizer_model=vectorizer_model,
        representation_model=representation_model,
        language="english",
        calculate_probabilities = False
    )

    bert_topic_model.fit(docs)
    return bert_topic_model



In [None]:
def get_topic_distribution(df : pd.DataFrame) -> pd.DataFrame:
  """Return only the ``Topic`` and ``Count`` columns of a topic-info frame."""
  wanted_columns = ["Topic", "Count"]
  return df[wanted_columns]

def get_topic_documents(df: pd.DataFrame, topic_number : int) -> list:
  """Return the ``Representative_Docs`` entry of the first row whose ``Topic`` equals ``topic_number``.

  Raises IndexError when no row has that topic number.
  """
  is_requested_topic = df["Topic"] == topic_number
  representative_docs = df.loc[is_requested_topic, "Representative_Docs"]
  return representative_docs.iloc[0]

def get_topic_representation(df: pd.DataFrame) -> pd.DataFrame:
  """Return only the ``Topic`` and ``Representation`` columns of a topic-info frame."""
  wanted_columns = ["Topic", "Representation"]
  return df[wanted_columns]

In [None]:
df_songs_no_duplicates = preprocess_df(df_songs_en)

In [None]:
df_songs_en.shape, df_songs_no_duplicates.shape

((31680, 43), (31203, 44))

In [None]:
docs = df_songs_no_duplicates["lyrics_cleaned"].to_list()

In [None]:
topic_model_default = train_bertopic_model(
    docs,
    min_cluster_size=10,
    n_neighbors=10,
    diversity=0.8,
    min_samples=3,
)

#### Visualize Clusters

In [None]:
topic_model_default.visualize_barchart()

We can learn some basic topics per song: Topic 0 is about love, mind, day (love songs about how you're thinking about someone every day), Topic 1 is about Christmas, and Topic 2 is about sassy subjects (bad, power, bad girls). It is hard to tell what Topic 3 is about, and Topic 4 and Topic 5 are very specific: faith/angels and star-crossed lovers, respectively.

In [None]:
topic_model_info = topic_model_default.get_topic_info()

In [None]:
topic_model_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,434,-1_niggas_shit_christmas_real,"[niggas, shit, christmas, real, ayy, tryna, wi...","[hussle man a shooter, that's a fact, nigga\nt..."
1,0,30333,0_fuck_mind_day_ooh,"[fuck, mind, day, ooh, nigga, love love, leave...",[run fast from my day job\nrunnin' fast from t...
2,1,277,1_ho ho_bells_claus_time,"[ho ho, bells, claus, time, mistletoe, day, me...",[we wish you a merry christmas\nwe wish you a ...
3,2,91,2_twilight twilight_bad bad bad_power_rock hea...,"[twilight twilight, bad bad bad, power, rock h...","[hangul\swoo!\nuh\nwant you to love me, love m..."
4,3,25,3_pull pull pull_manta_money_china china,"[pull pull pull, manta, money, china china, pa...","[not a swimming pool, it's an ocean\ndive in t..."
5,4,21,4_angels_bethlehem_sing_baby boy,"[angels, bethlehem, sing, baby boy, faithful f...","[o holy night, the stars are brightly shining\..."
6,5,12,5_romeo_love_time time time_met,"[romeo, love, time time time, met, starcrossed...",[sleeping\sall alone\nyou wake up with a bottl...
7,6,10,6____,"[, , , , , , , , , ]","[, , ]"


If you look at the counts per topic, we can tell Topic 0-2 topics are general but well-defined since they have higher counts.

Topic 3-5 topics are harder to make sense of, or too specific.

Topic -1 are outliers that were not identified as topics by BERTopic.

In [None]:
## Bar chart of how many documents were assigned to each topic.
fig = px.bar(topic_model_info, x="Topic", y="Count")
## The x-axis is the topic id, not time, so the title describes a per-topic count.
fig.update_layout(title="Distribution of Document Counts per Topic")

#### Check Top 10 Representation of Topics

In [None]:
get_topic_representation(topic_model_info)

Unnamed: 0,Topic,Representation
0,-1,"[niggas, shit, christmas, real, ayy, tryna, wi..."
1,0,"[fuck, mind, day, ooh, nigga, love love, leave..."
2,1,"[ho ho, bells, claus, time, mistletoe, day, me..."
3,2,"[twilight twilight, bad bad bad, power, rock h..."
4,3,"[pull pull pull, manta, money, china china, pa..."
5,4,"[angels, bethlehem, sing, baby boy, faithful f..."
6,5,"[romeo, love, time time time, met, starcrossed..."
7,6,"[, , , , , , , , , ]"


#### Check Documents

In [None]:
topic_0_docs = get_topic_documents(topic_model_info, 0)
topic_1_docs = get_topic_documents(topic_model_info, 1)
topic_2_docs = get_topic_documents(topic_model_info, 2)
topic_3_docs = get_topic_documents(topic_model_info, 3)
# topic_4_docs = get_topic_documents(topic_model_info, 4)

In [None]:
pprint.pprint(
    topic_0_docs[2]
)

('woke\\sup in the mornin\n'
 'with ya on my mind\n'
 'visualize old times\n'
 "let's bring it back\n"
 'dial one time\n'
 'and ya hit me back\n'
 "think it's 'bout time\n"
 'that i needed that\n'
 'little bit a\n'
 'heaven sent her\n'
 'little bit\n'
 'of that rare light\n'
 "know that i've been\n"
 'gone so long\n'
 'that we can go\n'
 'the slow route\n'
 'home baby\n'
 "what's ya thoughts\n"
 'and how ya feel\n'
 'are you nervous\n'
 'we can bop\n'
 'a little seal\n'
 'plant your rose here\n'
 'let you get behind the wheel\n'
 'i can serve ya\n'
 'do the things in your dreams\n'
 "i'll preserve ya\n"
 'for your love life\n'
 'you should jump\n'
 "don't you hit the brake\n"
 'even pump\n'
 "i'll buy you gucci and prada\n"
 'and all you want\n'
 "and you know i won't lie\n"
 "and i won't front\n"
 'for your love life\n'
 'you should jump\n'
 "don't you hit the brake\n"
 'even pump\n'
 "i'll buy you gucci and prada\n"
 'and all you want\n'
 "and you know i won't lie\n"
 "and i won't fr

#### Evaluate Coherence of Theme Clusters

In [None]:
# u_mass_coherence_model = get_coherence_score(topic_model, docs)

dict_keys([0, 1, 2])


In [None]:
# len(topic_model.get_topics()[0])

10

In [None]:
# segmented_topics = u_mass_coherence_model.segment_topics()

In [None]:
# len(segmented_topics)

3

In [None]:
# len(segmented_topics[0])

45

Segmented Topics stores the pairwise combinations of all top 10 words inside a topic, which results in 10\*9/2 = 45 word pairs. These pairs' coherence scores will be individually computed and averaged within a topic to achieve a so-called **global coherence score**.

In [None]:
# def show_average_and_individual_coherences(topic_model : BERTopic, docs: list[list[str]]) -> None:
#   u_mass_coherence_model = get_coherence_score(topic_model, docs)
#   segmented_topics = u_mass_coherence_model.segment_topics()
#   coherence_per_topic = u_mass_coherence_model.get_coherence_per_topic(segmented_topics=segmented_topics)
#   coherence_per_topic = zip(list(topic_model.get_topics().keys()), coherence_per_topic)
#   coherence_per_topic = list(coherence_per_topic)
#   print(f"Average Coherence Across all topics: {u_mass_coherence_model.get_coherence()}")
#   print(f"Individual Coherence Per Topic: {coherence_per_topic}")
#   assert np.isclose(
#       [np.mean([score for topic, score in coherence_per_topic])],
#       [u_mass_coherence_model.get_coherence()]
#   )



In [None]:
# show_average_and_individual_coherences(topic_model, docs)

dict_keys([0, 1, 2])
Average Coherence Across all topics: -1.9630208376579485
Individual Coherence Per Topic: [(0, -1.4839430567927052), (1, -1.801483838239538), (2, -2.603635617941602)]


### Extract Themes (Try Splitting the Verse of Each Song and Running BERTopic on top of it)

In [None]:
def split_and_clean_lyrics_data(df: pl.DataFrame, replace_separator_with_token = False):
    """Split every song into verses and return one row per verse.

    Processing steps, in order:
      1. Collapse every run of Unicode separator characters into one space
         (newlines are not separators in this sense and are kept).
      2. Replace every ``[...]`` section tag by the marker ``[]`` and split
         the lyrics on that marker.
      3. Keep only chunks longer than 5 characters that contain at least one
         ASCII letter or digit; strip and lower-case them.
      4. Explode the resulting list into one row per verse (column ``verses``).
      5. Optionally replace newlines inside a verse with `` [SEP] ``.

    Parameters
    ----------
    df : pl.DataFrame
        Frame with a string column named ``lyrics``.
    replace_separator_with_token : bool
        When truthy, newlines inside each verse become `` [SEP] ``.

    Returns
    -------
    pl.DataFrame
        ``df`` with the extra ``verses`` column, one row per verse.
    """
    ## Normalize whitespace, split lyrics on brackets. Keep only alphanumeric verses.
    df_cleaned = df.with_columns(
        pl.col("lyrics")
        ## replace_all (not replace) so that every separator run is normalized
        .str.replace_all(r"\p{Separator}+", " ")
        ## match one tag at a time: a greedy ".*" would swallow the lyric text
        ## between two tags that sit on the same line
        .str.replace_all(r"\[[^\]\n]*\]", "[]")
        .str.split("[]")
        .list.eval(
            pl.element().filter(
                (pl.element().str.len_chars() > 5) &
                pl.element().str.contains("[a-zA-Z0-9]")
            ).str.strip_chars().str.to_lowercase()
        ).alias("verses")
    )

    df_cleaned = df_cleaned.explode("verses")

    if replace_separator_with_token:
        df_cleaned = df_cleaned.with_columns(
            pl.col("verses").str.replace_all("\n", " [SEP] ")
        )

    return df_cleaned


In [None]:
def preprocess_df(df : pl.DataFrame) -> pl.DataFrame:
  """Deduplicate songs, then explode them into one cleaned verse per row.

  Keeps the first row of every ``cluster_number`` (original order preserved)
  and passes the result to ``split_and_clean_lyrics_data`` without the
  ``[SEP]`` token replacement.

  NOTE: a later cell ("Data Cleaning for Future Tasks") defines another
  function with this same name; whichever cell ran last wins.
  """
  deduplicated_songs = df.unique("cluster_number", maintain_order=True)
  return split_and_clean_lyrics_data(
      deduplicated_songs,
      replace_separator_with_token=False,
  )

#### Example Processing

In [None]:
df_songs_verses_sample = preprocess_df(df_songs_en.sample(1, seed=42))

In [None]:
pprint.pprint(
  df_songs_verses_sample[0, "lyrics"]
)

('[Verse 1]\n'
 'Are the trees high enough, baby?\n'
 "Leave you so high your feet won't touch the ground\n"
 'Would you look up, baby?\n'
 "It's pineapple purple skies\n"
 "Promise everything gon' be alright\n"
 "Ooh, I promise everything gon' be alright\n"
 "Trust, everything gon' be alright\n"
 'Ooh, got altitude, no storm clouds (Uh!)\n'
 "And everything gon' be alright, babe (Oh! Be alright)\n"
 "Been talkin' to you for the longest time (Woo)\n"
 "Everything gon' be alright (Ah)\n"
 '\n'
 '[Pre-Chorus]\n'
 "Got you kissin' on the sun, why you do it like that? (Oh, oh, oh)\n"
 'Lights so bright, why you do it like that? (Oh!)\n'
 "And I ain't kissed you yet\n"
 "I ain't kissed you, you, you, you (Woah)\n"
 'But everything you do\n'
 "Got me wishin' you, you, you, you\n"
 '[Chorus]\n'
 'Backslide, backslide (Uh!)\n'
 'Gimme your bass line, bass line\n'
 'Oh, I wanna know, I wanna know\n'
 'Gonna know, gonna show you, alright\n'
 "Everything gon' be alright (Whoo!)\n"
 '\n'
 '[Verse 

In [None]:
pprint.pprint(
    df_songs_verses_sample[-2, "verses"]
)


('gonna be, gonna be, baby, oh\n'
 'gonna be, gonna be, baby, oh\n'
 'right there in the pineapple skies\n'
 'why you gotta do it like that tonight?\n'
 'oh, baby')


In [None]:
pprint.pprint(
    df_songs_verses_sample[-1, "verses"]
)


('come and go, whatever\n'
 "i won't ever let you go\n"
 'please call it, call it\n'
 'gonna let you know like\n'
 "don't let go like\n"
 "it's gon' be alright!")


#### Now execute processing code on all lyrics

In [None]:
df_songs_verses = preprocess_df(df_songs_en)

In [None]:
# Memory Error
# df_verse_duplicate_clusters = get_duplication_clusters(df_songs_preprocess["verses"].to_list())
# df_songs_preprocess = df_songs_preprocess.with_columns(
#   cluster_number=pl.Series(
#       df_verse_duplicate_clusters["cluster_number_verse"].to_pandas()
#   ),
#   row_number=pl.Series(
#       df_related_vectors_with_cluster_numbers["row_number"].to_pandas()
#   ),
# )

The verse-level table is much bigger than the song-level one (~200,000 rows), so removing near-duplicates with cosine similarity runs out of memory.

So we opt for a high-precision, low-recall duplicate removal instead: only verses that are exact duplicates within the same song `id` are dropped.

In [None]:
df_songs_verses_unique = df_songs_verses.unique(["id", "verses"])

In [None]:
df_songs_verses.shape, df_songs_verses_unique.shape

((207616, 44), (180002, 44))

In [None]:
docs = df_songs_verses_unique["verses"].to_list()

In [None]:
topic_model = train_bertopic_model(
    docs,
    min_cluster_size=15,
    min_samples=3
)

#### Visualize Clusters

In [None]:
topic_model.visualize_barchart()

We can observe more interesting topics by splitting
each song into verses, such as desire/burn/hearts, dance, dreams/nightmare, alcohol/drugs, glory/death


In [None]:
topic_model_info = topic_model.get_topic_info()

In [None]:
topic_model_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,109902,-1_bitch_niggas_money_tryna,"[bitch, niggas, money, tryna, day, leave, ima,...",[ooh-ooh-ooh-ooh-ooh-ooh-ooh\nooh-ooh-ooh-ooh-...
1,0,5828,0_niggas_bitch_tryna_ayy,"[niggas, bitch, tryna, ayy, chorus, money, skr...",[to release my tension i write these bars\nin ...
2,1,987,1_christmas merry christmas_jingle_ho ho_snowman,"[christmas merry christmas, jingle, ho ho, sno...",[i wanna wish you a merry christmas (merry chr...
3,2,887,2_desire_burn baby burn_fuel_torches,"[desire, burn baby burn, fuel, torches, hearts...","[nobody move, don't turn around\nnobody move, ..."
4,3,874,3_dance beat_baby dance_303_dancing people,"[dance beat, baby dance, 303, dancing people, ...",[i just wanna dance with you\n(dance with you)...
...,...,...,...,...,...
1317,1316,15,1316_feel dawn day_life feeling_feeling birds_...,"[feel dawn day, life feeling, feeling birds, d...",[fish in the sea\nyou know how i feel\nriver r...
1318,1317,15,1317_lessons forced fell_bless mess_beat impos...,"[lessons forced fell, bless mess, beat impossi...","[i am a failure, i am a failure\nmy father tol..."
1319,1318,15,1318_short runway_shake demons_love runway_win...,"[short runway, shake demons, love runway, win ...",[every time we touch\nevery time we touch\neve...
1320,1319,15,1319_aunt uncle double_blue milk_starry starry...,"[aunt uncle double, blue milk, starry starry, ...","[starry, starry night\npaint your palette blue..."


There are still many outlier verses (Topic = -1) even though we have tuned min_samples to a very low number. This hyperparameter tells the HDBSCAN clustering algorithm, which clusters the verses into topics, to reduce the number of outliers by relaxing the requirements for a point to become a "core point". However, we still get many outliers, which suggests that verses can be overly specific, so not enough similarity is observed between them to build non-outlier topics.

LDA could be a better approach for text categorization with a set amount of topics



In [None]:
topic_model_info.loc[:, ["Count", "Name","Representation"]]

Unnamed: 0,Count,Name,Representation
0,109902,-1_bitch_niggas_money_tryna,"[bitch, niggas, money, tryna, day, leave, ima,..."
1,5828,0_niggas_bitch_tryna_ayy,"[niggas, bitch, tryna, ayy, chorus, money, skr..."
2,987,1_christmas merry christmas_jingle_ho ho_snowman,"[christmas merry christmas, jingle, ho ho, sno..."
3,887,2_desire_burn baby burn_fuel_torches,"[desire, burn baby burn, fuel, torches, hearts..."
4,874,3_dance beat_baby dance_303_dancing people,"[dance beat, baby dance, 303, dancing people, ..."
...,...,...,...
1317,15,1316_feel dawn day_life feeling_feeling birds_...,"[feel dawn day, life feeling, feeling birds, d..."
1318,15,1317_lessons forced fell_bless mess_beat impos...,"[lessons forced fell, bless mess, beat impossi..."
1319,15,1318_short runway_shake demons_love runway_win...,"[short runway, shake demons, love runway, win ..."
1320,15,1319_aunt uncle double_blue milk_starry starry...,"[aunt uncle double, blue milk, starry starry, ..."


#### Check Topic 10 Representation

### Data Cleaning for Future Tasks

(Make sure you have run the code in "Data Cleaning Function")

In [None]:
def preprocess_df(df : pl.DataFrame) -> pl.DataFrame:
  """Clean the lyrics of every song, then drop duplicate songs.

  Runs ``clean_lyrics_data`` (without the ``[SEP]`` token replacement) and
  keeps the first row of every ``cluster_number``, preserving row order.

  NOTE: this redefines the verse-level ``preprocess_df`` from the
  "Extract Themes" section; after this cell runs, the earlier version is gone.
  """
  cleaned_songs = clean_lyrics_data(df, replace_separator_with_token=False)
  return cleaned_songs.unique("cluster_number", maintain_order=True)

In [None]:
df_songs_en = preprocess_df(df_songs_en)

### Number of Unique Words per Song

In [None]:
df_songs_en.shape

(31203, 44)

In [None]:
df_songs_en = df_songs_en.with_columns(
    # Lower-case first so that "Love" and "love" count as one word, then pull
    # out every alphanumeric run. Extracting the matches (instead of replacing
    # non-alphanumerics with " " and splitting on " ") avoids counting the empty
    # string left by leading/trailing punctuation - e.g. the "[" that opens
    # "[Verse 1]" - as an extra word.
    # NOTE: this runs on the raw `lyrics`, so section-tag words such as
    # "verse" and "chorus" are still part of the count.
    pl.col("lyrics").str.to_lowercase().str.extract_all(r"[a-z0-9]+").list.unique().list.len()
    .alias("number_of_unique_words")
)

### Metaphors

Get Entities (Location, Person)
as well as number of adjectives, verbs


In [None]:
# No speed difference with gpu or without
# spacy.require_gpu()

True

In [None]:
# Load the small English spaCy pipeline (downloaded in the imports section).
nlp = spacy.load("en_core_web_sm")


# Despite its name, `other_pipes` can only ever contain the lemmatizer (and is
# empty if the pipeline has none). extract_features reads POS tags, dependency
# labels and entities but never lemmas, so that component is switched off.
other_pipes = [pipe for pipe in nlp.pipe_names if pipe in ["lemmatizer"]]
nlp.disable_pipes(*other_pipes)


# df = pl.DataFrame(data)

# Function to extract adjectives, verbs, and named entities (locations and places)
def extract_features(doc):
    """Count part-of-speech and named-entity features of one parsed document.

    Args:
        doc: A spaCy ``Doc`` - any iterable of tokens exposing ``text``,
            ``pos_`` and ``dep_``, with an ``ents`` sequence whose items expose
            ``text`` and ``label_``.

    Returns:
        dict with, in this order:
            num_unique_adjectives: distinct ADJ token texts.
            num_times_verbs: every VERB token (repeats included).
            num_times_locations: every GPE/LOC entity (repeats included).
            is_location_mentioned: 1 if any location was found, else 0.
            num_unique_people: distinct PERSON entity texts.
            is_people_mentioned: 1 if any person was found, else 0.
            num_pronouns: PRON tokens whose dependency label is not 'poss'.
            is_pronouns_mentioned: 1 if any such pronoun was found, else 0.
    """
    # Single pass over the tokens; a token has exactly one POS tag, so the
    # branches are mutually exclusive.
    adjective_texts = set()
    verb_total = 0
    pronoun_total = 0
    for token in doc:
        tag = token.pos_
        if tag == 'ADJ':
            adjective_texts.add(token.text)
        elif tag == 'VERB':
            verb_total += 1
        elif tag == 'PRON' and token.dep_ != 'poss':
            pronoun_total += 1

    # Single pass over the named entities.
    location_total = 0
    person_names = set()
    for ent in doc.ents:
        if ent.label_ in ('GPE', 'LOC'):
            location_total += 1
        elif ent.label_ == 'PERSON':
            person_names.add(ent.text)

    people_total = len(person_names)
    return {
        'num_unique_adjectives': len(adjective_texts),
        'num_times_verbs': verb_total,
        'num_times_locations': location_total,
        "is_location_mentioned" : int(location_total > 0),
        "num_unique_people": people_total,
        "is_people_mentioned" : int(people_total > 0),
        "num_pronouns": pronoun_total,
        "is_pronouns_mentioned": int(pronoun_total > 0)
    }

# Parse all lyrics with `nlp.pipe`, which streams the texts through the spaCy
# pipeline in batches instead of calling `nlp(text)` once per row from a Python
# lambda. Each parsed document is reduced to its feature dict, and the list of
# dicts becomes a single struct column.
# NOTE: assumes `lyrics_cleaned` contains no nulls (the null-count check later
# in the notebook reports 0); `nlp.pipe` does not accept None.
lyrics_docs = nlp.pipe(df_songs_en["lyrics_cleaned"].to_list())
df_songs_en = df_songs_en.with_columns(
    pl.Series("special_features", [extract_features(doc) for doc in lyrics_docs])
)

In [None]:
# Expand the `special_features` struct into one column per feature
# (num_unique_adjectives, num_times_verbs, num_times_locations,
# is_location_mentioned, num_unique_people, is_people_mentioned, num_pronouns,
# is_pronouns_mentioned). Every later cell - null counts, correlation matrix,
# PCA biplots, per-genre summaries - selects these columns by name, so this step
# must run on a fresh kernel. `unnest` also removes the intermediate struct
# column, replacing the per-field `map_elements` calls plus `drop`.
df_songs_en = df_songs_en.unnest("special_features")

# 3. Analyze Correlation
- PCA Biplot correlation of features with popularity, by genre


- Metaphors, adjectives, unique words etc.. over min date by genre


- Compare artist with most number days + songs on charts and artist with only few days/one-hit-wonders on charts, in terms of features, by genre


In [None]:
df_songs_en.shape

(31203, 53)

In [None]:
df_songs_en.null_count()

title_hash,artist_hash,title,artist,median_rank,highest_rank,min_date,max_date,url,album,release_date,duration_ms,median_popularity,popularity,median_streams,total_streams,num_days_on_chart,is_explicit,af_danceability,af_energy,af_key,af_loudness,af_mode,af_speechiness,af_acousticness,af_instrumentalness,af_liveness,af_valence,af_tempo,af_time_signature,title_genius,tag,artist_genius,year,views,features,lyrics,id,language_cld3,language_ft,language,cluster_number,row_number,lyrics_cleaned,number_of_unique_words,num_unique_adjectives,num_times_verbs,num_times_locations,is_location_mentioned,num_pronouns,is_pronouns_mentioned,num_unique_people,is_people_mentioned
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0,0,2596,2593,2593,2593,2593,15339,0,0,2593,2593,2593,2593,2593,2593,2593,2593,2593,2593,2593,2593,2593,0,0,0,0,0,0,0,0,222,96,533,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
df_songs_en = df_songs_en.with_columns(
    pl.col("release_date").str.to_date("%Y-%m-%d")
)

In [None]:
df_songs_en = df_songs_en.with_columns(
    pl.col("release_date").dt.year().alias("release_year")
)

## Picking a Popularity Score

In [None]:
df_songs_en.select("popularity", "median_rank", "highest_rank").describe()

describe,popularity,median_rank,highest_rank
str,f64,f64,f64
"""count""",28610.0,31203.0,31203.0
"""null_count""",2593.0,0.0,0.0
"""mean""",32.435617,68.669615,39.177803
"""std""",25.907361,50.962542,43.511454
"""min""",0.0,1.0,1.0
"""25%""",0.0,33.0,11.0
"""50%""",36.0,44.0,26.0
"""75%""",53.0,108.0,44.0
"""max""",96.0,200.0,200.0


Analysis by rank features could also be interesting, as they are uncorrelated with popularity and total streams. I am going to save that for another time.

In [None]:
df_songs_en.filter(
    (pl.col("median_streams").is_null())
    & (pl.col("total_streams") == 0)
).shape

(15339, 53)

In [None]:
df_songs_en.filter(
    (pl.col("total_streams") == 0)
).shape

(15339, 53)

15339 English songs have null streams. This shows up as total_streams == 0 and median_streams being null because I am querying an aggregated dataset, and the default polars behaviour is for a sum over nulls to be 0 and a median over nulls to be null. Some songs in our original Spotify dataset belonged to the viral50 charts, which only have null streams; that is why we see total_streams values of 0.

I could use the popularity column for analysis since it has only a small number of null records, but for now I will use a weighted combination of highest_rank, median_rank, and number of days on chart: if a song is truly popular, it will be very high on the charts (highest_rank high), will have a pretty good overall rank (median_rank), and will have some form of longevity (number of days).

In [None]:
def min_max_column(column : str, reverse=False):
  """Build a polars expression that min-max scales `column`.

  The expression evaluates to (x - min) / (max - min), i.e. 0 for the column
  minimum and 1 for the column maximum. With ``reverse=True`` (the literal
  ``True``) it evaluates to 1 minus that, so the minimum maps to 1 and the
  maximum to 0.

  No guard is applied for a constant column, where max - min is 0.
  """
  values = pl.col(column)
  lowest = values.min()
  scaled = (values - lowest) / (values.max() - lowest)

  if reverse is True:
    return 1 - scaled
  return scaled

In [None]:
# Composite, chart-based popularity score. NOTE: this OVERWRITES the existing
# `popularity` column (the one summarised by the describe() call above) with a
# differently defined metric; every later use of "popularity" means this score.
df_songs_en = df_songs_en.with_columns(
    # Both rank columns are reversed, so a numerically small rank (near the top
    # of the chart) contributes a value close to 1.
    # The weights add up to 0.8 rather than 1; that has no effect on the final
    # score because the second with_columns min-max rescales the sum.
    (0.4 * min_max_column("highest_rank", reverse=True) +
    0.2 * min_max_column("median_rank", reverse=True) +
    0.2 * min_max_column("num_days_on_chart")).alias("popularity")
).with_columns(
    # Stretch the weighted sum so the final score spans exactly [0, 1].
    min_max_column("popularity")
)

In [None]:
df_songs_en.select("popularity").null_count()

popularity
u32
0


In [None]:
fig = px.histogram(
    df_songs_en,
    x="popularity"
)
fig.update_layout(title="Popularity Score Distribution")

In [None]:
fig = px.histogram(
    df_songs_en,
    x="popularity",
    facet_row="tag"
)
fig.update_layout(title="Popularity Score Distribution Per Genre",
                  height=1000)

## General EDA

Before analyzing songs by genre, going to make sense of counts and distributions

In [None]:
# Per genre tag: number of distinct (title, artist) pairs, and that number as a
# percentage of all rows in df_songs_en, largest genre first.
df_lyrics_tag_distribution = (
    df_songs_en
    .group_by("tag")
    .agg(
        pl.struct(["title_hash", "artist_hash"]).n_unique().alias("Number of Songs"),
        (
            pl.struct(["title_hash", "artist_hash"]).n_unique()
            / df_songs_en.shape[0] * 100
        ).round(2).alias("Percent"),
    )
    .sort("Number of Songs", descending=True)
)

In [None]:
fig = px.pie(df_lyrics_tag_distribution, names="tag", values="Number of Songs")
fig.update_traces(textinfo="percent+label")
fig.update_layout(title="Genre Distribution", margin=dict(b=120))

## Representation per Genre
Before we look at the features per genre, let's make sure each genre is fairly represented by artists

In [None]:
def check_artist_representation(df, descending=True):
  """Summarise how concentrated each genre's songs are among its artists.

  For every ``tag`` the result holds the number of distinct artists and the
  percentage of the genre's songs that belong to the first tenth of its
  artists (the most prolific tenth with ``descending=True``, the least
  prolific with ``descending=False``). The percentage is rounded to two
  decimals and returned as a string.

  NOTE(review): the slice relies on the second group_by keeping the sorted row
  order within each tag - confirm for the polars version in use.
  """
  songs_per_artist = (
      df.group_by(["tag", "artist_hash"])
      .agg(pl.count().alias("number_of_songs_per_artist_tag"))
      .sort("number_of_songs_per_artist_tag", descending=descending)
  )

  song_counts = pl.col("number_of_songs_per_artist_tag")
  # Songs owned by the first 10% of artists (integer division, so a genre with
  # fewer than 10 artists gets an empty slice) over all songs of the genre.
  top_decile_percent = song_counts.slice(0, pl.count() // 10).sum() / song_counts.sum() * 100

  return songs_per_artist.group_by(["tag"]).agg(
      pl.count().alias("num_artists_in_total"),
      top_decile_percent.round(2).cast(str).alias("top_10%_artist_song_percent"),
  )

check_artist_representation(df_songs_en)

tag,num_artists_in_total,top_10%_artist_song_percent
str,u32,str
"""rb""",1157,"""44.62"""
"""rock""",2396,"""44.36"""
"""rap""",2018,"""60.69"""
"""pop""",6654,"""43.5"""
"""misc""",133,"""20.0"""
"""country""",340,"""44.47"""


The top 10% of artists own ~44% of each genre's songs (misc aside), but the top 10% of rap artists own ~60% of that genre's songs.

## PCA Biplot
Correlate features with popularity, to see what makes a song popular.

Let's look at the features we're interested in first.

In [None]:
def get_most_popular_songs_per_tag(tag : str, descending=True, head=20):
    """Return the `head` most (or least) popular songs of one genre.

    Reads the notebook-level ``df_songs_en``. Songs whose ``tag`` equals `tag`
    are sorted by ``popularity`` - highest first by default, lowest first when
    ``descending=False`` - and the first `head` rows are returned with a fixed
    subset of descriptive columns.
    """
    summary_columns = [
        "id", "title", "artist", "duration_ms",
        "num_times_verbs", 'num_unique_adjectives',
        "is_location_mentioned", "is_people_mentioned",
        "popularity", "lyrics",
    ]
    return (
        df_songs_en
        .filter(pl.col("tag") == tag)
        .sort("popularity", descending=descending)
        .head(head)
        .select(summary_columns)
    )


In [None]:
interested_features = (
  df_songs_en.select("number_of_unique_words",
                     "num_times_verbs",
                    "num_unique_adjectives", "num_times_locations", "is_location_mentioned",
                    "num_pronouns", "num_unique_people", "is_people_mentioned",
                    "popularity", "num_days_on_chart")
)

In [None]:
interested_features.null_count()

number_of_unique_words,num_times_verbs,num_unique_adjectives,num_times_locations,is_location_mentioned,num_pronouns,num_unique_people,is_people_mentioned,popularity,num_days_on_chart
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0,0,0


In [None]:
interested_features.group_by("is_people_mentioned").count()

is_people_mentioned,count
i64,u32
1,8935
0,22268


In [None]:
interested_features.group_by("is_location_mentioned").count()

is_location_mentioned,count
i64,u32
0,25329
1,5874


In [None]:
df_interested_features = interested_features.drop_nulls().corr().to_pandas()

In [None]:
df_interested_features.index = df_interested_features.columns

In [None]:
fig = px.imshow(df_interested_features, color_continuous_scale=px.colors.sequential.RdBu[::-1])
fig.update_yaxes(ticksuffix=" ")
fig.update_layout(title="Correlation Matrix of Interesting Features<br>" +\
 "(To Decide what I can use to find correlation with popularity)")

number_of_unique_words looks like it might be redundant with num_unique_adjectives and num_times_verbs. num_pronouns looks redundant with num_times_verbs.

Think I will leave unique words and num_pronouns out of my analysis

In [None]:
music_features = (
  df_songs_en.select(
      pl.col("^af.*$")
  )
)

In [None]:
music_features_not_null = music_features.drop_nulls()
music_features_not_null_corr = music_features_not_null.corr().to_pandas()
music_features_not_null_corr.index = music_features_not_null_corr.columns

In [None]:
fig = px.imshow(music_features_not_null_corr, color_continuous_scale=px.colors.sequential.RdBu[::-1])
fig.update_yaxes(ticksuffix=" ")
fig.update_layout(title="Correlation Matrix of Music Features<br>" +\
 "(To Decide what I can use to find correlation with popularity)")

Only obvious correlation is energy and loudness. Make sure to not use those highly correlated features for PCA

In [None]:
from cuml.decomposition import PCA
from cuml.preprocessing import StandardScaler

In [None]:
def get_PCA(X):
  """Standardise X and project it onto its first two principal components.

  Uses the cuML ``PCA`` and ``StandardScaler`` imported in the cell above
  (that import shadows the scikit-learn ``PCA`` imported at the top of the
  notebook).

  Args:
      X: Numeric feature table exposing ``dropna``.
         # presumably a cudf.DataFrame, as built in plot_pca_biplot - confirm

  Returns:
      (components, loadings): the 2-component projection of the scaled rows,
      and the per-feature loadings.
      # NOTE(review): callers index these with `[0]` / `.loc[k, 0]`, so they
      # look like cuDF frames - confirm the cuML output type.
  """
  scaler = StandardScaler()
  # Rows with any missing value are dropped before scaling, so `components`
  # can have fewer rows than X.
  X_scaled = scaler.fit_transform(X.dropna(how="any"))
  pca = PCA(n_components=2)
  components = pca.fit_transform(X_scaled)
  # Loadings: each principal axis scaled by the square root of its explained
  # variance, with one row per input feature (hence the transpose).
  loadings = pca.components_.T @ cp.diag(cp.sqrt(pca.explained_variance_))
  return components, loadings

In [None]:
# # Test
# X = cudf.DataFrame({
#     "a" : [1.0, 4.0, 5.0, 6.0],
#     "b" : [3.0, 2.0, 3.0, 9.0],
#     "c" : [2.0, 8.0, 1.0, 5.0]
# })
# a = get_PCA(X)

In [None]:
# a = pl.DataFrame({
#     "a" : [10, 100],
#     "b" : [10, 100]
# })

# a.with_columns(
#     pl.col(["a", "b"]).log(10)
# )

In [None]:
def plot_popularity_over_time():
  """Unimplemented placeholder: takes no arguments and does nothing."""
  pass

def plot_pca_biplot(df : pl.DataFrame,
                    rows=1,
                    cols=1,
                    subplot_titles=[""],
                    facet=None,
                    target=["popularity"],
                    features=["num_unique_adjectives", "num_times_verbs",
                              "is_location_mentioned", "is_people_mentioned",
                              ],
                    # features=["af_key", "af_energy", "af_danceability",
                    #           "af_acousticness", "af_liveness", "af_tempo", "af_time_signature"]
                    group="",
                    group_in_pca=False,
                    color=px.colors.qualitative.Pastel1[0]):
  """Draw a PCA biplot (PC1/PC2 scatter plus one loading arrow per feature).

  Args:
      df: Source frame; must contain every column in `target` and `features`
          (and `facet`, when given).
      rows, cols: Subplot grid size. Only used meaningfully with `facet`;
          facet values beyond rows * cols are not drawn.
      subplot_titles: Titles for the subplots; replaced by the facet values
          when `facet` is given.
      facet: Optional column name. When set, a separate PCA is fitted for each
          distinct value and drawn in its own subplot (row-major order).
      target, features: Column names fed to the PCA, in that order. The list
          defaults are only read, never mutated.
      group: Legend name of the scatter trace; with ``group_in_pca=True`` it is
          forwarded to ``px.scatter(color=...)`` instead.
      group_in_pca: Draw the scatter through plotly express instead of a
          single ``go.Scatter`` trace.
      color: Marker colour of the ``go.Scatter`` trace.

  Returns:
      ``(fig, (loadings, components))`` where `loadings` and `components` are
      lists with one entry per fitted PCA, as returned by ``get_PCA``.
  """
  all_features = target + features

  X = cudf.DataFrame(
      df[all_features].to_pandas() if facet is None
      else df[all_features + [facet]].to_pandas()
  )

  ## Get components and loadings for each facet, or just single PCA
  components = []
  loadings = []
  if facet is not None:
    # make_subplots expects a plain list of strings, not a polars Series.
    facet_values = df[facet].unique(maintain_order=True).to_list()
    subplot_titles = [str(val) for val in facet_values]
    for val in facet_values:
      component, loading = get_PCA(X.loc[X[facet] == val, all_features])
      components.append(component)
      loadings.append(loading)
  else:
    component, loading = get_PCA(X)
    components.append(component)
    loadings.append(loading)

  ## Initialize Subplots
  fig = make_subplots(rows=rows, cols=cols, subplot_titles=subplot_titles)

  all_indices = [(i+1,j+1) for i in range(rows) for j in range(cols)]

  # zip stops at the shorter sequence, so grid cells without a PCA stay empty
  # and PCAs without a grid cell are skipped (same as the previous `break`s).
  for (i, j), component, loading in zip(all_indices, components, loadings):
    ## Plot Scatter
    if group_in_pca is False:
      fig.add_trace(
        go.Scatter(
            x=component[0].to_numpy(),
            y=component[1].to_numpy(),
            name=group,
            mode="markers",
            marker_color=color
        ),
        row=i, col=j
      )
    else:
      # px.scatter returns a whole Figure; only its traces can be added to fig.
      # NOTE(review): `color=group` is forwarded unchanged - with array input,
      # px presumably needs per-point labels here, not the default "". Confirm
      # the intended use of this branch.
      scatter_fig = px.scatter(
          component.to_numpy(),
          x=0, y=1,
          color=group
      )
      fig.add_traces(list(scatter_fig.data), rows=i, cols=j)

    ## Add Arrows
    # Axes created by make_subplots(rows, cols) are numbered row-major: the
    # subplot at (i, j) uses x{n}/y{n} with n = (i - 1) * cols + j, and the
    # first pair is plain "x"/"y". The arrow tail (ax, ay) must reference the
    # same axes as the head; building the refs from i and j separately pointed
    # the tail at the wrong subplot on any grid larger than 1x1.
    axis_number = (i - 1) * cols + j
    axis_suffix = "" if axis_number == 1 else str(axis_number)
    x_axis_ref = f"x{axis_suffix}"
    y_axis_ref = f"y{axis_suffix}"

    for k, feature in enumerate(all_features):
      # Arrow from the origin to the feature's (PC1, PC2) loading.
      fig.add_annotation(
          ax=0, ay=0,
          axref=x_axis_ref, ayref=y_axis_ref,
          xref=x_axis_ref, yref=y_axis_ref,
          x=loading.loc[k, 0],
          y=loading.loc[k, 1],
          showarrow=True,
          arrowsize=1,
          arrowhead=1,
          xanchor="right",
          yanchor="top",
          row=i, col=j
      )

      # Feature name placed at the arrow head.
      fig.add_annotation(
          x=loading.loc[k, 0],
          y=loading.loc[k, 1],
          ax=0, ay=0,
          xanchor="left",
          yanchor="auto",
          text=feature,
          row=i, col=j
      )

  return fig, (loadings, components)

In [None]:
fig, pca_objects = plot_pca_biplot(df_songs_en)

Look for link between adjectives, verb, location_mentioned, and people_mentioned

In [None]:
# def improve_text_position(x):
#     """ it is more efficient if the x values are sorted """
#     # fix indentation
#     positions = ['top left', 'top right']  # you can add more: left center ...
#     return [positions[i % len(positions)] for i in range(len(x))]



In [None]:
fig.update_xaxes(title="PC1")
fig.update_yaxes(title="PC2")
fig.update_layout(title="PCA Biplot of Popularity vs. num_unique_adjectives,<br>num_times_verbs, "+\
                        "is_location_mentioned, is_people_mentioned")
# fig.update_traces(textposition=improve_text_position(df_songs_en))
fig.show()

We can see that overall, there is no strong link between popularity and the number of adjectives/verbs in a song, or whether a person or location is mentioned in it. However, does this hold up for different genres?

In [None]:
# One PCA biplot per genre, in first-appearance order of the tags.
figures = []

for i, tag in enumerate(df_songs_en["tag"].unique(maintain_order=True)):

  fig, _ = plot_pca_biplot(df_songs_en.filter(pl.col("tag") == tag), subplot_titles=[tag],
                           # cycle through the palette if there are more genres than colours
                           color=px.colors.qualitative.Pastel1[i % len(px.colors.qualitative.Pastel1)])
  # plot_pca_biplot puts the first component on x and the second on y; the
  # titles were swapped here, unlike in the overall biplot above.
  fig.update_xaxes(title="PC1")
  fig.update_yaxes(title="PC2")
  figures.append(fig)

In [None]:
figures[0]

In [None]:
figures[1]

In [None]:
figures[2]

In [None]:
df_songs_en.filter(pl.col("tag") == "pop").select("num_times_verbs").describe()

describe,num_times_verbs
str,f64
"""count""",14046.0
"""null_count""",0.0
"""mean""",48.931796
"""std""",22.865709
"""min""",0.0
"""25%""",33.0
"""50%""",47.0
"""75%""",62.0
"""max""",412.0


In [None]:
most_popular_pop_songs = get_most_popular_songs_per_tag("pop")

In [None]:
least_popular_pop_songs = get_most_popular_songs_per_tag("pop", descending=False)

In [None]:
median = df_songs_en.filter(pl.col("tag") == "pop").select("num_times_verbs").quantile(0.5).item()

In [None]:
most_popular_pop_songs.select(
  (pl.col("num_times_verbs") <= median).sum() / most_popular_pop_songs.shape[0]
)

num_times_verbs
f64
0.6


In [None]:
# Share of the least popular pop songs with at least the median verb count.
# Divide by this frame's own row count, not the most-popular frame's: the two
# only coincide while both heads return the same number of rows.
least_popular_pop_songs.select(
  (pl.col("num_times_verbs") >= median).sum() / least_popular_pop_songs.shape[0]
)

num_times_verbs
f64
0.35


The biplot suggests an inverse relationship between num_times_verbs and popularity. Among the 20 least popular pop songs, only 35% have at least the median number of verbs, which does not support that relationship; among the 20 most popular pop songs, 60% have at most the median number of verbs, which does. Obviously, checking whether the verb counts of the top and bottom 20 songs fall above or below the median neither verifies nor invalidates the relationship, but it is good to see that the top songs lean slightly towards fewer verbs.

In [None]:
## Something Just Like This, ChainSmokers
# pprint.pprint(
#   most_popular_pop_songs.filter(pl.col("id") == 2998843)[0, "lyrics"]
# )

In [None]:
## Thunder, Imagine Dragons
# pprint.pprint(
#   most_popular_pop_songs.filter(pl.col("id") == 3044373)[0, "lyrics"]
# )

In [None]:
figures[3]

In [None]:
most_popular_rock_songs = get_most_popular_songs_per_tag("rock")
least_popular_rock_songs = get_most_popular_songs_per_tag("rock", descending=False)

In [None]:
df_songs_en.filter(pl.col("tag") == "rock").select("num_times_verbs").describe()

describe,num_times_verbs
str,f64
"""count""",6565.0
"""null_count""",0.0
"""mean""",42.943945
"""std""",20.797973
"""min""",0.0
"""25%""",29.0
"""50%""",40.0
"""75%""",54.0
"""max""",390.0


In [None]:
figures[4]

In [None]:
most_popular_country_songs = get_most_popular_songs_per_tag("country")
least_popular_country_songs = get_most_popular_songs_per_tag("country", descending=False)

In [None]:
median = df_songs_en.filter(pl.col("tag") == "country").select("num_times_verbs").quantile(0.5).item()

In [None]:
most_popular_country_songs.select(
  (pl.col("num_times_verbs") > median).sum() / most_popular_country_songs.shape[0]
)

num_times_verbs
f64
0.55


In [None]:
# Share of the least popular country songs with more verbs than the median.
# Divide by this frame's own row count, not the most-popular frame's: the two
# only coincide while both heads return the same number of rows.
least_popular_country_songs.select(
  (pl.col("num_times_verbs") > median).sum() / least_popular_country_songs.shape[0]
)

num_times_verbs
f64
0.55


The PCA biplot for country songs suggests no correlation between popularity and num_times_verbs, and both most_popular_country_songs (top 20) and least_popular_country_songs (bottom 20) are consistent with that: in both groups, 55% of the songs have more verbs than the median.

In [None]:
figures[5]

We can see popularity correlating with features inside most genres. Let's see the trend of our popularity over time

## Bar Plot of Average Popularity over Min Date, with Dual Axis of Number of Songs

In [None]:
# Parse min_date, then sort the WHOLE frame by it (group_by_dynamic in the next
# cell needs its index column sorted). Calling `.sort()` on the expression
# inside with_columns sorted only that one column, which detached each date
# from the song it belonged to.
df_songs_en = df_songs_en.with_columns(
    pl.col("min_date").str.to_date("%Y-%m-%d"),
).sort("min_date")

In [None]:
def plot_over_time(df):
  """Plot average popularity (line) and number of songs (bars) per 2-month bin.

  Args:
      df: Frame with a Date ``min_date`` column, ``popularity``, the ``af_*``
          audio features and the lyric feature columns.
          # assumes df is sorted by min_date, which group_by_dynamic
          # presumably requires - confirm

  Returns:
      A plotly Figure with the song count on the right-hand axis (``y2``,
      drawn below the traces) and the average popularity on the left-hand axis.
  """
  df_songs_en_over_time = df.group_by_dynamic("min_date", every="2mo", label='left').agg(
    pl.col("popularity").mean().alias("avg_popularity"),
    pl.count().alias("number_of_songs"),
    # The feature means below are computed but not plotted yet; add their names
    # to `line_columns` to draw them.
    pl.col("^af.*$").mean().name.map(lambda c: c + "_mean"),
    pl.col(["num_unique_adjectives", "num_times_verbs",
                              "is_location_mentioned", "is_people_mentioned",
                              ]).mean().name.map(lambda c: c + "_mean")
  )

  fig = go.Figure()

  # Song counts as bars on the secondary y-axis
  fig.add_trace(
      go.Bar(x=df_songs_en_over_time["min_date"], y=df_songs_en_over_time["number_of_songs"],
            name="number_of_songs",
            marker_color=px.colors.qualitative.Light24[3],
            yaxis="y2"
      ),
  )

  # Must be a real tuple: `("avg_popularity")` is just a string, which turned
  # `col not in ...` into a substring test.
  line_columns = ("avg_popularity",)

  # Enumerate over all columns so each line keeps a colour tied to its column
  # position in the aggregated frame.
  for i, col in enumerate(df_songs_en_over_time.columns):
    if col not in line_columns:
      continue

    fig.add_trace(
      go.Scatter(
          x=df_songs_en_over_time["min_date"],
          y=df_songs_en_over_time[col],
          mode="lines+markers",
          name=col,
          marker_color=px.colors.qualitative.Light24[i],
      ),
    )

  fig.update_xaxes(title_text="Min Date (2-month Bins)")

  fig.update_layout(
      title_text="Average Popularity over Time",
      # Primary axis (lines) is overlaid on top of the bar axis.
      yaxis=dict(
          title_text="values",
          layer="above traces",
          overlaying='y2',
          side="left"
      ),
      # Secondary axis (bars) sits on the right, behind the traces.
      yaxis2=dict(
          title_text="number of songs",
          side="right",
          layer='below traces'
      ),
      # Legend anchored just above the top-right corner of the plot area.
      legend=dict(
          x=1,
          y=1,
          xanchor="right",
          yanchor="bottom"
      )
  )

  return fig

In [None]:
plot_over_time(df_songs_en)

In [None]:
figures = []
for i, tag in enumerate(df_songs_en["tag"].unique(maintain_order=True)):
  fig = plot_over_time(df_songs_en.filter(pl.col("tag") == tag))
  fig.update_layout(title=f"Average Popularity over Time for {tag}")
  figures.append(fig)

In [None]:
figures[0]

In [None]:
figures[1]

In [None]:
figures[2]

In [None]:
figures[4]

In [None]:
figures[5]