# Spotify Genius Lyrics Data Exploration

**Date:** June 13th, 2024

**Author:** Harris Zheng

**Description:** Explore Spotify & Genius Lyrics Dataset

# Imports

In [8]:
import polars as pl
import os
import logging
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import string
import re
import pprint
from polars import DataFrame
from itables import init_notebook_mode, show
init_notebook_mode(all_interactive=False)

<IPython.core.display.Javascript object>

In [9]:
# Customize Plotly figure styles
def bnw_template() -> go.layout.Template:
    """Build the notebook's custom "bnw" Plotly template.

    The template combines layout defaults (axes, margins, fonts, a light
    background, pan as the default drag mode, hidden legend) with default
    hover templates for bar, scatter, pie and histogram traces.

    Returns:
        go.layout.Template: the assembled template.
    """
    # Layout defaults applied to every figure that uses the template.
    layout_defaults = go.Layout(
        xaxis=go.layout.XAxis(showgrid=False, zeroline=True),
        yaxis=go.layout.YAxis(showgrid=True, zeroline=True),
        margin=go.layout.Margin(l=100, b=100),
        font=go.layout.Font(family="Arial, Verdana", size=19),
        title=go.layout.Title(
            font=go.layout.title.Font(family="Calibri Black, Arial Black", size=25),
        ),
        legend=go.layout.Legend(font=go.layout.legend.Font(size=17)),
        plot_bgcolor="#f0f0f0",
        paper_bgcolor="#f0f0f0",
        dragmode="pan",
        showlegend=False,
        colorway=px.colors.qualitative.D3,
        uniformtext_minsize=12,
        uniformtext_mode="hide",
    )

    # Hover templates, one default trace per trace type.
    x_value_hover = "<b>%{x}</b><br><i>Value</i>: %{y}"
    pie_hover = "<b>%{label}</b><br><i>Count</i>: %{value}<br><i>Percent</i>: %{percent}"
    trace_defaults = dict(
        bar=[go.Bar(hovertemplate=x_value_hover)],
        scatter=[go.Scatter(hovertemplate="<b>%{x}</b><br><i>Y</i>: %{y}")],
        pie=[go.Pie(hovertemplate=pie_hover)],
        histogram=[go.Histogram(hovertemplate=x_value_hover)],
    )

    return go.layout.Template(layout=layout_defaults, data=trace_defaults)

# Add template
pio.templates["bnw"] = bnw_template()

# If there are multiple templates delimited by +, template at end of string takes precedence
px.defaults.template = "bnw"

In [10]:
## Polars Performance Logger if needed
# performance_logger = logging.getLogger("polars_perf_logs")
# performance_logger.setLevel(logging.INFO)

# if not performance_logger.handlers:
#     fh = logging.FileHandler('./polars_perf_logs.log', encoding="utf-8")
#     formatter = logging.Formatter('%(name)s - %(asctime)s - %(levelname)s - %(message)s')
#     fh.setFormatter(formatter)
#     performance_logger.addHandler(fh)

In [11]:
def get_dataset_path(curpath: "str | None" = None) -> str:
    '''
        Look for TESTS folder in parent folders, to link to datasets.
        (This function was not used in this notebook)

        curpath: directory to start searching from. Defaults to the
        current working directory, resolved when the function is called
        (not when it is defined).

        Returns the absolute path of the "datasets" folder inside the
        nearest directory named TESTS (curpath itself or one of its parents).

        Raises FileNotFoundError if no directory named TESTS is found
        before reaching the filesystem root.
    '''
    start = os.path.abspath(os.getcwd() if curpath is None else curpath)

    curpath = start
    while os.path.basename(curpath) != "TESTS":
        parent = os.path.dirname(curpath)
        # At the filesystem root the parent equals the path itself;
        # stop here instead of looping forever.
        if parent == curpath:
            raise FileNotFoundError(
                f"No 'TESTS' folder found in {start!r} or any of its parent folders"
            )
        curpath = parent

    return os.path.join(curpath, "datasets")

## Manually set the path that contains relevant datasets
DATASET_PATH = './data'

In [12]:
def filter_regex_in_column(df : pl.DataFrame, column: str, regex: str) -> pl.DataFrame:
    '''
        Keep only the rows of a dataframe whose value in `column`
        contains a match for `regex`.
    '''
    has_match = pl.col(column).str.contains(regex)
    return df.filter(has_match)

def extract_puncs(df: pl.DataFrame, column: str, regex: str) -> pl.DataFrame:
    '''
        Collect every match of `regex` found in `column`, then count
        how many times each distinct match occurs.

        Returns pl.DataFrame sorted by count (largest first), where
        puncs holds the matched text and len holds its number of
        occurrences

        puncs | len
        ----- | -----
        .     | 23344
        {     | 2331
    '''
    all_matches = pl.col(column).str.extract_all(regex).alias("puncs").explode()
    match_counts = df.select(all_matches).group_by("puncs").len()
    return match_counts.sort("len", descending=True)

def check_punctuation_counts(df: pl.DataFrame, column: str) -> list[tuple[str, int]]:
    '''
        For every character in string.punctuation, count how many rows
        of `column` contain that character at least once.

        Returns a list of (punctuation, number_of_rows) tuples, one per
        character, in the order of string.punctuation.
    '''
    return [
        (
            punc,
            # A single-character class with the character escaped, so regex
            # metacharacters such as "." or "[" are matched literally.
            df.filter(
                pl.col(column).str.contains(f"[{re.escape(punc)}]")
            ).shape[0],
        )
        for punc in string.punctuation
    ]

def get_regex_extracts(df: pl.DataFrame, column : str, regex: str):
    '''
        Return a pl.DataFrame with a single column (named `column`)
        holding every match of `regex` found in that column, one match
        per row, with empty-string matches removed
    '''
    matches = pl.col(column).str.extract_all(regex).explode()
    is_not_empty = pl.col(column) != ""
    return df.select(matches).filter(is_not_empty)


In [13]:
pl.set_random_seed(42)

# Load Datasets

In [14]:
## Eagerly read only the listed columns from the merged Spotify charts CSV.
## NOTE(review): the original note said "scan the csv", so a lazy pl.scan_csv
## may have been intended here — confirm.
df_spotify_charts = (
    pl.read_csv(os.path.join(DATASET_PATH, "merged_data.csv"),
                columns=["title", "rank", "date", "artist", "chart", "region", "streams",
                        "release_date", "popularity", "duration_ms", "track_id", "album", "explicit"]
    )
)

In [None]:
df_spotify_charts.shape

In [15]:
df_spotify_charts.null_count()

title,rank,date,artist,region,chart,streams,track_id,album,popularity,duration_ms,explicit,release_date
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
11,0,0,18,0,0,5852308,304251,323427,304251,304251,304251,304251


In [16]:
df_lyrics = pl.read_csv(os.path.join(DATASET_PATH, "song_lyrics.csv"))

In [None]:
df_lyrics.shape

In [17]:
df_lyrics.null_count()

title,tag,artist,year,views,features,lyrics,id,language_cld3,language_ft,language
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
165,0,0,0,0,0,0,0,90966,134322,226918


In [11]:
# df_lyrics.select(pl.exclude("lyrics"))

# Explore Song Lyrics Dataset

## Explore Language Columns

**language_cld3:** Lyrics language according to the CLD3 model

**language_ft:** Lyrics language according to FastText model

**language:** Lyrics language agreed on by both language_cld3 and language_ft. It is null if the languages detected by the CLD3 model and the FastText model are not equal

In [12]:
language_count = df_lyrics.group_by("language").len().to_pandas()

In [13]:
fig = px.pie(language_count, names="language", values="len")
fig.update_layout(title="Counts of column 'language'", height=600)
fig.update_traces(texttemplate="%{label}<br>%{percent}", textposition='inside',
                  textfont_size=19)
fig.add_annotation(text=f"Number of Rows: {sum(language_count['len'])}", ax=0, ay=0, x=1, y=0, xanchor="right", yanchor="top")

## We can filter for lyrics where either of the models detected en

In [14]:
df_lyric_diff_lyrics = df_lyrics.select("title","artist","language_cld3", "language_ft").filter(
    (pl.col("language_cld3") != pl.col("language_ft")) & (
        (pl.col("language_cld3") == "en") | 
        (pl.col("language_ft") == "en" )
    )
)

In [15]:
df_lyric_diff_lyrics.sample(100)

title,artist,language_cld3,language_ft
str,str,str,str
"""Hip Hop Seoul-Ja""","""Jinusean""","""en""","""ko"""
"""Love Me""","""Meldom""","""da""","""en"""
"""Purple Shoes""","""Period Tears""","""haw""","""en"""
"""Samurai""","""Juno Reactor""","""ja""","""en"""
"""SHEEEEESH""","""Elevate (Remix)""","""af""","""en"""
…,…,…,…
"""Sixteenth Note Beloved Mashoug…","""Siriya Ensemble""","""en""","""fa"""
"""Stargazer""","""Noustan & Cristian Garcia""","""id""","""en"""
"""Do You Speak English?""","""Yang Joon Il ()""","""ko""","""en"""
"""Alice Saw""","""Damon Albarn""","""pl""","""en"""


Examples of Language detection mismatches between cld3 and FastText:

https://genius.com/Genius-pagsasalin-sa-filipino-blackpink-lovesick-girls-filipino-translation-lyrics

https://genius.com/All-made-by-donut-321-roken-through-the-night-lyrics

https://genius.com/Ko-tarzan-lyrics

## Explore Null Titles

In [16]:
df_lyrics_null = df_lyrics.filter(pl.col("title").is_null())
df_lyrics_null

title,tag,artist,year,views,features,lyrics,id,language_cld3,language_ft,language
str,str,str,i64,i64,str,str,i64,str,str,str
,"""misc""","""IUZ""",2012,1110,"""{I♥UZ}""","""[Verse 1] She said life ain't …",99718,"""en""","""en""","""en"""
,"""rock""","""Touch Amor""",2011,19414,"""{""Touché Amoré""}""","""I'm parting the sea between br…",106734,"""en""","""en""","""en"""
,"""rap""","""Cymek""",2013,158,"""{}""","""[VERSE 1] Chcesz mnie pokonać?…",204824,"""pl""","""pl""","""pl"""
,"""rap""","""Ahfueaefasf""",2013,294,"""{}""","""Intro: Yea Forte Dreams Rap Re…",239387,"""en""","""en""","""en"""
,"""rap""","""Riot (PL)""",2013,65,"""{Boras,Świerzy}""","""[Riot.] Wchodzę z buta w rapg…",269691,"""pl""","""pl""","""pl"""
…,…,…,…,…,…,…,…,…,…,…
,"""rap""","""StonedAKhana""",2021,2,"""{}""","""All stacks in son I'm a differ…",7581123,"""en""","""en""","""en"""
,"""pop""","""SEULEMENT""",2021,1,"""{}""","""[Paroles de ""++""] [Couplet 1]…",7590029,"""fr""","""fr""","""fr"""
,"""rock""","""Cloudwatch""",2022,2,"""{}""","""Time is a lie Bound to this ea…",7777869,"""en""","""en""","""en"""
,"""pop""","""Arrezzia Menendez""",2020,2,"""{}""","""[Lletra de ""+=""] Ja em va bé …",7842403,"""ca""","""ca""","""ca"""


In [17]:
df_lyrics_null.shape

(165, 11)

Samples:

Some song titles are not alphanumeric (For example, "|", ",", "....."), and I don't think this dataset's scraper parses them.

First row song should be , - I♥UZ https://genius.com/Iuz--annotated

Second row song should be ~ - Touche Amore (Notice the French e's are also not parsed) https://genius.com/Touche-amore--lyrics

## Check null title song lyrics

, - I♥UZ

In [18]:
pprint.pprint(df_lyrics_null[0, "lyrics"][0:600])

('[Verse 1]\n'
 "She said life ain't about smoking weed and fucking bitches grow up\n"
 "Tears rollin' down my face heavily so I smoked one\n"
 'I loved you girl I swear I did I never meant to hurt you\n'
 'But what goes around comes around and I was in the circle\n'
 "Now im feelin helpless, words can't explain my fuckin pain\n"
 'But I dont expect for you to feel bad cause I put you through the same\n'
 'My friends like "Man lets hit the club" and right now gettin gone sounds '
 'pretty good\n'
 'Cause I dont wanna think about you anymore\n'
 "But whats love if it ain't worth dyin for\n"
 '\n'
 '[Hook]\n'
 "I wasn't ready, I wasn't ready, I wasn't ready\n"
 'So')


~ - Touche Amore

In [19]:
pprint.pprint(df_lyrics_null[1, "lyrics"][0:600])

("I'm parting the sea between brightness and me\n"
 'Before I drown myself and everyone and everything\n'
 '\n'
 'I got my hands behind my back\n'
 'With two fingers overlapped\n'
 'Because I can never keep my word\n'
 'And I share with the undeserved\n'
 'For some cheap acceptance in return\n'
 'The water will shape the sides\n'
 "And I'll walk with my head held high\n"
 'And when it all comes crashing in\n'
 "It will be worth it, if I'm still breathing\n"
 '\n'
 'If actions speak louder than words\n'
 "I'm the most deafening noise you've heard\n"
 "I'll be that ringing in your ears\n"
 'That will stick around for years')


We have evidence that punctuation may be missing from the "title" column in the Genius Song Lyrics dataset.

## Check Empty String Titles

In [20]:
## If the first run of separator characters is as long as the whole title,
## then the title consists of separator characters only.
## Raw string: "\p" is not a valid Python escape sequence.
df_lyrics_empty = df_lyrics.filter(
    pl.col("title").str.extract(r"([\p{separator}]+)", 1).str.len_chars() == pl.col("title").str.len_chars()
).select("title", "artist", "lyrics", "views")

In [21]:
df_lyrics_empty

title,artist,lyrics,views
str,str,str,i64
""" ""","""Ruby Throat""","""(...) Somebody go... Break al…",666
""" ""","""Krzysztof Kamil Baczyski""","""Raz - dwa - trzy - cztery - ni…",7
""" """,""" (Vadim Samoylov)""","""Беда, да не одна Под ""Градами""…",1820
""" ""","""Vlote koku""","""[Refrén] Dej něco žaludku Ať c…",4
""" ""","""Rakoczy""","""[Intro: Rakoczy] Osiem gwiazde…",16
""" ""","""Komil""","""[Refren] Jak flama, flama.flam…",5602
""" ""","""DonGURALesko x Shellerini x Ta…","""[Refren] Whoop, whoop, cała Po…",1333


In [22]:
df_lyrics.filter(pl.col("artist") == "Krzysztof Kamil Baczyski").select("title", "artist", "views")

title,artist,views
str,str,i64
""" ""","""Krzysztof Kamil Baczyski""",7
"""Elegia o ... chłopcu polskim""","""Krzysztof Kamil Baczyski""",250
"""Pokolenie""","""Krzysztof Kamil Baczyski""",13
"""Z lasu""","""Krzysztof Kamil Baczyski""",6
"""Pożegnanie""","""Krzysztof Kamil Baczyski""",11
…,…,…
"""Pisz do mnie listy...""","""Krzysztof Kamil Baczyski""",15
"""Samotność""","""Krzysztof Kamil Baczyski""",12
"""Romantyczność""","""Krzysztof Kamil Baczyski""",13
"""Młot""","""Krzysztof Kamil Baczyski""",4


## Check Empty String Artists

In [23]:
## If the first run of separator characters is as long as the whole artist name,
## then the artist name consists of separator characters only.
## Raw string: "\p" is not a valid Python escape sequence.
df_lyrics_artist_empty = df_lyrics.filter(
    pl.col("artist").str.extract(r"([\p{separator}]+)", 1).str.len_chars() == pl.col("artist").str.len_chars()
).select("title", "artist", "lyrics", "views")

In [24]:
df_lyrics_artist_empty.shape

(0, 4)

## Explore years column

In [25]:
df_lyrics.select("year").null_count()

year
u32
0


In [26]:
df_lyrics.select("year").dtypes

[Int64]

In [27]:
df_lyrics.select("year").describe()

statistic,year
str,f64
"""count""",5134856.0
"""null_count""",0.0
"""mean""",2010.303073
"""std""",45.011921
"""min""",1.0
"""25%""",2009.0
"""50%""",2016.0
"""75%""",2019.0
"""max""",2100.0


### Year too small?
Check songs with year <= 1000

In [28]:
show(df_lyrics.filter(pl.col("year") <= 1000).select("year").unique().sort("year"))

year
Loading... (need help?)


In [29]:
show(df_lyrics.filter(pl.col("year") <= 1000).select("id", "title", "artist", "year"))

id,title,artist,year
Loading... (need help?),,,


In [30]:
show(df_lyrics.filter((pl.col("year") >= 900) & (pl.col("year") <= 1000)).select("id", "title", "artist", "year").sample(10))

id,title,artist,year
Loading... (need help?),,,


I'm pretty positive these songs aren't from the year 1000

In [31]:
df_lyrics.filter(
    (pl.col("title") == "Flop Fo No Hoe 2")
)

title,tag,artist,year,views,features,lyrics,id,language_cld3,language_ft,language
str,str,str,i64,i64,str,str,i64,str,str,str
"""Flop Fo No Hoe 2""","""rap""","""Big floppa""",1000,70,"""{}""","""Uh daddy Uh daddy Uh daddy Uh …",6632163,"""cy""",,


In [32]:
## Check how many songs are from year 1000 or earlier
df_lyrics.group_by((pl.col("year") <= 1000).alias("is_year_<=_1000")).agg(
    pl.len().alias("count"),
    (pl.len()  / df_lyrics.shape[0] * 100).round(2).alias("pct")
)

is_year_<=_1000,count,pct
bool,u32,f64
False,5132609,99.96
True,2247,0.04


Only 0.04% of songs have a year of 1000 or earlier

In [33]:
fig = px.histogram(df_lyrics.filter(pl.col("year") <= 1000).select("year"))
fig.update_layout(title="Distribution of years <= 1000 (Genius Lyrics)")
fig.update_xaxes(title="year")

In [34]:
show(df_lyrics.filter(pl.col("year") <= 100).select("title", "artist", "year"))

title,artist,year
Loading... (need help?),,


Example of incorrect year input: https://genius.com/Ghetto-smaxx-we-gettin-money-lyrics, https://genius.com/Mike-lu-i-rise-lyrics

In [35]:
df_lyrics.filter(
    (pl.col("title") == "We Gettin Money") & 
    (pl.col("artist") == "Ghetto Smaxx")
).select("title", "artist", "year")

title,artist,year
str,str,i64
"""We Gettin Money""","""Ghetto Smaxx""",1


In [36]:
df_lyrics.filter(
    (pl.col("title") == "I Rise") & 
    (pl.col("artist") == "Mike Lu")
).select("title", "artist", "year")

title,artist,year
str,str,i64
"""I Rise""","""Mike Lu""",1


We can see that the Genius Lyrics scraper sometimes encodes only part of the year (for example, 1900 would be encoded as 900). In the examples above, songs from 2001 are encoded as year 1. However, this affects only a very small number of cases (<0.04%). Note that not all of the 0.04% of songs with year <= 1000 have data encoding errors, because some song lyrics really are from before the year 1000 (e.g. Holy Bible)

### Year too big?

In [37]:
show(df_lyrics.filter(pl.col("year") > 2024).select("year").unique().sort("year"))

year
Loading... (need help?)


In [38]:
show(df_lyrics.filter(pl.col("year") > 2024).select("id", "title", "artist", "year", "views"))

id,title,artist,year,views
Loading... (need help?),,,,


### Year Distribution

In [45]:
df_lyrics_year = df_lyrics.select("year")

In [52]:
## Decade break points 1900, 1910, ..., 2020 (13 breaks -> 14 labelled bins)
year_bin_breaks = [1900 + i for i in range(0, 130, 10)]
year_bin_labels = (
    ["Past-1900"]
    + [f"{1900 + 10*i}-{1900 + 10*i + 10}" for i in range(0, 12)]
    + ["2020-Future"]
)
df_lyrics_year_bin = df_lyrics_year.with_columns(
    pl.col("year").cut(year_bin_breaks, labels=year_bin_labels).alias("year_bin")
)

## Share of all lyrics rows that fall in each bin, in percent
year_bin_percent = (pl.len() / df_lyrics.shape[0] * 100).round(2)
df_lyrics_year_bin_count = (
    df_lyrics_year_bin.group_by("year_bin")
    .agg(
        pl.len().alias("count"),
        year_bin_percent.alias("percent"),
        (year_bin_percent.cast(pl.String) + "%").alias("percent_string"),
    )
    ## Accumulate percentages in descending bin order, then sort ascending again
    .sort("year_bin", descending=True)
    .with_columns(pl.col("percent").cum_sum().round(2).alias("cum_sum_percent"))
    .sort("year_bin")
)

In [53]:
show(df_lyrics_year_bin_count)

year_bin,count,percent,percent_string,cum_sum_percent
Loading... (need help?),,,,


In [77]:
fig = px.bar(df_lyrics_year_bin_count, x="year_bin", y="count")
fig.update_layout(title="Distribution of Song Titles Count By Year of Release (Genius Song Lyrics)")

Most songs originate from 2010-2020, followed by 2000-2010 and 2020-Future. Looking at the cumulative sum (`cum_sum_percent` column), 21st-century songs make up 86.58% of the Genius Lyrics dataset.

## Views Distribution

In [57]:
df_lyrics.select("views").describe([0.25, 0.5, 0.75, 0.8, 0.85, 0.95])

statistic,views
str,f64
"""count""",5.134856e6
"""null_count""",0.0
"""mean""",3060.939181
"""std""",47309.800698
"""min""",0.0
…,…
"""75%""",448.0
"""80%""",692.0
"""85%""",1165.0
"""95%""",6448.0


## Genius Artists

In [58]:
df_lyrics_artist_count = df_lyrics.group_by("artist").len().sort("len", descending=True)

In [59]:
df_lyrics_genius = df_lyrics.filter(pl.col("artist").str.contains("(?i)genius"))
df_lyrics_genius.shape

(76798, 11)

In [60]:
show(df_lyrics_artist_count.head(20))

artist,len
Loading... (need help?),


In [61]:
show(df_lyrics.filter(
    pl.col("artist") == "Lil Wayne"
).select("title", "artist"))

title,artist
Loading... (need help?),


## Tag Distribution

In [63]:
## Number of songs per tag and their share of all lyrics rows,
## formatted as a percent string
df_lyrics_tag_distribution = (
    df_lyrics.group_by("tag")
    .agg(
        pl.len().alias("Number of Songs"),
        (
            (pl.len() / df_lyrics.shape[0] * 100).round(2).cast(pl.String) + "%"
        ).alias("Percent"),
    )
    .sort("Number of Songs", descending=True)
)

In [76]:
fig = px.pie(df_lyrics_tag_distribution, names="tag", values="Number of Songs")
fig.update_traces(textinfo="percent+label")
fig.update_layout(title="Genre Distribution (Genius Song Lyrics)", margin=dict(b=120))

## Explore Join Columns

### For Title column
do Genius Song Lyric titles contain punctuation?

In [78]:
df_lyrics.select("title").null_count()

title
u32
165


#### Check normal Python punctuations

In [79]:
escaped_punc = re.escape(string.punctuation)
ODD_TITLES_REGEX = f"[{escaped_punc}]"

In [80]:
escaped_punc

'!"\\#\\$%\\&\'\\(\\)\\*\\+,\\-\\./:;<=>\\?@\\[\\\\\\]\\^_`\\{\\|\\}\\~'

Load characters and try match

In [81]:
puncs = pl.DataFrame({
    "punc" : [character for character in string.punctuation]
})

In [82]:
puncs.select(
    pl.col("punc").str.contains(ODD_TITLES_REGEX)
).group_by((pl.col("punc") == True).alias("is_punc_match_regex")).len()

is_punc_match_regex,len
bool,u32
True,32


regex matches all punctuations in string.punctuation

In [83]:
df_lyrics_title_punctuation = filter_regex_in_column(df_lyrics, "title", ODD_TITLES_REGEX)

In [84]:
df_lyrics_title_punctuation.shape

(436767, 11)

Genius Song Lyric titles contain punctuations, but we know from our null title exploration that some are not covered

In [85]:
df_lyrics_title_punctuation.head()

title,tag,artist,year,views,features,lyrics,id,language_cld3,language_ft,language
str,str,str,i64,i64,str,str,i64,str,str,str
"""What Happened to That Boy?""","""rap""","""Birdman""",2002,100347,"""{Clipse}""","""[Intro: Birdman] Ayy-ayy, ayy,…",17,"""en""","""en""","""en"""
"""Losing Weight Pt. 2""","""rap""","""Cam'ron""",2002,32712,"""{""Cam\\'ron"",""Juelz Santana""}""","""[Chorus: Cam'ron] Ayo, fuck lo…",19,"""en""","""en""","""en"""
"""Mr. Carter""","""rap""","""Lil Wayne""",2008,542488,"""{JAY-Z}""","""[Produced by Infamous and Drew…",126,"""en""","""en""","""en"""
"""Encore / Curtains Down""","""rap""","""Eminem""",2004,187204,"""{""50 Cent"",""Dr. Dre""}""","""[Intro: 50 Cent & Eminem] Sh-s…",1999,"""en""","""en""","""en"""
"""C.R.E.A.M.""","""rap""","""Wu-Tang Clan""",1994,1984638,"""{}""","""[Produced by RZA] [Intro: Rae…",28,"""en""","""en""","""en"""


While not all punctuations are missing, we know certain punctuations are definitely missing, so let's find out which ones they are

In [86]:
lyrics_title_punc_counts = check_punctuation_counts(df_lyrics_title_punctuation, column="title")

In [87]:
## Each tuple is one row (punctuation, count), so state the orientation
## explicitly instead of relying on inference.
genius_title_punctuation = pl.DataFrame(
    lyrics_title_punc_counts,
    schema=["punctuation", "number_of_songs_having_punctuation"],
    orient="row",
)

In [88]:
show(genius_title_punctuation)

punctuation,number_of_songs_having_punctuation
Loading... (need help?),


We see numerous punctuation characters with a count of 0: they are either not present in the Genius song lyric titles, or just not parsed by the scraper. We know that characters like & in song titles such as "You & I", or + in "34+35", are not parsed

In [89]:
df_lyrics.filter(df_lyrics["title"] == "3435")

title,tag,artist,year,views,features,lyrics,id,language_cld3,language_ft,language
str,str,str,i64,i64,str,str,i64,str,str,str
"""3435""","""rb""","""Ariana Grande""",2020,2691880,"""{}""","""[Intro] Hmm [Verse 1] You mig…",6071429,"""en""","""en""","""en"""
"""3435""","""pop""","""Darla Aguilar Zepeda""",2021,91,"""{}""","""[Intro] Hmm [Verse 1] You mig…",6630543,"""en""","""en""","""en"""
"""3435""","""pop""","""Ariana Grande""",2020,50,"""{}""","""[Intro] Hmm [Verse 1] You mig…",7696749,"""en""","""en""","""en"""
"""3435""","""pop""","""Ally Salort""",2021,4,"""{}""","""[Verse 1] You might think I'm …",7740671,"""en""","""en""","""en"""


In [90]:
df_lyrics.filter(df_lyrics["title"] == "YouI")

title,tag,artist,year,views,features,lyrics,id,language_cld3,language_ft,language
str,str,str,i64,i64,str,str,i64,str,str,str
"""YouI""","""pop""","""BOL4""",2016,6428,"""{}""","""[Verse 1] 바보야 오늘은 안된다고 말하지마 오늘…",2917461,"""ko""","""ko""","""ko"""
"""YouI""","""rap""","""Cubes Naila""",2019,7,"""{""Reezy (USA)""}""","""Intro: [Reezy] You&I.. Bridge…",4950385,"""en""","""en""","""en"""
"""YouI""","""pop""","""Christina Zeile""",2021,10,"""{}""","""Drove five hours just to see y…",6414628,"""en""","""en""","""en"""


#### Check open and closed characters

In [91]:
## Unicode punctuation categories: Pd (dash), Ps (open), Pe (close),
## Pi (initial quote), Pf (final quote).
## Every class needs the "\p" prefix: "\{Pi}" would match the literal text "{Pi}".
df_lyrics_open_closed = extract_puncs(df_lyrics, "title", r'\p{Pd}|\p{Ps}|\p{Pe}|\p{Pi}|\p{Pf}')

In [92]:
show(df_lyrics_open_closed)

puncs,len
Loading... (need help?),


In [93]:
# ord(")")
# "）".encode("unicode_escape")

In [94]:
df_lyrics.filter(
    pl.col("title").str.contains("）")
).select("title", "artist", "language")

title,artist,language
str,str,str
"""Controlうひょー）""","""KBM""","""ja"""
"""余华《第七天》：第三天第二节（节选）7th day 3:2 …","""Carolus""",
"""余华《第七天》：第三天第三节（节选）7th day 3:3 …","""Carolus""","""zh"""
"""Iya na yatsu（嫌な奴）""","""Kuroyagi""","""ja"""
"""Jibun（自分）""","""Kuroyagi""","""ja"""
…,…,…
"""南京风潮（Unrest in Nanjing）""","""The Wasted Years""","""zh"""
"""当窗外鸟儿都开始叫了（When the Birds Star…","""The Wasted Years""","""zh"""
"""对话（Dialogue）""","""The Wasted Years""","""zh"""
"""一跃（A Leap）""","""The Wasted Years""","""zh"""


#### Check non-Python punctuation characters

In [95]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [96]:
## Any character that is not an ASCII letter, a digit, whitespace or Python punctuation.
## Raw f-string: "\s" is not a valid Python escape sequence.
ODD_TITLES_REGEX = rf"[^a-zA-Z0-9\s{re.escape(string.punctuation)}]"
df_lyrics_title_special = filter_regex_in_column(df_lyrics, "title", ODD_TITLES_REGEX)
df_lyrics_title_special.shape

(650735, 11)

In [97]:
"’" in string.punctuation

False

In [98]:
df_lyrics_title_special.head()

title,tag,artist,year,views,features,lyrics,id,language_cld3,language_ft,language
str,str,str,i64,i64,str,str,i64,str,str,str
"""Steady Mobbin’""","""rap""","""Young Money""",2009,416720,"""{""Gucci Mane""}""","""[Produced by Kane Beatz] [Ver…",31,"""en""","""en""","""en"""
"""Can I Get A…""","""rap""","""JAY-Z""",1998,387385,"""{""Ja Rule"",Amil}""","""[Intro: JAY-Z] What? Well fuck…",61,"""en""","""en""","""en"""
"""I’m a Dboy""","""rap""","""Lil Wayne""",2005,29337,"""{Birdman}""","""[Intro: Lil Wayne] Yeah Okay, …",97,"""en""","""en""","""en"""
"""Day ‘N’ Nite Nightmare""","""rap""","""Kid Cudi""",2008,983979,"""{}""","""[Intro] Uh, uh, uh, uh Uh, uh …",131,"""en""","""en""","""en"""
"""La Fièvre""","""rap""","""Suprme NTM""",1995,26580,"""{""Suprême NTM""}""","""[Couplet 1 : Kool Shen] Tout a…",203,"""fr""","""fr""","""fr"""


In [99]:
df_lyrics_title_special[3, "title"]

'Day ‘N’ Nite Nightmare'

In [100]:
df_lyrics_title_special[6, "title"]

'Ain’t Got Time The W. Carter Collection'

notice apostrophes are parsed weirdly

In [101]:
## This time, get special characters without weird apostrophes
## Raw f-string: "\s" is not a valid Python escape sequence.
ODD_TITLES_REGEX = rf"[^a-zA-Z0-9\s{re.escape(string.punctuation)}‘’]"
df_lyrics_title_special_no_apostrophe = filter_regex_in_column(df_lyrics, "title", ODD_TITLES_REGEX)

In [102]:
df_lyrics_title_special_no_apostrophe["title"].head()

shape: (10,)
Series: 'title' [str]
[
	"Can I Get A…"
	"La Fièvre"
	"A Life in the Day of Benjamin …
	"Neighborhood 2 Laïka"
	"Meiplé"
	"On the Eve of War Julio César …
	"​epaR"
	"Love Is…"
	"Laisse pas traîner ton fils"
	"Le retour du rap français"
]

#### Check songs only containing punctuations

In [103]:
## If we extract only non-letter and non-numbers from the string and their lengths still equal to the original string,
## then we only have non-letter and non-number characters
df_punc_only_songs = df_lyrics.filter(
    pl.col("title").str.extract("([^\\p{Letter}\\p{Number}]+)", 1).str.len_chars() == pl.col("title").str.len_chars()
).select("id", "title", "artist", "lyrics")

In [104]:
df_punc_only_songs.select("title", "artist").head()

title,artist
str,str
"""?""","""OutKast"""
"""$$$""","""Quinto Andar"""
"""-""","""Mr. THT"""
"""∆""","""ZiLLY"""
"""...""","""Axman13"""


In [105]:
df_punc_only_songs.n_unique(["title", "artist"])

713

In [106]:
show(df_punc_only_songs.select("id", "title", "artist"))

id,title,artist
Loading... (need help?),,


In [107]:
df_punc_only_songs.filter(pl.col("id") == 6515587).select("title", "lyrics")

title,lyrics
str,str
"""​""","""[Chorus] If it— if it makes yo…"


In [108]:
df_punc_only_songs.filter(pl.col("id") == 6515587).select("title", "lyrics")[0, "title"]

'\u200b'

In [109]:
pprint.pprint(
    df_punc_only_songs.filter(pl.col("id") == 6515587).select("title", "lyrics")[0, "lyrics"]
)

('[Chorus]\n'
 'If it— if it makes you sad today\n'
 "Then I won't come your way\n"
 'I could just stand outside\n'
 'As you cry all this away\n'
 'If you like, I guess I could stay\n'
 "But don't you know what they say? Yeah\n"
 "The sun doesn't shine sometimes\n"
 'When we need some time in the rain\n'
 '\n'
 '[Verse]\n'
 'Lie to yourself\n'
 "Like you don't feel nothing\n"
 "You ain't got time to\n"
 'Cry to yourself\n'
 'And they coming over\n'
 'Lend you a shoulder\n'
 'Hide to yourself, yeah\n'
 "She don't see eye to eye with herself\n"
 "It's getting dark before 4:30\n"
 "But you've been in bed all day, so what's еarly?\n"
 "And I've never been in so much of a hurry\n"
 "But don't worry\n"
 '[Chorus]\n'
 "I'll stay with you as long as you need mе to\n"
 'And I could just stand outside\n'
 'If it—\n'
 'If it— if it makes you sad today\n'
 "Then I won't come your way\n"
 'I could just stand outside\n'
 'As you cry all this away\n'
 'If you like, I guess I could stay\n'
 "But don't

Interesting some titles only have a zero-width space

#### Check marks

In [110]:
## Use a dedicated name so the open/closed punctuation counts stored in
## df_lyrics_open_closed are not overwritten by the mark characters.
## Raw string: "\p" is not a valid Python escape sequence.
df_lyrics_title_marks = df_lyrics.select(
    pl.col("title").str.extract_all(r"\p{Mark}").alias("marks").explode()
)
show(df_lyrics_title_marks.group_by("marks").len().sort("len", descending=True))

marks,len
Loading... (need help?),


In [111]:
df_lyrics.filter(
    pl.col("title").str.contains("ြ")
).select("title", "artist")

title,artist
str,str
"""လမင်းရေးသောတမ်းခြင်း La Min Ya…","""Eternal Gosh!"""
"""မိုးမခအိပ်မက်ခရမ်းပြာ Moemakha…","""Eternal Gosh!"""
"""ကြိုး Kyoe""",""" (Lay Phyu)"""
"""ဒြပ်မဲ့ Drud Mae""","""Bigg-Y of Ontrack"""
"""ငါပြောတာကြားလား Ngar Pyaw Tar …","""Khay Mar"""
…,…
"""Official Supplement တရားဝင် ဖြ…","""Bigg-Y of Ontrack"""
"""ကြယ်တံခွန် Kyal-tagon""","""Eternal Gosh!"""
"""မျှော်လင့်ခြင်းလေး Mhyaw-lint-…",""" (Lay Phyu)"""
"""အမှတ်တရသင်္ကြန် a-mhat-ta-ya-t…",""" (Myo Kyawt Myaing)"""


#### Letters

In [112]:
## Raw string: "\p" is not a valid Python escape sequence.
df_lyrics_letters = extract_puncs(df_lyrics, "title", r"\p{L}")

In [113]:
show(df_lyrics_letters)

puncs,len
Loading... (need help?),


### For Artists column

In [114]:
df_lyrics.select("artist").null_count()

artist
u32
0


#### Check normal Python punctuation

In [115]:
escaped_punc = re.escape(string.punctuation)
ODD_CHARACTERS_REGEX = f"[{escaped_punc}]"

In [116]:
## This section analyses the artist column, so filter on "artist" (not "title")
df_lyrics_artist_punc = filter_regex_in_column(df_lyrics, "artist", ODD_CHARACTERS_REGEX)

In [117]:
## Reuse the shared helper instead of repeating its loop:
## one (punctuation, row_count) tuple per character in string.punctuation
punc_counts_artist = check_punctuation_counts(df_lyrics_artist_punc, column="artist")

In [118]:
## Each tuple is one row (punctuation, count), so state the orientation
## explicitly instead of relying on inference.
genius_artist_punc_counts = pl.DataFrame(
    punc_counts_artist,
    schema=["punctuation", "number of artists containing punctuation"],
    orient="row",
)

In [119]:
show(genius_artist_punc_counts)

punctuation,number of artists containing punctuation
Loading... (need help?),


artist doesn't appear to be missing punctuation, except for < and >

#### Check non-Python punctuation characters

In [120]:
## Any character that is not an ASCII letter, a digit, whitespace or Python punctuation.
## Raw f-string so "\s" reaches the regex engine unchanged; the stray
## backslash in front of the interpolated punctuation has been removed.
escaped_punc = re.escape(string.punctuation)
ODD_CHARACTERS_REGEX = rf"[^a-zA-Z0-9\s{escaped_punc}]"

In [121]:
df_lyrics_artist_special = df_lyrics.filter(df_lyrics["artist"].str.contains(ODD_CHARACTERS_REGEX))
df_lyrics_artist_special.shape

(9, 11)

Only 9 rows have an artist containing characters other than ASCII letters, digits, whitespace and Python punctuation!

In [122]:
df_lyrics_artist_special

title,tag,artist,year,views,features,lyrics,id,language_cld3,language_ft,language
str,str,str,i64,i64,str,str,i64,str,str,str
"""Free Bars""","""rap""","""Lil Dinero x Lil Key""",2016,269,"""{}""","""[Verse 1: Lil Dinero] I just …",2835875,"""en""","""en""","""en"""
"""Sunrise""","""rb""","""avamlmAv""",2021,8,"""{}""","""[Instrumental] Over your head…",6844407,,"""en""",
"""Feelin So Fine""","""rb""","""avamlmAv""",2021,8,"""{}""","""[Instrumental] Yeah (when you'…",6847383,"""zh""","""en""",
"""envy""","""pop""","""avamlmAv""",2021,11,"""{}""","""Her songs are warm but her hea…",6858114,"""en""","""en""","""en"""
"""spite""","""pop""","""avamlmAv""",2021,6,"""{}""","""And I want in spite of you And…",6858248,"""en""","""en""","""en"""
"""Ava’s Favorite Music 2020 - 20…","""misc""","""avamlmAv""",2021,95,"""{}""","""Dua Lipa ft. DaBaby - Levitati…",6861257,,,
"""Avas Favorite Music 2019 - 202…","""misc""","""avamlmAv""",2021,5,"""{}""","""Old Town Road - Lil Nas X Lizz…",6861323,"""en""","""en""","""en"""
"""Avas Favorite Music 2018 - 201…","""misc""","""avamlmAv""",2021,2,"""{}""","""Cardi B - I Like It Cardi B - …",6861374,"""en""",,
"""Ava’s Favorite Music 2017 - 20…","""misc""","""avamlmAv""",2021,3,"""{}""","""Charli XCX - Boys Lorde - Gree…",6861408,"""en""","""en""","""en"""


In [123]:
df_lyrics_artist_special["title"]

shape: (9,)
Series: 'title' [str]
[
	"Free Bars"
	"Sunrise"
	"Feelin So Fine"
	"envy"
	"spite"
	"Ava’s Favorite Music 2020 - 20…
	"Avas Favorite Music 2019 - 202…
	"Avas Favorite Music 2018 - 201…
	"Ava’s Favorite Music 2017 - 20…
]

In [124]:
# df_lyrics_artist_special[0, "title"]

In [125]:
df_lyrics_artist_special[1, "title"]

'\x7fSunrise'

artists sometimes have weird \x7f symbols. https://www.codetable.net/hex/7f

In [126]:
df_lyrics.filter(pl.col("artist") == "Mneskin").select("title", "artist").head()

title,artist
str,str
"""Chosen""","""Mneskin"""
"""Recovery""","""Mneskin"""
"""Beggin""","""Mneskin"""
"""Vengo dalla Luna""","""Mneskin"""
"""Let’s Get It Started""","""Mneskin"""


Notice how artists are missing special characters in their name. For example, Måneskin is missing the special character å in their name.
https://genius.com/artists/Maneskin

#### Check artist only containing punctuations

In [128]:
# Artists made up entirely of characters that are neither letters nor numbers:
# the first run of such characters must be as long as the whole artist string.
# Raw string: "\p" is not a valid Python escape sequence in a normal literal.
df_lyrics.filter(
    pl.col("artist").str.extract(r"([^\p{letter}\p{number}]+)", 1).str.len_chars() == pl.col("artist").str.len_chars()
).select("title", "artist")

title,artist
str,str


### Look for duplicates (on title and artist)

In [129]:
df_lyrics.shape

(5134856, 11)

In [130]:
df_lyrics.n_unique("id")

5134856

Every row is unique by id

In [131]:
# Rows whose (title, artist) pair occurs more than once, narrowed to the
# columns needed to compare the repeated entries side by side.
df_lyrics_duplicate = (
    df_lyrics
    .filter(df_lyrics.select("title", "artist").is_duplicated())
    .select("title", "artist", "year", "views", "features", "id", "lyrics", "language_cld3", "language_ft")
    .sort(by=["title", "artist"])
)

In [132]:
df_lyrics_duplicate.shape

(22, 9)

In [133]:
show(df_lyrics_duplicate.select("id", "title", "artist", "year", "views", "features", "language_cld3", "language_ft"))

id,title,artist,year,views,features,language_cld3,language_ft
Loading... (need help?),,,,,,,


Even though 11 songs share the same title and artist, some of their features are still different.
We will only drop songs that have the same title, artist and features.

Examples of Duplicates:

In [134]:
df_lyrics.filter(
    (df_lyrics['artist'] == 'Ariana Grande') & 
    (df_lyrics["title"].str.contains("3435"))
)

title,tag,artist,year,views,features,lyrics,id,language_cld3,language_ft,language
str,str,str,i64,i64,str,str,i64,str,str,str
"""3435""","""rb""","""Ariana Grande""",2020,2691880,"""{}""","""[Intro] Hmm [Verse 1] You mig…",6071429,"""en""","""en""","""en"""
"""3435 Remix""","""rb""","""Ariana Grande""",2021,734106,"""{""Doja Cat"",""Megan Thee Stalli…","""[Intro: Ariana Grande] Hmm [V…",6123723,"""en""","""en""","""en"""
"""3435 Remix Clean""","""pop""","""Ariana Grande""",2021,12715,"""{""Doja Cat"",""Megan Thee Stalli…","""[Intro: Ariana Grande] Hmm [V…",6381580,"""en""","""en""","""en"""
"""3435 Remix Radio Edit""","""pop""","""Ariana Grande""",2020,613,"""{""Doja Cat"",""Megan Thee Stalli…","""[Intro: Ariana Grande] Hmm [V…",6850333,"""en""","""en""","""en"""
"""3435 Clean""","""pop""","""Ariana Grande""",2020,1116,"""{}""","""[Intro] Hmm [Verse 1] You mig…",7242669,"""en""","""en""","""en"""
"""3435""","""pop""","""Ariana Grande""",2020,50,"""{}""","""[Intro] Hmm [Verse 1] You mig…",7696749,"""en""","""en""","""en"""


In [135]:
df_lyrics.filter(
    (df_lyrics['artist'].str.contains('(?i)Zedd')) & 
    (df_lyrics["title"].str.contains("(?i)Clarity"))
)

title,tag,artist,year,views,features,lyrics,id,language_cld3,language_ft,language
str,str,str,i64,i64,str,str,i64,str,str,str
"""Clarity""","""pop""","""Zedd""",2012,592758,"""{Foxes}""","""[Click here to see why Zedd's …",91948,"""en""","""en""","""en"""
"""Clarity Brillz Remix""","""pop""","""Zedd""",2013,450,"""{Foxes}""","""High dive into frozen waves Wh…",1834203,"""en""","""en""","""en"""
"""Clarity Style of Eye Remix""","""pop""","""Zedd""",2013,280,"""{}""","""[Verse 1: Foxes] Hot dive into…",1834285,"""en""","""en""","""en"""
"""Clarity Tiësto Remix""","""pop""","""Zedd""",2015,574,"""{Foxes}""","""[Verse 1: Foxes] Hot dive into…",1834288,"""en""","""en""","""en"""
"""Clarity Vicetone Remix""","""pop""","""Zedd""",2012,878,"""{Foxes}""","""[Verse 1: Foxes] High dive int…",2083199,"""en""","""en""","""en"""
"""Clarity Zedd Union Mix""","""pop""","""Zedd""",2013,986,"""{}""","""[Verse 1: Foxes] Hot dive into…",3331690,"""en""","""en""","""en"""
"""Clarity Remix""","""pop""","""Zedd""",2014,220,"""{""Medina (DNK)""}""","""I'd dive into frozen waves Whe…",4700547,"""en""","""en""","""en"""
"""Clarity iTunes Session""","""pop""","""Zedd""",2013,42,"""{""Matthew Koma"",Foxes}""","""High dive into frozen waves Wh…",5675696,"""en""","""en""","""en"""
"""Clarity ARMNHMR NVRLEFT Remi…","""pop""","""Zedd""",2020,48,"""{Foxes}""","""[Verse 1] High dive into froze…",5743423,"""en""","""en""","""en"""


Need to join features and artist

In [136]:
df_lyrics.filter(
    (df_lyrics['artist'].str.contains('(?i)Taylor Swift')) & 
    (df_lyrics["title"].str.contains("(?i)22"))
)

title,tag,artist,year,views,features,lyrics,id,language_cld3,language_ft,language
str,str,str,i64,i64,str,str,i64,str,str,str
"""22""","""pop""","""Taylor Swift""",2012,352410,"""{}""","""[Verse 1] It feels like a perf…",102705,"""en""","""en""","""en"""
"""22 Taylors Version""","""pop""","""Taylor Swift""",2021,30145,"""{}""","""[Verse 1] It feels like a perf…",6688200,"""en""","""en""","""en"""


We could filter out all titles containing "Remix" or "Mix", but at the risk of also removing original songs that merely have "remix"/"mix" somewhere in the title (the match is a case-insensitive substring match).

In [137]:
# Case-insensitive substring match on "mix". A separate "(?i)remix" predicate is
# unnecessary: every title containing "remix" already contains "mix".
df_lyrics.filter(
    df_lyrics["title"].str.contains("(?i)mix")
)

title,tag,artist,year,views,features,lyrics,id,language_cld3,language_ft,language
str,str,str,i64,i64,str,str,i64,str,str,str
"""Lollipop Remix""","""rap""","""Lil Wayne""",2008,580832,"""{""Kanye West"",""Static Major""}""","""[Intro: Lil Wayne] Haha Uh-huh…",7,"""en""","""en""","""en"""
"""Grindin Remix""","""rap""","""Clipse""",2002,16753,"""{""Lil Wayne"",Birdman,""Pharrell…","""[Hook: Pusha T & Pharrell] Wha…",114,"""en""","""en""","""en"""
"""Make It Rain Remix""","""rap""","""Fat Joe""",2007,135843,"""{""Lil Wayne"",T.I.,Birdman,""Ric…","""[Intro: R. Kelly, Fat Joe, Lil…",85,"""en""","""en""","""en"""
"""One More Chance / Stay With Me…","""rap""","""The Notorious B.I.G.""",1995,583928,"""{""Faith Evans""}""","""[Intro: Faith Evans] Don't go,…",87,"""en""","""en""","""en"""
"""Gettin Money Get Money Remix""","""rap""","""Junior M.A.F.I.A.""",1996,77074,"""{}""","""[Intro: The Notorious B.I.G. &…",90,"""en""","""en""","""en"""
…,…,…,…,…,…,…,…,…,…,…
"""LIES REMIX""","""pop""","""CANCEL CULTURE""",2022,8,"""{Khary,​love-sadKiD}""","""(Chorus CANCEL CULTURE) Call m…",7882577,"""en""","""en""","""en"""
"""WARNING -final mix-""","""pop""","""BUGVEL""",2022,4,"""{}""","""[BUGVEL「WARNING -final mix-」 歌…",7882645,"""ja""","""ja""","""ja"""
"""彩雲 -final mix-""","""pop""","""BUGVEL""",2020,3,"""{}""","""[BUGVEL「彩雲 -final mix-」 歌詞] い…",7882646,"""ja""","""ja""","""ja"""
"""Stupid Feelings KC Lights Remi…","""pop""","""220 KID & LANY""",2021,3,"""{""KC Lights""}""","""[Chorus] For you, it's easy to…",7882732,"""en""","""en""","""en"""


Removing remixes can potentially reduce our dataset by ~93000 rows, but that's only a ~2% reduction in size compared to 5 million rows we have

# Explore Spotify Charts Dataset

In [10]:
df_spotify_charts.shape

(26174269, 12)

In [11]:
UNIQUE_ARTIST_TITLE = df_spotify_charts.n_unique(["title", "artist"])

In [12]:
UNIQUE_ARTIST_TITLE

197535

In [141]:
df_spotify_charts.n_unique("track_id")

198078

### Check Chart Frequency

In [161]:
df_spotify_charts.select("date").null_count()

date
u32
0


#### Overall

In [142]:
df_spotify_charts_date = df_spotify_charts.select("date").unique()

In [143]:
# Order the distinct chart dates (still strings at this point), then parse them into Date values.
df_spotify_charts_date = (
    df_spotify_charts_date
    .sort(by="date")
    .with_columns(pl.col("date").str.to_date("%Y-%m-%d"))
)

In [144]:
df_spotify_charts_frequency = df_spotify_charts_date - df_spotify_charts_date.shift(1)

In [145]:
df_spotify_charts_frequency.describe()

statistic,date
str,str
"""count""","""1825"""
"""null_count""","""1"""
"""mean""","""1 day, 0:00:00"""
"""std""",
"""min""","""1 day, 0:00:00"""
"""25%""","""1 day, 0:00:00"""
"""50%""","""1 day, 0:00:00"""
"""75%""","""1 day, 0:00:00"""
"""max""","""1 day, 0:00:00"""


#### Per Country

In [151]:
# Get unique chart dates per country.
# `unique()` does not guarantee any ordering, and the frequency calculation that
# follows subtracts each date from the previous row, so sort the dates explicitly
# within every region.
df_spotify_charts_unique = df_spotify_charts.select("date", "region").group_by("region").agg(
    pl.col("date").str.to_date("%Y-%m-%d").unique().sort().alias("unique_dates")
)

In [152]:
df_spotify_charts_unique.head()

region,unique_dates
str,list[date]
"""Japan""","[2017-01-01, 2017-01-02, … 2021-12-31]"
"""Mexico""","[2017-01-01, 2017-01-02, … 2021-12-31]"
"""Belgium""","[2017-01-01, 2017-01-02, … 2021-12-31]"
"""Luxembourg""","[2017-01-01, 2017-01-02, … 2021-12-31]"
"""Malaysia""","[2017-01-01, 2017-01-02, … 2021-12-31]"


In [153]:
df_spotify_charts_unique = df_spotify_charts_unique.explode(
    "unique_dates"
)

In [154]:
## Calculate chart update frequency over region
# Each date is replaced by its difference to the previous row's date within the same
# region; the first row of every region has no predecessor and becomes null.
# The result keeps the column name "unique_dates" although it now holds durations.
# NOTE(review): the differences are only meaningful if rows are in ascending date
# order within each region -- confirm the upstream frame guarantees that.
df_spotify_charts_frequency = df_spotify_charts_unique.with_columns(
    (pl.col("unique_dates") - pl.col("unique_dates").shift()).over("region")
)

In [155]:
# Per-region median, minimum and maximum of the `unique_dates` column, ordered by region.
df_spotify_charts_frequency_agg = (
    df_spotify_charts_frequency
    .group_by("region")
    .agg(
        pl.col("unique_dates").median().alias("median frequency"),
        pl.col("unique_dates").min().alias("shortest frequency"),
        pl.col("unique_dates").max().alias("longest frequency"),
    )
    .sort("region")
)

In [156]:
show(df_spotify_charts_frequency_agg)

region,median frequency,shortest frequency,longest frequency
Loading... (need help?),,,


## Check viral50/top200 stream counts

In [157]:
df_spotify_charts.group_by('chart').len()

chart,len
str,u32
"""viral50""",5852308
"""top200""",20321961


In [158]:
df_spotify_charts.filter(pl.col("chart") == "viral50").null_count()

title,rank,date,artist,region,chart,streams,track_id,album,duration_ms,explicit,release_date
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
4,0,0,18,0,0,5852308,301211,304451,301211,301211,301211


viral50 charts always have null streams, so null stream values are expected for those rows.

Btw, viral50 songs are the most popular songs based on blog, social media (web analytics). This can be very interesting for analysis, even if streams data is not available. (We can compare songs that are popular on media vs. songs that are popular on spotify based on streams)
https://community.spotify.com/t5/Your-Library/What-s-the-difference-between-daily-top-50-and-viral-top-50/td-p/4973312


In [160]:
df_spotify_charts.filter(pl.col("chart") == "top200").null_count()

title,rank,date,artist,region,chart,streams,track_id,album,duration_ms,explicit,release_date
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
7,0,0,0,0,0,0,3040,18976,3040,3040,3040


top200 charts have no null streams as expected

## Check null title

In [None]:
df_spotify_charts_null_title = df_spotify_charts.filter(pl.col("title").is_null())

In [None]:
show(df_spotify_charts_null_title)

Searching for the ID of the song suggests that its title is literally "NA", which was accidentally parsed as a null value:
https://open.spotify.com/track/4cP6KmNvTFkLHZo6fVJq0c

In [None]:
df_spotify_charts_null_title.n_unique("title")

## Check Release Dates

In [162]:
df_spotify_charts_unique_titles = df_spotify_charts.unique(["title", "artist"])

In [164]:
# Parse `release_date` into a Date, trying three precisions in turn:
# full date, year-month, and year only (non-strict parsing yields null on mismatch,
# so each fallback only fills the rows the previous attempt could not parse).
# A date cannot be built from a format that lacks a day or month, so partial values
# are padded to the first day / first month before parsing.
# The parsed values overwrite the original `release_date` column.
df_spotify_charts_release_dates = df_spotify_charts_unique_titles.with_columns(
    pl.col("release_date").str.to_date("%Y-%m-%d", strict=False)
    .fill_null(
        (pl.col("release_date") + "-01").str.to_date("%Y-%m-%d", strict=False)
    ).fill_null(
        (pl.col("release_date") + "-01-01").str.to_date("%Y-%m-%d", strict=False)
    )
)

In [174]:
# Bucket every release year into decade bins (breaks at 1900, 1910, ..., 2020),
# with open-ended bins before 1900 and after 2020.
df_spotify_charts_release_dates = df_spotify_charts_release_dates.with_columns(
    pl.col("release_date")
    .dt.year()
    .cut(
        list(range(1900, 2030, 10)),
        labels=["Past-1900", *(f"{start}-{start + 10}" for start in range(1900, 2020, 10)), "2020-Future"],
    )
    .alias("release_year_bin")
).sort("release_year_bin")

# Count songs per bin, express each count as a share of all unique (title, artist)
# pairs, and accumulate the shares from the newest bin downwards.
df_spotify_charts_release_year_count = (
    df_spotify_charts_release_dates
    .group_by("release_year_bin")
    .agg(
        pl.len().alias("count"),
        (pl.len() / df_spotify_charts_unique_titles.shape[0] * 100).round(2).alias("percent"),
        (
            (pl.len() / df_spotify_charts_unique_titles.shape[0] * 100).round(2).cast(pl.String) + "%"
        ).alias("percent_string"),
    )
    .sort("release_year_bin", descending=True)
    .with_columns(pl.col("percent").cum_sum().round(2).alias("cum_sum"))
)

In [175]:
show(df_spotify_charts_release_year_count)

release_year_bin,count,percent,percent_string,cum_sum
Loading... (need help?),,,,


In [176]:
# Bar chart of song counts per release-year bin, bins in ascending order.
fig = px.bar(
    df_spotify_charts_release_year_count.sort("release_year_bin"),
    x="release_year_bin",
    y="count",
)
fig.update_layout(title="Count of Spotify songs over Release Year Bins [Right Closed Intervals]")

## Check popularity

In [17]:
df_spotify_charts_unique_title = df_spotify_charts.unique(["title", "artist"])

In [25]:
# Spot check: first 20 chart rows whose title contains "Levitating" and whose
# artist contains "Dua Lipa", showing streams next to popularity.
# NOTE(review): other outputs in this notebook list df_spotify_charts with 12 columns
# and no `popularity` column -- confirm this cell still runs on a fresh kernel.
show(df_spotify_charts.filter(
    (pl.col("title").str.contains("Levitating")) &
    (pl.col("artist").str.contains("Dua Lipa"))
).select("title", "date", "streams", "popularity").head(20))

title,date,streams,popularity
Loading... (need help?),,,


In [18]:
df_spotify_charts_unique_title.null_count()

title,rank,date,artist,region,chart,streams,track_id,album,popularity,duration_ms,explicit,release_date
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
1,0,0,1,0,0,97314,17987,18290,17987,17987,17987,17987


## Analyze duplicates in charts dataset

reminder that one song can appear multiple times in the charts dataset, as a song can stay in the top 200 charts for a
very long period of time (so one song can have multiple rows in the charts datasets)

In [177]:
df_spotify_charts.shape

(26174269, 12)

In [178]:
df_spotify_charts.n_unique(subset=["title", "artist", "chart", "date", "region", "album"])

26173054

In [179]:
# Chart rows whose (title, artist, chart, date, region, album) key occurs more than once.
df_spotify_charts_not_unique = df_spotify_charts.filter(
    df_spotify_charts.select("title", "artist", "chart", "date", "region", "album").is_duplicated()
).sort("title", "artist", "chart", "region", "album", "date")
df_spotify_charts_not_unique.shape

(2430, 12)

In [180]:
show(df_spotify_charts_not_unique)

title,rank,date,artist,region,chart,streams,track_id,album,duration_ms,explicit,release_date
Loading... (need help?),,,,,,,,,,,


In [181]:
df_spotify_charts_not_unique.n_unique(["title", "artist"])

92

not sure why 92 songs appeared on spotify charts on the same date and region multiple times. Maybe a song is not unique on title, artist, chart, date, region, album.

If you check 95. south by J. Cole, this is the non-explicit version: https://open.spotify.com/track/4GSBObGq3DuiKFlytKBbEw

And this is the explicit version: https://open.spotify.com/track/5R691ipUYRDYW6ehapjoj6

So now we know the same song can have both an explicit and non-explicit version


In [182]:
df_spotify_charts.n_unique(subset=["title", "artist", "chart", "date", "region", "album", "explicit"])

26173170

In [183]:
# Chart rows whose (title, artist, chart, date, region, album, explicit) key occurs more than once.
df_spotify_charts_not_unique_explicit = df_spotify_charts.filter(
    df_spotify_charts.select("title", "artist", "chart", "date", "region", "album", "explicit").is_duplicated()
).sort("title", "artist", "chart", "region", "album", "explicit", "date")
df_spotify_charts_not_unique_explicit.shape

(2198, 12)

In [184]:
df_spotify_charts_not_unique_explicit.n_unique(["title", "artist"])

80

Cut down the number a bit now

In [185]:
show(df_spotify_charts_not_unique_explicit)

title,rank,date,artist,region,chart,streams,track_id,album,duration_ms,explicit,release_date
Loading... (need help?),,,,,,,,,,,


"4th Dimension" by KIDS SEE GHOSTS, Louis Prima appears under two different track ids. I'm not sure why the same song has more than one track id:

https://open.spotify.com/track/6JyEh4kl9DLwmSAoNDRn5b

https://open.spotify.com/track/5cQMfg4AEejKUfNA44ZGep

In [186]:
# Chart rows whose (title, artist, chart, date, region, album, explicit, track_id) key occurs more than once.
df_spotify_charts_not_unique_explicit_track_id = df_spotify_charts.filter(
    df_spotify_charts.select(
        "title", "artist", "chart", "date", "region", "album", "explicit", "track_id"
    ).is_duplicated()
).sort("title", "artist", "chart", "region", "album", "explicit", "track_id", "date")
df_spotify_charts_not_unique_explicit_track_id.shape

(1640, 12)

In [187]:
df_spotify_charts_not_unique_explicit_track_id.n_unique(["title", "artist"])

57

In [188]:
show(df_spotify_charts_not_unique_explicit_track_id)

title,rank,date,artist,region,chart,streams,track_id,album,duration_ms,explicit,release_date
Loading... (need help?),,,,,,,,,,,


Not sure why the above tracks are not unique. Considering they have the same rank and all too, I think they are truly duplicates, unlike some tracks which are simply unique on track_id

## Hidden form of duplicates

Some song titles can have only differences in punctuation

In [190]:
df_spotify_charts_unique_title = df_spotify_charts.unique(["title", "artist"])

In [191]:
df_spotify_charts_unique_title.filter(
    (pl.col("title").str.contains("(?i)What is Love")) & 
    (pl.col("artist") == "TWICE")
).unique(["title", "artist"])

title,rank,date,artist,region,chart,streams,track_id,album,duration_ms,explicit,release_date
str,i64,str,str,str,str,f64,str,str,f64,bool,str
"""What is Love? - Japanese ver.""",111,"""2019-03-01""","""TWICE""","""Japan""","""top200""",9336.0,"""0auI49a6lE4phcRVY8hKiY""","""What is Love? (Japanese ver.)""",208237.0,False,"""2019-02-08"""
"""What is Love?""",47,"""2018-05-01""","""TWICE""","""Hong Kong""","""top200""",4419.0,"""2PhG74FocRobLC8pP9cIBI""","""What is Love?""",208238.0,False,"""2018-04-09"""
"""What is Love""",157,"""2020-06-01""","""TWICE""","""Japan""","""top200""",19288.0,"""1IX47gefluXmKX4PrTBCRM""","""Summer Nights""",208240.0,False,"""2018-07-09"""


Once punctuation is ignored, some songs turn out to be the same song. In the above example, "What is Love?" and "What is Love" are the same song, but the former title has an extra question mark in it.

In [193]:
def create_semantic_hash(column):
    """
    Build a polars expression that turns `column` into a punctuation-insensitive key.

    Parameters
    ----------
    column : str
        Name of the string column to normalise.

    Returns
    -------
    pl.Expr
        Expression aliased to ``f"{column}_hash"``.
        * If the whole value consists of characters that are neither letters nor
          numbers (e.g. "???"), the value is returned unchanged, so symbol-only
          names do not all collapse to an empty string.
        * Otherwise only characters that are not Unicode punctuation are kept,
          plus "@" and "$".
    """
    # First run of characters that are neither letters nor numbers; it spans the
    # whole value exactly when the value contains no letter and no number at all.
    # (The character class must include both \p{letter} and \p{number}.)
    symbols_only = (
        pl.col(column).str.extract(r"([^\p{letter}\p{number}]+)", 1).str.len_chars()
        == pl.col(column).str.len_chars()
    )
    return (
        pl.when(symbols_only).then(
            ## If we only have non-letter and non-number characters, then don't do anything to title
            pl.col(column)
        ).otherwise(
            ## If we have letters or numbers, then extract everything except for non-meaningful punctuation
            ## \P in unicode means exclusion
            pl.col(column).str.extract_all(r"[\P{Punct}@$]").list.join("")
        ).alias(f"{column}_hash")
    )


In [194]:
## Collapse every run of whitespace in title and artist into a single space
## (raw string: "\s" is not a valid Python escape sequence in a normal literal)
df_spotify_charts_unique_title = df_spotify_charts_unique_title.with_columns(
    pl.col("title").str.replace_all(r"\s+", " "),
    pl.col("artist").str.replace_all(r"\s+", " ")
)

In [195]:
df_spotify_charts_unique_title = df_spotify_charts_unique_title.with_columns(
    create_semantic_hash("title"),
    create_semantic_hash("artist")
)

In [196]:
df_spotify_charts_unique_title.filter(
    (pl.col("title").str.contains("(?i)What is Love")) & 
    (pl.col("artist") == "TWICE")
).unique(["title", "artist"])

title,rank,date,artist,region,chart,streams,track_id,album,duration_ms,explicit,release_date,title_hash,artist_hash
str,i64,str,str,str,str,f64,str,str,f64,bool,str,str,str
"""What is Love?""",47,"""2018-05-01""","""TWICE""","""Hong Kong""","""top200""",4419.0,"""2PhG74FocRobLC8pP9cIBI""","""What is Love?""",208238.0,False,"""2018-04-09""","""What is Love""","""TWICE"""
"""What is Love? - Japanese ver.""",111,"""2019-03-01""","""TWICE""","""Japan""","""top200""",9336.0,"""0auI49a6lE4phcRVY8hKiY""","""What is Love? (Japanese ver.)""",208237.0,False,"""2019-02-08""","""What is Love Japanese ver""","""TWICE"""
"""What is Love""",157,"""2020-06-01""","""TWICE""","""Japan""","""top200""",19288.0,"""1IX47gefluXmKX4PrTBCRM""","""Summer Nights""",208240.0,False,"""2018-07-09""","""What is Love""","""TWICE"""


In [197]:
# Unique on title and artist, but not on title and artist with no punctuation
# Rows that are unique on the raw (title, artist) pair but share their
# punctuation-free (title_hash, artist_hash) pair with at least one other row.
df_spotify_charts_hidden_duplicates = df_spotify_charts_unique_title.filter(
    df_spotify_charts_unique_title.select("title", "artist").is_unique()
    & df_spotify_charts_unique_title.select("title_hash", "artist_hash").is_duplicated()
)


In [198]:
df_spotify_charts_hidden_duplicates.shape

(156, 14)

In [199]:
df_spotify_charts_hidden_duplicates.n_unique(["title_hash", "artist_hash"])

78

In [200]:
show(df_spotify_charts_hidden_duplicates.select("title", "artist", "title_hash", "artist_hash").sort("title_hash", "artist"))

title,artist,title_hash,artist_hash
Loading... (need help?),,,


track id unique but title and artist not unique

In [None]:
# Rows whose track_id occurs exactly once in the frame while their (title, artist)
# pair occurs more than once.
# NOTE(review): df_spotify_charts_unique_title is built with .unique(["title", "artist"])
# elsewhere in this notebook, so the second predicate can presumably only match pairs
# that became equal after later normalisation -- confirm this is the intended frame.
df_spotify_charts_unique_title.filter(
    (pl.col("track_id").is_unique()) &
    (~df_spotify_charts_unique_title.select("title", "artist").is_unique())
)

## Check Join Columns

We saw earlier that the Genius Song Lyrics dataset drops some characters (French characters, emojis, Unicode surrogate pairs) in the artist column and is missing some punctuation in the title column.
Let's check whether the Spotify dataset preserves them.

In [None]:
df_spotify_charts_unique_title = df_spotify_charts.unique(["title", "artist"])

### For Title
do Spotify Charts titles contain punctuation?

In [201]:
escaped_punc = re.escape(string.punctuation)
PUNC_REGEX = f"[{escaped_punc}]"

In [202]:
df_spotify_title_punc = filter_regex_in_column(df_spotify_charts_unique_title, "title", PUNC_REGEX)
df_spotify_title_punc.shape

(47026, 14)

In [203]:
df_spotify_title_punc.head()

title,rank,date,artist,region,chart,streams,track_id,album,duration_ms,explicit,release_date,title_hash,artist_hash
str,i64,str,str,str,str,f64,str,str,f64,bool,str,str,str
"""Keep My Head 2 Da Sky - Re-Mas…",188,"""2018-06-15""","""KH""","""Thailand""","""top200""",1916.0,"""4DalGLF3M4nk2bZmmtDaFf""","""Keep My Head 2 Da Sky (Re-Mast…",381597.0,False,"""2018-06-11""","""Keep My Head 2 Da Sky ReMaste…","""KH"""
"""Love Thru the Computer (feat. …",37,"""2019-06-24""","""Gucci Mane""","""Iceland""","""viral50""",,"""3dxioHwH6wvCkQwt0rgvQt""","""Delusions of Grandeur""",172888.0,True,"""2019-06-21""","""Love Thru the Computer feat Ju…","""Gucci Mane"""
"""spróbuję / STRAŻNIK""",43,"""2018-03-20""","""KRÓL""","""Poland""","""viral50""",,"""2N0GE0JAxqiWLdUyJyCvAz""","""Przewijanie na podglądzie""",215586.0,False,"""2018-03-16""","""spróbuję STRAŻNIK""","""KRÓL"""
"""กอดเสาเถียง (Story จักรวาลไทบ้…",185,"""2020-01-11""","""ปรีชา ปัดภัย""","""Thailand""","""top200""",6135.0,"""7MBpIcTE8HZOCbIDbnFIqd""","""กอดเสาเถียง (Story จักรวาลไทบ้…",227343.0,False,"""2019-11-01""","""กอดเสาเถียง Story จักรวาลไทบ้า…","""ปรีชา ปัดภัย"""
"""10AM/Save The World (feat. Guc…",35,"""2018-11-02""","""Metro Boomin""","""Bulgaria""","""top200""",1143.0,"""6qZANDqrSAvfuSjcxBP1PD""","""NOT ALL HEROES WEAR CAPES""",226320.0,True,"""2018-11-02""","""10AMSave The World feat Gucci …","""Metro Boomin"""


While not all punctuations are missing, we know certain punctuations are definitely missing, so let's find out which ones are

In [206]:
punc_counts = check_punctuation_counts(df_spotify_title_punc, column="title")

In [207]:
# df_spotify_title_not_alphanumeric.filter(
#     df_spotify_title_not_alphanumeric["title"].str.contains("\{")
# )

In [208]:
pl_punctuation = pl.DataFrame(punc_counts, schema=["punctuation", "number_of_songs_having_punctuation"])

In [209]:
df_lyrics.filter(
    (pl.col("title").str.contains("3435")) & 
    (pl.col("artist") == "Ariana Grande")
).unique("title")

title,tag,artist,year,views,features,lyrics,id,language_cld3,language_ft,language
str,str,str,i64,i64,str,str,i64,str,str,str
"""3435 Remix""","""rb""","""Ariana Grande""",2021,734106,"""{""Doja Cat"",""Megan Thee Stalli…","""[Intro: Ariana Grande] Hmm [V…",6123723,"""en""","""en""","""en"""
"""3435 Remix Clean""","""pop""","""Ariana Grande""",2021,12715,"""{""Doja Cat"",""Megan Thee Stalli…","""[Intro: Ariana Grande] Hmm [V…",6381580,"""en""","""en""","""en"""
"""3435 Remix Radio Edit""","""pop""","""Ariana Grande""",2020,613,"""{""Doja Cat"",""Megan Thee Stalli…","""[Intro: Ariana Grande] Hmm [V…",6850333,"""en""","""en""","""en"""
"""3435""","""rb""","""Ariana Grande""",2020,2691880,"""{}""","""[Intro] Hmm [Verse 1] You mig…",6071429,"""en""","""en""","""en"""
"""3435 Clean""","""pop""","""Ariana Grande""",2020,1116,"""{}""","""[Intro] Hmm [Verse 1] You mig…",7242669,"""en""","""en""","""en"""


In [210]:
# Distinct Ariana Grande titles containing the literal text "34+35".
# A literal match avoids having to escape "+" (and the invalid "\+" string escape).
df_spotify_charts_unique_title.filter(
    (pl.col("title").str.contains("34+35", literal=True))
    &
    (pl.col("artist") == "Ariana Grande")
).unique("title")["title"].to_list()

['34+35', '34+35 Remix (feat. Doja Cat, Megan Thee Stallion) - Remix']

Spotify chart titles look well populated with punctuations

What about special characters?


In [211]:
# Matches any character that is not an ASCII letter, digit, whitespace or ASCII punctuation.
# Raw f-string: "\s" is not a valid Python escape sequence in a normal literal.
SPECIAL_REGEX = rf"[^a-zA-Z0-9\s{re.escape(string.punctuation)}]"
df_spotify_title_special = filter_regex_in_column(df_spotify_charts_unique_title, "title", SPECIAL_REGEX)
df_spotify_title_special.shape

(43487, 14)

In [213]:
punc_counts = check_punctuation_counts(df_spotify_title_special, column="title")

In [214]:
show(pl.DataFrame(punc_counts, schema=["punctuation", "number_of_songs_have_punctuation"]))

punctuation,number_of_songs_have_punctuation
Loading... (need help?),


In [215]:
df_spotify_title_special.head(10)

title,rank,date,artist,region,chart,streams,track_id,album,duration_ms,explicit,release_date,title_hash,artist_hash
str,i64,str,str,str,str,f64,str,str,f64,bool,str,str,str
"""ЧАСИКИ""",24,"""2020-11-01""","""20TOKENS""","""Russia""","""viral50""",,"""4V1x5C2gNOpi0GeKn3UO2k""","""ЧАСИКИ""",178623.0,False,"""2020-10-05""","""ЧАСИКИ""","""20TOKENS"""
"""Lé Luôn""",40,"""2020-08-01""","""Rich Choi, YelowX3""","""Vietnam""","""viral50""",,"""6e2H7hWkmmQiJrUZurYOvY""","""Lé Luôn""",149875.0,True,"""2020-06-04""","""Lé Luôn""","""Rich Choi YelowX3"""
"""Ça plane pour moi""",27,"""2017-02-13""","""Guillaume Canet""","""France""","""viral50""",,"""1BsggzH6aiEvC6ONeY074q""","""Rock'N'Roll (Bande originale d…",289146.0,False,"""2017-02-10""","""Ça plane pour moi""","""Guillaume Canet"""
"""spróbuję / STRAŻNIK""",43,"""2018-03-20""","""KRÓL""","""Poland""","""viral50""",,"""2N0GE0JAxqiWLdUyJyCvAz""","""Przewijanie na podglądzie""",215586.0,False,"""2018-03-16""","""spróbuję STRAŻNIK""","""KRÓL"""
"""กอดเสาเถียง (Story จักรวาลไทบ้…",185,"""2020-01-11""","""ปรีชา ปัดภัย""","""Thailand""","""top200""",6135.0,"""7MBpIcTE8HZOCbIDbnFIqd""","""กอดเสาเถียง (Story จักรวาลไทบ้…",227343.0,False,"""2019-11-01""","""กอดเสาเถียง Story จักรวาลไทบ้า…","""ปรีชา ปัดภัย"""
"""Hva' Du Værd (feat. Smiley)""",123,"""2021-06-01""","""L.O.C.""","""Denmark""","""top200""",10811.0,"""4pCB7A8b8ERNaZ1p7SJHoF""","""FUCK L.O.C.""",169845.0,True,"""2021-05-14""","""Hva Du Værd feat Smiley""","""LOC"""
"""Alles für mich (feat. Moe Phoe…",117,"""2018-09-01""","""Fard""","""Germany""","""top200""",72780.0,"""337IpkRKsb4oEFnsU84eLX""","""Habuubz, Volume 1""",200173.0,True,"""2018-08-31""","""Alles für mich feat Moe Phoeni…","""Fard"""
"""下一束光""",12,"""2019-09-02""","""吳林峰""","""Hong Kong""","""viral50""",,"""5VUVf5tMTfL8UwdOmiRxqJ""","""下一束光""",316695.0,False,"""2019-08-12""","""下一束光""","""吳林峰"""
"""Lo siento (feat. Sofía Reyes)""",174,"""2019-06-03""","""Beret""","""Spain""","""top200""",33986.0,"""5dITA6jTX7f1gHwEAyS3YZ""","""Lo siento (feat. Sofía Reyes)""",232655.0,False,"""2019-05-16""","""Lo siento feat Sofía Reyes""","""Beret"""
"""Back to the Streets (feat. Jhe…",117,"""2020-10-23""","""Jhené Aiko, Saweetie""","""Canada""","""top200""",41871.0,"""3MEruRteiUZXkStfTlZqRn""","""Back to the Streets (feat. Jhe…",189496.0,True,"""2020-10-23""","""Back to the Streets feat Jhené…","""Jhené Aiko Saweetie"""


In [216]:
unique_titles_spotify = df_spotify_charts.unique(subset=["title", "artist"]).select("title", "artist")

In [217]:
unique_titles_spotify = unique_titles_spotify.with_columns(
    pl.col("title").str.strip_chars()
)
punc_only_titles = unique_titles_spotify.filter(
    pl.col("title").str.extract("([^\\p{letter}\\p{number}]+)", 1).str.len_chars() == pl.col("title").str.len_chars()
).select("title", "artist")

In [219]:
punc_only_titles.head()

title,artist
str,str
"""?""","""Samey, Yzomandias"""
"""???""","""JóiPé, Króli"""
""".""","""girl in red"""
"""...""","""Don Bigg"""
"""!!!""","""nns, kkn, Ryu"""


In [218]:
punc_only_titles.n_unique(["title", "artist"])

28

28 songs with only punctuation/symbols as title in spotify dataset

### For Artists

In [223]:
df_spotify_charts_unique_title = df_spotify_charts.unique(["title", "artist"])

In [222]:
escaped_punc = re.escape(string.punctuation)
PUNC_REGEX = f"[{escaped_punc}]"

In [224]:
df_spotify_artist_punc = filter_regex_in_column(df_spotify_charts_unique_title, "artist", PUNC_REGEX)
df_spotify_artist_punc.shape

(58577, 12)

In [225]:
punc_counts = check_punctuation_counts(df_spotify_artist_punc, column="artist")

In [226]:
df_punc_counts_artist = pl.DataFrame(
    punc_counts,
    schema=["punctuation", "number of artists containing punctuation"]
)

In [227]:
show(df_punc_counts_artist)

punctuation,number of artists containing punctuation
Loading... (need help?),


artist doesn't appear to be missing punctuation, except for { and }

what about special characters?

In [228]:
# Matches any character that is not an ASCII letter, digit, whitespace or one of the
# characters in `escaped_punc`.
# Raw f-string: "\s" is not a valid Python escape sequence in a normal literal.
SPECIAL_CHARACTERS_REGEX = rf"[^a-zA-Z0-9\s{escaped_punc}]"
df_spotify_charts_artist_special = filter_regex_in_column(df_spotify_charts_unique_title, "artist", SPECIAL_CHARACTERS_REGEX)
df_spotify_charts_artist_special.shape

(21202, 12)

In [229]:
df_spotify_charts_artist_special.head()

title,rank,date,artist,region,chart,streams,track_id,album,duration_ms,explicit,release_date
str,i64,str,str,str,str,f64,str,str,f64,bool,str
"""Mi Vida""",153,"""2019-10-01""","""José José""","""Mexico""","""top200""",79620.0,"""5R7C9FzhN3l7T01xASxcQ4""","""Mi Vida""",249000.0,False,"""1982-10-29"""
"""寂寞時分""",49,"""2019-07-27""","""李狐""","""Taiwan""","""viral50""",,"""3MnxJz9riNDivCgI75Ix9p""","""狐思亂想""",283125.0,False,"""2019-06-22"""
"""下半場""",40,"""2020-11-01""","""痞爺""","""Taiwan""","""viral50""",,"""4Sa8XJXil0LkuCW3WFSHVE""","""下半場""",141333.0,False,"""2020-09-30"""
"""Take Me To Church""",39,"""2017-12-18""","""Chris Kläfford""","""Sweden""","""viral50""",,"""353c8ahyopeJhiJroiHCN2""","""Treading Water - EP""",212320.0,False,"""2017-12-15"""
"""Noi Minh Dung Chan (Nhac Phim …",8,"""2019-02-01""","""Mỹ Tâm""","""Vietnam""","""top200""",4739.0,"""4fnCQeozjJTzN1pKWuWeBO""","""Noi Minh Dung Chan (Nhac Phim …",242093.0,False,"""2019-01-25"""


In [230]:
unique_titles_spotify = df_spotify_charts.unique(subset=["title", "artist"]).select("title", "artist")

In [231]:
unique_titles_spotify.filter(
    pl.col("artist").str.extract("([^\\p{letter}\\p{number}]+)", 1).str.len_chars() == pl.col("artist").str.len_chars()
).select("title", "artist")

title,artist
str,str
"""The Long Walk""","""!!!"""
"""Kim, Thurston and Me""","""!!!"""


1 artist with punctuation/symbol artist name

# Further Join Columns Exploration: Finding Metadata Strings in Title and Artist 

## Lil Wayne (Example)

In [232]:
show(df_lyrics.filter(
    pl.col("artist") == "Lil Wayne"
).select("title", "artist"))

title,artist
Loading... (need help?),


## Check Features

### Spotify charts features

In [233]:
df_spotify_charts_unique_titles = df_spotify_charts.unique(subset=["title", "artist"])

In [234]:
# Titles that mention a featured artist: "feat", "feature" or "ft" (case-insensitive)
# with a punctuation or separator character directly before and after it.
# Raw string: "\p" is not a valid Python escape sequence in a normal literal.
df_spotify_charts_feature = df_spotify_charts_unique_titles.filter(
    pl.col("title").str.contains(r"(?i)[\p{Punct}\p{Separator}](feat|feature|ft)[\p{Punct}\p{Separator}]")
)

In [235]:
df_spotify_charts_feature.shape

(13003, 12)

In [236]:
df_spotify_charts_feature.shape[0] / df_spotify_charts_unique_titles.shape[0]

0.06582630926164984

Big impact! Titles with a feature marker make up about 6.58% of the unique Spotify songs.

In [237]:
df_spotify_charts_feature.head(5)

title,rank,date,artist,region,chart,streams,track_id,album,duration_ms,explicit,release_date
str,i64,str,str,str,str,f64,str,str,f64,bool,str
"""Siren Remix (Feat. UNEDUCATED …",86,"""2021-11-01""","""HOMIES""","""South Korea""","""top200""",1959.0,"""2S9FuNX2H2cCwxcIoTUtbM""","""Siren Remix""",197093.0,False,"""2021-03-20"""
"""HIGHEST IN THE ROOM (feat. ROS…",197,"""2020-05-01""","""Travis Scott""","""Greece""","""top200""",2328.0,"""7h0d2h0fUmzbs7zeFigJPn""","""JACKBOYS""",244873.0,True,"""2019-12-27"""
"""Don't Go (feat. J.Hyamm & Davi…",48,"""2017-08-19""","""LYAN""","""Poland""","""viral50""",,"""1FVQsWsWJrrkGeD5qFD2yf""","""Don't Go (feat. J.Hyamm & Davi…",225400.0,False,"""2017-07-28"""
"""FULU$ (feat. Musiye & Blueface…",128,"""2019-10-01""","""Farid Bang""","""Germany""","""top200""",74917.0,"""6kUyJHjBWQgmZo4WWW4KXt""","""FULU$ (feat. Musiye & Blueface…",135706.0,True,"""2019-09-13"""
"""Un altro mondo (feat. Neverbh)""",36,"""2019-12-02""","""Erbe Officinali""","""Italy""","""viral50""",,"""3ZV1NsTsQYenKowGEh77Z5""","""Un altro mondo""",204031.0,False,"""2019-10-08"""


features do exist in titles column

In [238]:
# Tally every "feat" + one following character (any case) found in the
# feature-tagged titles, most frequent spelling first.
feat_extracts_spotify = get_regex_extracts(
    df_spotify_charts_feature, "title", "(?i)feat.{1}"
)
(
    feat_extracts_spotify
    .group_by("title")
    .len()
    .sort("len", descending=True)
)

title,len
str,u32
"""feat.""",12663
"""Feat.""",228
"""feat """,10
"""FEAT.""",10
"""Feat """,8
"""featu""",2
"""feate""",2
"""feat,""",1


In [239]:
# Tally every "ft" + one following character (any case) found in the
# feature-tagged titles, most frequent spelling first.
ft_extracts_spotify = get_regex_extracts(
    df_spotify_charts_feature, "title", "(?i)ft.{1}"
)
(
    ft_extracts_spotify
    .group_by("title")
    .len()
    .sort("len", descending=True)
)

title,len
str,u32
"""ft.""",94
"""ft """,33
"""fte""",26
"""Ft.""",20
"""ft)""",6
…,…
"""ftp""",1
"""FT.""",1
"""fto""",1
"""fth""",1


In [240]:
# Tally occurrences of the full word "feature" (any case) in the
# feature-tagged titles.
feature_extracts_spotify = get_regex_extracts(
    df_spotify_charts_feature, "title", "(?i)feature"
)
(
    feature_extracts_spotify
    .group_by("title")
    .len()
    .sort("len", descending=True)
)

title,len
str,u32
"""feature""",2


In [241]:
df_spotify_charts_feature.filter(pl.col("title").str.contains("(?i)feature"))["title"].to_list()

['I Believe (feat. Demi Lovato) - As featured in the Walt Disney Pictures\' "A WRINKLE IN TIME"',
 'SAYONARA feature phone']

In [242]:
# strange_ft = df_spotify_charts_unique_titles.select(
#     pl.col("title").str.extract("(?i)([\p{Punct}\p{Separator}](feat|feature|ft)[\p{Uppercase_Letter}])", 1)
# )

In [243]:
# Look for a lowercase "feat"/"feature"/"ft" glued directly to an uppercase
# ASCII letter after a punctuation/separator character (case-sensitive).
# Raw string: "\p" is not a valid Python escape sequence; the regex text is unchanged.
strange_extract = get_regex_extracts(
    df_spotify_charts_unique_titles,
    "title",
    r"([\p{Punct}\p{Separator}](feat|feature|ft)[A-Z])",
)

In [244]:
strange_extract.group_by("title").len()

title,len
str,u32


Feature example

In [245]:
show(df_spotify_charts_feature.filter(pl.col("title").str.contains("(?i)feature")))

title,rank,date,artist,region,chart,streams,track_id,album,duration_ms,explicit,release_date
Loading... (need help?),,,,,,,,,,,


In [246]:
# df_lyrics rows for "I Believe" by DJ Khaled (case-insensitive substring
# matches on both columns).
df_lyrics.filter(
    pl.col("title").str.contains("(?i)I Believe")
    & pl.col("artist").str.contains("(?i)DJ Khaled")
)

title,tag,artist,year,views,features,lyrics,id,language_cld3,language_ft,language
str,str,str,i64,i64,str,str,i64,str,str,str
"""I Believe""","""pop""","""DJ Khaled""",2018,48756,"""{""Demi Lovato""}""","""[Intro: Demi Lovato & DJ Khale…",3537959,"""en""","""en""","""en"""


In [247]:
# Chart titles containing "Only the Young" (any case), one row per title.
# Reads df_spotify_charts_unique_titles: the previously used name without the
# trailing "s" is not defined in this section and fails on a fresh kernel.
show(
    df_spotify_charts_unique_titles
    .filter(pl.col("title").str.contains("(?i)Only the Young"))
    .unique("title")
    .select("title", "artist")
)

title,artist
Loading... (need help?),


In [248]:
# df_lyrics rows for "Only the Young" by Taylor Swift (case-insensitive
# substring matches), with the features column for comparison.
show(
    df_lyrics
    .filter(
        pl.col("title").str.contains("(?i)Only the Young")
        & pl.col("artist").str.contains("(?i)Taylor Swift")
    )
    .select("title", "artist", "features")
)

title,artist,features
Loading... (need help?),,


feat. Example

In [249]:
# Feature-tagged titles that contain a literal "feat." (any case).
# Raw string: "\." is not a valid Python escape sequence; the regex text is unchanged.
show(
    df_spotify_charts_feature.filter(
        pl.col("title").str.contains(r"(?i)feat\.")
    )
)

title,rank,date,artist,region,chart,streams,track_id,album,duration_ms,explicit,release_date
Loading... (need help?),,,,,,,,,,,


In [250]:
# Feature-tagged chart rows for Dua Lipa titles containing "Levitating".
df_spotify_feature_sample = df_spotify_charts_feature.filter(
    pl.col("title").str.contains("Levitating")
    & (pl.col("artist") == "Dua Lipa")
)

In [251]:
show(df_spotify_feature_sample.select("track_id", "title", "artist"))

track_id,title,artist
Loading... (need help?),,


In [252]:
# Matching df_lyrics rows for the same song, de-duplicated on (title, artist)
# while keeping the original row order.
df_lyrics_sample = (
    df_lyrics
    .filter(
        pl.col("title").str.contains("Levitating")
        & (pl.col("artist") == "Dua Lipa")
    )
    .unique(subset=["title", "artist"], maintain_order=True)
)

In [253]:
show(df_lyrics_sample.select("title", "artist", "features"))

title,artist,features
Loading... (need help?),,


In [254]:
# Feature-tagged chart rows for Major Lazer titles containing "Tied Up".
df_spotify_feature_sample = df_spotify_charts_feature.filter(
    pl.col("title").str.contains("Tied Up")
    & (pl.col("artist") == "Major Lazer")
)

In [255]:
show(df_spotify_feature_sample.select("track_id", "title", "artist"))

track_id,title,artist
Loading... (need help?),,


In [256]:
# Matching df_lyrics rows for the same song, de-duplicated on (title, artist)
# while keeping the original row order.
df_lyrics_sample = (
    df_lyrics
    .filter(
        pl.col("title").str.contains("Tied Up")
        & (pl.col("artist") == "Major Lazer")
    )
    .unique(subset=["title", "artist"], maintain_order=True)
)

In [257]:
show(df_lyrics_sample.select("title", "artist", "features"))

title,artist,features
Loading... (need help?),,


In [258]:
# Feature-tagged chart rows whose title contains "MOOD 4 EVA" (any artist),
# displayed with their track ids.
df_spotify_feature_sample = df_spotify_charts_feature.filter(
    pl.col("title").str.contains("MOOD 4 EVA")
)
show(
    df_spotify_feature_sample.select("track_id", "title", "artist")
)


track_id,title,artist
Loading... (need help?),,


In [259]:
# df_lyrics rows whose title contains "MOOD 4 EVA", de-duplicated on
# (title, artist) in original order, shown with the features column.
df_lyrics_sample = (
    df_lyrics
    .filter(pl.col("title").str.contains("MOOD 4 EVA"))
    .unique(subset=["title", "artist"], maintain_order=True)
)
show(
    df_lyrics_sample.select("title", "artist", "features")
)

title,artist,features
Loading... (need help?),,


## Check Remastered

In [260]:
# Titles containing "remaster" (any case) directly after a character that is
# neither a letter nor a number.
df_spotify_charts_remaster = df_spotify_charts_unique_titles.filter(
    pl.col("title").str.contains("(?i)[^\\p{Letter}\\p{Number}]remaster")
)

In [261]:
df_spotify_charts_remaster.shape[0] / df_spotify_charts_unique_titles.shape[0]

0.003913230566735009

About 0.4% impact (0.39% of unique titles)

In [262]:
df_spotify_charts_remaster.head()

title,rank,date,artist,region,chart,streams,track_id,album,duration_ms,explicit,release_date
str,i64,str,str,str,str,f64,str,str,f64,bool,str
"""La Stagione Dell'Amore - Remas…",33,"""2021-05-18""","""Franco Battiato""","""Italy""","""top200""",137340.0,"""7lhMwmewNFbRl8EuVZ9eFb""","""Orizzonti Perduti (2008 Remast…",229693.0,False,"""1983"""
"""Mo Money Mo Problems (feat. Pu…",177,"""2017-02-02""","""The Notorious B.I.G., Diddy""","""Australia""","""top200""",15603.0,"""4INDiWSKvqSKDEu7mh8HFz""","""Life After Death (2014 Remaste…",257399.0,True,"""1997-03-04"""
"""Cykady na Cykladach - 2011 Rem…",29,"""2018-07-28""","""Maanam""","""Poland""","""top200""",29477.0,"""0SwAC0yvUxGq2frHlNCfvE""","""Złota Kolekcja""",179226.0,False,"""2006-02-10"""
"""Wicked Game - Remastered""",46,"""2017-01-05""","""Chris Isaak""","""Slovakia""","""viral50""",,"""5l91KHr9uF8wdSdtRoZba8""","""Best Of Chris Isaak""",286880.0,False,"""2006-05-05"""
"""Into My Arms - 2011 Remastered…",38,"""2017-11-05""","""Nick Cave & The Bad Seeds""","""Slovakia""","""viral50""",,"""407ltk0BtcZI8kgu0HH4Yj""","""The Boatman's Call (2011 Remas…",256120.0,False,"""1997"""


In [263]:
# df_lyrics entries for "We Will Rock You" where the artist contains "queen"
# (both matches are case-insensitive substrings).
(
    df_lyrics
    .filter(
        pl.col("title").str.contains("(?i)we will rock you")
        & pl.col("artist").str.contains("(?i)queen")
    )
    .select("title", "artist", "year")
)

title,artist,year
str,str,i64
"""We Will Rock You""","""Queen""",1977
"""We Will Rock You Remix 1991 Ru…","""Queen""",1991
"""We Will Rock You""","""Killer Queenz""",2017
"""We Will Rock You Live at Knebw…","""Queen""",1986
"""We Will Rock You - LP JC Remi…","""Queen""",2012
…,…,…
"""We Will Rock You Live Killers""","""Queen""",1979
"""We Will Rock You 2 Live Killer…","""Queen""",1979
"""We Will Rock You""","""Queen + Paul Rodgers""",2005
"""We Will Rock You Live at Rock …","""Queen""",2020


## Check Prod

In [264]:
# Titles containing "prod." (any case) directly after a character that is
# neither a letter nor a number.
# Raw string: the previous literal mixed "\\p" with the invalid escape "\.";
# the regex text itself is unchanged.
df_spotify_charts_prod = df_spotify_charts_unique_titles.filter(
    pl.col("title").str.contains(r"(?i)[^\p{Letter}\p{Number}]prod\.")
)
df_spotify_charts_prod.shape

(418, 12)

In [265]:
df_spotify_charts_prod.shape[0] / df_spotify_charts_unique_titles.shape[0]

0.0021160806945604575

0.2% impact for prod

In [266]:
df_spotify_charts_prod.head()

title,rank,date,artist,region,chart,streams,track_id,album,duration_ms,explicit,release_date
str,i64,str,str,str,str,f64,str,str,f64,bool,str
"""Gabbiano / Moonrock (feat. Laz…",51,"""2018-12-18""","""Dani Faiv""","""Italy""","""top200""",79873.0,"""0bevh9FciBkb57ZqQk4uDQ""","""Gabbiano / Moonrock (feat. Laz…",185957.0,True,"""2018-12-18"""
"""Atlanta (Prod. Vizio & Pepe)""",22,"""2017-03-03""","""Gata Cattana""","""Spain""","""viral50""",,"""0u6BbwhvkxckUQHee7J5xp""","""Inéditos 2015""",178000.0,False,"""2016-02-09"""
"""YOSHI (feat. Fabri Fibra) - pr…",108,"""2020-05-01""","""MACHETE, Dani Faiv, tha Suprem…","""Italy""","""top200""",51333.0,"""3Hh0ZcGIqIkSiJVZGLcVLx""","""MACHETE MIXTAPE 4""",211994.0,True,"""2019-07-05"""
"""No McDonald's (feat. MadMan) -…",115,"""2020-07-03""","""Jack The Smoker""","""Italy""","""top200""",77799.0,"""5JGFrPZ2klpI7PJcWtrVaW""","""Ho Fatto Tardi""",170971.0,True,"""2020-07-03"""
"""In To You (feat. g1nger) - Pro…",88,"""2021-08-26""","""JAY B""","""Thailand""","""top200""",10556.0,,,,,


In [267]:
# Titles where "prod" (any case) is followed directly by another letter, after
# a non-letter/non-number character.
# Raw string: the previous literal mixed "\\p" with the invalid escape "\p";
# the regex text itself is unchanged.
# NOTE(review): this rebinds df_spotify_charts_prod, replacing the "prod."
# selection computed in the cells above.
df_spotify_charts_prod = df_spotify_charts_unique_titles.filter(
    pl.col("title").str.contains(r"(?i)[^\p{Letter}\p{Number}]prod[\p{Letter}]")
)

## Check Remix

In [268]:
# Titles containing "remix" anywhere (any case).
# The earlier pattern was "<non-letter/number>remix|remix": its second
# alternative already matches every occurrence, so the boundary-guarded branch
# was dead. This simpler pattern selects exactly the same rows.
# NOTE(review): if a word boundary before "remix" is actually wanted, the
# pattern needs the guard without the bare alternative, which changes the counts.
df_spotify_charts_remix = df_spotify_charts_unique_titles.filter(
    pl.col("title").str.contains("(?i)remix")
)
df_spotify_charts_remix.shape

(4097, 12)

In [269]:
df_spotify_charts_remix.shape[0] / df_spotify_charts_unique_titles.shape[0]

0.020740628243096162

2.07% impact

In [270]:
df_spotify_charts_remix.head()

title,rank,date,artist,region,chart,streams,track_id,album,duration_ms,explicit,release_date
str,i64,str,str,str,str,f64,str,str,f64,bool,str
"""Siren Remix (Feat. UNEDUCATED …",86,"""2021-11-01""","""HOMIES""","""South Korea""","""top200""",1959.0,"""2S9FuNX2H2cCwxcIoTUtbM""","""Siren Remix""",197093.0,False,"""2021-03-20"""
"""HIGHEST IN THE ROOM (feat. ROS…",197,"""2020-05-01""","""Travis Scott""","""Greece""","""top200""",2328.0,"""7h0d2h0fUmzbs7zeFigJPn""","""JACKBOYS""",244873.0,True,"""2019-12-27"""
"""Caught in Between - Remix""",31,"""2020-04-20""","""Gabe, Vintage Culture, Bondi""","""Brazil""","""viral50""",,"""7LLndwVJOXV2PDAqvHyCcy""","""Caught in Between (Remix)""",194810.0,False,"""2020-03-27"""
"""Deixe Me Ir - Remix""",43,"""2017-11-19""","""Everyonne""","""Brazil""","""viral50""",,"""7hah5UQfk3M6Z2lo7ZkV0n""","""1Kilo Deixe Me Ir Everyonne Re…",304456.0,False,"""2017-10-30"""
"""Mercury May (Brunelle Remix)""",36,"""2017-05-25""","""Thorsteinn Einarsson""","""Austria""","""viral50""",,,,,,


In [271]:
df_spotify_charts_remix[4, "title"]

'Mercury May (Brunelle Remix)'

In [272]:
# df_lyrics rows whose title contains "Thoiry" (case-sensitive).
df_lyrics.filter(pl.col("title").str.contains("Thoiry"))

title,tag,artist,year,views,features,lyrics,id,language_cld3,language_ft,language
str,str,str,i64,i64,str,str,i64,str,str,str
"""Thoiry""","""rap""","""Quentin40 & Puritano""",2017,65300,"""{}""","""[Testo di ""Thoiry""] [Intro] […",3323246,"""it""","""it""","""it"""
"""Thoiry Remix""","""rap""","""Achille Lauro""",2018,332163,"""{Gemitaiz,Quentin40,Puritano}""","""[Testo di ""Thoiry Remix"" ft. Q…",3461417,"""it""","""it""","""it"""


In [273]:
df_spotify_charts_remix[2, "title"], df_spotify_charts_remix[2, "artist"]

('Caught in Between - Remix', 'Gabe, Vintage Culture, Bondi')

In [274]:
# df_lyrics versions of "Despacito" where the artist contains "Luis Fonsi",
# with the features column.
(
    df_lyrics
    .filter(
        pl.col("title").str.contains("Despacito")
        & pl.col("artist").str.contains("Luis Fonsi")
    )
    .select("title", "artist", "features")
)

title,artist,features
str,str,str
"""Despacito""","""Luis Fonsi""","""{""Daddy Yankee""}"""
"""Despacito Versión Salsa""","""Luis Fonsi""","""{""Víctor Manuelle""}"""
"""Despacito Versión Pop""","""Luis Fonsi""","""{}"""
"""Despacito Remix""","""Luis Fonsi & Daddy Yankee""","""{""Justin Bieber""}"""
"""Despacito Major Lazer MOSKA R…","""Luis Fonsi & Daddy Yankee""","""{""Major Lazer"",MOSKA}"""
…,…,…
"""Despacito Hindi Version - Uday…","""Luis Fonsi""","""{}"""
"""Despacito One World: Together …","""Luis Fonsi""","""{}"""
"""Despacito Échame La Culpa""","""Luis Fonsi & Helene Fischer""","""{}"""
"""Despacito Versión Urbana/Sky""","""Luis Fonsi & Daddy Yankee""","""{""Sky Rompiendo""}"""


## Check Various Artists

In [275]:
# Chart entries whose artist field contains "Various Artist" (any case).
df_spotify_various_artists = (
    df_spotify_charts_unique_titles
    .filter(pl.col("artist").str.contains("(?i)Various Artist"))
    .select("title", "artist")
)

In [276]:
df_spotify_various_artists

title,artist
str,str
"""Wat Wil Je Van Me (feat. Anu-D…","""Various Artists, Ilayah"""
"""擁抱世界擁抱你 - 2017臺北世界大學運動會主題曲""","""Various Artists"""
"""MAJHE AALE""","""AP Dhillon, Shinda Kahlon, Gur…"
"""Exiliada del sur""","""Inti-Illimani, Various Artists"""
"""Bao Zhu Yi Sheng Da Di Chun-Hu…","""Various Artists"""
…,…
"""福气灵门猴犀利""","""Various Artists"""
"""Takbir Raya""","""Various Artists"""
"""Lippulaulu""","""Various Artists"""
"""Just Love Ngayong Christmas""","""Various Artists"""


## Check Original Version

In [277]:
# Chart titles containing "Original Version" (any case).
df_spotify_original_version = (
    df_spotify_charts_unique_titles
    .filter(pl.col("title").str.contains("(?i)Original Version"))
    .select("title", "artist")
)

In [278]:
show(df_spotify_original_version)

title,artist
Loading... (need help?),


## Check Extended Version

In [279]:
# Chart titles containing "Extended Version" (any case).
df_spotify_extended_version = (
    df_spotify_charts_unique_titles
    .filter(pl.col("title").str.contains("(?i)Extended Version"))
    .select("title", "artist")
)

In [280]:
df_spotify_extended_version

title,artist
str,str
"""Ты далеко - Extended Version""","""Gurevich & Leonov"""
"""Hai Phut Hon - Extended Versio…","""N'Small, Pháo"""
"""Arab dak - Extended Version""","""Diekel"""
"""More Life - Extended Version""","""Yasin"""
"""Sideways - Extended Version""","""Mat McHugh, The YUM YUM's"""
…,…
"""I'll Never Love Again - Extend…","""Lady Gaga"""
"""Love Nwantiti Tiktok - Extende…","""The Sham Tape"""
"""Sesame Street (Extended Versio…","""Joey Trap"""
"""Dolce Vita (Extended Version)""","""Ryan Paris"""


## Check Rerecorded

In [281]:
# Chart titles containing "Rerecorded" (any case).
df_spotify_rerecorded_version = (
    df_spotify_charts_unique_titles
    .filter(pl.col("title").str.contains("(?i)Rerecorded"))
    .select("title", "artist")
)

In [282]:
df_spotify_rerecorded_version

title,artist
str,str
"""Cinta Terakhir - Rerecorded Ve…","""Aiman Sidek, Syazmin Saprudin"""
"""I've Never Been to Me - Rereco…","""Charlene"""
"""Gps - Rerecorded version""","""Skaodi"""
"""リライト (2016 Version) - 2016 Rer…","""ASIAN KUNG-FU GENERATION"""
"""Fame - Rerecorded""","""Irene Cara"""
"""Take Me Home, Country Roads - …","""John Denver"""


## Check Edit

In [283]:
# Chart titles containing "Edit" (any case) directly after a character that is
# neither a letter nor a number.
df_spotify_edit_version = (
    df_spotify_charts_unique_titles
    .filter(pl.col("title").str.contains("(?i)[^\\p{Letter}\\p{Number}]Edit"))
    .select("title", "artist")
)

In [284]:
df_spotify_edit_version.shape

(1401, 2)

In [285]:
df_spotify_edit_version.shape[0] / df_spotify_charts_unique_titles.shape[0]

0.007092414002581821

0.7% impact for edit version

In [286]:
show(df_spotify_edit_version)

title,artist
Loading... (need help?),


## Check Mix

In [287]:
# Chart titles with "Mix" (any case) right after a non-letter/non-number
# character, dropping any title that also has "Remix" in that position.
df_spotify_mix_version = (
    df_spotify_charts_unique_titles
    .filter(
        pl.col("title").str.contains("(?i)[^\\p{Letter}\\p{Number}]Mix")
        & ~pl.col("title").str.contains("(?i)[^\\p{Letter}\\p{Number}]Remix")
    )
    .select("title", "artist")
)

In [288]:
df_spotify_mix_version.shape

(861, 2)

In [289]:
show(df_spotify_mix_version)

title,artist
Loading... (need help?),


## Check Instrumental

In [290]:
# Titles containing "instrumental" anywhere (any case).
# The earlier pattern was "<non-letter/number>Instrumental|instrumental": under
# (?i) its second alternative already matches every occurrence, so the
# boundary-guarded branch was dead. This simpler pattern selects the same rows.
# NOTE(review): if a word boundary before "instrumental" is actually wanted,
# keep the guard and drop the bare alternative, which changes the counts.
df_spotify_instrumental_version = df_spotify_charts_unique_titles.filter(
    pl.col("title").str.contains("(?i)instrumental")
).select("title", "artist")

In [291]:
df_spotify_instrumental_version.shape

(133, 2)

In [292]:
show(df_spotify_instrumental_version)

title,artist
Loading... (need help?),


## Check With

## Check Rework

## Get Totals for both datasets

In [293]:
def generate_word_matches_regex(characters_to_match : list, begin=False, middle=False, end=False):
    """Build a case-insensitive regex that finds any of the given words as a delimited token.

    A delimiter is a single character from the Unicode Punct or Separator
    classes. The words are joined with "|" inside one capture group and are
    NOT escaped, so regex metacharacters in them keep their meaning.

    Parameters
    ----------
    characters_to_match : list
        Words (regex fragments) to search for. Must not be empty.
    begin : bool
        Include the form "word at the very start, followed by a delimiter".
    middle : bool
        Include the form "word with a delimiter on both sides".
    end : bool
        Include the form "word at the very end, preceded by a delimiter".

    Returns
    -------
    str
        "(?i)" followed by the selected forms (in begin, middle, end order)
        joined with "|".

    Raises
    ------
    ValueError
        If `characters_to_match` is empty, or if none of `begin`, `middle`,
        `end` is set. Both cases previously produced a degenerate pattern
        (an empty group, or a bare "(?i)" that matches every string).
    """
    if not characters_to_match:
        raise ValueError("characters_to_match must contain at least one word")
    if not (begin or middle or end):
        raise ValueError("select at least one of begin, middle or end")

    # Raw string: "\p" is not a valid Python escape sequence, so it must not be
    # written in a plain (f-)string literal.
    delimiter = r"[\p{Punct}\p{Separator}]"
    match_characters = f"({'|'.join(characters_to_match)})"

    # Fixed begin -> middle -> end order keeps the generated pattern stable.
    candidates = [
        (begin, f"^{match_characters}{delimiter}"),
        (middle, f"{delimiter}{match_characters}{delimiter}"),
        (end, f"{delimiter}{match_characters}$"),
    ]
    return "(?i)" + "|".join(pattern for enabled, pattern in candidates if enabled)

In [294]:
# Three patterns, one per metadata family. The feature/prod words are only
# matched between delimiters; the remake and version words also at the end of the title.
feature_prod_match = generate_word_matches_regex(
    ["feat", "feature", "ft", "prod"],
    middle=True,
)
remake_match = generate_word_matches_regex(
    ["remaster", "remastered", "remix", "mix", "edit"],
    middle=True,
    end=True,
)
version_match = generate_word_matches_regex(
    ["original version", "extended version", "rerecorded", "instrumental"],
    middle=True,
    end=True,
)

In [295]:
feature_prod_match

'(?i)[\\p{Punct}\\p{Separator}](feat|feature|ft|prod)[\\p{Punct}\\p{Separator}]'

In [296]:
# Apply each metadata pattern to the unique Spotify chart titles.
df_spotify_charts_title_feature_count = filter_regex_in_column(
    df_spotify_charts_unique_titles, "title", feature_prod_match
)
df_spotify_charts_title_remake_count = filter_regex_in_column(
    df_spotify_charts_unique_titles, "title", remake_match
)
df_spotify_charts_title_version_count = filter_regex_in_column(
    df_spotify_charts_unique_titles, "title", version_match
)

In [297]:
df_spotify_charts_title_feature_count.shape[0] / df_spotify_charts_unique_titles.shape[0]

0.06749183688966512

In [298]:
df_spotify_charts_title_remake_count.shape[0] / df_spotify_charts_unique_titles.shape[0]

0.03499126737033943

In [299]:
df_spotify_charts_title_version_count.shape[0] / df_spotify_charts_unique_titles.shape[0]

0.0010276659832434759

In [300]:
# Apply the same three metadata patterns to the df_lyrics titles.
df_lyrics_title_feature_count = filter_regex_in_column(
    df_lyrics, "title", feature_prod_match
)
df_lyrics_title_remake_count = filter_regex_in_column(
    df_lyrics, "title", remake_match
)
df_lyrics_title_version_count = filter_regex_in_column(
    df_lyrics, "title", version_match
)

In [301]:
df_lyrics_title_feature_count.shape[0] / df_lyrics.shape[0]

0.006254313655533865

In [302]:
df_lyrics_title_remake_count.shape[0] / df_lyrics.shape[0]

0.020117214582064228

In [303]:
df_lyrics_title_version_count.shape[0] / df_lyrics.shape[0]

0.0007439351755920711

# Exploring Other Languages

In [304]:
# Lyrics rows labelled "zh" in the language column, most-viewed first.
chinese_song_lyrics = (
    df_lyrics
    .filter(pl.col("language") == "zh")
    .sort("views", descending=True)
)

In [305]:
chinese_song_lyrics.head()

title,tag,artist,year,views,features,lyrics,id,language_cld3,language_ft,language
str,str,str,i64,i64,str,str,i64,str,str,str
"""月亮代表我的心 Yue Liang Dai Biao Wo …","""pop""","""(Teresa Teng)""",1979,102025,"""{""鄧麗君 (Teresa Teng)""}""","""[副歌] 你問我愛你有多深 我愛你有幾分 我的情也真 我的愛…",684979,"""zh""","""zh""","""zh"""
"""Darkseid""","""pop""","""Grimes & PAN""",2020,91680,"""{""Grimes & 潘PAN""}""","""[Chorus: Grimes] Unrest is in …",4843398,"""zh""","""zh""","""zh"""
"""以後別做朋友 Let’s Not Be Friends An…","""pop""","""(Eric Chou)""",2014,86012,"""{""周興哲 (Eric Chou)""}""","""習慣聽你分享生活細節 害怕破壞完美的平衡點 保持著距離一顆心…",3101217,"""zh""","""zh""","""zh"""
"""中国姑娘 Chinese Girl""","""rap""","""Tizzy T""",2018,78283,"""{VAVA}""","""[Tizzy T「中国姑娘」歌词] [Verse 1: T…",3930512,"""zh""","""zh""","""zh"""
"""你不知道的痛 The Hurts You Never Kne…","""pop""","""KZ Tandingan""",2018,64113,"""{}""","""[KZ 谭定安《你不知道的痛》歌词] [Intro] 你还…",3535084,"""zh""","""zh""","""zh"""


# Remix Example Zedd

In [307]:
# df_lyrics rows by Zedd whose title contains "Clarity", one row per
# (title, artist) pair.
df_lyrics_sample = (
    df_lyrics
    .filter(
        pl.col("title").str.contains("Clarity")
        & (pl.col("artist") == "Zedd")
    )
    .unique(subset=["title", "artist"])
)

In [308]:
df_lyrics_sample

title,tag,artist,year,views,features,lyrics,id,language_cld3,language_ft,language
str,str,str,i64,i64,str,str,i64,str,str,str
"""Clarity Vicetone Remix""","""pop""","""Zedd""",2012,878,"""{Foxes}""","""[Verse 1: Foxes] High dive int…",2083199,"""en""","""en""","""en"""
"""Clarity Brillz Remix""","""pop""","""Zedd""",2013,450,"""{Foxes}""","""High dive into frozen waves Wh…",1834203,"""en""","""en""","""en"""
"""Clarity ARMNHMR NVRLEFT Remi…","""pop""","""Zedd""",2020,48,"""{Foxes}""","""[Verse 1] High dive into froze…",5743423,"""en""","""en""","""en"""
"""Clarity Zedd Union Mix""","""pop""","""Zedd""",2013,986,"""{}""","""[Verse 1: Foxes] Hot dive into…",3331690,"""en""","""en""","""en"""
"""Clarity Tiësto Remix""","""pop""","""Zedd""",2015,574,"""{Foxes}""","""[Verse 1: Foxes] Hot dive into…",1834288,"""en""","""en""","""en"""
"""Clarity""","""pop""","""Zedd""",2012,592758,"""{Foxes}""","""[Click here to see why Zedd's …",91948,"""en""","""en""","""en"""
"""Clarity Remix""","""pop""","""Zedd""",2014,220,"""{""Medina (DNK)""}""","""I'd dive into frozen waves Whe…",4700547,"""en""","""en""","""en"""
"""Clarity iTunes Session""","""pop""","""Zedd""",2013,42,"""{""Matthew Koma"",Foxes}""","""High dive into frozen waves Wh…",5675696,"""en""","""en""","""en"""
"""Clarity Style of Eye Remix""","""pop""","""Zedd""",2013,280,"""{}""","""[Verse 1: Foxes] Hot dive into…",1834285,"""en""","""en""","""en"""


In [309]:
## Hypothesis (just a guess): does `id` increase with `year`? The remixes of a song
## always seem to have higher ids than the original song.
## Result: there isn't a strong correlation. Rows with year < 1000 are left out.
df_lyrics.filter(pl.col("year") >= 1000).select(
    pl.corr(pl.col("id"), pl.col("year"))
)

id
f64
0.310121


In [310]:
# Spotify chart entries for "clarity" with "zedd" in the artist field (both
# case-insensitive), one row per (title, artist) pair.
df_spotify_charts_sample = (
    df_spotify_charts_unique_titles
    .filter(
        pl.col("title").str.contains("(?i)clarity")
        & pl.col("artist").str.contains("(?i)zedd")
    )
    .unique(subset=["title", "artist"])
)

In [311]:
df_spotify_charts_sample

title,rank,date,artist,region,chart,streams,track_id,album,duration_ms,explicit,release_date
str,i64,str,str,str,str,f64,str,str,f64,bool,str
"""Clarity""",198,"""2017-01-09""","""Zedd, Foxes""","""United Kingdom""","""top200""",32861.0,"""60wwxj6Dd9NJlirf84wr2c""","""Clarity""",271426.0,False,"""2012-01-01"""


Sometimes features are simply embedded in the artist column in the Spotify Charts dataset