# 1. Import the dataset

Load the dataset with `pandas`, using the `hf://` link from HuggingFace.

Note that you have to re-run this cell at the start of every session (Colab does not save variables between sessions).

In [33]:
import pandas as pd
import tqdm as notebook_tqdm
import nbformat
# Read dataset.csv from the HuggingFace dataset repository into the `spotify` DataFrame.
# NOTE(review): resolving the hf:// scheme presumably requires the huggingface_hub
# package to be installed in the kernel environment — confirm.
spotify = pd.read_csv("hf://datasets/maharshipandya/spotify-tracks-dataset/dataset.csv")

In [6]:
# import some libraries
import numpy as np
import plotly.express as px
# Show every column when a DataFrame is displayed (None removes the column limit).
pd.options.display.max_columns = None
# Turn on pandas' copy-on-write mode for the whole session.
pd.set_option('mode.copy_on_write', True)
import plotly.express as px

# 2. Getting started

Print out the columns of the dataset.
Print out the first 20 rows of the dataset.

In [7]:
# Show the column labels of the dataset.
spotify.columns
# To preview the first 20 rows instead, end the cell with: spotify.head(20)

Index(['Unnamed: 0', 'track_id', 'artists', 'album_name', 'track_name',
       'popularity', 'duration_ms', 'explicit', 'danceability', 'energy',
       'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature',
       'track_genre'],
      dtype='object')

In [8]:
# Preview 20 randomly chosen rows. The fixed random_state makes the same rows
# appear on every run, so the preview is reproducible after a kernel restart.
spotify.sample(20, random_state=42)

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
18895,18895,5FBvGDfcEshfwMMTwR1kgm,Richard Pryor;Lena Horne,The Wiz,The Good Witch Glinda,21,69000,False,0.195,0.143,10,-17.805,1,0.0346,0.947,0.00228,0.348,0.0739,83.077,3,comedy
98423,98423,1pzOQkOi72TdKITN8Swg1s,Alan Menken,Beauty And The Beast: The Broadway Musical,Entr'acte/Wolf Chase,26,269280,False,0.295,0.183,2,-17.833,1,0.0385,0.78,0.214,0.0983,0.17,143.202,4,show-tunes
106928,106928,6EPJ0JkvHEWARjK5lq0Bqf,Markus Krunegård;Klara Söderberg,LUGNA HITS,Det var en gång i Lissabon,0,236333,False,0.591,0.488,0,-6.33,1,0.0265,0.756,0.0,0.108,0.434,97.009,4,swedish
48123,48123,2p9sprAX27D6U5cuSE9NsM,Rakim,The Master,When I B On Tha Mic,65,222933,True,0.836,0.484,2,-7.332,1,0.327,0.0454,0.0,0.0557,0.839,95.941,4,hardcore
99903,99903,4hv9mEWi4911k1uhU4fPEH,Ufuk Beydemir,Sevda Gibi,Ay Tenli Kadın,67,197967,False,0.533,0.541,6,-7.854,1,0.0252,0.217,4e-06,0.11,0.335,144.135,4,singer-songwriter
33055,33055,43ZyHQITOjhciSUUNPVRHc,Lil Pump,Lil Pump,Gucci Gang,64,124055,True,0.936,0.523,5,-6.71,1,0.0597,0.239,0.0,0.117,0.699,119.889,4,emo
95458,95458,4Ak08qsbY05C2fn6lcirCK,Joe Arroyo,32 Grandes Exitos,Falta La Plata,30,187733,False,0.76,0.539,6,-7.164,1,0.05,0.359,5.2e-05,0.0717,0.922,107.963,4,salsa
52047,52047,23EblUTQTswA21CfIzBL6z,George Jones,George Jones - 16 Biggest Hits,If Drinkin' Don't Kill Me (Her Memory Will),29,191293,False,0.596,0.333,6,-13.118,1,0.0304,0.689,0.0,0.201,0.239,89.554,3,honky-tonk
5296,5296,4JFFfy6ZQcvi9ggYE8hdc2,Kato,Sleep Anime Lofi Beats,Again Lofi (Fullmetal Alchemist Brotherhood),57,92917,False,0.332,0.107,4,-23.231,0,0.0399,0.795,0.854,0.108,0.128,179.117,4,anime
80399,80399,73C5Q80RK1T5CA68F6foTl,Harish Raghavendra;Srilekha Parthasarathy;Franko,Lesa Lesa,Yedho Ondru,61,336962,False,0.709,0.535,9,-8.152,0,0.0357,0.164,0.0,0.0877,0.749,99.965,4,pop-film


In [9]:
# Dimensions of the raw spotify dataset as (rows, columns)
spotify.shape

(114000, 21)

In [10]:
# Sanity check: tally how many rows share each (artists, track_name) pair.
# NOTE(review): groupby uses dropna=True by default, so rows with a null artist
# or track name would not appear in this tally — confirm that is intended.
counts = (
    spotify
    .groupby(['artists', 'track_name'])
    .size()
    .reset_index(name='count')
)
print(counts)
# Any pair whose count differs from 1 occurs more than once in the dataset.
print(f"There are {counts['count'].ne(1).sum()} artist, track_name combinations that are non-unique.")

                     artists  \
0                     !nvite   
1                     !nvite   
2      "Puppy Dog Pals" Cast   
3      "Puppy Dog Pals" Cast   
4        "Weird Al" Yankovic   
...                      ...   
81338                     黃妃   
81339                    黃小琥   
81340                    黃敏華   
81341                龍藏Ryuzo   
81342                龍藏Ryuzo   

                                              track_name  count  
0                                               pagadoff      1  
1                                              strolling      1  
2                                     Going on a Mission      1  
3                        Puppy Dog Pals Main Title Theme      1  
4      Amish Paradise (Parody of "Gangsta's Paradise"...      1  
...                                                  ...    ...  
81338                                              溫暖的所在      1  
81339                                              沒那麽簡單      1  
81340                

## Sanity checks!

- Are there any entries with null values?
- Do numbers fall in the expected range?


In [11]:
# Number of rows whose popularity falls outside the 0-100 scale (expected: 0)
int((spotify['popularity'].lt(0) | spotify['popularity'].gt(100)).sum())

0

In [12]:
# Number of rows whose danceability falls outside [0.0, 1.0] (expected: 0)
int((spotify['danceability'].lt(0.0) | spotify['danceability'].gt(1.0)).sum())

0

In [13]:
# Number of rows whose energy falls outside [0.0, 1.0] (expected: 0)
int((spotify['energy'].lt(0.0) | spotify['energy'].gt(1.0)).sum())

0

In [14]:
# Number of rows whose mode is anything other than 0 or 1 (expected: 0)
int((~spotify['mode'].isin([0, 1])).sum())

0

In [15]:
# Number of rows whose speechiness falls outside [0.0, 1.0] (expected: 0)
int((spotify['speechiness'].lt(0.0) | spotify['speechiness'].gt(1.0)).sum())

0

In [16]:
# Number of rows whose acousticness falls outside [0.0, 1.0] (expected: 0)
int((spotify['acousticness'].lt(0.0) | spotify['acousticness'].gt(1.0)).sum())

0

In [None]:
# Number of rows whose instrumentalness falls outside [0.0, 1.0] (expected: 0)
int((spotify['instrumentalness'].lt(0.0) | spotify['instrumentalness'].gt(1.0)).sum())

In [None]:
# Number of rows whose liveness falls outside [0.0, 1.0] (expected: 0)
int((spotify['liveness'].lt(0.0) | spotify['liveness'].gt(1.0)).sum())

In [None]:
# Number of rows whose valence falls outside [0.0, 1.0] (expected: 0)
int((spotify['valence'].lt(0.0) | spotify['valence'].gt(1.0)).sum())

In [None]:
# Number of rows with a negative tempo; a tempo of exactly 0 is not counted here
int(spotify['tempo'].lt(0.0).sum())

In [None]:
# Number of rows whose time signature lies outside the inclusive range 3-7
int((spotify['time_signature'].lt(3) | spotify['time_signature'].gt(7)).sum())

## Let's visualize some missing values!

In [34]:
# Flag every track whose tempo is exactly 0, then count the flagged tracks per
# genre (track_genre is moved into the index so groupby(level=0) groups on it).
zero_tempo_by_genre = spotify.set_index("track_genre")["tempo"].eq(0).groupby(level=0).sum()

# reset_index yields a two-column frame: track_genre, and tempo, where the
# tempo column now holds the per-genre count of zero values.
zero_tempo_by_genre_df = zero_tempo_by_genre.reset_index()

# Bar chart of the counts. The labels dict has to be keyed by the column names
# passed as x and y; the previous keys ('0', 'genre') matched no column, so the
# axes showed the raw column names instead of the intended labels.
px.bar(zero_tempo_by_genre_df,
       x='track_genre',
       y='tempo',  # the count of zero tempo values, not the tempo itself
       labels={'tempo': 'Number of zero tempo values', 'track_genre': 'Genre'},
       title="Zero Tempo Values by Genre")

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [29]:
# Flag every track whose valence is exactly 0, then count the flagged tracks per
# genre (track_genre is moved into the index so groupby(level=0) groups on it).
zero_valence_by_genre = spotify.set_index("track_genre")["valence"].eq(0).groupby(level=0).sum()

# reset_index yields a two-column frame: track_genre, and valence, where the
# valence column now holds the per-genre count of zero values.
zero_valence_by_genre_df = zero_valence_by_genre.reset_index()

# Bar chart of the counts. The labels dict has to be keyed by the column names
# passed as x and y; the previous keys ('0', 'genre') matched no column, so the
# axes showed the raw column names instead of the intended labels.
px.bar(zero_valence_by_genre_df,
       x='track_genre',
       y='valence',  # the count of zero valence values, not the valence itself
       labels={'valence': 'Number of zero valence values', 'track_genre': 'Genre'},
       title="Zero Valence Values by Genre")

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [35]:
# Boolean series indexed by track_genre: True where the time signature is
# below 3 or above 7. Vectorized comparisons replace the row-by-row lambda
# and give the same True/False value for every row.
invalid_time_signatures = (
    spotify.set_index("track_genre")["time_signature"]
    .pipe(lambda ts: ts.lt(3) | ts.gt(7))
)

# Summing the booleans per genre gives the number of invalid values per genre.
time_signatures = invalid_time_signatures.groupby(level=0).sum()

# Two-column frame: track_genre, and time_signature holding the invalid count.
time_signatures_df = time_signatures.reset_index()

# Bar chart of the counts. The labels dict has to be keyed by the column names
# passed as x and y; the previous keys ('0', 'genre') matched no column, so the
# axes showed the raw column names instead of the intended labels.
px.bar(time_signatures_df,
       x='track_genre',
       y='time_signature',  # the count of invalid time signature values
       labels={'time_signature': 'Number of invalid time signatures', 'track_genre': 'Genre'},
       title="Invalid Time Signature by Genre")


ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

# 3. Data cleaning

- Remove duplicate rows (same artist, same song, different genre or album)
  - These will have different track IDs
- Replace missing values
- Remove "Unnamed: 0" column (which is just the row number)

- Missing value:
  - Explicit = unknown
  - Key = -1

- Time signatures < 3 and > 7
  - A time signature of 0 usually means the "sleep" genre

Warning: try not to use `inplace=True`, because it modifies the original DataFrame. For example, if you call `drop_duplicates` with `inplace=True`, the original `spotify` DataFrame will permanently lose its duplicate rows.

`drop_duplicates` has a `subset` argument. It will consider two rows duplicates if they have the same values for `subset`.

In [25]:
# Keep only the first row seen for each (artists, track_name) pair; later rows
# with the same pair (e.g. listed under another genre or album) are masked out.
is_repeat = spotify.duplicated(subset=['artists', 'track_name'], keep='first')
spotify_new = spotify[~is_repeat]
spotify_new.shape

(81344, 21)

In [None]:
# Imputation for rows with missing values
