In [31]:
# This notebook will clean and pre process our data
# Dataset: https://www.kaggle.com/datasets/maharshipandya/-spotify-tracks-dataset?resource=download
# steps:
#     - Remove duplicates
#     - drop rows with missing values, NaN's, etc
#     - Verify format (numbers are ints, strings are trimmed, etc.)
# This will result in a cleaned dataset ready for EDA
# Then I'll apply some pre processing to get the data ready for MLMs
#     - Remove extreme outliers for tempo and duration
#     - Bins / undersampling to balance skewed features
#     - normalize/scale features

import pandas as pd
import numpy as np

In [32]:
# cell for loading data
df = pd.read_csv('/content/dataset.csv')
display(df.head())

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,...,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,...,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,...,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,...,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,...,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


In [33]:
# dropping rows with missing or NaN values
# print shape before and after to see if there's a difference
before = df.shape
df = df.dropna()
print(f"Shape before: {before}, shape after: {df.shape}, dropped {before[0] - df.shape[0]} rows with missing values")

# remove duplicates
# before and after for this too
before = df.shape
df = df.drop_duplicates(subset = ["track_name", "artists", "duration_ms"])
print(f"Shape before: {before}, shape after: {df.shape}, dropped {before[0] - df.shape[0]} rows with duplicate values")

# export csv ready for Exploratory Data Analysis
df.to_csv('/content/cleaned_dataset_EDA.csv', index = False)
print("Dataset for EDA exported.")

Shape before: (114000, 21), shape after: (113999, 21), dropped 1 rows with missing values
Shape before: (113999, 21), shape after: (83074, 21), dropped 30925 rows with duplicate values
Dataset for EDA exported.


In [34]:
# Pre Processing for  MLM
# remove extreme outliers
# remove songs under 30 seconds or over 20 minutes
min_duration = 30 * 1000
max_duration = 20 * 60 * 1000

# According to Arabesque conservatory of music (https://www.arabesqueconservatory.com/blog/types-of-tempo-in-music/)
# there are types of tempo and all the music in the dataset falls within a type so i'm not gonna touch tempo for now
# BUT I will remove songs that have a tempo of 0
before = df.shape[0]
df = df[(df['duration_ms'] >= min_duration) & (df['duration_ms'] <= max_duration) & (df["tempo"] > 0)]
print(f"Removed {before - df.shape[0]} songs for duration and tempo outliers")

# Due to the extreme number of songs with popularity between 0 and 2 I'll bin and undersample popularity
bins = list(range(0,101,5))
labels = [f"{i}-{i+5}" for i in range(0,100,5)]
df["popularity_bin"] = pd.cut(df["popularity"], bins=bins, labels=labels, include_lowest=True, right=False)

bin_counts = df["popularity_bin"].value_counts().sort_index()
print("Popularity bin counts before balancing: ")
print(bin_counts)
print("Popularity bin mean before balancing: ")
print(bin_counts.mean())
# mean is target
target_n = int(bin_counts.mean())

balanced_bins = []
for b in labels:
  bin = df[df["popularity_bin"] == b]
  if len(bin) > target_n:
    bin = bin.sample(n = target_n, random_state = 42)
  balanced_bins.append(bin)

balanced_df = pd.concat(balanced_bins).sample(frac = 1, random_state = 42).reset_index(drop=True)
print("Shape after balancing: ", balanced_df.shape)
print("Popularity bin counts after balancing: ")
print(balanced_df["popularity_bin"].value_counts().sort_index())

df = balanced_df

print(df.describe())

Removed 228 songs for duration and tempo outliers
Popularity bin counts before balancing: 
popularity_bin
0-5        6937
5-10       2205
10-15      3412
15-20      5546
20-25     10194
25-30      6978
30-35      5796
35-40      6987
40-45      7817
45-50      6747
50-55      5655
55-60      5340
60-65      3857
65-70      2491
70-75      1493
75-80       827
80-85       390
85-90       134
90-95        29
95-100       10
Name: count, dtype: int64
Popularity bin mean before balancing: 
4142.25
Shape after balancing:  (56268, 22)
Popularity bin counts after balancing: 
popularity_bin
0-5       4142
5-10      2205
10-15     3412
15-20     4142
20-25     4142
25-30     4142
30-35     4142
35-40     4142
40-45     4142
45-50     4142
50-55     4142
55-60     4142
60-65     3857
65-70     2491
70-75     1493
75-80      827
80-85      390
85-90      134
90-95       29
95-100      10
Name: count, dtype: int64
          Unnamed: 0    popularity   duration_ms  danceability        energy  \
coun