In [1]:
# Dependencies
import pandas as pd
import numpy as np

In [2]:
# Read the raw per-artist CSV into a DataFrame, then preview its first five rows
file = '../../uncleaned_data/data_by_artist_w_genres.csv'
df = pd.read_csv(filepath_or_buffer=file)
df.head(n=5)

Unnamed: 0,artists,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key,mode,count,genres
0,"""Cats"" 1981 Original London Cast",0.5985,0.4701,267072.0,0.376203,0.010261,0.28305,-14.4343,0.20915,114.1288,0.35832,38.2,5,1,10,['show tunes']
1,"""Cats"" 1983 Broadway Cast",0.862538,0.441731,287280.0,0.406808,0.081158,0.315215,-10.69,0.176212,103.044154,0.268865,31.538462,5,1,26,[]
2,"""Fiddler On The Roof” Motion Picture Chorus",0.856571,0.348286,328920.0,0.286571,0.024593,0.325786,-15.230714,0.118514,77.375857,0.354857,34.571429,0,1,7,[]
3,"""Fiddler On The Roof” Motion Picture Orchestra",0.884926,0.425074,262890.962963,0.24577,0.073587,0.275481,-15.63937,0.1232,88.66763,0.37203,34.407407,0,1,27,[]
4,"""Joseph And The Amazing Technicolor Dreamcoat""...",0.510714,0.467143,270436.142857,0.488286,0.0094,0.195,-10.236714,0.098543,122.835857,0.482286,42.0,5,1,7,[]


In [3]:
# Shape of df, reported as a (number_of_rows, number_of_columns) tuple
df.shape

(32539, 16)

In [4]:
# List every column and flag whether it holds at least one NaN value
df.isnull().any(axis=0)

artists             False
acousticness        False
danceability        False
duration_ms         False
energy              False
instrumentalness    False
liveness            False
loudness            False
speechiness         False
tempo               False
valence             False
popularity          False
key                 False
mode                False
count               False
genres              False
dtype: bool

In [5]:
# The previous check did not report NaN in any column.
# Next, inspect the data type of each column.
df.dtypes

artists              object
acousticness        float64
danceability        float64
duration_ms         float64
energy              float64
instrumentalness    float64
liveness            float64
loudness            float64
speechiness         float64
tempo               float64
valence             float64
popularity          float64
key                   int64
mode                  int64
count                 int64
genres               object
dtype: object

##### Since the "genres" column stores lists as strings, an empty list (`[]`) is not regarded as a NaN value. We will therefore check whether any artists in this dataset are actually missing genre labels.

In [6]:
# Count the rows whose "genres" value is the literal string '[]' (an empty list)
len(df[df['genres'].eq('[]')])

13563

It looks like more than 1/3 of the rows in the dataset do not have a genre label. Let's take a look at who these artists are to get an idea of whether we would like to remove them.

In [7]:
# Show the first 20 rows whose "genres" value is the literal string '[]'
df[df['genres'].eq('[]')].head(n=20)

Unnamed: 0,artists,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key,mode,count,genres
1,"""Cats"" 1983 Broadway Cast",0.862538,0.441731,287280.0,0.406808,0.081158,0.315215,-10.69,0.176212,103.044154,0.268865,31.538462,5,1,26,[]
2,"""Fiddler On The Roof” Motion Picture Chorus",0.856571,0.348286,328920.0,0.286571,0.024593,0.325786,-15.230714,0.118514,77.375857,0.354857,34.571429,0,1,7,[]
3,"""Fiddler On The Roof” Motion Picture Orchestra",0.884926,0.425074,262890.962963,0.24577,0.073587,0.275481,-15.63937,0.1232,88.66763,0.37203,34.407407,0,1,27,[]
4,"""Joseph And The Amazing Technicolor Dreamcoat""...",0.510714,0.467143,270436.142857,0.488286,0.0094,0.195,-10.236714,0.098543,122.835857,0.482286,42.0,5,1,7,[]
5,"""Joseph And The Amazing Technicolor Dreamcoat""...",0.591167,0.484333,218504.5,0.300608,0.007042,0.176067,-18.5795,0.104958,122.517833,0.4245,33.166667,9,1,24,[]
6,"""Mama"" Helen Teagarden",0.725,0.637,135533.0,0.512,0.186,0.426,-20.615,0.21,134.819,0.885,0.0,8,1,2,[]
7,"""Test for Victor Young""",0.927,0.734,175693.0,0.474,0.0762,0.737,-10.544,0.256,132.788,0.902,3.0,10,1,2,[]
9,$0dg0d,0.11,0.732,160171.0,0.822,9e-06,0.154,-5.387,0.155,141.948,0.74,0.0,7,0,2,[]
12,$tar$eed,0.0375,0.923,130375.0,0.172,0.0202,0.0827,-15.646,0.385,100.046,0.176,0.0,10,0,4,[]
18,(Con La Participación de Marc Anthony),0.538,0.731,361440.0,0.794,2.4e-05,0.0736,-4.182,0.0408,88.003,0.873,42.0,5,1,2,[]


From the artist names, it looks like many of them could be relatively more niche "artists". Let's also check whether there are artists with many songs that do not have a labeled genre.

In [8]:
# Order the rows by "count", largest first, and show the top five
df.sort_values(by='count', ascending=False).head(n=5)

Unnamed: 0,artists,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key,mode,count,genres
9761,Francisco Canaro,0.984488,0.658213,176800.691098,0.286427,0.542779,0.198878,-12.114052,0.110551,123.000891,0.742906,0.049701,2,1,3179,"['tango', 'vintage tango']"
27354,Tadeusz Dolega Mostowicz,0.451377,0.680885,132738.188915,0.222287,1e-05,0.204249,-23.232556,0.955717,105.357162,0.565923,0.0,1,1,2562,[]
32374,Эрнест Хемингуэй,0.351897,0.696763,116385.067234,0.188663,2.8e-05,0.372711,-18.23663,0.926439,111.517753,0.587283,0.035745,11,1,2350,[]
32373,Эрих Мария Ремарк,0.298241,0.693759,121067.432203,0.200422,0.000231,0.196048,-19.254032,0.921459,110.731847,0.536123,0.000942,0,1,2124,[]
9832,Frank Sinatra,0.743164,0.383003,189404.126132,0.233773,0.019481,0.23363,-14.441736,0.04939,109.540857,0.363877,28.363763,5,1,1435,"['adult standards', 'easy listening', 'lounge']"


It appears that three of the top 5 artists with the most songs are non-English artists that do not have genre labels. Considering that the rows without genre labels make up a big part of the dataset and also appear among the top song counts, we decided not to remove them for now. We can remove them at any time throughout the project on a case-by-case basis.

As a result, we feel comfortable enough with the existing dataset, so we decided to use it as is.

In [9]:
# Write the DataFrame out as a CSV: keep the header row, omit the index
df.to_csv(
    path_or_buf="../../cleaned_data/cleaned_data_by_artist.csv",
    header=True,
    index=False,
)