In [56]:
import pandas as pd 
import numpy as np 
import math
from random import randrange
import random
import string
import category_encoders as ce 
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle

In [57]:
# drop rows with invalid values and destringify the list of artists 
def cleanArtists(songs):
    songs['artists'] = songs['artists'].apply(lambda x: x[1:-1].split(', ') if(type(x) == str and len(x)) else [])
    songs['artists'] = songs['artists'].apply(lambda x: list(map(lambda y: y[1:-1], x)) )
    he = ce.HashingEncoder(cols=['artists'], n_components=500)
    songs = songs.join(pd.DataFrame(he.fit_transform(songs['artists'], songs['popularity']),
                          index=songs.index))
    return songs

In [58]:
# drop rows with invalid values or out of range (797 rows)
def cleanYear(songs):
     songs = songs.dropna(subset=['year'])
     songs.loc[songs.year > 1900, 'year'] = 2020 - songs.year
     songs = songs.rename(columns={'year': 'yearsSinceCreation'})
     return songs

In [59]:
# merging groups of songs with the same name and artists taking mean values for the other columns
def mergeDuplicates(songs):
    songs = songs.groupby(['artists', 'name'], as_index=False).agg({
        'valence':np.average,
        'yearsSinceCreation':np.average,
        'acousticness':np.average,
        'danceability':np.average,
        'duration_ms':np.average,
        'energy':np.average,
        'instrumentalness':np.average,
        'liveness':np.average,
        'loudness':np.average,
        'tempo':np.average,
        'speechiness':np.average,
        'popularity':np.average,
        'explicit':np.average,
        'mode':np.average
    })
    songs.loc[:, 'explicit'] = round(songs.explicit)

    return songs

In [60]:
def removeEmpty(songs):
    for col in songs.columns:
        songs = songs.dropna(subset=[col])
    return songs

In [61]:
def dropCols(songs):
    songs.drop(columns=['id', 'release_date'])
    return songs

In [62]:
songs = pd.read_csv('spotify_training.csv')[0:500]
songs = cleanYear(songs)
songs = removeEmpty(songs)
songs = dropCols(songs)
print(len(songs))
songs = mergeDuplicates(songs)
for col in songs.columns:
    print(col, '\n')


423
artists 

name 

valence 

yearsSinceCreation 

acousticness 

danceability 

duration_ms 

energy 

instrumentalness 

liveness 

loudness 

tempo 

speechiness 

popularity 

explicit 

mode 



In [63]:
songs = cleanArtists(songs)
print(songs.head())


                           artists                                  name  \
0               [Alexander O'Neal]  (What Can I Say) To Make You Love Me   
1  [Anita O'Day, The Three Sounds]   (Fly Me To The Moon) In Other Words   
2               [Herman's Hermits]             ( What A) Wonderful World   
3            [The Lovin' Spoonful]                 (Till I) Run With You   
4                     [The O'Jays]              (They Call Me) Mr. Lucky   

   valence  yearsSinceCreation  acousticness  danceability  duration_ms  \
0    0.902                33.0         0.276         0.661     264933.0   
1    0.327                58.0         0.870         0.561     231840.0   
2    0.967                54.0         0.589         0.759     119573.0   
3    0.900                51.0         0.151         0.584     114427.0   
4    0.726                48.0         0.670         0.527     198733.0   

   energy  instrumentalness  liveness  ...  col_490  col_491  col_492  \
0   0.828          

In [64]:
songs.describe()

Unnamed: 0,valence,yearsSinceCreation,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,tempo,...,col_490,col_491,col_492,col_493,col_494,col_495,col_496,col_497,col_498,col_499
count,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0,...,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0
mean,0.56285,38.461735,0.410542,0.557584,220901.117347,0.553458,0.119347,0.207419,-10.759037,125.274404,...,0.002551,0.0,0.0,0.0,0.0,0.0,0.0,0.002551,0.0,0.0
std,0.261147,20.173064,0.345129,0.169589,89460.422706,0.25624,0.26495,0.177873,5.352514,25.435072,...,0.050508,0.0,0.0,0.0,0.0,0.0,0.0,0.050508,0.0,0.0
min,0.0,0.0,0.0,0.0,52520.0,0.0,0.0,0.0,-60.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.35875,23.0,0.071412,0.447,163739.5,0.34825,0.0,0.096825,-13.036875,109.47225,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.579,42.0,0.3555,0.577,202080.0,0.56675,0.00019,0.1421,-10.073,123.3345,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.782,53.0,0.721,0.681,250246.75,0.77625,0.026325,0.26075,-7.19,139.95675,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.972,94.0,0.995,0.942,996000.0,0.9895,0.972,0.978,-1.352,181.22,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [65]:
songs.drop(columns=['name'])
print(songs.head())

                           artists                                  name  \
0               [Alexander O'Neal]  (What Can I Say) To Make You Love Me   
1  [Anita O'Day, The Three Sounds]   (Fly Me To The Moon) In Other Words   
2               [Herman's Hermits]             ( What A) Wonderful World   
3            [The Lovin' Spoonful]                 (Till I) Run With You   
4                     [The O'Jays]              (They Call Me) Mr. Lucky   

   valence  yearsSinceCreation  acousticness  danceability  duration_ms  \
0    0.902                33.0         0.276         0.661     264933.0   
1    0.327                58.0         0.870         0.561     231840.0   
2    0.967                54.0         0.589         0.759     119573.0   
3    0.900                51.0         0.151         0.584     114427.0   
4    0.726                48.0         0.670         0.527     198733.0   

   energy  instrumentalness  liveness  ...  col_490  col_491  col_492  \
0   0.828          