This notebook demonstrates how to predict a song's popularity from its emotional valence and arousal (represented here by Spotify's `valence` and `energy` features).
 * These features are taken from the Spotify API and are stored in the songs_metadata.json file. The popularity score is thresholded and binarized so that the problem is transposed to a binary classification task of balanced classes.
 * We have selected to use a simple SVM classifier (no parameter tuning). 
 * The dataset is split into train and test sets (50% - 50%)
 * The overall performance of the classifier is rather low (about 59%, i.e. only about 9 percentage points better than a random guess)
 * Also, the 2D decision boundary is visualized by applying the SVM's predictions to a fixed grid of points

In [8]:
import pandas as pd
import numpy as np
from sklearn import svm
import plotly
# GET THE DATA #
metadata = pd.read_json('songs_metadata.json')           # read the song metadata file using pandas
metadata = metadata.dropna()                             # drop rows (songs) that contain any NaN value
print(metadata.columns)

# Feature matrix: one row per song, two columns (valence, energy).
X = np.array([metadata['spotify-valence'], metadata['spotify-energy']]).T

# Binarize the target at the median popularity so that the two classes are
# roughly balanced: 1 = at or above the median, 0 = below it.
popularity = np.array(metadata['spotify-popularity'])
y_mean = np.median(popularity)    # NOTE: despite its (legacy) name, this is the median, not the mean

# A single vectorised comparison replaces two sequential in-place masked
# assignments: those were order-dependent (values just written as 0 could be
# re-labelled by the second mask whenever the threshold is <= 0).
# The original dtype is kept so downstream code sees the same kind of labels.
y = (popularity >= y_mean).astype(popularity.dtype)
# TRAIN AND TEST #
# split data 50 - 50 into train and test (there are obviously smarter ways to do that, but we will see that later):
# even-indexed samples are used for training, odd-indexed samples for testing.
X_train = X[::2, :]
y_train = y[::2]
X_test = X[1::2, :]
y_test = y[1::2]

clf = svm.SVC()                           # initialize the classifier (default hyper-parameters, no tuning)
clf.fit(X_train, y_train)                 # train the classifier

# Evaluate on the held-out half. (A leftover single-point debug prediction,
# whose result was immediately overwritten, has been removed.)
y_pred = clf.predict(X_test)
accuracy = np.count_nonzero(y_test == y_pred) / len(y_pred)   # fraction of correctly classified test samples
print(f'classification accuracy: {100*accuracy:.2f}%')

# VISUALIZE #
import plotly.graph_objects as go

GRID_STEPS = 20   # number of grid points along each feature axis

# Evenly spaced grid over the range of the training features.
# linspace(..., endpoint=False) produces the same points as
# arange(min, max, (max - min) / GRID_STEPS) but always exactly GRID_STEPS of
# them (arange with a float step may return one extra point due to rounding).
x_ = np.linspace(np.min(X_train[:, 0]), np.max(X_train[:, 0]), GRID_STEPS, endpoint=False)
y_ = np.linspace(np.min(X_train[:, 1]), np.max(X_train[:, 1]), GRID_STEPS, endpoint=False)

# Classify every grid point in one batched call instead of one predict() call
# per point. With the default 'xy' indexing, grid_x[iy, ix] == x_[ix] and
# grid_y[iy, ix] == y_[iy], so Z[iy, ix] is the prediction for (x_[ix], y_[iy]).
# This also avoids assigning a 1-element array to a scalar slot, which is
# deprecated in recent NumPy versions.
grid_x, grid_y = np.meshgrid(x_, y_)
grid_points = np.column_stack([grid_x.ravel(), grid_y.ravel()])
Z = clf.predict(grid_points).astype(float).reshape(grid_x.shape)

# Heatmap of the predicted class over the (valence, energy) plane:
# blue = class 0, red = class 1.
fig = go.Figure(data=[
    go.Heatmap(z=Z, y=y_, x=x_, colorscale=[[0.0, 'rgb(20, 20, 200)'], [1.0, 'rgb(200, 20, 20)']])], 
    layout=go.Layout(title='Popularity Classifier', xaxis=dict(title="valence",), yaxis=dict(title="energy",)))
plotly.offline.iplot(fig)

Index(['artist', 'lastfm-listener-count', 'lastfm-play-count',
       'spotify-acousticness', 'spotify-albumName', 'spotify-artistName',
       'spotify-count_countries', 'spotify-danceability', 'spotify-date',
       'spotify-duration_ms', 'spotify-energy', 'spotify-genre01',
       'spotify-genre02', 'spotify-genre03', 'spotify-genre04',
       'spotify-genre05', 'spotify-instrumentalness', 'spotify-key',
       'spotify-liveness', 'spotify-loudness', 'spotify-mode',
       'spotify-num_of_tracks_in_album', 'spotify-popularity',
       'spotify-speechiness', 'spotify-tempo', 'spotify-time_signature',
       'spotify-trackName', 'spotify-track_no', 'spotify-valence', 'track'],
      dtype='object')
classification accuracy: 59.26%
