This notebook demonstrates how to predict a musical genre from the respective "song speechiness" (a Spotify attribute) and emotional arousal (energy). 
 * These features are taken from the Spotify API and are stored in the songs_metadata.json file. 
 * The musical genre class is searched against all 5 genre columns in the dataset
 * We have selected to use a simple SVM classifier (no parameter tuning). 
 * The dataset is split into train and test sets (50% - 50%) 
 * The overall performance of the classifier is modest (73.51% in the run recorded below, i.e. about 23% better than a 50% random guess)
 * Also, the 2D decision boundary is visualized by applying the SVM's predictions to a fixed grid of points

In [13]:
import pandas as pd
import numpy as np
from sklearn import svm
import plotly
# GET THE DATA #
metadata = pd.read_json('songs_metadata.json')           # read the song metadata file using pandas
metadata = metadata.dropna()                             # drop rows (songs) that have a NaN in any column

class_names = ['country', 'rap']
genre_columns = [f'spotify-genre{i:02d}' for i in range(1, 6)]   # spotify-genre01 .. spotify-genre05
feature_columns = ['spotify-speechiness', 'spotify-energy']


def select_genre(frame, genre):
    """Return the rows of `frame` where any of the genre columns contains `genre`.

    The match is a substring match (`str.contains`), so e.g. 'rap' also matches
    any genre string that merely contains those letters. A song whose genre
    columns match both classes is returned for both of them.
    """
    mask = pd.Series(False, index=frame.index)
    for column in genre_columns:
        mask = mask | frame[column].str.contains(genre, na=False)
    return frame[mask]


data_1 = select_genre(metadata, class_names[0])
data_2 = select_genre(metadata, class_names[1])

print(len(data_1))
print(len(data_2))

# Feature matrix: one row per song, columns = (speechiness, energy);
# class-1 songs first, followed by class-2 songs.
X = np.concatenate([data_1[feature_columns].to_numpy(),
                    data_2[feature_columns].to_numpy()])
# Labels are the class-name strings themselves, in the same order as the rows of X.
y = np.array([class_names[0]] * len(data_1) + [class_names[1]] * len(data_2))

# TRAIN AND TEST #
# split data 50 - 50 into train and test (there are obviously smarter ways to do that, but we will see that later):
# even-indexed samples go to the training set, odd-indexed samples to the test set.
X_train = X[::2, :]
y_train = y[::2]
X_test = X[1::2, :]
y_test = y[1::2]

clf = svm.SVC()                           # initialize the classifier (default parameters, no tuning)
clf.fit(X_train, y_train)                 # train the classifier
y_pred = clf.predict(X_test)              # predicted class name for every test sample
# accuracy = fraction of test samples whose predicted label equals the true label
print(f'classification accuracy: {100*np.mean(y_test == y_pred):.2f}%')

import plotly.graph_objects as go
# BUILD THE DECISION GRID #
# Evaluate the classifier on a fixed grid spanning the range of the training data.
# np.linspace with endpoint=False yields exactly `grid_steps` points per axis
# (start, start + step, ...), whereas np.arange with a float step may produce
# one point more or less because of rounding.
grid_steps = 50
x_ = np.linspace(np.min(X_train[:, 0]), np.max(X_train[:, 0]), grid_steps, endpoint=False)
y_ = np.linspace(np.min(X_train[:, 1]), np.max(X_train[:, 1]), grid_steps, endpoint=False)

# grid_x / grid_y have shape (len(y_), len(x_)): rows follow y_, columns follow x_,
# which is the layout the heatmap expects for its z matrix.
grid_x, grid_y = np.meshgrid(x_, y_)

# One batched predict call for all grid points instead of one call per point.
grid_labels = clf.predict(np.column_stack([grid_x.ravel(), grid_y.ravel()]))

# Z holds, for every grid point, the index of the predicted class in class_names.
label_to_index = {name: index for index, name in enumerate(class_names)}
Z = np.array([label_to_index[label] for label in grid_labels], dtype=float).reshape(grid_x.shape)
print(Z.shape)

# VISUALIZE #
# Training samples of each class are drawn as scatter markers on top of a heatmap
# of the grid predictions stored in Z (0 -> first class, 1 -> second class).
fig = go.Figure(data=[
    go.Scatter(x=X_train[y_train==class_names[0],0], y=X_train[y_train==class_names[0],1],
               mode='markers', name=class_names[0],),
    go.Scatter(x=X_train[y_train==class_names[1],0], y=X_train[y_train==class_names[1],1],
               mode='markers', name=class_names[1],),
    # zmin/zmax pin the colour range so that 0 is always blue and 1 is always red,
    # even if every grid point happens to be assigned to the same class.
    go.Heatmap(z=Z, y=y_, x=x_, zmin=0, zmax=1,
               colorscale=[[0.0, 'rgb(20, 20, 200)'], [1.0, 'rgb(200, 20, 20)']])],
    layout=go.Layout(title=f'{class_names[0]} (blue) Vs {class_names[1]} (Red)',
                     xaxis=dict(title="speechiness",), yaxis=dict(title="energy",)))
plotly.offline.iplot(fig)




237
300
classification accuracy: 73.51%
(50, 50)
