# Neural Network Spotify Playlist


### Data Pre-Processing

Prepare the data for the neural network. This includes splitting the data into a training and testing dataset, Scaling the data, and encoding the categorical target values

In [45]:
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow as tf

In [46]:
#reading into compiled csv
data = Path("Resources/spotify_main_raw_data.csv")
df = pd.read_csv(data)
df.shape

(454, 22)

In [47]:
#dropping irrelevant columns
raw_data = df.drop(['Unnamed: 0',"track","type","id","uri","track_href","analysis_url","time_signature"],axis=1)

# Encoding the Artist's column
spot_df = pd.get_dummies(raw_data, columns=["artist"])
spot_df.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,...,artist_Yang Da Il,artist_Young T & Bugsey,artist_ZZ Top,artist_Zion.T,artist_blink-182,artist_dj poolboi,artist_gianni & kyle,artist_rum.gold,artist_林依晨,artist_王俊凱
0,0.564,0.364,10,-5.845,0,0.0631,0.759,0.0,0.0839,0.591,...,0,0,0,0,0,0,0,0,0,0
1,0.701,0.519,1,-6.382,1,0.0516,0.314,0.0,0.207,0.498,...,0,0,0,0,0,0,0,0,0,0
2,0.309,0.74,7,-5.917,0,0.0456,0.00854,0.0258,0.119,0.166,...,0,0,0,0,0,0,0,0,0,0
3,0.552,0.637,5,-6.568,1,0.0445,0.464,1.6e-05,0.136,0.333,...,0,0,0,0,0,0,0,0,0,0
4,0.655,0.885,7,-4.116,1,0.0438,0.00117,0.000473,0.0448,0.938,...,0,0,0,0,0,0,0,0,0,0


In [48]:
spotify_data = spot_df.drop(["playlist"], axis=1)
spotify_data.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,...,artist_Yang Da Il,artist_Young T & Bugsey,artist_ZZ Top,artist_Zion.T,artist_blink-182,artist_dj poolboi,artist_gianni & kyle,artist_rum.gold,artist_林依晨,artist_王俊凱
0,0.564,0.364,10,-5.845,0,0.0631,0.759,0.0,0.0839,0.591,...,0,0,0,0,0,0,0,0,0,0
1,0.701,0.519,1,-6.382,1,0.0516,0.314,0.0,0.207,0.498,...,0,0,0,0,0,0,0,0,0,0
2,0.309,0.74,7,-5.917,0,0.0456,0.00854,0.0258,0.119,0.166,...,0,0,0,0,0,0,0,0,0,0
3,0.552,0.637,5,-6.568,1,0.0445,0.464,1.6e-05,0.136,0.333,...,0,0,0,0,0,0,0,0,0,0
4,0.655,0.885,7,-4.116,1,0.0438,0.00117,0.000473,0.0448,0.938,...,0,0,0,0,0,0,0,0,0,0


In [49]:
# Standardize data
spotify_scaled = StandardScaler().fit_transform(spotify_data)

In [50]:
spotify = np.array(spotify_data)
spotify

array([[ 0.564,  0.364, 10.   , ...,  0.   ,  0.   ,  0.   ],
       [ 0.701,  0.519,  1.   , ...,  0.   ,  0.   ,  0.   ],
       [ 0.309,  0.74 ,  7.   , ...,  0.   ,  0.   ,  0.   ],
       ...,
       [ 0.562,  0.934,  5.   , ...,  0.   ,  0.   ,  0.   ],
       [ 0.743,  0.446,  4.   , ...,  0.   ,  0.   ,  0.   ],
       [ 0.503,  0.581,  6.   , ...,  0.   ,  0.   ,  0.   ]])

In [51]:
# Initialize PCA model
pca = PCA(n_components=8)

# Get two principal components for the data.
spotify_pca = pca.fit_transform(spotify_scaled)

df_spotify_pca = pd.DataFrame(
    data=spotify_pca, columns=["principal component 1", 
                               "principal component 2", 
                               "principal component 3", 
                               "principal component 4", 
                               "principal component 5", 
                               "principal component 6",
                               "principal component 7",
                               "principal component 8"]
                                                        )
df_spotify_pca.head()

Unnamed: 0,principal component 1,principal component 2,principal component 3,principal component 4,principal component 5,principal component 6,principal component 7,principal component 8
0,1.876556,-2.079369,0.429767,-2.888062,0.658185,-1.356652,-0.583383,-0.873052
1,0.421989,-1.271517,-1.314892,1.669279,-0.136069,-0.411536,1.176187,-1.047807
2,0.021593,6.201081,-0.219186,-2.838112,-4.244688,-2.236713,6.906013,2.406996
3,0.633654,0.276765,-0.714865,0.220098,0.295061,-1.186853,1.17732,-0.05181
4,-2.464008,-0.76016,-2.279879,-0.467851,0.567997,-1.052548,-1.301199,0.502535


In [52]:
# Fetch the explained variance
variance_ratio = pca.explained_variance_ratio_

print(f"We have {round((sum(variance_ratio)*100),2)}% of the information in the original dataset")

We have 5.48% of the information in the original dataset


In [53]:
#appending playlist column back on PCA list
df_spotify_pca['playlist'] = raw_data['playlist']

In [54]:
df_spotify_pca

Unnamed: 0,principal component 1,principal component 2,principal component 3,principal component 4,principal component 5,principal component 6,principal component 7,principal component 8,playlist
0,1.876556,-2.079369,0.429767,-2.888062,0.658185,-1.356652,-0.583383,-0.873052,ryan
1,0.421989,-1.271517,-1.314892,1.669279,-0.136069,-0.411536,1.176187,-1.047807,ryan
2,0.021593,6.201081,-0.219186,-2.838112,-4.244688,-2.236713,6.906013,2.406996,ryan
3,0.633654,0.276765,-0.714865,0.220098,0.295061,-1.186853,1.177320,-0.051810,ryan
4,-2.464008,-0.760160,-2.279879,-0.467851,0.567997,-1.052548,-1.301199,0.502535,ryan
...,...,...,...,...,...,...,...,...,...
449,-1.012177,1.187173,0.418757,1.234964,6.074706,-0.924686,2.608544,0.539105,alex
450,-0.000413,1.481097,-0.320832,1.762389,1.760652,-0.708704,1.540892,0.180073,alex
451,-1.621853,1.175019,-1.307763,0.382647,0.248354,-0.792650,1.185612,-0.041043,alex
452,0.495694,-0.206841,0.803348,-0.572994,-0.096422,1.052658,1.506360,-1.164708,alex


In [55]:
#Verifying playlist classifications
df_spotify_pca.playlist.value_counts()

ryan     100
sarah    100
abdul    100
terry     99
alex      55
Name: playlist, dtype: int64

In [56]:
# Define the features X set and the target y vector
X = df_spotify_pca.drop("playlist",axis=1)
y = df_spotify_pca.loc[:, ["playlist"]]

In [13]:
# Split the dataset into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Acknowledge the dimension of both test and training data
X_train.shape, X_test.shape, y_train.shape, y_test.shape


((330, 8), (110, 8), (330, 1), (110, 1))

In [14]:
# Scale the training and testing input features using StandardScaler
X_scaler = StandardScaler()
X_scaler.fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [15]:
# Apply One-hot encoding to the target labels
enc = OneHotEncoder()
enc.fit(y_train)

encoded_y_train = enc.transform(y_train).toarray()
encoded_y_test = enc.transform(y_test).toarray()
encoded_y_train[0]

array([0., 0., 0., 0., 1.])

# Build a Deep Neural Network

In [16]:
# Create a sequential model
model = Sequential()

In [17]:
# Add the first layer where the input dimensions are the X.shape[1] = 8 columns of the training data
model.add(Dense(50, activation='relu', input_dim = X.shape[1]))

# Add the 2nd layer where the input dimensions are the 12 columns of the training data
model.add(Dense(50, activation='relu', input_dim = 50))

# Add output layer
model.add(Dense(5, activation="softmax", input_dim = 50))

In [18]:
# The output layer has 8 columns that are one-hot encoded
y_train.playlist.value_counts()

# 5 output since there're 5 playlists
number_outputs = 5

In [19]:
# Compile the model using categorical_crossentropy for the loss function, the adam optimizer,
# and add accuracy to the training metrics
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])


In [20]:
# Print the model summary
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 50)                450       
                                                                 
 dense_1 (Dense)             (None, 50)                2550      
                                                                 
 dense_2 (Dense)             (None, 5)                 255       
                                                                 
Total params: 3,255
Trainable params: 3,255
Non-trainable params: 0
_________________________________________________________________


In [21]:
# Use the training data to fit (train) the model
# @NOTE: Experiment with the number of training epochs to find the minimum iterations required to achieve a good accuracy
model.fit(
    X_train_scaled,
    encoded_y_train,
    epochs=10,
    shuffle=True,
    verbose=2
)

Epoch 1/10
11/11 - 0s - loss: 1.6154 - accuracy: 0.2606 - 464ms/epoch - 42ms/step
Epoch 2/10
11/11 - 0s - loss: 1.5141 - accuracy: 0.3606 - 9ms/epoch - 818us/step
Epoch 3/10
11/11 - 0s - loss: 1.4490 - accuracy: 0.4576 - 7ms/epoch - 636us/step
Epoch 4/10
11/11 - 0s - loss: 1.3915 - accuracy: 0.5152 - 7ms/epoch - 676us/step
Epoch 5/10
11/11 - 0s - loss: 1.3495 - accuracy: 0.5424 - 8ms/epoch - 727us/step
Epoch 6/10
11/11 - 0s - loss: 1.3084 - accuracy: 0.5394 - 8ms/epoch - 726us/step
Epoch 7/10
11/11 - 0s - loss: 1.2753 - accuracy: 0.5515 - 8ms/epoch - 729us/step
Epoch 8/10
11/11 - 0s - loss: 1.2481 - accuracy: 0.5455 - 8ms/epoch - 725us/step
Epoch 9/10
11/11 - 0s - loss: 1.2205 - accuracy: 0.5606 - 8ms/epoch - 699us/step
Epoch 10/10
11/11 - 0s - loss: 1.1996 - accuracy: 0.5697 - 8ms/epoch - 756us/step


<keras.callbacks.History at 0x1c68a683588>

# Evaluate the Model on Test & Training Data

In [22]:
# Evaluate the model using the testing data
model_loss, model_accuracy = model.evaluate(X_test_scaled, encoded_y_test, verbose=2)
print(f"Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

4/4 - 0s - loss: 1.2603 - accuracy: 0.5000 - 79ms/epoch - 20ms/step
Normal Neural Network - Loss: 1.2603180408477783, Accuracy: 0.5


In [23]:
y_index = y_test.index.values

y_tracks = []

#to identify which track has been selected for y_test
for index in y_index:
    track = df.at[index, "track"]
    y_tracks.append(track)

In [24]:
# Make predictions
predicted = model.predict(X_test_scaled)
predicted = enc.inverse_transform(predicted).flatten().tolist()

# creating a prediction table
results = pd.DataFrame({
    "Index": y_test.index.values,
    "Track": y_tracks,
    "Actual": y_test.playlist.values,
    "Predicted": predicted
})
results.head(10)

Unnamed: 0,Index,Track,Actual,Predicted
0,283,Hit and Run,terry,abdul
1,369,Ultralight Beam,abdul,abdul
2,207,I Miss You,terry,terry
3,300,Don't Stop 'Til You Get Enough,abdul,terry
4,429,Grenade - Acoustic,alex,sarah
5,422,Better,alex,sarah
6,47,All Rise,ryan,sarah
7,350,"Or Nah (feat. The Weeknd, Wiz Khalifa & DJ Mus...",abdul,abdul
8,232,Money For Nothing,terry,abdul
9,4,Given Up,ryan,terry


In [25]:
# Print the Classification Report
from sklearn.metrics import classification_report
print(classification_report(results.Actual, results.Predicted))

              precision    recall  f1-score   support

       abdul       0.61      0.65      0.62        31
        alex       0.00      0.00      0.00        16
        ryan       0.33      0.21      0.26        19
       sarah       0.38      0.75      0.51        20
       terry       0.62      0.67      0.64        24

    accuracy                           0.50       110
   macro avg       0.39      0.45      0.41       110
weighted avg       0.43      0.50      0.45       110



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [26]:
## Evaluate the Model on Training Data

In [27]:
# Identifying data shape
X_train.shape, y_train.shape

((330, 8), (330, 1))

In [28]:
#using 20% of training data as validation data
-X_train.shape[0]*0.2

-66.0

In [29]:
#ADD VALIDATION DATA
#https://www.tensorflow.org/guide/keras/train_and_evaluate
x_val = X_train_scaled[-58:]
y_val = encoded_y_train[-58:]
x_train = X_train_scaled[:-58]
y_train = encoded_y_train[:-58]

In [30]:
x_val.shape,y_val.shape,x_train.shape,y_train.shape

((58, 8), (58, 5), (272, 8), (272, 5))

In [31]:
#We call fit(), which will train the model by slicing the data into "batches" of size batch_size, 
#and repeatedly iterating over the entire dataset for a given number of epochs.

print("Fit model on training data")
history = model.fit(
    x_train,
    y_train,
    batch_size=64,
    epochs=2,
    # We pass some validation for
    # monitoring validation loss and metrics
    # at the end of each epoch
    validation_data=(x_val, y_val),
)

Fit model on training data
Epoch 1/2
Epoch 2/2


In [32]:
history.history

{'loss': [1.1433531045913696, 1.132018804550171],
 'accuracy': [0.5845588445663452, 0.591911792755127],
 'val_loss': [1.3667395114898682, 1.3716624975204468],
 'val_accuracy': [0.5, 0.5]}