Spotify Project

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
import tensorflow as tf

# Read the raw track table from the CSV file located next to the notebook.
spotify = pd.read_csv(filepath_or_buffer='spotify.csv')
# Print (n_rows, n_columns), then preview the first five records.
print(spotify.shape)
spotify.head(n=5)

In [None]:
# Show the dtype pandas inferred for every column of the loaded table.
spotify.dtypes

We should check whether there are any duplicate tracks in our song list.

In [None]:
# keep=False flags every row whose track_id occurs more than once (all copies, not just the later ones).
is_repeated_id = spotify['track_id'].duplicated(keep=False)
duplicated = spotify.loc[is_repeated_id]
# Number of rows that share each repeated track_id.
duplicated.groupby('track_id')['track_id'].count()

Let us see whether these repeated rows differ in any attribute other than their track id.

In [None]:
# How many rows each track_id has, and the subset of ids that occur more than once.
track_counts = spotify['track_id'].value_counts()
repeated_tracks = track_counts.loc[track_counts.gt(1)]

# Show the rows of the first three repeated ids, separated by a ruler line.
separator = '_' * 150
for repeated_id in repeated_tracks.index[:3]:
  rows_for_id = spotify.loc[spotify['track_id'] == repeated_id]
  display(rows_for_id.head())
  print(separator)

It seems that only 'track_genre' differs between these repeated rows.
But let us verify this for the other columns across all repeated tracks.

In [None]:
# Every column except 'track_genre' (and the id itself): these are expected to be
# identical across all rows that share a track_id.
columns_to_check = ['artists', 'album_name', 'track_name', 'popularity', 'duration_ms', 'explicit', 'danceability', 'energy','key',
                    'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness','valence', 'tempo', 'time_signature']
mismatch_message = 'There is/are differences in the repeated rows other than "track_genre" column'

# Stop at the first repeated track whose copies disagree; temp_df then holds the
# offending rows so they can be inspected in the next cell.
for track in repeated_tracks.index:
  temp_df = spotify.loc[spotify.track_id==track,columns_to_check]
  # The copies agree only if they collapse to a single distinct row.
  # (Checking `duplicated(keep=False).all()` is not enough: two *different* pairs
  # of identical rows would pass that test even though the copies disagree.)
  if len(temp_df.drop_duplicates()) != 1:
    print(f"An assertion error occurred for {track=}: {mismatch_message}")
    break

At least one other column differs between the repeated rows. Let us look at the offending rows.

In [None]:
# Display the rows left in temp_df by the loop in the previous cell.
temp_df

There is a very small difference in the 'popularity' column, which we can accept, so we keep only the first occurrence of each track and exclude the other copies.

In [None]:
# Build the de-duplicated working table in one chain:
#   1. keep only the first row of every track_id,
#   2. drop rows that contain any missing value,
#   3. use track_id as the index.
# Chaining returns new objects at every step, which avoids calling
# dropna(inplace=True) on a filtered slice of `spotify` (SettingWithCopyWarning).
df = (
    spotify
    .drop_duplicates(subset='track_id', keep='first')
    .dropna()
    .set_index('track_id')
)
df

Most likely 'track_genre' has some effect on danceability, so we should use it as a feature.

In [None]:
# Mean danceability per genre, computed on the raw table (repeated tracks included).
grouped = spotify.groupby('track_genre')['danceability'].mean()
# Twenty genres with the highest mean danceability.
grouped.sort_values(ascending=False).head(20)

In [None]:
# Rank all genres once, from most to least danceable, and reuse the ranking for both axes.
genre_ranking = grouped.sort_values(ascending=False)

plt.figure(figsize=(20, 3), dpi=100)
sns.barplot(x=genre_ranking.index, y=genre_ranking.values)
plt.title('Most Danceable Genres')
plt.xlabel('Track genre')
plt.ylabel('Danceability')
# Genre names are long; rotate them so they do not overlap.
plt.xticks(rotation=90)
plt.show()

It seems we were right, so let's convert the genre labels to integer category codes.

In [None]:
# Encode each genre label as an integer category code, keeping df's index,
# and name the resulting Series 'track_genre'.
track_genre = (
    df['track_genre']
    .astype('category')
    .cat.codes
    .rename('track_genre')
)

In [None]:
# Numeric columns used for modelling; 'danceability' (the target) is deliberately last.
columns_to_select = [
    'popularity', 'duration_ms', 'energy', 'key', 'loudness', 'mode',
    'speechiness', 'acousticness', 'instrumentalness', 'liveness',
    'valence', 'tempo', 'time_signature', 'danceability',
]
# Put the encoded genre in front of the numeric columns (joined on the shared index).
numeric_part = df.loc[:, columns_to_select]
df_selected = pd.concat([track_genre, numeric_part], axis=1)

Now let us see if we have some missing data

In [None]:
# Count of missing values per column.
df_selected.isnull().sum(axis=0)

Create a heatmap of the pairwise correlations between the selected variables.

In [None]:
# Pairwise correlation between all selected variables.
corr_matrix = df_selected.corr()

# Hide the diagonal and the redundant upper triangle with an explicit boolean mask.
# (Passing np.triu(corr_matrix) itself relies on truthiness of the correlation
# values, so an upper-triangle cell whose correlation is exactly 0 would stay visible.)
upper_triangle = np.triu(np.ones_like(corr_matrix, dtype=bool))

plt.figure(figsize=(10, 10))
sns.heatmap(corr_matrix, mask=upper_triangle, annot=True, fmt='.2f');

Let us also make a plot of variables sorted according to 'abs of correlation' with 'danceability'

In [None]:
# Correlation of every feature with 'danceability' (the target's self-correlation is
# dropped by label, so this does not depend on the column order), plus its absolute
# value and a colour that encodes the sign: blue = positive, red = negative.
corr_data = corr_matrix['danceability'].drop('danceability').to_frame('Corr')
corr_data['Abs_Corr'] = corr_data['Corr'].abs()
corr_data['Color'] = np.where(corr_data['Corr'] >= 0, 'b', 'r')

corr_data_sorted = corr_data.sort_values(by='Abs_Corr')

# Bar colours are applied positionally, so they must come from the *sorted* frame;
# taking them from the unsorted one would attach sign colours to the wrong bars.
corr_data_sorted['Abs_Corr'].plot(
    kind='barh',
    color=corr_data_sorted['Color'].tolist(),
    figsize=(10, 5),
)
plt.xlabel('Absolute Value of Correlation (Blue: Positive corr., Red: Negative corr.)'); plt.grid()
plt.ylabel('Variable Name'); plt.title('Variables Sorted by "Absolute Value of Correlation" with Danceability');

'valence', 'loudness', 'time_signature', 'instrumentalness', and 'acousticness' are the features most strongly correlated with danceability (absolute correlation coefficient > 0.15).

Rescale all values to the [0, 1] range (min-max normalization)

In [None]:
# Rescale every column (features and target alike) to the [0, 1] range.
# NOTE(review): the scaler is fitted on the full table, i.e. before the train/test
# split further down, so the test rows influence the scaling parameters. For a
# stricter evaluation, fit the scaler on the training rows only.
scaler = MinMaxScaler()
df_selected_scaled = scaler.fit_transform(df_selected)

# fit_transform returns a bare NumPy array; restore the column names AND the original
# index so the scaled rows can still be traced back to their tracks.
df_selected = pd.DataFrame(
    df_selected_scaled,
    columns=df_selected.columns,
    index=df_selected.index,
)
df_selected

Create and train a fully connected neural network to predict 'danceability'

In [None]:
# Target: the 'danceability' column as a NumPy array.
y = df_selected['danceability'].to_numpy()
# Features: every column except the last one (the target sits in the last position).
X = df_selected.iloc[:, :-1]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Seed TensorFlow so that weight initialisation (and therefore the reported metrics)
# are repeatable across "Restart & Run All".
tf.random.set_seed(42)

# Number of input features is taken from the training matrix instead of being
# hard-coded, so the network keeps working if the feature list changes.
n_features = X_train.shape[1]

# Fully connected regressor: three ReLU hidden layers of decreasing width and a
# single linear output unit that predicts the (scaled) danceability.
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(n_features,)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1)
])

# Mean squared error is the training objective, optimised with Adam.
model.compile(optimizer='adam', loss='mse')

In [None]:
# Train silently for 10 epochs; the held-out split is only used to record the
# validation loss after each epoch.
model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_test, y_test),
    epochs=10,
    verbose=0,
)

Now let's evaluate the model and try to understand what is going on

In [None]:
# Predictions of the fitted network on both splits.
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# Pair each split's true targets with its predictions so both metrics are
# computed the same way for train and test.
split_pairs = ((y_train, y_pred_train), (y_test, y_pred_test))

# Mean squared error per split. The `_lr` suffix is kept because the
# comparison-table cell reads these exact names.
mse_train_lr, mse_test_lr = (
    mean_squared_error(actual, predicted) for actual, predicted in split_pairs
)

# R2 score (coefficient of determination) per split: 1 means perfect predictions,
# 0 means the model is no better than always predicting the mean of the target.
r2_train_lr, r2_test_lr = (
    r2_score(actual, predicted) for actual, predicted in split_pairs
)

print(f"Mean Squared Error for Train Data: {mse_train_lr:.4g}")
print(f"Mean Squared Error for Test Data:  {mse_test_lr:.4g}")
print(f"R2 Score for Train Data: {r2_train_lr:.4g}")
print(f"R2 Score for Test Data:  {r2_test_lr:.4g}")

An $R^2$ of around 0.6 is not too bad, but it is still not good enough. Let's look at some plots.

In [None]:
# Residual = actual - predicted. model.predict returns NumPy arrays of shape (n, 1),
# so drop the trailing axis with NumPy and keep the residuals as plain NumPy arrays
# (no round-trip through TensorFlow tensors and no mixed-dtype tensor arithmetic).
residuals_train = y_train - np.squeeze(y_pred_train, axis=1)
residuals_test = y_test - np.squeeze(y_pred_test, axis=1)

In [None]:
# One histogram of residuals per split, side by side.
_, axs = plt.subplots(1, 2, figsize=(10, 5))
histogram_panels = (
    (axs[0], residuals_train, 'Histogram of Training Residuals'),
    (axs[1], residuals_test, 'Histogram of Test Residuals'),
)
for ax, residuals, title in histogram_panels:
    ax.hist(residuals, bins=20, edgecolor='black')
    ax.set_title(title)
    ax.set_xlabel('Residual')
    ax.set_ylabel('Frequency')

As we can see, the residuals appear to be approximately normally distributed, which suggests that our fully connected neural network is an acceptable choice to explain the variability in the output/target variable.
Now let us see a scatter plot of the predicted values against the actual values.

In [None]:
# Predicted vs. actual danceability for each split; points on the diagonal are
# perfect predictions. The right-hand panel shows the TEST split and is titled
# accordingly (it was previously mislabelled as training data).
_, axs = plt.subplots(1, 2, figsize=(10, 5))
scatter_panels = (
    (axs[0], y_train, y_pred_train, 'Scatter Plot for Training Data'),
    (axs[1], y_test, y_pred_test, 'Scatter Plot for Test Data'),
)
for ax, actual, predicted, title in scatter_panels:
    # Low alpha keeps dense regions readable despite the large number of points.
    ax.scatter(actual, predicted, c='blue', marker='.', alpha=.1)
    ax.set_title(title)
    ax.set_xlabel('Actual')
    ax.set_ylabel('Predicted')

Let's try another technique to predict 'danceability'

In [None]:
# Random forest of 100 trees with a fixed seed, fitted on the training split
# (fit returns the estimator itself, so construction and training can be chained).
rf_regressor = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

In [None]:
# Random-forest predictions get their own names so they do not overwrite the neural
# network's y_pred_train / y_pred_test: the residual and scatter cells above read
# those names, and re-running them after this cell would otherwise silently plot
# (or fail on) random-forest output.
y_pred_train_rf = rf_regressor.predict(X_train)
y_pred_test_rf = rf_regressor.predict(X_test)

# Mean squared error of the random forest on each split.
mse_train_rf = mean_squared_error(y_train, y_pred_train_rf)
mse_test_rf = mean_squared_error(y_test, y_pred_test_rf)
print(f"Mean Squared Error on Train Data: {mse_train_rf:.4g}")
print(f"Mean Squared Error on Test Data:  {mse_test_rf:.4g}")

# R2 score (coefficient of determination) of the random forest on each split.
r2_train_rf = r2_score(y_train, y_pred_train_rf)
r2_test_rf = r2_score(y_test, y_pred_test_rf)
print(f"R2 Score for Train Data: {r2_train_rf:.4g}")
print(f"R2 Score for Test Data:  {r2_test_rf:.4g}")

It seems that Random Forest performs better, but we should summarize how much better!
It is important to notice that the model performs much worse on the test data than on the train data. This gap comes from the nature of decision trees, which tend to overfit the training data, but the test performance is still better than that of the neural network.

In [None]:
# Side-by-side train/test metrics of both models. The first row holds the metrics of
# the fully connected neural network (stored in the *_lr variables), so it is
# labelled 'Neural Network' rather than 'Linear Regressor'.
performance_comparison_table = pd.DataFrame(
    {
        'MSE (Train)': [mse_train_lr, mse_train_rf],
        'MSE (Test)': [mse_test_lr, mse_test_rf],
        'R2 (Train)': [r2_train_lr, r2_train_rf],
        'R2 (Test)': [r2_test_lr, r2_test_rf],
    },
    index=['Neural Network', 'RandomForest Regressor'],
)
performance_comparison_table

To close this study, we should see how well our model performs on the full dataset (training and test rows together).

In [None]:
# Random-forest predictions for every row of X (training and test rows alike).
y_pred_all = rf_regressor.predict(X)

_, axs = plt.subplots(2, 1, figsize=(10, 10))
scatter_ax, line_ax = axs[0], axs[1]

# Top panel: predicted vs. actual, drawn with equal axis scaling.
scatter_ax.scatter(y, y_pred_all, c='blue', marker='.', alpha=.1)
scatter_ax.set_aspect('equal')
scatter_ax.set_title('Scatter Plot of Actual and Predicted Danceability')
scatter_ax.set_xlabel('Actual')
scatter_ax.set_ylabel('Predicted')

# Bottom panel: actual and predicted values drawn as two lines over the row position.
actual_and_predicted = np.column_stack([y, y_pred_all])
line_ax.plot(actual_and_predicted, alpha=.5)
line_ax.legend(['Actual', 'Predicted'])
line_ax.set_title('Line plots of Actual and Predicted Danceability')
line_ax.set_xlabel('Index')
line_ax.set_ylabel('Value');