In [7]:
import pandas as pd
import numpy as np
%matplotlib inline
from sklearn.metrics import mean_squared_error
%pylab inline

from sklearn.ensemble import RandomForestRegressor
from category_encoders import OrdinalEncoder, TargetEncoder, OneHotEncoder
from sklearn.model_selection import GroupShuffleSplit

from sklearn.preprocessing import StandardScaler

import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers

from lightgbm import LGBMRegressor

from catboost import CatBoostRegressor

Populating the interactive namespace from numpy and matplotlib


In [16]:
## Importing Data
df = pd.read_csv('./data/SpotifyFeatures.csv')
## Target
y = df.popularity

# 1.0 - Superficial Exploratory Data Analysis

In [22]:
## Looking the full data
df.head()

Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,Movie,Henri Salvador,C'est beau de faire un Show,0BRjO6ga9RKCKjfDqeFgWV,0,0.611,0.389,99373,0.91,0.0,C#,0.346,-1.828,Major,0.0525,166.969,4/4,0.814
1,Movie,Martin & les fées,Perdu d'avance (par Gad Elmaleh),0BjC1NfoEOOusryehmNudP,1,0.246,0.59,137373,0.737,0.0,F#,0.151,-5.559,Minor,0.0868,174.003,4/4,0.816
2,Movie,Joseph Williams,Don't Let Me Be Lonely Tonight,0CoSDzoNIKCRs124s9uTVy,3,0.952,0.663,170267,0.131,0.0,C,0.103,-13.879,Minor,0.0362,99.488,5/4,0.368
3,Movie,Henri Salvador,Dis-moi Monsieur Gordon Cooper,0Gc6TVm52BwZD07Ki6tIvf,0,0.703,0.24,152427,0.326,0.0,C#,0.0985,-12.178,Major,0.0395,171.758,4/4,0.227
4,Movie,Fabien Nataf,Ouverture,0IuslXpMROHdEPvSl1fTQK,4,0.95,0.331,82625,0.225,0.123,F,0.202,-21.15,Major,0.0456,140.576,4/4,0.39


In [23]:
## Checking NULL
df.isnull().sum()

genre               0
artist_name         0
track_name          0
track_id            0
popularity          0
acousticness        0
danceability        0
duration_ms         0
energy              0
instrumentalness    0
key                 0
liveness            0
loudness            0
mode                0
speechiness         0
tempo               0
time_signature      0
valence             0
dtype: int64

In [24]:
print("Popularity max value:",df.popularity.max())
print("Popularity min value;",df.popularity.min())

Popularity max value: 100
Popularity min value; 0


In [25]:
df.groupby('genre').mean().popularity.sort_values()

genre
Children's Music     4.252637
A Capella            9.302521
Movie               12.174097
Opera               13.335628
Comedy              21.342630
Anime               24.258729
Ska                 28.612351
Classical           29.282195
Soundtrack          33.954800
Blues               34.742879
World               35.524077
Reggae              35.589328
Reggaeton           37.742915
Electronic          38.056095
Jazz                40.824383
Country             46.100416
Soul                47.027836
Folk                49.940209
Alternative         50.213430
R&B                 52.308719
Children’s Music    54.659040
Indie               54.701561
Dance               57.275256
Hip-Hop             58.423131
Rock                59.619392
Rap                 60.533795
Pop                 66.590667
Name: popularity, dtype: float64

# 2.0 - Splitting, Validation, Baseline

In [81]:
grp_splitter = GroupShuffleSplit(n_splits=1,random_state=0)

rows_train, rows_test = next(grp_splitter.split(df,y,groups=df['track_id']))

Xtrain, Xtest = df.iloc[rows_train].drop('popularity',axis=1), df.iloc[rows_test].drop('popularity',axis=1)
ytrain, ytest = df.popularity.iloc[rows_train], df.popularity.iloc[rows_test]

### 2.1 - Baselines

In [83]:
## Baseline Genre
genre_base = df.iloc[rows_train].groupby('genre')['popularity'].mean()
p_base = df.iloc[rows_test].genre.map(genre_base)
mean_squared_error(ytest,p_base)

94.45642029514539

In [93]:
## Baseline Artist
artist_base = df.iloc[rows_train].groupby('artist_name').popularity.mean()
p_base = df.iloc[rows_test].genre.map(artist_base).fillna(ytrain.mean())
mean_squared_error(ytest,p_base)


334.28313875548116

# 3.0 - Embeddings

In [107]:
## Encoders

ordinal_encoder = OrdinalEncoder(cols=['genre','artist_name'], handle_unknown='return_nan')
Xtrain2 = ordinal_encoder.fit_transform(Xtrain,ytrain)
Xtest2 = ordinal_encoder.transform(Xtest)

Xtest2['genre'] = Xtest2['genre'].fillna(28)
Xtest2['artist_name'] = Xtest2['artist_name'].fillna(13587)

In [118]:
## Genre Embedding
genre_input = layers.Input((1,))
genre_embedding = layers.Embedding(29,5)(genre_input)
genre_embedding = layers.Flatten()(genre_embedding)

## Artist Embedding

artist_name_input = layers.Input((1,))
artist_embedding = layers.Embedding(13588,5)(artist_name_input)
artist_embedding = layers.Flatten()(artist_embedding)

concat = layers.concatenate([genre_embedding, artist_embedding])
hidden1 = layers.Dense(10, activation='relu')(concat)
drop1 = layers.Dropout(0.5)(hidden1)
out = layers.Dense(1)(hidden1)

In [119]:
mdl = keras.Model([genre_input, artist_name_input], out)
mdl.compile(loss='mse',optimizer='adam')

In [120]:
mdl.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_8 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_9 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding_6 (Embedding)         (None, 1, 5)         145         input_8[0][0]                    
__________________________________________________________________________________________________
embedding_7 (Embedding)         (None, 1, 5)         67940       input_9[0][0]                    
____________________________________________________________________________________________

In [122]:
train_X = [Xtrain2[['genre']].values, Xtrain2[['artist_name']].values]
test_X = [Xtest2[['genre']].values, Xtest2[['artist_name']].values]

In [132]:
mdl.fit(train_X,ytrain.values, validation_data=[test_X,ytest.values],batch_size=64, epochs=5)

Train on 186358 samples, validate on 46367 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fbeff60de80>

# 4.0 - Neural Netwowrk + Numeric Features

In [140]:
Xtrain2_num = Xtrain.select_dtypes(include=np.number)
Xtest2_num = Xtest.select_dtypes(include=np.number)

scaler = StandardScaler()
Xtrain2_num = scaler.fit_transform(Xtrain2_num)
Xtest2_num = scaler.transform(Xtest2_num)

In [143]:
## Genre Embedding
genre_input = layers.Input((1,))
genre_embedding = layers.Embedding(29,5)(genre_input)
genre_embedding = layers.Flatten()(genre_embedding)

## Artist Embedding
artist_name_input = layers.Input((1,))
artist_embedding = layers.Embedding(13588,5)(artist_name_input)
artist_embedding = layers.Flatten()(artist_embedding)

## Numeric Features
nums = layers.Input((Xtrain2_num.shape[1],))

concat = layers.concatenate([genre_embedding, artist_embedding, nums])
hidden1 = layers.Dense(10, activation='relu')(concat)
drop1 = layers.Dropout(0.5)(hidden1)
out = layers.Dense(1)(hidden1)

In [145]:
mdl = keras.Model([genre_input, artist_name_input, nums], out)
mdl.compile(loss='mse', optimizer='adam')

In [146]:
mdl.summary()

Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_22 (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_23 (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding_16 (Embedding)        (None, 1, 5)         145         input_22[0][0]                   
__________________________________________________________________________________________________
embedding_17 (Embedding)        (None, 1, 5)         67940       input_23[0][0]                   
____________________________________________________________________________________________

In [147]:
train_X = [Xtrain2[['genre']].values, Xtrain2[['artist_name']].values, Xtrain2_num]
test_X = [Xtest2[['genre']].values, Xtest2[['artist_name']].values, Xtest2_num]

In [148]:
mdl.fit(train_X, ytrain.values, validation_data=[test_X, ytest.values], batch_size=64, epochs=5)

Train on 186358 samples, validate on 46367 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fbefdbcdb38>