In [108]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam

# To avoid KNN distances being printed as scientific notation for consistency
np.set_printoptions(suppress=True, precision=6) 
pd.set_option('display.max_columns', None)

Dataset was obtained from https://data.mendeley.com/datasets/3t9vbwxgr5/3

In [224]:
df = pd.read_csv('tcc_ceds_music.csv')
df.drop(columns = ['Unnamed: 0'], inplace = True)

df

Unnamed: 0,artist_name,track_name,release_date,genre,lyrics,len,dating,violence,world/life,night/time,shake the audience,family/gospel,romantic,communication,obscene,music,movement/places,light/visual perceptions,family/spiritual,like/girls,sadness,feelings,danceability,loudness,acousticness,instrumentalness,valence,energy,topic,age
0,mukesh,mohabbat bhi jhoothi,1950,pop,hold time feel break feel untrue convince spea...,95,0.000598,0.063746,0.000598,0.000598,0.000598,0.048857,0.017104,0.263751,0.000598,0.039288,0.000598,0.000598,0.000598,0.000598,0.380299,0.117175,0.357739,0.454119,0.997992,0.901822,0.339448,0.137110,sadness,1.000000
1,frankie laine,i believe,1950,pop,believe drop rain fall grow believe darkest ni...,51,0.035537,0.096777,0.443435,0.001284,0.001284,0.027007,0.001284,0.001284,0.001284,0.118034,0.001284,0.212681,0.051124,0.001284,0.001284,0.001284,0.331745,0.647540,0.954819,0.000002,0.325021,0.263240,world/life,1.000000
2,johnnie ray,cry,1950,pop,sweetheart send letter goodbye secret feel bet...,24,0.002770,0.002770,0.002770,0.002770,0.002770,0.002770,0.158564,0.250668,0.002770,0.323794,0.002770,0.002770,0.002770,0.002770,0.002770,0.225422,0.456298,0.585288,0.840361,0.000000,0.351814,0.139112,music,1.000000
3,pérez prado,patricia,1950,pop,kiss lips want stroll charm mambo chacha merin...,54,0.048249,0.001548,0.001548,0.001548,0.021500,0.001548,0.411536,0.001548,0.001548,0.001548,0.129250,0.001548,0.001548,0.081132,0.225889,0.001548,0.686992,0.744404,0.083935,0.199393,0.775350,0.743736,romantic,1.000000
4,giorgos papadopoulos,apopse eida oneiro,1950,pop,till darling till matter know till dream live ...,48,0.001350,0.001350,0.417772,0.001350,0.001350,0.001350,0.463430,0.001350,0.001350,0.001350,0.001350,0.001350,0.029755,0.001350,0.068800,0.001350,0.291671,0.646489,0.975904,0.000246,0.597073,0.394375,romantic,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28367,mack 10,10 million ways,2019,hip hop,cause fuck leave scar tick tock clock come kno...,78,0.001350,0.001350,0.001350,0.001350,0.001350,0.001350,0.001350,0.001350,0.391651,0.001350,0.435089,0.001350,0.001350,0.001350,0.065664,0.001350,0.889527,0.759711,0.062549,0.000000,0.751649,0.695686,obscene,0.014286
28368,m.o.p.,ante up (robbin hoodz theory),2019,hip hop,minks things chain ring braclets yap fame come...,67,0.001284,0.001284,0.035338,0.001284,0.001284,0.001284,0.066324,0.203889,0.318910,0.058152,0.134955,0.001284,0.001284,0.040811,0.001284,0.001284,0.662082,0.789580,0.004607,0.000002,0.922712,0.797791,obscene,0.014286
28369,nine,whutcha want?,2019,hip hop,get ban get ban stick crack relax plan attack ...,77,0.001504,0.154302,0.168988,0.001504,0.039755,0.001504,0.035401,0.001504,0.356685,0.001504,0.068684,0.001504,0.001504,0.001504,0.001504,0.001504,0.663165,0.726970,0.104417,0.000001,0.838211,0.767761,obscene,0.014286
28370,will smith,switch,2019,hip hop,check check yeah yeah hear thing call switch g...,67,0.001196,0.001196,0.001196,0.001196,0.048359,0.001196,0.001196,0.001196,0.492434,0.103614,0.001196,0.202659,0.001196,0.070867,0.001196,0.001196,0.883028,0.786888,0.007027,0.000503,0.508450,0.885882,obscene,0.014286


Most columns are numeric with different scales. Some columns like genre and topics are categorical though.

In [20]:
df.dtypes

artist_name                  object
track_name                   object
release_date                  int64
genre                        object
lyrics                       object
len                           int64
dating                      float64
violence                    float64
world/life                  float64
night/time                  float64
shake the audience          float64
family/gospel               float64
romantic                    float64
communication               float64
obscene                     float64
music                       float64
movement/places             float64
light/visual perceptions    float64
family/spiritual            float64
like/girls                  float64
sadness                     float64
feelings                    float64
danceability                float64
loudness                    float64
acousticness                float64
instrumentalness            float64
valence                     float64
energy                      

In [111]:
# Remove columns that won't be used
df_processed = df.drop(columns=['artist_name', 'track_name', 'release_date', 'lyrics', 'topic'])

In [112]:
df_processed.isna().sum()

genre                       0
len                         0
dating                      0
violence                    0
world/life                  0
night/time                  0
shake the audience          0
family/gospel               0
romantic                    0
communication               0
obscene                     0
music                       0
movement/places             0
light/visual perceptions    0
family/spiritual            0
like/girls                  0
sadness                     0
feelings                    0
danceability                0
loudness                    0
acousticness                0
instrumentalness            0
valence                     0
energy                      0
age                         0
dtype: int64

# Scaling
We will use min max scaling on numeric features to prevent certain features from dominating the distance metric for computing nearest neighbors in KNN.
For categorical/object features, we will use one-hot encoding to convert each categorical variable into a k binary variables where k is the number of unique categories in that variable.
https://rukshanpramoditha.medium.com/data-preprocessing-for-k-nearest-neighbors-knn-d447cd3b3aea

In [226]:
numeric_cols = ['len', 'dating', 'violence', 'world/life', 'night/time', 'shake the audience', 'family/gospel', 'romantic', 'communication', 'obscene', 'music', 'movement/places', 'light/visual perceptions', 'family/spiritual', 'like/girls', 'sadness', 'feelings', 'danceability', 'loudness', 'acousticness', 'instrumentalness', 'valence', 'energy', 'age']
categorical_cols = ['genre']

df_numeric = df_processed[numeric_cols]
df_non_numeric = df_processed[categorical_cols]

In [225]:
minMaxScaler = MinMaxScaler()
data_numeric_scaled = minMaxScaler.fit_transform(df_numeric)
df_numeric_scaled = pd.DataFrame(data_numeric_scaled, index = df_processed.index, columns = numeric_cols)
df_numeric_scaled.head(3)

Unnamed: 0,len,dating,violence,world/life,night/time,shake the audience,family/gospel,romantic,communication,obscene,music,movement/places,light/visual perceptions,family/spiritual,like/girls,sadness,feelings,danceability,loudness,acousticness,instrumentalness,valence,energy,age
0,0.474747,0.000475,0.064658,0.00032,0.000317,0.000631,0.089113,0.017884,0.408124,0.000311,0.040766,0.000492,0.00047,0.000508,0.000528,0.387319,0.121944,0.356571,0.454119,0.997992,0.904569,0.339448,0.13711,1.0
1,0.252525,0.054442,0.098311,0.460738,0.001022,0.00201,0.049023,0.001062,0.001538,0.001003,0.12308,0.001567,0.318198,0.082293,0.001682,0.001018,0.001038,0.330264,0.64754,0.954819,2e-06,0.325021,0.26324,1.0
2,0.116162,0.00383,0.002532,0.002578,0.002549,0.004999,0.004552,0.168293,0.387858,0.002501,0.338165,0.003898,0.003724,0.004023,0.004183,0.002533,0.234876,0.456319,0.585288,0.840361,0.0,0.351814,0.139112,1.0


In [228]:
df_non_numeric_encoded = pd.get_dummies(df_non_numeric, columns = categorical_cols)
df_non_numeric_encoded.head(3)

Unnamed: 0,genre_blues,genre_country,genre_hip hop,genre_jazz,genre_pop,genre_reggae,genre_rock
0,0,0,0,0,1,0,0
1,0,0,0,0,1,0,0
2,0,0,0,0,1,0,0


In [230]:
df_scaled = pd.concat([df_numeric_scaled, df_non_numeric_encoded], axis=1)

We will use the cosine distance metric to compute the nearest neighbors.

https://www.kdnuggets.com/2020/11/most-popular-distance-metrics-knn.html

In [216]:
knn = NearestNeighbors(n_neighbors=10, metric='cosine')
knn.fit(df_scaled.values)

In [209]:
def print_similar_songs(df, query_song_indices, distances, similar_indices):
    # Print the most similar songs to each song that was queried
    num_songs = len(query_song_indices)
    for i in range(num_songs):
        artist_name = df.loc[query_song_indices[i]].artist_name
        song_name = df.loc[query_song_indices[i]].track_name
        similar_artist_name = df.loc[similar_indices[i]].artist_name.values
        similar_song_name = df.loc[similar_indices[i]].track_name.values
        print(f'Song #{i+1}: "{song_name}" by "{artist_name}": ')
        for j in range(1, len(distances[0])):
            print(f'\t"{similar_song_name[j]}" by "{similar_artist_name[j]}" -- distance of {distances[i][j]}')

In [223]:
# Find the most similar songs for the 10 most recent Beatles songs
beatles_songs_indices = df[df.artist_name == 'the beatles'].tail(10).index

distances, indices = knn.kneighbors(df_scaled.loc[beatles_songs_indices].values)
print_similar_songs(df, beatles_songs_indices, distances, indices)

Song #1: "across the universe" by "the beatles": 
	"all you need is love" by "the beatles" -- distance of 0.04246260848934447
	"yellow submarine" by "the beatles" -- distance of 0.0445341796694948
	"this time tomorrow" by "the kinks" -- distance of 0.045965085969016495
	"whole wide world" by "wreckless eric" -- distance of 0.048422938741139454
	"the grand illusion" by "styx" -- distance of 0.05071767547624639
	"gotta get the first plane home" by "the kinks" -- distance of 0.051868513007982675
	"rotten peaches" by "elton john" -- distance of 0.052237937575092985
	"wild world" by "yusuf / cat stevens" -- distance of 0.053270399276399316
	"woman of the world" by "aerosmith" -- distance of 0.05376258029009373
Song #2: "maggie mae" by "the beatles": 
	"i'm one" by "the who" -- distance of 0.0316153157913307
	"away rio" by "the kingston trio" -- distance of 0.054318688667635895
	"driven to tears" by "the police" -- distance of 0.058319134853625165
	"doncha bother me" by "the rolling stones" 

We will now train an autoencoder from scratch to find embeddings for the training data, then we will use KNN on the embeddings to calculate the most similar songs.

In [231]:
num_features = df_scaled.shape[1]
num_features

31

In [175]:
num_features = df_scaled.shape[1]

input_layer = Input(shape=(num_features,))

# Encoder maps training data to a dimension larger than num_features = 31
# Use some L2 regulariztion and a dropout layer to reduce overfitting
encoded = Dense(64, activation='relu', kernel_regularizer=l2(0.01))(input_layer)
encoded = Dropout(0.2)(encoded)

# Decoder maps intermediate data back to the original dimension of 31
# I tried using L2 regularization here, but I got weird results likely because
# the weights are not similarly distributed across the features for the embeddings.
decoded = Dense(num_features, activation='sigmoid')(encoded)

# Build the autoencoder and encoder models
autoencoder = Model(input_layer, decoded)
encoder = Model(input_layer, encoded)

# Use the Adam optimizer as that is basically the standard.
# Loss is MSE to reduce the error in encoding the training data.
autoencoder.compile(optimizer=Adam(learning_rate=1e-4), loss='mse')

# Use early stopping to prevent overfitting.
early_stopping = EarlyStopping(
    monitor='val_loss', patience=5, restore_best_weights=True, verbose=1
)

# Train the autoencoder 
autoencoder.fit(
    df_scaled, 
    df_scaled, 
    epochs=100,
    batch_size=32, 
    shuffle=True, 
    validation_split=0.1,
    callbacks=[early_stopping]
)

embeddings = encoder.predict(df_scaled)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 11: early stopping


In [211]:
knn_embeddings = NearestNeighbors(n_neighbors=10, metric='cosine')
knn_embeddings.fit(embeddings)

In [214]:
# Find similar songs using the embeddings
distances, indices = knn_embeddings.kneighbors(embeddings[beatles_songs])
print_similar_songs(df, beatles_songs_indices, distances, indices)

Song #1: "across the universe" by "the beatles": 
	"all you need is love" by "the beatles" -- distance of 2.6702880859375e-05
	"wild world" by "yusuf / cat stevens" -- distance of 3.2782554626464844e-05
	"look what you've done" by "the rolling stones" -- distance of 4.2319297790527344e-05
	"maggie mae" by "the beatles" -- distance of 6.198883056640625e-05
	"magnet and steel" by "walter egan" -- distance of 6.35385513305664e-05
	"bloody well right" by "supertramp" -- distance of 6.35385513305664e-05
	"you make loving fun" by "fleetwood mac" -- distance of 6.914138793945312e-05
	"if i laugh" by "yusuf / cat stevens" -- distance of 7.045269012451172e-05
	"tommy can you hear me?" by "the who" -- distance of 7.259845733642578e-05
Song #2: "maggie mae" by "the beatles": 
	"stop, look, listen (to your heart)" by "diana ross" -- distance of 3.349781036376953e-05
	"wild world" by "yusuf / cat stevens" -- distance of 3.4868717193603516e-05
	"all you need is love" by "the beatles" -- distance of 