# Training the Model for Spotify Similar Song Finder

This notebook contains code for the Exploratory Data Analysis and Model Training sections of the project.

Due to not being able to get past Spotify's API rate limits, I will be using an already processed CSV file created by Cameron Watts.

### Imports

In [2]:
from __future__ import print_function

import os
import json
import time
import sys
from dotenv import load_dotenv

import pandas as pd
import numpy as np  
import seaborn as sn
import gradio as gr

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors



import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

load_dotenv()
client_id = os.getenv("SPOTIPY_CLIENT_ID")
client_secret = os.getenv("SPOTIPY_CLIENT_SECRET")

sp = spotipy.Spotify(client_credentials_manager=SpotifyClientCredentials())

### Load data

In [3]:
df=pd.read_csv('../data/spotify_data.csv')

In [4]:
df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,pos,artist_name,track_uri,artist_uri,track_name,album_uri,duration_ms_x,album_name,...,type,id,uri,track_href,analysis_url,duration_ms_y,time_signature,artist_pop,genres,track_pop
0,0,0,0,Missy Elliott,0UaMYEvWZi0ZqiDOoHU3YI,spotify:artist:2wIVse2owClT7go1WT98tk,Lose Control (feat. Ciara & Fat Man Scoop),spotify:album:6vV5UrXcfyQD1wu4Qo2I9K,226863,The Cookbook,...,audio_features,0UaMYEvWZi0ZqiDOoHU3YI,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,https://api.spotify.com/v1/tracks/0UaMYEvWZi0Z...,https://api.spotify.com/v1/audio-analysis/0UaM...,226864,4,74,dance_pop hip_hop hip_pop pop pop_rap r&b rap ...,69
1,1,7734,73,Missy Elliott,0UaMYEvWZi0ZqiDOoHU3YI,spotify:artist:2wIVse2owClT7go1WT98tk,Lose Control (feat. Ciara & Fat Man Scoop),spotify:album:6vV5UrXcfyQD1wu4Qo2I9K,226863,The Cookbook,...,audio_features,0UaMYEvWZi0ZqiDOoHU3YI,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,https://api.spotify.com/v1/tracks/0UaMYEvWZi0Z...,https://api.spotify.com/v1/audio-analysis/0UaM...,226864,4,74,dance_pop hip_hop hip_pop pop pop_rap r&b rap ...,69
2,2,14037,14,Missy Elliott,0UaMYEvWZi0ZqiDOoHU3YI,spotify:artist:2wIVse2owClT7go1WT98tk,Lose Control (feat. Ciara & Fat Man Scoop),spotify:album:6vV5UrXcfyQD1wu4Qo2I9K,226863,The Cookbook,...,audio_features,0UaMYEvWZi0ZqiDOoHU3YI,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,https://api.spotify.com/v1/tracks/0UaMYEvWZi0Z...,https://api.spotify.com/v1/audio-analysis/0UaM...,226864,4,74,dance_pop hip_hop hip_pop pop pop_rap r&b rap ...,69
3,3,21536,42,Missy Elliott,0UaMYEvWZi0ZqiDOoHU3YI,spotify:artist:2wIVse2owClT7go1WT98tk,Lose Control (feat. Ciara & Fat Man Scoop),spotify:album:6vV5UrXcfyQD1wu4Qo2I9K,226863,The Cookbook,...,audio_features,0UaMYEvWZi0ZqiDOoHU3YI,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,https://api.spotify.com/v1/tracks/0UaMYEvWZi0Z...,https://api.spotify.com/v1/audio-analysis/0UaM...,226864,4,74,dance_pop hip_hop hip_pop pop pop_rap r&b rap ...,69
4,4,24404,1,Missy Elliott,0UaMYEvWZi0ZqiDOoHU3YI,spotify:artist:2wIVse2owClT7go1WT98tk,Lose Control (feat. Ciara & Fat Man Scoop),spotify:album:6vV5UrXcfyQD1wu4Qo2I9K,226863,The Cookbook,...,audio_features,0UaMYEvWZi0ZqiDOoHU3YI,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,https://api.spotify.com/v1/tracks/0UaMYEvWZi0Z...,https://api.spotify.com/v1/audio-analysis/0UaM...,226864,4,74,dance_pop hip_hop hip_pop pop pop_rap r&b rap ...,69
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67494,67494,67496,37,Jon D,3uCHI1gfOUL5j5swEh0TcH,spotify:artist:5HCypjplgh5uQezvBpOfXN,I Don't Know,spotify:album:2KEQtuVl1cYsTYtVRUrNVi,189183,Roots,...,audio_features,3uCHI1gfOUL5j5swEh0TcH,spotify:track:3uCHI1gfOUL5j5swEh0TcH,https://api.spotify.com/v1/tracks/3uCHI1gfOUL5...,https://api.spotify.com/v1/audio-analysis/3uCH...,189184,4,47,unknown,27
67495,67495,67499,40,Big Words,0P1oO2gREMYUCoOkzYAyFu,spotify:artist:0sHN89qak07mnug3LVVjzP,The Answer,spotify:album:5jrsRHRAmetu5e7RRBoxj7,263679,"Hollywood, a Beautiful Coincidence",...,audio_features,0P1oO2gREMYUCoOkzYAyFu,spotify:track:0P1oO2gREMYUCoOkzYAyFu,https://api.spotify.com/v1/tracks/0P1oO2gREMYU...,https://api.spotify.com/v1/audio-analysis/0P1o...,263680,4,39,australian_r&b,37
67496,67496,67500,41,Allan Rayman,2oM4BuruDnEvk59IvIXCwn,spotify:artist:6Yv6OBXD6ZQakEljaGaDAk,25.22,spotify:album:3CbNgBzI7r9o0F6VjH9sTY,189213,Roadhouse 01,...,audio_features,2oM4BuruDnEvk59IvIXCwn,spotify:track:2oM4BuruDnEvk59IvIXCwn,https://api.spotify.com/v1/tracks/2oM4BuruDnEv...,https://api.spotify.com/v1/audio-analysis/2oM4...,189213,4,55,canadian_contemporary_r&b modern_alternative_rock,49
67497,67497,67501,42,Jon Jason,4Ri5TTUgjM96tbQZd5Ua7V,spotify:artist:77bNdkKYBBmc30CisCA6tE,Good Feeling,spotify:album:2dZ7oVNQBeLlpoUYfbEsJP,194720,Good Feeling,...,audio_features,4Ri5TTUgjM96tbQZd5Ua7V,spotify:track:4Ri5TTUgjM96tbQZd5Ua7V,https://api.spotify.com/v1/tracks/4Ri5TTUgjM96...,https://api.spotify.com/v1/audio-analysis/4Ri5...,194720,4,4,unknown,16


In [5]:
df.dtypes

Unnamed: 0.1          int64
Unnamed: 0            int64
pos                   int64
artist_name          object
track_uri            object
artist_uri           object
track_name           object
album_uri            object
duration_ms_x         int64
album_name           object
name                 object
danceability        float64
energy              float64
key                   int64
loudness            float64
mode                  int64
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
type                 object
id                   object
uri                  object
track_href           object
analysis_url         object
duration_ms_y         int64
time_signature        int64
artist_pop            int64
genres               object
track_pop             int64
dtype: object

In [6]:
df = df.drop(columns=['Unnamed: 0', "Unnamed: 0.1", "pos", "artist_uri", "album_uri", "duration_ms_x", "album_name", "name", "type", "id", "track_href", "analysis_url", "duration_ms_y", "time_signature", "artist_pop", "track_pop"])

In [7]:
df.drop_duplicates(subset=['uri'], inplace=True)
df.reset_index(drop=True, inplace=True)
df

Unnamed: 0,artist_name,track_uri,track_name,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,uri,genres
0,Missy Elliott,0UaMYEvWZi0ZqiDOoHU3YI,Lose Control (feat. Ciara & Fat Man Scoop),0.904,0.813,4,-7.105,0,0.1210,0.03110,0.006970,0.0471,0.810,125.461,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,dance_pop hip_hop hip_pop pop pop_rap r&b rap ...
1,Britney Spears,6I9VzXrHxO9rA9A5euc8Ak,Toxic,0.774,0.838,5,-3.914,0,0.1140,0.02490,0.025000,0.2420,0.924,143.040,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,dance_pop pop post-teen_pop
2,Beyoncé,0WqIKmW4BTrj3eJFmnCKMv,Crazy In Love,0.664,0.758,2,-6.583,0,0.2100,0.00238,0.000000,0.0598,0.701,99.259,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,dance_pop pop r&b
3,Justin Timberlake,1AWQoqb9bSvzTjaLralEkT,Rock Your Body,0.892,0.714,4,-6.055,0,0.1410,0.20100,0.000234,0.0521,0.817,100.972,spotify:track:1AWQoqb9bSvzTjaLralEkT,dance_pop pop
4,Shaggy,1lzr43nnXAijIGYnCT8M8H,It Wasn't Me,0.853,0.606,0,-4.596,1,0.0713,0.05610,0.000000,0.3130,0.654,94.759,spotify:track:1lzr43nnXAijIGYnCT8M8H,pop_rap reggae_fusion
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34435,Jon D,3uCHI1gfOUL5j5swEh0TcH,I Don't Know,0.669,0.228,2,-12.119,1,0.0690,0.79200,0.065000,0.0944,0.402,83.024,spotify:track:3uCHI1gfOUL5j5swEh0TcH,unknown
34436,Big Words,0P1oO2gREMYUCoOkzYAyFu,The Answer,0.493,0.727,1,-5.031,1,0.2170,0.08730,0.000000,0.1290,0.289,73.259,spotify:track:0P1oO2gREMYUCoOkzYAyFu,australian_r&b
34437,Allan Rayman,2oM4BuruDnEvk59IvIXCwn,25.22,0.702,0.524,7,-10.710,1,0.0793,0.33200,0.055300,0.2980,0.265,140.089,spotify:track:2oM4BuruDnEvk59IvIXCwn,canadian_contemporary_r&b modern_alternative_rock
34438,Jon Jason,4Ri5TTUgjM96tbQZd5Ua7V,Good Feeling,0.509,0.286,8,-14.722,1,0.1230,0.40200,0.000012,0.1310,0.259,121.633,spotify:track:4Ri5TTUgjM96tbQZd5Ua7V,unknown


## EDA

In [8]:
df.isna().sum()

artist_name         0
track_uri           0
track_name          0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
uri                 0
genres              0
dtype: int64

In [9]:
df.isnull().sum()   

artist_name         0
track_uri           0
track_name          0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
uri                 0
genres              0
dtype: int64

In [10]:
df.dropna(inplace=True)
df.shape

(34440, 16)

In [11]:
df.dtypes

artist_name          object
track_uri            object
track_name           object
danceability        float64
energy              float64
key                   int64
loudness            float64
mode                  int64
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
uri                  object
genres               object
dtype: object

In [12]:
df.describe()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
count,34440.0,34440.0,34440.0,34440.0,34440.0,34440.0,34440.0,34440.0,34440.0,34440.0,34440.0
mean,0.58474,0.636573,5.21324,-7.608404,0.665128,0.091289,0.263789,0.07698,0.197472,0.486098,121.683623
std,0.164268,0.225408,3.583948,3.963361,0.471953,0.101199,0.30055,0.218456,0.167389,0.243577,29.294573
min,0.0,0.0,0.0,-60.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.476,0.49,2.0,-9.17,0.0,0.0351,0.0202,0.0,0.0958,0.292,98.5805
50%,0.592,0.669,5.0,-6.699,1.0,0.0489,0.126,8e-06,0.128,0.477,120.823
75%,0.705,0.818,8.0,-5.005,1.0,0.096,0.445,0.00268,0.252,0.677,140.03025
max,0.988,1.0,11.0,2.766,1.0,0.962,0.996,0.995,1.0,0.998,219.297


In [13]:
df.hist(figsize=(20,20))

array([[<Axes: title={'center': 'danceability'}>,
        <Axes: title={'center': 'energy'}>,
        <Axes: title={'center': 'key'}>],
       [<Axes: title={'center': 'loudness'}>,
        <Axes: title={'center': 'mode'}>,
        <Axes: title={'center': 'speechiness'}>],
       [<Axes: title={'center': 'acousticness'}>,
        <Axes: title={'center': 'instrumentalness'}>,
        <Axes: title={'center': 'liveness'}>],
       [<Axes: title={'center': 'valence'}>,
        <Axes: title={'center': 'tempo'}>, <Axes: >]], dtype=object)

For starters, I want to use all columns. Maybe later on PCA will be a good thing to do.

In [14]:
df_num = df.select_dtypes(include = ['float64', 'int64'])

In [15]:
corr_matrix = df_num.corr()
sn.set (rc = {'figure.figsize':(12, 4)})
sn.heatmap(corr_matrix, annot=True)

<Axes: >

### Encoding & Scaling

In [16]:
numeric_cols = ['acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence']
categorical_cols = ['key', 'mode']

# Create the preprocessing pipeline
preprocessing_pipeline = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])

# Apply the preprocessing pipeline to your DataFrame
df_processed = preprocessing_pipeline.fit_transform(df_num)

num_cols_transformed = numeric_cols
cat_cols_transformed = preprocessing_pipeline.named_transformers_['cat'].get_feature_names_out(categorical_cols)

# Combine the transformed column names
all_cols_transformed = num_cols_transformed + cat_cols_transformed.tolist()

# Convert the processed NumPy array back to a DataFrame
df_processed = pd.DataFrame(df_processed, columns=all_cols_transformed)


In [17]:
df_processed

Unnamed: 0,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,key_0,...,key_4,key_5,key_6,key_7,key_8,key_9,key_10,key_11,mode_0,mode_1
0,-0.774222,1.943562,0.782713,-0.320483,-0.898355,0.127016,0.293592,0.128946,1.329789,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,-0.794851,1.152160,0.893624,-0.237948,0.266018,0.932153,0.224420,0.729032,1.797820,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,-0.869782,0.482512,0.538707,-0.352390,-0.822482,0.258725,1.173063,-0.765498,0.882286,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,-0.208917,1.870509,0.343502,-0.351318,-0.868484,0.391947,0.491226,-0.707023,1.358528,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,-0.691040,1.633089,-0.135636,-0.352390,0.690186,0.760074,-0.197528,-0.919113,0.689326,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34435,1.757508,0.512951,-1.812620,-0.054842,-0.615775,-1.138090,-0.220256,-1.319705,-0.345268,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
34436,-0.587229,-0.558486,0.401176,-0.352390,-0.409067,0.650317,1.242235,-1.653048,-0.809193,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
34437,0.226958,0.713845,-0.499426,-0.099245,0.600573,-0.782579,-0.118474,0.628295,-0.907726,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
34438,0.459867,-0.461083,-1.555305,-0.352337,-0.397119,-1.794865,0.313356,-0.001728,-0.932359,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [18]:
# df_processed.hist(figsize=(20,20))

### Model Training

In [19]:

def transform_query(track_uri):
    audio_features = sp.audio_features(track_uri)[0]
    track_data = []
    track_dict = {
        'acousticness': audio_features['acousticness'],
        'danceability': audio_features['danceability'],
        'energy': audio_features['energy'],
        'instrumentalness': audio_features['instrumentalness'],
        'liveness': audio_features['liveness'],
        'loudness': audio_features['loudness'],
        'speechiness': audio_features['speechiness'],
        'tempo': audio_features['tempo'],
        'valence': audio_features['valence'],
        'key': audio_features['key'],
        'mode': audio_features['mode']
    }
    
    track_data.append(track_dict)
    query_data = pd.DataFrame(track_data)
    return query_data

In [20]:
knn_model = NearestNeighbors(n_neighbors=10, algorithm='auto', metric='euclidean')
knn_model.fit(df_processed) # I'm using all the data for KNN

# Function to find similar songs to the input URI
def find_similar_songs(track_uri):

    query_data = transform_query(track_uri)
    
    # Scale the query data using the same scaler
    query_data_scaled = preprocessing_pipeline.transform(query_data)
    query_data_scaled_df = pd.DataFrame(query_data_scaled, columns=all_cols_transformed)

    # Find the most similar songs using the KNN model
    distances, indices = knn_model.kneighbors(query_data_scaled_df, n_neighbors=10)

    # Retrieve the Artist Name, Song Name, and Track URI of the most similar songs
    similar_songs = []
    for index in indices[0]:
        artist_name = df.iloc[index]['artist_name']
        song_name = df.iloc[index]['track_name']
        similar_uri = df.iloc[index]['uri']
        
        track_id = similar_uri.split(":")[-1]
        full_url = f"https://open.spotify.com/track/{track_id}"

        similar_songs.append((artist_name, song_name, full_url))
        
    return similar_songs

In [21]:

# Replace 'YOUR_TRACK_URI' with the track URI you want to find similar songs to
similar_songs = find_similar_songs('https://open.spotify.com/track/6rDaCGqcQB1urhpCrrD599?si=2ac7add2ea054ab2')

# Print the results
for song in similar_songs:
    
    print(f"Artist: {song[0]}, Song: {song[1]}, Track URL: {song[2]}")


Artist: Mase, Song: What You Want (feat. Total), Track URL: https://open.spotify.com/track/6hxn98poTu1O4YZfafvC18
Artist: A$AP Rocky, Song: Goldie, Track URL: https://open.spotify.com/track/31G9RaSaDOI2NWcpnIp734
Artist: 2Pac, Song: Skandalouz, Track URL: https://open.spotify.com/track/1wCcCCocowxdDF0igFaFgh
Artist: Anderson .Paak, Song: The Waters, Track URL: https://open.spotify.com/track/23T4gelZgImtFxbHjXnYbm
Artist: Timothy Brindle, Song: Mercy and Grace (feat. Timothy Brindle), Track URL: https://open.spotify.com/track/53i9QgdZ2X0DdvqDOvt7r4
Artist: Jurassic 5, Song: Concrete Schoolyard, Track URL: https://open.spotify.com/track/2FL7ilGkBOrKVck7msHRAM
Artist: The Pharcyde, Song: Runnin', Track URL: https://open.spotify.com/track/0XgpiStoxq1IJncYlPrvZ5
Artist: Plies, Song: Bust It Baby, Pt. 2, Track URL: https://open.spotify.com/track/56du2aCIzpCG7sI5geIvCC
Artist: Zion I, Song: Silly Puddy, Track URL: https://open.spotify.com/track/1QpJi4MgfAyvJ1jcgFu8GE
Artist: Kevin Hart, Song:

### Deployment to Gradio & HuggingFace

In [24]:
def format_output(similar_songs):
    output = []
    for song in similar_songs:
        output.append({"Artist Name": song[0], "Song Name": song[1], "Spotify Track URL": song[2]})
    return pd.DataFrame(output)

# Create the Gradio interface
iface = gr.Interface(
    fn=find_similar_songs,  # Your find_similar_songs function
    inputs=gr.Textbox(label="Enter Spotify Track URL"),
    outputs=gr.Dataframe(headers=["Artist Name", "Song Name", "Spotify Track URL"]),
    live=True
)


iface.launch("share=True")


Running on local URL:  http://127.0.0.1:7863

To create a public link, set `share=True` in `launch()`.




Traceback (most recent call last):
  File "g:\My Drive\Collab\Python\PythonProjects\Similar Song Finder\venv\Lib\site-packages\urllib3\connectionpool.py", line 536, in _make_request
    response = conn.getresponse()
               ^^^^^^^^^^^^^^^^^^
  File "g:\My Drive\Collab\Python\PythonProjects\Similar Song Finder\venv\Lib\site-packages\urllib3\connection.py", line 454, in getresponse
    httplib_response = super().getresponse()
                       ^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Aykut\AppData\Local\Programs\Python\Python311\Lib\http\client.py", line 1374, in getresponse
    response.begin()
  File "C:\Users\Aykut\AppData\Local\Programs\Python\Python311\Lib\http\client.py", line 318, in begin
    version, status, reason = self._read_status()
                              ^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Aykut\AppData\Local\Programs\Python\Python311\Lib\http\client.py", line 279, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
              