## Spotify Recommendation System with ML

Program takes an input song, compares to song data, and recommends similar songs

In [156]:
# Imports
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from tqdm import tqdm
sns.set()

# Data upload
fdata = pd.read_csv("data/genres_v2.csv", index_col=False, header=0)
data = fdata.drop(columns=['Unnamed: 0', 'title'])
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42305 entries, 0 to 42304
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   danceability      42305 non-null  float64
 1   energy            42305 non-null  float64
 2   key               42305 non-null  int64  
 3   loudness          42305 non-null  float64
 4   mode              42305 non-null  int64  
 5   speechiness       42305 non-null  float64
 6   acousticness      42305 non-null  float64
 7   instrumentalness  42305 non-null  float64
 8   liveness          42305 non-null  float64
 9   valence           42305 non-null  float64
 10  tempo             42305 non-null  float64
 11  type              42305 non-null  object 
 12  id                42305 non-null  object 
 13  uri               42305 non-null  object 
 14  track_href        42305 non-null  object 
 15  analysis_url      42305 non-null  object 
 16  duration_ms       42305 non-null  int64 

  fdata = pd.read_csv("data/genres_v2.csv", index_col=False, header=0)


In [157]:
# Data cleaning
data.dropna(inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 21519 entries, 0 to 21524
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   danceability      21519 non-null  float64
 1   energy            21519 non-null  float64
 2   key               21519 non-null  int64  
 3   loudness          21519 non-null  float64
 4   mode              21519 non-null  int64  
 5   speechiness       21519 non-null  float64
 6   acousticness      21519 non-null  float64
 7   instrumentalness  21519 non-null  float64
 8   liveness          21519 non-null  float64
 9   valence           21519 non-null  float64
 10  tempo             21519 non-null  float64
 11  type              21519 non-null  object 
 12  id                21519 non-null  object 
 13  uri               21519 non-null  object 
 14  track_href        21519 non-null  object 
 15  analysis_url      21519 non-null  object 
 16  duration_ms       21519 non-null  int64  
 17

In [158]:
# Song analysis correlation
df = data.drop(columns=['id', 'uri', 'track_href', 'analysis_url', 'time_signature', 'genre', 'type', 'song_name'])
df.corr()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
danceability,1.0,-0.205509,-0.014223,-0.059944,0.034484,0.191643,-0.044994,-0.214672,-0.127341,0.309665,-0.080899,-0.166523
energy,-0.205509,1.0,0.028381,0.699971,0.019783,0.030433,-0.389546,-0.010044,0.224582,0.245327,0.05421,0.041396
key,-0.014223,0.028381,1.0,0.000908,-0.22886,-0.001358,0.032237,0.030422,0.011275,0.054906,0.005999,0.040043
loudness,-0.059944,0.699971,0.000908,1.0,0.043966,-0.012122,-0.304527,-0.202542,0.140036,0.213404,0.035041,0.035967
mode,0.034484,0.019783,-0.22886,0.043966,1.0,0.020113,-0.061771,-0.046684,0.034606,-0.014759,-0.001292,-0.086842
speechiness,0.191643,0.030433,-0.001358,-0.012122,0.020113,1.0,0.019342,-0.241072,0.10184,0.223742,0.111887,-0.093504
acousticness,-0.044994,-0.389546,0.032237,-0.304527,-0.061771,0.019342,1.0,-0.017739,-0.093321,-0.012468,-0.022751,0.011141
instrumentalness,-0.214672,-0.010044,0.030422,-0.202542,-0.046684,-0.241072,-0.017739,1.0,-0.029965,-0.280156,-0.054799,0.055773
liveness,-0.127341,0.224582,0.011275,0.140036,0.034606,0.10184,-0.093321,-0.029965,1.0,0.067636,0.017724,-0.006682
valence,0.309665,0.245327,0.054906,0.213404,-0.014759,0.223742,-0.012468,-0.280156,0.067636,1.0,0.103554,0.024992


In [159]:
# Normalize numerical data
datatypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
normarization = data.select_dtypes(include=datatypes)
for col in normarization.columns:
    MinMaxScaler(col)

print(normarization.head())


   danceability  energy  key  loudness  mode  speechiness  acousticness  \
0         0.831   0.814    2    -7.364     1       0.4200        0.0598   
1         0.719   0.493    8    -7.230     1       0.0794        0.4010   
2         0.850   0.893    5    -4.783     1       0.0623        0.0138   
3         0.476   0.781    0    -4.710     1       0.1030        0.0237   
4         0.798   0.624    2    -7.668     1       0.2930        0.2170   

   instrumentalness  liveness  valence    tempo  duration_ms  time_signature  
0          0.013400    0.0556   0.3890  156.985       124539               4  
1          0.000000    0.1180   0.1240  115.080       224427               4  
2          0.000004    0.3720   0.0391  218.050        98821               4  
3          0.000000    0.1140   0.1750  186.948       123661               3  
4          0.000000    0.1660   0.5910  147.988       123298               4  


In [160]:
# Cluster based on genre
kmeans = KMeans(n_clusters=10)
features = kmeans.fit_predict(normarization)
data['features'] = features
MinMaxScaler(data['features'])

  super()._check_params_vs_input(X, default_n_init=10)


In [162]:
# Recommendation found using analysis data
class Spotify_Recommendation():
    def __init__(self, dataset):
        self.dataset = dataset
    def recommend(self, songs, amount = 1):
        distance = []
        song = self.dataset[(self.dataset.song_name.str.lower() == songs.lower())].head(1).values[0]
        rec = self.dataset[self.dataset.song_name.str.lower() != songs.lower()]

        print(song)
        for songs in tqdm(rec.values):
            d = 0
            for col in np.arange(len(rec.columns)):
                if not col in [11, 12, 13, 14, 15, 17, 18, 19, 20, 21]:
                    d = d + np.absolute(float(song[col]) - float(songs[col]))
            distance.append(d)

        rec['distance'] = distance
        rec = rec.sort_values('distance')
        columns = ['song_name', 'genre', 'track_href']
        return rec[columns][:amount]

recommendations = Spotify_Recommendation(data)
recommendations.recommend("The Middle", 10)

[0.634 0.8859999999999999 2 -3.466 1 0.0517 0.0291 0.0 0.342 0.922 161.933
 'audio_features' '5kD9T7GForh8LnRz5ClbL8'
 'spotify:track:5kD9T7GForh8LnRz5ClbL8'
 'https://api.spotify.com/v1/tracks/5kD9T7GForh8LnRz5ClbL8'
 'https://api.spotify.com/v1/audio-analysis/5kD9T7GForh8LnRz5ClbL8' 168253
 4 'Emo' 'The Middle' 6]


  0%|          | 0/21515 [00:00<?, ?it/s]

100%|██████████| 21515/21515 [00:00<00:00, 45582.85it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rec['distance'] = distance


Unnamed: 0,song_name,genre,track_href
287,Analog Keys,Dark Trap,https://api.spotify.com/v1/tracks/7yMvF3mjdsFS...
8363,Chalk Em Out,Underground Rap,https://api.spotify.com/v1/tracks/2aMBeWzVIbk0...
9952,Stay Ur Distance,Underground Rap,https://api.spotify.com/v1/tracks/3GXDBeiF8ATG...
249,Day for Day,Dark Trap,https://api.spotify.com/v1/tracks/0rGipWrGwAnZ...
7185,Wishers Lose Copper Dreamers Lose Everything,Underground Rap,https://api.spotify.com/v1/tracks/0MvuMqc8FCPQ...
5196,Big Drip,Underground Rap,https://api.spotify.com/v1/tracks/4DkZDzcHicZB...
14893,Big Drip,Rap,https://api.spotify.com/v1/tracks/4DkZDzcHicZB...
30,Hold Uh,Dark Trap,https://api.spotify.com/v1/tracks/30Q5BsSJBLb9...
8586,Don't Come Out The House (with 21 Savage),Underground Rap,https://api.spotify.com/v1/tracks/2Grb4G6t9VIq...
17462,Don't Come Out The House (with 21 Savage),RnB,https://api.spotify.com/v1/tracks/2Grb4G6t9VIq...


# Notes:
- The data file used does not have an artist column; the track_href column links to the song, but I do not have an API key
- Further data cleaning is needed to remove duplicates