In [1]:
# Imports 

import pandas as pd
import numpy as np 
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors

In [2]:
# Location of the scaled track-feature dataset (per its filename) in the
# project's GitHub repository. Kept as a named constant so the source of the
# data is easy to spot and change.
DATA_URL = 'https://raw.githubusercontent.com/trackteam-spotify/data-science/master/data/final_scaled_dataset.csv'

# pandas is already available as `pd` from the imports cell, so it is not
# re-imported here.
scaled = pd.read_csv(DATA_URL)

# Preview the first rows to confirm the columns loaded as expected.
scaled.head()

Unnamed: 0,index,artist_name,track_name,track_id,popularity,danceability,energy,instrumentalness,liveness,loudness,tempo
0,1,Nicholas Britell,Eros,3w5s0j9clwhk0O2uScrNOo,-0.259352,-2.986856,-1.425094,2.658536,-0.859445,-2.108236,-1.200244
1,2,Seeb,Grip,3gicyfiEVMGONgzygpWjNT,3.219563,0.049805,1.276434,-0.398629,0.478628,1.25485,0.614084
2,3,Lagwagon,Reign,7jLDlShR2ARgNKyAOD94LD,-1.157137,-0.673351,1.466926,-0.397736,-0.172702,0.643049,-0.479831
3,4,Super Whatevr,Someone Somewhere Somehow,0XvFwux1NYJrKmCYJ7DOjE,-1.157137,-0.922306,-1.4554,-0.296192,-0.394028,-1.292787,0.206582
4,5,The Front Bottoms,Flashlight,0xR5ZhiksJK6lMRdI6S2A4,-0.371575,-0.951944,1.107588,-0.398629,-0.526824,0.942632,1.048378


In [3]:
# Report the dimensions of the loaded frame as (rows, columns).
scaled.shape

(5000, 11)

In [4]:
# Variables to cluster
cluster = ['popularity', 'danceability', 'energy', 'instrumentalness', 'liveness', 'loudness', 'tempo']

# Slice the feature columns once; the selection does not change between
# iterations, so there is no need to rebuild it for every k.
cluster_data = scaled.loc[:, cluster]

# Elbow-method sweep: fit KMeans for k = 1..30 and record the inertia
# (within-cluster sum of squared distances) for each k.
# NOTE(review): `distortion` is not consumed by any later cell visible here;
# presumably it was meant to be plotted against k.
distortion = list()
for k in range(1, 31):  # started with range(1, 51) and narrowed down to this range
    # random_state pins the centroid initialisation so the curve is
    # reproducible across runs. n_init is spelled out because newer
    # scikit-learn releases changed its default away from 10.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(cluster_data)
    distortion.append(kmeans.inertia_)  # append distortion value to list

## Modeling K-Nearest Neighbors

In [5]:
# Columns that span the similarity space used for recommendations.
features = [
    'popularity',
    'danceability',
    'energy',
    'instrumentalness',
    'liveness',
    'loudness',
    'tempo',
]

# Plain NumPy matrix of the feature columns, one row per track.
X = scaled.loc[:, features].values

# Ask for 15 neighbours per query. fit() returns the estimator itself, so the
# fitted model can be bound in the same statement.
knn = NearestNeighbors(n_neighbors=15).fit(X)

# Display the fitted estimator.
knn

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=15, p=2,
                 radius=1.0)

In [6]:
# Smoke-test the model on a single track: the row at position 20 of X.
test_song = X[20]

# kneighbors expects a 2-D array of queries, so present the single feature
# vector as a one-row matrix.
distance, neighbors = knn.kneighbors(test_song.reshape(1, -1))

# Show both the distances and the positional indices of the neighbours.
distance, neighbors

(array([[0.        , 0.63472403, 0.68087431, 0.68190637, 0.78568082,
         0.78977841, 0.79588194, 0.80620697, 0.8157036 , 0.81841708,
         0.82283869, 0.84523641, 0.84715768, 0.93537395, 0.93875525]]),
 array([[  20, 3806,  301, 2356, 3462, 1585, 1731, 1940, 2330,  695, 2353,
         1959,  849, 4776, 3544]]))

In [7]:
# Look up the row at position 20 (the same position used for test_song) to see
# which track was used as the query.
scaled.iloc[20]

index                                           21
artist_name                               Dua Lipa
track_name          New Rules - Initial Talk Remix
track_id                    2ygfHXyt3gvyhvKrNJU61n
popularity                                0.750656
danceability                             -0.341411
energy                                     1.59248
instrumentalness                         -0.398629
liveness                                 -0.235938
loudness                                   1.15463
tempo                                    -0.108978
Name: 20, dtype: object

## New playlist based on user input

In [9]:
# Build the recommended playlist from the neighbours of the query song.
# neighbors[0] holds positional row indices into `scaled`; the first entry is
# skipped because it is the query song itself (it comes back at distance 0).
neighbor_rows = scaled.iloc[neighbors[0][1:]]

# One positional selection replaces a per-row .iloc lookup inside a Python
# loop. The result is still a list of (track_name, artist_name) tuples in
# neighbour order.
song_list = list(zip(neighbor_rows['track_name'], neighbor_rows['artist_name']))

names = ['song', 'artist']

new_playlist = pd.DataFrame(song_list, columns=names)
print(new_playlist.shape)
new_playlist

(14, 2)


Unnamed: 0,song,artist
0,On the Line - Featuring Jonas Brothers,Jonas Brothers
1,Uncomfortable,Halestorm
2,Jaded,Aerosmith
3,Apocalypse Dreams,Tame Impala
4,Adrenalize,In This Moment
5,The Other Side,Jason Derulo
6,Honky Tonk Badonkadonk,Trace Adkins
7,My Dilemma 2.0,Selena Gomez & The Scene
8,Rhinestone World,Dallas Smith
9,Desire - Gryffin Remix,Years & Years


## Pickle the Model


In [10]:
# joblib is imported here rather than in the imports cell at the top; the
# load cell that follows relies on this import having run.
import joblib
# Persist the fitted NearestNeighbors model to 'model.pkl' (path is relative to
# the current working directory). dump() returns the list of files written.
joblib.dump(knn, 'model.pkl')

['model.pkl']

In [11]:
# Reload the model that was just written, to check the saved artifact works.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load artifacts from a trusted source.
model = joblib.load('model.pkl')

In [12]:
# Sanity-check the reloaded model on the track at position 11. Only the
# neighbour indices are requested, and the first hit is dropped, matching the
# playlist cell where it is the query track itself.
model.kneighbors(X[11].reshape(1, -1), return_distance=False)[0][1:]

array([  59, 3770,  360,  516,  102, 4828, 2162, 2257, 1526, 2219, 1453,
       4906, 4911, 1166])