# K-Nearest Neighbors approach

In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors

### Load Data

In [2]:
spotify_data = pd.read_csv('spotify_audio_features.csv', header=0)

#debugging output, uncomment to view data
#spotify_data.head

### Removing Duplicates

In [3]:
#default duplicate removal
#748411 results
spotify_data_df = spotify_data.drop_duplicates()

#I ran into issues using just the default above because some of the names were different even though they were the same song 
# i.e. https://ibb.co/JKvZ2Qn so instead if they have the same few features, we'll drop based on those
#482288 results
spotify_data_df = spotify_data.drop_duplicates(subset=['rank', 'danceability','energy','key','loudness','liveness','valence'])

#reset the index
spotify_data_df = spotify_data_df.reset_index(drop=True)

#debugging output, uncomment to view
#spotify_data_df.head

### Extract Relevant Features

In [4]:
#messing around with this to try and give better results
relevant_features = ['rank', 'danceability','energy','key','loudness','liveness','valence']
relevant_features = ['rank', 'danceability','energy','loudness','liveness']

train_data = spotify_data_df[relevant_features]

#debugging output, uncomment to view
#train_data.head

## Create Model
### Nearest Neighbors

In [5]:
#create Nearest Neighbors model and fit data
neigh = NearestNeighbors(10)
neigh.fit(train_data)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=10, p=2,
                 radius=1.0)

### Pick a Song

In [6]:
#pick a random song
rand_song = spotify_data_df.sample(n=1)
rand_song_feat = rand_song[relevant_features]

In [7]:
neigh_ind = neigh.kneighbors(rand_song_feat, return_distance=False)

### Input Song

In [8]:
rand_song[['name', 'artist_name', 'tag_name', 'rank', 'danceability','energy','key','loudness','liveness','valence']]

Unnamed: 0,name,artist_name,tag_name,rank,danceability,energy,key,loudness,liveness,valence
370603,Psy Z Lasu Śpiewającego,Eldo,Hip-Hop,0.756,0.605,11,-8.514,1,0.651,90.031


### Closest Songs

In [9]:
closest_songs = spotify_data_df.iloc[ neigh_ind[0] , : ]

closest_songs[['name', 'artist_name', 'tag_name', 'rank', 'danceability','energy','key','loudness','liveness','valence']]

Unnamed: 0,name,artist_name,tag_name,rank,danceability,energy,key,loudness,liveness,valence
370603,Psy Z Lasu Śpiewającego,Eldo,Hip-Hop,0.756,0.605,11,-8.514,1,0.651,90.031
53349,Żyrandol,Taco Hemingway,seen live,0.755,0.584,11,-9.987,1,0.657,80.094
351594,I Had It Coming,White Rabbits,indie rock,0.751,0.591,11,-7.645,1,0.67,92.012
58794,Issook,De Jeugd van Tegenwoordig,seen live,0.744,0.607,11,-9.17,1,0.629,120.052
295562,Never Say Never,Edward Ka-Spel,experimental,0.75,0.579,11,-9.644,1,0.643,120.024
83845,Glass Houses - Acoustic,The Beach,alternative,0.768,0.579,11,-9.481,1,0.66,123.966
370866,Airplane,Curren$y,Hip-Hop,0.756,0.638,11,-8.738,1,0.644,79.054
468660,Walk Like An Egyptian (1986),The Bangles,80s,0.76,0.641,11,-13.904,1,0.629,104.001
37538,Render Me - Andrés Remix,Mr. Scruff,electronic,0.744,0.567,11,-11.926,1,0.674,117.483
97342,Durchreise,Fink,indie,0.717,0.602,11,-9.265,1,0.626,92.017


### To Do

Use the 'tag_name' feature in distance calculation (if one song is a singer-songwriter song, my personal opinion is that we should weight that in the model also, returning more )


# =========================================================
# =========================================================
# =========================================================
# =========================================================
# =========================================================
# =========================================================

# Combining Datasets

We need to come up with a way to link the User from the User/Artists Last FM dataset to the Songs/Features Spotify dataset. The method here finds the most popular songs of each artist and uses that to establish a connection between Users and Songs.

In [10]:
# sort by artist and playcount, so we'll have the top songs for each artist 
spotify_data_df.sort_values(by=['artist_name','playcount'], inplace=True, ascending=False)

In [11]:
#set the artist_name as index
spotify_data_df.set_index(keys=['artist_name'], drop=False,inplace=True)

#get all the artist_names as a list
artist_names = spotify_data_df['artist_name'].unique().tolist()

#debugging output
#spotify_data_df

## Get the 5 most played song for each artist

In [12]:
#this takes some time
artist_data = pd.concat([pd.DataFrame(spotify_data_df.loc[artist][:5], columns=spotify_data_df.columns) for artist in artist_names], ignore_index=True)

#debugging output
#artist_data

### Confirming that the 5 largest were found

In [13]:
spotify_data_df.loc['Coldplay'].nlargest(5,columns='playcount')

Unnamed: 0_level_0,name,tag_name,playcount,listeners,artist_name,rank,danceability,energy,key,loudness,...,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
artist_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Coldplay,Moving to Mars,rock,199515,24623,Coldplay,0.263,0.315,8,-10.797,1,...,0.196,142.657,audio_features,0kuv7BqWNDprDao3Tb5flN,spotify:track:0kuv7BqWNDprDao3Tb5flN,https://api.spotify.com/v1/tracks/0kuv7BqWNDpr...,https://api.spotify.com/v1/audio-analysis/0kuv...,258874,4,
Coldplay,The Scientist (live),rock,167650,23401,Coldplay,0.294,0.652,5,-7.314,1,...,0.167,75.335,audio_features,1aZhbOdRqshLS6uPjiO8Y4,spotify:track:1aZhbOdRqshLS6uPjiO8Y4,https://api.spotify.com/v1/tracks/1aZhbOdRqshL...,https://api.spotify.com/v1/audio-analysis/1aZh...,388600,4,
Coldplay,M.M.I.X,rock,165690,26280,Coldplay,0.175,0.0613,4,-29.286,1,...,0.157,126.842,audio_features,5Y7ztPw93VbAle55brM0jo,spotify:track:5Y7ztPw93VbAle55brM0jo,https://api.spotify.com/v1/tracks/5Y7ztPw93VbA...,https://api.spotify.com/v1/audio-analysis/5Y7z...,48466,3,
Coldplay,Prospekt's March / Poppyfields,rock,143545,24042,Coldplay,0.227,0.279,2,-11.992,1,...,0.0954,154.465,audio_features,4yr1059N96a8msTHVftmFN,spotify:track:4yr1059N96a8msTHVftmFN,https://api.spotify.com/v1/tracks/4yr1059N96a8...,https://api.spotify.com/v1/audio-analysis/4yr1...,219231,4,
Coldplay,Life Is for Living,rock,141846,24942,Coldplay,0.374,0.456,6,-7.352,0,...,0.0753,135.645,audio_features,1RNtm45kw0hPMBz7gKiIYu,spotify:track:1RNtm45kw0hPMBz7gKiIYu,https://api.spotify.com/v1/tracks/1RNtm45kw0hP...,https://api.spotify.com/v1/audio-analysis/1RNt...,436440,4,


In [14]:
artist_data.loc[artist_data['artist_name'] == 'Coldplay']

Unnamed: 0,name,tag_name,playcount,listeners,artist_name,rank,danceability,energy,key,loudness,...,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
24794,Moving to Mars,rock,199515,24623,Coldplay,0.263,0.315,8,-10.797,1,...,0.196,142.657,audio_features,0kuv7BqWNDprDao3Tb5flN,spotify:track:0kuv7BqWNDprDao3Tb5flN,https://api.spotify.com/v1/tracks/0kuv7BqWNDpr...,https://api.spotify.com/v1/audio-analysis/0kuv...,258874,4,
24795,The Scientist (live),rock,167650,23401,Coldplay,0.294,0.652,5,-7.314,1,...,0.167,75.335,audio_features,1aZhbOdRqshLS6uPjiO8Y4,spotify:track:1aZhbOdRqshLS6uPjiO8Y4,https://api.spotify.com/v1/tracks/1aZhbOdRqshL...,https://api.spotify.com/v1/audio-analysis/1aZh...,388600,4,
24796,M.M.I.X,rock,165690,26280,Coldplay,0.175,0.0613,4,-29.286,1,...,0.157,126.842,audio_features,5Y7ztPw93VbAle55brM0jo,spotify:track:5Y7ztPw93VbAle55brM0jo,https://api.spotify.com/v1/tracks/5Y7ztPw93VbA...,https://api.spotify.com/v1/audio-analysis/5Y7z...,48466,3,
24797,Prospekt's March / Poppyfields,rock,143545,24042,Coldplay,0.227,0.279,2,-11.992,1,...,0.0954,154.465,audio_features,4yr1059N96a8msTHVftmFN,spotify:track:4yr1059N96a8msTHVftmFN,https://api.spotify.com/v1/tracks/4yr1059N96a8...,https://api.spotify.com/v1/audio-analysis/4yr1...,219231,4,
24798,Life Is for Living,rock,141846,24942,Coldplay,0.374,0.456,6,-7.352,0,...,0.0753,135.645,audio_features,1RNtm45kw0hPMBz7gKiIYu,spotify:track:1RNtm45kw0hPMBz7gKiIYu,https://api.spotify.com/v1/tracks/1RNtm45kw0hP...,https://api.spotify.com/v1/audio-analysis/1RNt...,436440,4,


### Extract Features

In [15]:
#get relevant features of the data
artist_data_rel = artist_data [['name', 'artist_name', 'tag_name', 'playcount','rank', 'danceability','energy','key','loudness','liveness','valence']]

#set artist_name column to lowercase to match with the Last FM dataset
artist_data_rel['artist_name'] = artist_data_rel['artist_name'].str.lower()

print(artist_data_rel.shape)
#artist_data_rel.head()

#debugging output
#artist_data_rel.loc[artist_data['artist_name'] == 'Coldplay']

(30177, 11)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


We can see that in the Spotify dataset above we have the top 5 tracks each from about 6000 artists. (30000 tracks in total)

## Load in User/Artists from Last FM dataset

In [16]:
filename ="lastfm-dataset-360k\\usersha1-artmbid-artname-plays.tsv"
user_data = pd.read_csv(filename, sep='\t', header=0)

In [17]:
user_data.shape

(17535655, 4)

### Removing artists that aren't in the Spotify Dataset

We remove the artists that aren't found in the Spotify Dataset because we have no content to compare with for them.

In [18]:
#remove all artists that aren't in the spotify dataset
artist_names_lower = [artist.lower() for artist in artist_names]
user_data = user_data[user_data['artist-name'].isin(artist_names_lower)]
user_data = user_data.reset_index(drop=True)

user_data.shape

(6977817, 4)

In [19]:
user_data.head()

Unnamed: 0,user-mboxsha1,musicbrainz-artist-id,artist-name,plays
0,00000c289a1829a808ac09c00daf10bc3c4e223b,b3ae82c2-e60b-4551-a76d-6620f1b456aa,melissa etheridge,897
1,00000c289a1829a808ac09c00daf10bc3c4e223b,8bfac288-ccc5-448d-9573-c33ea2aa5c30,red hot chili peppers,691
2,00000c289a1829a808ac09c00daf10bc3c4e223b,c5db90c4-580d-4f33-b364-fbaa5a3a58b5,the murmurs,424
3,00000c289a1829a808ac09c00daf10bc3c4e223b,7b885d42-3c41-4f43-9944-a5855ec5155e,goldfrapp,361
4,00000c289a1829a808ac09c00daf10bc3c4e223b,7e870dd5-2667-454b-9fcf-a132dd8071f1,jack off jill,316


In [20]:
#dropping all artists who aren't in the user dataset
users_artist_names = user_data['artist-name'].unique().tolist()
print(len(users_artist_names))

print(artist_data_rel.shape)

artist_data_filtered = artist_data_rel[artist_data_rel['artist_name'].isin(users_artist_names)]

#removing the artists that aren't in the user database removes about 9000 tracks from our set
print(artist_data_filtered.shape)

#debugging output
#artist_data_filtered.loc[artist_data_filtered['artist_name'] == 'melissa etheridge']

4458
(30177, 11)
(21988, 11)


Here we can merge our datasets aroudnd the artist name and we'll have expanded our dataset to have 5 songs representing each artist that a user has listened to. Then we can find songs for an artist based on their past listening history.

In [21]:
#merge datasets
user_songs_df = pd.merge(user_data, artist_data_filtered, left_on="artist-name", right_on="artist_name")

#debugging output
#user_songs_df[0:10]

In [22]:
#creating a artist-tag dictionary because it's helpful to see how predictions compare
artist_tag_dict = dict(zip(user_songs_df.artist_name, user_songs_df.tag_name))

In [23]:
#Get the unique users
users = user_data['user-mboxsha1'].unique().tolist()


## Running KNN again on Users to Predict Songs with Merged Data

In [25]:
relevant_features = ['rank', 'danceability','liveness']

#create new train data
train_data_2 = artist_data_filtered[relevant_features]

In [24]:
#need to find which user has the most listened to songs, as we have to find that number + 10 nearest neighbors to avoid duplicates
items_counts = user_songs_df['user-mboxsha1'].value_counts()
max_item = items_counts.max()
print(max_item)

360


In [26]:
#create new model
neigh = NearestNeighbors(370)
neigh.fit(train_data_2)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=370, p=2,
                 radius=1.0)

In [27]:
def test_KNN(user=None):
    #choose a random user
    if user==None:
        l = len(users)
        user = users[np.random.randint(l)]
    print("user:",user)
    
    
    #get background on user
    user_background = user_data.loc[user_data['user-mboxsha1'] == user ]
    user_fav = user_background['artist-name'].unique().tolist()
    for fav in user_fav:
        print(fav, "-", artist_tag_dict[fav])
    
    #grab all song data for specific user
    test_user = user_songs_df.loc[user_songs_df['user-mboxsha1'] == user ]
    #get just the features used in training the KNN
    test_user_relevant = test_user[relevant_features]
    #convert to a numpy array to input in KNN
    user_song_feat = test_user_relevant.to_numpy()
    #get index_start to avoid distances of 0 (songs that are already in the database)
    index_start, _ = user_song_feat.shape
    #run KNN
    neigh_ind = neigh.kneighbors(user_song_feat, return_distance=False)

    #return closest songs
    closest_songs = artist_data_filtered.iloc[ neigh_ind[0][index_start:index_start+10] , : ]
    return closest_songs

In [28]:
#you can run this cell repeatedly to test KNN and view results for a specific user, or leave blank to pick a random user
test_KNN('95c0825905dfdd3db43b7f2051012360f0b14637')

user: 95c0825905dfdd3db43b7f2051012360f0b14637
ketil bjørnstad - jazz
max richter - ambient
liquid mind - ambient
swod - ambient
eluvium - ambient
goldmund - ambient
sylvain chauveau - ambient
the boats - ambient
jóhann jóhannsson - ambient
boards of canada - ambient
library tapes - ambient
stephan micus - ambient
dntel - electronic
biosphere - ambient


Unnamed: 0,name,artist_name,tag_name,playcount,rank,danceability,energy,key,loudness,liveness,valence
2006,Life as a Flower,vincent diamante,instrumental,17835,0.187,0.0918,2,-18.018,1,0.107,93.328
1137,La Valse d'Amélie,yann tiersen,instrumental,2406694,0.194,0.179,9,-16.836,0,0.179,177.667
19,primo,高木正勝,ambient,33940,0.225,0.0614,8,-22.935,1,0.121,77.384
1302,Prelude for Piano and Malaria,worrytrain,ambient,59053,0.218,0.0659,10,-23.204,1,0.114,68.574
26940,Melody of Love,billy vaughn,instrumental,17703,0.193,0.0845,5,-26.86,1,0.1,82.151
29630,:D/S/R:,ad hominem,black metal,3442,0.0995,0.112,5,-20.77,0,0.0857,81.578
19748,Yamagata,hakobune,ambient,7159,0.136,0.122,6,-31.151,1,0.0944,142.797
9719,Bad News From Home,randy newman,singer-songwriter,8497,0.315,0.057,5,-16.554,0,0.268,180.342
28185,Poverty And Its Opposite,arve henriksen,jazz,29259,0.283,0.14,8,-24.394,0,0.318,113.882
5282,Reprise,the communards,80s,9014,0.269,0.14,10,-14.959,0,0.214,74.15


# To Do

Normalize Valence, Energy, and Key

Evaluate Performance