In [40]:
# general
import pandas as pd
import numpy as np
import sys
import random

# spotipy dependencies
import spotipy
import spotipy.oauth2
from spotipy.oauth2 import SpotifyOAuth
import spotipy.util as util

# ML dependencies
import statsmodels.api as sm
import joblib
from joblib import load
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import IsolationForest

In [22]:
# lets us see all queried data
pd.set_option('display.max_rows', None)

# globals
client_id = '84bb0f8b3cfa4cc69a8c5175ae89c6b5'
client_secret = 'a01aa1e389bd45f9a83ceb76b89f5aee'
redirect_uri = 'http://localhost:8888/callback'
scope = 'user-top-read user-library-read playlist-modify-private playlist-modify-public'

# oauth2 login
sp_oauth = SpotifyOAuth(client_id = client_id,
                        client_secret = client_secret,
                        redirect_uri = redirect_uri,
                        scope = scope)
token_info = sp_oauth.get_cached_token() 
token = token_info['access_token']
sp = spotipy.Spotify(auth = token)

# new login
if not token_info:
    auth_url = sp_oauth.get_authorize_url()
    print(auth_url)
    response = input('Paste the redirect url here: ')
    code = sp_oauth.parse_response_code(response)
    token_info = sp_oauth.get_access_token(code)
    token = token_info['access_token']
sp = spotipy.Spotify(auth=token)

# your user id
user_id = sp.current_user()['display_name']

# function to refresh access token
def refresh():
    global token_info, sp
    if sp_oauth.is_token_expired(token_info):
        token_info = sp_oauth.refresh_access_token(token_info['refresh_token'])
        token = token_info['access_token']
        sp = spotipy.Spotify(auth=token)

# function to easily rank data        
def rank(data):
    refresh()
    ranked = data.sort_values(by = ['scores'], ascending = False)
    return ranked[['scores', 'name', 'artist', 'album', 'track_id']]

In [6]:
# id for the playlist you'd like to work with
playlist_id = '6vqdAph2gxHHgvojxM7js4'

In [8]:
# returns metadata for each song in the playlist
results = sp.user_playlist_tracks(user_id, playlist_id)
tracks = results['items']
while results['next']:
    results = sp.next(results)
    tracks.extend(results['items'])
tracks[:5]

[{'added_at': '2020-11-22T19:27:19Z',
  'added_by': {'external_urls': {'spotify': 'https://open.spotify.com/user/cloud_none'},
   'href': 'https://api.spotify.com/v1/users/cloud_none',
   'id': 'cloud_none',
   'type': 'user',
   'uri': 'spotify:user:cloud_none'},
  'is_local': False,
  'primary_color': None,
  'track': {'album': {'album_type': 'album',
    'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/5EHvXKnNz78jkAVgTQLQ5O'},
      'href': 'https://api.spotify.com/v1/artists/5EHvXKnNz78jkAVgTQLQ5O',
      'id': '5EHvXKnNz78jkAVgTQLQ5O',
      'name': 'Dark Tranquillity',
      'type': 'artist',
      'uri': 'spotify:artist:5EHvXKnNz78jkAVgTQLQ5O'}],
    'available_markets': ['AD',
     'AE',
     'AG',
     'AL',
     'AM',
     'AO',
     'AR',
     'AT',
     'AU',
     'AZ',
     'BA',
     'BB',
     'BD',
     'BE',
     'BF',
     'BG',
     'BH',
     'BI',
     'BJ',
     'BN',
     'BO',
     'BR',
     'BS',
     'BT',
     'BW',
     'BY',
    

In [10]:
# returns ids for each track in the playlist 
track_list = []
for i in range(len(tracks)):
    ids = tracks[i]['track']['id']
    track_list.append(ids)
track_list[:5]

['7LkxDAm3CxOMSFDZcRx0kI',
 '23pLHGVFWELvxIiw0CQMaX',
 '2dzCUvvliKLw9dj84IRlAg',
 '4ueVppRcLrdLdL7sQTHhWb',
 '4Lgj2JWNhM5DTWAMhz34AS']

In [12]:
# returns audio featurs for the first track
sp.audio_features(track_list[0])

[{'danceability': 0.411,
  'energy': 0.965,
  'key': 7,
  'loudness': -4.503,
  'mode': 1,
  'speechiness': 0.0565,
  'acousticness': 2.51e-05,
  'instrumentalness': 0.216,
  'liveness': 0.063,
  'valence': 0.422,
  'tempo': 159.993,
  'type': 'audio_features',
  'id': '7LkxDAm3CxOMSFDZcRx0kI',
  'uri': 'spotify:track:7LkxDAm3CxOMSFDZcRx0kI',
  'track_href': 'https://api.spotify.com/v1/tracks/7LkxDAm3CxOMSFDZcRx0kI',
  'analysis_url': 'https://api.spotify.com/v1/audio-analysis/7LkxDAm3CxOMSFDZcRx0kI',
  'duration_ms': 277827,
  'time_signature': 4}]

In [16]:
# returns our all the descriptive data for tracks in our playlist

refresh()

def getTrackFeatures(track_id):
    t_id = track_id
    meta = sp.track(track_id)
    features = sp.audio_features(track_id)

    # meta
    name = meta['name']
    album = meta['album']['name']
    artist = meta['album']['artists'][0]['name']
    artist_id = meta['album']['artists'][0]['id']
    release_date = meta['album']['release_date']
    length = meta['duration_ms']
    popularity = meta['popularity']

    # features
    key = features[0]['key']
    mode = features[0]['mode']
    acousticness = features[0]['acousticness']
    danceability = features[0]['danceability']
    energy = features[0]['energy']
    valence = features[0]['valence']
    instrumentalness = features[0]['instrumentalness']
    liveness = features[0]['liveness']
    loudness = features[0]['loudness']
    speechiness = features[0]['speechiness']
    tempo = features[0]['tempo']
    time_signature = features[0]['time_signature']

    track = [t_id,
             artist_id,
             name,
             album,
             artist,
             release_date,
             length,
             popularity,
             key,
             mode,
             acousticness,
             danceability,
             energy,
             valence,
             instrumentalness,
             liveness,
             loudness,
             speechiness,
             tempo,
             time_signature]
    return track

# parse features
track_features = []
for i in range(len(track_list)):
    track = getTrackFeatures(track_list[i])
    track_features.append(track)

# build dataframe
df = pd.DataFrame(track_features,
                  columns = ['track_id',
                             'artist_id',
                             'name',
                             'album',
                             'artist',
                             'release_date',
                             'length',
                             'popularity',
                             'key',
                             'mode',
                             'acousticness',
                             'danceability',
                             'energy',
                             'valence',
                             'instrumentalness',
                             'liveness',
                             'loudness',
                             'speechiness',
                             'tempo',
                             'time_signature'])

# squish popularity to be in the same scale as the other features
# squish length to be measured in minutes
df['popularity'] = df['popularity'] / 100
df['length'] = (df['length'] / 1000) / 60

df.head()

Unnamed: 0,track_id,artist_id,name,album,artist,release_date,length,popularity,key,mode,acousticness,danceability,energy,valence,instrumentalness,liveness,loudness,speechiness,tempo,time_signature
0,7LkxDAm3CxOMSFDZcRx0kI,5EHvXKnNz78jkAVgTQLQ5O,The Lesser Faith,Fiction,Dark Tranquillity,2007-09-26,4.630433,0.35,7,1,2.5e-05,0.411,0.965,0.422,0.216,0.063,-4.503,0.0565,159.993,4
1,23pLHGVFWELvxIiw0CQMaX,1xUhNgw4eJDZfvumIpcz1B,Hatebreeder,Hatebreeder,Children Of Bodom,1999-05-04,4.31955,0.44,7,0,0.000275,0.417,0.976,0.638,0.837,0.387,-3.196,0.0507,111.709,4
2,2dzCUvvliKLw9dj84IRlAg,1xUhNgw4eJDZfvumIpcz1B,Towards Dead End,Hatebreeder,Children Of Bodom,1999-05-04,4.9011,0.39,2,0,0.000156,0.188,0.949,0.538,0.839,0.32,-2.49,0.0599,108.075,4
3,4ueVppRcLrdLdL7sQTHhWb,4EvbQBS99RXzFGGimAS3i9,Eternal,Epsilon,Blood Stain Child,2011-06-30,4.498,0.27,1,1,2.1e-05,0.508,0.977,0.305,0.0145,0.162,-4.582,0.0709,104.993,4
4,4Lgj2JWNhM5DTWAMhz34AS,2UOVgpgiNTC6KK0vSC77aD,Sampo,Skyforger,Amorphis,2009-05-29,6.147333,0.16,7,1,6e-05,0.159,0.763,0.269,0.0257,0.812,-4.462,0.039,175.869,3


In [17]:
df.describe()

Unnamed: 0,length,popularity,key,mode,acousticness,danceability,energy,valence,instrumentalness,liveness,loudness,speechiness,tempo,time_signature
count,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0
mean,5.201763,0.289067,5.786667,0.386667,0.000695,0.356847,0.933493,0.252531,0.285281,0.239961,-4.37028,0.090411,126.177573,3.8
std,1.480261,0.137184,3.739104,0.490266,0.003196,0.117077,0.073427,0.146089,0.323735,0.199103,1.456061,0.056537,29.506045,0.402694
min,3.422667,0.0,0.0,0.0,1e-06,0.0975,0.685,0.0345,0.0,0.045,-7.971,0.0306,87.564,3.0
25%,4.345442,0.18,2.0,0.0,8e-06,0.256,0.927,0.138,0.01186,0.11,-5.1755,0.05715,104.966,4.0
50%,4.8199,0.3,7.0,0.0,5.5e-05,0.39,0.963,0.222,0.13,0.16,-4.339,0.074,119.425,4.0
75%,5.4354,0.39,9.0,1.0,0.000211,0.4475,0.979,0.3205,0.561,0.308,-3.2375,0.1115,144.7575,4.0
max,11.657767,0.63,11.0,1.0,0.0271,0.557,0.995,0.638,0.947,0.941,-1.409,0.406,199.915,4.0


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75 entries, 0 to 74
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   track_id          75 non-null     object 
 1   artist_id         75 non-null     object 
 2   name              75 non-null     object 
 3   album             75 non-null     object 
 4   artist            75 non-null     object 
 5   release_date      75 non-null     object 
 6   length            75 non-null     float64
 7   popularity        75 non-null     float64
 8   key               75 non-null     int64  
 9   mode              75 non-null     int64  
 10  acousticness      75 non-null     float64
 11  danceability      75 non-null     float64
 12  energy            75 non-null     float64
 13  valence           75 non-null     float64
 14  instrumentalness  75 non-null     float64
 15  liveness          75 non-null     float64
 16  loudness          75 non-null     float64
 17 

In [18]:
# drop object variables
preprocess = df.drop(df.select_dtypes('object'), axis = 1)

# convert int64 to categoryical variables
int64 = preprocess.select_dtypes('int64')
for x in int64:
    preprocess[x] = preprocess[x].astype('category')
preprocess.head()

Unnamed: 0,length,popularity,key,mode,acousticness,danceability,energy,valence,instrumentalness,liveness,loudness,speechiness,tempo,time_signature
0,4.630433,0.35,7,1,2.5e-05,0.411,0.965,0.422,0.216,0.063,-4.503,0.0565,159.993,4
1,4.31955,0.44,7,0,0.000275,0.417,0.976,0.638,0.837,0.387,-3.196,0.0507,111.709,4
2,4.9011,0.39,2,0,0.000156,0.188,0.949,0.538,0.839,0.32,-2.49,0.0599,108.075,4
3,4.498,0.27,1,1,2.1e-05,0.508,0.977,0.305,0.0145,0.162,-4.582,0.0709,104.993,4
4,6.147333,0.16,7,1,6e-05,0.159,0.763,0.269,0.0257,0.812,-4.462,0.039,175.869,3


In [26]:
# returns homogenity scores added to the original dataframe

# prevents errors if rerunning analysis
if 'scores' in df.columns:
    df = df.drop(['scores'], axis = 1)

# Isolation Forest analysis    
model = IsolationForest(n_estimators = 500,
                        max_samples = 'auto',
                        contamination = 0.05, # assumes bottom 5% as outliers
                        n_jobs = -1,
                        random_state = 21)
model.fit(preprocess)
df['scores'] = model.decision_function(preprocess)
rank(df)

Unnamed: 0,scores,name,artist,album,track_id
21,0.152789,Where The Last Wave Broke,Insomnium,Across The Dark,1FQtNCA2nYXEeZ41Z4XyH7
49,0.134149,Through The Shadow,Insomnium,One For Sorrow,4bOX3KfGgQE1GkxxJ3OUtI
63,0.133181,Enhance My Nightmare,Sonic Syndicate,Eden Fire,2XD4NowUR2fQHNTF2FO1T5
0,0.132269,The Lesser Faith,Dark Tranquillity,Fiction,7LkxDAm3CxOMSFDZcRx0kI
39,0.13174,Sum Of The Universe,Universum,Mortuus Machina,5UqR2S9N3RTYNmpnpEYCwj
6,0.129903,Nihilistic Overdrive,Disarmonia Mundi,Mind Tricks (Extended Version),0wD8PBXXq3cDndyhu0SX9D
25,0.129193,Voices,Machinemade God,Masked,4UiUd7ClIzWK0r3qkswGc6
42,0.127836,Crimson Symphony,Blood Stain Child,Silence of Northern Hell,49Cc4uBdMIPPDDjmcn8DLb
3,0.125857,Eternal,Blood Stain Child,Epsilon,4ueVppRcLrdLdL7sQTHhWb
53,0.122369,The Second Flame,Omnium Gatherum,The Redshift,4YKY4vsi5tJKK2KsNY4zO5


In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75 entries, 0 to 74
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   track_id          75 non-null     object 
 1   artist_id         75 non-null     object 
 2   name              75 non-null     object 
 3   album             75 non-null     object 
 4   artist            75 non-null     object 
 5   release_date      75 non-null     object 
 6   length            75 non-null     float64
 7   popularity        75 non-null     float64
 8   key               75 non-null     int64  
 9   mode              75 non-null     int64  
 10  acousticness      75 non-null     float64
 11  danceability      75 non-null     float64
 12  energy            75 non-null     float64
 13  valence           75 non-null     float64
 14  instrumentalness  75 non-null     float64
 15  liveness          75 non-null     float64
 16  loudness          75 non-null     float64
 17 

In [28]:
# drop object variables
preprocess = df.drop(df.select_dtypes('object'), axis = 1)

# convert int64 to categoryical variables
int64 = preprocess.select_dtypes('int64')
for x in int64:
    preprocess[x] = preprocess[x].astype('category')
preprocess.head()

Unnamed: 0,length,popularity,key,mode,acousticness,danceability,energy,valence,instrumentalness,liveness,loudness,speechiness,tempo,time_signature,scores
0,4.630433,0.35,7,1,2.5e-05,0.411,0.965,0.422,0.216,0.063,-4.503,0.0565,159.993,4,0.132269
1,4.31955,0.44,7,0,0.000275,0.417,0.976,0.638,0.837,0.387,-3.196,0.0507,111.709,4,0.060754
2,4.9011,0.39,2,0,0.000156,0.188,0.949,0.538,0.839,0.32,-2.49,0.0599,108.075,4,0.06627
3,4.498,0.27,1,1,2.1e-05,0.508,0.977,0.305,0.0145,0.162,-4.582,0.0709,104.993,4,0.125857
4,6.147333,0.16,7,1,6e-05,0.159,0.763,0.269,0.0257,0.812,-4.462,0.039,175.869,3,-0.018615


In [75]:
# returns pvalues for linear regression model
Y = preprocess['scores'].values
X = preprocess.drop(['scores'], axis = 1).values

model = sm.OLS(Y, X, missing='drop')
model_result = model.fit()

pvalues = list(model_result.pvalues)
headers = list((preprocess.drop(['scores'], axis = 1)).columns)

for i in range(len(pvalues)):
    pvalues[i] = round(pvalues[i], 3)

headers_pvalues = dict(zip(headers, pvalues))  
headers_pvalues

{'length': 0.01,
 'popularity': 0.038,
 'key': 0.74,
 'mode': 0.82,
 'acousticness': 0.0,
 'danceability': 0.941,
 'energy': 0.0,
 'valence': 0.019,
 'instrumentalness': 0.007,
 'liveness': 0.001,
 'loudness': 0.009,
 'speechiness': 0.0,
 'tempo': 0.0,
 'time_signature': 0.0}

In [80]:
# returns any keys that are statistically insignificant
drops = {k:v for (k,v) in headers_pvalues.items() if v > 0.05}
drop_keys = list(drops.keys())
drop_keys

['key', 'mode', 'danceability']

In [82]:
# drop object variables
preprocess = df.drop(df.select_dtypes('object'), axis = 1)
preprocess = preprocess.drop(drop_keys, axis = 1)
# convert int64 to categoryical variables
int64 = preprocess.select_dtypes('int64')
for x in int64:
    preprocess[x] = preprocess[x].astype('category')
preprocess.head()

Unnamed: 0,length,popularity,acousticness,energy,valence,instrumentalness,liveness,loudness,speechiness,tempo,time_signature,scores
0,4.630433,0.35,2.5e-05,0.965,0.422,0.216,0.063,-4.503,0.0565,159.993,4,0.132269
1,4.31955,0.44,0.000275,0.976,0.638,0.837,0.387,-3.196,0.0507,111.709,4,0.060754
2,4.9011,0.39,0.000156,0.949,0.538,0.839,0.32,-2.49,0.0599,108.075,4,0.06627
3,4.498,0.27,2.1e-05,0.977,0.305,0.0145,0.162,-4.582,0.0709,104.993,4,0.125857
4,6.147333,0.16,6e-05,0.763,0.269,0.0257,0.812,-4.462,0.039,175.869,3,-0.018615


In [89]:
if 'scores' in preprocess.columns:
    predictions = preprocess.drop(['scores'], axis = 1)
else:
    predictions = preprocess

# create a pipeline    
predict_pipeline = make_pipeline(IsolationForest(n_estimators = (500),
                                                 max_samples = 'auto',
                                                 max_features = predictions.shape[1],
                                                 contamination = 0.05,
                                                 n_jobs = -1,
                                                 random_state = 21)
                                )
predict_pipeline.fit(predictions)

# save model as playlist_id.joblib
joblib.dump(predict_pipeline, str(playlist_id + '.joblib'))
clf = load('6vqdAph2gxHHgvojxM7js4.joblib')

scores = clf.decision_function(predictions)
df['scores'] = scores
rank(df)

Unnamed: 0,scores,name,artist,album,track_id
21,0.17207,Where The Last Wave Broke,Insomnium,Across The Dark,1FQtNCA2nYXEeZ41Z4XyH7
3,0.170371,Eternal,Blood Stain Child,Epsilon,4ueVppRcLrdLdL7sQTHhWb
49,0.154815,Through The Shadow,Insomnium,One For Sorrow,4bOX3KfGgQE1GkxxJ3OUtI
17,0.149748,Petrichor by Sulphur,Soilwork,The Ride Majestic,2v4gF2JIQXqdbVpZiyvDHj
39,0.147419,Sum Of The Universe,Universum,Mortuus Machina,5UqR2S9N3RTYNmpnpEYCwj
25,0.147241,Voices,Machinemade God,Masked,4UiUd7ClIzWK0r3qkswGc6
53,0.146438,The Second Flame,Omnium Gatherum,The Redshift,4YKY4vsi5tJKK2KsNY4zO5
63,0.145923,Enhance My Nightmare,Sonic Syndicate,Eden Fire,2XD4NowUR2fQHNTF2FO1T5
6,0.143037,Nihilistic Overdrive,Disarmonia Mundi,Mind Tricks (Extended Version),0wD8PBXXq3cDndyhu0SX9D
0,0.137714,The Lesser Faith,Dark Tranquillity,Fiction,7LkxDAm3CxOMSFDZcRx0kI


In [97]:
# returns spotify recomendations for the most homogenous tracks in the playlist
refresh()
seed_tracks = rank(df)[:5]['track_id'].tolist()
seed_rec = sp.recommendations(seed_tracks = seed_tracks, limit = 100)

rec_list = []
for i in range(len(seed_rec['tracks'])):
    ids = seed_rec['tracks'][i]['id']
    rec_list.append(ids)
rec_list[:5]

['3d9hmArOgeP3iBHEaOL15O',
 '5yEVh1qPpp1pvNarqJPYwo',
 '63H2QSdtZPc3BA6e2IrvvW',
 '44mommx98x2Dh3Ud9fdbym',
 '73vyulFUPAla99D1wUtMQo']

In [98]:
track_features = []
for i in range(len(rec_list)):
    track = getTrackFeatures(rec_list[i])
    track_features.append(track)

# build dataframe
rec_df = pd.DataFrame(track_features,
                  columns = ['track_id',
                             'artist_id',
                             'name',
                             'album',
                             'artist',
                             'release_date',
                             'length',
                             'popularity',
                             'key',
                             'mode',
                             'acousticness',
                             'danceability',
                             'energy',
                             'valence',
                             'instrumentalness',
                             'liveness',
                             'loudness',
                             'speechiness',
                             'tempo',
                             'time_signature'])

# squish popularity to be in the same scale as the other features
# squish length to be measured in minutes
rec_df['popularity'] = rec_df['popularity'] / 100
rec_df['length'] = (rec_df['length'] / 1000) / 60

rec_df.head()

Unnamed: 0,track_id,artist_id,name,album,artist,release_date,length,popularity,key,mode,acousticness,danceability,energy,valence,instrumentalness,liveness,loudness,speechiness,tempo,time_signature
0,3d9hmArOgeP3iBHEaOL15O,4JNF6YOUi69O4zAdwQVGLY,Wings Of Feather And Wax,Wings Of Feather And Wax,Killer Be Killed,2014-04-11,3.73585,0.14,0,1,3.7e-05,0.256,0.964,0.283,0.000585,0.351,-3.597,0.0919,92.469,4
1,5yEVh1qPpp1pvNarqJPYwo,0iem9JStyv56PV2X7avZbo,Stormghost,Cold Inferno,Disarmonia Mundi,2015-06-09,4.4135,0.32,8,1,5e-06,0.403,0.987,0.144,0.000414,0.385,-2.362,0.111,98.016,4
2,63H2QSdtZPc3BA6e2IrvvW,1tcrgTKBUpGVRGnPz4breO,Whispering Deeps,Inner Universe,Words Of Farewell,2020-03-31,5.807333,0.27,2,1,8.7e-05,0.402,0.981,0.0897,0.31,0.0687,-4.476,0.216,120.042,4
3,44mommx98x2Dh3Ud9fdbym,7gTbq5nTZGQIUgjEGXQpOS,When the Lights Are Down,The Black Halo,Kamelot,2005-03-15,3.6911,0.52,10,0,5.6e-05,0.303,0.939,0.369,0.0128,0.108,-6.377,0.0671,169.97,4
4,73vyulFUPAla99D1wUtMQo,3Meu28o8P5z9Zjm6NTGihT,Falling Snow,Ashes Against The Grain,Agalloch,2006-08-15,9.646,0.36,10,1,7e-06,0.184,0.987,0.166,0.585,0.0859,-4.328,0.0736,131.129,4


In [100]:
# drop object variables
rec_preprocess = rec_df.drop(rec_df.select_dtypes('object'), axis = 1)
rec_preprocess = rec_preprocess.drop(drop_keys, axis = 1)

# convert int64 to categoryical variables
int64 = rec_preprocess.select_dtypes('int64')
for x in int64:
    rec_preprocess[x] = rec_preprocess[x].astype('category')
rec_preprocess.head()


if 'scores' in rec_preprocess.columns:
    rec_predictions = rec_preprocess.drop(['scores'], axis = 1)
else:
    rec_predictions = rec_preprocess
    
scores = clf.decision_function(rec_predictions)
rec_df['scores'] = scores
rank(rec_df)

Unnamed: 0,scores,name,artist,album,track_id
29,0.154815,Through The Shadow,Insomnium,One For Sorrow,4bOX3KfGgQE1GkxxJ3OUtI
98,0.149192,Blade Reflections,Omnium Gatherum,Blade Reflections,02jmSLjpElhxW6w0waFQtw
32,0.145093,Strenght from My Wounds,Lahmia,Into the Abyss,4BFsNwlAP8AxLX4BOsTFlq
73,0.145078,Afterlife,Metalite,Heroes in Time,7E23KVVacUy7IXPwF60iZG
58,0.143206,Ascend to the Throne,Rise to Fall,Defying the Gods,0wZPKoGk8ddgmgqVUXaWBc
40,0.142138,Death in General,Soilwork,The Ride Majestic,1suh3dQIhaJiHger91tDQ0
97,0.1404,Frozen Angel,Norther,N,0qwHMBslXUZo7ePfVYnU5X
76,0.139749,Midnight Runner,Mustasch,Midnight Runner,4sHSGl11LQbHVynHu3p6JX
39,0.138885,In Sorrow,Destinity,XI Reasons to See,7qQeldOEJZ5JLSN3L8VA9M
19,0.136988,Unbreakable,My Dear Addiction,Unbreakable,4cOVDjeBdpQiU4Qolf9JXp


In [113]:
# top 75% of original playlist is above this score
cutoff = dict(df['scores'].describe())['75%']

# combine original playlist with top reccomendations
frames = [df, rec_df.loc[rec_df['scores'] >= cutoff]]
pull_list = pd.concat(frames, ignore_index = True)
pull_list = pull_list.drop_duplicates(subset = 'track_id')
len(pull_list)

91

In [117]:
pull_list

Unnamed: 0,track_id,artist_id,name,album,artist,release_date,length,popularity,key,mode,...,danceability,energy,valence,instrumentalness,liveness,loudness,speechiness,tempo,time_signature,scores
0,7LkxDAm3CxOMSFDZcRx0kI,5EHvXKnNz78jkAVgTQLQ5O,The Lesser Faith,Fiction,Dark Tranquillity,2007-09-26,4.630433,0.35,7,1,...,0.411,0.965,0.422,0.216,0.063,-4.503,0.0565,159.993,4,0.137714
1,23pLHGVFWELvxIiw0CQMaX,1xUhNgw4eJDZfvumIpcz1B,Hatebreeder,Hatebreeder,Children Of Bodom,1999-05-04,4.31955,0.44,7,0,...,0.417,0.976,0.638,0.837,0.387,-3.196,0.0507,111.709,4,0.044909
2,2dzCUvvliKLw9dj84IRlAg,1xUhNgw4eJDZfvumIpcz1B,Towards Dead End,Hatebreeder,Children Of Bodom,1999-05-04,4.9011,0.39,2,0,...,0.188,0.949,0.538,0.839,0.32,-2.49,0.0599,108.075,4,0.088695
3,4ueVppRcLrdLdL7sQTHhWb,4EvbQBS99RXzFGGimAS3i9,Eternal,Epsilon,Blood Stain Child,2011-06-30,4.498,0.27,1,1,...,0.508,0.977,0.305,0.0145,0.162,-4.582,0.0709,104.993,4,0.170371
4,4Lgj2JWNhM5DTWAMhz34AS,2UOVgpgiNTC6KK0vSC77aD,Sampo,Skyforger,Amorphis,2009-05-29,6.147333,0.16,7,1,...,0.159,0.763,0.269,0.0257,0.812,-4.462,0.039,175.869,3,-0.029062
5,33Tw7bBpn08xmigGaruVhL,6e8ISIsI7UQZPyEorefAhK,In Death's Embrace,Enthrone Darkness Triumphant,Dimmu Borgir,1997-05-30,5.702667,0.41,5,1,...,0.0975,0.897,0.191,0.87,0.16,-5.352,0.0669,89.358,4,0.099276
6,0wD8PBXXq3cDndyhu0SX9D,0iem9JStyv56PV2X7avZbo,Nihilistic Overdrive,Mind Tricks (Extended Version),Disarmonia Mundi,2006,4.858,0.3,11,0,...,0.392,0.985,0.239,0.564,0.199,-2.937,0.072,95.003,4,0.143037
7,3kaEzsEQ6JweUJwzjpCCce,4XJZ7zUJzQuayqyAVJowKX,Beauty of Malice,Engraved in Black,Graveworm,2003-06-30,5.4111,0.13,4,1,...,0.431,0.977,0.232,0.672,0.25,-2.959,0.0788,129.977,4,0.123568
8,3HnwTO5lY1TeWYzFcnM4Eh,57ylwQTnFnIhJh4nu4rxCs,Ever Dying,Subterranean,In Flames,1995-06-15,4.388567,0.19,7,0,...,0.342,0.83,0.159,0.879,0.303,-7.971,0.0584,105.001,4,0.024356
9,6xPwcnwKGZfeLxCHMtbVhu,3uIgLG971oRM5fe6v8lvQS,Devoid Of Caring,Above The Weeping World,Insomnium,2006-08-09,5.671333,0.24,5,1,...,0.401,0.978,0.108,0.947,0.241,-3.235,0.0777,94.975,4,0.104753


In [115]:
def generate_playlist(input_track, input_playlist):
    refresh()
    pickiness = 3 # number of nearest songs
    knn = NearestNeighbors(n_neighbors = pickiness + 1)
    up_next = []
    up_next_length = 24 
    working_playlist = input_playlist
    
    seed = input_track
    up_next.append(input_track)
    
    for i in range(up_next_length): 

    # find seed track nearest neighbors (currently returns 6, trimmed down to 5)
        current_id = working_playlist.loc[working_playlist['track_id'] == seed]

    # retrain every, necessary evil to ensure closest remaining neighbors are selected
        knn_fit = preprocess(working_playlist)
        knn.fit(knn_fit)

    # find n nearest neighbors (6, generally includes self as nearest)
        processed_c_id = preprocess(current_id)
        neighbors = knn.kneighbors(processed_c_id,
                                   return_distance = False)
        neighbors_list = np.concatenate(neighbors).ravel().tolist()

    # exclude current seed as a next seed option
        seed_index = working_playlist.index[working_playlist['track_id'] == seed].to_list()
        for i in range(len(neighbors_list)-1):
            if neighbors_list[i] == seed_index[0]:
                del neighbors_list[i]
            else:
                pass

    # logic to reduce to drop furthest if seed isn't a nearest neighbor to self (not common in testing)
        if len(neighbors_list) == pickiness + 1:
            neighbors_list = neighbors_list[:pickiness]
        else:
            pass

    # randomly selected new seed from neighbors
        new_seed_index = []
        new_seed_index.append(random.choice(neighbors_list))
    
    # logic to greatly reduce probability of repeating artists back to back
        seed_artist = working_playlist.iloc[seed_index,[1]].values[0][0]
        new_seed_artist = working_playlist.iloc[new_seed_index,[1]].values[0][0]
        if seed_artist == new_seed_artist:
            neighbors_list.remove(new_seed_index[0])
            new_seed_index = []
            new_seed_index.append(random.choice(neighbors_list))
        else:
            pass
    
        new_seed = working_playlist.iloc[new_seed_index,[0]]
        new_seed_id = new_seed['track_id'].tolist()
        new_seed_id = new_seed_id[0]

    # update queue with new track id
        up_next.extend(new_seed['track_id'].tolist())

    # drop old seed from available seeds. cleans up dataframe
        working_playlist = working_playlist.drop(seed_index[0], 0)
        working_playlist.reset_index(inplace = True)
        working_playlist = working_playlist.drop(working_playlist.columns[[0]], 1)

        seed = new_seed_id

    # querry input playlist to generate a df with necessary info
    compile_playlist = []
    for i in range(len(up_next)):
        track = input_playlist.loc[input_playlist['track_id'] == up_next[i]]
        track_list = track[['track_id', 'name', 'artist','album']].values.tolist()
        compile_playlist.extend(track_list)
    final_playlist = pd.DataFrame(compile_playlist, columns =['track_id', 'name', 'artist','album'])
    
    return final_playlist

In [116]:
input_track = '4bOX3KfGgQE1GkxxJ3OUtI'
input_playlist = pull_list

generate_playlist(input_track, input_playlist)

TypeError: 'DataFrame' object is not callable