In [2]:
# Imports 

import pandas as pd
import numpy as np 
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors

In [3]:
import pandas as pd

# Load the scaled track dataset from the project's GitHub repository and
# preview the first rows. The bare last expression renders as a rich table.
scaled_dataset_url = 'https://raw.githubusercontent.com/trackteam-spotify/data-science/master/data/final_scaled_dataset.csv'
scaled = pd.read_csv(scaled_dataset_url)

scaled.head()

Unnamed: 0,index,artist_name,track_name,track_id,popularity,danceability,energy,instrumentalness,liveness,loudness,tempo
0,1,Nicholas Britell,Eros,3w5s0j9clwhk0O2uScrNOo,-0.259352,-2.986856,-1.425094,2.658536,-0.859445,-2.108236,-1.200244
1,2,Seeb,Grip,3gicyfiEVMGONgzygpWjNT,3.219563,0.049805,1.276434,-0.398629,0.478628,1.25485,0.614084
2,3,Lagwagon,Reign,7jLDlShR2ARgNKyAOD94LD,-1.157137,-0.673351,1.466926,-0.397736,-0.172702,0.643049,-0.479831
3,4,Super Whatevr,Someone Somewhere Somehow,0XvFwux1NYJrKmCYJ7DOjE,-1.157137,-0.922306,-1.4554,-0.296192,-0.394028,-1.292787,0.206582
4,5,The Front Bottoms,Flashlight,0xR5ZhiksJK6lMRdI6S2A4,-0.371575,-0.951944,1.107588,-0.398629,-0.526824,0.942632,1.048378


In [4]:
# Confirm the dimensions of the loaded frame as (rows, columns).
scaled.shape

(5000, 11)

## Modeling K-Nearest Neighbors

In [None]:
# Variables to cluster

cluster = ['popularity', 'danceability', 'energy', 'instrumentalness', 'liveness', 'loudness', 'tempo']

# Fit KMeans for k = 1..30 and record the inertia (within-cluster sum of
# squares) of each fit. NOTE(review): `distortion` is not consumed by any
# visible cell -- presumably it was meant for an elbow plot; confirm.
cluster_data = scaled.loc[:, cluster]  # loop-invariant, so slice once

distortion = list()
for k in range(1, 31): #started with range [1, 51] and narrowed down to this range
    # random_state makes the sweep reproducible across kernel restarts;
    # n_init is explicit because its default differs between scikit-learn versions.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(cluster_data)
    distortion.append(kmeans.inertia_) # append distortion value to list

In [6]:
# Build the feature matrix first, then fit the neighbour index on it.
features = ['popularity', 'danceability', 'energy', 'instrumentalness', 'liveness', 'loudness', 'tempo']
X = scaled.loc[:, features].values

# 15 neighbours per query; later cells drop the first hit and keep the rest.
knn = NearestNeighbors(n_neighbors=15)

# fit() returns the estimator itself, so leaving it as the last expression
# displays the fitted model's parameters.
knn.fit(X)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=15, p=2,
                 radius=1.0)

In [7]:
# Smoke-test the fitted index with one track: row 20 of the feature matrix.
test_song = X[20]

# kneighbors() expects a 2-D array of queries, so present the single row as
# a (1, n_features) batch. It returns (distances, row positions).
query_batch = test_song.reshape(1, -1)
distance, neighbors = knn.kneighbors(query_batch)
distance, neighbors

(array([[0.        , 0.63472403, 0.68087431, 0.68190637, 0.78568082,
         0.78977841, 0.79588194, 0.80620697, 0.8157036 , 0.81841708,
         0.82283869, 0.84523641, 0.84715768, 0.93537395, 0.93875525]]),
 array([[  20, 3806,  301, 2356, 3462, 1585, 1731, 1940, 2330,  695, 2353,
         1959,  849, 4776, 3544]]))

In [8]:
# Show the full record at row position 20 -- the same position that was used
# as the test query (X[20]) -- to see which track the neighbours belong to.
scaled.iloc[20]

index                                           21
artist_name                               Dua Lipa
track_name          New Rules - Initial Talk Remix
track_id                    2ygfHXyt3gvyhvKrNJU61n
popularity                                0.750656
danceability                             -0.341411
energy                                     1.59248
instrumentalness                         -0.398629
liveness                                 -0.235938
loudness                                   1.15463
tempo                                    -0.108978
Name: 20, dtype: object

## New playlist based off user input

In [9]:
# Skip the first neighbour: the query track is itself part of the fitted
# data, so it comes back as its own nearest hit.
neighbor_rows = scaled.iloc[neighbors[0][1:]]

# Pair each neighbour's title with its artist, in neighbour order.
song_list = list(zip(neighbor_rows['track_name'], neighbor_rows['artist_name']))

names = ['song', 'artist']
new_playlist = pd.DataFrame(song_list, columns=names)

print(new_playlist.shape)
new_playlist

(14, 2)


Unnamed: 0,song,artist
0,On the Line - Featuring Jonas Brothers,Jonas Brothers
1,Uncomfortable,Halestorm
2,Jaded,Aerosmith
3,Apocalypse Dreams,Tame Impala
4,Adrenalize,In This Moment
5,The Other Side,Jason Derulo
6,Honky Tonk Badonkadonk,Trace Adkins
7,My Dilemma 2.0,Selena Gomez & The Scene
8,Rhinestone World,Dallas Smith
9,Desire - Gryffin Remix,Years & Years


## Pickle the Model


In [9]:
# Serialize the fitted NearestNeighbors model to 'model.pkl' in the current
# working directory; joblib.dump returns the list of files it wrote.
import joblib
joblib.dump(knn, 'model.pkl')

['model.pkl']

In [10]:
# Reload the model from 'model.pkl' to check the save/load round trip.
# NOTE: joblib.load unpickles arbitrary objects -- only load files you trust.
model = joblib.load('model.pkl')

In [11]:
# Query the reloaded model with row 11 of X (reshaped to a 1-row batch).
# [1] selects the neighbour-position array, [0] the single query's row, and
# [1:] drops the first hit, which earlier cells treat as the query itself.
model.kneighbors(X[11].reshape(1,-1))[1][0][1:]

array([  59, 3770,  360,  516,  102, 4828, 2162, 2257, 1526, 2219, 1453,
       4906, 4911, 1166])

## Sentiment Analysis (VADER: lexicon- and rule-based, not a neural network)

In [10]:
!pip install vaderSentiment

Collecting vaderSentiment
[?25l  Downloading https://files.pythonhosted.org/packages/76/fc/310e16254683c1ed35eeb97386986d6c00bc29df17ce280aed64d55537e9/vaderSentiment-3.3.2-py2.py3-none-any.whl (125kB)
[K     |██▋                             | 10kB 13.3MB/s eta 0:00:01[K     |█████▏                          | 20kB 18.7MB/s eta 0:00:01[K     |███████▉                        | 30kB 18.7MB/s eta 0:00:01[K     |██████████▍                     | 40kB 11.8MB/s eta 0:00:01[K     |█████████████                   | 51kB 9.0MB/s eta 0:00:01[K     |███████████████▋                | 61kB 9.3MB/s eta 0:00:01[K     |██████████████████▏             | 71kB 9.0MB/s eta 0:00:01[K     |████████████████████▉           | 81kB 9.6MB/s eta 0:00:01[K     |███████████████████████▍        | 92kB 9.2MB/s eta 0:00:01[K     |██████████████████████████      | 102kB 9.8MB/s eta 0:00:01[K     |████████████████████████████▋   | 112kB 9.8MB/s eta 0:00:01[K     |███████████████████████████████▏|

In [11]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Score every track title with VADER, which is a lexicon- and rule-based
# sentiment analyzer (not a neural network). polarity_scores() returns a dict
# of scores; only the 'compound' (overall polarity) value is kept.
analyzer = SentimentIntensityAnalyzer()
sentiments = [
    analyzer.polarity_scores(title)['compound']
    for title in scaled['track_name']
]

# NOTE: this adds a column to `scaled` in place, so the frame no longer has
# the shape reported right after loading; feature selection elsewhere must
# keep picking columns by name.
scaled['sentiments'] = sentiments

In [12]:
# Five tracks whose titles received the highest compound sentiment score.
scaled.sort_values(by=['sentiments'],ascending=False).head(5)

Unnamed: 0,index,artist_name,track_name,track_id,popularity,danceability,energy,instrumentalness,liveness,loudness,tempo,sentiments
4275,4276,LANCO,Greatest Love Story - Single Mix,6Cju5I3wKoQhAJsuZuLWGB,0.077317,0.352107,0.774227,-0.398629,-0.507853,0.240678,-1.433452,0.8555
259,260,Jeff Mangum,I Love How You Love Me,07ARfS39HSzPfEWZS8opOs,-1.157137,-1.295739,0.388912,-0.398616,4.15896,-0.334077,-0.94618,0.8555
4824,4825,Céline Dion,"My Heart Will Go On - Love Theme from ""Titanic""",3oEHQmhvFLiE7ZYES0ulzv,2.658448,-0.957871,-1.442412,-0.398608,-0.450941,-0.765786,-0.690137,0.8555
598,599,Andy Williams,Happy Holiday / The Holiday Season,3sDdyBHQ60Cs1opmIyRvhp,-0.371575,-0.501453,-0.684772,-0.398629,0.282596,-0.436865,-1.523315,0.8442
4767,4768,Natalia Lafourcade,"Amor, Amor de Mis Amores",3dB42OEuVNiktvt3V5JRVI,0.750656,0.589208,-0.914228,-0.398506,-0.381381,0.310273,-0.495322,0.8402


In [13]:
# Five tracks with the lowest compound sentiment score. The sort is still
# descending, so the most negative title is the last row shown.
scaled.sort_values(by=['sentiments'],ascending=False).tail(5)

Unnamed: 0,index,artist_name,track_name,track_id,popularity,danceability,energy,instrumentalness,liveness,loudness,tempo,sentiments
1444,1445,Kid Cudi,Too Bad I Have To Destroy You Now,6kpiwAM3vWN9UbGAQw2v8q,-0.259352,0.115007,0.237384,-0.320904,-0.526824,-0.206235,-0.462764,-0.7906
3027,3028,Rich Brian,Crisis (feat. 21 Savage),6GmjF0MtrK8UHQlIh6tzRt,0.862879,1.828056,0.26769,-0.398423,-0.43197,0.473449,0.845482,-0.7964
4460,4461,Natti Natasha,Pain Killer (feat. Chika),5X7AFA7B9RVg0IrqARIrWW,-1.044914,0.808525,0.89112,-0.398629,0.428039,0.508354,0.171945,-0.8225
576,577,Little Big Town,Pain Killer,7kieCiVmj4StWHclc04u3M,-0.932691,-0.222861,0.54044,-0.398629,1.073046,0.922931,1.414603,-0.8225
3761,3762,Scott Helman,Cry Cry Cry,6SgDipB3aLBZ6Hi7GNXWBc,-0.371575,0.215775,1.129235,-0.398629,-0.267556,0.773461,-0.494618,-0.8519


## Visualization 1

In [14]:
import plotly.graph_objects as go

# Axes of the radar chart.
categories = ['popularity', 'danceability', 'energy', 'instrumentalness', 'tempo']

# (legend name, values in `categories` order) for each track to overlay.
# The values are hard-coded. NOTE(review): the first entry matches the row
# shown for position 20 truncated to two decimals; the other two are
# presumably copied the same way from their rows -- verify.
radar_tracks = [
    ('New Rules - Dua Lipa', [0.75, -0.34, 1.59, -0.39, -0.10]),
    ('Apocalypse Dreams - Tame Impala', [0.63, -0.57, 1.38, 0.02, 0.27]),
    ('On The Line - Jonas Brothers', [0.52, -0.11, 1.26, -0.39, 0.17]),
]

fig = go.Figure()

# One filled polygon per track, added in the order listed above.
for track_label, track_values in radar_tracks:
    fig.add_trace(
        go.Scatterpolar(
            r=track_values,
            theta=categories,
            fill='toself',
            name=track_label,
        )
    )

# Fixed radial range so every polygon is drawn against the same scale.
radial_axis = dict(visible=True, range=[-2, 2])
fig.update_layout(polar=dict(radialaxis=radial_axis), showlegend=False)

fig.show()

## Visualization 2 

In [15]:
import pandas as pd

# Load the unscaled "most popular songs" table, which carries a genre column
# and raw audio attributes, and preview the first rows.
popular_songs_url = 'https://raw.githubusercontent.com/Build-Week-Spotify-Song-Suggester-5/Data-Science/master/app/most_popular_spotify_songs.csv'
df = pd.read_csv(popular_songs_url)

df.head()

Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,R&B,Mary J. Blige,Be Without You - Kendu Mix,2YegxR5As7BeQuVp2U6pek,65,0.083,0.724,246333,0.689,0.0,D,0.304,-5.922,Minor,0.135,146.496,4/4,0.693
1,R&B,Rihanna,Desperado,6KFaHC9G178beAp7P0Vi5S,63,0.323,0.685,186467,0.61,0.0,C,0.102,-5.221,Minor,0.0439,94.384,3/4,0.323
2,R&B,Yung Bleu,Ice On My Baby (feat. Kevin Gates) - Remix,6muW8cSjJ3rusKJ0vH5olw,62,0.0675,0.762,199520,0.52,4e-06,F,0.114,-5.237,Minor,0.0959,75.047,4/4,0.0862
3,R&B,Surfaces,Heaven Falls / Fall on Me,7yHqOZfsXYlicyoMt62yC6,61,0.36,0.563,240597,0.366,0.00243,B,0.0955,-6.896,Minor,0.121,85.352,4/4,0.768
4,R&B,Olivia O'Brien,Love Myself,4XzgjxGKqULifVf7mnDIQK,68,0.596,0.653,213947,0.621,0.0,B,0.0811,-5.721,Minor,0.0409,100.006,4/4,0.466


In [16]:
# Compare/contrast genres
groupby_genre = df.groupby(['genre'])

# Average every numeric attribute per genre. numeric_only=True is required
# because the frame also holds text columns (artist, track name, key, mode,
# time signature); without it pandas >= 2.0 raises TypeError instead of
# silently dropping them.
genre_musical_attributes = groupby_genre.mean(numeric_only=True)

# Rank genres from most to least popular on average.
genre_musical_attributes.sort_values(by=['popularity'],ascending=False)

Unnamed: 0_level_0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence
genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Pop,67.168938,0.253703,0.60526,218404.415944,0.659991,0.017253,0.178326,-6.175284,0.074487,123.306378,0.505836
Rap,59.65584,0.223981,0.703992,202058.282654,0.617269,0.02331,0.188647,-7.315692,0.181473,121.026423,0.44205
Rock,59.165608,0.20877,0.520227,246147.502269,0.674086,0.058634,0.194449,-7.873069,0.053366,122.918544,0.513846
Hip-Hop,58.622716,0.173847,0.727813,217497.842561,0.645481,0.008903,0.200204,-6.785387,0.211632,122.017802,0.461974
Dance,58.329712,0.155698,0.641946,224235.651374,0.695615,0.02797,0.185667,-6.01337,0.085471,120.677401,0.517677
Indie,53.748256,0.310207,0.573341,215959.894146,0.60248,0.099785,0.171845,-7.606221,0.065098,120.680002,0.452243
Children’s Music,53.473438,0.150533,0.525571,232100.3875,0.716429,0.029552,0.196079,-6.307256,0.070634,122.549317,0.440069
Alternative,51.028906,0.16102,0.541975,232911.767936,0.7129,0.059844,0.195165,-6.503551,0.089687,122.325118,0.449224
Folk,50.92276,0.482245,0.52471,234371.038688,0.481843,0.089172,0.16578,-9.977678,0.044788,118.489007,0.427171
Reggaeton,50.51512,0.224959,0.739527,233292.535513,0.736152,0.003471,0.194116,-5.636099,0.126337,122.575213,0.63994


In [17]:
# Popularity By Genre

import plotly.graph_objects as go
import plotly.express as px  # not used in this cell; kept in case other cells rely on it

# Hard-coded genre labels and their mean popularity, listed from most to
# least popular. NOTE(review): the numbers appear to be the per-genre means
# with the decimals dropped -- confirm against the grouped table.
genre_labels = ['Pop', 'Rap', 'Rock', 'Hip-Hop', 'Dance', 'Indie', 'Children’s Music', 'Alternative', 'Folk', 'Reggaeton', 'R&B', 'Movie',
                'Country', 'Reggae', 'Soul', 'Electronic', 'Jazz']
genre_popularity = [67, 59, 59, 58, 58, 53, 53, 51, 50, 50, 49, 49, 48, 48, 47, 47, 47]

fig = go.Figure()
fig.add_trace(go.Bar(
    y=genre_labels,
    x=genre_popularity,
    name='Genre',
    orientation='h',
    marker=dict(
        color='rgba(2, 181, 8, 0.6)',
        line=dict(color='rgba(105, 105, 105, 1.0)', width=1)
    )
))

# Source
# Footnote placed in paper coordinates, just below the plotting area.
annotations = [
    dict(xref='paper', yref='paper',
         x=0, y=-0.15,
         text='Source: https://www.kaggle.com/yamaerenay/spotify-dataset-19212020-160k-tracks',
         font=dict(family='Arial', size=13, color='rgb(150,150,150)'),
         showarrow=False)
]

fig.update_layout(
    barmode='stack',
    title_text="Popularity By Genre",
    annotations=annotations,
)
fig.show()

## Hard Code Input Dictionary

In [None]:
# Columns that make up the feature space. Selecting them by name keeps any
# other column of `scaled` (ids, names, derived scores) out of the model.
FEATURE_COLUMNS = ['popularity', 'danceability', 'energy', 'instrumentalness',
                   'liveness', 'loudness', 'tempo']


def model_maker():
    """Fit a nearest-neighbour index on the feature columns of ``scaled``.

    Returns
    -------
    sklearn.neighbors.NearestNeighbors
        Estimator fitted on ``scaled[FEATURE_COLUMNS]`` that returns 15
        neighbours per query.
    """
    feature_matrix = scaled[FEATURE_COLUMNS].values

    knn = NearestNeighbors(n_neighbors=15)
    knn.fit(feature_matrix)
    return knn


# Module-level model used by predict().
knn = model_maker()


def predict(user_input):
    """Build a playlist of the tracks most similar to one seed track.

    Parameters
    ----------
    user_input : int
        Row position in ``scaled`` of the seed track.

    Returns
    -------
    pandas.DataFrame
        One row per recommended track with columns ``song`` and ``artist``,
        ordered from nearest to farthest (14 rows: 15 neighbours minus the
        first hit).
    """
    # Read the query straight from `scaled` so this function does not depend
    # on a feature matrix left behind by another cell.
    test_song = scaled[FEATURE_COLUMNS].values[user_input]

    # kneighbors() expects a 2-D batch of queries.
    distance, neighbors = knn.kneighbors(np.array([test_song]))

    # The seed track is part of the fitted data, so the first hit is dropped
    # on the assumption that it is the seed itself (distance 0).
    # NOTE(review): if another track has identical features, the tie order
    # may put that track first instead -- confirm this is acceptable.
    neighbor_positions = neighbors[0][1:]

    new_playlist = (
        scaled.iloc[neighbor_positions][['track_name', 'artist_name']]
        .rename(columns={'track_name': 'song', 'artist_name': 'artist'})
        .reset_index(drop=True)
    )
    return new_playlist

In [None]:
# NOTE(review): `user_input` is not assigned in any visible cell. predict()
# uses it to select the query row, so it presumably must be an integer row
# position in `scaled` -- define it before running this cell.
predict(user_input)