Libraries

In [1]:
import pandas as pd
import numpy as np

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

import requests
from random import randint
from time import sleep

from sklearn import cluster, datasets
from sklearn.preprocessing import StandardScaler
from matplotlib.lines import Line2D

from sklearn.cluster import KMeans


### Spotipy

In [2]:
# Open the local credentials file in text mode; its content is read in the next cell
secrets_file = open("SpotifySecret.txt", mode="r")

In [3]:
# Read the whole credentials file into memory, then release the file handle
# so it does not stay open for the lifetime of the kernel
string = secrets_file.read()
secrets_file.close()

In [4]:
# Parse the credentials text into a dict: one "name:value" pair per line
secrets_dict = {}
for line in string.splitlines():
    # Split on the first ':' only, so a value containing ':' is kept whole
    name, separator, value = line.partition(':')
    if not separator:
        # Blank line or a line without a "name:value" pair: nothing to store
        continue
    # Strip surrounding whitespace (including a stray '\r') from both parts
    secrets_dict[name.strip()] = value.strip()

In [5]:
# Build the Spotify client, authenticating with the client id ('cid') and
# client secret ('cs') parsed from the credentials file
sp = spotipy.Spotify(
    auth_manager=SpotifyClientCredentials(
        client_id=secrets_dict['cid'],
        client_secret=secrets_dict['cs'],
    )
)

In [None]:
def get_playlist_tracks(playlist_id, client=None, pause_range=(1, 3)):
    """Return every item of a Spotify playlist, following pagination.

    Parameters
    ----------
    playlist_id : str
        ID (or URI/URL) of the playlist to download.
    client : optional
        Object exposing ``playlist_items`` and ``next`` like a
        ``spotipy.Spotify`` client. Defaults to the notebook-level ``sp``.
    pause_range : tuple of (int, int) or None
        Inclusive bounds, in seconds, of the random pause taken before each
        additional page request. ``None`` disables the pause.

    Returns
    -------
    list
        The concatenated ``'items'`` entries of all result pages.
    """
    if client is None:
        client = sp
    # playlist_items replaces the deprecated user_playlist_tracks; restricting
    # additional_types to tracks keeps the same kind of items as before
    results = client.playlist_items(playlist_id, additional_types=("track",))
    # Copy so that extending below never mutates the response's own list
    tracks = list(results['items'])
    while results['next'] is not None:
        if pause_range is not None:
            # Be polite to the API between consecutive page requests
            sleep(randint(*pause_range))
        results = client.next(results)
        tracks.extend(results['items'])
    return tracks

I will combine several top-song playlists into a single dataset.

In [None]:
# Spotify playlist IDs whose tracks are merged into one dataset
plIds = [
    "37i9dQZEVXbMDoHDwVN2tF",
    "4NLrcCMFyUgtXor7EjlE7d",
    "5ZyAjPmaz9KOB4f73RFYvi",
    "6MJSGcF4iV79gyo8xZpd8U",
    "6CfQ2Ptcxju9l6YC2LXzUb",
    "4hNaqkezNNv7ImufYGHgkf",
    "1zrx1DCawPLg6Y8AMFgCUZ",
    "2iBH9S3UXlrtUBxjffgZEh",
    "1cax1gYS1699tnR5bVjtiY",
    "2P4tjMKY8ORcUU17QBz83q",
    "5fcPNR6KXA3iYgNrvTDZpY",
    "62SVVpIhl7b5c82G0IH603",
]




In [None]:
# Download every playlist and concatenate their items into one flat list
all_tracks = []
for playlist_id in plIds:
    all_tracks.extend(get_playlist_tracks(playlist_id))

In [None]:
# Audio-feature fields copied from each Spotify audio-features object,
# in the column order of the resulting DataFrame
FEATURE_COLUMNS = ["danceability", "energy", "key", "loudness", "mode", "speechiness",
                   "acousticness", "instrumentalness", "liveness", "valence", "tempo"]
# Number of tracks sent per audio-features request (the spotipy docs give 100 as the maximum)
AUDIO_FEATURES_BATCH_SIZE = 100

# Keep only playlist entries that can be looked up: entries without a track
# object or URI would crash the loop below, and entries flagged as local files
# are skipped as well (presumably Spotify has no audio features for them)
playable_tracks = [item["track"] for item in all_tracks
                   if item.get("track")
                   and item["track"].get("uri")
                   and not item["track"].get("is_local")]

song_rows = []
for start in range(0, len(playable_tracks), AUDIO_FEATURES_BATCH_SIZE):
    batch = playable_tracks[start:start + AUDIO_FEATURES_BATCH_SIZE]
    # One request per batch instead of one request (and one pause) per track
    batch_features = sp.audio_features([track["uri"] for track in batch]) or []
    for track, features in zip(batch, batch_features):
        if features is None:
            # No audio features returned for this track: leave it out
            continue
        row = {"title": track["name"],
               # Only the first listed artist is stored
               "artist": track["artists"][0]["name"] if track.get("artists") else None}
        for column in FEATURE_COLUMNS:
            row[column] = features[column]
        song_rows.append(row)
    # Random pause between requests to avoid hammering the API
    sleep(randint(1,3))

# One row per song; passing `columns` fixes the column order even when no rows were collected
audioFeatures = pd.DataFrame(song_rows, columns=["title", "artist"] + FEATURE_COLUMNS)

In [6]:
# Load the previously saved songs-with-audio-features table (produced by the
# download cells above) instead of querying the API again
audioFeatures = pd.read_csv(filepath_or_buffer='TOPListsLarge.csv')

In [7]:
# (number of rows, number of columns) of the loaded dataset
audioFeatures.shape

(3456, 13)

In [8]:
# Preview the first five rows of the loaded dataset
audioFeatures.head(n=5)

Unnamed: 0,title,artist,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,Heat Waves,Glass Animals,0.761,0.525,11,-6.9,1,0.0944,0.44,7e-06,0.0921,0.531,80.87
1,STAY (with Justin Bieber),The Kid LAROI,0.591,0.764,1,-5.484,1,0.0483,0.0383,0.0,0.103,0.478,169.928
2,abcdefu,GAYLE,0.695,0.54,4,-5.692,1,0.0493,0.299,0.0,0.367,0.415,121.932
3,Enemy (with JID) - from the series Arcane Leag...,Imagine Dragons,0.728,0.783,11,-4.424,0,0.266,0.237,0.0,0.434,0.555,77.011
4,Cold Heart - PNAU Remix,Elton John,0.795,0.8,1,-6.32,1,0.0309,0.0354,7.3e-05,0.0915,0.934,116.032


In [9]:
# Several playlists were merged, so the same title can appear more than once.
# Compare the row count before and after keeping only the first row per title.
total_rows = len(audioFeatures)
unique_title_rows = len(audioFeatures.drop_duplicates(subset=['title'], keep='first'))
print("Length of the dataset:", total_rows)
print("Length of the dataset without duplicates:", unique_title_rows)

Length of the dataset: 3456
Length of the dataset without duplicates: 2979


In [10]:
# Keep only the first occurrence of each title
audioFeatures2 = audioFeatures.drop_duplicates(subset=['title'], keep='first')

In [11]:
# Drop the artist column because it is not needed for the clustering.
# errors='ignore' keeps this cell re-runnable: on a second run the column is
# already gone and a plain drop would raise KeyError.
audioFeatures2 = audioFeatures2.drop(columns=['artist'], errors='ignore')

### KMeans 

In [12]:
def clustering_a_DF(audioFeaturesDF, n):
    """Assign every song in the frame to one of ``n`` KMeans clusters.

    Parameters
    ----------
    audioFeaturesDF : pandas.DataFrame
        Must contain a 'title' column; every other column (except an existing
        'cluster' column) is used as a feature and has to be numeric for the
        scaler.
    n : int
        Number of clusters.

    Returns
    -------
    pandas.DataFrame
        A copy of the input with a 'cluster' column holding each row's label.
        The input frame itself is left unmodified.
    """
    # Work on a copy so the caller's frame is not changed behind its back
    clustered = audioFeaturesDF.copy()
    # Features = everything except the title; a 'cluster' column left over from
    # a previous call is excluded so old labels are never used as a feature
    X_audio = clustered.drop(['title'], axis=1).drop(columns=['cluster'], errors='ignore')
    # Standardise the features before fitting KMeans on them
    X_prep = StandardScaler().fit_transform(X_audio)
    # Fixed random_state so repeated runs on the same data give the same labels
    kmeans = KMeans(n_clusters=n, random_state=1234)
    kmeans.fit(X_prep)
    clustered['cluster'] = kmeans.predict(X_prep)

    return clustered

In [13]:
# Snapshot of the de-duplicated features taken before the clustering cell runs;
# it is the base frame used later when an unlisted song has to be clustered
# together with the whole catalogue
audioFeatures3 = audioFeatures2.copy(deep=True)
audioFeatures3.head(n=5)

Unnamed: 0,title,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,Heat Waves,0.761,0.525,11,-6.9,1,0.0944,0.44,7e-06,0.0921,0.531,80.87
1,STAY (with Justin Bieber),0.591,0.764,1,-5.484,1,0.0483,0.0383,0.0,0.103,0.478,169.928
2,abcdefu,0.695,0.54,4,-5.692,1,0.0493,0.299,0.0,0.367,0.415,121.932
3,Enemy (with JID) - from the series Arcane Leag...,0.728,0.783,11,-4.424,0,0.266,0.237,0.0,0.434,0.555,77.011
4,Cold Heart - PNAU Remix,0.795,0.8,1,-6.32,1,0.0309,0.0354,7.3e-05,0.0915,0.934,116.032


In [14]:
# Cluster the songs into 4 groups.
# Any 'cluster' column from a previous run of this cell is dropped first, so
# re-running the cell never clusters on its own earlier labels.
audioFeatures2 = clustering_a_DF(audioFeatures2.drop(columns=['cluster'], errors='ignore'), 4)
audioFeatures2.head()

Unnamed: 0,title,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,cluster
0,Heat Waves,0.761,0.525,11,-6.9,1,0.0944,0.44,7e-06,0.0921,0.531,80.87,0
1,STAY (with Justin Bieber),0.591,0.764,1,-5.484,1,0.0483,0.0383,0.0,0.103,0.478,169.928,1
2,abcdefu,0.695,0.54,4,-5.692,1,0.0493,0.299,0.0,0.367,0.415,121.932,0
3,Enemy (with JID) - from the series Arcane Leag...,0.728,0.783,11,-4.424,0,0.266,0.237,0.0,0.434,0.555,77.011,2
4,Cold Heart - PNAU Remix,0.795,0.8,1,-6.32,1,0.0309,0.0354,7.3e-05,0.0915,0.934,116.032,1


#### Importing "HOT Songs" DF

Before continuing, I will add the code used to build the "Hot songs" list.

In [None]:
# One page per decade: 1900, 1910, ..., 2010
iterations = range(1900, 2011, 10)

# Successfully downloaded responses, one per decade page
pages = []

for i in iterations:
    # assemble the url, e.g. ".../one-hit-wonders-1950s"
    url = "https://playback.fm/one-hit-wonders-" + str(i) + "s"

    try:
        # download html with a get request; the timeout keeps a stalled
        # connection from blocking the notebook indefinitely
        response = requests.get(url, timeout=10)
    except requests.RequestException as error:
        print("Request failed for " + url + ": " + str(error))
    else:
        if response.ok:
            # store response into "pages" list
            pages.append(response)
        else:
            # an error page has no songs to parse, so report it and move on
            print("Status code " + str(response.status_code) + " for " + url)

    # respectful nap between requests
    sleep(randint(1,4))

In [None]:
from bs4 import BeautifulSoup

pages_parsed = []
titles = []
artists = []

for page in pages:
    # parse the downloaded html and keep the parsed page for later inspection
    soup = BeautifulSoup(page.content, "html.parser")
    pages_parsed.append(soup)
    # select only the blocks that hold the song information
    for container in soup.select("div.content.post"):
        # run each selector once per block instead of once per song
        title_links = container.select("p.song-title a")
        artist_tags = container.select("p.song-title strong")
        # pair titles with artists by position; zip stops at the shorter list,
        # so a block with a missing tag no longer raises IndexError
        for title_link, artist_tag in zip(title_links, artist_tags):
            # strip both fields so exact-match lookups on the title are not
            # defeated by surrounding whitespace
            titles.append(title_link.get_text().strip())
            artists.append(artist_tag.get_text().strip())

# DF of "Hot songs", without repeated (title, artist) rows
topsongs = pd.DataFrame({"title": titles,
                         "artist": artists
                         }).drop_duplicates()

topsongs.head()

In [16]:
# Load the previously saved "Hot songs" table (produced by the scraping cells
# above) instead of downloading the pages again
topsongs = pd.read_csv(filepath_or_buffer='HOTSongs.csv')

Now, I have the two lists:

    - "Hot songs list": topsongs
    - "List of many songs with their audio features, clustered": audioFeatures2

In [17]:
#I will create a function to add a song found on Spotify to the dataset and cluster it
def Clustering_a_Song(song, audio, audioFeatures, n_clusters=4):
    """Append one Spotify track to the catalogue and cluster the combined data.

    Parameters
    ----------
    song : dict
        Result of ``sp.search``; the first entry of ``song["tracks"]["items"]``
        provides the title.
    audio : list
        Result of ``sp.audio_features`` for that track; ``audio[0]`` provides
        the feature values.
    audioFeatures : pandas.DataFrame
        Catalogue with a 'title' column plus the audio-feature columns.
    n_clusters : int, default 4
        Number of KMeans clusters.

    Returns
    -------
    pandas.DataFrame
        The catalogue rows followed by the new song as the LAST row, with a
        'cluster' column computed by ``clustering_a_DF``.

    Raises
    ------
    ValueError
        If ``audio[0]`` is None, i.e. no audio features are available.
    """
    features = audio[0]
    if features is None:
        raise ValueError("No audio features available for this song")

    feature_names = ["danceability", "energy", "key", "loudness", "mode", "speechiness",
                     "acousticness", "instrumentalness", "liveness", "valence", "tempo"]

    #Creating a one-row DF with the title and the audio features of the song
    song_row = {"title": song["tracks"]["items"][0]["name"]}
    for name in feature_names:
        song_row[name] = features[name]
    SongAudioFeatures = pd.DataFrame([song_row])

    #Joining the song to the DF with all songs to find its cluster.
    #pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x;
    #stale labels are dropped so the new row does not get a NaN 'cluster' feature
    catalogue = audioFeatures.drop(columns=['cluster'], errors='ignore')
    combined = pd.concat([catalogue, SongAudioFeatures], ignore_index=True)

    return clustering_a_DF(combined, n_clusters)

In [18]:
def _pick_recommendation(candidates, exclude_title):
    """Return a random title from `candidates`, avoiding `exclude_title` when possible."""
    others = candidates[candidates != exclude_title]
    # If the excluded title is the only candidate, fall back to the full list
    pool = candidates if others.empty else others
    # .iloc[0] yields the plain value; to_string() is meant for display only
    return pool.sample().iloc[0]


def _show_recommendation(title):
    """Print the recommended title and, if Spotify finds it, a link to the track."""
    print("Recommended song:", title)
    #Looking for it on Spotify
    found = sp.search(q=title, limit=1)['tracks']['items']
    if len(found) > 0:
        print("You can find it here:", found[0]['external_urls']['spotify'])


#Asking for a Song
print("What is your song title?")
favSong = input()

#Looking for a Recommendation

#If it's in the "Hot songs" list, I will recommend another "Hot song"
if favSong in topsongs['title'].values:
    RecSong = _pick_recommendation(topsongs['title'], favSong)
    _show_recommendation(RecSong)

#If it's in the general list, I will recommend a song from its cluster
elif favSong in audioFeatures2['title'].values:
    songRow = audioFeatures2[audioFeatures2.title == favSong]
    PossSongs = audioFeatures2[audioFeatures2.cluster == songRow['cluster'].iloc[0]]
    RecSong = _pick_recommendation(PossSongs['title'], favSong)
    _show_recommendation(RecSong)

#If it isn't listed I will look for it on Spotify, cluster it and recommend a song from its cluster
else:
    #Searching for the song in Spotify
    song = sp.search(q=favSong, limit=1)
    found = song['tracks']['items']
    #Only ask for audio features when the search actually returned a track
    audio = sp.audio_features(found[0]['uri']) if len(found) > 0 else None
    #Checking that the song exists and that it has audio features
    if audio and audio[0] is not None:
        #Clustering the catalogue together with the new song
        audioFeaturesComp = Clustering_a_Song(song, audio, audioFeatures3)
        #Clustering_a_Song appends the new song as the last row; reading it by
        #position avoids relying on the typed title matching Spotify's title
        songRow = audioFeaturesComp.iloc[-1]
        #Recommend from the SAME clustering run: labels of this fit are not
        #guaranteed to line up with the labels stored in audioFeatures2
        catalogue = audioFeaturesComp.iloc[:-1]
        PossSongs = catalogue[catalogue.cluster == songRow['cluster']]
        if PossSongs.empty:
            #The song ended up alone in its cluster: recommend from everything
            PossSongs = catalogue
        RecSong = _pick_recommendation(PossSongs['title'], songRow['title'])
        _show_recommendation(RecSong)
    else:
        print("Your song is not listed on Spotify or its audio features are not available")

What is your song title?
Like a Rolling Stone
Recommended song: Pauk
You can find it here: https://open.spotify.com/track/4AA4xt0NvJCUaJYh9SoZ1y
