In [1]:
# Importing necessary libraries and settings
import time
import os
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
%config InlineBackend.figure_format ='retina'
import random
from functools import reduce
import spotipy
import spotipy.util as util
from spotipy.oauth2 import SpotifyClientCredentials
from spotipy import oauth2

In [2]:
# Insert your Spotify username and the credentials that you obtained from spotify developer
cid = 'e5164fbe5751433e874d5fd5b59efc3b'
secret = 'bde09ec2fbe044f689e9b64db6dcf24a'
redirect_uri='http://localhost:8888/callback'
username = 'c3drys7oy7f88tijr6fs0eg0i'

In [3]:
# Once the Authorisation is complete, we just need to `sp` to call the APIs
scope = 'user-top-read playlist-modify-private playlist-modify-public'
token = util.prompt_for_user_token(username, scope, client_id=cid, client_secret=secret, redirect_uri=redirect_uri)

if token:
    sp = spotipy.Spotify(auth=token)
else:
    print("Can't get token for", username)

In [4]:
# Getting features for each song
def fetch_audio_features(sp, df):
    playlist = df[['track_id','track_name']] 
    index = 0
    audio_features = []
    
    # Make the API request
    while index < playlist.shape[0]:
        audio_features += sp.audio_features(playlist.iloc[index:index + 50, 0])
        index += 50
    
    # Create an empty list to feed in different charactieritcs of the tracks
    features_list = []
    #Create keys-values of empty lists inside nested dictionary for album
    for features in audio_features:
        features_list.append([features['danceability'],
                              features['acousticness'],
                              features['energy'], 
                              features['tempo'],
                              features['instrumentalness'], 
                              features['loudness'],
                              features['liveness'],
                              features['duration_ms'],
                              features['key'],
                              features['valence'],
                              features['speechiness'],
                              features['mode']
                             ])
    
    df_audio_features = pd.DataFrame(features_list, columns=['danceability', 'acousticness', 'energy','tempo', 
                                                             'instrumentalness', 'loudness', 'liveness','duration_ms', 'key',
                                                             'valence', 'speechiness', 'mode'])
    
    # Create the final df, using the 'track_id' as index for future reference
    df_playlist_audio_features = pd.concat([playlist, df_audio_features], axis=1)
    df_playlist_audio_features.set_index('track_name', inplace=True, drop=True)
    return df_playlist_audio_features

### Getting the songs from all of Spotify's playlists

The following cells are collecting the songs from all of the playlists from the official Spotify account. In this repo, I've curated a dataframe of approximately 10,000 songs. If you would like to add more songs, modify the cells below. I've commented them out because they take a long time to run and are unnecessary if you'd like to use the dataframe I've already built.

In [5]:
# Getting playlist IDs from each of Spotify's playlists
playlists = sp.user_playlists('spotify')
spotify_playlist_ids = []
while playlists:
    for i, playlist in enumerate(playlists['items']):
        spotify_playlist_ids.append(playlist['uri'][-22:])
    if playlists['next']:
        playlists = sp.next(playlists)
    else:
        playlists = None
spotify_playlist_ids[:20]

['37i9dQZF1DXcBWIGoYBM5M',
 '37i9dQZF1DX0XUsuxWHRQd',
 '37i9dQZF1DX1lVhptIYRda',
 '37i9dQZF1DX10zKzsJ2jva',
 '37i9dQZF1DX4JAvHpjipBk',
 '37i9dQZF1DX4sWSpwq3LiO',
 '37i9dQZF1DX4SBhb3fqCJd',
 '37i9dQZF1DWXRqgorJj26U',
 '37i9dQZF1DX4dyzvuaRJ0n',
 '37i9dQZF1DXcF6B6QPhFDv',
 '37i9dQZF1DWXJfnUiYjUKT',
 '37i9dQZF1DXcRXFNfZr7Tp',
 '37i9dQZF1DX4o1oenSJRJd',
 '37i9dQZF1DXbTxeAdrVG2l',
 '37i9dQZF1DX4UtSsGT1Sbe',
 '37i9dQZF1DWTJ7xPn4vNaz',
 '37i9dQZF1DXaKIA8E7WcJj',
 '37i9dQZF1DWSV3Tk4GO2fq',
 '37i9dQZF1DWTwnEm1IYyoj',
 '37i9dQZF1DX2A29LI7xHn1']

In [6]:
len(spotify_playlist_ids)

1196

### Getting tracks from Spotify playlists

In [61]:
# Creating a function to get the first 50 tracks IDs from a playlist
def getTrackIDs(playlist_id):
    playlist = sp.user_playlist('spotify', playlist_id)
    for item in playlist['tracks']['items'][:1]:
        track = item['track']
        ids.append(track['id'])
    return

In [62]:
# Creating a function get features of each track from track id
def getTrackFeatures(track_id):
  meta = sp.track(track_id)
  features = sp.audio_features(track_id)

  # meta
  track_id = track_id
  name = meta['name']
  album = meta['album']['name']
  artist = meta['album']['artists'][0]['name']
  release_date = meta['album']['release_date']
  length = meta['duration_ms']
  popularity = meta['popularity']

  # features
  acousticness = features[0]['acousticness']
  danceability = features[0]['danceability']
  energy = features[0]['energy']
  instrumentalness = features[0]['instrumentalness']
  liveness = features[0]['liveness']
  loudness = features[0]['loudness']
  speechiness = features[0]['speechiness']
  tempo = features[0]['tempo']
  time_signature = features[0]['time_signature']

  track = [track_id, name, album, artist, release_date, length, popularity, danceability, acousticness, energy, instrumentalness, liveness, loudness, speechiness, tempo, time_signature]
  return track

The cell below takes about five minutes to run.

In [63]:
%%time
# Gathering track ids
ids = []
for x in spotify_playlist_ids[:200]:
    getTrackIDs(x)
ids[:5]

CPU times: total: 3.08 s
Wall time: 57.6 s


['4LRPiXqCikLlN15c3yImP7',
 '3ihIZrJreMJPjQdNLrEXnP',
 '3WMj8moIAXJhHsyLaqIIHI',
 '7dSZ6zGTQx66c2GF91xCrb',
 '1qEmFfgcLObUfQm0j1W2CK']

The cell below takes about 30 minutes to run.

In [65]:
%%time
# loop over track ids to get audio features for each track
tracks = []
for i in range(len(ids)):
    try:  
        track = getTrackFeatures(ids[i])
        tracks.append(track)
    except:
        pass

# create dataset
df = pd.DataFrame(tracks, columns = ['track_id', 'name', 'album', 'artist', 'release_date', 'length', 'popularity', 'danceability', 'acousticness', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'time_signature'])
df.head()

CPU times: total: 2.62 s
Wall time: 1min 13s


200

In [66]:
len(df)

200

In [67]:
df.to_csv('playlist_songs.csv',index=False)

In [68]:
df = pd.read_csv('data/playlist_songs.csv')
df.head()

Unnamed: 0,track_id,name,album,artist,release_date,length,popularity,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature
0,7MAibcTli4IisCtbHKrGMh,Leave The Door Open,Leave The Door Open,Bruno Mars,2021-03-05,242096,90,0.586,0.182,0.616,0.0,0.0927,-7.964,0.0324,148.088,4
1,5QO79kh1waicV47BqGRL3g,Save Your Tears,After Hours,The Weeknd,2020-03-20,215626,97,0.68,0.0212,0.826,1.2e-05,0.543,-5.487,0.0309,118.051,4
2,1diS6nkxMQc3wwC4G1j0bh,We're Good,Future Nostalgia (The Moonlight Edition),Dua Lipa,2021-02-11,165506,88,0.722,0.0319,0.588,0.0,0.183,-5.932,0.0544,134.01,4
3,4u4NyuceXP7Uzh7XFJKCr1,Hold On,Hold On,Justin Bieber,2021-03-05,170813,89,0.658,0.0106,0.634,0.0,0.132,-5.797,0.0413,139.98,4
4,3Ofmpyhv5UAQ70mENzB277,Astronaut In The Ocean,Astronaut In The Ocean,Masked Wolf,2021-01-06,132780,94,0.778,0.175,0.695,0.0,0.15,-6.865,0.0913,149.996,4


In [69]:
# Dropping columns that could lead to data leakage
df = df.drop(columns=['name', 'album', 'artist', 'release_date'])
df.head()

Unnamed: 0,track_id,length,popularity,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature
0,7MAibcTli4IisCtbHKrGMh,242096,90,0.586,0.182,0.616,0.0,0.0927,-7.964,0.0324,148.088,4
1,5QO79kh1waicV47BqGRL3g,215626,97,0.68,0.0212,0.826,1.2e-05,0.543,-5.487,0.0309,118.051,4
2,1diS6nkxMQc3wwC4G1j0bh,165506,88,0.722,0.0319,0.588,0.0,0.183,-5.932,0.0544,134.01,4
3,4u4NyuceXP7Uzh7XFJKCr1,170813,89,0.658,0.0106,0.634,0.0,0.132,-5.797,0.0413,139.98,4
4,3Ofmpyhv5UAQ70mENzB277,132780,94,0.778,0.175,0.695,0.0,0.15,-6.865,0.0913,149.996,4


In [70]:
# Dropping duplicated songs
df = df.drop_duplicates(subset=['track_id'])
df['track_id'].value_counts()

7MAibcTli4IisCtbHKrGMh    1
7KT7VGnPU5QVXN3q1BOeqb    1
2rkCYgvzyHp1AESIlJcqqY    1
7EYLrneh08x29IAWLl7Tst    1
2x4i2WnE05D2Z7yEeytmPr    1
                         ..
2Xa0oBYSZSXNk4DtNyrcYf    1
48O6kz322Dzu1R6Al5147q    1
3MDvya4tKmSsyPW9oXygB4    1
4OsLDuaH0bWR6xM6nj66F8    1
0jo2BDAxfgNrL5VplFuulx    1
Name: track_id, Length: 8883, dtype: int64

## Getting user's favorite tracks

In [71]:
# Getting top 50 tracks from user
results = sp.current_user_top_tracks(limit=1000, offset=0,time_range='short_term')

In [72]:
# Convert it to Dataframe
track_name = []
track_id = []
artist = []
album = []
duration = []
popularity = []
for i, items in enumerate(results['items']):
        track_name.append(items['name'])
        track_id.append(items['id'])
        artist.append(items["artists"][0]["name"])
        duration.append(items["duration_ms"])
        album.append(items["album"]["name"])
        popularity.append(items["popularity"])

# Create the final df   
df_favourite = pd.DataFrame({ "track_name": track_name, 
                             "album": album, 
                             "track_id": track_id,
                             "artist": artist, 
                             "duration": duration, 
                             "popularity": popularity})

df_favourite.head()

Unnamed: 0,track_name,album,track_id,artist,duration,popularity
0,You Said,You Said,2k5rWMAz9RvFS3k3kk3cc5,Connor Price,135000,64
1,Aankhon Se Batana,Aankhon Se Batana,1ZiReD9pPTttQWwSoYqdyH,Dikshant,221271,67
2,Love Language,Love Language,526SpJgLqe5JZtyXNo0Ic6,Connor Price,170880,50
3,lead me on,lead me on,45HHTHXv7gQ5q2r89ui2Fy,sammy rash,127802,59
4,IDGAF (with blackbear),IDGAF,6Jrdb6CFOJEGaHjaa6c4WR,BoyWithUke,140665,81


In [73]:
%%time
# Getting track features for each song in favorite song dataframe
fav_tracks = []
for track in df_favourite['track_id']:
    try:  
        track = getTrackFeatures(track)
        fav_tracks.append(track)
    except:
        pass

CPU times: total: 500 ms
Wall time: 19.1 s


In [74]:
# Create favorite track with audio features dataset
df_fav = pd.DataFrame(fav_tracks, columns = ['track_id', 'name', 'album', 'artist', 'release_date', 'length', 'popularity', 'danceability', 'acousticness', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'time_signature'])
df_fav.head()

Unnamed: 0,track_id,name,album,artist,release_date,length,popularity,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature
0,2k5rWMAz9RvFS3k3kk3cc5,You Said,You Said,Connor Price,2021-07-09,135000,64,0.726,0.141,0.481,0.0,0.155,-8.254,0.207,79.978,4
1,1ZiReD9pPTttQWwSoYqdyH,Aankhon Se Batana,Aankhon Se Batana,Dikshant,2022-04-12,221271,67,0.544,0.791,0.307,0.0,0.11,-11.361,0.0334,177.808,4
2,526SpJgLqe5JZtyXNo0Ic6,Love Language,Love Language,Connor Price,2020-06-19,170880,50,0.75,0.326,0.401,0.177,0.269,-9.105,0.305,139.695,4
3,45HHTHXv7gQ5q2r89ui2Fy,lead me on,lead me on,sammy rash,2021-11-05,127802,59,0.787,0.566,0.478,0.0,0.147,-8.761,0.092,79.029,4
4,6Jrdb6CFOJEGaHjaa6c4WR,IDGAF (with blackbear),IDGAF,BoyWithUke,2022-03-18,140665,81,0.783,0.426,0.75,0.0,0.241,-5.762,0.078,97.953,4


In [75]:
# Dropping columns that could lead to data leakage
df_fav = df_fav.drop(columns=['name', 'album', 'artist', 'release_date'])
df_fav.head()

Unnamed: 0,track_id,length,popularity,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature
0,2k5rWMAz9RvFS3k3kk3cc5,135000,64,0.726,0.141,0.481,0.0,0.155,-8.254,0.207,79.978,4
1,1ZiReD9pPTttQWwSoYqdyH,221271,67,0.544,0.791,0.307,0.0,0.11,-11.361,0.0334,177.808,4
2,526SpJgLqe5JZtyXNo0Ic6,170880,50,0.75,0.326,0.401,0.177,0.269,-9.105,0.305,139.695,4
3,45HHTHXv7gQ5q2r89ui2Fy,127802,59,0.787,0.566,0.478,0.0,0.147,-8.761,0.092,79.029,4
4,6Jrdb6CFOJEGaHjaa6c4WR,140665,81,0.783,0.426,0.75,0.0,0.241,-5.762,0.078,97.953,4


In [76]:
# Checking for duplicates in df_fav
df_fav['track_id'].value_counts()

2k5rWMAz9RvFS3k3kk3cc5    1
3TfJtvgXTC0BIsRGXD3SLz    1
3jvqqFqmRnZKZJXnOKV4zp    1
0WXyJHcgaibGLH1xoiWDIa    1
4T6FWA703h6H7zk1FoSARw    1
0KSOLEBixnBYIKNHF1VbzF    1
1O5mTSQa0sCce9ghhDqIZl    1
2SUxn2O9NHL6GHGQFgwCY0    1
45bE4HXI0AwGZXfZtMp8JR    1
4mjTMmz0j6Wl3GhODoPqTx    1
4LMlIX9Y8qziRafzW30mGc    1
4eDtZP99H6xfasP4Tku9Ee    1
2PR4jVXzyjCky2PkONvipq    1
1ZiReD9pPTttQWwSoYqdyH    1
0gGwIl6b1PRp2bE9wCbWa3    1
1EWLY8o2kEQRR3OlPoLnns    1
3Ofmpyhv5UAQ70mENzB277    1
0NvGVd87SYZnryTcfQPKUO    1
5Cr5e4NGjmbcem8nugnf4i    1
01UeBfllW39eaKdiCw556R    1
4SLWJ2ff0RHSJOSp2MPIHH    1
0s76ExpXyMGVBlKLUr683e    1
0OPIBKjAYBFzNpo1HzjqzQ    1
0NLkVxf0PyxsXBG3EuZcJf    1
5nTbPFqLKmQdIg1SD8KgG4    1
7lvDsmTRXFE3dK4OjvRiWB    1
1RlmAzI27AlIpvoRN3u6Iy    1
47BBI51FKFwOMlIiX6m8ya    1
526SpJgLqe5JZtyXNo0Ic6    1
45HHTHXv7gQ5q2r89ui2Fy    1
6Jrdb6CFOJEGaHjaa6c4WR    1
3afkJSKX0EAMsJXTZnDXXJ    1
7mxEWcW3A0jULs64ckyq9k    1
5PiPZVKX9xIeqfg7sT0vpZ    1
560V8xbikuV7YVXqbpDGOv    1
7i2theCezASheQ8KmXRc

In [77]:
# Creating favorite column to use in classification
df_fav['favorite'] = 1
df['favorite'] = 0 

In [78]:
# Checking if both datasets have the same columns
df.columns == df_fav.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True])

## Preparing dataset for model creation

In [79]:
print(df.shape)
print(df_fav.shape)

(8883, 13)
(50, 13)


In [80]:
# Combining the favorite dataframe with the dataframe of Spotify's songs
combined = pd.concat([df, df_fav])
combined.shape

(8933, 13)

In [81]:
combined.favorite.value_counts()

0    8883
1      50
Name: favorite, dtype: int64

The favorite songs to not favorite songs ratio is imbalanced, I will need to fix this when building the model

In [82]:
# Creating dataframe of favorite songs
df_fav = combined.loc[combined['favorite'] == 1]
df_fav.head()

Unnamed: 0,track_id,length,popularity,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature,favorite
0,2k5rWMAz9RvFS3k3kk3cc5,135000,64,0.726,0.141,0.481,0.0,0.155,-8.254,0.207,79.978,4,1
1,1ZiReD9pPTttQWwSoYqdyH,221271,67,0.544,0.791,0.307,0.0,0.11,-11.361,0.0334,177.808,4,1
2,526SpJgLqe5JZtyXNo0Ic6,170880,50,0.75,0.326,0.401,0.177,0.269,-9.105,0.305,139.695,4,1
3,45HHTHXv7gQ5q2r89ui2Fy,127802,59,0.787,0.566,0.478,0.0,0.147,-8.761,0.092,79.029,4,1
4,6Jrdb6CFOJEGaHjaa6c4WR,140665,81,0.783,0.426,0.75,0.0,0.241,-5.762,0.078,97.953,4,1


In [83]:
# Removing favorite songs from playlist songs
df = combined.loc[combined['favorite'] != 1]
df.shape

(8883, 13)

In [84]:
df_fav.shape

(50, 13)

In [85]:
# Saving these dataframes to use in model creation
df.to_csv('encoded_playlist_songs.csv', index=False)
df_fav.to_csv('favorite_songs.csv', index=False)