# Processing and Handling Data for Spotify Similar Song Finder

This notebook was created to filter and sample the "Spotify Million Playlists" dataset.


Random sampling of 1000 playlists will be used, with XXXXXXXXXXXXXXXXXXXXXXXXX number of tracks.


### Imports


In [1]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

import pandas as pd
import numpy as np
from tqdm import tqdm


import os
import random
import json
import time

from dotenv import load_dotenv


load_dotenv()
client_id = os.getenv("SPOTIPY_CLIENT_ID")
client_secret = os.getenv("SPOTIPY_CLIENT_SECRET")

sp = spotipy.Spotify(client_credentials_manager=SpotifyClientCredentials())

In [2]:

directory = r'D:\Python Projects\XL Datasets\Spotify Million Playlist\spotify_million_playlist_dataset\data'
num_samples = 1  # Number of files to sample

json_files = os.listdir(directory)
sampled_files = random.sample(json_files, num_samples) # Sample files randomly

unique_track_uris = set()

for file_name in sampled_files:
    with open(os.path.join(directory, file_name), 'r') as file:
        data = json.load(file)
        for playlist in data['playlists']:
            for track in playlist['tracks']:
                track_uri = track['track_uri']
                unique_track_uris.add(track_uri)


In [3]:
print(len(unique_track_uris))

35493


In [4]:

track_data = []

index = 0
for track_uri in unique_track_uris:
    track_info = sp.track(track_uri)
    
    artist_name = track_info['artists'][0]['name']
    track_name = track_info['name']
    
    audio_features = sp.audio_features(track_uri)[0]
    try:
        track_dict = {
            'artist_name': artist_name,
            'track_name': track_name,
            'uri' : audio_features['uri'],
            'acousticness': audio_features['acousticness'],
            'danceability': audio_features['danceability'],
            'energy': audio_features['energy'],
            'instrumentalness': audio_features['instrumentalness'],
            'liveness': audio_features['liveness'],
            'loudness': audio_features['loudness'],
            'speechiness': audio_features['speechiness'],
            'tempo': audio_features['tempo'],
            'valence': audio_features['valence'],
            'key': audio_features['key'],
            'mode': audio_features['mode']
            
        }
        
        track_data.append(track_dict)
    
    except Exception as e:
        print(f"Error processing track {track_uri}: {e}")
        continue
    
    time.sleep(0.2)



Max Retries reached


SpotifyException: http status: 429, code:-1 - /v1/audio-features/?ids=5DO1SazQppcsKQ1c1JpyQz:
 Max Retries, reason: too many 429 error responses

In [None]:
df = pd.DataFrame(track_data) # I have used a pre-made CSV in this project as I cannot get past Spotify's API rate limit.

In [None]:
df.shape # It's taken 8473 tracks out of 35933. I'll have to run it again to get the rest of the data.

(8473, 14)

In [None]:
df

Unnamed: 0,artist_name,track_name,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,uri
0,Rend Collective,Rescuer (Good News),0.491,0.7460,7,-4.872,1,0.0565,0.39600,0.000000,0.3680,0.6270,152.022,spotify:track:32f24TdkpHx8L0EZX2HbP4
1,Yuna,Lullabies - Adventure Club Remix,0.403,0.6980,9,-5.058,0,0.0357,0.00171,0.001260,0.2560,0.0833,159.869,spotify:track:3RJuptH9QoeJWzHd4ZCI0u
2,Tech N9ne,Shut the F**k Up (Skit),0.000,0.9500,9,-8.761,1,0.0000,0.51000,0.071200,0.8720,0.0000,0.000,spotify:track:4oiXR2h98f196Ov2wCjCJS
3,Bayside,Landing Feet First,0.436,0.3320,9,-7.649,1,0.0406,0.00434,0.000000,0.1090,0.3290,174.901,spotify:track:3y0ycPzjAE6R66u3aw8MBk
4,Iamsu!,Back On My BS (feat. Skipper & Problem),0.755,0.8090,7,-5.954,0,0.1660,0.17400,0.000000,0.1810,0.5490,98.035,spotify:track:3AtNkheraHcfe0ubFQ9Xp1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8468,Huey Lewis & The News,If This Is It,0.512,0.4930,7,-12.964,1,0.0276,0.15500,0.000005,0.1280,0.8360,146.668,spotify:track:6xBedrkburGrmhQGvlBfoH
8469,Murry Jonathan Gold,Doctor Who 'I Am The Doctor' For Orchestra,0.589,0.7920,2,-5.145,0,0.0337,0.06240,0.881000,0.0602,0.3890,90.981,spotify:track:2aDZw6PUiIBcIYZ9d47jwa
8470,Musica de Piano Escuela,Kiss the Rain,0.573,0.0707,8,-22.578,1,0.0469,0.98000,0.939000,0.1110,0.2030,131.016,spotify:track:7JpMDy3skWcOYdGDvE9ng9
8471,Piano Tribute Players,She Will Be Loved,0.485,0.2990,0,-18.445,0,0.0658,0.99200,0.924000,0.1120,0.1260,204.036,spotify:track:0aabUVAPbLV6sLtnM1lb8B


In [None]:
df.to_csv('../data/spotify_data.csv', index=False)