# Processing and Handling Data for Spotify Similar Song Finder

This notebook filters and samples the "Spotify Million Playlist Dataset".


A random sample of 5,000+ playlists will be used. **TODO:** fill in the resulting number of unique tracks once the sampling run is final.


### Imports


In [1]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

import pandas as pd
import numpy as np
from tqdm import tqdm


import os
import random
import json

from dotenv import load_dotenv


# Load variables from a local .env file into the process environment so the
# Spotify credentials never have to be hard-coded in the notebook.
load_dotenv()
client_id = os.getenv("SPOTIPY_CLIENT_ID")
client_secret = os.getenv("SPOTIPY_CLIENT_SECRET")

# Hand the credentials over explicitly instead of relying on
# SpotifyClientCredentials() re-reading the same environment variables behind
# the scenes; this makes the dependency on the two values above visible.
sp = spotipy.Spotify(
    client_credentials_manager=SpotifyClientCredentials(
        client_id=client_id,
        client_secret=client_secret,
    )
)

In [3]:

# Folder containing the dataset's JSON files. The MPD_DATA_DIR environment
# variable can override the location; the original local path stays the default.
directory = os.getenv(
    'MPD_DATA_DIR',
    r'D:\Python Projects\XL Datasets\Spotify Million Playlist\spotify_million_playlist_dataset\data',
)
num_samples = 5  # Number of files to sample
random_seed = 42  # Fixed seed so a fresh "Restart & Run All" picks the same files

# Keep only JSON files and sort them: os.listdir() returns entries in arbitrary
# order and may include non-JSON files that json.load() cannot parse.
json_files = sorted(
    name for name in os.listdir(directory) if name.lower().endswith('.json')
)

# Sample files randomly. The count is clamped because random.sample() raises
# ValueError when asked for more items than are available. A dedicated Random
# instance keeps the global random state untouched.
sampled_files = random.Random(random_seed).sample(
    json_files, min(num_samples, len(json_files))
)

unique_track_uris = set()

for file_name in sampled_files:
    # JSON text is UTF-8; the platform default encoding (e.g. cp1252 on Windows)
    # can fail or mis-decode non-ASCII artist and track names.
    with open(os.path.join(directory, file_name), 'r', encoding='utf-8') as file:
        data = json.load(file)

    # The file is fully parsed at this point, so the handle is already closed
    # while the playlists are walked.
    for playlist in data['playlists']:
        unique_track_uris.update(track['track_uri'] for track in playlist['tracks'])


In [35]:
# sp.tracks() accepts at most 50 IDs per request and sp.audio_features() up to
# 100, so batches of 50 are valid for both. Batching replaces two HTTP requests
# per track with two per 50 tracks, which matters for API rate limits.
TRACK_BATCH_SIZE = 50

track_data = []
track_uri_list = list(unique_track_uris)  # a set cannot be sliced into batches

for start in tqdm(range(0, len(track_uri_list), TRACK_BATCH_SIZE), desc='Fetching track data'):
    uri_batch = track_uri_list[start:start + TRACK_BATCH_SIZE]

    # Both responses list their items in the same order as the requested URIs,
    # so they can be paired positionally.
    track_infos = sp.tracks(uri_batch)['tracks']
    feature_rows = sp.audio_features(uri_batch)

    for track_info, audio_features in zip(track_infos, feature_rows):
        # Spotify returns None for tracks it cannot resolve or has no audio
        # analysis for; skip those instead of crashing on a None subscript.
        if track_info is None or audio_features is None:
            continue

        track_data.append({
            'artist_name': track_info['artists'][0]['name'],  # first credited artist only
            'track_name': track_info['name'],
            'danceability': audio_features['danceability'],
            'energy': audio_features['energy'],
            'key': audio_features['key'],
            'loudness': audio_features['loudness'],
            'mode': audio_features['mode'],
            'speechiness': audio_features['speechiness'],
            'acousticness': audio_features['acousticness'],
            'instrumentalness': audio_features['instrumentalness'],
            'liveness': audio_features['liveness'],
            'valence': audio_features['valence'],
            'tempo': audio_features['tempo'],
            'uri': audio_features['uri'],
        })

          artist_name                         track_name  danceability  \
0    Hollywood Undead                               Lion         0.477   
1          Housefires                   Good Good Father         0.258   
2     Trisha Yearwood                       Walkaway Joe         0.619   
3       The Neon Crew  Back in Black (From "Iron Man 2")         0.505   
4   The Buffalo Chips                       Viva La Vida         0.522   
5           JJ Flores         Stay - John Dahlbäck Remix         0.819   
6          The Nylons                     God Only Knows         0.461   
7       Justin Bieber               One Less Lonely Girl         0.580   
8        Alan Jackson          Chasin' That Neon Rainbow         0.719   
9            Hellberg                           Back2You         0.559   
10          Lady Gaga                           Applause         0.670   
11           PnB Rock                 Feelin' Like Diddy         0.798   
12   Enrique Iglesias                 

In [36]:
# Build the table in one go from the collected records. The column list is
# spelled out so the frame has the expected schema (and column order) even
# when track_data is empty, instead of a column-less DataFrame.
df = pd.DataFrame(
    track_data,
    columns=[
        'artist_name', 'track_name', 'danceability', 'energy', 'key',
        'loudness', 'mode', 'speechiness', 'acousticness',
        'instrumentalness', 'liveness', 'valence', 'tempo', 'uri',
    ],
)

In [39]:
# Sanity check: (number of rows, number of columns) of the track table.
df.shape

(15, 14)

In [38]:
# Preview the first rows to eyeball the fetched names and audio features.
df.head()

Unnamed: 0,artist_name,track_name,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,uri
0,Hollywood Undead,Lion,0.477,0.897,6,-5.607,1,0.117,0.00212,0.0,0.0724,0.516,168.975,spotify:track:3G6b0ytwf3kluuupwAnSBL
1,Housefires,Good Good Father,0.258,0.571,9,-8.081,1,0.0415,0.638,0.0,0.141,0.175,144.841,spotify:track:1hYtQci8161VCPhfTJO0MA
2,Trisha Yearwood,Walkaway Joe,0.619,0.405,0,-10.367,1,0.0263,0.885,0.00032,0.0718,0.428,114.051,spotify:track:5pzoq7ggrA0GN3SMjy0NT8
3,The Neon Crew,"Back in Black (From ""Iron Man 2"")",0.505,0.654,9,-7.57,1,0.0277,0.0057,0.000547,0.0524,0.815,93.457,spotify:track:6JzkxKjghgLsj14OGz7uKZ
4,The Buffalo Chips,Viva La Vida,0.522,0.695,8,-8.578,1,0.0795,0.769,0.0,0.124,0.186,136.95,spotify:track:3h7QyvRCvDzaq5byDWMulH
