In [2]:
# First install spotipy in the terminal
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import json
import numpy as np
import pandas as pd

In [3]:
# Read the Spotify client ID and client secret from a separate JSON file.
# The with-block closes the file handle as soon as the JSON has been parsed
# (a bare open() inside json.load() would leave it open).
with open('authorization.json') as credentials_file:
    credentials = json.load(credentials_file)
client_id = credentials['client_id']
client_secret = credentials['client_secret']

In [177]:
playlist_index = 39  # index of the playlist to process (the JSON file is noted to hold 40 URIs)

# Load the list of playlists; the with-block closes the file once it is parsed.
with open('playlists.json') as f:
    playlists = json.load(f)

# Each entry is indexed by position and carries its Spotify URI under 'uri'
playlist_uri = playlists[playlist_index]['uri']

In [178]:
# Build the client-credentials manager from the app's ID and secret, then
# create the Spotify API client that the following cells call through `sp`.
client_credentials_manager = SpotifyClientCredentials(
    client_id=client_id,
    client_secret=client_secret,
)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [179]:
# The URI is colon-separated; its third field is used as the playlist ID
playlist_uri_parts = playlist_uri.split(':')
playlist_id = playlist_uri_parts[2]

# Request only the 'tracks' field of the playlist object.
# NOTE(review): this looks like a single request with no paging, so a long
# playlist may come back truncated - confirm against the Spotify API limits.
results = sp.playlist(playlist_id, fields='tracks')

In [180]:
# Pull the track ID, title and artist name(s) out of every playlist item
playlist_tracks_data = results['tracks']
track_entries = [item['track'] for item in playlist_tracks_data['items']]

playlist_tracks_id = [entry['id'] for entry in track_entries]
playlist_tracks_titles = [entry['name'] for entry in track_entries]

# One inner list per track, holding the names of all credited artists
playlist_tracks_artists = [
    [artist['name'] for artist in entry['artists']]
    for entry in track_entries
]
# The first credited artist of each track
playlist_tracks_first_artists = [names[0] for names in playlist_tracks_artists]

In [181]:
# Genre information is taken from the track's first artist (there is no genre
# information per track) and the release date from the album the track is on
# (there is no release date per track).
genre_artist = []
release_date_album = []

for item in playlist_tracks_data['items']:
    track_info = item['track']

    # Genres of the first credited artist
    artist_info = sp.artist(track_info['artists'][0]['id'])
    genre_artist.append(artist_info['genres'])

    # Release date of the track's album
    album_info = sp.album(track_info['album']['id'])
    release_date_album.append(album_info['release_date'])

In [182]:
# Fetch the audio features for all track IDs in a single call
features = sp.audio_features(playlist_tracks_id)

# One row per track; column names and order follow the keys of the first record
feature_columns = features[0].keys()
features_df = pd.DataFrame(features, columns=feature_columns)

In [183]:
# Fetch the audio analysis of each track and record how many bars, sections
# and segments it contains.
num_bars = []
num_sections = []
num_segments = []

# Iterate over the ID column directly instead of building a row Series with
# .iloc[i] for every position; the IDs are visited in the same order.
for track_id in features_df['id']:
    analysis = sp.audio_analysis(track_id)
    num_bars.append(len(analysis['bars']))  # original note: beats/time_signature
    num_sections.append(len(analysis['sections']))
    num_segments.append(len(analysis['segments']))


In [184]:
# Combine the audio features with the track metadata, the analysis counts,
# and the artist genre / album release date into one dataframe.
leading_columns = [
    'id', 'title', 'first_artist', 'all_artists',
    'danceability', 'energy', 'key', 'loudness',
    'mode', 'acousticness', 'instrumentalness',
    'liveness', 'valence', 'tempo',
    'duration_ms', 'time_signature',
]

# Attach the descriptive columns, then keep only the columns listed above, in that order
features_df = features_df.assign(
    title=playlist_tracks_titles,
    first_artist=playlist_tracks_first_artists,
    all_artists=playlist_tracks_artists,
)[leading_columns]

# Append the remaining columns to the right of the selected ones
features_df = features_df.assign(
    num_bars=num_bars,
    num_sections=num_sections,
    num_segments=num_segments,
    genres_artist=genre_artist,
    release_date_album=release_date_album,
)

# Preview the first rows of the result
features_df.head()

Unnamed: 0,id,title,first_artist,all_artists,danceability,energy,key,loudness,mode,acousticness,...,liveness,valence,tempo,duration_ms,time_signature,num_bars,num_sections,num_segments,genres_artist,release_date_album
0,2d8D7uk3tbAThjRkdfrx9c,Voyage voyage,Desireless,[Desireless],0.606,0.913,8,-6.039,0,0.246,...,0.34,0.838,123.58,266600,4,133,11,1057,"[french synthpop, italian disco]",2001-01-02
1,23l1kVpqMVREiwU1YAlcr4,I Guess That's Why They Call It The Blues,Elton John,[Elton John],0.673,0.663,0,-7.29,1,0.217,...,0.177,0.671,120.634,285333,3,186,16,865,"[glam rock, mellow gold, piano rock, rock]",1983-05-30
2,1WeoeHh0TSzsApyJ6Q8OOK,I'll Be Over You,TOTO,[TOTO],0.515,0.277,10,-17.85,1,0.062,...,0.258,0.413,82.167,229880,4,78,12,741,"[album rock, classic rock, hard rock, mellow g...",1986-08
3,48p5E25cFPanxuwCTmTpuL,The Promise,When In Rome,[When In Rome],0.629,0.876,0,-10.141,1,0.0864,...,0.0431,0.358,118.01,220960,4,104,8,1025,"[new romantic, new wave, synthpop]",1988-01-01
4,5AhRNIjYPBleR1lfHphcrE,The Edge of Heaven,Wham!,[Wham!],0.493,0.665,9,-13.817,0,0.117,...,0.159,0.877,152.644,270800,4,171,11,894,"[new romantic, new wave, new wave pop, soft ro...",1986-07-01


In [185]:
# Save this playlist's dataframe as playlist_<index>.csv.
# index must be the boolean False: the string "false" is truthy, so it would
# still write the row index as an extra header-less first column.
features_df.to_csv("playlist_" + str(playlist_index) + ".csv", encoding='utf-8', index=False)

In [186]:
# Now let's combine all the csv files

import glob
import os

# Collect the paths of all per-playlist CSV files in the working directory.
# glob returns the matching paths in arbitrary order, so sort them to make the
# row order of the combined dataframe reproducible between runs.
files = sorted(glob.glob("playlist_*.csv"))

In [191]:
# Concatenate every per-playlist CSV into one dataframe with a fresh 0..n-1 index
full_df = pd.concat(map(pd.read_csv, files), ignore_index=True)

# A CSV saved together with its row index has a header-less first column,
# which read_csv names "Unnamed: 0". It is not a song attribute, so drop any
# such column instead of carrying it into the combined dataset.
full_df = full_df.loc[:, ~full_df.columns.astype(str).str.match(r'Unnamed: \d+$')]

# The same song can appear in several playlists (3289 rows before
# de-duplication in the recorded run), so keep one row per unique track ID.
full_df = full_df.drop_duplicates(subset=['id'])

# Number of unique songs (2263 in the recorded run)
len(full_df)

2263

In [192]:
# Save the combined dataset as a new CSV.
# index must be the boolean False: the string "false" is truthy, so it would
# still write the row index as an extra header-less first column.
full_df.to_csv("spotify_pop_songs.csv", encoding='utf-8', index=False)