In [1]:
# Importing required libraries
import pandas as pd
import numpy as np

In [2]:
# Load the track dataset produced by the earlier build step.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which suggests
# the CSV was written with its index — confirm whether downstream code needs it.
combined_df = pd.read_csv(filepath_or_buffer='../data/built/track_data.csv')

# Preview the first five rows.
combined_df.head(n=5)

Unnamed: 0,track,artist,uri,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,chorus_hit,sections,target
0,Walking Blues,Big Joe Williams,spotify:track:1ZjN5X8LmUB67pWPgimW3B,0.509,0.277,6,-14.323,1,0.0495,0.827,0.00206,0.0756,0.64,101.157,161893,4,68.4653,7,0
1,Suddenly Last Summer,The Motels,spotify:track:4fLIM0B1WwrLux9RdnMvze,0.716,0.753,2,-5.682,1,0.0286,0.162,0.0306,0.0831,0.561,120.141,222000,4,57.71583,11,1
2,Sanctuary,Béla Fleck,spotify:track:3DwlNfiCQSdj0GOxYkR9Rq,0.36,0.542,5,-13.885,1,0.0339,0.368,0.165,0.116,0.803,116.831,444907,4,30.34574,17,0
3,The Wild Rover,The Pogues,spotify:track:6JyYNPLalPgGa7XnclF5FO,0.656,0.512,7,-11.872,1,0.029,0.585,0.0,0.072,0.88,97.5,157893,3,50.97022,7,0
4,In The Driver's Seat,John Schneider,spotify:track:6jJi8OXF5qaFdysB6sjWIT,0.642,0.889,2,-5.62,0,0.0494,0.375,0.0,0.18,0.764,163.351,162293,4,33.62053,7,1


In [3]:
# Initializing a spotipy instance to get track metadata
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import os
from dotenv import load_dotenv

# Pull any variables defined in a local .env file into the environment.
load_dotenv()

# Credentials come from the environment; os.getenv yields None when unset.
CLIENT_ID = os.getenv('BULK_CLIENT_ID')
CLIENT_SECRET = os.getenv('BULK_CLIENT_SECRET')

# Build the credentials manager first, then hand it to the API client that
# the later cells use as `spotify`.
client_creds_manager = SpotifyClientCredentials(
    client_id=CLIENT_ID,
    client_secret=CLIENT_SECRET,
)
spotify = spotipy.Spotify(
    client_credentials_manager=client_creds_manager,
)


In [4]:
def get_tracks_metadata(song_uri_list, batch_size=50):
    """Fetch album and artist metadata for a list of Spotify tracks.

    The identifiers are sent to ``spotify.tracks`` (the module-level client)
    in consecutive slices of at most ``batch_size`` items, and the results
    are collected in request order.

    Parameters
    ----------
    song_uri_list : list of str
        Track URIs to look up. An empty list returns three empty lists
        without issuing any request.
    batch_size : int, optional
        Maximum number of tracks per request (default 50).
        NOTE(review): Spotify's "Get Several Tracks" endpoint is documented
        to accept at most 50 IDs per call — confirm before raising this.

    Returns
    -------
    release_dates : list of tuple
        ``(release_date, release_date_precision)`` taken from each track's
        album.
    release_types : list
        The ``album_type`` of each track's album.
    artists : list of list of str
        Names of the artists listed on each track.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.

    Notes
    -----
    A track that the API reports as ``None`` (not found) contributes the
    placeholders ``(None, None)``, ``None`` and ``[]`` so that all three
    outputs stay positionally aligned with ``song_uri_list``.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')

    release_types = []
    release_dates = []
    artists = []

    # Stepping by batch_size never yields an empty slice, so no request is
    # made for an empty list or after an exact multiple of batch_size.
    for start_index in range(0, len(song_uri_list), batch_size):
        batch = song_uri_list[start_index:start_index + batch_size]
        response = spotify.tracks(tracks=batch)

        for track in response['tracks']:
            if track is None:
                # Keep the row alignment instead of failing on a missing track.
                release_types.append(None)
                release_dates.append((None, None))
                artists.append([])
                continue

            album = track['album']
            release_types.append(album['album_type'])
            release_dates.append((album['release_date'], album['release_date_precision']))
            artists.append([artist['name'] for artist in track['artists']])

    return release_dates, release_types, artists

In [5]:
# Collect every track URI from the dataset, in row order.
track_uris = combined_df['uri'].tolist()

# Look up album and artist metadata for all tracks.
# Note: release_dates is not used by any of the later cells visible here.
(release_dates,
 release_types,
 artists) = get_tracks_metadata(track_uris)

In [6]:
# Number of artist names returned for each track, in the same order as `artists`.
num_artists_list = list(map(len, artists))

In [7]:
# Attach the fetched metadata as new columns. Both lists were built in the
# order of combined_df.uri, so they line up with the rows positionally.
combined_df['num_artists'] = num_artists_list
combined_df['release_type'] = release_types

In [8]:
# Preview the first five rows to confirm the two new columns were added.
combined_df.head(n=5)

Unnamed: 0,track,artist,uri,danceability,energy,key,loudness,mode,speechiness,acousticness,...,liveness,valence,tempo,duration_ms,time_signature,chorus_hit,sections,target,num_artists,release_type
0,Walking Blues,Big Joe Williams,spotify:track:1ZjN5X8LmUB67pWPgimW3B,0.509,0.277,6,-14.323,1,0.0495,0.827,...,0.0756,0.64,101.157,161893,4,68.4653,7,0,1,album
1,Suddenly Last Summer,The Motels,spotify:track:4fLIM0B1WwrLux9RdnMvze,0.716,0.753,2,-5.682,1,0.0286,0.162,...,0.0831,0.561,120.141,222000,4,57.71583,11,1,1,album
2,Sanctuary,Béla Fleck,spotify:track:3DwlNfiCQSdj0GOxYkR9Rq,0.36,0.542,5,-13.885,1,0.0339,0.368,...,0.116,0.803,116.831,444907,4,30.34574,17,0,6,album
3,The Wild Rover,The Pogues,spotify:track:6JyYNPLalPgGa7XnclF5FO,0.656,0.512,7,-11.872,1,0.029,0.585,...,0.072,0.88,97.5,157893,3,50.97022,7,0,1,album
4,In The Driver's Seat,John Schneider,spotify:track:6jJi8OXF5qaFdysB6sjWIT,0.642,0.889,2,-5.62,0,0.0494,0.375,...,0.18,0.764,163.351,162293,4,33.62053,7,1,1,album


In [9]:
# Distinct artist counts present in the data, in order of first appearance.
combined_df['num_artists'].unique()

array([ 1,  6,  2,  3,  4,  5, 14,  7,  9, 18, 10, 17,  8, 16, 11, 23, 13])

In [10]:
# Write the enriched dataset to disk for the model-building step;
# index=False keeps the DataFrame index out of the file.
combined_df.to_csv(
    path_or_buf='../data/processed/tracks.csv',
    index=False,
)