# Data Preprocessing

In [2]:
# Importing the required modules and libraries
import pandas as pd


In [11]:
import sys
import os

# Put the sibling ``utils`` directory on the import path (presumably where
# ``spotify_client`` lives -- verify against the project layout).
module_path = os.path.abspath(os.path.join('..'))
utils_path = os.path.join(module_path, "utils")
# Guard on the exact entry that gets appended. Testing ``module_path`` here
# (as before) never matched, so every re-run of the cell added a duplicate.
if utils_path not in sys.path:
    sys.path.append(utils_path)

from spotify_client import spotify

In [3]:
# Read the four per-decade raw track tables and stack them row-wise into one
# frame with a fresh 0..n-1 index.
decade_frames = [
    pd.read_csv('../data/raw/tracks_80s.csv'),
    pd.read_csv('../data/raw/tracks_90s.csv'),
    pd.read_csv('../data/raw/tracks_00s.csv'),
    pd.read_csv('../data/raw/tracks_10s.csv'),
]
raw_data_df = pd.concat(decade_frames, axis=0, ignore_index=True)

# Only the combined table is needed from here on; release the pieces.
del decade_frames

raw_data_df.shape

(24698, 19)

In [4]:
# Plain Python list of the values in the ``uri`` column, in row order.
song_uri_list = raw_data_df['uri'].tolist()

In [9]:
def get_tracks_metadeta(song_uri_list, client=None, batch_size=50):
    """Fetch album and track metadata for a list of tracks, in batches.

    The identifiers are sent to ``client.tracks`` in consecutive slices of
    ``batch_size`` and selected fields of every returned track are collected
    into five parallel lists, in the order the API returns them.

    Parameters
    ----------
    song_uri_list : list of str
        Track identifiers, passed to ``client.tracks`` unchanged.
    client : object, optional
        Anything exposing ``tracks(tracks=[...])`` that returns a dict with a
        ``'tracks'`` list. Defaults to the module-level ``spotify`` client.
    batch_size : int, optional
        Number of identifiers per request. Defaults to 50, the value this
        function previously hard-coded.

    Returns
    -------
    tuple of five lists
        ``(albums, release_types, release_dates, explicities, popularities)``.
        ``release_dates`` holds ``(release_date, release_date_precision)``
        tuples. A track the API returns as ``None`` contributes ``None`` to
        every list (``(None, None)`` to ``release_dates``) so that positions
        stay aligned with the request.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    if client is None:
        client = spotify

    albums = []
    release_types = []
    release_dates = []
    explicities = []
    popularities = []

    # Stepping through the list by batch_size never produces an empty slice,
    # so no request is sent for empty input or after an exact multiple.
    for start in range(0, len(song_uri_list), batch_size):
        batch = song_uri_list[start:start + batch_size]
        response = client.tracks(tracks=batch)

        for track in response['tracks']:
            if track is None:
                # Keep a placeholder so the outputs stay row-aligned.
                albums.append(None)
                release_types.append(None)
                release_dates.append((None, None))
                explicities.append(None)
                popularities.append(None)
                continue

            album = track['album']
            albums.append(album['name'])
            release_types.append(album['album_type'])
            release_dates.append((album['release_date'], album['release_date_precision']))
            # The flag arrives as a real boolean; comparing only against the
            # string 'true' marked every track as non-explicit. The string
            # form is still accepted for backward compatibility.
            explicit_flag = track['explicit']
            explicities.append(explicit_flag is True or explicit_flag == 'true')
            popularities.append(track['popularity'])

    return albums, release_types, release_dates, explicities, popularities

In [12]:
# Query the metadata for every URI in song_uri_list and unpack the five
# parallel result lists.
(
    albums,
    release_types,
    release_dates,
    explicities,
    popularities,
) = get_tracks_metadeta(song_uri_list)

In [10]:
# Ad-hoc inspection of the raw response returned for two track IDs.
spotify.tracks(['5wG3HvLhF6Y5KTGlK0IW3J','2Hh3ETdQKrmSI3QS0hme7g'])

{'tracks': [{'album': {'album_type': 'album',
    'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/0iEtIxbK0KxaSlF7G42ZOp'},
      'href': 'https://api.spotify.com/v1/artists/0iEtIxbK0KxaSlF7G42ZOp',
      'id': '0iEtIxbK0KxaSlF7G42ZOp',
      'name': 'Metro Boomin',
      'type': 'artist',
      'uri': 'spotify:artist:0iEtIxbK0KxaSlF7G42ZOp'}],
    'available_markets': ['AR',
     'AU',
     'AT',
     'BE',
     'BO',
     'BR',
     'BG',
     'CA',
     'CL',
     'CO',
     'CR',
     'CY',
     'CZ',
     'DK',
     'DO',
     'DE',
     'EC',
     'EE',
     'SV',
     'FI',
     'FR',
     'GR',
     'GT',
     'HN',
     'HK',
     'HU',
     'IS',
     'IE',
     'IT',
     'LV',
     'LT',
     'LU',
     'MY',
     'MT',
     'MX',
     'NL',
     'NZ',
     'NI',
     'NO',
     'PA',
     'PY',
     'PE',
     'PH',
     'PL',
     'PT',
     'SG',
     'SK',
     'ES',
     'SE',
     'CH',
     'TW',
     'TR',
     'UY',
     'US',
     'GB',


In [26]:
# Peek at the first five album names.
albums[:5]

['Know-It-All (Deluxe)',
 'The Genius of Esquivel',
 'Love Someone',
 'Cura',
 'Juju on That Beat (TZ Anthem)']

In [13]:
# Split the (date, precision) pairs into two parallel tuples.
(release_date, release_date_precision) = tuple(zip(*release_dates))

In [15]:
# Add one column per metadata sequence to the track table, in this order.
new_columns = {
    'album': albums,
    'release_type': release_types,
    'release_date': release_date,
    'release_date_precision': release_date_precision,
    'explicit': explicities,
    'popularities': popularities,
}
for column_name, column_values in new_columns.items():
    raw_data_df[column_name] = column_values

In [28]:
# Peek at the first five (release_date, precision) pairs.
release_dates[:5]

[('2015-11-13', 'day'),
 ('2012-06-12', 'day'),
 ('2018-09-07', 'day'),
 ('2018-02-02', 'day'),
 ('2016-09-30', 'day')]

In [16]:
# Show the first five rows, including the newly added columns.
raw_data_df.head(5)

Unnamed: 0,track,artist,uri,danceability,energy,key,loudness,mode,speechiness,acousticness,...,time_signature,chorus_hit,sections,target,album,release_type,release_date,release_date_precision,explicit,popularities
0,Walking Blues,Big Joe Williams,spotify:track:1ZjN5X8LmUB67pWPgimW3B,0.509,0.277,6,-14.323,1,0.0495,0.827,...,4,68.4653,7,0,Walking Blues,album,1981-01-01,day,False,0
1,Suddenly Last Summer,The Motels,spotify:track:4fLIM0B1WwrLux9RdnMvze,0.716,0.753,2,-5.682,1,0.0286,0.162,...,4,57.71583,11,1,Classic Masters,album,2002-01-01,day,False,31
2,Sanctuary,Béla Fleck,spotify:track:3DwlNfiCQSdj0GOxYkR9Rq,0.36,0.542,5,-13.885,1,0.0339,0.368,...,4,30.34574,17,0,Drive,album,1988-01-01,day,False,13
3,The Wild Rover,The Pogues,spotify:track:6JyYNPLalPgGa7XnclF5FO,0.656,0.512,7,-11.872,1,0.029,0.585,...,3,50.97022,7,0,Red Roses for Me (Expanded Edition),album,1984,year,False,49
4,In The Driver's Seat,John Schneider,spotify:track:6jJi8OXF5qaFdysB6sjWIT,0.642,0.889,2,-5.62,0,0.0494,0.375,...,4,33.62053,7,1,The Dukes Of Hazzard,album,1978,year,False,21


In [18]:
# Write the enriched table to disk without the DataFrame index.
processed_csv_path = '../data/processed/track_features.csv'
raw_data_df.to_csv(processed_csv_path, index=False)