# Data Preprocessing

In [1]:
# Importing the required modules and libraries
import pandas as pd


In [2]:
import sys
import os

# Put the sibling `utils` directory (one level above the notebook's working directory)
# on the import path; presumably this is where `spotify_client` lives -- TODO confirm.
module_path = os.path.abspath(os.path.join('..'))
utils_path = os.path.join(module_path, 'utils')
# Guard on the exact entry being added. Testing `module_path` here would never match
# what is appended, so every re-run of the cell would add a duplicate entry.
if utils_path not in sys.path:
    sys.path.append(utils_path)

from spotify_client import spotify

In [3]:
# Read the four raw track tables (80s, 90s, 00s, 10s) and stack them row-wise into a
# single frame with a fresh 0..n-1 index. The per-file frames are never bound to
# names, so nothing has to be deleted afterwards.
raw_data_df = pd.concat(
    [
        pd.read_csv(csv_path)
        for csv_path in (
            '../data/raw/tracks_80s.csv',
            '../data/raw/tracks_90s.csv',
            '../data/raw/tracks_00s.csv',
            '../data/raw/tracks_10s.csv',
        )
    ],
    axis=0,
    ignore_index=True,
)

raw_data_df.shape

(24698, 19)

In [4]:
# Plain Python list of the `uri` column, in row order, to feed to the Spotify lookup.
song_uri_list = raw_data_df['uri'].tolist()

In [5]:
def get_tracks_metadeta(song_uri_list, batch_size=50):
    """Fetch album, release and artist metadata for a list of tracks.

    The identifiers are sent to ``spotify.tracks`` in consecutive batches and the
    per-track fields of each response are collected into six parallel lists that
    follow the order of ``song_uri_list``.

    Parameters
    ----------
    song_uri_list : list of str
        Track identifiers, passed through unchanged to ``spotify.tracks``.
    batch_size : int, default 50
        Number of tracks requested per call.
        NOTE(review): 50 is presumably the per-request cap of the Spotify
        "Get Several Tracks" endpoint -- confirm before raising it.

    Returns
    -------
    tuple
        ``(albums, release_types, release_dates, explicities, popularities, artists)``
        where ``release_dates`` holds ``(release_date, release_date_precision)``
        tuples, ``explicities`` holds booleans and ``artists`` holds one list of
        artist names per track. A track the client returns as ``None`` yields
        ``None`` placeholders (``(None, None)`` for the release date and ``[]`` for
        the artists) so that all lists stay aligned with the input.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    albums = []
    release_types = []
    release_dates = []
    explicities = []
    popularities = []
    artists = []

    # Stepping through the start offsets never produces an empty batch, so no request
    # is sent for an empty input or when the input length is an exact multiple of
    # the batch size.
    for start in range(0, len(song_uri_list), batch_size):
        batch = song_uri_list[start:start + batch_size]
        response = spotify.tracks(tracks=batch)

        for track in response['tracks']:
            if track is None:
                # NOTE(review): the API is understood to return null for an id it
                # cannot resolve -- confirm. Keep the position so the results can
                # still be assigned row-for-row to the source table.
                albums.append(None)
                release_types.append(None)
                release_dates.append((None, None))
                explicities.append(None)
                popularities.append(None)
                artists.append([])
                continue

            album = track['album']
            albums.append(album['name'])
            release_types.append(album['album_type'])
            release_dates.append((album['release_date'], album['release_date_precision']))

            # A boolean never equals the string 'true', so comparing against it
            # marked every track as non-explicit. Accept a real bool and, defensively,
            # the textual form as well.
            explicit = track['explicit']
            if isinstance(explicit, str):
                explicit = explicit.lower() == 'true'
            explicities.append(bool(explicit))

            popularities.append(track['popularity'])
            artists.append([artist['name'] for artist in track['artists']])

    return albums, release_types, release_dates, explicities, popularities, artists

In [6]:
# Look up the metadata for every track URI; the six result lists follow the order
# of `song_uri_list`.
(
    albums,
    release_types,
    release_dates,
    explicities,
    popularities,
    artists,
) = get_tracks_metadeta(song_uri_list)

In [41]:
# Flatten each track's list of artist names into one '&&'-delimited string so it fits
# in a single table cell; `.split('&&')` recovers the list. Double quotes are removed
# from the names.
# str.join is used instead of appending '&&' after every name and trimming with
# rstrip('&&'): rstrip treats its argument as a set of characters, so it would also
# strip a genuine trailing '&' from the last artist's name.
artists_str_list = [
    '&&'.join(name.replace('"', '') for name in track_artists)
    for track_artists in artists
]

artists_str_list

['Big Joe Williams',
 'The Motels',
 "Béla Fleck&&Sam Bush&&Jerry Douglas&&Mark O'Connor&&Tony Rice&&Mark Schatz",
 'The Pogues',
 'John Schneider',
 'Fields Of The Nephilim',
 'Brighton Rock',
 'The Other Ones',
 'Rupert Holmes',
 'Yngwie Malmsteen',
 'Breakfast Club',
 'Chris & Cosey',
 'Orchestra Baobab',
 'Martika',
 'The Jets',
 'Harold Budd&&Brian Eno',
 'ZZ Top',
 'Norman Blake&&Tony Rice',
 'Running Wild',
 'Carlos Cano',
 'Bruce Springsteen',
 'Eric B. & Rakim',
 'Slime',
 'Phil Seymour',
 'Enya',
 'Real Life',
 'Moebius&&Plank',
 'Simply Red',
 'Big Country',
 'Kenny Loggins',
 'Barbra Streisand&&Donna Summer',
 'Sheena Easton',
 'Skinny Puppy',
 'Huey Lewis & The News',
 'Vladimir Cosma',
 'Kool & The Gang',
 'Tenor Fly&&Daddy Freddy',
 'Demented Are Go',
 'Skinny Puppy',
 'Crass',
 'Los Suaves',
 'Keith Mansfield',
 'The Doobie Brothers',
 'Luiz Henrique',
 'The Pogues',
 'Miami Sound Machine',
 'Paula Abdul',
 'Stan Getz&&João Gilberto&&Antônio Carlos Jobim',
 'George Mich

In [None]:
# Ad-hoc call for two hard-coded track ids; the cell displays the raw response
# returned by the client.
spotify.tracks(['5wG3HvLhF6Y5KTGlK0IW3J','2Hh3ETdQKrmSI3QS0hme7g'])

{'tracks': [{'album': {'album_type': 'album',
    'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/0iEtIxbK0KxaSlF7G42ZOp'},
      'href': 'https://api.spotify.com/v1/artists/0iEtIxbK0KxaSlF7G42ZOp',
      'id': '0iEtIxbK0KxaSlF7G42ZOp',
      'name': 'Metro Boomin',
      'type': 'artist',
      'uri': 'spotify:artist:0iEtIxbK0KxaSlF7G42ZOp'}],
    'available_markets': ['AR',
     'AU',
     'AT',
     'BE',
     'BO',
     'BR',
     'BG',
     'CA',
     'CL',
     'CO',
     'CR',
     'CY',
     'CZ',
     'DK',
     'DO',
     'DE',
     'EC',
     'EE',
     'SV',
     'FI',
     'FR',
     'GR',
     'GT',
     'HN',
     'HK',
     'HU',
     'IS',
     'IE',
     'IT',
     'LV',
     'LT',
     'LU',
     'MY',
     'MT',
     'MX',
     'NL',
     'NZ',
     'NI',
     'NO',
     'PA',
     'PY',
     'PE',
     'PH',
     'PL',
     'PT',
     'SG',
     'SK',
     'ES',
     'SE',
     'CH',
     'TW',
     'TR',
     'UY',
     'US',
     'GB',


In [None]:
# Peek at the first five album names.
albums[:5]

['Know-It-All (Deluxe)',
 'The Genius of Esquivel',
 'Love Someone',
 'Cura',
 'Juju on That Beat (TZ Anthem)']

In [36]:
# Transpose the list of (release_date, release_date_precision) pairs into two
# parallel tuples. Unpacking into two names requires `release_dates` to be non-empty.
release_date, release_date_precision = zip(*release_dates)

In [42]:
# Attach the fetched metadata to the track table as new columns (mutates
# `raw_data_df` in place). The sequences are assigned positionally, so they must be
# in the same row order as the frame; they were derived from `raw_data_df.uri`.
raw_data_df['album'] = albums
raw_data_df['release_type'] = release_types
raw_data_df['release_date'] = release_date
raw_data_df['release_date_precision'] = release_date_precision
raw_data_df['explicit'] = explicities
raw_data_df['popularities'] = popularities  # column is named 'popularities' (plural)
raw_data_df['artists'] = artists_str_list  # '&&'-joined artist names, one string per track

In [None]:
# Peek at the first five (release_date, release_date_precision) pairs.
release_dates[:5]

[('2015-11-13', 'day'),
 ('2012-06-12', 'day'),
 ('2018-09-07', 'day'),
 ('2018-02-02', 'day'),
 ('2016-09-30', 'day')]

In [43]:
# Show the first rows of the table, now including the newly added metadata columns.
raw_data_df.head()

Unnamed: 0,track,artist,uri,danceability,energy,key,loudness,mode,speechiness,acousticness,...,chorus_hit,sections,target,album,release_type,release_date,release_date_precision,explicit,popularities,artists
0,Walking Blues,Big Joe Williams,spotify:track:1ZjN5X8LmUB67pWPgimW3B,0.509,0.277,6,-14.323,1,0.0495,0.827,...,68.4653,7,0,Walking Blues,album,1981-01-01,day,False,0,Big Joe Williams
1,Suddenly Last Summer,The Motels,spotify:track:4fLIM0B1WwrLux9RdnMvze,0.716,0.753,2,-5.682,1,0.0286,0.162,...,57.71583,11,1,Classic Masters,album,2002-01-01,day,False,32,The Motels
2,Sanctuary,Béla Fleck,spotify:track:3DwlNfiCQSdj0GOxYkR9Rq,0.36,0.542,5,-13.885,1,0.0339,0.368,...,30.34574,17,0,Drive,album,1988-01-01,day,False,13,Béla Fleck&&Sam Bush&&Jerry Douglas&&Mark O'Co...
3,The Wild Rover,The Pogues,spotify:track:6JyYNPLalPgGa7XnclF5FO,0.656,0.512,7,-11.872,1,0.029,0.585,...,50.97022,7,0,Red Roses for Me (Expanded Edition),album,1984,year,False,48,The Pogues
4,In The Driver's Seat,John Schneider,spotify:track:6jJi8OXF5qaFdysB6sjWIT,0.642,0.889,2,-5.62,0,0.0494,0.375,...,33.62053,7,1,The Dukes Of Hazzard,album,1978,year,False,22,John Schneider


In [47]:
# Write the enriched table to disk; index=False keeps the row index out of the file.
raw_data_df.to_csv('../data/built/track_features.csv', index=False)

In [16]:
# Keep a handle on the `artists` column for the inspection below.
req_series = raw_data_df['artists']

In [39]:
# Split the '&&'-joined artists string of the third row back into individual names.
raw_data_df['artists'].iloc[2].split('&&')

['Béla Fleck',
 'Sam Bush',
 'Jerry Douglas',
 "Mark O'Connor",
 'Tony Rice',
 'Mark Schatz']

In [29]:
# Display the joined-artists column.
raw_data_df['artists']

0                                         Big Joe Williams
1                                               The Motels
2        Béla Fleck\nSam Bush\nJerry Douglas\nMark O'Co...
3                                               The Pogues
4                                           John Schneider
                               ...                        
24693                                                Yolta
24694                                          Kodak Black
24695                                           Katy Perry
24696                Oscar Peterson\nRay Brown\nHerb Ellis
24697                                          Hans Zimmer
Name: artists, Length: 24698, dtype: object

In [30]:
# Number of artists credited on each track. The `artists` column is built by joining
# names with '&&', so that is the separator to split on; splitting on '\n' only
# worked against an earlier in-kernel version of the column and would report a
# count of 1 for every row on a fresh run.
testing = raw_data_df.artists.apply(lambda x: len(x.split('&&')))

In [31]:
# Largest number of artists credited on a single track.
testing.max()

23

In [33]:
# How many tracks have each artist count, most frequent count first.
testing.value_counts()

artists
1     20462
2      3001
3       796
4       251
5       103
6        48
7        17
9         9
8         3
14        1
18        1
10        1
17        1
16        1
11        1
23        1
13        1
Name: count, dtype: int64