# Data Retrieval

In [1]:
# Hide all warnings: the 'ignore' action is installed with no category or
# module restriction, so every warning raised after this cell is suppressed.
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# Widen pandas' display limits so large frames are shown without truncation:
# up to 500 rows and 500 columns, with output lines up to 1000 characters.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

## Request Genre

In [4]:
# Request an access token, then use it to fetch the genre seeds.
from SpotifyAPI import get_access_token, get_genre_seed
access_token = get_access_token() # Get access token
genre_seed = np.array(get_genre_seed(access_token)) # Get the list of genres as a NumPy array

In [5]:
# Number of genre seeds returned by get_genre_seed
len(genre_seed)

126

In [6]:
# Ask Spotify how many tracks it reports for each genre seed. Only the
# ['tracks']['total'] field of each response is used.
# NOTE(review): the arguments after the genre are presumably (limit=1,
# offset=9999), by analogy with the get_data call -- confirm against SpotifyAPI.
from SpotifyAPI import get_item
genre_count = np.array([get_item(genre, 1, 9999, access_token)['tracks']['total'] for genre in genre_seed])

# Keep only the genres whose reported total is non-zero.
# np.asarray lets the boolean mask work even when genre_seed is already a
# plain list, so re-running this cell no longer raises a TypeError. The mask
# already yields the kept genres in their original order, so no second
# membership scan over genre_seed is needed.
genre_to_keep = np.asarray(genre_seed)[genre_count > 0]
genre_seed = list(genre_to_keep)

In [7]:
# Number of genres left after dropping those with a zero track total
len(genre_seed)

91

## Request Song Data

In [8]:
from SpotifyAPI import get_data
max_offset = 10000 # exclusive upper bound of the paging offsets requested per genre
max_limit = 50 # page size passed to get_data; also the step between offsets
data = [] # accumulates the records returned by get_data

In [13]:
%%time
for x in range(0, max_offset, max_limit):
    access_token = get_access_token()
    for genre in genre_seed:
        data += get_data(genre, max_limit, x, access_token)

CPU times: user 5min 3s, sys: 20.6 s, total: 5min 23s
Wall time: 1h 36s


In [15]:
# Number of records collected by the download loop
len(data)

913150

In [25]:
# Column names for the track records, in the order used to build the DataFrame.
var_to_use = [
    # identifiers
    'track_id', 'artists_id', 'album_id',
    # display names
    'track_name', 'artists_name', 'album_name',
    # album / release details
    'album_type', 'release_date', 'release_date_precision',
    # track-level attributes
    'is_explicit', 'track_popularity', 'genre',
    # audio features
    'danceability', 'energy', 'key', 'loudness', 'mode',
    'speechiness', 'acousticness', 'instrumentalness', 'liveness',
    'valence', 'tempo', 'duration_ms', 'time_signature',
]

In [26]:
# Build the DataFrame from the collected records with var_to_use as its columns.
# NOTE(review): assumes every record in `data` lines up with the names in
# var_to_use -- confirm against what get_data returns.
df = pd.DataFrame(data, columns=var_to_use)

## Get Number of Artist Followers

In [28]:
def chunk_split(chunk, limit):
    """Split a sequence of strings into comma-joined batches.

    Parameters
    ----------
    chunk : sequence of str
        Items to batch (a list or 1-D array that supports slicing).
    limit : int
        Maximum number of items per batch; must be at least 1.

    Returns
    -------
    list of str
        One ','-joined string per batch, in input order. No empty trailing
        batch is produced when len(chunk) is an exact multiple of limit, and
        an empty input yields an empty list.

    Raises
    ------
    ValueError
        If limit is smaller than 1.
    """
    if limit < 1:
        raise ValueError("limit must be a positive integer")
    # Walk over the start index of each batch. range() stops before
    # len(chunk), so a full final batch is never followed by an empty one.
    return [','.join(chunk[start:start + limit]) for start in range(0, len(chunk), limit)]

In [43]:
# Each 'artists_id' entry holds one or more artist ids separated by ';'.
# Gather every id, reduce them to the sorted unique set, then batch the
# result into comma-joined chunks of 50 ids.
all_artist_ids = []
for joined_ids in df['artists_id']:
    all_artist_ids.extend(joined_ids.split(';'))
artist_id = np.unique(all_artist_ids)
artist_chunk = chunk_split(artist_id, 50)

In [49]:
from SpotifyAPI import get_artist_total_follower
access_token = get_access_token() # request an access token for the artist lookups

In [50]:
%%time
# Use spotify API to get number of follower for each unique artist
artist_total_follower_chunk = [get_artist_total_follower(x, access_token) for x in artist_chunk]

CPU times: user 26.4 s, sys: 1.93 s, total: 28.3 s
Wall time: 3min 9s


In [62]:
# A track may list several artists separated by ';'; its follower count is
# the largest count among them.
# NOTE(review): `get_num_follower` is subscripted like a mapping from artist
# id to follower count, but no cell shown in this notebook defines it --
# confirm it is built (presumably from artist_total_follower_chunk) before
# this cell runs.
df['num_follower'] = df['artists_id'].apply(
    lambda artist_ids: max(get_num_follower[artist] for artist in artist_ids.split(';'))
)

In [64]:
df.to_csv('spotify_audio_feature.csv', index=False) # Save the raw data to CSV, omitting the DataFrame index