# Song Scraping on Spotify API
This notebook retrieves data about songs (Spotify ID, artists, name, popularity, release date, ...) from the Spotify database. Spotify playlist IDs are provided as input.

The obtained data are in notebooks/lyrizz/csv/{df_playlist.csv | df_artists.csv | df_tracks.csv}

In [None]:
import requests
import pandas as pd
import numpy as np

import urllib.parse
from bs4 import BeautifulSoup
import os
import re
from datetime import datetime
import zipfile
import time
import pymysql
from googlesearch import search

import ssl
from googleapiclient.discovery import build

# Swap the factory used for the default HTTPS context for the unverified one,
# i.e. certificate checks are turned off for code that relies on that default.
# NOTE(review): this removes protection against man-in-the-middle attacks —
# confirm it is still required before keeping it.
ssl._create_default_https_context = ssl._create_unverified_context

# Local data folders. FOLDER_LYRIZZ_CSV is where the CSV files are read from and written to.
# NOTE(review): FOLDER_LYRIZZ_TXT is not used in the visible part of this notebook — confirm it is needed.
FOLDER_LYRIZZ_TXT = "/home/tanguy/data/lyrizz/txt"
FOLDER_LYRIZZ_CSV = "/home/tanguy/data/lyrizz/csv"

### Spotify API

In [None]:
# Put your ID from Spotify
CLIENT_ID = 'YOUR***CLIENT***ID'
CLIENT_SECRET = 'YOUR***CLIENT***SECRET'
# NOTE(review): TOKEN is not used in the visible part of this notebook — confirm it is needed.
TOKEN = 'YOUR***TOKEN'

AUTH_URL = 'https://accounts.spotify.com/api/token'

# Request an access token with the client-credentials grant
auth_response = requests.post(AUTH_URL, data={
    'grant_type': 'client_credentials',
    'client_id': CLIENT_ID,
    'client_secret': CLIENT_SECRET,
})

# Stop here with the HTTP status (e.g. invalid credentials) rather than
# failing below with an opaque KeyError on 'access_token'
auth_response.raise_for_status()

# convert the response to JSON
auth_response_data = auth_response.json()

# save the access token
access_token = auth_response_data['access_token']

# Authorization header sent with every Spotify API request
HEADERS_SPOTIFY = {'Authorization': f'Bearer {access_token}'}

# base URL of all Spotify API endpoints
BASE_URL = 'https://api.spotify.com/v1/'

### Definition of functions

In [None]:
def split_list(raw_list, N_max=100):
    """
    Compute the slice bounds that cut a list into consecutive chunks.

    Parameters
    ----------
    raw_list : list
        list to cut into chunks (only its length is used)
    N_max : int
        maximum number of elements per chunk

    Returns
    -------
    list of (start, stop) tuples, such that raw_list[start:stop] is one chunk;
    the last chunk stops at len(raw_list)
    """
    total = len(raw_list)
    chunk_count = int(np.ceil(total / N_max))
    last = chunk_count - 1
    return [
        (N_max * k, total if k == last else N_max * k + N_max)
        for k in range(chunk_count)
    ]

In [None]:
def get_info_playlist(playlist_id):
    """
    Retrieve the tracks of a Spotify playlist, with two of their audio features.

    Parameters
    ----------
    playlist_id : str
        e.g., '4l1CEhc7ZPbaEtiPdCSGbl'

    Returns
    -------
    df_tracks : pd.DataFrame
        indexed by track_id, with columns [artists, name, popularity, release_date,
        release_date_precision, speechiness, instrumentalness]
    list_artist_id : list of artist_id
        unique artist ids of the kept tracks, in order of first appearance

    Raises
    ------
    requests.HTTPError
        if one of the Spotify API calls answers with an error status
    """
    url_tracks = BASE_URL + 'playlists/' + playlist_id + '/tracks'

    # First call only asks for the number of items, to know how many pages to fetch
    r_total = requests.get(url_tracks, headers=HEADERS_SPOTIFY, params={'fields': 'total'})
    r_total.raise_for_status()
    nb_track = r_total.json()['total']

    # dict used as an ordered set: O(1) membership test, insertion order kept
    seen_artist_id = {}
    data_track = []
    for offset in range(0, nb_track, 100):
        r_playlist = requests.get(url_tracks, headers=HEADERS_SPOTIFY,
                                  params={'offset': offset, 'limit': 100})
        r_playlist.raise_for_status()

        for item in r_playlist.json()['items']:
            track = item.get('track')

            # Skip items without a usable track: a null track or a null id
            # cannot be looked up in the audio-features call below
            if not track or track.get('id') is None:
                continue
            # Skip tracks without a Spotify URL (same filter as before)
            if 'spotify' not in (track.get('external_urls') or {}):
                continue

            for artist in track['artists']:
                # Artists without an id cannot be queried afterwards
                if artist['id'] is not None:
                    seen_artist_id.setdefault(artist['id'])

            list_artist_name_str = ', '.join(a['name'] for a in track['artists'])
            album = track['album']

            data_track.append([track['id'], list_artist_name_str, track['name'], track['popularity'],
                               album['release_date'], album['release_date_precision']])

    df_tracks = pd.DataFrame(data_track, columns=['track_id', 'artists', 'name', 'popularity', 'release_date', 'release_date_precision'])
    list_track = df_tracks['track_id'].tolist()

    list_speechiness = []
    list_instrumentalness = []
    # Audio features are requested by batches of 100 ids
    for a, b in split_list(list_track, 100):
        list_track_str = ','.join(list_track[a:b])
        r_features = requests.get(BASE_URL + 'audio-features/', headers=HEADERS_SPOTIFY, params={'ids': list_track_str})
        r_features.raise_for_status()
        for elem in r_features.json()['audio_features']:
            # A null entry (no features available for this track) becomes NaN
            # so that the rows stay aligned with df_tracks
            features = elem or {}
            list_speechiness.append(features.get('speechiness', np.nan))
            list_instrumentalness.append(features.get('instrumentalness', np.nan))
        # Pause between batches to stay gentle with the API
        time.sleep(1)

    df_tracks['speechiness'] = list_speechiness
    df_tracks['instrumentalness'] = list_instrumentalness

    df_tracks = df_tracks.set_index('track_id')

    return df_tracks, list(seen_artist_id)

# df_tracks, list_artist = get_info_playlist('4l1CEhc7ZPbaEtiPdCSGbl')

In [None]:
def get_df_list_artists(list_artist_id):
    """
    Retrieve name, popularity and genres for a list of Spotify artists.

    Parameters
    ----------
    list_artist_id : list of str
        e.g., ['63MCBZRiUdnqRsAOJwijiB', ..]

    Returns
    -------
    df_artists : pd.DataFrame
        indexed by artist_id, with columns [name, popularity, genres];
        ids that are None or for which the API returns no artist are left out

    Raises
    ------
    requests.HTTPError
        if the Spotify API answers with an error status
    """
    # A None id cannot be part of the comma-separated 'ids' parameter
    valid_ids = [artist_id for artist_id in list_artist_id if artist_id is not None]

    kept_ids = []
    list_name = []
    list_popularity = []
    list_genres = []
    # Artists are requested by batches of 50 ids
    for a, b in split_list(valid_ids, 50):
        sub_list = valid_ids[a:b]
        r_artists = requests.get(BASE_URL + 'artists/', headers=HEADERS_SPOTIFY, params={'ids': ','.join(sub_list)})
        r_artists.raise_for_status()

        # Results are matched to the requested ids by position
        for artist_id, elem in zip(sub_list, r_artists.json()['artists']):
            if elem is None:
                # No artist returned for this id: skip it instead of crashing
                continue
            kept_ids.append(artist_id)
            list_name.append(elem['name'])
            list_popularity.append(elem['popularity'])
            list_genres.append(elem['genres'])
        # Pause between batches to stay gentle with the API
        time.sleep(1)

    df_artists = pd.DataFrame(
        {'name': list_name, 'popularity': list_popularity, 'genres': list_genres},
        index=pd.Index(kept_ids, name='artist_id'),
    )

    return df_artists

# df_artists = get_df_list_artists(list_artist)

### Process
- Extract the songs of a playlist
- Extract artists
- Save the data in CSV files

In [None]:
# Spotify playlist IDs to import (edit this list by hand to add playlists)
list_playlist_id = [
    '4l1CEhc7ZPbaEtiPdCSGbl',
    '1mNmt0VqjbYH1WCHZDT9yL',
    '2oERu0baZbTfhoopJeYjq5',
    '4JnuqMuz4LrGUzIGGUjfdw',

    '37i9dQZF1DXd0Y4aXXQXWv',
    '37i9dQZF1DWSBZhfF4ZHr8',
    '37i9dQZF1DX9h4FeDa1xsw',
    '37i9dQZF1DWXRqgorJj26U',
    '37i9dQZF1DWXTHBOfJ8aI7',
    '37i9dQZF1DWWwzidNQX6jx',
    '37i9dQZF1DX1spT6G94GFC',
    '37i9dQZF1DX1rVvRgjX59F',
    '37i9dQZF1DX3oM43CtKnRV',
    '37i9dQZF1DWWzBc3TOlaAV',
    '37i9dQZF1DWSrqNVMcxGKc',
    '37i9dQZF1DX0AgrgHFR9aa',
    '37i9dQZF1DX186v583rmzp',
    '37i9dQZF1DWYmmr74INQlb',
    '37i9dQZF1DWXbttAJcbphz',
    '37i9dQZF1DX6VDO8a6cQME',
    '37i9dQZF1DWWOaP4H0w5b0',
    '37i9dQZF1DWWGI3DKkKGzJ',
    '37i9dQZF1DWWl7MndYYxge',
    '37i9dQZF1DWSvv6VnIb3i0',
    '37i9dQZF1DXacPj7eARo6k',
    '37i9dQZF1DX69KJk2S04Hp',
    '37i9dQZF1DX7LGssahBoms',
    '37i9dQZF1DX4WELsJtFZjZ',

    '7xh5J2B7Q07o1zMr61fY39',
    '6tyX6gCq27jSmnlPqTPMna',
    '6WgChJyRl2rwqy2GT5wqkf',
    '6KFgNNf7wMYZTUleaXWLGV',
    '64pqv88Pv17rg5wh1wqvBl',
    '3Nln7j2Sm8XCIlIoBRoUY2',
    '7gReNdpp6a1Cqcl3bpsSce',
    '1H4VHj0cxUCwJFzlELZvui',
    '7HteAXMTMj7Ogsl5ldIApo',

    '3NJrEK6haivxJ6nhvkS7iy',
    '4t5FzcxNPxMe6zhbVwcZwy',
]


# Get historical data
path_tracks_csv = os.path.join(FOLDER_LYRIZZ_CSV, 'df_tracks.csv')
path_artists_csv = os.path.join(FOLDER_LYRIZZ_CSV, 'df_artists.csv')
path_playlist_csv = os.path.join(FOLDER_LYRIZZ_CSV, 'df_playlist.csv')

df_tracks_tot = pd.read_csv(path_tracks_csv, sep=';', index_col=0)
df_artists_tot = pd.read_csv(path_artists_csv, sep=';', index_col=0)
df_playlist_tot = pd.read_csv(path_playlist_csv, sep=';', index_col=0)


for playlist_id in list_playlist_id:

    if playlist_id in df_playlist_tot['playlist_id'].values:
        # Playlist already added
        continue

    df_tracks, list_artist = get_info_playlist(playlist_id)
    df_artists = get_df_list_artists(list_artist)

    print(playlist_id, f'{len(df_tracks)} tracks' , f'{len(df_artists)} artists')

    # Save on CSV after every playlist, so that an interrupted run keeps what was already fetched.
    # pd.concat replaces DataFrame.append, which no longer exists in pandas >= 2.0.
    # The in-memory frame already mirrors the file, so it is not read again here.
    df_playlist_tot = pd.concat([df_playlist_tot, pd.DataFrame({'playlist_id': [playlist_id]})],
                                ignore_index=True)
    df_playlist_tot.to_csv(path_playlist_csv, sep=';')

    # Artists and tracks already known keep their first (historical) row
    df_artists_tot = pd.concat([df_artists_tot, df_artists])
    df_artists_tot = df_artists_tot[~df_artists_tot.index.duplicated(keep='first')]
    df_artists_tot.to_csv(path_artists_csv, sep=';')

    df_tracks_tot = pd.concat([df_tracks_tot, df_tracks])
    df_tracks_tot = df_tracks_tot[~df_tracks_tot.index.duplicated(keep='first')]
    df_tracks_tot.to_csv(path_tracks_csv, sep=';')