## Introduction
In this notebook, we define a class **GetMeta()** to get all metadata of a song from Spotify, including:
* `name`
* `artists`
* `popularity`
* `genre`
* `release date`
* `features`: `popularity`, `danceability`, `energy`, `key`, `loudness`, `mode`, `speechiness`, `acousticness`, `instrumentalness`, `liveness`, `valence`, `tempo`, `duration_ms`.
* `lyrics`

**GetMeta()** has been adapted from:
* https://towardsdatascience.com/become-a-lyrical-genius-4362e7710e43?gi=d438aadd83e9
* https://github.com/christianlomboy/MIR-Genre-Predictor/blob/master/MIR_data_collection.ipynb

To get a song's popularity, release date, genres, and audio features, I added four methods to the class: **get_track_popularity()**, **get_track_dates()**, **get_track_genres()**, and **get_track_features()**.

In [None]:
# import libraries
import spotipy 
from spotipy.oauth2 import SpotifyClientCredentials
import requests
from bs4 import BeautifulSoup
import json
import re
import pandas as pd 
import os.path
from os import path

## Scrape Metadata

In [None]:
# adapted from https://towardsdatascience.com/become-a-lyrical-genius-4362e7710e43?gi=d438aadd83e9
# and https://github.com/christianlomboy/MIR-Genre-Predictor/blob/master/MIR_data_collection.ipynb

class GetMeta():
    """Collect Spotify metadata and Genius lyrics for the tracks of a playlist.

    Typical use is ``GetMeta(...).get_lyrics()``, which downloads the playlist,
    fills ``self.tracks``, ``self.track_names`` and ``self.track_artists`` and
    returns one lyrics string per track. The other ``get_track_*`` methods can
    be called afterwards; they all read ``self.tracks``.
    """

    # Seconds to wait for genius.com before giving up. Without a timeout
    # ``requests`` can block forever on a stalled connection.
    REQUEST_TIMEOUT = 30

    def __init__(self, spotify_client_id, spotify_client_secret, user_id, playlist_id, genius_key):
        """Store the Spotify credentials, the playlist to read and the Genius token."""
        self.spotify_client_id = spotify_client_id
        self.spotify_client_secret = spotify_client_secret
        self.user_id = user_id
        self.playlist_id = playlist_id
        self.genius_key = genius_key

    def _spotify(self):
        """Return a Spotify client for this instance, creating it on first use.

        The credentials manager is handed to the client instead of a one-off
        access token, so the client can obtain a fresh token by itself when a
        long run outlives the first one.
        """
        client = getattr(self, '_sp', None)
        if client is None:
            credentials = SpotifyClientCredentials(
                client_id=self.spotify_client_id,
                client_secret=self.spotify_client_secret,
            )
            client = spotipy.Spotify(client_credentials_manager=credentials)
            self._sp = client
        return client

    # get info for the playlist
    def get_playlist_info(self):
        """Download every page of the playlist and return its items.

        Sets ``self.tracks`` (all usable items) and ``self.playlist`` (the last
        page that was fetched).
        """
        sp = self._spotify()
        playlist = sp.user_playlist_tracks(self.user_id, self.playlist_id)
        tracks = list(playlist['items'])
        while playlist['next']:
            playlist = sp.next(playlist)
            tracks.extend(playlist['items'])
        # NOTE(review): Spotify can return items whose 'track' is null (e.g. a
        # track that is no longer available). They are dropped here so the
        # per-track getters below do not fail on them.
        self.tracks = [item for item in tracks if item.get('track')]
        self.playlist = playlist
        return self.tracks

    # get the name of a track
    def get_track_names(self):
        """Return (and store in ``self.track_names``) the name of every track."""
        self.track_names = [item['track']['name'] for item in self.tracks]
        return self.track_names

    # get artist(s) of a track
    def get_track_artists(self):
        """Return (and store in ``self.track_artists``) the first listed artist of every track."""
        self.track_artists = [item['track']['artists'][0]['name'] for item in self.tracks]
        return self.track_artists

    # get the popularity of a track
    def get_track_popularity(self):
        """Return (and store in ``self.track_popularity``) the popularity value of every track."""
        self.track_popularity = [item['track']['popularity'] for item in self.tracks]
        return self.track_popularity

    # get the release date of a track
    def get_track_dates(self):
        """Return (and store in ``self.track_dates``) the release date of every track's album.

        One Spotify album request is made per track.
        """
        sp = self._spotify()
        self.track_dates = [
            sp.album(item['track']['album']['id'])['release_date']
            for item in self.tracks
        ]
        return self.track_dates

    # get a genre of an artist
    def get_track_genres(self):
        """Return (and store in ``self.track_genres``) the genres of every track's first artist.

        One Spotify artist request is made per track; each entry is the
        ``genres`` value of that artist.
        """
        sp = self._spotify()
        self.track_genres = [
            sp.artist(item['track']['artists'][0]['id'])['genres']
            for item in self.tracks
        ]
        return self.track_genres

    # get all features of a track
    def get_track_features(self):
        """Return (and store in ``self.track_features``) the audio features of every track.

        ``audio_features`` is called once per track id and its result is kept
        as returned, so the shape of each entry is unchanged for code that
        already consumes this list.
        """
        sp = self._spotify()
        self.track_features = [
            sp.audio_features(tracks=item['track']['id'])
            for item in self.tracks
        ]
        return self.track_features

    ## the below functions are for getting lyrics from genius.com
    # get info of the song
    def request_song_info(self, track_name, track_artist):
        """Search Genius for ``track_name`` by ``track_artist``.

        Remembers the name and artist on the instance (``check_hits`` needs the
        artist) and returns the HTTP response, also stored in ``self.response``.
        """
        self.track_name = track_name
        self.track_artist = track_artist
        base_url = 'https://api.genius.com'
        headers = {'Authorization': 'Bearer ' + self.genius_key}
        search_url = base_url + '/search'
        query = {'q': track_name + ' ' + track_artist}
        # The search term belongs in the query string; a body on a GET request
        # is not something servers are required to read.
        self.response = requests.get(
            search_url, params=query, headers=headers, timeout=self.REQUEST_TIMEOUT
        )
        return self.response

    # check whether the song exists in the database
    def check_hits(self):
        """Return the first search hit whose primary artist contains the requested artist.

        The comparison is case-insensitive. Returns ``None`` (and stores it in
        ``self.remote_song_info``) when nothing matches or when the payload has
        no ``response``/``hits`` section, e.g. an error reply.
        """
        payload = self.response.json()
        hits = (payload.get('response') or {}).get('hits') or []
        wanted = self.track_artist.lower()
        self.remote_song_info = next(
            (hit for hit in hits
             if wanted in hit['result']['primary_artist']['name'].lower()),
            None,
        )
        return self.remote_song_info

    # get the url
    def get_url(self):
        """Return (and store in ``self.song_url``) the page URL of the matched hit."""
        self.song_url = self.remote_song_info['result']['url']
        return self.song_url

    # scrape the lyrics
    def scrape_lyrics(self):
        """Download ``self.song_url`` and return the lyrics text, or ``None`` if none is found.

        Two page layouts are recognised: a ``div`` with class ``lyrics`` and
        ``div`` elements whose class starts with ``Lyrics__Container``.
        """
        page = requests.get(self.song_url, timeout=self.REQUEST_TIMEOUT)
        html = BeautifulSoup(page.text, 'html.parser')
        legacy = html.find("div", class_="lyrics")
        if legacy is not None:
            return legacy.get_text()
        # The container class carries a generated suffix (previously pinned as
        # "Lyrics__Container-sc-1ynbvzw-2 jgQsqn"), so match on the stable
        # prefix instead. NOTE(review): a page may hold several containers;
        # all of them are read, in document order.
        containers = html.find_all("div", class_=re.compile(r"^Lyrics__Container"))
        if containers:
            return "\n".join(container.get_text() for container in containers)
        return None

    # get the lyrics
    def get_lyrics(self):
        """Fetch the playlist and return one lyrics string per track.

        Tracks without a Genius match, without lyrics on the page, or whose
        request failed get the string ``'None'`` so the result stays aligned
        with ``self.track_names``.
        """
        self.get_playlist_info()
        self.get_track_names()
        self.get_track_artists()
        song_lyrics = []
        for i, (name, artist) in enumerate(zip(self.track_names, self.track_artists)):
            print(f"Working on track {i}.")
            try:
                self.request_song_info(name, artist)
                lyrics = None
                if self.check_hits() is not None:
                    self.get_url()
                    lyrics = self.scrape_lyrics()
            except requests.RequestException as err:
                # One unreachable page must not throw away the lyrics that
                # were already collected for the other tracks.
                print(f"Track {i} could not be fetched: {err}")
                song_lyrics.append('None')
                continue
            if lyrics is None:
                lyrics = 'None'
                print(f"Track {i} is not in the Genius database.")
            else:
                print(f"Retrieved track {i} lyrics!")
            song_lyrics.append(lyrics)
        return song_lyrics

In [None]:
# client_id, client_secret, genius_key
# The '__' values are placeholders: replace them with your own credentials and
# playlist identifiers before running the cells below.
genius_key = '__'  # sent to api.genius.com as the Bearer token
spotify_client_id = '__'
spotify_client_secret = '__'
spotify_user_id = '__'  # passed together with the playlist id when the playlist is fetched
spotify_playlist_id = '__'
# `songs` is the GetMeta instance that every export cell below reads from.
songs = GetMeta(spotify_client_id, spotify_client_secret, spotify_user_id, spotify_playlist_id, genius_key)

In [None]:
# start to scrape all metadata by executing get_lyrics()
# get_lyrics() first downloads the playlist (filling songs.tracks, songs.track_names and
# songs.track_artists) and then returns one lyrics string per track; tracks without
# lyrics get the string 'None'. The export cells below rely on this cell having run.
song_lyrics = songs.get_lyrics()

## Export to CSV files

After scraping the metadata of every song in a Spotify playlist, we define helper functions that export each kind of metadata to its own CSV file. Merging the exported files below then gives a single dataset containing all the metadata of the songs.

### Names of the songs

In [None]:
# get and write names to a csv file
def get_write_song_names(filename, meta=None):
    """Write the track names to a one-column CSV file.

    Args:
        filename: Destination CSV path. An existing file gets the rows
            appended without a header (so re-running adds the rows again);
            otherwise the file is created with the header ``name``.
        meta: Object providing ``get_track_names()``. Defaults to the
            notebook-level ``songs`` instance.
    """
    source = songs if meta is None else meta
    df_names = pd.DataFrame({'name': source.get_track_names()})
    # to_csv does not create missing folders, so make sure the target exists.
    parent = path.dirname(filename)
    if parent:
        os.makedirs(parent, exist_ok=True)
    appending = path.exists(filename)
    df_names.to_csv(
        filename, index=False, mode='a' if appending else 'w', header=not appending
    )
    
# Export the names of the tracks already loaded into `songs`; appends if data/names.csv exists.
get_write_song_names('data/names.csv')

### Artist(s) of the songs

In [None]:
# get and write artists to a csv file
def get_write_artists(filename, meta=None):
    """Write the artist of each track to a one-column CSV file.

    Args:
        filename: Destination CSV path. An existing file gets the rows
            appended without a header (so re-running adds the rows again);
            otherwise the file is created with the header ``artists``.
        meta: Object providing ``get_track_artists()``. Defaults to the
            notebook-level ``songs`` instance.
    """
    source = songs if meta is None else meta
    df_artists = pd.DataFrame({'artists': source.get_track_artists()})
    # to_csv does not create missing folders, so make sure the target exists.
    parent = path.dirname(filename)
    if parent:
        os.makedirs(parent, exist_ok=True)
    appending = path.exists(filename)
    df_artists.to_csv(
        filename, index=False, mode='a' if appending else 'w', header=not appending
    )

# Export the first listed artist of each track in `songs`; appends if data/artists.csv exists.
get_write_artists('data/artists.csv')

### Popularity of the songs

In [None]:
# get and write popularity to a csv file
def get_write_popularity(filename, meta=None):
    """Write the popularity value of each track to a one-column CSV file.

    Args:
        filename: Destination CSV path. An existing file gets the rows
            appended without a header (so re-running adds the rows again);
            otherwise the file is created with the header ``popularity``.
        meta: Object providing ``get_track_popularity()``. Defaults to the
            notebook-level ``songs`` instance.
    """
    source = songs if meta is None else meta
    df_popularity = pd.DataFrame({'popularity': source.get_track_popularity()})
    # to_csv does not create missing folders, so make sure the target exists.
    parent = path.dirname(filename)
    if parent:
        os.makedirs(parent, exist_ok=True)
    appending = path.exists(filename)
    df_popularity.to_csv(
        filename, index=False, mode='a' if appending else 'w', header=not appending
    )

# Export the popularity of each track in `songs`; appends if data/popularity.csv exists.
get_write_popularity('data/popularity.csv')

### Release Dates of the songs

In [None]:
# get and write dates to a csv file
def get_write_release_date(filename, meta=None):
    """Write the release date of each track to a one-column CSV file.

    Args:
        filename: Destination CSV path. An existing file gets the rows
            appended without a header (so re-running adds the rows again);
            otherwise the file is created with the header ``release_date``.
        meta: Object providing ``get_track_dates()``. Defaults to the
            notebook-level ``songs`` instance.
    """
    source = songs if meta is None else meta
    df_dates = pd.DataFrame({'release_date': source.get_track_dates()})
    # to_csv does not create missing folders, so make sure the target exists.
    parent = path.dirname(filename)
    if parent:
        os.makedirs(parent, exist_ok=True)
    appending = path.exists(filename)
    df_dates.to_csv(
        filename, index=False, mode='a' if appending else 'w', header=not appending
    )

# Export the release dates; get_track_dates() makes one Spotify album request per track.
get_write_release_date('data/release_date.csv')

### Genre(s) of the artist

In [None]:
# get and write genres to a csv file
def get_write_genres(filename, meta=None):
    """Write the genres of each track's artist to a one-column CSV file.

    Each cell holds the genres value of one track as pandas renders it (the
    text form of the Python object).

    Args:
        filename: Destination CSV path. An existing file gets the rows
            appended without a header (so re-running adds the rows again);
            otherwise the file is created with the header ``genres``.
        meta: Object providing ``get_track_genres()``. Defaults to the
            notebook-level ``songs`` instance.
    """
    source = songs if meta is None else meta
    df_genres = pd.DataFrame({'genres': source.get_track_genres()})
    # to_csv does not create missing folders, so make sure the target exists.
    parent = path.dirname(filename)
    if parent:
        os.makedirs(parent, exist_ok=True)
    appending = path.exists(filename)
    df_genres.to_csv(
        filename, index=False, mode='a' if appending else 'w', header=not appending
    )
            
# Export the genres; get_track_genres() makes one Spotify artist request per track.
get_write_genres('data/genres.csv')

### Features of the songs

In [None]:
# get and write song features to a csv file
def get_write_song_features(filename, meta=None):
    """Write the audio features of each track to a CSV file, one column per feature.

    ``get_track_features()`` yields one entry per track, each wrapping a single
    feature dict. The dicts are unwrapped and turned into rows directly. A
    track without features becomes an empty row, and nothing is written when
    there are no tracks at all.

    Args:
        filename: Destination CSV path. An existing file gets the rows
            appended without a header (so re-running adds the rows again);
            otherwise the file is created with the feature names as header.
        meta: Object providing ``get_track_features()``. Defaults to the
            notebook-level ``songs`` instance.
    """
    source = songs if meta is None else meta
    rows = []
    for entry in source.get_track_features():
        features = entry[0] if entry else None
        # An empty dict keeps the row (all cells blank) so the file stays
        # aligned, row for row, with the other exported files.
        rows.append(features if features is not None else {})
    if not rows:
        return
    df_features = pd.DataFrame(rows)
    # to_csv does not create missing folders, so make sure the target exists.
    parent = path.dirname(filename)
    if parent:
        os.makedirs(parent, exist_ok=True)
    appending = path.exists(filename)
    df_features.to_csv(
        filename, index=False, mode='a' if appending else 'w', header=not appending
    )
        
# Export the audio features; get_track_features() makes one Spotify request per track.
get_write_song_features('data/features.csv')

### Lyrics of the songs

In [None]:
# get and write lyrics to a csv file
def get_write_lyrics(filename, lyrics=None):
    """Flatten each lyrics string to a single line and write them to a one-column CSV file.

    Blank lines become a space, remaining line breaks become ``'. '``, the
    special space characters U+2005 and U+205F become plain spaces, and double
    quotes become single quotes.

    Args:
        filename: Destination CSV path. An existing file gets the rows
            appended without a header (so re-running adds the rows again);
            otherwise the file is created with the header ``lyrics``.
        lyrics: Iterable of lyrics strings. Defaults to the notebook-level
            ``song_lyrics`` list.
    """
    raw_lyrics = song_lyrics if lyrics is None else lyrics
    # Order matters: double line breaks must be handled before single ones.
    replacements = (
        ('\n\n', ' '),
        ('\n', '. '),
        ('\u2005', ' '),
        ('\u205f', ' '),
        ('"', "'"),
    )
    song_lyrics_filtered = []
    for text in raw_lyrics:
        for old, new in replacements:
            text = text.replace(old, new)
        song_lyrics_filtered.append(text)
    df_lyrics = pd.DataFrame({'lyrics': song_lyrics_filtered})
    # to_csv does not create missing folders, so make sure the target exists.
    parent = path.dirname(filename)
    if parent:
        os.makedirs(parent, exist_ok=True)
    appending = path.exists(filename)
    df_lyrics.to_csv(
        filename, index=False, mode='a' if appending else 'w', header=not appending
    )

# Export the `song_lyrics` list produced by songs.get_lyrics() above; appends if data/lyrics.csv exists.
get_write_lyrics('data/lyrics.csv')