# Scraping overview

### 1. Growing/gathering the related artist network using the Spotify API

### 2. Fetching a list of available songs for each artist from Genius API

### 3. Scraping the lyrics for each song

### 4. Additionally fetching the release date for each song from Genius API




In [None]:
import requests
import json
import pandas as pd
import glob
import numpy as np
import time
import concurrent.futures
import matplotlib.pyplot as plt
import lyricsgenius as lg

# Variables

In [None]:
# Spotify API
API_artists = 'https://api.spotify.com/v1/artists/'
# The Spotify search endpoint gets its own name; it used to be bound to
# `API_search` and was immediately overwritten by the Genius endpoint below.
API_spotify_search = 'https://api.spotify.com/v1/search/'
# Genius API (`API_search` is the Genius search endpoint)
API_base = 'https://api.genius.com'
API_search = API_base + '/search?q='


# Local directory path names
url_rel_done = '../Data/Spotify_rel_done.txt'
url_rel_links = '../Data/Spotify_related_link_list.csv'
# Data for artist network
url_artist = '../Data/Related artist network data/artists.csv'
url_edges = '../Data/Related artist network data/edges.csv'
url_genres = '../Data/Related artist network data/genres.txt'

url_genius_songs = '../Data/Lyrics data/lyrics.csv'
url_genius_relase_date = '../Data/Lyrics data/song_release_date.csv'

url_spotify_token = '../Data/authorization_code_spotify.txt'
url_genius_token = '../Data/authorization_code_genius.txt'

# For authentication
# The token files are closed right after reading, and surrounding whitespace
# (e.g. a trailing newline) is stripped so the header value is exactly the token.
with open(url_spotify_token) as token_file:
    spotify_token = token_file.read().strip()
spotify_headers = {"Authorization": "Bearer " + spotify_token}

# Token
with open(url_genius_token) as token_file:
    genius_token = token_file.read().strip()
genius_headers = {"Authorization": "Bearer " + genius_token}

# Seed for related artists crawler
Kanye_ID = '5K4W6rqBFWDnAN6FQUkS6x'

# other data
default_columns = ['name','id','followers','popularity','genres']
final_columns = ['name','id','followers','popularity']

# Empty container for final dataframe (same columns as `default_columns`)
empty_df = pd.DataFrame(columns = default_columns)


# Helper functions

## General functions
- **execute_query:** Takes in a query and sends a request using the matching header defined in the variables section
- **concurrent_requests:** asynchronously calls the given function over the given iterable and returns the results as a list of futures


## Spotify functions
- **get_raw_related_artist_responses:** Takes in a Spotify artist ID, returns a JSON response with related artists
- **concurrent_related_artist_requests:** asynchronously calls <code>get_raw_related_artist_responses</code> and returns a list of futures
- **process_raw_related_artist_responses:** Takes in the response from <code>get_raw_related_artist_responses</code> and outputs three objects:
    - <code>artist_df</code>: a pandas dataframe with the variables of interest about each artist
    - <code>genre_dict</code>: a dictionary with a single key, which is the artist ID, and whose value is the list of the artist's genres
    - <code>new_edges</code>: a pandas dataframe with a <code>from</code> column for the ID of the artist and a <code>to</code> column with the related artist from the response





In [None]:
# General functions
def execute_query(query,return_string = False):
    """Send a GET request to `query` and return the decoded JSON answer.

    The Spotify authorization header is used when the host part of the URL
    contains "spotify"; every other URL is sent with the Genius header.

    Parameters
    ----------
    query : str
        Full request URL.
    return_string : bool, default False
        When True, return the JSON answer pretty-printed as a string
        instead of the decoded object.

    Returns
    -------
    The decoded JSON body, or its indented string form.
    """
    # Only look at the host: the rest of the URL can hold free text (for
    # instance a Genius search for an artist whose name contains "spotify"),
    # which must not switch the request over to the Spotify token.
    host = query.split('://', 1)[-1]
    for separator in '/?#':
        host = host.split(separator, 1)[0]

    if 'spotify' in host.lower():
        headers = spotify_headers
    else:
        headers = genius_headers

    response = requests.get(url = query,headers=headers).json()
    if return_string:
        return json.dumps(response, indent=2)
    return response

def concurrent_requests(iterable, function, max_workers=100):
    """Call `function` once per element of `iterable` on a thread pool.

    Parameters
    ----------
    iterable : iterable
        Arguments; each element is passed as the single argument of `function`.
    function : callable
        Callable taking one argument.
    max_workers : int, default 100
        Size of the thread pool (previously hard-coded to 100).

    Returns
    -------
    list of concurrent.futures.Future
        One future per element, in the order of `iterable`. The pool is shut
        down before returning, so every future has already completed; call
        `.result()` to get the value or re-raise the exception.
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Leaving the `with` block waits for all submitted calls to finish.
        future_list = [executor.submit(function, item) for item in iterable]

    return future_list

In [None]:
# Spotify functions
def process_raw_song_responses(response,artist_name):
    """Turn a raw search response into a dataframe of one artist's songs.

    Reads `response['response']['hits']`, expands each hit's `result` entry
    into a row, replaces the `primary_artist` entry by its `name`, and keeps
    only the rows whose primary artist equals `artist_name`. An empty hit
    list yields an empty dataframe.
    """
    # NOTE(review): a function with this same name and body is defined again
    # in the Genius section further down; the crawler calls
    # `process_raw_related_artist_responses`, which is not defined in the
    # visible part of this notebook -- confirm whether that function was
    # meant to live here.
    raw_hits = pd.DataFrame(response['response']['hits'])
    if not len(raw_hits):
        return raw_hits

    # one row per hit, one column per field of the hit's `result` dict
    song_table = pd.DataFrame(raw_hits.result.tolist())
    artist_names = song_table.loc[:,'primary_artist'].apply(lambda artist : artist['name'])
    song_table.loc[:,'primary_artist'] = artist_names

    by_requested_artist = song_table.primary_artist == artist_name
    return song_table[by_requested_artist]

def get_raw_related_artist_responses(ID):
    """Query the related-artists endpoint for artist `ID` and return the reply.

    The URL is `API_artists` followed by `<ID>/related-artists`; the request
    itself is delegated to `execute_query`.
    """
    related_url = API_artists + f'{ID}/related-artists'
    return execute_query(related_url)


def concurrent_related_artist_requests(id_list):
    """Request the related artists of every ID in `id_list` concurrently.

    Returns the list of futures produced by `concurrent_requests`, one per ID.
    """
    related_futures = concurrent_requests(id_list, get_raw_related_artist_responses)
    return related_futures

## Spotify: Getters and setters

- Artist dataframe
- Edge dataframe
- Genre dictionary


In [None]:
# Spotify getters & setters
# For artist dataframe
def save_current_artist_dataframe(df):
    """Write the artist dataframe `df` as CSV to the path in `url_artist`."""
    df.to_csv(path_or_buf=url_artist)

def load_current_artist_dataframe():
    """Read the artist CSV at `url_artist`, using its first column as index."""
    artist_table = pd.read_csv(url_artist, index_col=0)
    return artist_table

# For edges dataframe
def save_current_edge_dataframe(df):
    """Renumber the rows of `df` from 0 and write it as CSV to `url_edges`.

    The index of the passed dataframe is replaced in place.
    """
    row_count = len(df)
    df.index = np.arange(row_count)
    df.to_csv(path_or_buf=url_edges)

def load_current_edge_dataframe():
    return pd.read_csv(url_edges,index_col=0)

# For the genre dictionary
def save_current_genre_dict(dictionary):
    """Serialize `dictionary` as JSON and write it to the path in `url_genres`.

    Raises whatever `json.dumps` raises for values that are not
    JSON-serializable; in that case the existing file is left untouched.
    """
    # Serialize before opening: opening with 'w' truncates the file, so a
    # serialization error after that point would wipe the previous save.
    serialized = json.dumps(dictionary)
    with open(url_genres,'w') as file_object:
        file_object.write(serialized)

def load_current_genre_dict():
    """Read the JSON file at `url_genres` and return the decoded object."""
    # The context manager closes the file handle, which used to be left open.
    with open(url_genres) as file_object:
        return json.load(file_object)

# aggregate functions for all three
def save_all(artists,genre_dict,edges):
    """Persist the three crawler objects: genres first, then edges, then artists."""
    save_current_genre_dict(dictionary=genre_dict)
    save_current_edge_dataframe(df=edges)
    save_current_artist_dataframe(df=artists)

def load_all():
    """Reload the saved crawler state.

    Returns a tuple `(artists, genre_dict, edges)`.
    """
    saved_genres = load_current_genre_dict()
    saved_edges = load_current_edge_dataframe()
    saved_artists = load_current_artist_dataframe()
    return saved_artists, saved_genres, saved_edges

# Spotify related artists crawler

## Settings
- <code>batch_size</code>: Number of artists to query asynchronously in each iteration
- <code>stopping_size</code>: Number of artists required before stopping

## Crawler description
**For each iteration:**

0. (Unless being run for the first time) Reload previously saved artists
1. Create a list of unsearched artists
2. Select a batch from the unsearched artists, selecting first the ones with the highest popularity score
3. Call the asynchronous function to get a batch of unprocessed responses
4. For each response:
    1. Process response using <code>process_raw_related_artist_responses</code>
    2. Handle if the response was empty. Only mark as searched if artist had no related artist, not if the API returned an error
    3. Add the new artist to the completed data objects
5. Save data objects after every batch

In [None]:
artists,genre_dict,edges = load_all()

batch_size = 100
stopping_size = 100_000

while len(artists) < stopping_size:

    # most popular (then most followed) unsearched artists are expanded first
    artists = artists.sort_values(['popularity','followers'],ascending=False)
    unsearched_ids = artists.loc[artists.searched == False].index.values
    if len(unsearched_ids) == 0:
        # Nothing left to expand: without this guard the loop would spin
        # forever on empty batches while staying below `stopping_size`.
        print('no unsearched artists left')
        break
    current_batch = unsearched_ids[:batch_size].copy()

    # Make the requests concurrently
    future_list = concurrent_related_artist_requests(current_batch)

    # then process the requests
    results = [x.result() for x in future_list]
    # Set when the API answered with an error. The inner `break` only leaves
    # the `for` loop, so this flag is what actually stops the crawl instead of
    # re-requesting the same batch over and over.
    api_failed = False
    for ID,response in zip(current_batch,results):

        tmp_artist, tmp_genre_dict, tmp_new_edges = process_raw_related_artist_responses(response,ID)
        if len(tmp_new_edges) == 0:
            print(tmp_genre_dict)
            # `.get` so that a reply without an 'error' entry is reported as
            # unexplained instead of raising KeyError
            error = tmp_genre_dict.get('error')
            if error == 'no relation':
                # end of the road, this artist has no related artists
                artists.at[ID,'searched'] = True
                continue
            if error != 'problem':
                print('unexplained error')
            # the artist stays unsearched so that a later run retries it
            api_failed = True
            break
        # append new ids to the artist dataframe
        for i in tmp_artist.index.values:
            if i not in artists.index:
                # create the row first, then fill it from the response
                artists.loc[i,:] = None
                artists.loc[i,:] = tmp_artist.loc[i,:]

        # Add new genres to genre_dict
        genre_dict.update(tmp_genre_dict)

        # add new edges to the edge list
        edges = pd.concat([edges,tmp_new_edges],ignore_index=True)

        artists.at[ID,'searched'] = True

    # progress made before a failure is still saved
    save_all(artists,genre_dict,edges)
    print(len(unsearched_ids),len(artists),len(edges))
    if api_failed:
        break
    

# 2. Fetching a list of available songs for each artist from Genius API

### Helper functions

- <code>get_raw_artist_songs</code> fetch songs for a single artist
- <code>concurrent_get_songs_requests</code> asynchronously call a batch of artist song requests
- <code>process_raw_song_responses</code> process the requests

In [None]:
def get_raw_artist_songs(artist_name):
    """Run a search for `artist_name` against `API_search` and return the reply.

    The name is percent-encoded before being appended to the URL, so names
    containing characters such as `&`, `#`, `+` or `?` are searched literally
    instead of being read as URL syntax.
    """
    from urllib.parse import quote

    q = API_search + quote(artist_name, safe='')
    response = execute_query(q)
    return response
   
def concurrent_get_songs_requests(artist_names):
    """Run `get_raw_artist_songs` for every name in `artist_names` concurrently.

    Returns the list of futures produced by `concurrent_requests`.
    """
    song_futures = concurrent_requests(artist_names, get_raw_artist_songs)
    return song_futures

def process_raw_song_responses(response,artist_name):
    """Build a dataframe of the songs in `response` whose primary artist is `artist_name`.

    `response['response']['hits']` is read; every hit's `result` dict becomes
    one row and its `primary_artist` entry is reduced to the artist's `name`.
    When there are no hits the (empty) hits dataframe is returned as is.
    """
    # load the hits of the response into a dataframe
    payload = response['response']
    hits_frame = pd.DataFrame(payload['hits'])
    if len(hits_frame) < 1:
        return hits_frame

    # expand the per-hit `result` dicts into columns
    result_dicts = list(hits_frame.result.values)
    results_frame = pd.DataFrame(result_dicts)

    # keep only the name of the primary artist, then filter on it
    results_frame.loc[:,'primary_artist'] = results_frame.loc[:,'primary_artist'].apply(
        lambda artist_info : artist_info['name']
    )
    return results_frame[results_frame.primary_artist == artist_name]


## Genius: Getters and setters
- Song list dataframe


In [None]:
# Genius
def load_songs():
    """Read the song CSV at `url_genius_songs` and renumber its rows from 0."""
    song_table = pd.read_csv(url_genius_songs, index_col=0)
    row_count = len(song_table)
    song_table.index = np.arange(row_count)
    return song_table

def save_songs(dataframe):
    """Write the song dataframe as CSV to the path in `url_genius_songs`."""
    dataframe.to_csv(path_or_buf=url_genius_songs)

# Fetching and saving songs from Genius

1. (Unless being run for the first time) Reload previously saved list of songs
2. Create the list of artists which we want to fetch songs for (~4000 artists)
    - *See explainer notebook for more details on why this was done*
3. Create a running list of artists that have already been queried
4. Iteratively do the following
    - Select a batch of un-searched artists
    - Call <code>concurrent_get_songs_requests</code> for these artists to asynchronously get the song data for each artist
    - Process the batch of responses, making sure to include at least the following variables of interest:
        - <code>api_path</code>: For later querying of release date from API
        - <code>path</code>: For later scraping of lyrics from website
        - <code>title</code>: for identification of unique songs
        - <code>primary_artist</code>: for unique identification of songs, aggregation by artist and connection to network


In [None]:
songs = load_songs()

# Take the 3000 highest-ranked artists by each metric, then keep every artist
# that appears in at least one of the two rankings.
top_popularity = artists.sort_values('popularity',ascending=False).head(3000)
top_followers = artists.sort_values('followers',ascending=False).head(3000)
combined_index = top_popularity.index.union(top_followers.index)
top_both = artists.loc[combined_index]

In [None]:
# Artists that already have at least one saved song are not queried again
already_searched = list(songs.primary_artist.unique())
batch_size = 205
new_added = 0
while True:
    # Loop until every artist has been queried. Stopping on "no songs in this
    # batch" would end the run early on a batch without hits, and an empty
    # batch would make `pd.concat` raise on an empty list.
    remaining = top_both.loc[~top_both.name.isin(already_searched)].name.values
    if len(remaining) == 0:
        break
    batch = remaining[:batch_size]
    print('starting concurrent')
    song_response_future_list = concurrent_get_songs_requests(batch)
    print('extracting results')
    song_response_results = [x.result() for x in song_response_future_list]
    print('processing')
    processed_responses = [process_raw_song_responses(response,artist_name) for response,artist_name in zip(song_response_results,batch)]
    print('saving')
    songs_before = len(songs)
    song_batch_df = pd.concat(processed_responses)
    # ignore_index keeps row labels unique, so later label-based assignments
    # on `songs` address exactly one row each
    songs = pd.concat([songs,song_batch_df],ignore_index=True)
    songs = songs.drop_duplicates(['primary_artist','title'])
    already_searched = already_searched + list(batch)
    # count what was actually added, i.e. after duplicates were dropped
    new_added = len(songs) - songs_before
    print(new_added, 'new songs, now', len(songs),' in total')
    save_songs(songs)


starting concurrent
extracting results
processing
saving
0 new songs, now 27436  in total


# 3. Scraping the lyrics for each song
1. Create an instance of a lyricsgenius object specifically designed to help with scraping from the genius website
    - **Note:** Due to [legal reasons](https://genius.com/discussions/277279-Get-the-lyrics-of-a-song) Genius does not offer direct API access to the lyrics for songs, but instead we can use this package, which has been made to directly scrape the lyrics from the site itself. 
2. Create a list of songs which have not been scraped
3. In batches of 1000 at a time, until no unsearched songs remain
    - Scrape the lyrics using lyricsgenius lyrics scraper from the website
    - Update and save the dataframe after each iteration  

# Genius lyrics: Helper functions
- <code>concurrent_get_songs_requests</code> asynchronously call the lyricsgenius objects built in function <code>lyrics</code> with the url to each song

In [None]:
def concurrent_get_songs_requests(urls):
    """Call `genius.lyrics` on every entry of `urls` concurrently.

    Returns the list of futures produced by `concurrent_requests`.
    """
    # NOTE(review): this rebinds the name of the earlier helper that fetches
    # an artist's songs; after this cell has run, the earlier cell must be
    # re-executed before the song fetch can be used again.
    def fetch_lyrics(song_url):
        # `genius` is looked up when the call runs, not when this is defined
        return genius.lyrics(song_url)

    return concurrent_requests(urls, fetch_lyrics)

In [None]:
genius = lg.Genius(genius_token, skip_non_songs=True, excluded_terms=["(Remix)", "(Live)"], remove_section_headers=True, verbose=False)

# Row labels must be unique for the label-based assignment below; `songs` can
# carry repeated labels when it was extended by concatenation in this session.
songs = songs.reset_index(drop=True)
# On the very first run the song list has no `lyrics` column yet
if 'lyrics' not in songs.columns:
    songs['lyrics'] = None

unsearched_urls = songs.loc[songs.lyrics.isna(),'url']
batch_size = 1000
while len(unsearched_urls) > 0:
    print(f'{len(unsearched_urls)} unsearched lyrics...')
    batch = unsearched_urls.iloc[:batch_size]
    lyrics_futures = concurrent_get_songs_requests(batch.values)
    lyrics_batch = pd.Series([x.result() for x in lyrics_futures])
    # A missing result is stored as '' so the song no longer counts as
    # unsearched in this run and the loop terminates.
    lyrics_batch.loc[lyrics_batch.isna()] = ''
    songs.loc[batch.index,'lyrics'] = lyrics_batch.values
    # save after every batch so an interruption loses at most one batch
    save_songs(songs)
    unsearched_urls = songs.loc[songs.lyrics.isna(),'url']

387 unsearched lyrics...


# 4. Additionally fetching the meta-data for each song from Genius API
- We're mainly interested in the release date here, but there is more data available here which could be used for more analysis

# Genius song meta-data fetch: helper functions

In [None]:
def concurrent_get_songs_metadata_requests(urls):
    """Query `API_base + path` for every path in `urls` concurrently.

    Returns the list of futures produced by `concurrent_requests`; each one
    resolves to what `execute_query` returns for that URL.
    """
    def fetch_metadata(api_path):
        return execute_query(API_base + api_path)

    return concurrent_requests(urls, fetch_metadata)

In [None]:
# Load the cached release dates, or build them from the API when no cache exists
try:
    relase_dates = pd.read_csv(url_genius_relase_date,index_col = 0)
except FileNotFoundError:
    # Only a missing cache file triggers the rebuild; any other error surfaces.
    print('No such file exists... creating from scratch.')
    futures = concurrent_get_songs_metadata_requests(songs.api_path.values)
    responses = [x.result() for x in futures]
    # Keep successful answers only: other replies are not guaranteed to carry
    # a 'response' -> 'song' entry.
    results = [r['response']['song'] for r in responses if r['meta']['status'] == 200]
    songs_metadata = pd.DataFrame(results)
    # (was read from the undefined name `tmp_songs_metadata`)
    relase_dates = songs_metadata.loc[:,['api_path','title','release_date','primary_artist']].copy()
    # reduce the primary-artist entry to the artist's name
    relase_dates['primary_artist'] = relase_dates.primary_artist.apply(lambda x : x['name'])
    relase_dates.to_csv(url_genius_relase_date)