# 0. Setting up the environment

In [None]:
import requests
import json
from bs4 import BeautifulSoup
import pandas as pd
from pandas.io.json import json_normalize
import re
import math
import numpy as np

# 1. Web scraping: Getting artists from Discogs.com

We want to get a list of 'n' artists by searching the most popular albums by style on discogs.com. By changing the style in the link, we can apply the same process to other music styles.
The style is encoded in the link as '&style_exact=Sample+Style'. By concatenating several of these parameters, we can select multiple styles.

Using 'Rock' genre to find results:

In [None]:
url = 'https://www.discogs.com/search/?sort=have%2Cdesc&genre_exact=Rock&ev=gs_mc&type=master&page=1'

Iterating over page 1 of the website

In [None]:
html = requests.get(url).content

In [None]:
soup = BeautifulSoup(html, 'lxml')


We will extract the artist name for each record stored in the page. The tag for the artist is named 'h5'.

In [None]:
tag = ['h5']

In [None]:
#
artists = [element.text for element in soup.find_all(tag)]
print(artists[:-5]) # last five elements are not useful for us

We will extract all the genres:



In [260]:
tag = 'span'

In [264]:
genres = [element.text for element in soup.find_all(tag, class_="facet_name")]

genres[:10]


['Pop',
 'Electronic',
 'Folk, World, & Country',
 'Funk / Soul',
 'Blues',
 'Pop',
 'Electronic',
 'Folk, World, & Country',
 'Funk / Soul',
 'Blues']

Now we find a problem. The site is mainly organized by GENRES or STYLES (sub-genres). In order to properly use one of them, we'll need to extract the page link instead of the genre/style name, because each one has its own path.

We create a function to clean up the string. Luckily for us, these strings come pretty clean.

In [None]:
a = '\n\nThe String (5)*'

In [None]:
def clean_artist(string):
    r'''
    Clean up an artist name scraped from the page.

    Removes, in this order:
        - a numbered suffix in parentheses preceded by whitespace,
          such as ' (5)' or ' (12)' (any number of digits)
        - the double skipline '\n\n'
        - stars '*'
        - any whitespace left at the start or the end of the name

    Takes 1 argument:

        string = type str: Raw text of the artist element.

    Returns the cleaned name as a string.
    '''

    # Raw string so the regex escapes are not treated as (invalid) string escapes;
    # '\d+' so that multi-digit suffixes are removed as well.
    without_suffix = re.sub(r'\s\(\d+\)', '', string)

    return without_suffix.replace('\n\n', '').replace('*', '').strip()

In [None]:
# `artists` holds the raw <h5> texts scraped above; its last five elements are not artist names.
clean_text = [clean_artist(elem) for elem in artists[:-5]]
clean_text


We can clearly see that there are some elements which are repeated, so to find 'n' artists we'll iterate over the webpage until our list meets the requirements.

In [None]:
def artist_scrape(genre='Rock', n=100):
    '''
    This function scrapes the Discogs.com search page to collect artist names,
    sorted by popularity, without repetitions.

    Takes 2 arguments:

        genre='string': Selects the genre you're interested in. It is URL-encoded
                        by requests, so names such as 'Folk, World, & Country' are safe.

        n='int': Indicates the maximum length of the list returned.

    Returns a list with up to 'n' unique artist names, in order of appearance.
    The list can be shorter than 'n' if a page comes back without any entries
    (for example when the results run out); in that case the scraping stops
    instead of requesting pages forever.
    '''

    artists = []
    seen = set()  # O(1) duplicate check; 'artists' keeps the order
    page = 0

    while len(artists) < n:

        page += 1

        # Same query string as the hand-built link, but encoded by requests
        params = {
            'sort': 'have,desc',
            'genre_exact': genre,
            'ev': 'gs_mc',
            'type': 'master',
            'page': page,
        }
        html = requests.get('https://www.discogs.com/search/', params=params).content
        soup = BeautifulSoup(html, 'lxml')

        # The last five 'h5' elements of the page are not artist names
        text = [element.text for element in soup.find_all('h5')]
        clean_text = [clean_artist(elem) for elem in text[:-5]]

        if not clean_text:
            # Nothing to read on this page: stop to avoid an endless loop
            break

        for elem in clean_text:
            if len(artists) >= n:
                break
            if elem not in seen:
                seen.add(elem)
                artists.append(elem)

    return artists

In [None]:
bands = artist_scrape(genre='Rock', n=10)

In [None]:
len(bands)

# 2.  Songkick API Querying

In [None]:
# NOTE(review): the Songkick API key is hard-coded here; consider loading it from an
# environment variable or a config file that is kept out of version control.
apikey = 'PxrJ1AnxJlC6uT7i'

Now that we have the bands list, we want to find out how many concerts they made, when and where. Because of the structure of this API and the data we collected, we'll have to break this step in two parts:

### 2.1 Artist ID gathering

In order to locate an artist we're interested in, we'll have to find the unique ID of this artist in a separate part of the Songkick API.
https://www.songkick.com/developer/artist-search

In [None]:
id_url = 'https://api.songkick.com/api/3.0/search/artists.json?apikey={your_api_key}&query={artist_name}'

We'll test our code with just one band: Pink Floyd

The first thing we find out is that, in order to correctly access the band name, we'll have to replace all whitespace with '_' (underscores). So we're making a function for it:

In [None]:
def underscore_artist(artist):
    '''
    Return the artist name with every space swapped for an underscore,
    e.g. 'Pink Floyd' -> 'Pink_Floyd'.
    '''
    words = artist.split(' ')
    return '_'.join(words)


In [None]:
underscore_artist('Pink Floyd')

In [None]:
# FIND ARTIST ID:

band_name = 'Pink_Floyd'

pinkfloyd_url = 'https://api.songkick.com/api/3.0/search/artists.json?apikey=' + apikey + '&query=Pink_Floyd'

In [None]:
# Doing the request to the api
response = requests.get(pinkfloyd_url)
results = response.json()

After examining the JSON dictionary, we find that the ID key is stored in the following path:

In [None]:
# Extract the ID of the first artist listed in the parsed JSON response
artist_id = results['resultsPage']['results']['artist'][0]['id']

artist_id

Similarly to the Discogs.com process, we'll make a function that returns all the Artists IDs (in the same order)

In [252]:
def songkick_artist_id(artist_list, apikey=apikey):
    '''
    This function looks up the Songkick ID of every artist in a given list,
    taking the first result returned by the artist search for each name.

    Takes two arguments:

        artist_list = type lst: List of artist names to be found. If the input is a single
                                string, it is wrapped into a one-element list.

        apikey = type str: Your Songkick API key

    Returns a list with one tuple per artist that was found, in the same order
    as the input:

        tuple[0] = type str: Songkick ID
        tuple[1] = type str: Artist name

    Artists without any search result are left out of the returned list and
    reported in a message printed at the end.
    '''

    if isinstance(artist_list, str):
        artist_list = [artist_list]

    IDs = []
    not_found = []

    for artist in artist_list:

        # Let requests encode the query, so names with '&', '/', etc. don't break the URL
        response = requests.get(
            'https://api.songkick.com/api/3.0/search/artists.json',
            params={'apikey': apikey, 'query': underscore_artist(artist)},
        )
        results = response.json()

        try:
            songkick_id = results['resultsPage']['results']['artist'][0]['id']
        except (KeyError, IndexError, TypeError):
            # The expected path is missing from the response: no usable match
            not_found.append(artist)
        else:
            # Only report success when an ID was actually read for this artist
            IDs.append((str(songkick_id), str(artist)))
            print(f'Found the artist {artist}, with Songkick ID: {songkick_id}')

    if not_found:
        print(f'The following artist couldnt be found: {not_found}')

    return IDs
    
    

In [247]:
bands = songkick_artist_id(artist_list = 'Pink Floyd')

Found the artist Pink Floyd, with ID: 400904


We want to store both the IDs and the artist names because, as we found out later, when an artist plays at a festival the artist ID reported by the API appears to be somewhat randomized.

### 2.2 Getting data from the API

Now that we have the artist name and the artist ID, we can look for the additional data we're interested in.
We want to find:
    - The concert name
    - The concert date
    - The concert time
    - The city where the concert was done
    - The venue where the concert was done
    - The coordinates of that venue
    - If the artist was headlining the concert (billing)
    

https://www.songkick.com/developer/past-events-for-artist

All the data we're looking for is located at this API link; we'll have to explore the JSON object to find it:

https://api.songkick.com/api/3.0/artists/{artist_id}/gigography.json?apikey={your_api_key}

Again, we'll use Pink Floyd and its ID for testing:

In [None]:
pinkfloyd_id = songkick_artist_id(['Pink Floyd'])[0][0]
pinkfloyd_id

In [None]:
pinkfloyd_url = 'https://api.songkick.com/api/3.0/artists/'+pinkfloyd_id +'/gigography.json?apikey=' + apikey
pinkfloyd_url

In [None]:
response = requests.get(pinkfloyd_url)
response

Works!

In [None]:
result = response.json()
result

### 2.2.1 Finding the concert name

In [None]:
result['resultsPage']['results']['event'][0]['displayName']

How many results are on the page?

In [None]:
len(result['resultsPage']['results']['event'])

### 2.2.2 Finding the concert date

In [None]:
result['resultsPage']['results']['event'][0]['start']['date']

### 2.2.3 Finding the concert time

In [None]:
result['resultsPage']['results']['event'][0]['start']['time'] 

# In this particular case we have no info, so the result is None.


### 2.2.4 Finding the city where the concert was done

In [None]:
result['resultsPage']['results']['event'][0]['location']['city']

### 2.2.5 Finding the venue name where the concert was played



In [None]:
result['resultsPage']['results']['event'][0]['venue']['displayName']

### 2.2.6 Finding the coordinates of that venue

In [None]:
result['resultsPage']['results']['event'][0]['venue']['lat'] # This particular show has no info

In [None]:
result['resultsPage']['results']['event'][0]['venue']['lng'] # This particular show has no info

### 2.2.7 Finding out if the artist was headlining the concert ( billing )

In [None]:
result['resultsPage']['results']['event'][0]['performance'][0]['billing']

### 2.2.8 Getting the artist's name
We do this so that the artist can be assigned directly to each row as the data is read.

In [None]:
result['resultsPage']['results']['event'][0]['performance'][0]['displayName']

## 2.3 Defining a function for our data rows:
We find that pagination is controlled by an argument in the link: '&page=%s'. Each page holds 50 entries.

In [None]:
entries = result['resultsPage']['totalEntries']

elm_per_page = result['resultsPage']['perPage']

pages = entries / elm_per_page

# math.ceil is the actual "round up": round(pages) + 1 overshoots by one page whenever
# the fractional part is 0.5 or more, or when the division is exact.
print(
    f"The number of pages on each ID will be equal to its entries ({entries}), \n"
    f"divided by the elements by page ({elm_per_page}). \n"
    f"For example, {entries}/{elm_per_page}={pages} pages. \n"
    f"Since the number of pages is not an integer, we'll round up this number to iter on the elements that are left. \n"
    f"If the pages are {pages}, we'll iter over {math.ceil(pages)} pages"
)
      
      
      

Now, we're making a function to collect the data of each show for every artist id we pass on the list #artist_id

In [None]:
def _event_row(band, b_id, event):
    '''
    Build one data row (a list of 10 values) from a single event dictionary.
    '''
    # NOTE(review): the billing is read from the first performance of the event, which
    # may not be the requested artist on multi-artist events -- confirm against the API.
    performances = event.get('performance') or []
    bill = performances[0]['billing'] if performances else None

    start = event['start']
    venue = event['venue']

    return [
        band,
        b_id,
        bill,
        event['displayName'],
        start['date'],
        start['time'],
        event['location']['city'],
        venue['displayName'],
        venue['lat'],
        venue['lng'],
    ]


def collect_data(artist_id_list, apikey='PxrJ1AnxJlC6uT7i'):
    '''
    This function collects every show listed in the Songkick gigography of each artist.

    Takes two arguments:

        artist_id_list = type lst: List of (songkick_id, artist_name) pairs.

        apikey = type str: Your Songkick API key

    Returns a list of rows, one per show, each one a list with:

        [artist name, artist id, billing, show name, date, time, city, venue, lat, lng]

    An artist without any listed show contributes no rows.
    '''

    artist_data = []

    for artist in artist_id_list:

        b_id = artist[0]
        band = artist[1]

        url = 'https://api.songkick.com/api/3.0/artists/' + str(b_id) + '/gigography.json'

        page = 1
        total_pages = 1  # real value is read from the first response
        entry_no = 0     # running count of the entries fetched for this artist

        while page <= total_pages:

            response = requests.get(url, params={'apikey': apikey, 'page': page})
            results_page = response.json()['resultsPage']

            # Every page reports the totals, so no extra request is needed to count pages
            total_pages = math.ceil(results_page['totalEntries'] / results_page['perPage'])

            # 'event' is looked up with a default so an artist without shows yields no rows
            for event in results_page['results'].get('event', []):

                print(f'Fetching artist {band}... entry no.{entry_no}', end='\r')

                artist_data.append(_event_row(band, b_id, event))
                entry_no += 1

            page += 1

    return artist_data


In [None]:
data = collect_data(bands)

## Bringing it all together

Our goal is to make a dataframe with the gathered data, to be able to study it.
We'll define a function with the scripts we used through the process.
This function is called get_shows_info():

In [216]:
bands= 'Metallica'

In [253]:
def get_shows_info(n=10, genre='Rock', apikey='PxrJ1AnxJlC6uT7i', artists=None):
    '''
    This function returns an organized Dataframe with one row per show found
    for each artist.

    Takes four arguments:

        n = type int: Number of artists to scrape when 'artists' is not given.

        genre = type str: Genre used for the scraping when 'artists' is not given.

        apikey = type str: Your Songkick API key, used for every API request.

        artists = type lst or str: Artist name(s) to look up. If None, the
                                   artists are obtained with artist_scrape().

    The Dataframe has the following columns:

        'artist', 'band_id', 'headline', 'show_name', 'date', 'time',
        'city', 'venue', 'lat', 'lng'
    '''

    if artists is None:
        artists = artist_scrape(n=n, genre=genre)

    # Forward the key so that a caller-supplied 'apikey' is actually used
    ids = songkick_artist_id(artists, apikey=apikey)

    data = collect_data(ids, apikey=apikey)

    df = pd.DataFrame(data, columns=['artist', 'band_id', 'headline', 'show_name', 'date', 'time',
                                     'city', 'venue', 'lat', 'lng'])

    print('\n-Done-')

    return df

In [254]:
data = get_shows_info(artists='Metallica')

Found the artist Metallica, with Songkick ID: 331163
Fetching artist Metallica... entry no.2036
-Done-


In [259]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2037 entries, 0 to 2036
Data columns (total 10 columns):
artist       2037 non-null object
band_id      2037 non-null object
headline     2037 non-null object
show_name    2037 non-null object
date         2037 non-null object
time         284 non-null object
city         2037 non-null object
venue        2037 non-null object
lat          1620 non-null float64
lng          1620 non-null float64
dtypes: float64(2), object(8)
memory usage: 159.3+ KB
