# 0. Setting up the environment

In [130]:
import requests
import json
from bs4 import BeautifulSoup
import pandas as pd
from pandas.io.json import json_normalize
import re
import math
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from pandas.plotting import register_matplotlib_converters

import seaborn as sns


# 1. Web scraping: Getting artists from Discogs.com

We want to get a list with 'n' artists, searching by most popular albums by style in discogs.com. By changing the style in the link, we can apply the same job on other music styles.
The style code in the link its noted as '&style_exact=Sample+Style'. By concatenating them, we can choose multiple styles.

In [2]:
url = 'https://www.discogs.com/search/?sort=have%2Cdesc&ev=gs_mc&type=master&page=1'

## Sample Rock URL ='https://www.discogs.com/search/?sort=have%2Cdesc&genre_exact=Rock&ev=gs_mc&type=master&page=1'

Iterating over page 1 of the website

In [3]:
html = requests.get(url).content

In [4]:
html = requests.get(url).content
soup = BeautifulSoup(html, 'lxml')


We will extract the artist name for each record stored in the page. The tag for the artist is named 'h5'.

In [5]:
tag = ['h5']

In [6]:
#
artists = [element.text for element in soup.find_all(tag)]
artists = artists[:-5]# last five elements of the list are not useful for us
artists[:5]

['\n\nPink Floyd',
 '\n\nThe Beatles',
 '\n\nThe Beatles',
 '\n\nPink Floyd',
 '\n\nMichael Jackson']

We'll create a function to clean up the string. Luckily for us, these strings come pretty clean.

In [7]:
def clean_artist(string):
    '''
    This function takes a string and erases the skipline '\n', parenthesis '()' and stars '*'
    '''
    
    new_str = re.sub('\s\(\d\)', '', string).replace('\n\n', '').replace('*', '')
    
    
    return new_str

In [8]:
artist_clean = [clean_artist(elem) for elem in artists]
artist_clean[:5]


['Pink Floyd', 'The Beatles', 'The Beatles', 'Pink Floyd', 'Michael Jackson']

We will extract all the genres and styles:



In [9]:
tag = 'span'

In [10]:
genres_styles = [element.text for element in soup.find_all(tag, class_="facet_name")]

genres_styles[:5]


['Rock', 'Electronic', 'Pop', 'Folk, World, & Country', 'Jazz']

We'll store this list for later use.

Now we find a problem. The sited is mainly organized by GENRES or STYLES (sub-genres). In order to properly use one of them, we'll need to extract the page link instead of the genre/style name, because each type has its own path:

In [11]:
tag = 'a'

In [12]:
genres_link = [element.get('href') for element in soup.find_all(tag, attrs={'href': re.compile("genre_exact=")})]

genres_link[:5]

['/search/?sort=have%2Cdesc&genre_exact=Rock&ev=gs_mc&type=master&page=1',
 '/search/?sort=have%2Cdesc&genre_exact=Electronic&ev=gs_mc&type=master&page=1',
 '/search/?sort=have%2Cdesc&genre_exact=Pop&ev=gs_mc&type=master&page=1',
 '/search/?sort=have%2Cdesc&genre_exact=Folk%2C+World%2C+%26+Country&ev=gs_mc&type=master&page=1',
 '/search/?sort=have%2Cdesc&genre_exact=Jazz&ev=gs_mc&type=master&page=1']

Given this strings, we can extract the exact **_'genre_exact=Sample+Genre'_** items:

In [13]:
genres_list = [str(re.findall('&genre_exact=\w+\+?\-?\w+', link))[2:-2] for link in genres_link ]

genres = list(set(genres_list))

genres[:5]

['&genre_exact=Blues',
 '&genre_exact=Folk',
 '&genre_exact=Latin',
 '&genre_exact=Jazz',
 '&genre_exact=Reggae']

Similarly, we can extract the **_'style_exact=Sample+Style'_** items:

In [14]:
styles_list = [element.get('href') for element in soup.find_all(tag, attrs={'href': re.compile("style_exact=")})]

styles_list[:5]

['/search/?sort=have%2Cdesc&style_exact=House&ev=gs_mc&type=master&page=1',
 '/search/?sort=have%2Cdesc&style_exact=Pop+Rock&ev=gs_mc&type=master&page=1',
 '/search/?sort=have%2Cdesc&style_exact=Punk&ev=gs_mc&type=master&page=1',
 '/search/?sort=have%2Cdesc&style_exact=Vocal&ev=gs_mc&type=master&page=1',
 '/search/?sort=have%2Cdesc&style_exact=Techno&ev=gs_mc&type=master&page=1']

In [15]:
styles = [str(re.findall('&style_exact=\w+\+?\w\+?\-?\w+', link))[2:-2] for link in styles_list]

styles[:5]


['&style_exact=House',
 '&style_exact=Pop+Rock',
 '&style_exact=Punk',
 '&style_exact=Vocal',
 '&style_exact=Techno']

We'll define a function that searches for a Genre or Style, and returns it's link piece, with the format ' &'GENRE/STYLE'_exact=ENTRY'
For that matter, we'll filter if the keyword belongs to Genre or Style, then we'll apply the link piece format.

In [16]:
def genre_link(string, genres=genres, styles=styles):
    '''
    This function takes as an input a genre or style name, and returns a piece of link depending on its category
    ''' 
    
    string = re.sub(' ', '+', string).title()
    link_piece = ''
    
    if ('&genre_exact=' + string) in genres:
        link_piece = ('&genre_exact=' + string)
        
    elif ('&style_exact=' + string) in styles:
        link_piece = ('&style_exact=' + string)
    else:
        raise ValueError('The selected genre doesnt exist')
    
    return link_piece

genre_link(string ='Rock')

'&genre_exact=Rock'

We can clearly see that there are some elements which are repeated, so to find 'n' artists we'll iterate over the webpage until our list meets the requirements.

In [17]:
def artist_scrape(genres='Rock', n=10):
    '''
    This function scrapes the Discogs.com webpage to get 'n' artist names, sorted by popularity and filtered by genre.
    Returns a list with artist names.
    
    Takes 2 arguments:
    
        genre= type str or type list: Selects the genre(s) you're interested on.
        
        n= type int: Indicates the number of artist returned
    
    '''
    
    i = 0
    
    artists = []
    
    
    
    if type(genres) == list:
        genre_link_list = [genre_link(genre) for genre in genres]
        genres = ''.join(genre_link_list)
    
    else:
        genres = genre_link(genres)    
    
    while len(artists) < n:
                
        i += 1
        
        url = 'https://www.discogs.com/search/?sort=have%2Cdesc' + genres + '&ev=gs_mc&type=master&page=' + str(i)
        html = requests.get(url).content
        soup = BeautifulSoup(html, 'lxml')
        tag = 'h5'
        
        text = [element.text for element in soup.find_all(tag)]
        clean_text = [clean_artist(elem) for elem in text[:-5]]
        
        for elem in clean_text:
            if len(artists) < n:
                if elem not in artists:
                    artists.append(elem)
                    
    return artists

In [18]:
rock_bands = artist_scrape(genres=['rock'], n=10)

In [19]:
rock_bands

['Pink Floyd',
 'The Beatles',
 'Led Zeppelin',
 'Fleetwood Mac',
 'Nirvana',
 'Dire Straits',
 'David Bowie',
 'AC/DC',
 'Black Sabbath',
 'Eagles']

# 2.  Songkick API Querying

In [20]:
apikey = '-'

Now that we have the bands list, we want to find out how many concerts they made, when and where. Because of the structure of this API and the data we collected, we'll have to break this step in two parts:

### 2.1 Artist ID gathering

In order to locate an artist we're interested in, we'll have to find the unique ID of this artist on a separated part of the API of Songkick.
https://www.songkick.com/developer/artist-search

In [21]:
id_url = 'https://api.songkick.com/api/3.0/search/artists.json?apikey={your_api_key}&query={artist_name}'

We'll test our code with just one band: Pink Floyd

First thing we find out is that in order to correctly acces the band name, we'll have to replace all whitespaces with '_' (underscores). So we're making a function for it:

In [22]:
def underscore_artist(artist):
    '''
    This function takes an artist name and transforms its string into 
    a 'Str_Str' format
    '''
    return artist.replace(' ', '_') 


In [23]:
underscore_artist('Pink Floyd')

'Pink_Floyd'

In [24]:
# FIND ARTIST ID:

band_name = 'Pink_Floyd'

pinkfloyd_url = 'https://api.songkick.com/api/3.0/search/artists.json?apikey=' + apikey + '&query=Pink_Floyd'

In [25]:
# Doing the request to the api
response = requests.get(pinkfloyd_url)
results = response.json()

After examining the JSON Dictionary, we find out that the ID Key it's stored in the following path:

In [26]:
# Display the naked json
artist_id = results['resultsPage']['results']['artist'][0]['id']

artist_id

400904

Similarly to the Discogs.com process, we'll make a function that returns all the Artists IDs (in the same order)

In [27]:
def songkick_artist_id(artist_list, apikey = apikey):
    '''
    This function locates an Songkick ID number for a given list of artists. Then, it creates a tuple
    with the band ID and the band name
    
    Takes two arguments:
        
        artist_list = type lst: List of artist names to be found. If an input it's a string, transform it
                                into a list for a correct processing.
                                
        api_key = type str: Your Songkick API key
    
    The output is a tuple composed of two parts:
    
        output[0]= type int: List of Songkick IDs
        output[1]= type str: Artist names
        
    '''
    
    
    if type(artist_list) == str:
        artist_list = [artist_list]
    
    IDs = []
    not_found = []
    
    
    for artist in artist_list:
        
        url = 'https://api.songkick.com/api/3.0/search/artists.json?apikey=' + apikey +'&query=' +underscore_artist(artist)
        response = requests.get(url)
        results = response.json()
        try:
            songkick_id = results['resultsPage']['results']['artist'][0]['id']
            IDs.append((str(songkick_id), str(artist)))
        except:
            not_found.append(artist)
        
        print(f'Found the artist {artist}, with Songkick ID: {songkick_id}')
        
    if len(not_found) > 0:
        print(f'The following artist couldnt be found: {not_found}')
        
    return IDs
    
    

In [28]:
bands = songkick_artist_id(artist_list = 'Pink Floyd')

Found the artist Pink Floyd, with Songkick ID: 400904


We want to store the IDs and the Artist Name, because later on we found out that if an Artist plays in a Festival, this artist ID it's kinda randomized on the API

### 2.2 Getting data from the API

Now that we have the artist name and the artist id, we can now find some more data we're interested on.
We want to find:
    - The concert name
    - The concert date
    - The concert time
    - The city where the concert was done
    - The venue where the concert was done
    - The coordinates of that venue
    - If the artist was headlining the concert (billing)
    

https://www.songkick.com/developer/past-events-for-artist

All the data we're looking for it's located on this API link, we'll have to explore the JSON object to find it out:

`https://api.songkick.com/api/3.0/artists/{artist_id}/gigography.json?apikey={your_api_key}`

Again, we'll use Pink Floyd and its ID for testing:

In [29]:
pinkfloyd_id = songkick_artist_id(['Pink Floyd'])[0][0]
pinkfloyd_id

Found the artist Pink Floyd, with Songkick ID: 400904


'400904'

This would be the URL where the Pink Floyd data is stored:

In [136]:
pinkfloyd_url = 'https://api.songkick.com/api/3.0/artists/'+pinkfloyd_id +'/gigography.json?apikey=' + api_key
pinkfloyd_url

NameError: name 'api_key' is not defined

Calling the API:

In [31]:
response = requests.get(pinkfloyd_url)
response

<Response [200]>

Works!

In [32]:
result = response.json()

### 2.2.1 Finding the concert name

In this phase, we're doing a bit of exploration. After finding the right information navegating through the `.keys()` dictionary method, since we're working with a `json` file

In [33]:
result['resultsPage']['results']['event'][0]['displayName']

# Show name

'Pink Floyd at Homerton College (May 22, 1965)'

How many results are on the page?

In [34]:
len(result['resultsPage']['results']['event'])

# Results per page

50

### 2.2.2 Finding the concert date

In [35]:
result['resultsPage']['results']['event'][0]['start']['date']

# Show date

'1965-05-22'

### 2.2.3 Finding the concert time

In [36]:
result['resultsPage']['results']['event'][0]['start']['time'] 

# Show time
# In this particular case we have no info, so the result is None.

### 2.2.4 Finding the city where the concert was done

In [37]:
result['resultsPage']['results']['event'][0]['location']['city']

# Show city

'Cambridge, UK'

### 2.2.5 Finding the venue name where the concert was played



In [38]:
result['resultsPage']['results']['event'][0]['venue']['displayName']

# Venue name

'Homerton College'

### 2.2.6 Finding the coordinates of that venue

In [39]:
result['resultsPage']['results']['event'][0]['venue']['lat'] # This particular show has no info

In [40]:
result['resultsPage']['results']['event'][0]['venue']['lng'] # This particular show has no info

### 2.2.7 Finding out if the artist was headlining the concert ( billing )

In [41]:
result['resultsPage']['results']['event'][0]['performance'][0]['billing']

# Was the artist the headliner of the show?

'headline'

### 2.2.8 Getting the artist's name
We are doing this to directly assing the artist in a row for each lecture of the data

In [42]:
result['resultsPage']['results']['event'][0]['performance'][0]['displayName']

'Pink Floyd'

## 2.3 Defining a function for our data rows:
Finding out that the pagination comes as an argument in the link= '&page=%s'. We have 50 entries per page

In [43]:
entries = result['resultsPage']['totalEntries']

elm_per_page = result['resultsPage']['perPage']

pages = entries / elm_per_page

print(f"The number of pages on each ID will be equal to its entries ({entries}), \ndivided by the elements by page ({elm_per_page}). \nFor example, {entries}/{elm_per_page}={pages} pages. \nSince the number of pages is not an integer, we'll round up this number to iter on the elements that are left. \nIf the pages are {pages}, we'll iter over {round(pages) + 1} pages" )
      
      
      

The number of pages on each ID will be equal to its entries (1104), 
divided by the elements by page (50). 
For example, 1104/50=22.08 pages. 
Since the number of pages is not an integer, we'll round up this number to iter on the elements that are left. 
If the pages are 22.08, we'll iter over 23 pages


Now, we're making a function to collect the data of each show for every artist id we pass on the list #artist_id

In [44]:
def collect_data(artist_id_list, apikey='-'):  
    '''
    This function iterates over an artist id list to get the links on each artist.
    Calculates in how many pages the data is stored, and iterates on all of them.
    Then, for each page, requests the information to the API and fetchs all the data we need.
    Stores this data as a list of lists object to ensure compatibility with Pandas Dataframes.
    '''
    
    artist_data = []
    
    for artist in artist_id_list:
        url = 'https://api.songkick.com/api/3.0/artists/'+str(artist[0]) +'/gigography.json?apikey=' + apikey
        response = requests.get(url)
        result = response.json()
        
        ## Iterating over different pages:
        entries = result['resultsPage']['totalEntries'] 
        per_page = result['resultsPage']['perPage']
        
        pages_round= int(math.ceil(entries/per_page))
        
        print('\n')
        
    
        for z in range(pages_round):
                
                # Page 0 and Page 1 are equal, so we'll start from page 1
                url = 'https://api.songkick.com/api/3.0/artists/'+str(artist[0]) +'/gigography.json?&page='+ str(z+1) +'&apikey=' + apikey
                response = requests.get(url)
                result = response.json()
        
    
                for i in range(len(result['resultsPage']['results']['event'])):
                        
                    print(f'Fetching {artist[1]} data... entry nº {str(z*50+i+1)} of {entries}', end='\r', flush=True)

        
                    band = artist[1]
                    b_id = artist[0]
                    bill = result['resultsPage']['results']['event'][i]['performance'][0]['billing']
                    name = result['resultsPage']['results']['event'][i]['displayName']
                    date = result['resultsPage']['results']['event'][i]['start']['date']
                    time = result['resultsPage']['results']['event'][i]['start']['time']
                    city = result['resultsPage']['results']['event'][i]['location']['city']
                    venu = result['resultsPage']['results']['event'][i]['venue']['displayName']
                    lat = result['resultsPage']['results']['event'][i]['venue']['lat'] 
                    lng = result['resultsPage']['results']['event'][i]['venue']['lng'] 
        
                    artist_data.append([band, b_id, bill, name, date, time, city, venu, lat, lng])
    
    return artist_data


In [45]:
data = collect_data(bands)



Fetching Pink Floyd data... entry nº 1104 of 1104

## Bringing it all together

Our goal is to make a dataframe with the gathered data, to be able to study it.
We'll define a function with the scripts we used through the process.
This function is called get_shows_data():

In [49]:
def get_shows_data(n=10, genres='Rock', apikey=apikey, artists = None):
    '''
    This function returns an organized Dataframe which contains information about all the shows played
    by that artist over its history. Contains the following variables:
        n = type int: Tells the function hoy many artists to get info
        
        genre = type str or type list: Tells the function which genre (or genres) of music to scrape
        
        apikey = type str: (needed) your Songkick API key
        
        artist = type str or type lst: By default, set to None. This means that artist will be chosen
                                       based on all-time popularity. If specified, tells what artist to scrape'''
    
    if artists == None:

        artists = artist_scrape(n=n, genres=genres)
    
    
    ids = songkick_artist_id(artists)
    
    data = collect_data(ids)
    
    print('\n\n-Done-')
    
    return data

To test it, we have a lot of genres to choose from!

In [50]:
genres_styles

['Rock',
 'Electronic',
 'Pop',
 'Folk, World, & Country',
 'Jazz',
 'Rock',
 'Electronic',
 'Pop',
 'Folk, World, & Country',
 'Jazz',
 'Funk / Soul',
 'Classical',
 'Hip Hop',
 'Latin',
 'Reggae',
 'Stage & Screen',
 'Blues',
 'Non-Music',
 "Children's",
 'Brass & Military',
 'House',
 'Pop Rock',
 'Punk',
 'Vocal',
 'Techno',
 'House',
 'Pop Rock',
 'Punk',
 'Vocal',
 'Techno',
 'Experimental',
 'Hardcore',
 'Synth-pop',
 'Soul',
 'Indie Rock',
 'Disco',
 'Alternative Rock',
 'Ambient',
 'Electro',
 'Country',
 'Trance',
 'Folk',
 'Chanson',
 'Ballad',
 'Rock & Roll',
 'Downtempo',
 'Hard Rock',
 'Psychedelic Rock',
 'Funk',
 'Heavy Metal',
 'Drum n Bass',
 'Tech House',
 'Deep House',
 'Folk Rock',
 'Schlager',
 'Euro House',
 'Easy Listening',
 'Black Metal',
 'Soundtrack',
 'Romantic',
 'Rhythm & Blues',
 'Industrial',
 'Prog Rock',
 'New Wave',
 'Garage Rock',
 'Noise',
 'Minimal',
 'Progressive House',
 'Death Metal',
 'Classic Rock',
 'Europop',
 'Abstract',
 'Classical',
 'Bl

In [51]:
data = get_shows_data(n=50, genres=['Classic Rock'])

Found the artist Led Zeppelin, with Songkick ID: 18833
Found the artist Dire Straits, with Songkick ID: 393353
Found the artist David Bowie, with Songkick ID: 468870
Found the artist Eagles, with Songkick ID: 556496
Found the artist Neil Young, with Songkick ID: 1980138
Found the artist Queen, with Songkick ID: 469904
Found the artist Supertramp, with Songkick ID: 420142
Found the artist The Doors, with Songkick ID: 477246
Found the artist Simon & Garfunkel, with Songkick ID: 213059
Found the artist The Rolling Stones, with Songkick ID: 379603
Found the artist Jethro Tull, with Songkick ID: 111953
Found the artist Pink Floyd, with Songkick ID: 400904
Found the artist Mike Oldfield, with Songkick ID: 486950
Found the artist Elton John, with Songkick ID: 371163
Found the artist Rolling Stones, with Songkick ID: 379603
Found the artist Bruce Springsteen, with Songkick ID: 227030
Found the artist Steely Dan, with Songkick ID: 87233
Found the artist Bowie, with Songkick ID: 468870
Found the

Ok, we have our data in a tidy way (as a list of lists for each show). Now, it's time to construct our DataFrame with this data:

## 4. Building the DataFrame

We stored our values in a convenient list of arrays for each show we collected. So, to construct it, we'll just have to call `pd.DataFrame` and introduce the list we created, along with the columns names:

In [90]:
df = pd.DataFrame(data)

In [91]:
df.shape

(59020, 10)

In [92]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,Led Zeppelin,18833,headline,"Led Zeppelin at Teen Club Box 45 (September 7,...",1968-09-07,,"Copenhagen, Denmark",Teen Club Box 45,,
1,Led Zeppelin,18833,headline,"Led Zeppelin at Fjordvilla Club (September 8, ...",1968-09-08,,"Roskilde, Denmark",Fjordvilla Club,,
2,Led Zeppelin,18833,headline,"Led Zeppelin at Gröna Lund (September 12, 1968)",1968-09-12,,"Stockholm, Sweden",Gröna Lund,59.323487,18.097393
3,Led Zeppelin,18833,headline,"Led Zeppelin at Inside Club (September 13, 1968)",1968-09-13,,"Stockholm, Sweden",Inside Club,,
4,Led Zeppelin,18833,headline,"Led Zeppelin at Angby Park (September 14, 1968)",1968-09-14,,"Stockholm, Sweden",Angby Park,,


In [93]:
df.columns = ['artist', 'artist_id', 'relevance', 'show_name', 'date', 'time', 'city', 'venue', 'lat', 'lng']

In [94]:
df.head()

Unnamed: 0,artist,artist_id,relevance,show_name,date,time,city,venue,lat,lng
0,Led Zeppelin,18833,headline,"Led Zeppelin at Teen Club Box 45 (September 7,...",1968-09-07,,"Copenhagen, Denmark",Teen Club Box 45,,
1,Led Zeppelin,18833,headline,"Led Zeppelin at Fjordvilla Club (September 8, ...",1968-09-08,,"Roskilde, Denmark",Fjordvilla Club,,
2,Led Zeppelin,18833,headline,"Led Zeppelin at Gröna Lund (September 12, 1968)",1968-09-12,,"Stockholm, Sweden",Gröna Lund,59.323487,18.097393
3,Led Zeppelin,18833,headline,"Led Zeppelin at Inside Club (September 13, 1968)",1968-09-13,,"Stockholm, Sweden",Inside Club,,
4,Led Zeppelin,18833,headline,"Led Zeppelin at Angby Park (September 14, 1968)",1968-09-14,,"Stockholm, Sweden",Angby Park,,


In [95]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59020 entries, 0 to 59019
Data columns (total 10 columns):
artist       59020 non-null object
artist_id    59020 non-null object
relevance    59020 non-null object
show_name    59020 non-null object
date         59020 non-null object
time         12935 non-null object
city         59020 non-null object
venue        59020 non-null object
lat          48889 non-null float64
lng          48889 non-null float64
dtypes: float64(2), object(8)
memory usage: 4.5+ MB


Setting a datetime column:

In [96]:
df_time = df['date']

df_time = pd.to_datetime(df_time, errors='coerce')

We'll use a function to know the percentage of NA values:

In [97]:
def df_total_na(df):
    '''
    Returns the percentage of total NULL values in the dataset
    '''
    total_obs = df.count().sum()
    total_nas = df.isna().sum().sum()
    nas_percentage = total_nas *100 / total_obs
    
    print(f'Our dataset has {round(nas_percentage, 2)}% missing values overall' )

In [98]:
df_total_na(df)

Our dataset has 12.67% missing values overall


Also checking what columns have more percentage of null values

In [99]:
def column_nulls_percentage(df):
    '''
    Returns a series indicating percentage of NULLS per column
    '''
    # .mean() gets the NULL values and divides it by the total length of the column
    
    return df.isna().mean().round(4) * 100


print(column_nulls_percentage(df))

artist        0.00
artist_id     0.00
relevance     0.00
show_name     0.00
date          0.00
time         78.08
city          0.00
venue         0.00
lat          17.17
lng          17.17
dtype: float64


We find out that `time` column has a considerable amount of `NAs` an is not useful for us. We'll arrange the dataframe and don't preserve it for now.
`artist_id`, and `show_name` are also not useful for us for the moment.

In [100]:
df_original = df

df = df[['artist', 'date', 'time', 'venue', 'city', 'relevance', 'lat', 'lng']]

In [101]:
df.head()

Unnamed: 0,artist,date,time,venue,city,relevance,lat,lng
0,Led Zeppelin,1968-09-07,,Teen Club Box 45,"Copenhagen, Denmark",headline,,
1,Led Zeppelin,1968-09-08,,Fjordvilla Club,"Roskilde, Denmark",headline,,
2,Led Zeppelin,1968-09-12,,Gröna Lund,"Stockholm, Sweden",headline,59.323487,18.097393
3,Led Zeppelin,1968-09-13,,Inside Club,"Stockholm, Sweden",headline,,
4,Led Zeppelin,1968-09-14,,Angby Park,"Stockholm, Sweden",headline,,


## 5. Usage of the data

This dataframe is very interesting to understand the music movement and artists. For example, how many concerts did Bob Dylan made as a headliner since he started music compared to other ones?

In [125]:
df[df.artist == 'Bob Dylan'].relevance.value_counts()

headline    3738
Name: relevance, dtype: int64

In [135]:
df[df.artist == 'Pink Floyd'].relevance.value_counts()

headline    1104
Name: relevance, dtype: int64