# Data automatisation and Webscraping: Spotify Example
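To check whether a website allows automated scraping, add __/robots.txt__ to the end of its base URL (for example `https://spotifycharts.com/robots.txt`) and read which paths crawlers are allowed to access.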



In [1]:
# Importing required libraries
import datetime as dt
import io
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests # get data from websites
from bs4 import BeautifulSoup

pd.set_option('display.max_rows', None)
%matplotlib inline

In [2]:
### Step 1 : Let's extract all the weeks from Spotify Charts
new_URLS = []  # filled as a side effect of every DownloadPage call


def DownloadPage(URL):     # Pass the global website
    """Fetch a Spotify Charts page and collect the week dates it lists.

    Parameters
    ----------
    URL : str
        Address of the chart page to download.

    Returns
    -------
    tuple
        ``(soup_ww, all_dates)``: the parsed page as a BeautifulSoup object and
        a list of ``datetime.date`` values, one per ``<li>`` of the page's date
        selector, each shifted forward by one day.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the page has no ``<div data-type="date">`` selector.

    Side effects
    ------------
    Appends one string per date to the module-level ``new_URLS`` list, prints a
    confirmation line and sleeps for 0.25 s after the request.
    """
    headers = {'User-Agent':"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36"}
    # The context manager releases the session's connections once the page is read.
    with requests.Session() as s1:
        response1 = s1.get(URL, headers=headers, timeout=30)
    response1.raise_for_status()  # fail loudly instead of parsing an error page
    soup_ww = BeautifulSoup(response1.content.decode("utf-8"),"html.parser") # get content of that page
    time.sleep(0.25)

    date_selector = soup_ww.find('div', {'data-type':'date'})
    if date_selector is None:
        raise ValueError(f"No date selector found on {URL}; the page layout may have changed")

    # NOTE(review): every listed date is moved forward by one day, presumably to
    # match the end date used in the weekly chart URLs; confirm against the site.
    all_dates = [
        dt.datetime.strptime(li.get_text().strip(), '%m/%d/%Y').date() + dt.timedelta(days=1)
        for li in date_selector.find_all('li')
    ]
    # NOTE(review): the date is concatenated directly onto URL, so a URL ending in
    # "latest" yields ".../latest2021-10-22/download"; confirm the intended format.
    new_URLS.extend(URL+str(date)+'/download' for date in all_dates)
    print(f"Dates from {URL} have been downloaded")

    return soup_ww, all_dates

In [3]:
# Download the latest weekly global chart page; keep the parsed page and the week dates found on it.
website, all_dates = DownloadPage("https://spotifycharts.com/regional/global/weekly/latest")

Dates from https://spotifycharts.com/regional/global/weekly/latest have been downloaded


In [4]:
# `website` is the parsed page, not a DataFrame: calling its <head> tag lists the tags found inside it.
website.head()

[<meta charset="utf-8"/>,
 <meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>,
 <title>Spotify Charts</title>,
 <meta content="Spotify Charts" property="og:title"/>,
 <meta content="website" property="og:type"/>,
 <meta content="https://www.spotifycharts.com/" property="og:url"/>,
 <meta content="http://d2c87l0yth4zbw-2.global.ssl.fastly.net/i/_global/open-graph-default.png" property="og:image"/>,
 <meta content="Spotify is all the music you’ll ever need." property="og:description"/>,
 <meta content="Spotify is all the music you’ll ever need." name="description"/>,
 <meta content="width=device-width, initial-scale=1.0" name="viewport"/>,
 <link href="/images/touch-icon-144.png" rel="apple-touch-icon-precomposed" sizes="144x144"/>,
 <link href="/images/touch-icon-114.png" rel="apple-touch-icon-precomposed" sizes="114x114"/>,
 <link href="/images/touch-icon-72.png" rel="apple-touch-icon-precomposed" sizes="72x72"/>,
 <link href="/images/touch-icon-57.png" rel="apple-touch-ic

In [5]:
# Inspect the collected week dates (the page lists the most recent week first).
all_dates

[datetime.date(2021, 10, 22),
 datetime.date(2021, 10, 15),
 datetime.date(2021, 10, 8),
 datetime.date(2021, 10, 1),
 datetime.date(2021, 9, 24),
 datetime.date(2021, 9, 17),
 datetime.date(2021, 9, 10),
 datetime.date(2021, 9, 3),
 datetime.date(2021, 8, 27),
 datetime.date(2021, 8, 20),
 datetime.date(2021, 8, 13),
 datetime.date(2021, 8, 6),
 datetime.date(2021, 7, 30),
 datetime.date(2021, 7, 23),
 datetime.date(2021, 7, 16),
 datetime.date(2021, 7, 9),
 datetime.date(2021, 7, 2),
 datetime.date(2021, 6, 25),
 datetime.date(2021, 6, 18),
 datetime.date(2021, 6, 11),
 datetime.date(2021, 6, 4),
 datetime.date(2021, 5, 28),
 datetime.date(2021, 5, 21),
 datetime.date(2021, 5, 14),
 datetime.date(2021, 5, 7),
 datetime.date(2021, 4, 30),
 datetime.date(2021, 4, 23),
 datetime.date(2021, 4, 16),
 datetime.date(2021, 4, 9),
 datetime.date(2021, 4, 2),
 datetime.date(2021, 3, 26),
 datetime.date(2021, 3, 19),
 datetime.date(2021, 3, 12),
 datetime.date(2021, 3, 5),
 datetime.date(2021, 

In [7]:
def downloadData(new_URL):
    """Download one weekly chart page and turn it into a tidy DataFrame.

    Parameters
    ----------
    new_URL : str
        Chart address of the form ``.../weekly/<start>--<end>``. The part after
        ``--`` is stored in the ``Date`` column of every returned row.

    Returns
    -------
    pandas.DataFrame
        Columns ``Position`` (int), ``Track``, ``Artist``, ``Streams`` (text as
        found on the page) and ``Date`` (text taken from ``new_URL``). The frame
        is empty, but keeps these columns, when no chart rows are found.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    headers = {'User-Agent':"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36"}
    # The context manager releases the session's connections once the page is read.
    with requests.Session() as s2:
        response2 = s2.get(new_URL, headers=headers, timeout=30)
    response2.raise_for_status()  # fail loudly instead of parsing an error page
    soup_data = BeautifulSoup(response2.content,"html.parser")  # get page content

    # Load the re-serialised markup as a one-column frame, one markup line per row;
    # 'delimiter' is a separator that is not expected to occur in the page.
    # NOTE(review): the positional "utf-8" appears to land in BeautifulSoup.decode's
    # first (pretty-print) parameter, which would explain the one-tag-per-line layout
    # that the fixed offsets below rely on. Confirm before touching this call.
    temp_df = pd.read_csv(io.StringIO(soup_data.decode("utf-8")), sep='delimiter')
    lines = temp_df['<!DOCTYPE doctype html>']

    # Use the function's own argument: the original read a global `url` here.
    table_date = new_URL.split('--')[1]

    # Every chart entry starts at a position cell; its fields sit at fixed line offsets.
    row_start = temp_df[lines.str.contains("<td class=\"chart-table-position\"")].index
    records = []
    for i in row_start:
        artist_line = lines.iloc[i+16]
        # Split on the first "by " only, so names that contain "by " themselves
        # (e.g. "Toby Keith") are kept whole; lines without "by " are used as they are.
        _, separator, after_by = artist_line.partition('by ')
        records.append({
            'Position': int(lines.iloc[i+1]),
            'Track': lines.iloc[i+13],
            'Artist': after_by if separator else artist_line,
            'Streams': lines.iloc[i+20],
            'Date': table_date,
        })

    # Build the frame once rather than appending row by row (quadratic, and
    # DataFrame.append no longer exists in pandas 2).
    return pd.DataFrame(records, columns=["Position", "Track", "Artist", "Streams", "Date"])

In [8]:
# Pull last week's chart and keep only the rows of one artist
artist_name = 'Coldplay'
url = f"https://spotifycharts.com/regional/global/weekly/{all_dates[1]}--{all_dates[0]}"
table_df_1w = downloadData(url)
print(url)

# Filter on artist_name itself (the original checked for artist_name but then kept
# rows matching a hard-coded 'Queen'). regex=False matches the name literally.
is_artist_row = table_df_1w['Artist'].str.contains(artist_name, regex=False)

# Reverse the row order as before; .copy() makes the assignments below act on an
# independent frame. With no matching rows the result is empty but keeps its columns,
# so the conversions still run.
artist_df = table_df_1w[is_artist_row].iloc[::-1].copy()
artist_df['Streams'] = pd.to_numeric(artist_df['Streams'].str.replace(',',''))
artist_df['Position'] = pd.to_numeric(artist_df['Position'])
artist_df['Date'] = pd.to_datetime(artist_df['Date'])

  


https://spotifycharts.com/regional/global/weekly/2021-10-15--2021-10-22


In [9]:
# First rows of last week's full chart (all artists).
table_df_1w.head()

Unnamed: 0,Position,Track,Artist,Streams,Date
0,1,Easy On Me,Adele,84952932,2021-10-22
1,2,STAY (with Justin Bieber),The Kid LAROI,46083756,2021-10-22
2,3,INDUSTRY BABY (feat. Jack Harlow),Lil Nas X,39049971,2021-10-22
3,4,Heat Waves,Glass Animals,30283212,2021-10-22
4,5,My Universe,"Coldplay, BTS",27587042,2021-10-22


In [10]:
# Pulling all the weeks
begin_time = dt.datetime.now()
weekly_tables = []  # one DataFrame per downloaded week
try:
    # Consecutive dates in all_dates form the start/end pair of each weekly chart URL.
    for i in range(len(all_dates)-1):
        url = f"https://spotifycharts.com/regional/global/weekly/{all_dates[i+1]}--{all_dates[i]}"
        table_df = downloadData(url)
        weekly_tables.append(table_df)
        print(url)
finally:
    # Runs even when the loop is interrupted or a download fails, so the weeks
    # fetched so far still end up in `table`. Concatenating once replaces the
    # per-iteration DataFrame.append (quadratic, and removed in pandas 2).
    if weekly_tables:
        table = pd.concat(weekly_tables)
    else:
        table = pd.DataFrame(columns=["Position", "Track", "Artist", "Streams", "Date"])

  


https://spotifycharts.com/regional/global/weekly/2021-10-15--2021-10-22
https://spotifycharts.com/regional/global/weekly/2021-10-08--2021-10-15
https://spotifycharts.com/regional/global/weekly/2021-10-01--2021-10-08
https://spotifycharts.com/regional/global/weekly/2021-09-24--2021-10-01
https://spotifycharts.com/regional/global/weekly/2021-09-17--2021-09-24
https://spotifycharts.com/regional/global/weekly/2021-09-10--2021-09-17
https://spotifycharts.com/regional/global/weekly/2021-09-03--2021-09-10
https://spotifycharts.com/regional/global/weekly/2021-08-27--2021-09-03
https://spotifycharts.com/regional/global/weekly/2021-08-20--2021-08-27
https://spotifycharts.com/regional/global/weekly/2021-08-13--2021-08-20
https://spotifycharts.com/regional/global/weekly/2021-08-06--2021-08-13
https://spotifycharts.com/regional/global/weekly/2021-07-30--2021-08-06
https://spotifycharts.com/regional/global/weekly/2021-07-23--2021-07-30
https://spotifycharts.com/regional/global/weekly/2021-07-16--202

KeyboardInterrupt: 

In [None]:
# Time elapsed since begin_time was set at the start of the download cell.
print("elapsed time:", dt.datetime.now() - begin_time)

In [None]:
# Each weekly frame brought its own row numbers; renumber the combined table 0..n-1.
table = table.reset_index(drop=True)
table.head()

In [None]:
# Convert the Streams column from text to numbers:
# a '</tr>' fragment becomes '0', thousands separators are dropped, then the text is parsed.
streams_text = table['Streams'].str.replace('</tr>','0')
streams_digits = streams_text.str.replace(',','')
table['Streams'] = pd.to_numeric(streams_digits)

# ARTIST SELECTION

In [None]:
# Select only the rows whose Artist field mentions the chosen artist
artist_name = 'Rihanna'
# regex=False matches the name literally and na=False treats missing artists as
# "no match". .copy() gives an independent frame, so the columns added to artist_df
# in later cells do not trigger SettingWithCopyWarning.
artist_df = table[table['Artist'].str.contains(artist_name, regex=False, na=False)].copy()

In [None]:
# First ten chart rows of the selected artist.
artist_df.head(10)

In [None]:
# When did the artist perform best in the charts?
# Sorting by Position in ascending order puts the best-ranked rows first; show the top 20.
artist_df.sort_values('Position', ascending=True).head(20)

In [None]:
# In which weeks did the artist collect the most streams? (ten best weeks)
(
    artist_df
    .groupby('Date')[['Streams']]
    .sum()
    .sort_values('Streams', ascending=False)
    .head(10)
)

In [None]:
# Bar chart of the total streams per week, to get a quick visual overview
artist_by_streams = artist_df.groupby('Date')[['Streams']].sum()

artist_by_streams.plot(
    kind='bar',
    y='Streams',
    figsize=(35,10),
    legend=None,
    title=f'{artist_name} Streams Performance',
    xlabel='Date',
    ylabel='Streams',
);

In [None]:
# Number of chart rows per week, weeks with the most entries first
(
    artist_df
    .groupby('Date')[['Track']]
    .count()
    .sort_values('Track', ascending=False)
    .head()
)

In [None]:
# Chart rows of one specific week, best position first.
# NOTE(review): the date is hard-coded; adapt it to a week in which the selected artist charted.
artist_df[artist_df['Date'] == '2018-11-16'].sort_values('Position')

## When was the movie "Bohemian Rhapsody" released?
Between 23 October and 2 November 2018.

### How much does Spotify pay per stream? -- not official, just an assumption--
In 2020, a report from Business Insider found that Spotify paid artists about USD 0.0033 per stream; at that rate it takes roughly 300 streams (1 / 0.0033 ≈ 303) to earn 1 dollar.

In [None]:
# Estimated revenue in USD: streams multiplied by the assumed per-stream payout of 0.0033
artist_df['Revenue'] = artist_df['Streams'].mul(0.0033)

In [None]:
# Check that the Revenue column was added.
artist_df.head()

In [None]:
# Total estimated revenue over all chart weeks of one track.
# NOTE(review): the track title is hard-coded; the sum is 0 when the selected artist has no such track.
sum(artist_df[artist_df.Track == 'Bohemian Rhapsody - Remastered 2011']['Revenue'])

In [None]:
# Clean the track names: many share the ' - Remastered 2011' suffix.
# Splitting on that suffix and keeping the first part leaves only the song title.
artist_df['Track'] = artist_df['Track'].str.split(' - Remastered 2011', expand = True)[0]

In [None]:
# Let's gather the basics:
# Which and how many unique songs have performed in the top 200 charts?
unique_songs_top200 = artist_df.Track.unique()
count_unique_songs_top200 = len(unique_songs_top200)

# Which is the most streamed song? How many streams and how much revenue has it generated?
# Totals per track are computed once and only over the two numeric columns that are
# needed: summing every column (Date included) was repeated three times and can
# raise on pandas >= 2 when Date holds datetimes.
track_totals = (
    artist_df
    .groupby('Track')[['Streams', 'Revenue']]
    .sum()
    .sort_values('Streams', ascending=False)
)
most_streamed_song = track_totals.index[0]
# Read each value from its own column so the stream count keeps its integer type.
most_streamed_song_streams = track_totals['Streams'].iloc[0]
most_streamed_song_revenue = track_totals['Revenue'].iloc[0]

In [None]:
# One-sentence summary built from the figures gathered above.
summary_text = f"The artist {artist_name} has had so far {int(count_unique_songs_top200)} songs at the Spotify's top 200 Global Charts, from which {most_streamed_song} has been the most viewed, with {most_streamed_song_streams} streams generating a revenue of ${int(most_streamed_song_revenue)}."

In [None]:
# Show the summary sentence.
print(summary_text)

In [None]:
# Streams per track over time: one column per track, one row per chart week
artist_pv_streams = artist_df.pivot_table(index='Date', columns=['Track'], values=['Streams'], fill_value=0)
# Zeros are turned into NaN so they are not drawn as points on the chart.
artist_pv_streams = artist_pv_streams.replace(0, np.nan)
artist_pv_streams.plot(y=artist_pv_streams.columns,
                      figsize=(15,10),
                      legend=True,
                      # The title follows the selected artist instead of a hard-coded 'Queen'.
                      title=f'{artist_name} Charts Performance',
                      xlabel = 'Date',
                      ylabel ='Streams');

In [None]:
# First rows of the pivoted streams table.
artist_pv_streams.head()

In [None]:
# Number of chart weeks per song, longest-running songs first
times_top200 = (
    artist_df
    .groupby('Track')[['Date']]
    .count()
    .rename(columns={'Date':'Weeks'})
    .sort_values('Weeks', ascending=False)
    .reset_index()
)
times_top200.head()

In [None]:
# Data for the report summary: how did the artist perform in the most recent week?
last_week = all_dates[0].strftime('%Y-%m-%d')
last_week_summary = (
    artist_df[artist_df['Date'] == last_week]
    .sort_values('Position', ascending = True)
    .set_index('Position')
    [['Track', 'Artist', 'Date', 'Streams', 'Revenue']]
)
last_week_summary

In [None]:
# Top 5 Songs on Charts
# Only Streams and Revenue are summed: summing every column first (Date included)
# can raise on pandas >= 2 when Date holds datetimes.
top_5_songs = (
    artist_df
    .groupby(['Track', 'Artist'])[['Streams', 'Revenue']]
    .sum()
    .sort_values('Streams', ascending = False)
    .head(5)
    .reset_index()
)
# Add the number of chart weeks per track and arrange the columns for the report.
top_5_songs = (
    top_5_songs
    .merge(times_top200, on='Track', how='left')
    .set_index(['Track'])
    [['Artist', 'Weeks', 'Streams', 'Revenue']]
)
top_5_songs

In [None]:
most_played_song = top_5_songs.index[0]
# Weekly streams of the most played song
most_played_df = artist_df[artist_df['Track'] == most_played_song].groupby('Date')[['Streams']].sum()

# Build a weekly calendar from the song's first to its last chart week, so that weeks
# without a chart entry appear as empty bars. The original started at the second
# week (iloc[1]), which dropped the first week and failed for a single-week song.
date1 = str(most_played_df.index[0])
date2 = str(most_played_df.index[-1])
mydates = pd.date_range(date1, date2, freq="7D").strftime('%Y-%m-%d').tolist()
date_range_df = pd.DataFrame(columns=['Date'], data = mydates).set_index('Date')

# The right join keeps every calendar week; the title follows the plotted song
# instead of a hard-coded 'Bohemian Rhapsody'.
fig = most_played_df.merge(date_range_df, left_index=True, right_index=True, how='right').plot(
    kind='bar',
    y='Streams',
    figsize=(35,10),
    legend=None,
    title=f'{most_played_song} Streams Performance',
    xlabel = 'Date',
    ylabel ='Streams',
).get_figure()

# savefig does not create missing folders, so make sure the target exists.
os.makedirs('generated_plots', exist_ok=True)
fig.savefig(f'generated_plots/{most_played_song}_{artist_name}_graph.png')

In [None]:
## Top 10 positions ever: the ten best-ranked chart rows of the artist
top_10_positions = (
    artist_df
    .sort_values(['Position'], ascending=True)
    .head(10)
    .set_index('Position')
    [['Track', 'Artist', 'Date', 'Streams', 'Revenue']]
)
top_10_positions

In [None]:
# Some cleaning: renumber the rows and fix the column order used for the report
report_columns = ['Position', 'Track', 'Artist', 'Date', 'Streams', 'Revenue']
artist_df = artist_df.reset_index(drop=True)[report_columns]

In [None]:
# Build the Excel report with pandas, using XlsxWriter as the engine.
# Paths are relative to the working directory instead of one user's home folder:
# the chart image is read from the same generated_plots/ folder the plotting cell
# saved it to, and the report is written next to the notebook.
report_path = f"{artist_name}_Report.xlsx"
song_graph_path = f'generated_plots/{most_played_song}_{artist_name}_graph.png'

# Excel rows (1-based) of the "Top 5" table on the Summary sheet:
# start_row_numbers is its first data row, end_row_numbers the row just below its last one.
start_row_numbers = 4+last_week_summary.shape[0]+4
end_row_numbers = start_row_numbers+top_5_songs.shape[0]
top5_last_row = end_row_numbers-1

# Look shared by the grey banner cells; only font size and wrapping differ.
banner_style = {
    'bold':1,
    'border':1,
    'align': 'center',
    'valign': 'vcenter',
    'fg_color': 'silver'}

# The context manager saves and closes the workbook even if a step below fails
# (ExcelWriter.save() no longer exists in pandas 2).
with pd.ExcelWriter(report_path, engine='xlsxwriter', datetime_format='dd-mm-yyyy') as writer:
    # Get the xlsxwriter workbook object.
    workbook = writer.book

    # Write each dataframe to its place: three stacked tables on 'Summary',
    # the saved chart image on its own sheet, and the full data on 'Detail'.
    last_week_summary.to_excel(writer, sheet_name='Summary', startrow=3, startcol=1)
    top_5_songs.to_excel(writer, sheet_name='Summary', startrow=4+last_week_summary.shape[0]+2, startcol=2)
    top_10_positions.to_excel(writer, sheet_name='Summary', startrow=4+last_week_summary.shape[0]+4+top_5_songs.shape[0]+1)
    ws_song_df = workbook.add_worksheet(f'Top Song Graph')
    ws_song_df.insert_image('A2', song_graph_path)
    artist_df.to_excel(writer, sheet_name='Detail')

    # Create formats to apply later.
    currency_format = workbook.add_format({'num_format': '$#,##0.00', 'align':'center'})
    # Stream counts are plain numbers, so no currency sign here.
    streams_format = workbook.add_format({'num_format': '#,##0', 'align':'center'})
    center_bold_format = workbook.add_format({'align':'center'})
    center_bold_format.set_bold()

    # Column widths and formats (set_column applies to whole columns).
    ws_summary = writer.sheets['Summary']
    ws_summary.set_column('C:D', 25, center_bold_format)
    ws_summary.set_column('E:E', 18, center_bold_format)
    ws_summary.set_column('F:F', 15, streams_format)
    ws_summary.set_column('G:G', 15, currency_format)

    ws_artist_df = writer.sheets['Detail']
    ws_artist_df.set_column('B:B', 8, center_bold_format)
    ws_artist_df.set_column('C:C', 25, center_bold_format)
    ws_artist_df.set_column('E:E', 10, center_bold_format)
    ws_artist_df.set_column('F:F', 15, streams_format)
    ws_artist_df.set_column('G:G', 15, currency_format)

    # Bar chart of the Top 5 table: track names (column C) against weeks on chart (column E).
    chart = workbook.add_chart({'type': 'bar'})

    # The range runs from the first to the last data row; the original was written
    # end-to-start and reached one row past the table, adding an empty category.
    chart.add_series({
        'name':        f'{artist_name} Top 5 Songs (Weeks on Chart)',
        'categories':  f"=Summary!$C${start_row_numbers}:$C${top5_last_row}",
        'values':      f"=Summary!$E${start_row_numbers}:$E${top5_last_row}",
        'data_labels': {'value': True}})

    chart.set_style(2)
    # Option keys are lowercase; the capitalised 'Position' key was not recognised,
    # so the legend stayed visible.
    chart.set_legend({'none': True})

    chart.set_title ({'name': f'{artist_name} Top 5 Songs (Weeks on Charts)'})
    chart.set_x_axis({'name': 'Weeks on Charts'})
    # Reversed category axis: the first song of the table is drawn at the top.
    chart.set_y_axis({'reverse': True})

    ws_summary.insert_chart('I7', chart)

    # Formats for the merged banner cells.
    merge_title = workbook.add_format(banner_style)
    merge_title.set_font_size(18)

    table_title = workbook.add_format(banner_style)
    table_title.set_font_size(14)

    summary_text_format = workbook.add_format({**banner_style, 'text_wrap': True})
    summary_text_format.set_font_size(11)

    # Add the report title, one heading above each table, and the summary sentence.
    ws_summary.merge_range(f"B1:N1", f"{artist_name} Spotify Summary", merge_title)
    ws_summary.merge_range(f"B3:C3", f"LastWeek's Performance", table_title)
    ws_summary.merge_range(f"B{2+last_week_summary.shape[0]+4}:C{2+last_week_summary.shape[0]+4}", f"Top 5 {artist_name} Songs", table_title)
    ws_summary.merge_range(f"B{end_row_numbers+1}:C{end_row_numbers+1}", f"Top 10 {artist_name} Performances", table_title)
    ws_summary.merge_range(f"I23:O26", summary_text, summary_text_format)

    #### Autofilter over the header row and all data rows of the Detail sheet
    ws_artist_df.autofilter(f'B1:G{artist_df.shape[0]+1}')

    ws_summary.set_zoom(90)
    ws_song_df.set_zoom(55)