In [1]:
# import dependencies
import numpy as np
import pandas as pd
import datetime as dt
import requests
from bs4 import BeautifulSoup 
from collections import defaultdict
import time
import random
import spotipy
from selenium import webdriver



In [None]:
# Fix the constant portion of the url

url = 'https://spotifycharts.com/regional'
response = requests.get(url)
page = response.text
soup = BeautifulSoup(page, 'html.parser')

In [None]:
song_dict = defaultdict(int)

for each_track in soup.find_all(class_ = 'chart-table-track'):
        title = each_track.find('strong')
        artist = each_track.find('span')
        if title != None or artist != None:
            song_dict[(title.text, artist.text.replace("by", ""))] += 1


In [None]:
# Collect all info at once

for each_track in soup.find_all('tr')[1:]:
    title = each_track.find('strong').text
    artist = each_track.find('span').text.replace("by", "")
    track_id = each_track.find('a')['href'][31:]
    stream_count = int(each_track.find( class_ = 'chart-table-streams').decode_contents(formatter="html").replace(",", ""))
    print(title, artist, track_id, stream_count)

In [None]:
# Define range of variable

end_date = dt.datetime.strptime("2019-10-12", "%Y-%m-%d")
start_date = dt.datetime.strptime("2019-01-01", "%Y-%m-%d")
date_values = [start_date + dt.timedelta(days = x) for x in range(0, (end_date-start_date).days)]
full_version = lambda x: url+

In [None]:
#for each_value in date_values:
    #print(each_value.date())

In [31]:
def spotify_viral(start_date, end_date, url):
    '''
    inputs (str): scrapes tracks from Spotify viral charts by 
    takeing in start_date and end_date with format YYYY-MM-DD
    
    
    outputs (dict): returns a dictonary with
    each combination of 'track_id', 'title', and 'artist' as a key; and
    no. of times the track appears in top 50 viral chart
    as value for the period.
    '''
    
    base_url = url
    
    end_date = dt.datetime.strptime(end_date, "%Y-%m-%d")
    start_date = dt.datetime.strptime(start_date, "%Y-%m-%d")
    
    dates = [start_date + dt.timedelta(days = x) for x in range(0, 1+(end_date-start_date).days,7)]

    full_version = lambda x: url+x

    song_dict = defaultdict(int)
    
    for each_date in dates:
        print(each_date.date())
        full_url = base_url + each_date.date().__str__() + '--' + (each_date.date() + dt.timedelta(days = 7)).__str__()
        response = requests.get(full_url)
        page = response.text
        soup = BeautifulSoup(page, 'html.parser')
        
        for each_track in soup.find_all('tr')[1:]:
            title = each_track.find('strong').text
            artist = each_track.find('span').text.replace("by", "")
            track_id = each_track.find('a')['href'][31:]
            stream_count = int(each_track.find( class_ = 'chart-table-streams').decode_contents(formatter="html").replace(",", ""))
            song_dict[(track_id, title, artist)] = stream_count

        
        time.sleep(random.randint(1,6))
        
    return song_dict

In [12]:
# Get no. of times a track has been on viral chart for a given period 

song_dict = spotify_viral(start_date = '2019-09-13', end_date = '2019-10-11', url = 'https://spotifycharts.com/regional/global/weekly/')

2019-09-13
2019-09-20
2019-09-27
2019-10-04
2019-10-11


In [13]:
print(song_dict)


defaultdict(<class 'int'>, {('6v3KW9xbzN5yKLt9YKDYA2', 'Señorita', ' Shawn Mendes', 2019): 32898929, ('21jGcNKet2qwijlDFuPiPb', 'Circles', ' Post Malone', 2019): 30376180, ('3NkJNL3WqO1Lqc3uNDxvCN', 'Don’t Call Me Angel (Charlie’s Angels) (with Miley Cyrus & Lana Del Rey)', ' Ariana Grande', 2019): 14602503, ('1rgnBhdG2JDFTbYkYRZAku', 'Dance Monkey', ' Tones and I', 2019): 37248929, ('6vBdBCoOhKHiYDDOcorfNo', 'Goodbyes (Feat. Young Thug)', ' Post Malone', 2019): 18125052, ('2ksOAxtIxY8yElEWw8RhgK', 'China', ' Anuel AA', 2019): 19162610, ('6EOKwHETwSkZ9gW2b6ASE0', 'Ransom', ' Lil Tecca', 2019): 20825198, ('51Fjme0JiitpyXKuyQiCDo', 'Lalala', ' Y2K', 2019): 20427457, ('05mDaV9Vb3wrzjF6OPZnhq', 'Saint-Tropez', ' Post Malone', 2019): 14091041, ('70eFcWOvlMObDhURTqT4Fv', 'Beautiful People (feat. Khalid)', ' Ed Sheeran', 2019): 20801853, ('7qEHsqek33rTcFNT9PFqLf', 'Someone You Loved', ' Lewis Capaldi', 2019): 23509860, ('2Fxmhks0bxGSBdJ92vM42m', 'bad guy', ' Billie Eilish', 2019): 18164119, (

In [24]:
# Use spotify API with spotipy library


with open('/home/thedatacurious/Documents/Metis/spotify.txt') as f:
    content = f.readlines()
    content = [x.strip() for x in content]

client_id = content[0]
secret = content[1]

from spotipy.oauth2 import SpotifyClientCredentials

client_credentials_manager = SpotifyClientCredentials(client_id = client_id, client_secret = secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)


In [30]:
def extract_features(song_dict):
    '''
    inputs (dict): takes in dictionary 'track_id', 'title', and 'artist' key 
    use 'track_id' to call spotify api for attributes
    
    outputs (dataframe): returns a dataframe with track attributes as columns
    '''
    
    counter = 0
    tracks_list_wfeatures = list()
    release_date = list()

    for key in song_dict.keys():
        counter += 1
        track_id = key[0]
        if counter % 50 == 0:
            print(len(song_dict.keys())-counter, ' tracks left')
            time.sleep(60)
        a_track_features = sp.audio_features(track_id)[0]
        date = sp.track(track_id)['album']['release_date']
        if a_track_features == None or date == None:
            continue
        else:
            tracks_list_wfeatures.append(a_track_features)
            release_date.append(date)

    df = pd.DataFrame(tracks_list_wfeatures)
    df['release_date'] = release_date
    
    return df

In [29]:
test = {('6v3KW9xbzN5yKLt9YKDYA2', 'Señorita', ' Shawn Mendes', 2019): 32898929}
extract_features(test)

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature,release_date
0,0.759,0.548,9,-6.049,0,0.029,0.0392,0,0.0828,0.749,116.967,audio_features,6v3KW9xbzN5yKLt9YKDYA2,spotify:track:6v3KW9xbzN5yKLt9YKDYA2,https://api.spotify.com/v1/tracks/6v3KW9xbzN5y...,https://api.spotify.com/v1/audio-analysis/6v3K...,190800,4,2019-06-19


In [32]:
df = pd.DataFrame(song_dict.items(), columns = ['tup', 'chart_appearances'])
df[['track_id', 'title', 'artist']] = pd.DataFrame(df.iloc[:,0].tolist())
df

ValueError: Columns must be same length as key

In [None]:
df.drop(columns = 'tup', inplace = True)

In [None]:
df2 = extract_features(song_dict)

In [None]:
full_df = df.merge(df2, left_on='track_id', right_on='id')

In [None]:
len(song_dict.keys())

In [33]:
# Test all functions

# Get full year's worth of tracks on viral charts/top charts

url = 'https://spotifycharts.com/regional/global/weekly/'

song_dict = spotify_viral(start_date = '2016-12-23', end_date = '2019-10-04', url = url)


# Get first dataframe from results

df = pd.DataFrame(song_dict.items(), columns = ['tup', 'stream_count'])

# Because the initial dataframe provides a tuple of multiple variables 
#in the first column called 'tup', we recast the tuple as list so that 
# each value has its own column

df[['track_id', 'title', 'artist']] = pd.DataFrame(df.iloc[:,0].tolist())

# Drop the 'tup' column
df.drop(columns = 'tup', inplace = True)

2016-12-23
2016-12-30
2017-01-06
2017-01-13
2017-01-20
2017-01-27
2017-02-03
2017-02-10
2017-02-17
2017-02-24
2017-03-03
2017-03-10
2017-03-17
2017-03-24
2017-03-31
2017-04-07
2017-04-14
2017-04-21
2017-04-28
2017-05-05
2017-05-12
2017-05-19
2017-05-26
2017-06-02
2017-06-09
2017-06-16
2017-06-23
2017-06-30
2017-07-07
2017-07-14
2017-07-21
2017-07-28
2017-08-04
2017-08-11
2017-08-18
2017-08-25
2017-09-01
2017-09-08
2017-09-15
2017-09-22
2017-09-29
2017-10-06
2017-10-13
2017-10-20
2017-10-27
2017-11-03
2017-11-10
2017-11-17
2017-11-24
2017-12-01
2017-12-08
2017-12-15
2017-12-22
2017-12-29
2018-01-05
2018-01-12
2018-01-19
2018-01-26
2018-02-02
2018-02-09
2018-02-16
2018-02-23
2018-03-02
2018-03-09
2018-03-16
2018-03-23
2018-03-30
2018-04-06
2018-04-13
2018-04-20
2018-04-27
2018-05-04
2018-05-11
2018-05-18
2018-05-25
2018-06-01
2018-06-08
2018-06-15
2018-06-22
2018-06-29
2018-07-06
2018-07-13
2018-07-20
2018-07-27
2018-08-03
2018-08-10
2018-08-17
2018-08-24
2018-08-31
2018-09-07
2018-09-14

In [34]:
# Get track attributes as a second dataframe 

df2 = extract_features(song_dict)

2791  tracks left
2741  tracks left
2691  tracks left
2641  tracks left
2591  tracks left
2541  tracks left
2491  tracks left
2441  tracks left
2391  tracks left
2341  tracks left
2291  tracks left
2241  tracks left
2191  tracks left
2141  tracks left
2091  tracks left
2041  tracks left
1991  tracks left
1941  tracks left
1891  tracks left
1841  tracks left
1791  tracks left
1741  tracks left
1691  tracks left
1641  tracks left
1591  tracks left
1541  tracks left
1491  tracks left
1441  tracks left
1391  tracks left
1341  tracks left
1291  tracks left
1241  tracks left
1191  tracks left
1141  tracks left
1091  tracks left
1041  tracks left
991  tracks left
941  tracks left
891  tracks left
841  tracks left
791  tracks left
741  tracks left
691  tracks left
641  tracks left
591  tracks left
541  tracks left
491  tracks left
441  tracks left
391  tracks left
341  tracks left
291  tracks left
241  tracks left
191  tracks left
141  tracks left
91  tracks left
41  tracks left


In [42]:
# Merge dataframes

full_df = df.merge(df2, left_on='track_id', right_on='id')

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature,release_date
2836,0.684,0.711,5,-3.073,0,0.166,0.168,0.0,0.107,0.474,152.043,audio_features,3TTMUI5dFcbeNSDTTDY9M8,spotify:track:3TTMUI5dFcbeNSDTTDY9M8,https://api.spotify.com/v1/tracks/3TTMUI5dFcbe...,https://api.spotify.com/v1/audio-analysis/3TTM...,181016,4,2019-10-04
2837,0.719,0.538,11,-6.897,0,0.28,0.632,0.0,0.135,0.803,204.132,audio_features,4e0FYxSROat25pH16zMTZI,spotify:track:4e0FYxSROat25pH16zMTZI,https://api.spotify.com/v1/tracks/4e0FYxSROat2...,https://api.spotify.com/v1/audio-analysis/4e0F...,117013,4,2019-08-30
2838,0.684,0.671,2,-6.424,0,0.289,0.0119,0.0,0.0769,0.45,179.987,audio_features,2WsuSYJNXGKXVYkHPnq2yp,spotify:track:2WsuSYJNXGKXVYkHPnq2yp,https://api.spotify.com/v1/tracks/2WsuSYJNXGKX...,https://api.spotify.com/v1/audio-analysis/2Wsu...,171253,4,2019-08-30
2839,0.743,0.68,5,-4.344,0,0.103,0.113,0.123,0.183,0.694,180.059,audio_features,4IJEw3fDvS6XF4sDc3bvjK,spotify:track:4IJEw3fDvS6XF4sDc3bvjK,https://api.spotify.com/v1/tracks/4IJEw3fDvS6X...,https://api.spotify.com/v1/audio-analysis/4IJE...,196583,4,2019-09-20
2840,0.809,0.545,7,-6.678,1,0.0735,0.392,0.0,0.151,0.244,97.008,audio_features,3QzAOrNlsabgbMwlZt7TAY,spotify:track:3QzAOrNlsabgbMwlZt7TAY,https://api.spotify.com/v1/tracks/3QzAOrNlsabg...,https://api.spotify.com/v1/audio-analysis/3QzA...,180435,4,2019-06-28


In [43]:
full_df.to_csv('../data/interim/global_top200_wyear.csv', index = False)

In [None]:

driver = webdriver.Firefox()
hit_genres = list()

for i in ful_df.index:
    search_text = "genre {} {}".format(full_df.title[i],full_df.artist[i]).replace(" ", "+")
    try:
        driver.get('https://www.google.com.np/#q='+ search_text)
        page_source = driver.page_source
        soup = BeautifulSoup(page_source, 'html.parser')
        element = soup.find(class_ = 'Z0LcW')
        if element == None:
            hit_genres.append(np.nan)
            print("NONE")
        else:
            hit_genres.append(element.text)
            print(element.text)
        time.sleep(random.randint(1,8)) 
    except:
            hit_genres.append(np.nan)
            print("TimeoutException")

In [16]:
len(hit_genres)

273