In [69]:
from dotenv import load_dotenv
import os
import base64
import requests
import json
import pandas as pd


In [70]:
load_dotenv()

# Specify Spotify API login info
client_id = os.getenv("CLIENT_ID")
client_secret = os.getenv("CLIENT_SECRET")

# Obtain Access Token from Spotify API
def get_token():
    auth_string = client_id + ":" + client_secret
    auth_bytes = auth_string.encode("utf-8")
    auth_base64 = str(base64.b64encode(auth_bytes),"utf-8")

    url = "https://accounts.spotify.com/api/token"
    headers = {
        "Authorization": "Basic " + auth_base64,
        "Content-Type": "application/x-www-form-urlencoded"
    }
    data = {"grant_type": "client_credentials"}
    result = requests.post(url, headers=headers, data=data)
    json_result = json.loads(result.content)
    token = json_result["access_token"]
    return token


def get_auth_header(token):
    return{"Authorization": "Bearer " + token}

    
token = get_token()


In [71]:
# Search Artist
def search_for_artist(token, artist_name):
    url = "https://api.spotify.com/v1/search"
    headers = get_auth_header(token)
    query = f"?q={artist_name}&type=artist&limit=1"

    query_url = url + query
    result = requests.get(query_url, headers = headers)
    json_result = json.loads(result.content)["artists"]["items"]
    if len(json_result) == 0:
        print("No artist with this name exists.")
        return None
    return json_result[0]


def get_songs_by_artist(token, artist_id):
    url = f"https://api.spotify.com/v1/artists/{artist_id}/top-tracks?country=US"
    headers = get_auth_header(token)
    result = requests.get(url,headers=headers)
    json_result = json.loads(result.content)["tracks"]
    return json_result

result = search_for_artist(token,"ACDC")
artist_id = result["id"]
songs = get_songs_by_artist(token, artist_id)
print(result)
print(artist_id)
print(songs)

for idx, song in enumerate(songs):
    print(f"{idx + 1}. {song['name']}")

{'external_urls': {'spotify': 'https://open.spotify.com/artist/711MCceyCBcFnzjGY4Q7Un'}, 'followers': {'href': None, 'total': 27798942}, 'genres': ['australian rock', 'hard rock', 'rock'], 'href': 'https://api.spotify.com/v1/artists/711MCceyCBcFnzjGY4Q7Un', 'id': '711MCceyCBcFnzjGY4Q7Un', 'images': [{'height': 640, 'url': 'https://i.scdn.co/image/ab6761610000e5ebc4c77549095c86acb4e77b37', 'width': 640}, {'height': 320, 'url': 'https://i.scdn.co/image/ab67616100005174c4c77549095c86acb4e77b37', 'width': 320}, {'height': 160, 'url': 'https://i.scdn.co/image/ab6761610000f178c4c77549095c86acb4e77b37', 'width': 160}], 'name': 'AC/DC', 'popularity': 78, 'type': 'artist', 'uri': 'spotify:artist:711MCceyCBcFnzjGY4Q7Un'}
711MCceyCBcFnzjGY4Q7Un
[{'album': {'album_type': 'album', 'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/711MCceyCBcFnzjGY4Q7Un'}, 'href': 'https://api.spotify.com/v1/artists/711MCceyCBcFnzjGY4Q7Un', 'id': '711MCceyCBcFnzjGY4Q7Un', 'name': 'AC/DC', 't

## Obtain tracks based on specified genre and popularity

In [72]:
def get_data(url: str, token: str, verbose: bool = False):
    headers = get_auth_header(token)
    response = requests.get(url, headers=headers)
    result = json.loads(response.text)

    if verbose:
        print('Response body:\n', result)

    return result

def get_tracks(genres_list: list, steps: int, limit: int, offset: int, access_token: str):
    tracks_df = pd.DataFrame()
    _initial_offset = offset

    for genre in genres_list:
        
        for step in range(steps):
            url = 'https://api.spotify.com/v1/search?q=genre:{}&type=track&limit={}&offset={}'.format(genre, limit, offset)
            search_item = get_data(url, access_token)

            for n in range(limit):   
                track_id = search_item['tracks']['items'][n]['id']
                track_name = search_item['tracks']['items'][n]['name']
                artist_name = search_item['tracks']['items'][n]['artists'][0]['name']
                popularity = search_item['tracks']['items'][n]['popularity']

                tracks_df = tracks_df.append({
                    'track_id': track_id,
                    'track_name': track_name,
                    'artist_name': artist_name,
                    'popularity': popularity,
                    'genre': genre
                }, ignore_index=True)

            offset += limit
        offset = _initial_offset

    return tracks_df

steps = 20
limit = 50
offset = 0
genres_list = ['rock','jazz']


In [73]:
# Test case 
url = 'https://api.spotify.com/v1/search?q=genre:{}&type=track&limit={}&#offset={}'.format('jazz', limit, offset)
search_item = get_data(url,token,verbose=True)
print(search_item)

Response body:
 {'tracks': {'href': 'https://api.spotify.com/v1/search?query=genre%3Ajazz&type=track&offset=0&limit=50', 'items': [{'album': {'album_type': 'album', 'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/49e4v89VmlDcFCMyDv9wQ9'}, 'href': 'https://api.spotify.com/v1/artists/49e4v89VmlDcFCMyDv9wQ9', 'id': '49e4v89VmlDcFCMyDv9wQ9', 'name': 'Dean Martin', 'type': 'artist', 'uri': 'spotify:artist:49e4v89VmlDcFCMyDv9wQ9'}], 'available_markets': ['AR', 'AU', 'AT', 'BE', 'BO', 'BR', 'BG', 'CA', 'CL', 'CO', 'CR', 'CY', 'CZ', 'DK', 'DO', 'DE', 'EC', 'EE', 'SV', 'FI', 'FR', 'GR', 'GT', 'HN', 'HK', 'HU', 'IS', 'IE', 'IT', 'LV', 'LT', 'LU', 'MY', 'MT', 'MX', 'NL', 'NZ', 'NI', 'NO', 'PA', 'PY', 'PE', 'PH', 'PL', 'PT', 'SG', 'SK', 'ES', 'SE', 'CH', 'TW', 'TR', 'UY', 'US', 'GB', 'AD', 'LI', 'MC', 'ID', 'JP', 'TH', 'VN', 'RO', 'IL', 'ZA', 'SA', 'AE', 'BH', 'QA', 'OM', 'KW', 'EG', 'MA', 'DZ', 'TN', 'LB', 'JO', 'PS', 'IN', 'BY', 'KZ', 'MD', 'UA', 'AL', 'BA', 'HR', 'ME'

In [18]:
# Using get tracks to parse data
tracks_df = get_tracks(genres_list, steps, limit, offset, token)

tracks_df

Unnamed: 0,track_id,track_name,artist_name,popularity,genre
0,2EjXfH91m7f8HiJN1yQg97,Rockin' Around The Christmas Tree,Brenda Lee,95.0,rock
1,2QjOHCTQ1Jl3zawyYOpxh6,Sweater Weather,The Neighbourhood,92.0,rock
2,5XeFesFbtLpXzIVDNQP22n,I Wanna Be Yours,Arctic Monkeys,95.0,rock
3,3QiAAp20rPC3dcAtKtMaqQ,Blue Christmas,Elvis Presley,86.0,rock
4,58ge6dfP91o9oXMzq3XkIS,505,Arctic Monkeys,82.0,rock
...,...,...,...,...,...
1995,1iRpPjuliuSV0vbTVajdOn,Snoopy And The Leaf / Frieda (With The Natural...,Vince Guaraldi,49.0,jazz
1996,7z0Azh6QALRhIifBF1yZYK,Have Yourself A Merry Little Christmas,Duke Pearson,50.0,jazz
1997,6gzPmb75SbQGuCWwtAoutC,Chimes,Duponte et Duponte,54.0,jazz
1998,13Y8guAjsfss2WPIINuoYd,Favela - Remastered,Ike Quebec,49.0,jazz


## Collect audio features based on the scraped tracks

In [10]:
def get_track_features(tracks_df: pd.DataFrame, access_token: str):
    track_features_df = pd.DataFrame()

    ids_to_request = []
    for index, row in tracks_df.iterrows():
        track_id = tracks_df.iloc[index]['track_id']
        ids_to_request += [track_id]
    
    for i in range(len(ids_to_request) // 100 + 1):
        _list = ids_to_request[i*100:(i+1)*100]
        if len(_list) == 0: 
            break

        request_text = ",".join(_list) 
        url = 'https://api.spotify.com/v1/audio-features?ids=' + request_text
        result = get_data(url, access_token)
        track_features_list = result["audio_features"]

        for track_features in track_features_list:
            track_features_df = track_features_df.append(track_features, ignore_index=True)

    # drop negligible features
    track_features_df.drop(columns=['type', 'uri', 'track_href', 'analysis_url'], inplace=True)
    track_features_df.rename(columns={'id':'track_id'}, inplace=True)

    return track_features_df

In [11]:
track_features_df = get_track_features(tracks_df, token)
track_features_df

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,track_id,duration_ms,time_signature
0,0.589,0.4720,8.0,-8.749,1.0,0.0502,0.61400,0.000000,0.5050,0.898,67.196,2EjXfH91m7f8HiJN1yQg97,126267.0,4.0
1,0.612,0.8070,10.0,-2.810,1.0,0.0336,0.04950,0.017700,0.1010,0.398,124.053,2QjOHCTQ1Jl3zawyYOpxh6,240400.0,4.0
2,0.464,0.4170,0.0,-9.345,0.0,0.0256,0.13600,0.022000,0.0974,0.479,67.528,5XeFesFbtLpXzIVDNQP22n,183956.0,4.0
3,0.465,0.2700,4.0,-16.661,1.0,0.0425,0.90000,0.001040,0.1450,0.580,94.518,3QiAAp20rPC3dcAtKtMaqQ,129173.0,4.0
4,0.520,0.8520,0.0,-5.866,1.0,0.0543,0.00237,0.000058,0.0733,0.234,140.267,58ge6dfP91o9oXMzq3XkIS,253587.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0.439,0.3330,0.0,-14.772,1.0,0.0615,0.98500,0.866000,0.0668,0.564,71.652,1iRpPjuliuSV0vbTVajdOn,33217.0,1.0
1996,0.495,0.2130,0.0,-14.328,1.0,0.0341,0.94700,0.768000,0.1020,0.278,84.320,7z0Azh6QALRhIifBF1yZYK,121773.0,4.0
1997,0.458,0.0533,10.0,-20.472,0.0,0.0289,0.86300,0.138000,0.1100,0.155,96.864,6gzPmb75SbQGuCWwtAoutC,163591.0,4.0
1998,0.605,0.1280,0.0,-18.793,0.0,0.1090,0.93100,0.601000,0.1270,0.796,148.197,13Y8guAjsfss2WPIINuoYd,240267.0,4.0


## Podcast parsing

In [74]:
# Test case
limit = 90
offset = 10
url = 'https://api.spotify.com/v1/me/shows?offset={}&limit={}'.format(offset, limit)
print(f"URL: {url}")
print(f"Access Token: {token}")
search_item = get_data(url, token,verbose=True)
print(search_item)

URL: https://api.spotify.com/v1/me/shows?offset=10&limit=90
Access Token: BQA7BTRKhq-M6O4xVKv9o72CYamRItTD5-BZBxAvblNH-aj9kNB6jjSWHnvOxWi6Aa_ok7p9lM2x5SGdaKfQtBi5dkO_xQoeIgfwY7_mYwkF1JhgabQ
Response body:
 {'error': {'status': 401, 'message': 'Missing token'}}
{'error': {'status': 401, 'message': 'Missing token'}}


In [75]:
# Test
offset = 0
limit = 2
url = 'https://api.spotify.com/v1/search?q=show&type=show&market={}&limit={}&offset={}'.format('US', limit, offset)
search_item = get_data(url, token,verbose=True)

Response body:
 {'shows': {'href': 'https://api.spotify.com/v1/search?query=show&type=show&market=US&offset=0&limit=2', 'items': [{'available_markets': ['AD', 'AE', 'AG', 'AL', 'AM', 'AR', 'AT', 'AU', 'BA', 'BB', 'BE', 'BF', 'BG', 'BH', 'BJ', 'BO', 'BR', 'BS', 'BT', 'BW', 'BZ', 'CA', 'CH', 'CL', 'CO', 'CR', 'CV', 'CW', 'CY', 'CZ', 'DE', 'DK', 'DM', 'DO', 'DZ', 'EC', 'EE', 'EG', 'ES', 'FI', 'FJ', 'FM', 'FR', 'GB', 'GD', 'GE', 'GH', 'GM', 'GR', 'GT', 'GW', 'GY', 'HK', 'HN', 'HR', 'HT', 'HU', 'ID', 'IE', 'IL', 'IN', 'IS', 'IT', 'JM', 'JO', 'JP', 'KE', 'KI', 'KN', 'KR', 'KW', 'LB', 'LC', 'LI', 'LR', 'LS', 'LT', 'LU', 'LV', 'MA', 'MC', 'ME', 'MG', 'MH', 'MK', 'ML', 'MT', 'MU', 'MV', 'MW', 'MX', 'MY', 'MZ', 'NA', 'NE', 'NG', 'NI', 'NL', 'NO', 'NR', 'NZ', 'OM', 'PA', 'PE', 'PG', 'PH', 'PL', 'PS', 'PT', 'PW', 'PY', 'QA', 'RO', 'RS', 'SA', 'SB', 'SC', 'SE', 'SG', 'SI', 'SK', 'SL', 'SM', 'SN', 'SR', 'ST', 'SV', 'TH', 'TL', 'TN', 'TO', 'TR', 'TT', 'TV', 'TW', 'UA', 'US', 'UY', 'VC', 'VN', 'VU', '

In [91]:
def get_shows(market: str, steps: int, limit: int, offset: int, access_token: str):
    shows_df = pd.DataFrame()
    _initial_offset = offset

    for step in range(steps):
        url = 'https://api.spotify.com/v1/search?q=show&type=show&market={}&limit={}&offset={}'.format('US', limit, offset)
        search_item = get_data(url, access_token)

        # Print response for debugging
        print(search_item)

        if 'shows' in search_item and 'items' in search_item['shows']:
            for n in range(min(limit, len(search_item['shows']['items']))):   
                available_market = search_item['shows']['items'][n]['available_markets']
                show_id = search_item['shows']['items'][n]['id']
                show_description = search_item['shows']['items'][n]['description']
                show_language = search_item['shows']['items'][n]['languages']
                show_name = search_item['shows']['items'][n]['name']
                artist_name = search_item['shows']['items'][n]['publisher']
                num_episodes = search_item['shows']['items'][n]['total_episodes']

                shows_df = shows_df.append({
                    'show_name': show_name,
                    'show_id': show_id,
                    'show_language': show_language,
                    'num_episodes': num_episodes,
                    'show_description': show_description,
                    'artist_name': artist_name,
                    'available_market': available_market
                }, ignore_index=True)

        offset += limit
    offset = _initial_offset

    return shows_df

steps = 20
limit = 50
offset = 1
market = 'US'
showDF = get_shows(market, steps, limit, offset, token)
display(showDF)

{'shows': {'href': 'https://api.spotify.com/v1/search?query=show&type=show&market=US&offset=101&limit=50', 'items': [{'available_markets': ['AD', 'AE', 'AG', 'AL', 'AM', 'AO', 'AR', 'AT', 'AU', 'AZ', 'BA', 'BB', 'BE', 'BF', 'BG', 'BH', 'BI', 'BJ', 'BN', 'BO', 'BR', 'BS', 'BT', 'BW', 'BZ', 'CA', 'CH', 'CI', 'CL', 'CM', 'CO', 'CR', 'CV', 'CW', 'CY', 'CZ', 'DE', 'DJ', 'DK', 'DM', 'DO', 'DZ', 'EC', 'EE', 'EG', 'ES', 'FI', 'FJ', 'FM', 'FR', 'GA', 'GB', 'GD', 'GE', 'GH', 'GM', 'GN', 'GQ', 'GR', 'GT', 'GW', 'GY', 'HK', 'HN', 'HR', 'HT', 'HU', 'ID', 'IE', 'IL', 'IN', 'IS', 'IT', 'JM', 'JO', 'JP', 'KE', 'KH', 'KI', 'KM', 'KN', 'KR', 'KW', 'LA', 'LB', 'LC', 'LI', 'LR', 'LS', 'LT', 'LU', 'LV', 'MA', 'MC', 'ME', 'MG', 'MH', 'MK', 'ML', 'MN', 'MO', 'MR', 'MT', 'MU', 'MV', 'MW', 'MX', 'MY', 'MZ', 'NA', 'NE', 'NG', 'NI', 'NL', 'NO', 'NP', 'NR', 'NZ', 'OM', 'PA', 'PE', 'PG', 'PH', 'PL', 'PS', 'PT', 'PW', 'PY', 'QA', 'RO', 'RS', 'RW', 'SA', 'SB', 'SC', 'SE', 'SG', 'SI', 'SK', 'SL', 'SM', 'SN', 'SR', 'S

Unnamed: 0,show_name,show_id,show_language,num_episodes,show_description,artist_name,available_market
0,The Ben Shapiro Show,1WErgoXiZwgctkHLzqU6nf,[en],2214.0,Tired of the lies? Tired of the spin? Are you ...,The Daily Wire,"[AD, AE, AG, AL, AM, AO, AR, AT, AU, AZ, BA, B..."
1,公視主題之夜SHOW,6Zu99rhYZpqpc3GliuccCe,[zh],101.0,《公視主題之夜SHOW》從電影出發的公共論壇實境秀節目，激發你對社會議題不同的看法，拋出思辨...,公視主題之夜SHOW,"[AD, AE, AG, AL, AM, AO, AR, AT, AU, AZ, BA, B..."
2,Shawn Ryan Show,5eodRZd3qR9VT1ip1wI7xQ,[en-US],131.0,"The ""Shawn Ryan Show"" is hosted by Shawn Ryan,...",Shawn Ryan | Cumulus Podcast Network,"[AD, AE, AG, AL, AM, AO, AR, AT, AU, AZ, BA, B..."
3,笑えるホラーshow!,6bMhRSp2qkd0FR06fCrOID,[ja-JP],7.0,ホラーで笑おう！ 恐怖に慄き、家族愛に涙し、超展開に爆笑する。 コワイ・グロいだけがホラー映...,笑えるホラーshow!,"[AD, AE, AG, AL, AM, AO, AR, AT, AU, AZ, BA, B..."
4,The Adam Friedland Show Podcast,5MfQNeGILinrJFkf2UHLVK,[en-US],375.0,"Ladies and Gentlemen, the Adam Friedland Show.",The Adam Friedland Show,"[AD, AE, AG, AL, AM, AO, AR, AT, AU, AZ, BA, B..."
...,...,...,...,...,...,...,...
945,Motivational Speeches,3cTqh1f1Ytns9lw5lC885f,[en],1378.0,Motivational Speeches may be the force that is...,Motivationly,"[AD, AE, AG, AL, AM, AR, AT, AU, BA, BB, BE, B..."
946,The LloydAndMatt Show,2IHDo498vmih1ydiaaoyxy,[en],37.0,Welcome To The LloydAndMatt Show.,Lloyd & Matt,"[AD, AE, AG, AL, AM, AR, AT, AU, BA, BB, BE, B..."
947,The Carly P Reilly Show,5G2HSDneVg8mZUQ3zUIoLX,[en],255.0,Formerly: Overpriced JPEGs After 2 years of c...,Carly Reilly & Overpriced Media,"[AD, AE, AG, AL, AM, AO, AR, AT, AU, AZ, BA, B..."
948,That Dead Body Show,5ONYcZq8SDH88vzQZeY1RE,[en],14.0,"A podcast, about.... well ... murder,True crim...",That Dead Body Show,"[AD, AE, AG, AL, AM, AO, AR, AT, AU, AZ, BA, B..."


In [90]:
# Export to cvs file
file_path = '/Users/blag/Documents/BPP/WebAutomation' # Need to change for different local users
file_name = 'SpotifyPodcasts.csv'
csv_file_path = os.path.join(file_path, file_name)
showDF.to_csv(csv_file_path, index = False, sep=';')