In [93]:
import base64
import json
import os
from urllib.parse import quote

import pandas as pd
import requests
from dotenv import load_dotenv


In [94]:
# Load environment configuration before the credentials are read below.
load_dotenv()

# Spotify API login info; each value is None when its variable is not set.
client_id, client_secret = os.environ.get("CLIENT_ID"), os.environ.get("CLIENT_SECRET")

# Obtain Access Token from Spotify API
def get_token():
    """Request an access token from the Spotify Accounts token endpoint.

    Uses the client-credentials grant with the module-level ``client_id`` and
    ``client_secret``, sent as an HTTP Basic ``Authorization`` header.

    Returns:
        str: The ``access_token`` field of the JSON response.

    Raises:
        ValueError: If ``client_id`` or ``client_secret`` is missing or empty.
        requests.HTTPError: If the token endpoint answers with an error status.
    """
    if not client_id or not client_secret:
        # Fail with a clear message instead of a TypeError on `None + ":"`.
        raise ValueError("CLIENT_ID and CLIENT_SECRET must be set in the environment.")

    # HTTP Basic credentials: base64 of "<client_id>:<client_secret>".
    auth_bytes = f"{client_id}:{client_secret}".encode("utf-8")
    auth_base64 = base64.b64encode(auth_bytes).decode("utf-8")

    url = "https://accounts.spotify.com/api/token"
    headers = {
        "Authorization": "Basic " + auth_base64,
        "Content-Type": "application/x-www-form-urlencoded"
    }
    data = {"grant_type": "client_credentials"}
    # Without a timeout a stalled connection would block the notebook forever.
    result = requests.post(url, headers=headers, data=data, timeout=10)
    # Report 4xx/5xx here rather than as a KeyError on "access_token" below.
    result.raise_for_status()
    return json.loads(result.content)["access_token"]


def get_auth_header(token):
    """Build the bearer-token ``Authorization`` header dict for ``token``."""
    bearer_value = "Bearer " + token
    return {"Authorization": bearer_value}

    
# Module-level access token; the cells below pass it to the API helper functions.
token = get_token()


In [95]:
# Search Artist
def search_for_artist(token, artist_name):
    """Return the first artist the Spotify search endpoint finds for a name.

    Args:
        token: Access token used to build the ``Authorization`` header.
        artist_name: Search text, sent as the ``q`` query parameter.

    Returns:
        The first item of the response's ``artists.items`` list, or ``None``
        (after printing a message) when that list is empty.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    url = "https://api.spotify.com/v1/search"
    headers = get_auth_header(token)
    # Let requests build the query string so names containing spaces, "&" or
    # "#" are percent-encoded instead of corrupting the URL.
    params = {"q": artist_name, "type": "artist", "limit": 1}

    # Without a timeout a stalled connection would block the notebook forever.
    result = requests.get(url, headers=headers, params=params, timeout=10)
    # Report 4xx/5xx here rather than as a KeyError on "artists" below.
    result.raise_for_status()
    artists = json.loads(result.content)["artists"]["items"]
    if not artists:
        print("No artist with this name exists.")
        return None
    return artists[0]


def get_songs_by_artist(token, artist_id, country="US"):
    """Fetch the top tracks of an artist from the Spotify API.

    Args:
        token: Access token used to build the ``Authorization`` header.
        artist_id: Spotify artist id placed in the request path.
        country: Value of the ``country`` query parameter. Defaults to
            ``"US"``, the value that was previously hard-coded.

    Returns:
        The ``tracks`` entry of the JSON response.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    url = f"https://api.spotify.com/v1/artists/{artist_id}/top-tracks"
    headers = get_auth_header(token)
    # Without a timeout a stalled connection would block the notebook forever.
    result = requests.get(url, headers=headers, params={"country": country}, timeout=10)
    # Report 4xx/5xx here rather than as a KeyError on "tracks" below.
    result.raise_for_status()
    return json.loads(result.content)["tracks"]

result = search_for_artist(token, "ACDC")
if result is None:
    # search_for_artist returns None when nothing matched; indexing it would
    # raise a TypeError, so fall back to an empty track list instead.
    artist_id = None
    songs = []
else:
    artist_id = result["id"]
    songs = get_songs_by_artist(token, artist_id)
    print(result)
    print(artist_id)
    print(songs)

# Numbered list of the track names, starting at 1.
for idx, song in enumerate(songs, start=1):
    print(f"{idx}. {song['name']}")

{'external_urls': {'spotify': 'https://open.spotify.com/artist/711MCceyCBcFnzjGY4Q7Un'}, 'followers': {'href': None, 'total': 27798942}, 'genres': ['australian rock', 'hard rock', 'rock'], 'href': 'https://api.spotify.com/v1/artists/711MCceyCBcFnzjGY4Q7Un', 'id': '711MCceyCBcFnzjGY4Q7Un', 'images': [{'height': 640, 'url': 'https://i.scdn.co/image/ab6761610000e5ebc4c77549095c86acb4e77b37', 'width': 640}, {'height': 320, 'url': 'https://i.scdn.co/image/ab67616100005174c4c77549095c86acb4e77b37', 'width': 320}, {'height': 160, 'url': 'https://i.scdn.co/image/ab6761610000f178c4c77549095c86acb4e77b37', 'width': 160}], 'name': 'AC/DC', 'popularity': 78, 'type': 'artist', 'uri': 'spotify:artist:711MCceyCBcFnzjGY4Q7Un'}
711MCceyCBcFnzjGY4Q7Un
[{'album': {'album_type': 'album', 'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/711MCceyCBcFnzjGY4Q7Un'}, 'href': 'https://api.spotify.com/v1/artists/711MCceyCBcFnzjGY4Q7Un', 'id': '711MCceyCBcFnzjGY4Q7Un', 'name': 'AC/DC', 't

## Obtain tracks for the specified genres, along with their popularity

In [96]:
def get_data(url: str, token: str, verbose: bool = False):
    """Send an authorized GET request to ``url`` and return the parsed JSON body.

    Args:
        url: Full request URL, including any query string.
        token: Access token used to build the ``Authorization`` header.
        verbose: When true, also print the parsed response.

    Returns:
        The object produced by decoding the response text as JSON.
    """
    response = requests.get(url, headers=get_auth_header(token))
    payload = json.loads(response.text)

    if not verbose:
        return payload

    print('Response body:\n', payload)
    return payload

def get_tracks(genres_list: list, steps: int, limit: int, offset: int, access_token: str):
    """Collect tracks for each genre by paging through the Spotify search endpoint.

    For every genre, ``steps`` consecutive pages of ``limit`` results are
    requested, starting at ``offset``. Paging restarts at ``offset`` for each
    genre.

    Args:
        genres_list: Genre names to search for.
        steps: Number of pages to request per genre.
        limit: Page size sent as the ``limit`` query parameter.
        offset: Index of the first result of the first page.
        access_token: Access token forwarded to ``get_data``.

    Returns:
        pd.DataFrame: One row per track with the columns ``track_id``,
        ``track_name``, ``artist_name``, ``popularity`` and ``genre``.
    """
    columns = ['track_id', 'track_name', 'artist_name', 'popularity', 'genre']
    # Rows are gathered in a list and turned into a frame once at the end:
    # DataFrame.append no longer exists in pandas 2.x and was quadratic anyway.
    records = []

    for genre in genres_list:
        for step in range(steps):
            page_offset = offset + step * limit
            # Percent-encode the genre so names with spaces or "&" stay intact.
            url = 'https://api.spotify.com/v1/search?q=genre:{}&type=track&limit={}&offset={}'.format(
                quote(str(genre)), limit, page_offset
            )
            search_item = get_data(url, access_token)

            # Skip pages without a track list (e.g. an error payload), matching
            # the guard get_shows applies to its own responses.
            if 'tracks' not in search_item or 'items' not in search_item['tracks']:
                continue

            # Iterate over what was actually returned; a page may hold fewer
            # than `limit` items, which made index-based access fail.
            for item in search_item['tracks']['items']:
                if item is None:
                    continue
                records.append({
                    'track_id': item['id'],
                    'track_name': item['name'],
                    'artist_name': item['artists'][0]['name'],
                    'popularity': item['popularity'],
                    'genre': genre
                })

    return pd.DataFrame(records, columns=columns)

In [97]:
# Test case
# Define the paging values here so this cell does not depend on a later cell
# having been run first.
limit = 50
offset = 0
# The original URL had "&#offset=": everything after "#" is a URL fragment, so
# the offset was never part of the query string.
url = 'https://api.spotify.com/v1/search?q=genre:{}&type=track&limit={}&offset={}'.format('jazz', limit, offset)
search_item = get_data(url, token, verbose=True)

Response body:
 {'tracks': {'href': 'https://api.spotify.com/v1/search?query=genre%3Ajazz&type=track&offset=0&limit=50', 'items': [{'album': {'album_type': 'album', 'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/49e4v89VmlDcFCMyDv9wQ9'}, 'href': 'https://api.spotify.com/v1/artists/49e4v89VmlDcFCMyDv9wQ9', 'id': '49e4v89VmlDcFCMyDv9wQ9', 'name': 'Dean Martin', 'type': 'artist', 'uri': 'spotify:artist:49e4v89VmlDcFCMyDv9wQ9'}], 'available_markets': ['AR', 'AU', 'AT', 'BE', 'BO', 'BR', 'BG', 'CA', 'CL', 'CO', 'CR', 'CY', 'CZ', 'DK', 'DO', 'DE', 'EC', 'EE', 'SV', 'FI', 'FR', 'GR', 'GT', 'HN', 'HK', 'HU', 'IS', 'IE', 'IT', 'LV', 'LT', 'LU', 'MY', 'MT', 'MX', 'NL', 'NZ', 'NI', 'NO', 'PA', 'PY', 'PE', 'PH', 'PL', 'PT', 'SG', 'SK', 'ES', 'SE', 'CH', 'TW', 'TR', 'UY', 'US', 'GB', 'AD', 'LI', 'MC', 'ID', 'JP', 'TH', 'VN', 'RO', 'IL', 'ZA', 'SA', 'AE', 'BH', 'QA', 'OM', 'KW', 'EG', 'MA', 'DZ', 'TN', 'LB', 'JO', 'PS', 'IN', 'BY', 'KZ', 'MD', 'UA', 'AL', 'BA', 'HR', 'ME'

In [98]:
# Collect 20 pages of 50 tracks per genre, starting from the first result.
genres_list = ['rock','jazz']
steps, limit, offset = 20, 50, 0
tracks_df = get_tracks(
    genres_list=genres_list,
    steps=steps,
    limit=limit,
    offset=offset,
    access_token=token,
)

display(tracks_df.head(10))

Unnamed: 0,track_id,track_name,artist_name,popularity,genre
0,2EjXfH91m7f8HiJN1yQg97,Rockin' Around The Christmas Tree,Brenda Lee,95.0,rock
1,2QjOHCTQ1Jl3zawyYOpxh6,Sweater Weather,The Neighbourhood,93.0,rock
2,5XeFesFbtLpXzIVDNQP22n,I Wanna Be Yours,Arctic Monkeys,95.0,rock
3,3QiAAp20rPC3dcAtKtMaqQ,Blue Christmas,Elvis Presley,86.0,rock
4,58ge6dfP91o9oXMzq3XkIS,505,Arctic Monkeys,82.0,rock
5,2pnPe4pJtq7689i5ydzvJJ,Run Rudolph Run,Chuck Berry,85.0,rock
6,0ofHAoxe9vBkTCp2UQIavz,Dreams - 2004 Remaster,Fleetwood Mac,88.0,rock
7,003vvx7Niy0yvhvHt4a68B,Mr. Brightside,The Killers,88.0,rock
8,5UWwZ5lm5PKu6eKsHAGxOk,Everlong,Foo Fighters,87.0,rock
9,3QIoEi8Enr9uHffwInGIsC,Please Come Home for Christmas - 2013 Remaster,Eagles,84.0,rock


## Collect audio features for the collected tracks

In [100]:
def get_track_features(tracks_df: pd.DataFrame, access_token: str):
    """Fetch audio features for every track id in ``tracks_df``.

    The ids are sent to the ``audio-features`` endpoint in batches of 100,
    joined with commas in the ``ids`` query parameter.

    Args:
        tracks_df: Frame with a ``track_id`` column.
        access_token: Access token forwarded to ``get_data``.

    Returns:
        pd.DataFrame: One row per returned feature record, without the
        ``type``, ``uri``, ``track_href`` and ``analysis_url`` columns and with
        ``id`` renamed to ``track_id``. Empty when ``tracks_df`` has no rows.
    """
    batch_size = 100

    if tracks_df.empty:
        return pd.DataFrame()

    # Read the column directly: `tracks_df.iloc[index]` treated index labels as
    # positions and broke on any frame without a default 0..n-1 index.
    ids_to_request = tracks_df['track_id'].tolist()

    feature_records = []
    for start in range(0, len(ids_to_request), batch_size):
        batch = ids_to_request[start:start + batch_size]
        url = 'https://api.spotify.com/v1/audio-features?ids=' + ",".join(batch)
        result = get_data(url, access_token)
        # Defensive: ignore entries that come back as None instead of a dict.
        feature_records.extend(
            features for features in result["audio_features"] if features is not None
        )

    # Build the frame once; DataFrame.append no longer exists in pandas 2.x.
    track_features_df = pd.DataFrame(feature_records)

    # drop negligible features (errors='ignore' keeps an empty result from raising)
    track_features_df = track_features_df.drop(
        columns=['type', 'uri', 'track_href', 'analysis_url'], errors='ignore'
    )
    return track_features_df.rename(columns={'id': 'track_id'})

In [101]:
# Fetch the audio features for the collected tracks and preview the first rows.
track_features_df = get_track_features(tracks_df=tracks_df, access_token=token)
display(track_features_df.head(10))

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,track_id,duration_ms,time_signature
0,0.589,0.472,8.0,-8.749,1.0,0.0502,0.614,0.0,0.505,0.898,67.196,2EjXfH91m7f8HiJN1yQg97,126267.0,4.0
1,0.612,0.807,10.0,-2.81,1.0,0.0336,0.0495,0.0177,0.101,0.398,124.053,2QjOHCTQ1Jl3zawyYOpxh6,240400.0,4.0
2,0.464,0.417,0.0,-9.345,0.0,0.0256,0.136,0.022,0.0974,0.479,67.528,5XeFesFbtLpXzIVDNQP22n,183956.0,4.0
3,0.465,0.27,4.0,-16.661,1.0,0.0425,0.9,0.00104,0.145,0.58,94.518,3QiAAp20rPC3dcAtKtMaqQ,129173.0,4.0
4,0.52,0.852,0.0,-5.866,1.0,0.0543,0.00237,5.8e-05,0.0733,0.234,140.267,58ge6dfP91o9oXMzq3XkIS,253587.0,4.0
5,0.681,0.715,7.0,-10.609,0.0,0.0912,0.812,9e-06,0.0777,0.957,152.132,2pnPe4pJtq7689i5ydzvJJ,165733.0,4.0
6,0.828,0.492,0.0,-9.744,1.0,0.0276,0.0644,0.00428,0.128,0.789,120.151,0ofHAoxe9vBkTCp2UQIavz,257800.0,4.0
7,0.352,0.911,1.0,-5.23,1.0,0.0747,0.00121,0.0,0.0995,0.236,148.033,003vvx7Niy0yvhvHt4a68B,222973.0,4.0
8,0.413,0.881,11.0,-5.541,0.0,0.0367,6e-05,0.000308,0.0805,0.364,158.066,5UWwZ5lm5PKu6eKsHAGxOk,250547.0,4.0
9,0.612,0.289,9.0,-11.568,1.0,0.0421,0.599,0.00266,0.0561,0.466,183.18,3QIoEi8Enr9uHffwInGIsC,177440.0,3.0


## Podcast parsing

In [102]:
# Test: request a single two-item page of shows and print the raw response.
limit, offset = 2, 0
url = 'https://api.spotify.com/v1/search?q=show&type=show&market={}&limit={}&offset={}'.format('US', limit, offset)
search_item = get_data(url=url, token=token, verbose=True)

Response body:
 {'shows': {'href': 'https://api.spotify.com/v1/search?query=show&type=show&market=US&offset=0&limit=2', 'items': [{'available_markets': ['AD', 'AE', 'AG', 'AL', 'AM', 'AR', 'AT', 'AU', 'BA', 'BB', 'BE', 'BF', 'BG', 'BH', 'BJ', 'BO', 'BR', 'BS', 'BT', 'BW', 'BZ', 'CA', 'CH', 'CL', 'CO', 'CR', 'CV', 'CW', 'CY', 'CZ', 'DE', 'DK', 'DM', 'DO', 'DZ', 'EC', 'EE', 'EG', 'ES', 'FI', 'FJ', 'FM', 'FR', 'GB', 'GD', 'GE', 'GH', 'GM', 'GR', 'GT', 'GW', 'GY', 'HK', 'HN', 'HR', 'HT', 'HU', 'ID', 'IE', 'IL', 'IN', 'IS', 'IT', 'JM', 'JO', 'JP', 'KE', 'KI', 'KN', 'KR', 'KW', 'LB', 'LC', 'LI', 'LR', 'LS', 'LT', 'LU', 'LV', 'MA', 'MC', 'ME', 'MG', 'MH', 'MK', 'ML', 'MT', 'MU', 'MV', 'MW', 'MX', 'MY', 'MZ', 'NA', 'NE', 'NG', 'NI', 'NL', 'NO', 'NR', 'NZ', 'OM', 'PA', 'PE', 'PG', 'PH', 'PL', 'PS', 'PT', 'PW', 'PY', 'QA', 'RO', 'RS', 'SA', 'SB', 'SC', 'SE', 'SG', 'SI', 'SK', 'SL', 'SM', 'SN', 'SR', 'ST', 'SV', 'TH', 'TL', 'TN', 'TO', 'TR', 'TT', 'TV', 'TW', 'UA', 'US', 'UY', 'VC', 'VN', 'VU', '

In [103]:
def get_shows(market: str, steps: int, limit: int, offset: int, access_token: str):
    """Collect shows by paging through the Spotify search endpoint.

    Requests ``steps`` consecutive pages of ``limit`` results for the query
    ``show``, starting at ``offset``.

    Args:
        market: Value of the ``market`` query parameter.
        steps: Number of pages to request.
        limit: Page size sent as the ``limit`` query parameter.
        offset: Index of the first result of the first page.
        access_token: Access token forwarded to ``get_data``.

    Returns:
        pd.DataFrame: One row per show with the columns ``show_name``,
        ``show_id``, ``show_language``, ``num_episodes``, ``show_description``,
        ``artist_name`` and ``available_market``.
    """
    columns = [
        'show_name', 'show_id', 'show_language', 'num_episodes',
        'show_description', 'artist_name', 'available_market',
    ]
    # Rows are gathered in a list and turned into a frame once at the end:
    # DataFrame.append no longer exists in pandas 2.x and was quadratic anyway.
    records = []

    for step in range(steps):
        # Use the caller's market; it used to be ignored in favour of 'US'.
        url = 'https://api.spotify.com/v1/search?q=show&type=show&market={}&limit={}&offset={}'.format(
            market, limit, offset + step * limit
        )
        search_item = get_data(url, access_token)

        # Pages without a show list (e.g. an error payload) are skipped.
        if 'shows' not in search_item or 'items' not in search_item['shows']:
            continue

        for show in search_item['shows']['items'][:limit]:
            # Defensive: ignore entries that come back as None instead of a dict.
            if show is None:
                continue
            records.append({
                'show_name': show['name'],
                'show_id': show['id'],
                'show_language': show['languages'],
                'num_episodes': show['total_episodes'],
                'show_description': show['description'],
                'artist_name': show['publisher'],
                'available_market': show['available_markets']
            })

    return pd.DataFrame(records, columns=columns)

# Collect 20 pages of 50 shows for the chosen market.
# NOTE(review): offset starts at 1 here while the track cells start at 0;
# presumably this skips the first search result - confirm that is intended.
market = 'US'
steps, limit, offset = 20, 50, 1
showDF = get_shows(
    market=market,
    steps=steps,
    limit=limit,
    offset=offset,
    access_token=token,
)
display(showDF)

Unnamed: 0,show_name,show_id,show_language,num_episodes,show_description,artist_name,available_market
0,The Ben Shapiro Show,1WErgoXiZwgctkHLzqU6nf,[en],2214.0,Tired of the lies? Tired of the spin? Are you ...,The Daily Wire,"[AD, AE, AG, AL, AM, AO, AR, AT, AU, AZ, BA, B..."
1,公視主題之夜SHOW,6Zu99rhYZpqpc3GliuccCe,[zh],101.0,《公視主題之夜SHOW》從電影出發的公共論壇實境秀節目，激發你對社會議題不同的看法，拋出思辨...,公視主題之夜SHOW,"[AD, AE, AG, AL, AM, AO, AR, AT, AU, AZ, BA, B..."
2,Shawn Ryan Show,5eodRZd3qR9VT1ip1wI7xQ,[en-US],131.0,"The ""Shawn Ryan Show"" is hosted by Shawn Ryan,...",Shawn Ryan | Cumulus Podcast Network,"[AD, AE, AG, AL, AM, AO, AR, AT, AU, AZ, BA, B..."
3,笑えるホラーshow!,6bMhRSp2qkd0FR06fCrOID,[ja-JP],7.0,ホラーで笑おう！ 恐怖に慄き、家族愛に涙し、超展開に爆笑する。 コワイ・グロいだけがホラー映...,笑えるホラーshow!,"[AD, AE, AG, AL, AM, AO, AR, AT, AU, AZ, BA, B..."
4,The Adam Friedland Show Podcast,5MfQNeGILinrJFkf2UHLVK,[en-US],375.0,"Ladies and Gentlemen, the Adam Friedland Show.",The Adam Friedland Show,"[AD, AE, AG, AL, AM, AO, AR, AT, AU, AZ, BA, B..."
...,...,...,...,...,...,...,...
945,Morning Announcements,4bBbDC2kEG6K8WAvzgSlkb,[en],737.0,Morning Announcements is a daily show brought ...,Betches Media,"[AD, AE, AG, AL, AM, AO, AR, AT, AU, AZ, BA, B..."
946,"BFFs with Dave Portnoy, Josh Richards, and Bri...",5pjCh71RVk9oE6DXagcKyK,[en-US],153.0,"The unlikely trio of Josh Richards, Dave Portn...",Barstool Sports,"[AD, AE, AG, AL, AM, AO, AR, AT, AU, AZ, BA, B..."
947,The Joe Santagato Show,2OvJBQIav3NZJhidJUGGFk,[en],28.0,Joe Santagato puts way too much effort into ex...,Joe Santagato,"[AD, AE, AG, AL, AM, AO, AR, AT, AU, AZ, BA, B..."
948,Todd N Tyler Radio Empire,7tHHhPrub7HkFdfqNCsgR2,[en],500.0,The Todd-n-Tyler Show is a talk radio show bro...,Todd n Tyler,"[AD, AE, AG, AL, AM, AO, AR, AT, AU, AZ, BA, B..."


In [90]:
# Export to csv file
# The target folder can be set with the SPOTIFY_EXPORT_DIR environment variable,
# so the path no longer has to be edited in the notebook for each local user;
# the previous hard-coded folder remains the fallback.
file_path = os.getenv('SPOTIFY_EXPORT_DIR', '/Users/blag/Documents/BPP/WebAutomation')
file_name = 'SpotifyPodcasts.csv'
csv_file_path = os.path.join(file_path, file_name)
# Semicolon-separated, without the index column.
showDF.to_csv(csv_file_path, index=False, sep=';')