<a href="https://colab.research.google.com/github/tommydb26/datasci112-playlist-generator/blob/main/Final_Project_Data_Collection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Gaining Access to the API

Spotify's API requires that a user get authorization with OAuth, which requires that they disclose their app to Spotify in order to retrieve a client_id and client_secret. Further, you have to specify the scope of endpoints which you will be using. By doing this you can receive an access token, which must then be provided as a header to all of your requests, so I saved that in a variable.

In [1]:
import os
import time
from getpass import getpass

import pandas as pd
import requests

In [2]:
# Root of the Spotify Web API. Endpoint paths are appended directly to this
# string in the requests below, so the trailing slash is required.
BASE_URL = 'https://api.spotify.com/v1/'

In [3]:
# Spotify app credentials are read from the environment (falling back to a
# hidden prompt) so they are never stored in the notebook or its history.
# The values that were previously committed here should be rotated in the
# Spotify developer dashboard.
client_id = os.environ.get("SPOTIFY_CLIENT_ID") or getpass("Spotify client ID: ")
client_secret = os.environ.get("SPOTIFY_CLIENT_SECRET") or getpass("Spotify client secret: ")

# Sent along with the token request in the next cell.
# NOTE(review): scopes normally apply to user-authorization flows; confirm
# whether the client-credentials grant used below honours this field.
scope = "user-top-read user-library-read playlist-read-collaborative"

In [4]:
AUTH_URL = 'https://accounts.spotify.com/api/token'

# Exchange the app's id/secret for a bearer token (client-credentials grant).
auth_response = requests.post(
    AUTH_URL,
    data={
        'grant_type': 'client_credentials',
        'client_id': client_id,
        'client_secret': client_secret,
        "scope": scope
    },
    timeout=30,
)
# Fail loudly on rejected credentials instead of a KeyError on "access_token".
auth_response.raise_for_status()

# The token is deliberately not echoed: cell outputs are saved with the
# notebook and would leak it to anyone the notebook is shared with.
access_token = auth_response.json()["access_token"]

In [5]:
# Bearer-token header passed to every Web API request made below.
headers = dict(Authorization=f'Bearer {access_token}')

# Collecting All Liked Songs

For the basis of my playlists, I decided to use my liked songs on Spotify. I am such an ardent user of this feature — if there's a song I am even remotely interested in, I like it. So, I figured this would be a good pool to pull from for my playlists.

Spotify only allows you to access 50 tracks at a time from a collection of songs. So, I created a DataFrame for the first 50 tracks, then I repeatedly concatenated to it a DataFrame of the next 50. In the first request, I included in the fields the total number of tracks in the playlist, which I would use to control the iteration.

In [6]:
# First page of the playlist. `fields` trims the payload to the track
# attributes used below and also requests `total`, which bounds the
# pagination loop later on.
response = requests.get(f"{BASE_URL}playlists/2uvsRnwp5NdXJGnFzkaFbS/tracks?fields=total%2C%20items(track(artists.name%2C%20popularity%2C%20explicit%2C%20id%2C%20name))&limit=50&offset=0",
                        headers=headers, timeout=30)
# Surface auth/HTTP errors here rather than as a KeyError on "total".
response.raise_for_status()

data = response.json()
total = data["total"]
data

{'items': [{'track': {'artists': [{'name': 'Miley Cyrus'}],
    'explicit': False,
    'id': '6FAsJDkg4y5JSWP9ZMKlDj',
    'name': 'You',
    'popularity': 77}},
  {'track': {'artists': [{'name': 'Charles Givings'}],
    'explicit': False,
    'id': '5lnLxpRxCDlqedyWgkBLOh',
    'name': 'Easy (LIke Sunday Morning)',
    'popularity': 47}},
  {'track': {'artists': [{'name': 'Bread'}],
    'explicit': False,
    'id': '7IX2e7pEShera9T1QIMvi7',
    'name': "Baby I'm-a Want You",
    'popularity': 64}},
  {'track': {'artists': [{'name': 'Bread'}],
    'explicit': False,
    'id': '52VIdyKqp1pJRSyUQaxKUA',
    'name': 'Everything I Own',
    'popularity': 70}},
  {'track': {'artists': [{'name': 'Cigarettes After Sex'}],
    'explicit': False,
    'id': '3AVrVz5rK8Hrqo9YGiVGN5',
    'name': 'Apocalypse',
    'popularity': 81}},
  {'track': {'artists': [{'name': 'Maggie Rogers'}],
    'explicit': False,
    'id': '3rST7c7KrUNLAmWGb8ZKUI',
    'name': 'Alaska - Acoustic',
    'popularity': 41}

In [7]:
# Flatten the first page of results into one row per playlist item.
df_songs = pd.json_normalize(data, record_path="items")
df_songs

Unnamed: 0,track.artists,track.explicit,track.id,track.name,track.popularity
0,[{'name': 'Miley Cyrus'}],False,6FAsJDkg4y5JSWP9ZMKlDj,You,77
1,[{'name': 'Charles Givings'}],False,5lnLxpRxCDlqedyWgkBLOh,Easy (LIke Sunday Morning),47
2,[{'name': 'Bread'}],False,7IX2e7pEShera9T1QIMvi7,Baby I'm-a Want You,64
3,[{'name': 'Bread'}],False,52VIdyKqp1pJRSyUQaxKUA,Everything I Own,70
4,[{'name': 'Cigarettes After Sex'}],False,3AVrVz5rK8Hrqo9YGiVGN5,Apocalypse,81
5,[{'name': 'Maggie Rogers'}],False,3rST7c7KrUNLAmWGb8ZKUI,Alaska - Acoustic,41
6,[{'name': 'The Dip'}],False,0lEWIegMNMQ7W1ooB1zWT2,Sure Don't Miss You,59
7,[{'name': 'Eloise'}],False,3yAlkOEMjk2rH2hZTgvfoH,Left Side,55
8,[{'name': 'Paul Simon'}],False,2qQrVJQ7zXAyvqX1IkwsS2,Take Me to the Mardi Gras,38
9,[{'name': 'Beyoncé'}],True,4DByEumlGTZKSzuVEZ35eo,PURE/HONEY,70


In [8]:
# Fetch the remaining pages and build the full song table in one go.
# - range() stops before `total`, so no extra (empty) request is made when the
#   playlist length is an exact multiple of the page size.
# - Pages are collected in a list and concatenated once instead of growing the
#   frame inside the loop.
# - The first page is rebuilt from `data`, so re-running this cell does not
#   append the same rows a second time.
PAGE_SIZE = 50
pages = [pd.json_normalize(data["items"])]
for offset in range(PAGE_SIZE, total, PAGE_SIZE):
    response = requests.get(
        f"{BASE_URL}playlists/2uvsRnwp5NdXJGnFzkaFbS/tracks?fields=items(track(artists.name%2C%20popularity%2C%20explicit%2C%20id%2C%20name))&limit={PAGE_SIZE}&offset={offset}",
        headers=headers,
        timeout=30,
    )
    # Stop on HTTP errors rather than hitting a KeyError on "items".
    response.raise_for_status()
    pages.append(pd.json_normalize(response.json()["items"]))

    # pause between requests to stay friendly to the API
    time.sleep(0.5)

df_songs = pd.concat(pages, ignore_index=True)

In [9]:
# Keep the track names aside: they are re-attached later as the row index
# of the merged frame.
indices = df_songs.loc[:, "track.name"]

In [10]:
# Index the songs by their title from here on.
df_songs = df_songs.set_index("track.name")

I then did some brief cleaning on the artists column; although I will not be training on this data, it will be important that it is readable for displaying playlists.

In [11]:
def _format_artists(artists):
    """Turn a list of ``{'name': ...}`` dicts into a comma-separated string.

    Each name is rendered with ``repr`` so the result matches the quoted form
    the column has always had (e.g. ``'Bread'``). Values that are not lists
    are returned untouched, which makes re-running this cell a no-op instead
    of trimming characters off names that were already cleaned.
    """
    if not isinstance(artists, list):
        return artists
    return ", ".join(repr(artist["name"]) for artist in artists)


df_songs["track.artists"] = df_songs["track.artists"].map(_format_artists)
df_songs

Unnamed: 0_level_0,track.artists,track.explicit,track.id,track.popularity
track.name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
You,'Miley Cyrus',False,6FAsJDkg4y5JSWP9ZMKlDj,77
Easy (LIke Sunday Morning),'Charles Givings',False,5lnLxpRxCDlqedyWgkBLOh,47
Baby I'm-a Want You,'Bread',False,7IX2e7pEShera9T1QIMvi7,64
Everything I Own,'Bread',False,52VIdyKqp1pJRSyUQaxKUA,70
Apocalypse,'Cigarettes After Sex',False,3AVrVz5rK8Hrqo9YGiVGN5,81
...,...,...,...,...
Godspeed,'Frank Ocean',False,34xTFwjPQ1dC6uJmleno7x,72
Writer In The Dark,'Lorde',False,193Dm5SqYy3hTSbuzxbwKc,65
Mood Ring,'Lorde',False,6FE9EXi8TYg09hR4xv5PWJ,0
Bored,'Billie Eilish',False,04sN26COy28wTXYj3dMoiZ,81


Lastly, I just renamed the columns to simpler representations of the data.

In [12]:
# Strip the "track." prefix that flattening the JSON left on the column names.
simpler_names = {
    "track.artists": "artists",
    "track.explicit": "explicit",
    "track.id": "id",
    "track.name": "name",
    "track.popularity": "popularity",
}
df_songs = df_songs.rename(columns=simpler_names)

I will be drawing from two main data sets that Spotify maintains — audio features and audio analysis. 

Audio features is much more surface level, and it includes objective traits like duration and loudness, but also some more subjective-seeming ones like danceability or valence—which is the "positivity" of a song. 

Audio analysis is much more technical, and what I am interested in from it is the "timbre" vector. This takes timbre—the texture or color of a musical sound—and quantifies it as a vector of different qualities.

Even worse than extracting the songs from my liked songs, when collecting the data on each song the API can only process one song per request. So I began by starting a DataFrame for each endpoint with the first song. Then I looped through the rest of the songs, requesting from each endpoint and concatenating the results to the respective DataFrame.

In [13]:
# Seed the features frame with the first song. Selecting it by position keeps
# this cell independent of any particular title being present (and unique) in
# the index, and matches the loop below, which continues from the second row.
track_id = df_songs["id"].iloc[0]

response = requests.get(f"{BASE_URL}audio-features/{track_id}",
                        headers=headers, timeout=30)
# Stop on HTTP errors instead of storing an error payload as a data row.
response.raise_for_status()
df_features = pd.json_normalize(response.json())

In [14]:
# Seed the timbre frame with the first song, selected by position so the cell
# does not depend on a particular title being present and unique in the index.
track_id = df_songs["id"].iloc[0]
response = requests.get(f"{BASE_URL}audio-analysis/{track_id}",
                        headers=headers, timeout=30)
# Stop on HTTP errors instead of failing later on a missing "segments" key.
response.raise_for_status()

# Only the timbre is wanted from the analysis: take the 12-value timbre vector
# of every segment and average each component over the whole track.
df_analysis = pd.json_normalize(response.json(), "segments")
df_timbres_throughout = pd.DataFrame(df_analysis["timbre"].to_list(), columns=range(1, 13))
df_timbres = df_timbres_throughout.mean().to_frame().transpose()

# The averaged row no longer carries the track id, so add it back by hand.
df_timbres["id"] = track_id
df_timbres

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,id
0,48.146507,6.755951,11.539323,-12.510097,23.383751,-16.174316,8.328809,-2.556505,-4.282422,-3.378086,-10.891944,2.58411,6FAsJDkg4y5JSWP9ZMKlDj


In [15]:
def _mean_timbre(analysis, track_id):
    """Collapse one audio-analysis payload into a single row of mean timbre values.

    `analysis` is the decoded JSON of an audio-analysis response; the row holds
    the per-component mean of every segment's timbre vector (columns 1-12)
    plus the track's `id`.
    """
    segment_timbres = pd.DataFrame(
        [segment["timbre"] for segment in analysis["segments"]],
        columns=range(1, 13),
    )
    row = segment_timbres.mean().to_frame().transpose()
    row["id"] = track_id
    return row


# Per-track results are gathered in lists and concatenated once at the end
# rather than growing both frames inside the loop.
feature_frames = []
timbre_frames = []
try:
    # The first song already seeded both frames, so start from the second row.
    for track_id in df_songs["id"].iloc[1:]:
        # add the audio features to the features df
        response = requests.get(f"{BASE_URL}audio-features/{track_id}",
                                headers=headers, timeout=30)
        response.raise_for_status()
        feature_frames.append(pd.json_normalize(response.json()))

        # rest to obey API principles
        time.sleep(0.25)

        # add the audio analysis to the timbres df
        response = requests.get(f"{BASE_URL}audio-analysis/{track_id}",
                                headers=headers, timeout=30)
        response.raise_for_status()
        timbre_frames.append(_mean_timbre(response.json(), track_id))

        # rest again
        time.sleep(0.25)
finally:
    # Keep whatever was fetched even if a request fails partway through.
    df_features = pd.concat([df_features, *feature_frames])
    df_timbres = pd.concat([df_timbres, *timbre_frames])

The cleaning for the features data came after; I figured it would be more efficient to drop the unwanted columns once rather than with every request. 

In [16]:
# Bookkeeping columns from the audio-features payload that are not needed downstream.
unwanted_columns = ["type", "uri", "track_href", "analysis_url"]
df_features = df_features.drop(columns=unwanted_columns)
df_features

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,id,duration_ms,time_signature
0,0.625,0.5680,11,-5.663,1,0.0324,0.2880,0.000000,0.6740,0.3480,85.545,6FAsJDkg4y5JSWP9ZMKlDj,179427,3
0,0.584,0.4350,10,-11.736,0,0.0263,0.3010,0.007340,0.1260,0.3990,132.439,5lnLxpRxCDlqedyWgkBLOh,266320,4
0,0.613,0.2890,8,-16.529,1,0.0305,0.7430,0.000002,0.0617,0.5530,144.188,7IX2e7pEShera9T1QIMvi7,150707,4
0,0.365,0.3380,2,-13.406,1,0.0322,0.7350,0.000000,0.1040,0.4460,79.241,52VIdyKqp1pJRSyUQaxKUA,187000,4
0,0.369,0.4670,5,-9.018,1,0.0274,0.0194,0.460000,0.1090,0.1740,94.473,3AVrVz5rK8Hrqo9YGiVGN5,290147,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,0.399,0.0969,6,-12.578,1,0.0509,0.9290,0.000168,0.1210,0.0758,109.540,34xTFwjPQ1dC6uJmleno7x,177922,4
0,0.325,0.2710,7,-8.069,1,0.0418,0.7300,0.000000,0.3010,0.2790,71.855,193Dm5SqYy3hTSbuzxbwKc,216611,4
0,0.709,0.4590,5,-11.193,1,0.0654,0.4320,0.000006,0.2300,0.3770,95.014,6FE9EXi8TYg09hR4xv5PWJ,225863,4
0,0.614,0.3180,7,-12.695,1,0.0478,0.8960,0.002390,0.0795,0.1120,119.959,04sN26COy28wTXYj3dMoiZ,180933,4


In [17]:
# Display the accumulated timbre table: one row of mean timbre values per track id.
df_timbres

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,id
0,48.146507,6.755951,11.539323,-12.510097,23.383751,-16.174316,8.328809,-2.556505,-4.282422,-3.378086,-10.891944,2.584110,6FAsJDkg4y5JSWP9ZMKlDj
0,44.260628,19.184289,18.214885,-0.140513,24.040165,-24.511405,-3.840630,-6.885928,-2.012070,-1.377134,-7.607562,1.503317,5lnLxpRxCDlqedyWgkBLOh
0,37.714839,-16.496544,-12.316468,-12.847976,25.861839,-19.737157,-10.335721,-1.827133,-7.192570,-1.878703,-7.335082,-1.149978,7IX2e7pEShera9T1QIMvi7
0,40.059674,-13.343394,2.044856,-5.262738,31.014177,-18.898640,-13.315408,-2.899277,-7.657300,-0.368305,-6.512232,-6.191590,52VIdyKqp1pJRSyUQaxKUA
0,47.540592,-12.982581,-16.650313,-22.373093,-1.733394,-29.764263,7.681281,-1.659771,-2.585452,-2.547789,-8.395570,-1.803018,3AVrVz5rK8Hrqo9YGiVGN5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,36.813218,-69.412050,5.840797,-22.330447,25.515002,-17.792400,-15.280099,1.405760,-9.261899,-9.622248,-5.609669,2.967544,34xTFwjPQ1dC6uJmleno7x
0,46.052878,3.880775,20.611754,-9.098635,29.450032,-13.189979,-17.050438,-2.646305,-14.314003,-4.656721,-10.477538,-4.439217,193Dm5SqYy3hTSbuzxbwKc
0,41.471024,1.550384,-33.694737,-19.618376,30.164256,-24.133622,-19.055563,-6.317596,-6.653428,-2.188997,-8.915563,-1.431539,6FE9EXi8TYg09hR4xv5PWJ
0,40.009522,-68.123945,-38.775284,-26.824530,42.881618,-28.542628,-12.061354,-0.955191,-2.552814,-1.145596,-9.702547,3.082146,04sN26COy28wTXYj3dMoiZ


Spotify uniquely identifies songs by something called a track_id, which made it very easy to join my three DataFrames into one with id as a primary key in all of them. The default inner merge was sufficient since all three DataFrames share the same set of track ids (the merged result below has 1,492 rows).

In [18]:
# Join songs -> audio features -> timbres on the shared track id
# (pandas' default inner join).
df_tracks_intermediate = pd.merge(df_songs, df_features, on="id")
df_tracks = pd.merge(df_tracks_intermediate, df_timbres, on="id")
df_tracks

Unnamed: 0,artists,explicit,id,popularity,danceability,energy,key,loudness,mode,speechiness,...,3,4,5,6,7,8,9,10,11,12
0,'Miley Cyrus',False,6FAsJDkg4y5JSWP9ZMKlDj,77,0.625,0.5680,11,-5.663,1,0.0324,...,11.539323,-12.510097,23.383751,-16.174316,8.328809,-2.556505,-4.282422,-3.378086,-10.891944,2.584110
1,'Charles Givings',False,5lnLxpRxCDlqedyWgkBLOh,47,0.584,0.4350,10,-11.736,0,0.0263,...,18.214885,-0.140513,24.040165,-24.511405,-3.840630,-6.885928,-2.012070,-1.377134,-7.607562,1.503317
2,'Bread',False,7IX2e7pEShera9T1QIMvi7,64,0.613,0.2890,8,-16.529,1,0.0305,...,-12.316468,-12.847976,25.861839,-19.737157,-10.335721,-1.827133,-7.192570,-1.878703,-7.335082,-1.149978
3,'Bread',False,52VIdyKqp1pJRSyUQaxKUA,70,0.365,0.3380,2,-13.406,1,0.0322,...,2.044856,-5.262738,31.014177,-18.898640,-13.315408,-2.899277,-7.657300,-0.368305,-6.512232,-6.191590
4,'Cigarettes After Sex',False,3AVrVz5rK8Hrqo9YGiVGN5,81,0.369,0.4670,5,-9.018,1,0.0274,...,-16.650313,-22.373093,-1.733394,-29.764263,7.681281,-1.659771,-2.585452,-2.547789,-8.395570,-1.803018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1488,'Frank Ocean',False,34xTFwjPQ1dC6uJmleno7x,72,0.399,0.0969,6,-12.578,1,0.0509,...,5.840797,-22.330447,25.515002,-17.792400,-15.280099,1.405760,-9.261899,-9.622248,-5.609669,2.967544
1489,'Lorde',False,193Dm5SqYy3hTSbuzxbwKc,65,0.325,0.2710,7,-8.069,1,0.0418,...,20.611754,-9.098635,29.450032,-13.189979,-17.050438,-2.646305,-14.314003,-4.656721,-10.477538,-4.439217
1490,'Lorde',False,6FE9EXi8TYg09hR4xv5PWJ,0,0.709,0.4590,5,-11.193,1,0.0654,...,-33.694737,-19.618376,30.164256,-24.133622,-19.055563,-6.317596,-6.653428,-2.188997,-8.915563,-1.431539
1491,'Billie Eilish',False,04sN26COy28wTXYj3dMoiZ,81,0.614,0.3180,7,-12.695,1,0.0478,...,-38.775284,-26.824530,42.881618,-28.542628,-12.061354,-0.955191,-2.552814,-1.145596,-9.702547,3.082146


Lastly, I changed the indices to be the track name because each observation is a song.

In [19]:
# Look each row's title up through its track id instead of assuming that a
# separately saved Series of names still lines up row-for-row with the merge
# result (a merge can change the number of rows).
# df_songs is indexed by track name and carries the id column, so it provides
# the id -> name mapping directly.
name_by_id = dict(zip(df_songs["id"], df_songs.index))
track_names = df_tracks["id"].map(name_by_id).rename("track.name")
df_tracks = df_tracks.set_index(track_names)
df_tracks

Unnamed: 0_level_0,artists,explicit,id,popularity,danceability,energy,key,loudness,mode,speechiness,...,3,4,5,6,7,8,9,10,11,12
track.name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
You,'Miley Cyrus',False,6FAsJDkg4y5JSWP9ZMKlDj,77,0.625,0.5680,11,-5.663,1,0.0324,...,11.539323,-12.510097,23.383751,-16.174316,8.328809,-2.556505,-4.282422,-3.378086,-10.891944,2.584110
Easy (LIke Sunday Morning),'Charles Givings',False,5lnLxpRxCDlqedyWgkBLOh,47,0.584,0.4350,10,-11.736,0,0.0263,...,18.214885,-0.140513,24.040165,-24.511405,-3.840630,-6.885928,-2.012070,-1.377134,-7.607562,1.503317
Baby I'm-a Want You,'Bread',False,7IX2e7pEShera9T1QIMvi7,64,0.613,0.2890,8,-16.529,1,0.0305,...,-12.316468,-12.847976,25.861839,-19.737157,-10.335721,-1.827133,-7.192570,-1.878703,-7.335082,-1.149978
Everything I Own,'Bread',False,52VIdyKqp1pJRSyUQaxKUA,70,0.365,0.3380,2,-13.406,1,0.0322,...,2.044856,-5.262738,31.014177,-18.898640,-13.315408,-2.899277,-7.657300,-0.368305,-6.512232,-6.191590
Apocalypse,'Cigarettes After Sex',False,3AVrVz5rK8Hrqo9YGiVGN5,81,0.369,0.4670,5,-9.018,1,0.0274,...,-16.650313,-22.373093,-1.733394,-29.764263,7.681281,-1.659771,-2.585452,-2.547789,-8.395570,-1.803018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Godspeed,'Frank Ocean',False,34xTFwjPQ1dC6uJmleno7x,72,0.399,0.0969,6,-12.578,1,0.0509,...,5.840797,-22.330447,25.515002,-17.792400,-15.280099,1.405760,-9.261899,-9.622248,-5.609669,2.967544
Writer In The Dark,'Lorde',False,193Dm5SqYy3hTSbuzxbwKc,65,0.325,0.2710,7,-8.069,1,0.0418,...,20.611754,-9.098635,29.450032,-13.189979,-17.050438,-2.646305,-14.314003,-4.656721,-10.477538,-4.439217
Mood Ring,'Lorde',False,6FE9EXi8TYg09hR4xv5PWJ,0,0.709,0.4590,5,-11.193,1,0.0654,...,-33.694737,-19.618376,30.164256,-24.133622,-19.055563,-6.317596,-6.653428,-2.188997,-8.915563,-1.431539
Bored,'Billie Eilish',False,04sN26COy28wTXYj3dMoiZ,81,0.614,0.3180,7,-12.695,1,0.0478,...,-38.775284,-26.824530,42.881618,-28.542628,-12.061354,-0.955191,-2.552814,-1.145596,-9.702547,3.082146


Finally, I downloaded the data as a csv, so I could explore it in another colab!

In [20]:
from google.colab import files

# Write the merged table, keeping the track-name index as a column of the CSV,
# then hand the file to the browser as a download.
csv_name = 'tracks.csv'
df_tracks.to_csv(csv_name, index=True)
files.download(csv_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>