In [1]:
import copy
import json
import os
import time
from urllib.parse import quote

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from sklearn.decomposition import PCA
from tqdm import tqdm
# Keep this AFTER `from tqdm import tqdm`: the notebook calls `tqdm.tqdm(...)`,
# so the name `tqdm` must end up bound to the module.
import tqdm

### Set up Spotify scraping functions

In [None]:
# The bearer token is a credential, so it is read from the environment instead
# of being stored in the notebook. Export SPOTIFY_TOKEN before starting the kernel.
_spotify_token = os.environ.get('SPOTIFY_TOKEN', '')
if not _spotify_token:
    print('WARNING: SPOTIFY_TOKEN is not set; Spotify API requests will be rejected.')

# HTTP headers sent with every Spotify Web API request made in this notebook.
header = {
    'Authorization': 'Bearer ' + _spotify_token,
    'Accept': 'application/json',
    'Content-Type': 'application/json'
}

def album_spotify_id(album_name, artist_name=None):
    """Return the Spotify id of the first album matching a search, or None.

    Parameters
    ----------
    album_name : str
        Album title to search for.
    artist_name : str, optional
        When given, the query uses the ``album:`` / ``artist:`` field filters;
        otherwise the album name alone is used as the query.

    Returns
    -------
    str or None
        Id of the first album in the search results, or None when the search
        returns no album.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status (for example an expired token
        or rate limiting).
    """
    if artist_name is None:
        query = quote(album_name) + '&type=album'
    else:
        query = 'album:' + quote(album_name) + '%20artist:' + quote(artist_name) + '&type=album'

    # A timeout keeps one stalled connection from hanging a long scraping run.
    r = requests.get('https://api.spotify.com/v1/search?q=' + query, headers=header, timeout=30)
    # Surface the real HTTP error instead of a KeyError on the missing 'albums' key.
    r.raise_for_status()

    # Decode the body once rather than once per access.
    items = r.json()['albums']['items']
    return items[0]['id'] if items else None

# Page size requested from the album-tracks endpoint (the API allows at most 50).
_TRACKS_PAGE_SIZE = 50
# Maximum number of ids the audio-features endpoint accepts in one request.
_AUDIO_FEATURES_MAX_IDS = 100
# Keys of an audio-features object that are not kept in the scraped data.
_DROPPED_FEATURE_KEYS = ('uri', 'track_href', 'analysis_url')


def album_tracklist(album_id):
    """Return ``(track_id, track_name)`` for every track of an album.

    The album-tracks endpoint is paginated (20 items per page unless a limit
    is given), so the ``next`` link of each page is followed until it is
    empty; otherwise longer albums would be silently truncated.

    Parameters
    ----------
    album_id : str
        Spotify album id.

    Returns
    -------
    list of (str, str)
        Track id and track name, in the order the API returns them.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    url = ('https://api.spotify.com/v1/albums/' + album_id
           + '/tracks?limit=' + str(_TRACKS_PAGE_SIZE))
    tracks = []
    while url:
        r = requests.get(url, headers=header, timeout=30)
        r.raise_for_status()
        page = r.json()
        tracks.extend((song['id'], song['name']) for song in page['items'])
        # `next` is the URL of the following page, or null on the last one.
        url = page.get('next')
    return tracks


def song_features(song_ids, song_names):
    """Fetch the audio features of several tracks and tag each with its name.

    Parameters
    ----------
    song_ids : list of str
        Spotify track ids.
    song_names : list of str
        Track names, in the same order as ``song_ids``.

    Returns
    -------
    list of dict
        One audio-features dict per track for which the API returned data,
        without the 'uri', 'track_href' and 'analysis_url' keys and with an
        added 'name' key. Empty when ``song_ids`` is empty.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    features = []
    # Send the ids in batches the endpoint accepts; an empty id list sends nothing.
    for start in range(0, len(song_ids), _AUDIO_FEATURES_MAX_IDS):
        stop = start + _AUDIO_FEATURES_MAX_IDS
        query = 'audio-features/?ids=' + ','.join(song_ids[start:stop])
        r = requests.get('https://api.spotify.com/v1/' + query, headers=header, timeout=30)
        r.raise_for_status()
        for name, track_features in zip(song_names[start:stop], r.json()['audio_features']):
            # The API puts null in place of tracks it has no features for.
            if track_features is None:
                continue
            entry = {k: v for k, v in track_features.items() if k not in _DROPPED_FEATURE_KEYS}
            entry['name'] = name
            features.append(entry)
    return features

def song_features_for_album(album_name, artist_name=None):
    """Look an album up on Spotify and return the audio features of its tracks.

    The album id comes from ``album_spotify_id``, its tracks from
    ``album_tracklist``, and the track ids and names are handed to
    ``song_features``. Returns None when no album id is found.
    """
    spotify_id = album_spotify_id(album_name, artist_name)
    if spotify_id is None:
        return None

    # Split the (id, name) pairs into two parallel lists.
    track_ids = []
    track_names = []
    for track in album_tracklist(spotify_id):
        track_ids.append(track[0])
        track_names.append(track[1])
    return song_features(track_ids, track_names)

### Load the names of the albums to scrape

In [None]:
# Read the list of albums to look up; the context manager closes the file
# as soon as it has been parsed.
with open('all_albums.json') as albums_file:
    all_albums = json.load(albums_file)
# Show one entry to check the record layout.
all_albums[39]

### Do the scraping

# Albums for which song features were found, each with an added 'songs' key.
all_albums_with_songs = []
# Albums that could not be matched on Spotify.
problems = []

for album in tqdm.tqdm(all_albums):
    # Ignore albums whose title starts with 'now' or 'totally hits'
    # (case-insensitive). `.lower()` has to be called: comparing the bound
    # method itself with a string is always False.
    title = album['album'].lower()
    if title.startswith(('now', 'totally hits')):
        continue

    # Throttle only when requests are about to be sent.
    time.sleep(0.25)

    songs = song_features_for_album(album['album'], album['artist'])

    # Fall back to a search on the album name alone.
    if songs is None:
        songs = song_features_for_album(album['album'])

    if songs is not None:
        # Copy so the entries of `all_albums` are left unmodified.
        album_with_songs = copy.deepcopy(album)
        album_with_songs['songs'] = songs
        all_albums_with_songs.append(album_with_songs)
    else:
        problems.append(album)
        # Running percentage of processed albums that could not be matched.
        print(str(round(len(problems) / (len(all_albums_with_songs) + len(problems)) * 100, 2)) + '%')

# Save songs in file.
with open('all_albums_with_song.json', 'w') as outfile:
    json.dump(all_albums_with_songs, outfile)
        
        

### Load songs information from json file

In [8]:
# Reload the scraped albums; the context manager closes the file after parsing.
with open('all_albums_with_song.json') as scraped_file:
    all_info = json.load(scraped_file)
# Show the first album to check the record layout.
all_info[0]

{'album': 'The Eminem Show',
 'artist': 'Eminem',
 'year': 2002,
 'rank': 1,
 'songs': [{'danceability': 0.396,
   'energy': 0.137,
   'key': 2,
   'loudness': -19.912,
   'mode': 1,
   'speechiness': 0.037,
   'acousticness': 0.0102,
   'instrumentalness': 0.966,
   'liveness': 0.0984,
   'valence': 0.0327,
   'tempo': 78.45,
   'type': 'audio_features',
   'id': '0ttF21hZCPw2merE3GsHcP',
   'duration_ms': 29960,
   'time_signature': 4,
   'name': 'Curtains Up - Skit'},
  {'danceability': 0.649,
   'energy': 0.613,
   'key': 1,
   'loudness': -5.571,
   'mode': 1,
   'speechiness': 0.196,
   'acousticness': 0.0141,
   'instrumentalness': 0,
   'liveness': 0.169,
   'valence': 0.165,
   'tempo': 75.018,
   'type': 'audio_features',
   'id': '15qrWkkYCfmtu1uRpGf48L',
   'duration_ms': 324569,
   'time_signature': 4,
   'name': 'White America'},
  {'danceability': 0.916,
   'energy': 0.817,
   'key': 7,
   'loudness': -4.487,
   'mode': 1,
   'speechiness': 0.221,
   'acousticness': 0.04

### Process file to create dataframe

# Information relative to the song.
columns_song = np.array(['name', 'danceability', 'energy', 'key', 'loudness', 'mode',
           'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'id', 'duration_ms'])

# Information relative to the album.
columns_album = np.array(['album', 'artist', 'year', 'rank'])

all_columns = np.append(columns_song, columns_album)

# Collect one plain dict per song and build the frame in a single call:
# growing a DataFrame row by row copies it on every step, and
# `DataFrame.append` no longer exists in pandas 2.x.
records = []
for album in tqdm.tqdm(all_info):
    # Album-level values are shared by every song of the album.
    album_values = {ca: album[ca] for ca in columns_album}
    for song in album['songs']:
        record = {cs: song[cs] for cs in columns_song}
        record.update(album_values)
        records.append(record)

# `columns=` fixes the column order and keeps the columns when there are no records.
songs_df = pd.DataFrame(records, columns=all_columns)

# Save results.
songs_df.to_pickle("songs_df.pkle")

### Load songs dataframe

In [2]:
# Reload the per-song table from the pickle file and preview its first rows.
all_songs = pd.read_pickle(filepath_or_buffer="songs_df.pkle")
all_songs.head(n=5)

Unnamed: 0,name,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,id,duration_ms,album,artist,year,rank
0,Curtains Up - Skit,0.396,0.137,2,-19.912,1,0.037,0.0102,0.966,0.0984,0.0327,78.45,0ttF21hZCPw2merE3GsHcP,29960,The Eminem Show,Eminem,2002,1
1,White America,0.649,0.613,1,-5.571,1,0.196,0.0141,0.0,0.169,0.165,75.018,15qrWkkYCfmtu1uRpGf48L,324569,The Eminem Show,Eminem,2002,1
2,Business,0.916,0.817,7,-4.487,1,0.221,0.0435,3e-06,0.111,0.527,96.426,0x60P5taxdI5pcGbqbap6S,251760,The Eminem Show,Eminem,2002,1
3,Cleanin' Out My Closet,0.911,0.743,9,-5.005,0,0.219,0.0835,0.0,0.106,0.878,148.012,7BMO7O7ImjV8HNTH74Tshv,297933,The Eminem Show,Eminem,2002,1
4,Square Dance,0.726,0.684,8,-1.802,1,0.262,0.0296,0.0,0.0894,0.678,82.054,27mgDrExPa3obPAYXd3yQg,324040,The Eminem Show,Eminem,2002,1


### Build PCA json for each year

In [17]:
# Maps each year (as a string) to a list of per-song dicts holding the 2-D PCA
# coordinates plus the song's name, artist, album and rank.
json_pca = {}

# Audio features fed to the PCA; identical for every year, so defined once.
col_to_analyze = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'valence', 'tempo']

for year in tqdm.tqdm(range(2002, 2018)):

    year_songs = all_songs[all_songs['year'] == year]

    # A 2-component PCA cannot be fitted on fewer than two songs.
    if len(year_songs) < 2:
        json_pca[str(year)] = []
        continue

    to_analyze = year_songs[col_to_analyze]

    pca = PCA(n_components=2)
    pca.fit(to_analyze)
    coords = pca.transform(to_analyze)

    # Pair coordinates and metadata by position. `year_songs` keeps its labels
    # from `all_songs`, so an index-aligned join with a freshly indexed
    # coordinate frame would mismatch rows and introduce NaNs.
    # `.tolist()` yields plain Python values that `json.dump` can serialise.
    json_pca[str(year)] = [
        {"name": name,
         "artist": artist,
         "x": pc_x,
         "y": pc_y,
         "album": album_name,
         "rank": rank}
        for name, artist, pc_x, pc_y, album_name, rank in zip(
            year_songs['name'].tolist(),
            year_songs['artist'].tolist(),
            coords[:, 0].tolist(),
            coords[:, 1].tolist(),
            year_songs['album'].tolist(),
            year_songs['rank'].tolist(),
        )
    ]


# Save json file; the context manager flushes and closes it.
with open("all_songs_pca.json", 'w') as pca_file:
    json.dump(json_pca, pca_file)

  0%|          | 0/16 [00:00<?, ?it/s]


NameError: name 'album' is not defined