In [None]:
import copy
import json
import os
import re
import time
from urllib.parse import quote

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from sklearn.decomposition import PCA
from tqdm import tqdm
import tqdm  # must stay last: rebinds `tqdm` to the module, the cells call tqdm.tqdm(...)

### Set up Spotify scraping functions

In [None]:
# Spotify Web API bearer token, read from the SPOTIFY_TOKEN environment variable so
# that no credential is stored in the notebook. Export it before starting the kernel.
# NOTE(review): the token that used to be hard-coded here must be treated as leaked.
spotify_token = os.environ.get('SPOTIFY_TOKEN', '')
if not spotify_token:
    print('SPOTIFY_TOKEN is not set: Spotify API requests will be rejected')

# HTTP headers sent with every Spotify API request made by the functions below.
header = {
    'Authorization': 'Bearer ' + spotify_token,
    'Accept': 'application/json',
    'Content-Type': 'application/json'
}

def album_spotify_id(album_name, artist_name=None):
    """Search Spotify for an album and return the id of the first match.

    Parameters
    ----------
    album_name : str
        Album title to search for.
    artist_name : str, optional
        When given, the search is narrowed with the ``album:`` / ``artist:``
        field filters; otherwise the album title is used as a free-text query.

    Returns
    -------
    str or None
        The ``id`` of the first album in the search response, or ``None`` when
        the search returned no album.

    Raises
    ------
    Whatever ``Response.raise_for_status`` raises when the API answers with an
    HTTP error status (e.g. expired token, rate limiting).
    """
    if artist_name is None:
        query = quote(album_name) + '&type=album'
    else:
        query = 'album:' + quote(album_name) + '%20artist:' + quote(artist_name) + '&type=album'
    r = requests.request('get', 'https://api.spotify.com/v1/search?q=' + query, headers=header)

    # An error payload has no 'albums' key; fail with the HTTP status instead of
    # an unrelated KeyError further down.
    r.raise_for_status()

    # Parse the body once and reuse it.
    items = r.json()['albums']['items']
    return items[0]['id'] if items else None

def album_tracklist(album_id):
    """Return the tracks of a Spotify album as a list of ``(id, name)`` tuples.

    Parameters
    ----------
    album_id : str
        Spotify album id, e.g. as returned by ``album_spotify_id``.

    Returns
    -------
    list of (str, str)
        One ``(track id, track name)`` pair per item of the response, in
        response order.

    Raises
    ------
    Whatever ``Response.raise_for_status`` raises when the API answers with an
    HTTP error status.
    """
    # Without an explicit limit the endpoint returns only its default page of 20
    # tracks; 50 is the largest page it accepts. Albums longer than 50 tracks are
    # still cut at 50 because the 'next' page link is not followed.
    query = 'albums/' + album_id + '/tracks?limit=50'
    r = requests.request('get', 'https://api.spotify.com/v1/' + query, headers=header)

    # Surface HTTP errors directly rather than as a KeyError on 'items'.
    r.raise_for_status()

    return [(song['id'], song['name']) for song in r.json()['items']]

def song_features(song_ids, song_names):
    """Fetch the Spotify audio features of several tracks in one request.

    Parameters
    ----------
    song_ids : list of str
        Spotify track ids to query.
    song_names : list of str
        Track names, in the same order as ``song_ids``.

    Returns
    -------
    list of dict
        One dict per track for which the API returned features: the audio
        features without the ``uri``, ``track_href`` and ``analysis_url``
        entries, plus a ``name`` entry taken from ``song_names``. Tracks for
        which the API returned ``null`` are left out. An empty ``song_ids``
        yields an empty list without contacting the API.

    Raises
    ------
    Whatever ``Response.raise_for_status`` raises when the API answers with an
    HTTP error status.
    """
    # Nothing to look up: avoid sending a request with an empty id list.
    if not song_ids:
        return []

    query = 'audio-features/?ids=' + ','.join(song_ids)
    r = requests.request('get', 'https://api.spotify.com/v1/' + query, headers=header)

    # Surface HTTP errors directly rather than as a KeyError on 'audio_features'.
    r.raise_for_status()

    dropped_keys = ('uri', 'track_href', 'analysis_url')
    features = []
    for name, raw in zip(song_names, r.json()['audio_features']):
        # The response keeps the position of every requested id and holds null
        # where no features are available; such tracks are skipped.
        if raw is None:
            continue
        song = {k: v for k, v in raw.items() if k not in dropped_keys}
        song['name'] = name
        features.append(song)
    return features

def song_features_for_album(album_name, artist_name=None):
    """Return the audio features of every track of an album found by name.

    The album is resolved with ``album_spotify_id``, its tracks are listed with
    ``album_tracklist`` and their features fetched with ``song_features``.

    Returns the value of ``song_features`` for the album's tracks, or ``None``
    when ``album_spotify_id`` finds no album.
    """
    found_id = album_spotify_id(album_name, artist_name)
    if found_id is not None:
        # Split the (id, name) pairs into the two parallel lists song_features expects.
        track_ids, track_names = [], []
        for track in album_tracklist(found_id):
            track_ids.append(track[0])
            track_names.append(track[1])
        return song_features(track_ids, track_names)
    return None

### Load the names of the albums to scrape

In [None]:
# Read the list of albums to look up. Later cells read the 'album', 'artist',
# 'year', 'rank' and 'img_src' keys of each entry.
# The context manager closes the file once it has been parsed.
with open('all_albums.json') as albums_file:
    all_albums = json.load(albums_file)

# Show one entry as a sample of the record layout.
all_albums[39]

### Do the scraping

# Albums for which song features were found (each a copy of the input album
# with an added 'songs' entry) and albums for which the lookup failed.
all_albums_with_songs = []
problems = []

for album in tqdm.tqdm(all_albums):

    #ignore bad albums: titles starting with "now" or "totally hits"
    #(presumably the compilation series of those names - confirm)
    title = album['album'].lower()
    if title.startswith(('now', 'totally hits')):
        continue

    #pause between albums to throttle API calls; skipped albums make no call
    time.sleep(0.25)

    songs = song_features_for_album(album['album'], album['artist'])

    #retry on the album title alone when the album + artist search finds nothing
    if songs is None:
        songs = song_features_for_album(album['album'])

    #store song info if not null
    if songs is not None:
        album_with_songs = copy.deepcopy(album)
        album_with_songs['songs'] = songs
        all_albums_with_songs.append(album_with_songs)

    else:
        problems.append(album)
        #running share of albums that could not be resolved so far
        failure_rate = len(problems) / (len(all_albums_with_songs) + len(problems)) * 100
        print(str(round(failure_rate, 2)) + '%')

#save songs in file
with open('all_albums_with_song.json', 'w') as outfile:
    json.dump(all_albums_with_songs, outfile)
        
        

### Load songs information from json file

In [None]:
# Reload the scraped albums (each with its 'songs' list) written by the scraping
# step. The context manager closes the file once it has been parsed.
with open('all_albums_with_song.json') as scraped_file:
    all_info = json.load(scraped_file)

# Show the first album as a sample of the record layout.
all_info[0]

### Process file to create dataframe

#information relative to song
columns_song = np.array(['name', 'danceability', 'energy', 'key', 'loudness', 'mode', 
           'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'id', 'duration_ms'])

#information relative to album
columns_album = np.array(['album', 'artist', 'year', 'rank', 'img_src'])

all_columns = np.append(columns_song, columns_album)

#collect one flat record per song (song fields followed by the fields of its album)
#and build the dataframe in a single call: growing a dataframe row by row copies it
#on every step, and DataFrame.append no longer exists in pandas 2.x
song_records = []
for album in tqdm.tqdm(all_info) :
    for song in album['songs'] :

        dict_values = {cs: song[cs] for cs in columns_song}
        dict_values.update((ca, album[ca]) for ca in columns_album)

        song_records.append(dict_values)

#passing the columns keeps their order (and the headers when there are no songs)
songs_df = pd.DataFrame(song_records, columns=all_columns)

#save results
songs_df.to_pickle("songs_df.pkle") 

### Load songs dataframe

In [None]:
# Reload the per-song dataframe saved by the previous step.
# NOTE(review): read_pickle can execute code embedded in the file; only load
# pickles produced by this notebook.
all_songs = pd.read_pickle("songs_df.pkle")

# Preview the first rows whose year is 2003.
all_songs.loc[all_songs["year"] == 2003].head()

### Build PCA json for each year

In [None]:
# Maps each year (as a string) to the list of its songs, each with its position
# on the first two principal components of that year's audio features.
json_pca = {}

#put this in PCA (same columns for every year, so defined once)
# NOTE(review): the columns are fed to the PCA unscaled; if their ranges differ a lot
# the wide-ranged ones will dominate the components - confirm this is intended.
col_to_analyze = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'valence', 'tempo']

for y in tqdm.tqdm(range(2002, 2018)) :

    year_songs = all_songs[all_songs['year'] == y]

    #a 2-component PCA cannot be fitted on fewer than two songs
    if len(year_songs) < 2 :
        print("not enough songs for " + str(y) + ", no PCA computed")
        json_pca[str(y)] = []
        continue

    to_analyze = year_songs[col_to_analyze]

    pca = PCA(n_components=2)
    pca.fit(to_analyze)

    #compute PCA
    result = pd.DataFrame(pca.transform(to_analyze), columns=["x", "y"])

    #align the song metadata with the coordinates by position; drop=True keeps the
    #old index labels from being added as extra columns
    result = pd.concat([result, year_songs[['artist', 'name', 'rank', 'album']].reset_index(drop=True)], axis = 1)

    #report rows with missing values
    incomplete = result[result.isnull().any(axis=1)]
    if len(incomplete) != 0 :
        print(incomplete.head())

    #itertuples yields plain Python scalars, which json.dump can serialize
    all_songs_year = [
        {"name" : song.name,
         "artist" : song.artist,
         "x" : song.x,
         "y" : song.y,
         "album" : song.album,
         "rank" : song.rank}
        for song in result.itertuples(index=False)
    ]

    json_pca[str(y)] = all_songs_year


#save json file (the context manager flushes and closes it)
with open("all_songs_pca.json", 'w') as outfile:
    json.dump(json_pca, outfile)