In [0]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
import json
import requests
import warnings
warnings.filterwarnings('ignore')

from google.colab import files

In [0]:
## Get shared download links for all the Spotify playlist JSON slice files
## stored on Dropbox.
##
## The Dropbox access token is read from the DROPBOX_ACCESS_TOKEN environment
## variable, or prompted for interactively, instead of being hard-coded in the
## notebook. Any token that was previously committed in this cell must be
## treated as compromised and revoked.

import os
from getpass import getpass

urls = []

url = "https://api.dropboxapi.com/2/sharing/list_shared_links"

dropbox_token = os.environ.get("DROPBOX_ACCESS_TOKEN") or getpass("Dropbox access token: ")

headers = {
    "Authorization": "Bearer " + dropbox_token,
    "Content-Type": "application/json"
}

# NOTE(review): range(1, 100) requests the 99 files "mpd.slice.1000-1999.json"
# through "mpd.slice.99000-99999.json"; a slice starting at 0 is never
# requested -- confirm that is intended.
for i in range(1, 100):

  path = "mpd.slice." + str(i) + "000-" + str(i) + "999.json"

  data = {
      "path": "/data/" + path
  }

  # Let requests serialise the body (json=...) so this cell does not depend on
  # the `json` module name, which other cells in this notebook rebind.
  r = requests.post(url, headers=headers, json=data)
  # Fail loudly on an HTTP error instead of with a confusing KeyError below.
  r.raise_for_status()

  for x in r.json()['links']:
    if path == x['name']:
      # Swap the last four characters of the shared link for 'dl=1'
      # (presumably replacing a trailing 'dl=0' to get a direct download).
      uri = x['url'][:-4] + 'dl=1'
      urls.append(uri)


In [0]:
## Read the Spotify playlist data from the Dropbox download urls and collect
## the parsed JSON documents (one per slice file) in a list

playlist_jsons = [] # list of parsed JSON documents

# The loop variables are deliberately NOT called `json` / `url`: rebinding
# `json` would shadow the imported json module for the rest of the session.
for slice_url in urls:
  response = requests.get(slice_url)
  # Stop on a failed download rather than trying to parse an error page.
  response.raise_for_status()
  playlist_jsons.append(response.json())

In [0]:
# Convert the list of JSON slices into one playlists dataframe (one row per
# playlist, taken from the 'playlists' entry of every slice).

# json_normalize lives at pd.json_normalize in current pandas; older releases
# only expose it as pd.io.json.json_normalize, so fall back to that location.
_json_normalize = getattr(pd, "json_normalize", None) or pd.io.json.json_normalize

# Normalise every slice first and concatenate once: calling pd.concat inside
# the loop re-copies the accumulated frame on every iteration.
playlist_frames = [_json_normalize(slice_json['playlists']) for slice_json in playlist_jsons]

# pd.concat raises on an empty list, so keep the original "empty frame" result
# when no slices were downloaded.
playlists = pd.concat(playlist_frames) if playlist_frames else pd.DataFrame()


In [0]:
## Build a tracks data frame in which every track row also carries the pid of
## the playlist that contains it

track_lists = playlists['tracks']

# Repeat each playlist's pid once per track so the pid column lines up
# row-for-row with the flattened track records.
tracks_per_playlist = [len(track_list) for track_list in track_lists]
pids = pd.DataFrame({'pid': np.repeat(playlists['pid'].values, tracks_per_playlist)})

# Flatten the per-playlist track lists into one list of track records.
flat_tracks = [track for track_list in track_lists for track in track_list]
tracks = pd.concat([pids, pd.DataFrame(flat_tracks)], axis=1)

## Persist the combined tracks table as csv
tracks.to_csv('tracks.csv', index=False)


In [0]:
## Save tracks.csv to Google Drive. It was too big to download to the local
## drive, unlike the other csv files that are downloaded in the cells below.
## This cell relies on the Colab runtime (google.colab) and on tracks.csv
## having been written to the working directory by an earlier cell.

# Install the PyDrive client into the runtime (quiet, upgrade if present).
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
# This only needs to be done once in a notebook.
# The order matters: the Colab user is authenticated first, then the
# application-default credentials are attached to the GoogleAuth object that
# the GoogleDrive client is built from.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Create a Drive file titled 'tracks.csv', attach the local tracks.csv as its
# content, upload it, and print the id of the uploaded file.
uploaded = drive.CreateFile({'title': 'tracks.csv'})
uploaded.SetContentFile('tracks.csv')
uploaded.Upload()
print('Uploaded file with ID {}'.format(uploaded.get('id')))

Uploaded file with ID 149uP83JYPg0y90i72TL2Aq64Ks9FThEg


In [0]:
### Keep only the songs that appear in more than 5 distinct playlists. As we are only looking for large trends,
### dropping the rarely used songs frees up our data for faster processing. Getting track features from the
### Spotify API is also expensive, and this reduces the number of tracks we have to look up.
### NOTE(review): the original comment said "less than 5 playlists" but the filter is `count > 5`, i.e. songs in
### 5 or fewer playlists are removed. The threshold is left unchanged here -- confirm which one is intended.

# One row per (playlist, track) pair so a song repeated inside a playlist counts once.
pop_tracks = tracks[['pid','track_uri']].drop_duplicates()

# Name the key and count columns explicitly. The labels produced by a bare
# value_counts().reset_index() differ between pandas versions, which made the
# previous rename-based approach break on newer releases.
ats = (pop_tracks['track_uri']
       .value_counts()
       .rename_axis('track_uri')
       .reset_index(name='count'))
pop_tracks = ats[ats['count'] > 5]

# Inner merge keeps only the rows of `tracks` whose track_uri passed the filter.
tracks1 = pd.merge(tracks, pop_tracks[['track_uri']], on = ['track_uri'])

tracks1.to_csv('tracks_filtered.csv', index=True)
files.download('tracks_filtered.csv')

In [0]:
# Authentication for the Spotify API and the spotipy wrapper built on top of this api
!pip install spotipy

import spotipy 

client_id = '15cca5e7a05f4b6bbf07ea41fbfe29e1'
client_secret = '3b3af3159b004d6fa37ffa4d22ed3168'

body_params = {'grant_type' : 'client_credentials'}
url = 'https://accounts.spotify.com/api/token'
auth_data = requests.post(url, data=body_params, auth=(client_id, client_secret)).json()

sp = spotipy.Spotify(auth_data['access_token'])



In [0]:
## This code goes through all the unique tracks in our filtered tracks list and
## then hooks up to the spotify api and gets the audio features of each track
## (acousticness, danceability, energy, instrumentalness, key, loudness, tempo, valence).
## this only needs to be run once. The details are saved in track_features.csv

unique_tracks = tracks1['track_uri'].drop_duplicates().tolist()

limit = 100  # number of track uris sent per audio_features request

feature_columns = ['acousticness','danceability','energy', 'instrumentalness', 'key', 'loudness', 'tempo', 'valence']

details = {}
# Step through the list in chunks of `limit`. Stepping by start offset (rather
# than looping int(len / limit) times) also processes the final partial chunk,
# which was previously dropped.
for start in range(0, len(unique_tracks), limit):
  jsons = sp.audio_features(unique_tracks[start: start + limit])
  for track in jsons:
    # Entries can be None; those are skipped.
    if track is not None:
      details[track['uri']] = [track[column] for column in feature_columns]

# Rows of the intermediate frame are labelled with the feature names, so the
# transposed result has one row per track uri and named columns even when no
# features were returned at all.
df_tracks = pd.DataFrame(details, index=feature_columns).T
df_tracks.index.name = "track_uri"
df_tracks.to_csv('track_features.csv', index=True)
files.download('track_features.csv')

In [0]:
## This code goes through all the unique albums in our filtered tracks list and
## then hooks up to the spotify api and gets the release date, label, album type and popularity of the album.
## this only needs to be run once. The details are saved in albums.csv

unique_albums = tracks1['album_uri'].drop_duplicates().tolist()
limit = 20  # number of album uris sent per albums request

album_columns = ['release_date','label','album_type', 'popularity']

details = {}
# Step through the list in chunks of `limit`. Stepping by start offset (rather
# than looping int(len / limit) times) also processes the final partial chunk,
# which was previously dropped.
for start in range(0, len(unique_albums), limit):
  jsons = sp.albums(unique_albums[start: start + limit])
  for album in jsons['albums']:
    # Entries can be None; those are skipped.
    if album is not None:
      details[album['uri']] = [album[column] for column in album_columns]

# Rows of the intermediate frame are labelled with the field names, so the
# transposed result has one row per album uri and named columns even when no
# albums were returned at all.
df_albums = pd.DataFrame(details, index=album_columns).T
df_albums.index.name = "album_uri"
df_albums.to_csv('albums.csv', index=True)
files.download('albums.csv')

In [0]:
## This code goes through all the unique artists in our filtered tracks list and
## then hooks up to the spotify api and gets the first listed genre, follower count and popularity of the artist.
## this only needs to be run once. The details are saved in artists.csv

unique_artists = tracks1['artist_uri'].drop_duplicates().tolist()
limit = 50  # number of artist uris sent per artists request

artist_columns = ['genres','followers','popularity']

details = {}
# Step through the list in chunks of `limit`. Stepping by start offset (rather
# than looping int(len / limit) times) also processes the final partial chunk,
# which was previously dropped.
for start in range(0, len(unique_artists), limit):
  jsons = sp.artists(unique_artists[start: start + limit])
  for artist in jsons['artists']:
    # Entries can be None; those are skipped.
    if artist is not None:
      # Only the first genre is kept; artists without genres get an empty string.
      first_genre = artist["genres"][0] if artist['genres'] else ""
      details[artist['uri']] = [first_genre, artist['followers']["total"], artist['popularity']]

# Rows of the intermediate frame are labelled with the field names, so the
# transposed result has one row per artist uri and named columns even when no
# artists were returned at all.
df_artists = pd.DataFrame(details, index=artist_columns).T
df_artists.index.name = "artist_uri"
df_artists.to_csv('artists.csv', index=True)
files.download('artists.csv')

retrying ...2secs
retrying ...1secs
retrying ...2secs
retrying ...2secs
retrying ...1secs
