# Spotify Funk Recommender - Data Collection
Gather features from various Spotify playlists and associated tracks and compile into a file. This data collection is intended for use in building a song recommender based on a playlist and other similar tracks.


## Imports

In [1]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd
import numpy as np

# Spotify Credentials
import spot_creds

# Graphing
import matplotlib.pyplot as plt

# Scikit-Learn
from sklearn.feature_extraction.text import TfidfVectorizer

clid = spot_creds.client_id
secret = spot_creds.secret

#Authentication - without user
client_credentials_manager = SpotifyClientCredentials(client_id=clid, client_secret=secret)
sp = spotipy.Spotify(client_credentials_manager = client_credentials_manager)

## Gather Data
First, find a number of Spotify funk music playlists.

* Old School Funk: https://open.spotify.com/playlist/37i9dQZF1EIfqkfSDVB2GV
* All Funked Up: https://open.spotify.com/playlist/37i9dQZF1DX4WgZiuR77Ef
* Funky Jams: https://open.spotify.com/playlist/37i9dQZF1DX6drTZKzZwSo
* Crisp: https://open.spotify.com/playlist/37i9dQZF1DXdb5FEvfgsH9
* Instrumental Funk: https://open.spotify.com/playlist/37i9dQZF1DX8f5qTGj8FYl
* Future Funk: https://open.spotify.com/playlist/37i9dQZF1DXbjGYBfEmjR5
* Nu Funk: https://open.spotify.com/playlist/37i9dQZF1DWZgauS5j6pMv
* I Love My 10s Funk: https://open.spotify.com/playlist/37i9dQZF1DX2zVOKolQSzi

In [None]:
playlist_links = {'Old School Funk':'https://open.spotify.com/playlist/37i9dQZF1EIfqkfSDVB2GV',
                 'All Funked Up':'https://open.spotify.com/playlist/37i9dQZF1DX4WgZiuR77Ef',
                 'Funky Jams':'https://open.spotify.com/playlist/37i9dQZF1DX6drTZKzZwSo',
                 'Crisp':'https://open.spotify.com/playlist/37i9dQZF1DXdb5FEvfgsH9',
                 'Instrumental Funk':'https://open.spotify.com/playlist/37i9dQZF1DX8f5qTGj8FYl',
                 'Future Funk': 'https://open.spotify.com/playlist/37i9dQZF1DXbjGYBfEmjR5',
                 'Nu Funk': 'https://open.spotify.com/playlist/37i9dQZF1DWZgauS5j6pMv',
                 'I Love My 10s Funk': 'https://open.spotify.com/playlist/37i9dQZF1DX2zVOKolQSzi', 
                 'Toms Funky Playlist': "https://open.spotify.com/playlist/7eWWLoTfmLUcD0viBP6Hr0?si=e8b0760749404749"}

In [53]:
def get_playlist_URI(playlist_link):
    """Extracts URI from playlist link"""
    playlist_URI = playlist_link.split("/")[-1].split("?")[0]
    return playlist_URI


def get_tracks(playlist_link):
    """Get the list of tracks from a Spotify playlist.

    Args:
        playlist_link: string, web-link
    Returns:
        list of track uris
    """
    playlist_URI = get_playlist_URI(playlist_Link)
    track_uris = [x["track"]["uri"] for x in sp.playlist_tracks(playlist_URI)["items"]]
    return track_uris

def extract_audio_feat(track_uri, track_dict):
    """Extracts audio features for each track_uri and adds them to the track_dict.

    Args:
        track_uri: Spotify track URI
        track_dict: dictionary of track metadata
    Returns:
        track_dict: updated with audio features
    """

        # Audio Features
    audio_feat_list = ['acousticness',
                       'danceability',
                       'energy',
                       'instrumentalness',
                       'key',
                       'liveness',
                       'loudness',
                       'mode',
                       'speechiness',
                       'tempo',
                       'time_signature',
                       'valence']
                       
    audio_feat = sp.audio_features(track_uri)[0]
    
    for feat in audio_feat_list:
        track_dict[feat] = audio_feat[feat]
        
    return track_dict

def pl_track_features(playlist_link):
    """
    Calls the Spotify API to collect track listings for each playlist. 
    Pulls meta data and track data for each track and returns a dataframe with all of the features
    
    Input: playlist_link - URI for a Spotify playlist
    Returns: pandas dataframe with tracklisting and audio features
    """
    
    # initialize dataframe for results
    tracks_df = pd.DataFrame() 
    
    playlist_URI = get_playlist_URI(playlist_link)
    
    # Loop over tracks to gather info
    for track in sp.playlist_tracks(playlist_URI)["items"]:
        this_track = {}
        #URI
        track_uri = track["track"]["uri"]
        this_track['track_uri'] = track_uri

        #Track name
        this_track['track_name'] = track["track"]["name"]

        #Main Artist
        artist_uri = track["track"]["artists"][0]["uri"]
        this_track['artist_uri'] = artist_uri
        artist_info = sp.artist(artist_uri)

        #Name, popularity, genre
        this_track['artist_name'] = track["track"]["artists"][0]["name"]
        this_track['artist_pop'] = artist_info["popularity"]
        this_track['artist_genres'] = artist_info["genres"]

        #Album
        this_track['album'] = track["track"]["album"]["name"]

        #Track Metadata
        this_track['track_pop'] = track["track"]["popularity"]
        this_track['explicit'] = track["track"]['explicit']

        # Audio Features
        try:
            this_track = extract_audio_feat(track_uri, this_track)
        except:
            pass
    
    
        # Convert to DataFrame
        this_track_df = pd.json_normalize(this_track)
    
        tracks_df = pd.concat([tracks_df,this_track_df], ignore_index=True)
    
    # Make sure there are no duplicates
    tracks_df = tracks_df.drop_duplicates('track_uri')
    
    return tracks_df

### Collect data for all playlists

In [None]:
playlist = 'Funk and Soul Classics'
playlist = 'All Funked Up'

playlist_uri = get_playlist_URI(playlist_links[playlist])
# pl_tracks = sp.playlist_tracks(playlist_uri)["items"]
# len(pl_tracks)

In [None]:
playlist_df = pd.DataFrame()
for pl_name, pl_link in playlist_links.items():
    this_pl_df = gather_track_features(pl_link)
    this_pl_df['playlist'] = pl_name
    print(f'{pl_name}: {len(this_pl_df)} tracks')
    playlist_df = pd.concat([playlist_df, this_pl_df], ignore_index=True)
    
# Make sure there are no duplicates - saving this for later since a track was dropped from the comparison playlist
# playlist_df = playlist_df.drop_duplicates('track_uri')


In [None]:
# Save to Excel for future use
playlist_df.to_excel("funky_playlist_tracks.xlsx", index=False)

In [None]:
playlist_df.playlist.value_counts()

In [None]:
playlist_df.artist_name.value_counts()

In [None]:
playlist_df.loc[playlist_df.playlist !='Toms Funky Playlist'].artist_name.value_counts()

## Get Sappy Makes Playlists Playlists
Get all of Brian's playlists and then pull all of the track info.

In [4]:
userid = '31ijsgvk3npljcg7ky4fdwwyetiy'
playlists = sp.user_playlists(userid)
# playlists

In [9]:
?sp.user_playlists

[0;31mSignature:[0m [0msp[0m[0;34m.[0m[0muser_playlists[0m[0;34m([0m[0muser[0m[0;34m,[0m [0mlimit[0m[0;34m=[0m[0;36m50[0m[0;34m,[0m [0moffset[0m[0;34m=[0m[0;36m0[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Gets playlists of a user

Parameters:
    - user - the id of the usr
    - limit  - the number of items to return
    - offset - the index of the first item to return
[0;31mFile:[0m      ~/opt/anaconda3/lib/python3.8/site-packages/spotipy/client.py
[0;31mType:[0m      method


In [22]:
pls_found = len(sp.user_playlists(userid, offset=offset)["items"])
pls_found

50

In [45]:
offset = 0
playlists = sp.user_playlists(userid, limit=50, offset=offset)["items"]
for pl in playlists:
    this_pl = {}
    #URI
    this_pl['pl_id'] = pl["id"]
    this_pl['href'] = pl["href"]
    this_pl['pl_name'] = pl['name']
    this_pl['description'] = pl['description']
#     print(this_pl['pl_name'])

    # Convert to DataFrame
    this_pl_df = pd.json_normalize(this_pl)
this_pl_df

Unnamed: 0,pl_id,href,pl_name,description
0,33IPv0BtnEu7En0ckaPQDM,https://api.spotify.com/v1/playlists/33IPv0Btn...,"Sappy 22: 50s Rock, Vol 1",


In [46]:
# Gather all of the playlist meta data
# initialize dataframe for results
pls_df = pd.DataFrame() 

# Loop over playlists to gather info
offset = 0
pls_found = len(sp.user_playlists(userid, offset=offset)["items"])
print(f'Playlists found: {pls_found}')
while pls_found > 0:
    playlists = sp.user_playlists(userid, limit=50, offset=offset)["items"]
    for pl in playlists:
        this_pl = {}
        #URI
        this_pl['pl_id'] = pl["id"]
        this_pl['href'] = pl["href"]
        this_pl['pl_name'] = pl['name']
        this_pl['description'] = pl['description']
#         print(this_pl['pl_name'])

        # Convert to DataFrame
        this_pl_df = pd.json_normalize(this_pl)

        pls_df = pd.concat([pls_df,this_pl_df], ignore_index=True)
    offset = offset + pls_found
    pls_found = len(sp.user_playlists(userid, offset=offset)["items"])
    print(f'Next list count: {pls_found}')

# Make sure there are no duplicates
# pls_df = pls_df.drop_duplicates()
pls_df

Playlists found: 50
Next list count: 26
Next list count: 0


Unnamed: 0,pl_id,href,pl_name,description
0,16ThrcdwcMPcpATpSZqyVE,https://api.spotify.com/v1/playlists/16Thrcdwc...,Sappy 22: Vol 24,Sounds of the MCU
1,5vMRg3r9q5iyOrokgHsEur,https://api.spotify.com/v1/playlists/5vMRg3r9q...,Sappy 22: Vol 42,
2,23DSsB6s5gmR5L8doGeKVh,https://api.spotify.com/v1/playlists/23DSsB6s5...,Sappy 22: Vol 41,
3,4rrkgCcXTFjmZPSWRaFfy7,https://api.spotify.com/v1/playlists/4rrkgCcXT...,Sappy 22: Vol 40,
4,1WqFCm7y5jtTboCWVNbDb4,https://api.spotify.com/v1/playlists/1WqFCm7y5...,Sappy 22: Volume 40,
...,...,...,...,...
71,1ettjTzByG9E9ZvPJET7D3,https://api.spotify.com/v1/playlists/1ettjTzBy...,Sappy 22: Vol 5,
72,4iMFNx1SPSumbzAOm1uotF,https://api.spotify.com/v1/playlists/4iMFNx1SP...,Sappy 22: Vol 7,
73,3I9fDwQJBhrQMoE03zzWjn,https://api.spotify.com/v1/playlists/3I9fDwQJB...,Sappy 22: Vol 8,NYHC x NJHC
74,5bcpasB3EzZnlFc9MwNvdq,https://api.spotify.com/v1/playlists/5bcpasB3E...,Sappy 22: Vol 9,


## Collect Data for All Playlists

In [54]:
playlist_df = pd.DataFrame()

for i in range(len(pls_df)):
    pl_link = pls_df.iloc[i]['href']
    pl_name = pls_df.iloc[i]['pl_name']
    print(f'Name: {pl_name}: {href}')
               
    this_pl_df = gather_track_features(pl_link)
    this_pl_df['playlist'] = pl_name
    print(f'{pl_name}: {len(this_pl_df)} tracks')
    playlist_df = pd.concat([playlist_df, this_pl_df], ignore_index=True)

Name: Sappy 22: Vol 24: https://api.spotify.com/v1/playlists/276HTkTYo9ceNXteNuMMOp
Sappy 22: Vol 24: 22 tracks
Name: Sappy 22: Vol 42: https://api.spotify.com/v1/playlists/276HTkTYo9ceNXteNuMMOp
Sappy 22: Vol 42: 22 tracks
Name: Sappy 22: Vol 41: https://api.spotify.com/v1/playlists/276HTkTYo9ceNXteNuMMOp
Sappy 22: Vol 41: 22 tracks
Name: Sappy 22: Vol 40: https://api.spotify.com/v1/playlists/276HTkTYo9ceNXteNuMMOp
Sappy 22: Vol 40: 22 tracks
Name: Sappy 22: Volume 40: https://api.spotify.com/v1/playlists/276HTkTYo9ceNXteNuMMOp
Sappy 22: Volume 40: 63 tracks
Name: Sappy 22: Vol 39: https://api.spotify.com/v1/playlists/276HTkTYo9ceNXteNuMMOp
Sappy 22: Vol 39: 22 tracks
Name: Sappy 22: Vol 38: https://api.spotify.com/v1/playlists/276HTkTYo9ceNXteNuMMOp
Sappy 22: Vol 38: 22 tracks
Name: Sappy 22: Vol 37: https://api.spotify.com/v1/playlists/276HTkTYo9ceNXteNuMMOp
Sappy 22: Vol 37: 21 tracks
Name: Sappy 22: Vol 45: https://api.spotify.com/v1/playlists/276HTkTYo9ceNXteNuMMOp
Sappy 22: Vol 

In [55]:
# Save to Excel for future use
playlist_df.to_excel("sappy_playlist_tracks.xlsx", index=False)