# Spotify Funk Recommender - Data Collection
Gather features from various Spotify playlists and associated tracks and compile into a file. This data collection is intended for use in building a song recommender based on a playlist and other similar tracks.


## Imports

In [1]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd
import numpy as np

# Spotify Credentials
import spot_creds

# Graphing
import matplotlib.pyplot as plt

# Scikit-Learn
from sklearn.feature_extraction.text import TfidfVectorizer

clid = spot_creds.client_id
secret = spot_creds.secret

#Authentication - without user
client_credentials_manager = SpotifyClientCredentials(client_id=clid, client_secret=secret)
sp = spotipy.Spotify(client_credentials_manager = client_credentials_manager)

## Gather Data
First, find a number of Spotify funk music playlists.

* Old School Funk: https://open.spotify.com/playlist/37i9dQZF1EIfqkfSDVB2GV
* All Funked Up: https://open.spotify.com/playlist/37i9dQZF1DX4WgZiuR77Ef
* Funky Jams: https://open.spotify.com/playlist/37i9dQZF1DX6drTZKzZwSo
* Crisp: https://open.spotify.com/playlist/37i9dQZF1DXdb5FEvfgsH9
* Instrumental Funk: https://open.spotify.com/playlist/37i9dQZF1DX8f5qTGj8FYl
* Future Funk: https://open.spotify.com/playlist/37i9dQZF1DXbjGYBfEmjR5
* Nu Funk: https://open.spotify.com/playlist/37i9dQZF1DWZgauS5j6pMv
* I Love My 10s Funk: https://open.spotify.com/playlist/37i9dQZF1DX2zVOKolQSzi

In [2]:
playlist_links = {'Old School Funk':'https://open.spotify.com/playlist/37i9dQZF1EIfqkfSDVB2GV',
                 'All Funked Up':'https://open.spotify.com/playlist/37i9dQZF1DX4WgZiuR77Ef',
                 'Funky Jams':'https://open.spotify.com/playlist/37i9dQZF1DX6drTZKzZwSo',
                 'Crisp':'https://open.spotify.com/playlist/37i9dQZF1DXdb5FEvfgsH9',
                 'Instrumental Funk':'https://open.spotify.com/playlist/37i9dQZF1DX8f5qTGj8FYl',
                 'Future Funk': 'https://open.spotify.com/playlist/37i9dQZF1DXbjGYBfEmjR5',
                 'Nu Funk': 'https://open.spotify.com/playlist/37i9dQZF1DWZgauS5j6pMv',
                 'I Love My 10s Funk': 'https://open.spotify.com/playlist/37i9dQZF1DX2zVOKolQSzi', 
                 'Toms Funky Playlist': "https://open.spotify.com/playlist/7eWWLoTfmLUcD0viBP6Hr0?si=e8b0760749404749"}

In [23]:
def get_playlist_URI(playlist_link):
    """Extracts URI from playlist link"""
    playlist_URI = playlist_link.split("/")[-1].split("?")[0]
    return playlist_URI


def get_tracks(playlist_link):
    """Get the list of tracks from a Spotify playlist.

    Args:
        playlist_link: string, web-link
    Returns:
        list of track uris
    """
    playlist_URI = get_playlist_URI(playlist_Link)
    track_uris = [x["track"]["uri"] for x in sp.playlist_tracks(playlist_URI)["items"]]
    return track_uris

def extract_audio_feat(track_uri, track_dict):
    """Extracts audio features for each track_uri and adds them to the track_dict.

    Args:
        track_uri: Spotify track URI
        track_dict: dictionary of track metadata
    Returns:
        track_dict: updated with audio features
    """

        # Audio Features
    audio_feat_list = ['acousticness',
                       'danceability',
                       'energy',
                       'instrumentalness',
                       'key',
                       'liveness',
                       'loudness',
                       'mode',
                       'speechiness',
                       'tempo',
                       'time_signature',
                       'valence']
                       
    audio_feat = sp.audio_features(track_uri)[0]
    
    for feat in audio_feat_list:
        track_dict[feat] = audio_feat[feat]
        
    return track_dict


def artist_bank(artist_id, artist_df=None):
    if artist_df is None:
        artist_df = pd.DataFrame()
        
    if 'artist_id' in artist_df.columns:
        if artist_id in artist_df.artist_id:
            return artist_df
    else:
        this_artist = sp.artist(artist_id)
        this_artist_df = pd.json_normalize(this_artist)
        this_artist_df = this_artist_df[['genres','id','name','popularity','uri','followers.total']]
        this_artist_df.columns = ['artist_genres','artist_id','aritst_name','artist_pop','artist_uri','artist_followers_total']

        #combine with running list
        artist_df = pd.concat([artist_df, this_artist_df ])

    return artist_df


def pl_track_features(playlist_link):
    """
    Calls the Spotify API to collect track listings for each playlist. 
    Pulls meta data and track data for each track and returns a dataframe with all of the features
    
    Input: playlist_link - URI for a Spotify playlist
    Returns: pandas dataframe with tracklisting and audio features
    """
    
    # initialize dataframe for results
    tracks_df = pd.DataFrame()
    artists_df = pd.DataFrame()
    
    playlist_URI = get_playlist_URI(playlist_link)
    
    # Loop over tracks to gather info
    for track in sp.playlist_tracks(playlist_URI)["items"]:
        this_track = {}
        #URI
        track_uri = track["track"]["uri"]
        this_track['track_uri'] = track_uri

        #Track name
        this_track['track_name'] = track["track"]["name"]

        #Main Artist
        artist_id = track["track"]["artists"][0]["id"]
        this_track['artist_id'] = artist_id
        
        # Add artist info to the artist-bank to minimize API calls and avoid rate limits
        artists_df = artist_bank(artist_id, artists_df)

        #Name, popularity, genre
        this_track['artist_name'] = track["track"]["artists"][0]["name"]

        #Album
        this_track['album'] = track["track"]["album"]["name"]

        #Track Metadata
        this_track['track_pop'] = track["track"]["popularity"]
        this_track['explicit'] = track["track"]['explicit']

        # Audio Features
        try:
            this_track = extract_audio_feat(track_uri, this_track)
        except:
            pass
    
    
        # Convert to DataFrame
        this_track_df = pd.json_normalize(this_track)
    
        tracks_df = pd.concat([tracks_df,this_track_df], ignore_index=True)
        
    # Merge with Artist data
    tracks_df = tracks_df.merge(artists_df, how='left',on='artist_id')
    
    # Make sure there are no duplicates
    tracks_df = tracks_df.drop_duplicates('track_uri')
    
    return tracks_df

### Collect data for all playlists

In [24]:
playlist = 'Funk and Soul Classics'
playlist = 'All Funked Up'

playlist_uri = get_playlist_URI(playlist_links[playlist])
# pl_tracks = sp.playlist_tracks(playlist_uri)["items"]
# len(pl_tracks)

track_features_df = pl_track_features(playlist_uri)
track_features_df.head()

Unnamed: 0,track_uri,track_name,artist_id,artist_name,album,track_pop,explicit,acousticness,danceability,energy,...,mode,speechiness,tempo,time_signature,valence,artist_genres,aritst_name,artist_pop,artist_uri,artist_followers_total
0,spotify:track:4hQWcL2ABDLDSL3SCAJMNg,We Are The Party,3VNITwohbvU5Wuy5PC6dsI,Kool & The Gang,We Are The Party,48,False,0.00152,0.746,0.648,...,1,0.0349,104.92,4,0.336,"[disco, funk, motown, soul]",Kool & The Gang,67.0,spotify:artist:3VNITwohbvU5Wuy5PC6dsI,2301864.0
1,spotify:track:2X1jifAuCUkeIiidlRDxFl,Up Is Just A Place Ft. George Clinton x Fred W...,5MFlcmLtTY9qSoLsrUmbwU,Smudge All Stars,Up Is Just A Place Ft. George Clinton x Fred W...,36,False,0.0272,0.747,0.579,...,0,0.141,99.039,4,0.868,,,,,
2,spotify:track:3lClFv9E9nNuq5fcbCmjIT,Beautiful Dreams,6WrjOtCau0UPAB3QSeOWzO,Acantha Lang,Beautiful Dreams,26,False,0.0601,0.665,0.787,...,1,0.328,187.943,4,0.536,,,,,
3,spotify:track:1a0MoIIXDUQc25MgthHkc7,On The Road Again,4WmMnGO1nLIsE85XwcBAZE,Orgone,On The Road Again,35,False,0.0176,0.721,0.727,...,1,0.0424,118.016,4,0.801,,,,,
4,spotify:track:3zdE6X6hdKyth9TTDTEpcn,Stanky Funk - feat. Bootie Brown,2v2cdjqYIpT8ZBpflNTttY,The Allergies,Stanky Funk (feat. Bootie Brown),35,False,0.218,0.692,0.883,...,1,0.125,104.921,4,0.828,,,,,


In [22]:
# artists_df.head()
artist_id = '2pXFmyqPm7wHJ1HGAwyR3L'
artist_df = pd.DataFrame()
artist_bank(artist_id, artist_df)

Unnamed: 0,artist_genres,artist_id,aritst_name,artist_pop,artist_uri,artist_followers_total
0,[funk rock],2pXFmyqPm7wHJ1HGAwyR3L,Here Come The Mummies,34,spotify:artist:2pXFmyqPm7wHJ1HGAwyR3L,44362


In [None]:
playlist_df = pd.DataFrame()
for pl_name, pl_link in playlist_links.items():
    this_pl_df = pl_track_features(pl_link)
    this_pl_df['playlist'] = pl_name
    print(f'{pl_name}: {len(this_pl_df)} tracks')
    playlist_df = pd.concat([playlist_df, this_pl_df], ignore_index=True)
    
# Make sure there are no duplicates - saving this for later since a track was dropped from the comparison playlist
# playlist_df = playlist_df.drop_duplicates('track_uri')


In [None]:
# Save to Excel for future use
playlist_df.to_excel("funky_playlist_tracks.xlsx", index=False)

In [None]:
playlist_df.playlist.value_counts()

In [None]:
playlist_df.artist_name.value_counts()

In [None]:
playlist_df.loc[playlist_df.playlist !='Toms Funky Playlist'].artist_name.value_counts()

## Get Sappy Makes Playlists Playlists
Get all of Brian's playlists and then pull all of the track info.

In [None]:
userid = '31ijsgvk3npljcg7ky4fdwwyetiy'
playlists = sp.user_playlists(userid)
# playlists

In [None]:
pls_found = len(sp.user_playlists(userid, offset=offset)["items"])
pls_found

In [None]:
offset = 0
playlists = sp.user_playlists(userid, limit=50, offset=offset)["items"]
for pl in playlists:
    this_pl = {}
    #URI
    this_pl['pl_id'] = pl["id"]
    this_pl['href'] = pl["href"]
    this_pl['pl_name'] = pl['name']
    this_pl['description'] = pl['description']
#     print(this_pl['pl_name'])

    # Convert to DataFrame
    this_pl_df = pd.json_normalize(this_pl)
this_pl_df

In [None]:
# Gather all of the playlist meta data
# initialize dataframe for results
pls_df = pd.DataFrame() 

# Loop over playlists to gather info
offset = 0
pls_found = len(sp.user_playlists(userid, offset=offset)["items"])
print(f'Playlists found: {pls_found}')
while pls_found > 0:
    playlists = sp.user_playlists(userid, limit=50, offset=offset)["items"]
    for pl in playlists:
        this_pl = {}
        #URI
        this_pl['pl_id'] = pl["id"]
        this_pl['href'] = pl["href"]
        this_pl['pl_name'] = pl['name']
        this_pl['description'] = pl['description']
#         print(this_pl['pl_name'])

        # Convert to DataFrame
        this_pl_df = pd.json_normalize(this_pl)

        pls_df = pd.concat([pls_df,this_pl_df], ignore_index=True)
    offset = offset + pls_found
    pls_found = len(sp.user_playlists(userid, offset=offset)["items"])
    print(f'Next list count: {pls_found}')

# Make sure there are no duplicates
# pls_df = pls_df.drop_duplicates()
pls_df

## Collect Data for All Playlists

In [None]:
playlist_df = pd.DataFrame()

for i in range(len(pls_df)):
    pl_link = pls_df.iloc[i]['href']
    pl_name = pls_df.iloc[i]['pl_name']
    print(f'Name: {pl_name}: {href}')
               
    this_pl_df = pl_track_features(pl_link)
    this_pl_df['playlist'] = pl_name
    print(f'{pl_name}: {len(this_pl_df)} tracks')
    playlist_df = pd.concat([playlist_df, this_pl_df], ignore_index=True)

In [None]:
# Save to Excel for future use
playlist_df.to_excel("sappy_playlist_tracks.xlsx", index=False)

In [None]:
# Can we get all tracks from an album as if it were a playlist???
