In [3]:
import pandas as pd
import numpy as np
import json
import re 
import sys
import itertools
import statistics

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt


import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from spotipy.oauth2 import SpotifyOAuth
import spotipy.util as util

import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

In [4]:
sp_df = pd.read_csv('SpotifyFeatures.csv')

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 14: invalid continuation byte

In [None]:
sp_df['artist_name']=sp_df['artist_name'].str.upper()
sp_df['track_name']=sp_df['track_name'].str.upper()

In [None]:
sp_df

In [None]:
sp_df.dtypes

In [None]:
sp_df.info()

In [None]:
sp_df.shape

In [None]:
sp_df['genre'].values[0]

In [None]:
sp_df['artist_name'].values[0]

In [None]:
sp_df[sp_df['artist_name']=='HARRY STYLES']

In [None]:
sp_df[sp_df['id']=='5Q0Nhxo0l2bP3pNjpGJwV1'].nunique()

In [None]:
id_genres = sp_df.groupby('id')['genre'].apply(list).reset_index()

In [None]:
id_popularity = sp_df.groupby('id')['popularity'].apply(list).reset_index()

In [None]:
id_genres

In [None]:
id_popularity

In [None]:
sp_df.drop(['genre','popularity'], axis=1, inplace=True)

In [None]:
sp_df.drop_duplicates(inplace=True)

In [None]:
sp_df=sp_df.merge(id_genres[['id','genre']], on = 'id',how = 'left')
sp_df=sp_df.merge(id_popularity[['id','popularity']], on = 'id',how = 'left')


In [None]:
sp_df.head()

In [None]:
sp_df['mean_popularity'] = [np.array(x).mean() for x in sp_df.popularity.values]

In [None]:
float_cols = sp_df.dtypes[sp_df.dtypes == 'float64'].index.values

In [None]:
ohecol = 'mean_popularity'

In [None]:
sp_df['mean_popularity'].describe()

In [None]:
sp_df['tempo'].describe()

In [None]:
sp_df['popularitybucket'] = sp_df['mean_popularity'].apply(lambda x: int(x/5))

In [None]:
sp_df.loc[ sp_df['tempo'] <= 30.00, 'cat_tempo'] = 0
sp_df.loc[(sp_df['tempo'] > 30.00) & (sp_df['tempo'] <= 40.00), 'cat_tempo'] = 1
sp_df.loc[(sp_df['tempo'] > 40.00) & (sp_df['tempo'] <= 60.0), 'cat_tempo']   = 2
sp_df.loc[(sp_df['tempo'] > 60.00) & (sp_df['tempo'] <= 66.00), 'cat_tempo']   = 3
sp_df.loc[(sp_df['tempo'] > 66.00) & (sp_df['tempo'] <= 80.00), 'cat_tempo']   = 4
sp_df.loc[(sp_df['tempo'] > 80.00) & (sp_df['tempo'] <= 108.00), 'cat_tempo']   = 5
sp_df.loc[(sp_df['tempo'] > 108.00) & (sp_df['tempo'] <= 124.00), 'cat_tempo']   = 6
sp_df.loc[(sp_df['tempo'] > 124.00) & (sp_df['tempo'] <= 168.00), 'cat_tempo']   = 7
sp_df.loc[(sp_df['tempo'] > 168.00) & (sp_df['tempo'] <= 176.00), 'cat_tempo']   = 8
sp_df.loc[(sp_df['tempo'] > 176.00) & (sp_df['tempo'] <= 200.00), 'cat_tempo']   = 9
sp_df.loc[ sp_df['tempo'] > 200, 'cat_tempo'] = 10
sp_df['cat_tempo'] = sp_df['cat_tempo'].astype(int)

In [None]:
sp_df

In [None]:
sp_df.info()

In [None]:
def ohe_prep(df, column, new_name): 
    tf_df = pd.get_dummies(df[column])
    feature_names = tf_df.columns
    tf_df.columns = [new_name + "|" + str(i) for i in feature_names]
    tf_df.reset_index(drop = True, inplace = True)    
    return tf_df

In [None]:
def create_feature_set(df, float_cols):
    tfidf = TfidfVectorizer()
    tfidf_matrix =  tfidf.fit_transform(sp_df['genre'].apply(lambda x: " ".join(x)))
    genre_df = pd.DataFrame(tfidf_matrix.toarray())
    genre_df.columns = ['genre' + "|" + i for i in tfidf.get_feature_names()]
    genre_df.reset_index(drop = True, inplace=True)

    tempo_ohe = ohe_prep(df, 'cat_tempo','ctempo') * 0.5
    popularity_ohe = ohe_prep(sp_df, 'popularitybucket','pbucket') * 0.15

    #scale float columns
    floats = df[float_cols].reset_index(drop = True)
    scaler = MinMaxScaler()
    floats_scaled = pd.DataFrame(scaler.fit_transform(floats), columns = floats.columns) * 0.2

    #concanenate all features
    final = pd.concat([genre_df, floats_scaled, popularity_ohe,tempo_ohe], axis = 1)
     
    #add song id
    final['id']=sp_df['id'].values
    
    return final

In [None]:
complete_feature_set = create_feature_set(sp_df, float_cols=float_cols)

In [None]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(complete_feature_set.head(5))


In [None]:
clientid='8bcfe563f2b4441290539a7e4a4f0d9c'
clientsecret='b01eb6f1b11840b8a1d310f2f402c4fe'

In [None]:
scope = 'user-library-read'

if len(sys.argv) > 1:
    username = sys.argv[1]
else:
    print("Usage: %s username" % (sys.argv[0],))
    sys.exit()

In [None]:
auth_manager = SpotifyClientCredentials(client_id=clientid, client_secret=clientsecret)
sp = spotipy.Spotify(auth_manager=auth_manager)

In [None]:
token = util.prompt_for_user_token(scope, client_id= clientid, client_secret=clientsecret, redirect_uri='http://localhost:8881/')

In [None]:
sp = spotipy.Spotify(auth=token)

In [None]:
id_name = {}
list_photo = {}
for i in sp.current_user_playlists()['items']:

    id_name[i['name']] = i['uri'].split(':')[2]
    list_photo[i['uri'].split(':')[2]] = i['images'][0]['url']

In [None]:
id_name

In [None]:
def create_necessary_outputs(playlist_name,id_dic, df):    

    playlist = pd.DataFrame()
    playlist_name = playlist_name
    for ix, i in enumerate(sp.playlist(id_dic[playlist_name])['tracks']['items']):
       
        playlist.loc[ix, 'artist'] = i['track']['artists'][0]['name']
        playlist.loc[ix, 'name'] = i['track']['name']
        playlist.loc[ix, 'id'] = i['track']['id'] # ['uri'].split(':')[2]
        playlist.loc[ix, 'url'] = i['track']['album']['images'][1]['url']
        playlist.loc[ix, 'date_added'] = i['added_at']

    playlist['date_added'] = pd.to_datetime(playlist['date_added'])  
    
    playlist = playlist[playlist['id'].isin(df['id'].values)].sort_values('date_added',ascending = False)
    
    return playlist

In [None]:
playlist_mine = create_necessary_outputs('Mixed Feelings',id_name,sp_df)

In [None]:
from skimage import io
import matplotlib.pyplot as plt

def visualize_songs(df):

    temp = df['url'].values
    plt.figure(figsize=(15,int(0.625 * len(temp))))
    columns = 5
    
    for i, url in enumerate(temp):
        plt.subplot(len(temp) / columns + 1, columns, i + 1)

        image = io.imread(url)
        plt.imshow(image)
        plt.xticks(color = 'w', fontsize = 0.1)
        plt.yticks(color = 'w', fontsize = 0.1)
        plt.xlabel(df['name'].values[i], fontsize = 12)
        plt.tight_layout(h_pad=0.4, w_pad=0)
        plt.subplots_adjust(wspace=None, hspace=None)

    plt.show()

In [None]:
playlist_mine

In [None]:
visualize_songs(playlist_mine)

In [None]:
def generate_playlist_feature(complete_feature_set, playlist_df, weight_factor):
    """ 
    Summarize a user's playlist into a single vector

    Parameters: 
        complete_feature_set (pandas dataframe): Dataframe which includes all of the features for the spotify songs
        playlist_df (pandas dataframe): playlist dataframe
        weight_factor (float): float value that represents the recency bias. The larger the recency bias, the most priority recent songs get. Value should be close to 1. 
        
    Returns: 
        playlist_feature_set_weighted_final (pandas series): single feature that summarizes the playlist
        complete_feature_set_nonplaylist (pandas dataframe): 
    """
    
    complete_feature_set_playlist = complete_feature_set[complete_feature_set['id'].isin(playlist_df['id'].values)]#.drop('id', axis = 1).mean(axis =0)
    complete_feature_set_playlist = complete_feature_set_playlist.merge(playlist_df[['id','date_added']], on = 'id', how = 'inner')
    complete_feature_set_nonplaylist = complete_feature_set[~complete_feature_set['id'].isin(playlist_df['id'].values)]#.drop('id', axis = 1)
    
    playlist_feature_set = complete_feature_set_playlist.sort_values('date_added',ascending=False)

    most_recent_date = playlist_feature_set.iloc[0,-1]
    
    for ix, row in playlist_feature_set.iterrows():
        playlist_feature_set.loc[ix,'months_from_recent'] = int((most_recent_date.to_pydatetime() - row.iloc[-1].to_pydatetime()).days / 30)
        
    playlist_feature_set['weight'] = playlist_feature_set['months_from_recent'].apply(lambda x: weight_factor ** (-x))
    
    playlist_feature_set_weighted = playlist_feature_set.copy()
    #print(playlist_feature_set_weighted.iloc[:,:-4].columns)
    playlist_feature_set_weighted.update(playlist_feature_set_weighted.iloc[:,:-4].mul(playlist_feature_set_weighted.weight,0))
    playlist_feature_set_weighted_final = playlist_feature_set_weighted.iloc[:, :-4]
    #playlist_feature_set_weighted_final['id'] = playlist_feature_set['id']
    
    return playlist_feature_set_weighted_final.sum(axis = 0), complete_feature_set_nonplaylist

In [None]:
complete_feature_set_playlist_vector_mine, complete_feature_set_nonplaylist_mine = generate_playlist_feature(complete_feature_set, playlist_mine, 1.09)

In [None]:
complete_feature_set_playlist_vector_mine.shape

In [None]:
def generate_playlist_recos(df, features, nonplaylist_features):
    """ 
    Pull songs from a specific playlist.

    Parameters: 
        df (pandas dataframe): spotify dataframe
        features (pandas series): summarized playlist feature
        nonplaylist_features (pandas dataframe): feature set of songs that are not in the selected playlist
        
    Returns: 
        non_playlist_df_top_40: Top 40 recommendations for that playlist
    """
    
    non_playlist_df = df[df['id'].isin(nonplaylist_features['id'].values)]
    non_playlist_df['sim'] = cosine_similarity(nonplaylist_features.drop('id', axis = 1).values, features.values.reshape(1, -1))[:,0]
    non_playlist_df_top_40 = non_playlist_df.sort_values('sim',ascending = False).head(40)
    non_playlist_df_top_40['url'] = non_playlist_df_top_40['id'].apply(lambda x: sp.track(x)['album']['images'][1]['url'])
    
    return non_playlist_df_top_40

In [None]:
mine_top40 = generate_playlist_recos(sp_df, complete_feature_set_playlist_vector_mine, complete_feature_set_nonplaylist_mine)