In [1]:
import datetime
import os
import re

import pandas as pd
from dateutil import tz
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from spotipy.oauth2 import SpotifyOAuth

In [2]:
def importSpotifyKaggleDataset():
    """
    Function definition: 
        Loads the local copy of the Spotify tracks dataset published on Kaggle.
        The dataset can be downloaded from:
        https://www.kaggle.com/yamaerenay/spotify-dataset-19212020-160k-tracks

    Arguments: 
        None

    Returns:
        A dataframe holding the contents of 'archive/tracks.csv' (path is relative to the
        current working directory)
    """

    # The CSV is read with pandas' default parsing options
    return pd.read_csv('archive/tracks.csv')

In [3]:
def cleanSpotifyData(dataframe,datatype=None):

    """
    Function definition: 
        This function performs data cleaning on the data from Spotify. All the work is done on a new
        dataframe: the dataframe passed in is never modified, so the calling cell can be re-run safely

    Arguments: 
        'dataframe' -- Dataframe containing data from Spotify. It must contain the columns 'id', 'name',
                       'popularity', 'duration_ms', 'artists', 'id_artists' and 'release_date'
                       (and 'explicit' when datatype is "user_data")
        'datatype' -- Type of Spotify data. "kaggle_data" additionally removes duplicated songs,
                      "user_data" additionally converts 'explicit' to 0/1. Any other value only
                      applies the steps common to both

    Returns:
        'cleanedDataframe' -- A new dataframe containing cleaned Spotify data, indexed 0..n-1
    """

    # Rows with any missing value are discarded. dropna()/reset_index() return new objects,
    # which is what keeps the caller's dataframe untouched.
    cleanedDataframe = dataframe.dropna().reset_index(drop=True)

    # Values in the explicit column for the data from user's Spotify are represented in
    # false/true format. Convert them to the 0/1 format used by the kaggle dataset.
    if datatype == "user_data":
        cleanedDataframe['explicit'] = cleanedDataframe["explicit"].astype(int)

    if datatype == "kaggle_data":
        # The same song can appear under several track IDs. Two rows are treated as the same song
        # when "artist name(s) + track name" match, and only the first occurrence is kept.
        #
        # We ONLY want to do this for the kaggle dataset and NOT the user's data because if a user
        # has listened to the same song multiple times we want that to be reflected and have some
        # effect (bias) on the outcome.
        duplicateKey = cleanedDataframe['artists'].astype(str) + cleanedDataframe['name'].astype(str)
        cleanedDataframe = cleanedDataframe[~duplicateKey.duplicated()]

    # Dropping the following columns/features since the experiment is about recommending tracks
    # only based on musical features:
    # 1. Popularity
    # 2. Duration
    # 3. Artist Name(s)
    # 4. Artists ID(s)
    # 5. Release date
    cleanedDataframe = cleanedDataframe.drop(columns=['popularity','duration_ms','artists',
                                                      'id_artists','release_date'])

    cleanedDataframe = cleanedDataframe.rename(columns={"id": "track_id", "name":"track_name"})

    return cleanedDataframe.reset_index(drop=True)
        

#### OUTLINE OF THE FEATURES NEEDED TO BE PRE-PROCESSED

##### Numerical:

1. acousticness (Ranges from 0 to 1)
2. danceability (Ranges from 0 to 1)
3. energy (Ranges from 0 to 1)
4. instrumentalness (Ranges from 0 to 1)
5. valence (Ranges from 0 to 1)
6. liveness (Ranges from 0 to 1)
7. tempo (Float typically ranging from 50 to 150)
8. loudness (Float typically ranging from -60 to 0)
9. speechiness (Ranges from 0 to 1)

##### Categorical:

10. mode (0 = Minor, 1 = Major)
11. explicit (0 = No explicit content, 1 = Explicit content)
12. time_signature (The predicted time signature, most typically 4)
13. key (The 12 keys of the octave encoded as values ranging from 0 to 11, starting with C as 0, C# as 1, and so on…)

In [4]:
def preProcessSpotifyData(dataframe, datatype = None, baselinedf = None, preDefinedScaler = None):
    
    """
    Function definition: 
        This function performs data pre-processing (One-Hot Encoding and scaling) on the cleaned
        Spotify data. The dataframe passed in is not modified

    Arguments: 
        'dataframe' -- Dataframe containing cleaned Spotify data
        'datatype' -- Type of Spotify data: "kaggle_data" or "user_data"
        'baselinedf' -- Pre-processed kaggle dataframe whose columns define the schema the user data is
                        reindexed to. Required when datatype is "user_data"
        'preDefinedScaler' -- Already fitted scaler (object with a 'transform' method) used to scale the
                        user data. Required when datatype is "user_data"

    Returns:
        For "kaggle_data": the tuple ('processedDataframe', 'newScaler'), where 'newScaler' is the
            MinMaxScaler that was "fit" on the kaggle data so that it can later be used to (only)
            "transform" the data points coming from the user's Spotify
        For "user_data": only the pre-processed dataframe. It has the columns of 'baselinedf' plus
            'user_listening_dateonly' (date on which each track was played)

    Raises:
        ValueError -- if 'datatype' is not supported, or if 'baselinedf'/'preDefinedScaler' is missing
                      for "user_data"
    """

    # Fail early and explicitly instead of silently returning None (unknown datatype) or crashing
    # later with an AttributeError on None (missing baseline/scaler)
    if datatype not in ("kaggle_data", "user_data"):
        raise ValueError("datatype must be 'kaggle_data' or 'user_data', got %r" % (datatype,))
    if datatype == "user_data" and (baselinedf is None or preDefinedScaler is None):
        raise ValueError("baselinedf and preDefinedScaler are required when datatype is 'user_data'")

    categoricalFeatures = ['mode','explicit','time_signature', 'key']
    featuresToBeScaled = ['danceability', 'energy','loudness','speechiness','acousticness',
                          'instrumentalness','liveness','valence','tempo']

    # Performing One-Hot Encoding on categorical variables. The encoded columns replace the original
    # categorical columns; every other column is retained. reset_index() returns a new frame, so the
    # caller's dataframe keeps its index.
    processedDataframe = pd.get_dummies(dataframe.reset_index(drop = True),
                                        columns = categoricalFeatures)

    if datatype == "kaggle_data":
        # Performing data scaling/normalization; the scaler is fitted here and handed back
        newScaler = MinMaxScaler()
        processedDataframe[featuresToBeScaled] = newScaler.fit_transform(processedDataframe[featuresToBeScaled])
        return processedDataframe, newScaler

    # ---- datatype == "user_data" from here on ----

    # Using the scaler already "fit" with the kaggle dataset to "transform" the data from user's Spotify
    processedDataframe[featuresToBeScaled] = preDefinedScaler.transform(processedDataframe[featuresToBeScaled])

    # Give the user data the same One-Hot Encoding schema as the kaggle dataset (represented by
    # 'baselinedf'): encoded columns the user never produced are added and filled with 0, columns
    # unknown to the baseline (including 'user_listening_datetime') are dropped.
    userDataframe = processedDataframe.reindex(columns = baselinedf.columns, fill_value = 0)

    # Re-attach the listening information as a date only (time of day is left out). Each value is
    # parsed on its own, so timestamps with and without fractional seconds can be mixed.
    userDataframe['user_listening_dateonly'] = processedDataframe['user_listening_datetime'].map(
        lambda playedAt: pd.Timestamp(playedAt).date())

    return userDataframe
    

In [5]:
def connectToUserSpotify(spotifyClientID,spotifySecret,spotifyRedirectUri,scope):
    """
    Function definition: 
        Builds a spotipy client for the user's Spotify account, backed by an OAuth
        authentication manager

    Arguments: 
        'spotifyClientID' -- Client ID passed to the OAuth manager
        'spotifySecret' -- Client secret passed to the OAuth manager
        'spotifyRedirectUri' -- URI/URL the user is redirected to while authorizing the app as a
                                consumer of the data in their Spotify account
        'scope' -- Resources this app asks to access (scope of authorization)

    Returns:
        A 'spotipy.Spotify' client that authenticates through the OAuth manager
    """

    # The authentication manager carries the app credentials and the requested scope
    oauthManager = SpotifyOAuth(
        scope = scope,
        redirect_uri = spotifyRedirectUri,
        client_secret = spotifySecret,
        client_id = spotifyClientID,
    )

    # The client delegates authentication to the manager created above
    return spotipy.Spotify(auth_manager = oauthManager)

In [6]:
def createSpotifyUserData(sp):
    
    """
    Function definition: 
        This function creates a dataframe by pulling the recently played tracks (listening history)
        and their audio features from the user's Spotify account

    Arguments: 
        'sp' -- Authenticated connection to the user's Spotify. It must provide
                'current_user_recently_played(limit=...)' and 'audio_features(<list of track ids>)'

    Returns:
        'spotifyUserDatadf' -- Dataframe with one row per played track (a track played several times
                               appears several times). The schema is the one of the kaggle dataset plus
                               the column 'user_listening_datetime' (when the track was played).
                               'id_artists' and 'release_date' are not pulled from the playback history
                               and are always populated with the value 'unknown'.
                               Tracks for which Spotify returns no audio features are left out.
                               The dataframe is empty (columns only) when there is no history
    """

    # Schema of the resulting dataframe: the kaggle dataset columns + 'user_listening_datetime'
    spotifyUserDatadf_columnNames = ['id', 'name', 'popularity', 'duration_ms', 'explicit', 'artists',
                               'id_artists', 'release_date', 'danceability', 'energy', 'key',
                               'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
                               'liveness', 'valence', 'tempo', 'time_signature','user_listening_datetime']

    # Columns that are copied as-is from the audio features of a track
    audioFeatureNames = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
                         'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
                         'time_signature']

    # This step pulls the tracks from the user's recently played track history
    userPlayBackHistory = sp.current_user_recently_played(limit = 50)
    playedItems = userPlayBackHistory['items']

    # Audio features are requested ONCE for all distinct tracks instead of once per played track.
    # dict.fromkeys de-duplicates while keeping the first-seen order.
    uniqueTrackIds = list(dict.fromkeys(
        itemDict['track']['id'] for itemDict in playedItems if itemDict['track']['id'] is not None))

    audioFeaturesByTrackId = {}
    if uniqueTrackIds:
        # NOTE(review): relies on the result list being in request order, with None in the place of
        # a track that has no audio features -- confirm against the spotipy / Spotify Web API docs
        requestedFeatures = sp.audio_features(uniqueTrackIds) or []
        for trackId, trackAudioFeaturesDict in zip(uniqueTrackIds, requestedFeatures):
            if trackAudioFeaturesDict is not None:
                audioFeaturesByTrackId[trackId] = trackAudioFeaturesDict

    # One record per played item; the dataframe is built once from all records
    records = []
    for itemDict in playedItems:
        trackDict = itemDict['track']
        trackAudioFeaturesDict = audioFeaturesByTrackId.get(trackDict['id'])

        # Without audio features there is nothing to compare the track on, so it is skipped
        if trackAudioFeaturesDict is None:
            continue

        record = {
            'id': trackDict['id'],
            'name': trackDict['name'],
            'popularity': trackDict['popularity'],
            'duration_ms': trackDict['duration_ms'],
            'explicit': trackDict['explicit'],
            # Only the first listed artist is kept
            'artists': trackDict['artists'][0]['name'],
            'id_artists': 'unknown',
            'release_date': 'unknown',
            # When the user played the track
            'user_listening_datetime': itemDict['played_at'],
        }
        for featureName in audioFeatureNames:
            record[featureName] = trackAudioFeaturesDict[featureName]
        records.append(record)

    # 'columns' fixes the column order and yields the full schema even when 'records' is empty
    spotifyUserDatadf = pd.DataFrame(records, columns = spotifyUserDatadf_columnNames)
    return spotifyUserDatadf


In [7]:
def createUserPersona(dataframe):
    
    """
    Function definition: 
        This function creates the user's persona which represents the user's current mood/listening style. 
        The user persona is created by summarizing the user's listening history. This summarization process
        also involves FEATURE ENGINEERING to add temporal information in order to give more weight to the
        songs listened to more recently. The dataframe passed in is not modified

    Arguments: 
        'dataframe' -- Dataframe containing pre-processed data from the user's Spotify account. It must
                       contain the column 'user_listening_dateonly' (date on which each track was played)
                       and the feature columns, laid out contiguously from 'danceability' to 'key_11'

    Returns:
        'userPersonadf' -- Dataframe with a single row (the 'user vector') and one column per feature

    Raises:
        ValueError -- if 'dataframe' has no rows (there is no listening history to summarize)
    """

    if len(dataframe) == 0:
        raise ValueError("Cannot create a user persona from an empty listening history")

    # Today's date in UTC, since the listening dates coming from Spotify are in UTC
    todays_date_utc = datetime.datetime.now(datetime.timezone.utc).date()

    # Number of whole days between today and the day each song was played. Going through pandas
    # datetimes and '.dt.days' avoids the 'timedelta64[D]' cast, which pandas 2.x rejects.
    # A play dated after 'today' (e.g. clock skew) is treated as played today; without the clip
    # it would produce a division by zero in the weight below.
    listeningDates = pd.to_datetime(dataframe['user_listening_dateonly'].reset_index(drop = True))
    daysSinceSongWasPlayed = (pd.Timestamp(todays_date_utc) - listeningDates).dt.days.clip(lower = 0)

    # Recency-bias/weight of each song: weight = 1/((a+1)^weight_factor); where "a" is the number of
    # days since the song was played
    weight_factor = 2
    trackWeights = 1.0 / (daysSinceSongWasPlayed + 1) ** weight_factor

    # Multiply every feature of a song/track by the weight of that song/track. The features are cast
    # to float so that One-Hot Encoded (boolean/integer) columns are weighted like the numerical ones.
    featureColumns = dataframe.loc[:,'danceability':'key_11'].reset_index(drop = True).astype(float)
    weightedDataframe = featureColumns.mul(trackWeights, axis = 0)

    # Final 'user vector': the weighted features summed over all tracks and divided by the number of
    # tracks, as 1 row with one column per feature
    userPersonadf = weightedDataframe.sum().div(len(dataframe)).to_frame().transpose()

    return userPersonadf.reset_index(drop = True)
    

In [8]:
def generateRecommendations(userPerona,spotifySongData,numberOfRecommendations = 10):
    
    
    """
    Function definition: 
        This function generates song recommendations by using a similarity metric called
        'cosine similarity'. Neither of the dataframes passed in is modified

    Arguments: 
        'userPerona' -- Dataframe with a singular row (user vector) representing user's persona. Its
                        columns must be in the same order as the feature columns of 'spotifySongData',
                        because the comparison is done on the raw values
        'spotifySongData' -- Dataframe containing pre-processed Spotify data from the kaggle dataset.
                        It must contain 'track_id', 'track_name' and the feature columns laid out
                        contiguously from 'danceability' to 'key_11'
        'numberOfRecommendations' -- Total number of recommendations to be produced. Default is 10

    Returns:
        'recommendations' -- Dataframe with the columns 'track_id' and 'track_name' holding the songs
                             most similar to the user persona, most similar first

    Raises:
        ValueError -- if 'userPerona' does not contain exactly one row
    """

    # 'cosine similarity' between every song in the dataset and the user persona vector. The result
    # is a matrix with one row per song and one column per persona row.
    cosineSimilarityMatrix = cosine_similarity(spotifySongData.loc[:,'danceability':'key_11'].values,
                                               userPerona.values)
    if cosineSimilarityMatrix.shape[1] != 1:
        raise ValueError("userPerona must contain exactly one row (the user vector), got %d"
                         % cosineSimilarityMatrix.shape[1])

    # The scores are attached to a separate two-column frame rather than to 'spotifySongData', so the
    # dataset does not pick up a 'cosine_similarity' column as a side effect. The score array is in
    # row order, so it lines up with the songs whatever their index is.
    scoredSongs = spotifySongData[['track_id','track_name']].copy()
    scoredSongs['cosine_similarity'] = cosineSimilarityMatrix[:, 0]

    # Only the best 'numberOfRecommendations' songs are needed; nlargest returns them ordered from
    # most to least similar without sorting the whole dataset
    recommendations = scoredSongs.nlargest(numberOfRecommendations, 'cosine_similarity')
    recommendations = recommendations[['track_id','track_name']]

    return recommendations.reset_index(drop = True)


## WORKING WITH KAGGLE'S SPOTIFY DATASET

### 1. Import the dataset
### 2. Perform data cleaning
### 3. Perform data pre-processing (One-Hot Encoding and Data Scaling/Normalization)



In [9]:
# Step 1: load the Kaggle tracks dataset from the local CSV file into a dataframe
spotifyTracksDatasetdf = importSpotifyKaggleDataset()

In [10]:
# Step 2: clean the Kaggle dataset
cleanedSpotifyTracksDatasetdf = cleanSpotifyData(
    dataframe = spotifyTracksDatasetdf,
    datatype = 'kaggle_data',
)


In [11]:
# Step 3: pre-process the cleaned Kaggle dataset and keep the scaler it returns,
# which is reused later for the user's data
processedSpotifyTracksDatasetdf, scaler = preProcessSpotifyData(
    dataframe = cleanedSpotifyTracksDatasetdf,
    datatype = 'kaggle_data',
)


## WORKING WITH THE USER'S SPOTIFY DATA

### 1. Connect to the user's Spotify account
### 2. Create a dataframe containing the user's Spotify listening history
### 3. Perform data cleaning
### 4. Perform data pre-processing (One-Hot Encoding and Data Scaling/Normalization)
### 5. Create user's persona (user vector) representing their current musical taste (mood)

In [12]:
# Step 1: connect to the user's Spotify account
# The credentials are read from environment variables so that no secret is stored in the notebook.
# An unset variable falls back to '' (the previous placeholder value).
spotifyClientID = os.environ.get('SPOTIPY_CLIENT_ID', '')
spotifySecret = os.environ.get('SPOTIPY_CLIENT_SECRET', '')
spotifyRedirectUri = os.environ.get('SPOTIPY_REDIRECT_URI', '')

# The next step reads the user's recently played tracks; Spotify only grants that to tokens
# carrying this scope, so an empty scope is not enough
scope = 'user-read-recently-played'

userSpotifyClient = connectToUserSpotify(spotifyClientID,spotifySecret,
                                         spotifyRedirectUri,scope)


In [13]:
# Step 2: build a dataframe from the user's Spotify listening history
spotifyUserDatadf = createSpotifyUserData(sp = userSpotifyClient)


In [14]:
# Step 3: clean the user's listening history
cleanedSpotifyUserDatadf = cleanSpotifyData(
    dataframe = spotifyUserDatadf,
    datatype = 'user_data',
)


In [15]:
# Step 4: pre-process the user's data with the scaler and the column schema
# obtained from the Kaggle dataset
processedSpotifyUserDatadf = preProcessSpotifyData(
    dataframe = cleanedSpotifyUserDatadf,
    datatype = 'user_data',
    baselinedf = processedSpotifyTracksDatasetdf,
    preDefinedScaler = scaler,
)

In [16]:
# Step 5: summarize the pre-processed listening history into the user's persona (user vector)
userPersonadf = createUserPersona(dataframe = processedSpotifyUserDatadf)


## GENERATING RECOMMENDATIONS
### 1. Perform pairwise comparison to compare user persona and songs from the Kaggle dataset by using 'cosine similarity' as the similarity metric

In [17]:
# Compare the user persona with the songs of the Kaggle dataset and ask for 50 recommendations
recommendations = generateRecommendations(
    userPerona = userPersonadf,
    spotifySongData = processedSpotifyTracksDatasetdf,
    numberOfRecommendations = 50,
)

In [18]:
# Display the recommendations dataframe (columns 'track_id' and 'track_name')
recommendations

Unnamed: 0,track_id,track_name
0,3RK1WwfiDHAKHw5N7HetGq,Love
1,1WBpKHfjwfG9PjOe0m8H9U,Erzurum Yaylasıyam
2,3lBA4KEHNesOCKYE9J4ntM,Loppuviikko
3,2I0IqsRziPqef8UIVoo9r6,La Gente Esta Borracha
4,2fOjendzUzepHwR9QrVyYF,La Respuesta
5,6DX3A1hDedL5EfmF9PrEDt,Everywhere I Go
6,4Sy2utZ3u5mWnmu6LZHQkl,Cehennemin Dibi
7,0XWE43513Bldnr8DOqNiOO,Nagy Buli Lesz Minálunk
8,0TyH4jFn7eeSJM4gfkql7h,Super Riddim Internacional
9,7v22aNI4265GCu0WNXG6MV,Que la violence stoppe
