In [2]:
# Basic Libraries
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Data Extraction

In [3]:
musicdata = pd.read_csv('Spotify Dataset.csv')
musicdata = musicdata.drop(["Unnamed: 0", "track_id"], axis = 1) 
musicdata['songlists'] = musicdata.apply(lambda row: row['artists'] + row['track_name'], axis = 1)
musicdata = musicdata.drop_duplicates('songlists')
musicdata[['artists','track_name', 'album_name']]
print("Are all track_names in the database unique? ",len(pd.unique(musicdata.songlists))==len(musicdata))

Are all track_names in the database unique?  True


In [4]:
data = musicdata.sort_values(by=['popularity'], ascending = False).head(50000) #First 50K popular songs

In [5]:
genre_vectors = CountVectorizer()
genre_vectors.fit(data['track_genre'])

CountVectorizer()

In [6]:
def recommended_songs(song_name):

    #If the song could not be found within the dataset
    if data[data['track_name'] == song_name].shape[0] == 0:
        
        print('This song is not available. Here are some other popular songs that you may like:\n')
        suggested_music_list = data.nlargest(100, 'popularity')
        suggested_music = suggested_music_list.sample(n = 5) #any 5 samples

        return display(suggested_music[['artists', 'track_name', 'album_name']])
    
    #If the song could be found within the dataset
    song_input_array1 = genre_vectors.transform(data[data['track_name'] == song_name]['track_genre']).toarray() 
    #for text comparison
    song_input_array2 = data[data['track_name']==song_name].select_dtypes(include = np.number).to_numpy() 
    #for other numeric factors comparison
   
    similarity_index = []
    for index, row in data.iterrows():
        song = row['track_name']
     
        # Getting vector for existing songs in the playlist.
        existing_song_array1 = genre_vectors.transform(data[data['track_name'] == song]['track_genre']).toarray()
        #for text comparison
        existing_song_array2 = data[data['track_name'] == song].select_dtypes(include = np.number).to_numpy()
        #for other numeric factors comparison
 
        # Calculating similarities for text as well as numeric features
        text_similarity = cosine_similarity(song_input_array1, existing_song_array1)[0][0]
        feature_similarity = cosine_similarity(song_input_array2, existing_song_array2)[0][0]
        
        #adding the results into the array created
        similarity_index.append(text_similarity + feature_similarity)
 
    data['similarity'] = similarity_index
    
    data.sort_values(by=['similarity'], ascending = [False], inplace=True)
   
    display(data[['artists', 'track_name', 'album_name']][1:6]) 
    #Ignore song 1 as it is just the input song itself

In [7]:
recommended_songs('Love Someone')

Unnamed: 0,artists,track_name,album_name
465,Joshua Hyslop,Wells,Wells
208,Aron Wright,You Were Supposed to Be Different,You Were Supposed to Be Different
109,Tyler Ward,How To Lose a Girl,Songs From Nashville
508,Greg Laswell,And Then You,Three Flights From Alto Nido
960,Bootstraps,Whenever You're Around,Demo Love


In [8]:
recommended_songs('Efecto')

Unnamed: 0,artists,track_name,album_name
67353,Daddy Yankee,Gasolina,Barrio Fino (Bonus Track Version)
67359,Bad Bunny,Tití Me Preguntó,Un Verano Sin Ti
67801,Rauw Alejandro;Lyanno;Brray,LOKERA,LOKERA
67806,Ozuna,Mañana,OzuTochi
67552,Rvssian;Rauw Alejandro;Chris Brown,Nostálgico,Nostálgico


In [9]:
recommended_songs('Love me like you do')

This song is not available. Here are some other popular songs that you may like:



Unnamed: 0,artists,track_name,album_name
38003,Arctic Monkeys,505,Favourite Worst Nightmare
20001,Sam Smith;Kim Petras,Unholy (feat. Kim Petras),Unholy (feat. Kim Petras)
20017,Charlie Puth;Jung Kook;BTS,Left and Right (Feat. Jung Kook of BTS),Left and Right (Feat. Jung Kook of BTS)
81150,The Weeknd,Call Out My Name,"My Dear Melancholy,"
15013,Tom Odell,Another Love,Long Way Down (Deluxe)
