In [73]:
# Basic Libraries
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Data Extraction

In [66]:
musicdata = pd.read_csv('Spotify Dataset.csv')
musicdata = musicdata.drop(["Unnamed: 0", "track_id"], axis = 1) 
musicdata['songlists'] = musicdata.apply(lambda row: row['artists'] + row['track_name'], axis = 1)
musicdata = musicdata.drop_duplicates('songlists')
musicdata[['artists','track_name', 'album_name']]
print("Are all track_names in the database unique? ",len(pd.unique(musicdata.songlists))==len(musicdata))

Are all track_names in the database unique?  True


In [67]:
data = musicdata.sort_values(by=['popularity'], ascending = False).head(10000) #First 10K popular songs

In [68]:
genre_vectors = CountVectorizer()
genre_vectors.fit(data['track_genre'])

CountVectorizer()

In [69]:
def recommended_songs_10k(song_name):

    #If the song could not be found within the dataset
    if data[data['track_name'] == song_name].shape[0] == 0:
        
        print('This song is not available. Here are some other popular songs that you may like:\n')
        suggested_music_list = data.nlargest(100, 'popularity')
        suggested_music = suggested_music_list.sample(n = 5) #any 5 samples

        return display(suggested_music[['artists', 'track_name', 'album_name']])
    
    #If the song could be found within the dataset
    song_input_array1 = genre_vectors.transform(data[data['track_name'] == song_name]['track_genre']).toarray() 
    #for text comparison
    song_input_array2 = data[data['track_name']==song_name].select_dtypes(include = np.number).to_numpy() 
    #for other numeric factors comparison
   
    similarity_index = []
    for index, row in data.iterrows():
        song = row['track_name']
     
        # Getting vector for existing songs in the playlist.
        existing_song_array1 = genre_vectors.transform(data[data['track_name'] == song]['track_genre']).toarray()
        #for text comparison
        existing_song_array2 = data[data['track_name'] == song].select_dtypes(include = np.number).to_numpy()
        #for other numeric factors comparison
 
        # Calculating similarities for text as well as numeric features
        text_similarity = cosine_similarity(song_input_array1, existing_song_array1)[0][0]
        feature_similarity = cosine_similarity(song_input_array2, existing_song_array2)[0][0]
        
        #adding the results into the array created
        similarity_index.append(text_similarity + feature_similarity)
 
    data['similarity'] = similarity_index
    
    data.sort_values(by=['similarity'], ascending = [False], inplace=True)
   
    display(data[['artists', 'track_name', 'album_name']][1:6]) 
    #Ignore song 1 as it is just the input song itself

In [70]:
recommended_songs_10k('Love Someone')

Unnamed: 0,artists,track_name,album_name
68,Andrew Belle,In My Veins - Feat. Erin Mccarley,In My Veins (Feat. Erin Mccarley)
822,Aimyon,君はロックを聴かない,青春のエキサイトメント
176,Brandi Carlile,The Story,The Story
126,Howie Day,Collide - Acoustic Version,Stop All the World Now (Special Edition)
55,Andrew Belle,The Enemy,Black Bear


In [71]:
recommended_songs_10k('Efecto')

Unnamed: 0,artists,track_name,album_name
67353,Daddy Yankee,Gasolina,Barrio Fino (Bonus Track Version)
67359,Bad Bunny,Tití Me Preguntó,Un Verano Sin Ti
67801,Rauw Alejandro;Lyanno;Brray,LOKERA,LOKERA
67806,Ozuna,Mañana,OzuTochi
67552,Rvssian;Rauw Alejandro;Chris Brown,Nostálgico,Nostálgico


In [72]:
recommended_songs_10k('Love me like you do')

This song is not available. Here are some other popular songs that you may like:



Unnamed: 0,artists,track_name,album_name
65053,IVE,After LIKE,After LIKE
81101,The Weeknd;Gesaffelstein,I Was Never There,"My Dear Melancholy,"
20910,Bruno Mars,Locked out of Heaven,Unorthodox Jukebox
34014,Seafret,Atlantis,Tell Me It's Real (Expanded Edition)
68663,Chris Jedi;Anuel AA;Chencho Corleone;Ñengo Flow,La Llevo Al Cielo (Ft. Ñengo Flow),La Llevo Al Cielo (Ft. Ñengo Flow)
