In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction import text
from sklearn.metrics.pairwise import cosine_similarity

Load the Data in

In [2]:
data = pd.read_csv("spotify_dataset.csv")
print(data.head())

   Index  Highest Charting Position  Number of Times Charted  \
0      1                          1                        8   
1      2                          2                        3   
2      3                          1                       11   
3      4                          3                        5   
4      5                          5                        1   

  Week of Highest Charting                          Song Name     Streams  \
0   2021-07-23--2021-07-30                            Beggin'  48,633,449   
1   2021-07-23--2021-07-30          STAY (with Justin Bieber)  47,248,719   
2   2021-06-25--2021-07-02                           good 4 u  40,162,559   
3   2021-07-02--2021-07-09                         Bad Habits  37,799,456   
4   2021-07-23--2021-07-30  INDUSTRY BABY (feat. Jack Harlow)  33,948,454   

           Artist Artist Followers                 Song ID  \
0        Måneskin          3377762  3Wrjm47oTz2sjIgck11l5e   
1   The Kid LAROI          2

Check for NAs

In [3]:
print(data.isnull().sum())

Index                        0
Highest Charting Position    0
Number of Times Charted      0
Week of Highest Charting     0
Song Name                    0
Streams                      0
Artist                       0
Artist Followers             0
Song ID                      0
Genre                        0
Release Date                 0
Weeks Charted                0
Popularity                   0
Danceability                 0
Energy                       0
Loudness                     0
Speechiness                  0
Acousticness                 0
Liveness                     0
Tempo                        0
Duration (ms)                0
Valence                      0
Chord                        0
dtype: int64


Find relevant column names and drop others

In [4]:
print(data.columns)

Index(['Index', 'Highest Charting Position', 'Number of Times Charted',
       'Week of Highest Charting', 'Song Name', 'Streams', 'Artist',
       'Artist Followers', 'Song ID', 'Genre', 'Release Date', 'Weeks Charted',
       'Popularity', 'Danceability', 'Energy', 'Loudness', 'Speechiness',
       'Acousticness', 'Liveness', 'Tempo', 'Duration (ms)', 'Valence',
       'Chord'],
      dtype='object')


In [5]:
data = data[["Song Name", "Streams", "Artist", "Genre"]]
print(data.head())

                           Song Name     Streams          Artist  \
0                            Beggin'  48,633,449        Måneskin   
1          STAY (with Justin Bieber)  47,248,719   The Kid LAROI   
2                           good 4 u  40,162,559  Olivia Rodrigo   
3                         Bad Habits  37,799,456      Ed Sheeran   
4  INDUSTRY BABY (feat. Jack Harlow)  33,948,454       Lil Nas X   

                                    Genre  
0  ['indie rock italiano', 'italian pop']  
1                  ['australian hip hop']  
2                                 ['pop']  
3                       ['pop', 'uk pop']  
4           ['lgbtq+ hip hop', 'pop rap']  


Here I will use Genre to determine my recommendation and use cosine similarity

In [6]:
feature = data["Genre"].tolist()
tfidf = text.TfidfVectorizer(input=feature, stop_words="english")
tfidf_matrix = tfidf.fit_transform(feature)
similarity = cosine_similarity(tfidf_matrix)


Use song name as an index to later offer recomendations

In [9]:
indices = pd.Series(data.index,
index=data["Song Name"]).drop_duplicates()

Create recomendation function

In [10]:
def spotify_recommendation(song, similarity = similarity):
    index = indices[song]
    similarity_scores = list(enumerate(similarity[index]))
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    similarity_scores = similarity_scores[0:10]
    songindices = [i[0] for i in similarity_scores]
    return print(data["Song Name"].iloc[songindices], data["Artist"].iloc[songindices])
  

print(spotify_recommendation("Wasting Time ( feat. Drake )"))

237                          Wasting Time ( feat. Drake )
571                    Gravity (feat. Tyler, The Creator)
1004                                          Hit My Line
927                            Gifted (feat. Roddy Ricch)
822                                            THE SCOTTS
1040                                               Lalala
1148                                            Daechwita
104             Lemonade (feat. Gunna, Don Toliver & NAV)
219     His & Hers (feat. Don Toliver, Lil Uzi Vert & ...
858     Lemonade (Feat. Roddy Ricch & Don Toliver) [Re...
Name: Song Name, dtype: object 237                           Brent Faiyaz
571                  Brent Faiyaz, DJ Dahi
1004                                 Logic
927                                 Cordae
822     THE SCOTTS, Travis Scott, Kid Cudi
1040                            Y2K, bbno$
1148                               Agust D
104                         Internet Money
219                         Internet Money
85