In [1]:
import numpy as np
import pandas as pd
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# loading the data from the csv file to apandas dataframe
songs_data = pd.read_csv('data\SpotifyFeatures.csv')

In [3]:
# printing the first 5 rows of the dataframe
songs_data.head()

Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,Movie,Henri Salvador,C'est beau de faire un Show,0BRjO6ga9RKCKjfDqeFgWV,0,0.611,0.389,99373,0.91,0.0,C#,0.346,-1.828,Major,0.0525,166.969,4/4,0.814
1,Movie,Martin & les fées,Perdu d'avance (par Gad Elmaleh),0BjC1NfoEOOusryehmNudP,1,0.246,0.59,137373,0.737,0.0,F#,0.151,-5.559,Minor,0.0868,174.003,4/4,0.816
2,Movie,Joseph Williams,Don't Let Me Be Lonely Tonight,0CoSDzoNIKCRs124s9uTVy,3,0.952,0.663,170267,0.131,0.0,C,0.103,-13.879,Minor,0.0362,99.488,5/4,0.368
3,Movie,Henri Salvador,Dis-moi Monsieur Gordon Cooper,0Gc6TVm52BwZD07Ki6tIvf,0,0.703,0.24,152427,0.326,0.0,C#,0.0985,-12.178,Major,0.0395,171.758,4/4,0.227
4,Movie,Fabien Nataf,Ouverture,0IuslXpMROHdEPvSl1fTQK,4,0.95,0.331,82625,0.225,0.123,F,0.202,-21.15,Major,0.0456,140.576,4/4,0.39


In [4]:
# number of rows and columns in the data frame
songs_data.shape

(232725, 18)

In [5]:
# selecting the relevant features for recommendation
selected_features = ['track_id', 'popularity', 'acousticness', 'danceability', 'duration_ms']
print(selected_features)

['track_id', 'popularity', 'acousticness', 'danceability', 'duration_ms']


In [6]:
# replaced null values with null string
for feature in selected_features:
    songs_data[feature] = songs_data[feature].fillna('')

In [7]:
#combining features to generate vectores
combined_features = songs_data['track_id']+' '+ str(songs_data['popularity'])+' '+ str(songs_data['acousticness'])+' '+ str(songs_data['danceability'])+' '+str(songs_data['duration_ms'])

In [8]:
print(combined_features)

0         0BRjO6ga9RKCKjfDqeFgWV 0          0\n1        ...
1         0BjC1NfoEOOusryehmNudP 0          0\n1        ...
2         0CoSDzoNIKCRs124s9uTVy 0          0\n1        ...
3         0Gc6TVm52BwZD07Ki6tIvf 0          0\n1        ...
4         0IuslXpMROHdEPvSl1fTQK 0          0\n1        ...
                                ...                        
232720    2XGLdVl7lGeq8ksM6Al7jT 0          0\n1        ...
232721    1qWZdkBl4UVPj9lK6HuuFM 0          0\n1        ...
232722    2ziWXUmQLrXTiYjCg2fZ2t 0          0\n1        ...
232723    6EFsue2YbIG4Qkq8Zr9Rir 0          0\n1        ...
232724    34XO9RwPMKjbvRry54QzWn 0          0\n1        ...
Name: track_id, Length: 232725, dtype: object


In [9]:
# converting the text data to feature vectors
vectorizer = TfidfVectorizer()

In [10]:
feature_vectors = vectorizer.fit_transform(combined_features)
print(feature_vectors)

  (0, 176818)	0.05334651385856258
  (0, 68830)	0.05334651385856258
  (0, 46081)	0.05334651385856258
  (0, 24759)	0.05334651385856258
  (0, 48298)	0.05334651385856258
  (0, 68850)	0.05334651385856258
  (0, 176810)	0.05334651385856258
  (0, 24355)	0.05334651385856258
  (0, 25103)	0.05334651385856258
  (0, 23669)	0.05334651385856258
  (0, 176814)	0.05334651385856258
  (0, 176816)	0.05334651385856258
  (0, 160680)	0.05334651385856258
  (0, 160313)	0.05334651385856258
  (0, 114069)	0.05334651385856258
  (0, 161786)	0.05334651385856258
  (0, 139307)	0.05334651385856258
  (0, 69160)	0.05334651385856258
  (0, 46833)	0.05334651385856258
  (0, 138558)	0.05334651385856258
  (0, 116984)	0.05334651385856258
  (0, 71076)	0.05334651385856258
  (0, 176819)	0.10669302771712516
  (0, 176815)	0.05334651385856258
  (0, 3331)	0.05334651385856258
  :	:
  (232724, 47556)	0.05334651385856258
  (232724, 176811)	0.05334651385856258
  (232724, 1100)	0.05334651385856258
  (232724, 23)	0.05334651385856258
  (23272

In [None]:
# similarity scores using cosine similarity
similarity = cosine_similarity(feature_vectors)

In [None]:
print(similarity)

In [None]:
print(similarity.shape)

In [18]:
song_name = input(' Enter a song name : ')

 Enter a song name : Somewhere I Belong


In [19]:
list_of_all_titles = songs_data['track_name'].tolist()
print(list_of_all_titles)



In [20]:
find_close_match = difflib.get_close_matches(song_name, list_of_all_titles)
print(find_close_match)

['Somewhere I Belong', 'Somewhere I Belong', 'Somewhere I Belong']


In [21]:
close_match = find_close_match[0]
print(close_match)

Somewhere I Belong


In [22]:
popularity_of_the_song = songs_data[songs_data.track_name == close_match]['popularity'].values[0]
print(popularity_of_the_song)

61


In [23]:
similarity_score = list(enumerate(similarity[popularity_of_the_song]))
print(similarity_score)

NameError: name 'similarity' is not defined

In [None]:
# sorting the movies based on their similarity score

sorted_similar_songs = sorted(similarity_score, key = lambda x:x[1], reverse = True) 
print(sorted_similar_songs)