In [None]:
# Step 1: Import libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [None]:
# Step 2: Load dataset
# Only the first 50k rows are used. Passing nrows makes the parser stop there
# instead of reading the whole CSV into memory and discarding the remainder
# with .head().
df = pd.read_csv("data/spotify.csv", nrows=50000)
print("Dataset shape:", df.shape)
df.head()


Dataset shape: (50000, 18)


Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,Movie,Henri Salvador,C'est beau de faire un Show,0BRjO6ga9RKCKjfDqeFgWV,0,0.611,0.389,99373,0.91,0.0,C#,0.346,-1.828,Major,0.0525,166.969,4/4,0.814
1,Movie,Martin & les fées,Perdu d'avance (par Gad Elmaleh),0BjC1NfoEOOusryehmNudP,1,0.246,0.59,137373,0.737,0.0,F#,0.151,-5.559,Minor,0.0868,174.003,4/4,0.816
2,Movie,Joseph Williams,Don't Let Me Be Lonely Tonight,0CoSDzoNIKCRs124s9uTVy,3,0.952,0.663,170267,0.131,0.0,C,0.103,-13.879,Minor,0.0362,99.488,5/4,0.368
3,Movie,Henri Salvador,Dis-moi Monsieur Gordon Cooper,0Gc6TVm52BwZD07Ki6tIvf,0,0.703,0.24,152427,0.326,0.0,C#,0.0985,-12.178,Major,0.0395,171.758,4/4,0.227
4,Movie,Fabien Nataf,Ouverture,0IuslXpMROHdEPvSl1fTQK,4,0.95,0.331,82625,0.225,0.123,F,0.202,-21.15,Major,0.0456,140.576,4/4,0.39


In [None]:
# Step 3: Select features and scale
# Audio attributes used to describe each track for the similarity search.
feature_columns = ['danceability', 'energy', 'valence', 'tempo', 'loudness']
features = df.loc[:, feature_columns]

# Standardise each column so that no single attribute dominates the
# similarity computed from these vectors.
scaler = StandardScaler()
scaled_features = scaler.fit(features).transform(features)

# Preview the first five scaled rows.
scaled_features[:5]

array([[-1.13463042,  1.05225668,  1.45278357,  1.49549528,  1.36207074],
       [ 0.18734379,  0.30748154,  1.46113726,  1.73423663,  0.45843625],
       [ 0.66746378, -2.30138398, -0.41008902, -0.79488077, -1.55663718],
       [-2.11460135, -1.46189755, -0.99902408,  1.65803897, -1.14466123],
       [-1.51609561, -1.89670847, -0.31819845,  0.59968913, -3.31764666]])

In [None]:
# Step 4: Recommendation function with error handling
def recommend(song_name, n=5):
    """Recommend the tracks whose audio features are closest to a given song.

    Similarity is the cosine similarity between rows of the module-level
    ``scaled_features`` array; track metadata is read from the module-level
    ``df``. Row *i* of ``scaled_features`` must describe row *i* of ``df``.

    Parameters
    ----------
    song_name : str
        Track title to look up. Matching is case-insensitive and exact; when
        several rows share the title, the first one is used as the query.
    n : int, default 5
        Maximum number of recommendations to return. Values below 1 yield an
        empty result; values larger than the dataset are capped.

    Returns
    -------
    pandas.DataFrame or None
        Up to ``n`` rows with the columns ``track_name`` and ``artist_name``,
        most similar first. ``None`` when no track has exactly that title; in
        that case a message is printed, including up to five titles that
        contain the query as a substring when any exist.
    """
    lower_songs = df['track_name'].str.lower()
    song_name_lower = song_name.lower()

    # Work with row *positions*, not index labels: scaled_features is a plain
    # NumPy array and the result is selected with .iloc, so a non-default
    # DataFrame index would otherwise point at the wrong rows.
    match_positions = np.flatnonzero((lower_songs == song_name_lower).to_numpy())

    if match_positions.size == 0:
        # regex=False: the query is literal text, so titles containing regex
        # metacharacters such as "(" or "+" neither raise nor mis-match.
        # na=False: rows with a missing title count as non-matches instead of
        # putting NaN into the boolean mask.
        partial_mask = lower_songs.str.contains(song_name_lower, regex=False, na=False)
        suggestions = df[partial_mask]
        if not suggestions.empty:
            print("Song not found exactly. Did you mean:")
            print(suggestions[['track_name', 'artist_name']].head(5))
        else:
            print(f"Song '{song_name}' not found in the dataset.")
        return None

    idx = match_positions[0]
    song_vector = scaled_features[idx].reshape(1, -1)
    sim_scores = cosine_similarity(song_vector, scaled_features).flatten()
    # Cosine similarity can legitimately equal -1, so use -inf to guarantee
    # the query row sorts strictly last.
    sim_scores[idx] = -np.inf

    # Never return more rows than there are *other* tracks, and never a
    # negative count.
    n = max(0, min(n, len(sim_scores) - 1))
    # Reverse first, then take the head: unlike [-n:], this returns nothing
    # (rather than every row) when n == 0.
    top_idx = np.argsort(sim_scores)[::-1][:n]
    return df.iloc[top_idx][['track_name', 'artist_name']]


In [None]:
# Step 5: Test recommendations
test_songs = ["Havana"]
n_recs = 5  # number of recommendations requested per song

for song in test_songs:
    recs = recommend(song, n=n_recs)
    # recommend() prints its own message and returns None when the title is
    # not in the dataset, so the "Top N" header is only shown when there are
    # results to go with it.
    if recs is not None:
        print(f"\nTop {len(recs)} songs similar to '{song}':")
        display(recs)


Top 5 songs similar to 'Havana':


Unnamed: 0,track_name,artist_name
25926,Feel Alright,Ookay
18514,All Hands on Deck,Tinashe
14799,Psychopath - Recorded at Spotify Studios NYC,Charlotte Lawrence
17964,All Hands On Deck REMIX,Tinashe
16607,You Can Buy Everything,SoMo


In [None]:
# Step 6 (Optional): Check if the song is present in the dataset
# song_name = "No Lie"

#if song_name.lower() in df['track_name'].str.lower().values:
#    print(recommend(song_name))
#else:
#    print(f"'{song_name}' not found in the dataset. Try another song.")


               track_name     artist_name
24115  Jugg (feat. Bbno$)        josh pan
15911               Ghost  Ella Henderson
24439                  XL      Bro Safari
12199           Follow Me         Transit
16492          Ciao Adios      Anne-Marie
