## Imports

In [75]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import euclidean_distances

## Dataframe Loading and Song Selection

In [76]:
# Location of the CSV file that holds the song data
songs_csv_path = 'spotify_songs.csv'

# Read the file into the working dataframe
df = pd.read_csv(songs_csv_path)

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo
0,0.0594,1921,0.982,"['Sergei Rachmaninoff', 'James Levine', 'Berli...",0.279,831667,0.211,0,4BJqT0PrAfrxzMOxytFOIz,0.878,10,0.665,-20.096,1,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...",4,1921,0.0366,80.954
1,0.963,1921,0.732,['Dennis Day'],0.819,180533,0.341,0,7xPhfUan2yNtyFG0cUWkt8,0.0,7,0.16,-12.441,1,Clancy Lowered the Boom,5,1921,0.415,60.936
2,0.0394,1921,0.961,['KHP Kridhamardawa Karaton Ngayogyakarta Hadi...,0.328,500062,0.166,0,1o6I8BglA6ylDMrIELygv1,0.913,3,0.101,-14.85,1,Gati Bali,5,1921,0.0339,110.339
3,0.165,1921,0.967,['Frank Parker'],0.275,210000,0.309,0,3ftBPsC5vPBKxYSee08FDH,2.8e-05,5,0.381,-9.316,1,Danny Boy,3,1921,0.0354,100.109
4,0.253,1921,0.957,['Phil Regan'],0.418,166693,0.193,0,4d6HGyGT8e121BsdKmw9v6,2e-06,3,0.229,-10.096,1,When Irish Eyes Are Smiling,2,1921,0.038,101.665


In [77]:
# Normalise every title in the dataset once (lower-case, surrounding whitespace
# stripped). Building this set up front means each retry below is a cheap set
# lookup instead of another full pass over the 'name' column.
known_titles = set(df['name'].str.lower().str.strip().dropna())

# Enter a song title
song_title = input("Enter a song title to find similar songs: ")

# Normalize the input the same way as the dataset titles
target_song_title = song_title.lower().strip()

# Keep asking until the normalized title matches at least one song in the dataset
while target_song_title not in known_titles:
    print("- - - - - ")
    print(f"Error: Song '{target_song_title}' not found in the dataset.")
    target_song_title = input("Enter a song title to find similar songs: ")
    target_song_title = target_song_title.lower().strip()

Enter a song title to find similar songs: 
- - - - - 
Error: Song '' not found in the dataset.
Enter a song title to find similar songs: danny boy


In [78]:
# Retrieve all distinct 'artists' entries for the chosen song title and sort them alphabetically
title_matches = df['name'].str.lower().str.strip() == target_song_title
available_artists = df.loc[title_matches, 'artists'].unique()
available_artists.sort(axis=0)

print("Available artists:")
for i, artist in enumerate(available_artists):
    print(f"{i}. {artist}")

# Prompt until the user enters the number of one of the listed artists.
while True:
    artist_choice = input("Select the artist (enter the corresponding number): ")

    # Only a failed int() conversion is treated as "invalid input". A bare
    # `except:` would also swallow KeyboardInterrupt and make this loop
    # impossible to interrupt.
    try:
        artist_choice = int(artist_choice)
    except ValueError:
        artist_choice = None

    if artist_choice is not None and 0 <= artist_choice < len(available_artists):
        break

    # Invalid entry: explain, show the menu again, and re-prompt
    print("- - - - - ")
    print("Invalid choice. Please select a valid artist.")

    for i, artist in enumerate(available_artists):
        print(f"{i}. {artist}")

# Assign artist choice to variable
target_artist = available_artists[artist_choice]

Available artists:
0. ['Andy Williams']
1. ['Ben Webster']
2. ['Bill Evans', 'Shelly Manne']
3. ['Bill Evans']
4. ['Celtic Woman']
5. ['Conway Twitty']
6. ['Frank Parker']
7. ['Jackie Wilson']
8. ['Johnny Cash']
9. ['Judy Garland']
10. ['Patti Page']
11. ['Ray Price']
12. ['Traditional', 'Mario Lanza']
Select the artist (enter the corresponding number): 8


In [79]:
# Retrieve all years in which the chosen artist has a song with this title, sorted ascending
title_matches = df['name'].str.lower().str.strip() == target_song_title
artist_matches = df['artists'] == target_artist
available_years = df.loc[title_matches & artist_matches, 'year'].unique()
available_years.sort(axis=0)

print(f"Available song years for {target_artist}:")
for i, year in enumerate(available_years):
    print(f"{i}. {year}")

# Prompt until the user enters the number of one of the listed years.
while True:
    year_choice = input("Select the song year (enter the corresponding number): ")

    # Only a failed int() conversion is treated as "invalid input". A bare
    # `except:` would also swallow KeyboardInterrupt and make this loop
    # impossible to interrupt.
    try:
        year_choice = int(year_choice)
    except ValueError:
        year_choice = None

    if year_choice is not None and 0 <= year_choice < len(available_years):
        break

    # Invalid entry: explain, show the menu again, and re-prompt
    print("- - - - - ")
    print(f"Invalid choice. Please select a valid song year for {target_artist}.")
    for i, year in enumerate(available_years):
        print(f"{i}. {year}")

# Assign year choice to variable
target_year = available_years[year_choice]

Available song years for ['Johnny Cash']:
0. 1965
1. 2002
Select the song year (enter the corresponding number): 1


In [80]:
# Build one boolean mask per selection criterion ...
title_matches = df['name'].str.lower().str.strip() == target_song_title
artist_matches = df['artists'] == target_artist
year_matches = df['year'] == target_year

# ... and keep only the rows that satisfy all three
target_song = df[title_matches & artist_matches & year_matches]

In [81]:
# Show the matched row(s) so the selection can be checked by eye
target_song

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo
121411,0.241,2002,0.983,['Johnny Cash'],0.334,198507,0.0595,0,5o4SqGekEfvdkNuOVx5d3S,8.7e-05,9,0.124,-11.069,1,Danny Boy,48,2002-01-01,0.0416,177.07


In [82]:
# Retrieve the index of the selected song.
# This is a pandas Index object holding the row label(s) of the match in df,
# not a bare integer.
target_song_index = target_song.index
target_song_index

Int64Index([121411], dtype='int64')

## Preprocessing

In [83]:
# Columns that must not be scaled as features: identifier/label columns, plus
# the 'distance' result column that a later cell inserts into df (only present
# when this cell is re-run after that one).
non_feature_columns = ['artists', 'id', 'release_date', 'distance']

# Index dataframe by song names and drop unnecessary columns.
# Both steps are guarded so the cell can be re-run: once 'name' is the index
# and the columns are gone, an unguarded set_index/drop would raise KeyError.
if df.index.name != 'name':
    df = df.set_index('name')
df = df.drop(columns=[col for col in non_feature_columns if col in df.columns])

# Standardize all remaining features and wrap the result in a dataframe that
# keeps the same index and column labels as df
scaler = StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df), index=df.index, columns=df.columns)

df_scaled.head()

Unnamed: 0_level_0,valence,year,acousticness,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,popularity,speechiness,tempo
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
"Piano Concerto No. 3 in D Minor, Op. 30: III. Finale. Alla breve",-1.782825,-2.15247,1.276187,-1.467013,4.763146,-1.013988,-0.303955,2.268102,1.365588,2.626719,-1.514237,0.643912,-1.256808,-0.379706,-1.169307
Clancy Lowered the Boom,1.650688,-2.15247,0.611347,1.598779,-0.399747,-0.52827,-0.303955,-0.532771,0.512123,-0.262229,-0.170766,0.643912,-1.210993,1.945481,-1.82118
Gati Bali,-1.858821,-2.15247,1.22034,-1.18882,2.133824,-1.182122,-0.303955,2.379754,-0.62583,-0.599749,-0.593551,0.643912,-1.210993,-0.396297,-0.212404
Danny Boy,-1.381564,-2.15247,1.236296,-1.489722,-0.166101,-0.647832,-0.303955,-0.532682,-0.056853,1.002043,0.37768,0.643912,-1.302624,-0.38708,-0.545537
When Irish Eyes Are Smiling,-1.04718,-2.15247,1.209703,-0.677855,-0.509485,-1.081242,-0.303955,-0.532765,-0.62583,0.132499,0.240788,0.643912,-1.34844,-0.371104,-0.494867


In [84]:
# Look up the scaled feature row(s) of the selected song.
# NOTE(review): the values in target_song_index are row labels taken from the
# originally loaded frame and are used here as integer positions — this
# presumes that frame had a default 0..n-1 index; confirm.
row_positions = target_song_index.to_numpy()
target_song = df_scaled.iloc[row_positions]
target_song

Unnamed: 0_level_0,valence,year,acousticness,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,popularity,speechiness,tempo
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Danny Boy,-1.092778,0.972798,1.278846,-1.154756,-0.25723,-1.580037,-0.303955,-0.532492,1.0811,-0.468173,0.070024,0.643912,0.759085,-0.348982,1.960647


## Calculate Euclidean Distances

In [85]:
# Euclidean distance from the target song (X) to every row of the scaled
# dataframe (Y), computed in scaled feature space
distances = euclidean_distances(X=target_song, Y=df_scaled)

In [86]:
# Inspect the raw result: a 2-D array with one row of distances per target row
distances

array([[8.35101421, 7.09366253, 6.05740865, ..., 6.27223438, 7.57691089,
        6.78702779]])

In [87]:
# Number of rows in the distance matrix (one per target row)
distances.shape[0]

1

In [88]:
# Insert calculated distances from the target song into the dataframe at column position 0.
# distances holds one row per target row; the first row is the one used here.
df_distance_col = distances[0]

# DataFrame.insert raises ValueError when the column already exists, so drop
# any 'distance' column left over from a previous run of this cell first.
if 'distance' in df.columns:
    df = df.drop(columns='distance')
df.insert(0, 'distance', df_distance_col)
df

Unnamed: 0_level_0,distance,valence,year,acousticness,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,popularity,speechiness,tempo
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
"Piano Concerto No. 3 in D Minor, Op. 30: III. Finale. Alla breve",8.351014,0.0594,1921,0.98200,0.279,831667,0.211,0,0.878000,10,0.6650,-20.096,1,4,0.0366,80.954
Clancy Lowered the Boom,7.093663,0.9630,1921,0.73200,0.819,180533,0.341,0,0.000000,7,0.1600,-12.441,1,5,0.4150,60.936
Gati Bali,6.057409,0.0394,1921,0.96100,0.328,500062,0.166,0,0.913000,3,0.1010,-14.850,1,5,0.0339,110.339
Danny Boy,4.992615,0.1650,1921,0.96700,0.275,210000,0.309,0,0.000028,5,0.3810,-9.316,1,3,0.0354,100.109
When Irish Eyes Are Smiling,4.908085,0.2530,1921,0.95700,0.418,166693,0.193,0,0.000002,3,0.2290,-10.096,1,2,0.0380,101.665
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
China,5.661983,0.6080,2020,0.08460,0.786,301714,0.808,0,0.000289,7,0.0822,-3.702,1,72,0.0881,105.029
Halloweenie III: Seven Days,4.842598,0.7340,2020,0.20600,0.717,150654,0.753,0,0.000000,7,0.1010,-6.020,1,68,0.0605,137.936
AYA,6.272234,0.6370,2020,0.10100,0.634,211280,0.858,0,0.000009,4,0.2580,-2.226,0,76,0.0809,91.688
Darkness,7.576911,0.1950,2020,0.00998,0.671,337147,0.623,1,0.000008,2,0.6430,-7.161,1,70,0.3080,75.055


## Sort Dataframe by Distance to Selected Song

In [89]:
# Order the rows from smallest to largest distance to the selected song
df_sorted = df.sort_values('distance', ascending=True)

# Preview the five closest rows
df_sorted.head(5)

Unnamed: 0_level_0,distance,valence,year,acousticness,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,popularity,speechiness,tempo
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Danny Boy,0.0,0.241,2002,0.983,0.334,198507,0.0595,0,8.7e-05,9,0.124,-11.069,1,48,0.0416,177.07
How Great Thou Art,0.793372,0.174,2006,0.853,0.348,212827,0.204,0,2e-06,8,0.139,-10.576,1,47,0.0292,174.519
Gimme Gimme,0.901353,0.21,2002,0.886,0.328,214227,0.213,0,0.0,11,0.113,-10.698,1,43,0.0469,176.136
S'il suffisait d'aimer,0.97568,0.257,1998,0.921,0.346,214827,0.197,0,2e-06,10,0.134,-14.641,1,56,0.0393,176.646
Mad About The Boy,0.996125,0.263,1987,0.922,0.344,182573,0.147,0,0.00019,8,0.0482,-10.3,1,58,0.0369,177.192


In [90]:
# Summary statistics for every column of the sorted frame, including 'distance'
df_sorted.describe()

Unnamed: 0,distance,valence,year,acousticness,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,popularity,speechiness,tempo
count,170653.0,170653.0,170653.0,170653.0,170653.0,170653.0,170653.0,170653.0,170653.0,170653.0,170653.0,170653.0,170653.0,170653.0,170653.0,170653.0
mean,5.281924,0.528587,1976.787241,0.502115,0.537396,230948.3,0.482389,0.084575,0.16701,5.199844,0.205839,-11.46799,0.706902,31.431794,0.098393,116.86159
std,1.22431,0.263171,25.917853,0.376032,0.176138,126118.4,0.267646,0.278249,0.313475,3.515094,0.174805,5.697943,0.455184,21.826615,0.16274,30.708533
min,0.0,0.0,1921.0,0.0,0.0,5108.0,0.0,0.0,0.0,0.0,0.0,-60.0,0.0,0.0,0.0,0.0
25%,4.551535,0.317,1956.0,0.102,0.415,169827.0,0.255,0.0,0.0,2.0,0.0988,-14.615,0.0,11.0,0.0349,93.421
50%,5.23017,0.54,1977.0,0.516,0.548,207467.0,0.471,0.0,0.000216,5.0,0.136,-10.58,1.0,33.0,0.045,114.729
75%,5.908968,0.747,1999.0,0.893,0.668,262400.0,0.703,0.0,0.102,8.0,0.261,-7.183,1.0,48.0,0.0756,135.537
max,41.900335,1.0,2020.0,0.996,0.988,5403500.0,1.0,1.0,1.0,11.0,1.0,3.855,1.0,100.0,0.97,243.507


In [91]:
# How many similar songs to show
n = 5

# Take the n + 1 rows with the smallest distance. The extra row is there
# because the closest row is expected to be the selected song itself (distance 0).
df_sorted.iloc[:n + 1]

Unnamed: 0_level_0,distance,valence,year,acousticness,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,popularity,speechiness,tempo
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Danny Boy,0.0,0.241,2002,0.983,0.334,198507,0.0595,0,8.7e-05,9,0.124,-11.069,1,48,0.0416,177.07
How Great Thou Art,0.793372,0.174,2006,0.853,0.348,212827,0.204,0,2e-06,8,0.139,-10.576,1,47,0.0292,174.519
Gimme Gimme,0.901353,0.21,2002,0.886,0.328,214227,0.213,0,0.0,11,0.113,-10.698,1,43,0.0469,176.136
S'il suffisait d'aimer,0.97568,0.257,1998,0.921,0.346,214827,0.197,0,2e-06,10,0.134,-14.641,1,56,0.0393,176.646
Mad About The Boy,0.996125,0.263,1987,0.922,0.344,182573,0.147,0,0.00019,8,0.0482,-10.3,1,58,0.0369,177.192
Baby Mine,1.018232,0.247,2007,0.906,0.362,217360,0.242,0,9e-06,11,0.098,-11.356,1,52,0.0299,184.428
