In [166]:
import pandas as pd

In [167]:
df = pd.read_csv('dataset.csv')

In [168]:
df = df.dropna()

In [169]:
# Make the track_genre column numeric
df['track_genre'] = pd.Categorical(df['track_genre'])
df['track_genre'] = df['track_genre'].cat.codes


# Ask user for 3 artists

In [170]:
artists = []

while len(artists) < 3:
    artist = input("Enter an artist: ")
    
 # Check if the artist is in the dataframe
    if (df['artists'].str.lower() == artist.lower()).any():
        # Check to see if the artist has at least three unique song names
        if df[df['artists'].str.lower() == artist.lower()]['track_name'].nunique() < 3:
            print("The artist does not have at least three unique songs. Please enter another artist.")
        else:
            # Find the original artist name in the dataset and append it to the list of artists
            artists.append(df[df['artists'].str.lower() == artist.lower()]['artists'].values[0])

    else:
        print("The artist is not in the dataset. Please enter another artist.")
    
for artist in artists:
    print(artist.title())



The artist is not in the dataset. Please enter another artist.
The artist does not have at least three unique songs. Please enter another artist.
The artist is not in the dataset. Please enter another artist.
The artist is not in the dataset. Please enter another artist.
The artist is not in the dataset. Please enter another artist.
The artist is not in the dataset. Please enter another artist.
The artist is not in the dataset. Please enter another artist.
The artist is not in the dataset. Please enter another artist.
The artist is not in the dataset. Please enter another artist.
The artist is not in the dataset. Please enter another artist.
The artist does not have at least three unique songs. Please enter another artist.
The artist is not in the dataset. Please enter another artist.
The artist is not in the dataset. Please enter another artist.
Simon & Garfunkel
Pond
The Smashing Pumpkins


# Cluster songs based on attributes

In [171]:
from sklearn.cluster import KMeans

def cluster_songs_by_artist(df, artist_name, n_clusters=3):
    # Filter the dataframe to include only songs by the specified artist
    artist_songs = df[df['artists'].apply(lambda x: x.lower() == artist_name.lower())]

    # Select the song features for clustering
    song_features = artist_songs[['acousticness', 'danceability', 'energy', 'instrumentalness', 'speechiness', 'valence']]

    # Perform K-means clustering
    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    kmeans.fit(song_features)

    # Add the cluster labels to the dataframe
    artist_songs['cluster'] = kmeans.labels_

    # Initialize a list to store top songs from each cluster
    top_songs = []

    # Iterate through each cluster and find the top song based on popularity
    for cluster_label in artist_songs['cluster'].unique():
        cluster_group = artist_songs[artist_songs['cluster'] == cluster_label]
        
        # Find the most popular song in this cluster
        top_song = cluster_group.loc[cluster_group['popularity'].idxmax()]

        # Add the artist's name to the top song data
        top_song['artist'] = artist_name

        top_songs.append(top_song)

    # Convert the list of top songs to a DataFrame
    top_songs_df = pd.DataFrame(top_songs)

    return artist_songs[['track_name', 'cluster', 'popularity']], top_songs_df[['track_name', 'cluster', 'popularity', 'artist']]

# Initialize an empty DataFrame to store top songs from all artists
all_top_songs = pd.DataFrame()

# Loop through the list of artists and find top songs from each cluster
for artist in artists:
    clustered_songs, top_songs = cluster_songs_by_artist(df, artist.title())
    # If a song is in the top songs of more than one cluster, remove duplicates
    top_songs = top_songs.drop_duplicates(subset=['track_name'])
    print(f"Clustered songs for {artist}:\n{clustered_songs}\n")
    


    # Append the top songs from this artist to the continuous DataFrame
    all_top_songs = pd.concat([all_top_songs, top_songs], ignore_index=True)

    
        

# Display the continuous DataFrame of top songs from all artists
print("All top songs from each cluster of each artist:\n", all_top_songs)


Clustered songs for Simon & Garfunkel:
                                           track_name  cluster  popularity
34388         The Sound of Silence - Acoustic Version        1          73
34578                            The Sound of Silence        1          61
34596  Mrs. Robinson - From "The Graduate" Soundtrack        0          75
34688                                       The Boxer        2          71
34828                             April Come She Will        1          66

Clustered songs for Pond:
                          track_name  cluster  popularity
84305                Paint Me Silver        2          50
84554           Sweep Me off My Feet        1          45
84556            Holding Out For You        0          52
84813  Man It Feels Like Space Again        1          45

Clustered songs for The Smashing Pumpkins:
                                          track_name  cluster  popularity
2077   Bullet With Butterfly Wings - Remastered 2012        0           2
20

  super()._check_params_vs_input(X, default_n_init=10)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  artist_songs['cluster'] = kmeans.labels_
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_song['artist'] = artist_name
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_song['artist'] = artist_name
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_gu

In [172]:
print(all_top_songs)

                                       track_name  cluster  popularity  \
0         The Sound of Silence - Acoustic Version        1          73   
1  Mrs. Robinson - From "The Graduate" Soundtrack        0          75   
2                                       The Boxer        2          71   
3                                 Paint Me Silver        2          50   
4                            Sweep Me off My Feet        1          45   
5                             Holding Out For You        0          52   
6   Bullet With Butterfly Wings - Remastered 2012        0          75   
7                                    Adrennalynne        1          28   
8              Tonight, Tonight - Remastered 2012        2          69   

                  artist  
0      Simon & Garfunkel  
1      Simon & Garfunkel  
2      Simon & Garfunkel  
3                   Pond  
4                   Pond  
5                   Pond  
6  The Smashing Pumpkins  
7  The Smashing Pumpkins  
8  The Smashing 

# Finding closest non-selected-artist songs

In [173]:
from scipy.spatial.distance import cdist

def find_closest_songs_for_artists(spotify_data, artist_list, top_list):
    for artist_name in artist_list:
        # Filter songs by the current artist
        artist_songs = df[df['artists'] == artist_name]
        # Find the top songs in the original dataset from top_list
        artist_songs = artist_songs[artist_songs['track_name'].isin(top_list[top_list['artist'] == artist_name]['track_name'])]
        artist_song_features = artist_songs[['popularity', 'acousticness', 'danceability', 'energy', 'instrumentalness', 'speechiness', 'valence', 'track_genre']]

        # Filter non-artist songs
        non_artist_songs = spotify_data[spotify_data['artists'] != artist_name]
        non_artist_song_features = non_artist_songs[['popularity', 'acousticness', 'danceability', 'energy', 'instrumentalness', 'speechiness', 'valence', 'track_genre']]

        # Calculate the Euclidean distance between each non-artist song and each song by the current artist
        distances = cdist(non_artist_song_features, artist_song_features, metric='euclidean')

        closest_songs_info = []
        for i in range(len(artist_songs)):
            closest_song_index = distances[:, i].argmin()
            closest_song = non_artist_songs.iloc[closest_song_index]
            closest_song_info = {'track_name': closest_song['track_name'], 'artist': closest_song['artists']}
            closest_songs_info.append(closest_song_info)

        # Print the non-artist song closest to each song by the current artist along with the artist name
        artist_songs_list = artist_songs['track_name'].tolist()
        for i in range(len(artist_list)):
            closest_song_info = closest_songs_info[i]
            print(f"Non-{artist_name} song closest to '{artist_songs_list[i]}': '{closest_song_info['track_name']}' by {closest_song_info['artist']}")



In [174]:
spotify_data, artist_list, top_list = df, all_top_songs['artist'].unique(), all_top_songs

audio_features = ['popularity', 'acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness', 'speechiness', 'valence']
for artist_name in artist_list:

    # Filter songs by the current artist
    artist_songs = df[df['artists'] == artist_name]
    top_song_names = all_top_songs[all_top_songs['artist'] == artist_name]['track_name'].values

    top_songs_w_features = artist_songs[artist_songs['track_name'].apply(lambda x: x in top_song_names)]
    top_songs_w_features = top_songs_w_features.groupby('track_name')[audio_features].mean()

    # Find the top songs in the original dataset from top_list
    # artist_songs = artist_songs[artist_songs['track_name'].isin(top_list[top_list['artist'] == artist_name]['track_name'])]
    # artist_song_features = artist_songs[['popularity', 'acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness', 'speechiness', 'valence']]

    artist_song_features = top_songs_w_features


    # Filter non-artist songs
    non_artist_songs = spotify_data[spotify_data['artists'] != artist_name]
    non_artist_song_features = non_artist_songs[['popularity', 'acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness', 'speechiness', 'valence']]

    # Calculate the Euclidean distance between each non-artist song and each song by the current artist
    distances = cdist(non_artist_song_features, artist_song_features, metric='euclidean')

    closest_songs_info = []
    for i in range(len(artist_song_features)):
        closest_song_index = distances[:, i].argmin()
        closest_song = non_artist_songs.iloc[closest_song_index]
        closest_song_info = {'track_name': closest_song['track_name'], 'artist': closest_song['artists']}
        closest_songs_info.append(closest_song_info)



    artist_songs_list = artist_song_features.index.tolist()
    for i in range(len(artist_song_features)):
        closest_song_info = closest_songs_info[i]
        print(f"Non-{artist_name} song closest to '{artist_songs_list[i]}': '{closest_song_info['track_name']}' by {closest_song_info['artist']}")



Non-Simon & Garfunkel song closest to 'Mrs. Robinson - From "The Graduate" Soundtrack': 'I'm Yours' by Jason Mraz
Non-Simon & Garfunkel song closest to 'The Boxer': 'I'd Rather Go Blind' by Etta James
Non-Simon & Garfunkel song closest to 'The Sound of Silence - Acoustic Version': 'Aziyat 2.0 - Reprise Version' by Pratsofficial
Non-Pond song closest to 'Holding Out For You': 'Mess Like Me' by Foxblood
Non-Pond song closest to 'Paint Me Silver': 'アルミナ' by Nightmare
Non-Pond song closest to 'Sweep Me off My Feet': 'Only Wanna Sing (Live)' by Hillsong Young & Free
Non-The Smashing Pumpkins song closest to 'Adrennalynne': 'Breaking Away' by Rata Blanca
Non-The Smashing Pumpkins song closest to 'Bullet With Butterfly Wings - Remastered 2012': '174 Bpm' by Klute
Non-The Smashing Pumpkins song closest to 'Tonight, Tonight - Remastered 2012': 'Children' by Robert Miles
