In [6]:
import pandas as pd

In [7]:
df = pd.read_csv('dataset.csv')

# Ask user for 3 artists

In [8]:
artists = []

while len(artists) < 3:
    artist = input("Enter an artist: ")
    
    # Check if the artist is in the dataframe 
    if (df['artists'].str.lower() == artist.lower()).any():
        artists.append(artist)
    else:
        print("The artist is not in the dataset. Please enter another artist.")

print("The entered artists are:")
for artist in artists:
    print(artist.title())



The artist is not in the dataset. Please enter another artist.
The entered artists are:
Pink Floyd
Counting Crows
Led Zeppelin


# Cluster songs based on attributes

In [4]:
from sklearn.cluster import KMeans

def cluster_songs_by_artist(df, artist_name, n_clusters=3):
    # Filter the dataframe to include only songs by the specified artist
    artist_songs = df[df['artists'] == artist_name]

    # Select the song features for clustering
    song_features = artist_songs[['acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness', 'speechiness', 'valence']]

    # Perform K-means clustering
    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    kmeans.fit(song_features)

    # Add the cluster labels to the dataframe
    artist_songs['cluster'] = kmeans.labels_

    # Initialize a list to store top songs from each cluster
    top_songs = []

    # Iterate through each cluster and find the top song based on popularity
    for cluster_label in artist_songs['cluster'].unique():
        cluster_group = artist_songs[artist_songs['cluster'] == cluster_label]
        
        # Find the most popular song in this cluster
        top_song = cluster_group.loc[cluster_group['popularity'].idxmax()]

        # Add the artist's name to the top song data
        top_song['artist'] = artist_name

        top_songs.append(top_song)

    # Convert the list of top songs to a DataFrame
    top_songs_df = pd.DataFrame(top_songs)

    return artist_songs[['track_name', 'cluster', 'popularity']], top_songs_df[['track_name', 'cluster', 'popularity', 'artist']]

# Initialize an empty DataFrame to store top songs from all artists
all_top_songs = pd.DataFrame()

# Loop through the list of artists and find top songs from each cluster
for artist in artists:
    clustered_songs, top_songs = cluster_songs_by_artist(df, artist.title())
    print(f"Clustered songs for {artist}:\n{clustered_songs}\n")
    
    # Append the top songs from this artist to the continuous DataFrame
    all_top_songs = pd.concat([all_top_songs, top_songs], ignore_index=True)

# Display the continuous DataFrame of top songs from all artists
print("All top songs from each cluster of each artist:\n", all_top_songs)


  super()._check_params_vs_input(X, default_n_init=10)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  artist_songs['cluster'] = kmeans.labels_
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_song['artist'] = artist_name
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_song['artist'] = artist_name
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_gu

Clustered songs for Pink Floyd:
                             track_name  cluster  popularity
84001                Wish You Were Here        2          77
84002                  Comfortably Numb        1          74
84051  Another Brick in the Wall, Pt. 2        1          76
84054                              Time        1          71
84103              Breathe (In the Air)        0          71
...                                 ...      ...         ...
84910                 High Hopes - Live        2          45
84964                    Hey You - Live        2          46
91302                Wish You Were Here        2          77
91510                  Comfortably Numb        1          73
91717  Another Brick in the Wall, Pt. 2        1          76

[89 rows x 3 columns]

Clustered songs for Led Zeppelin:
                                     track_name  cluster  popularity
47007                 Immigrant Song - Remaster        1          78
47010             Stairway to Heaven - R

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  artist_songs['cluster'] = kmeans.labels_
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_song['artist'] = artist_name
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_song['artist'] = artist_name
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_

# Finding closest non-selected-artist songs

In [5]:
from scipy.spatial.distance import cdist

def find_closest_songs_for_artists(spotify_data, artist_list):
    for artist_name in artist_list:
        # Filter songs by the current artist
        artist_songs = df[df['artists'] == artist_name]
        artist_song_features = artist_songs[['popularity', 'acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness', 'speechiness', 'valence']]

        # Filter non-artist songs
        non_artist_songs = spotify_data[spotify_data['artists'] != artist_name]
        non_artist_song_features = non_artist_songs[['popularity', 'acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness', 'speechiness', 'valence']]

        # Calculate the Euclidean distance between each non-artist song and each song by the current artist
        distances = cdist(non_artist_song_features, artist_song_features, metric='euclidean')

        closest_songs_info = []
        for i in range(len(artist_songs)):
            closest_song_index = distances[:, i].argmin()
            closest_song = non_artist_songs.iloc[closest_song_index]
            closest_song_info = {'track_name': closest_song['track_name'], 'artist': closest_song['artists']}
            closest_songs_info.append(closest_song_info)

        # Print the non-artist song closest to each song by the current artist along with the artist name
        artist_songs_list = artist_songs['track_name'].tolist()
        for i in range(len(artist_list)):
            closest_song_info = closest_songs_info[i]
            print(f"Non-{artist_name} song closest to '{artist_songs_list[i]}': '{closest_song_info['track_name']}' by {closest_song_info['artist']}")

# Example usage
find_closest_songs_for_artists(df, all_top_songs['artist'].unique())


Non-Pink Floyd song closest to 'Wish You Were Here': 'wish you were gay' by Billie Eilish
Non-Pink Floyd song closest to 'Comfortably Numb': 'The Only Exception' by Paramore
Non-Pink Floyd song closest to 'Another Brick in the Wall, Pt. 2': 'Knee Socks' by Arctic Monkeys
Non-Led Zeppelin song closest to 'Immigrant Song - Remaster': 'Dani California' by Red Hot Chili Peppers
Non-Led Zeppelin song closest to 'Stairway to Heaven - Remaster': 'Runaway' by AURORA
Non-Led Zeppelin song closest to 'Whole Lotta Love - 1990 Remaster': '新時代 - ウタ from ONE PIECE FILM RED' by Ado
Non-Red Hot Chili Peppers song closest to 'Tippa My Tongue': 'Animals' by Nickelback
Non-Red Hot Chili Peppers song closest to 'Tippa My Tongue': 'Buddy Holly' by Weezer
Non-Red Hot Chili Peppers song closest to 'Californication': 'Sanctuary' by Welshly Arms
