In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv('spotify_dataset.csv')

# Data Preprocessing
df = df.dropna()
df = df.drop(['time_signature', 'key'], axis=1)
df.drop_duplicates(subset=['track_id'], inplace=True)

In [2]:
# Step 1: Unsupervised Learning: K-means Clustering

In [3]:
# Encode the genre column for K means clustering
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['track_genre_encoded'] = le.fit_transform(df['track_genre'].astype(str))
features = ['track_genre_encoded']


In [4]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Use the encoded genre as the feature for clustering
features = ['track_genre_encoded', 'danceability', 'energy', 'valence', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'tempo']
df_features = df[features]

# Normalize the features
scaler = StandardScaler()
df_features_normalized = scaler.fit_transform(df_features)

# K-Means Clustering
n_clusters = len(df['track_genre'].unique())  # Number of unique genres (for initial approach)
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
kmeans.fit(df_features_normalized)  # Use normalized features for clustering
df['cluster'] = kmeans.labels_


In [5]:
def recommend_songs_by_cluster(input_song, n_recommendations=5):
    # Convert input song details to lowercase for case-insensitive comparison
    input_song_name = input_song['track_name'].strip().lower()
    input_artist_name = input_song['artist_name'].strip().lower()
    
    # Find the input song's features
    song = df[(df['track_name'].str.lower() == input_song_name) & 
              (df['artists'].apply(lambda x: any(artist.lower() == input_artist_name for artist in x.lower().split(';'))))]
    
    if song.empty:
        return "Input song not found in the dataset."
    
    # Get the cluster of the input song
    cluster = song['cluster'].values[0]
    
    # Get similar songs within the same cluster
    similar_songs = df[df['cluster'] == cluster]
    
    # Exclude the input song itself
    similar_songs = similar_songs[~similar_songs['track_name'].str.lower().isin([input_song_name])]
    
    # Ensure unique track names
    unique_recommendations = similar_songs.drop_duplicates(subset='track_name')
    
    # Select the top recommendations
    recommendations_df = unique_recommendations.head(n_recommendations)
    
    if not recommendations_df.empty:
        return recommendations_df[['track_name', 'artists']]
    else:
        return "No recommendations found for the input song's cluster."



In [6]:
# Get user input
track_name = input("Enter the track name: ").strip()
artist_name = input("Enter the artist name(s): ").strip()

# Define the input song
input_song = {
    'track_name': track_name,
    'artist_name': artist_name
}

# Get recommendations using cluster-based filtering
recommendations = recommend_songs_by_cluster(input_song, n_recommendations=5)

print("\nCluster-Based Recommendations:")
print(recommendations)


Enter the track name:  august
Enter the artist name(s):  Taylor swift



Cluster-Based Recommendations:
                                track_name  \
34115                             Saaiyaan   
34575                               Hollow   
35202                         Pétalas Neon   
35695  Impossível Acreditar Que Perdi Você   
36203                             Avec Toi   

                                   artists  
34115  Salim–Sulaiman;Rahat Fateh Ali Khan  
34575                              Seafret  
35202                         Noda de Caju  
35695                                 Zezo  
36203                                 OBOY  


In [7]:
# Step 2: Supervised Learning: Content-Based Filtering

In [8]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Select relevant features for CONTENT BASED FILTERING
features = ['danceability', 'energy', 'valence', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'tempo']
X = df[features]

# Normalize the features
scaler = StandardScaler()
X_normalized = scaler.fit_transform(X)

In [9]:
def recommend_songs_by_content(input_song, num_recommendations=5):
    # Convert input song details to lowercase for case-insensitive comparison
    input_song_name = input_song['track_name'].strip().lower()
    input_artist_name = input_song['artist_name'].strip().lower()
    
    # Find the input song's features
    song = df[(df['track_name'].str.lower() == input_song_name) & 
              (df['artists'].apply(lambda x: any(artist.lower() == input_artist_name for artist in x.lower().split(';'))))]
    
    if song.empty:
        return "Input song not found in the dataset."
    
    # Get the features of the input song
    input_index = song.index[0]
    input_features = X_normalized[input_index].reshape(1, -1)
    
    # Calculate cosine similarity between input song features and all songs
    similarities = cosine_similarity(input_features, X_normalized)
    
    # Create a DataFrame of similarities
    similarity_df = pd.DataFrame(similarities.T, index=df.index, columns=['similarity'])
    
    # Exclude the input song from recommendations to avoid suggesting the same song
    similarity_df = similarity_df.drop(index=input_index)
    
    # Sort by similarity and get top N recommendations
    similarity_df = similarity_df.sort_values(by='similarity', ascending=False)
    top_n = num_recommendations * 2
    recommendations = similarity_df.head(top_n).join(df[['track_name', 'artists']])
    
    # Drop duplicates and ensure there are exactly num_recommendations
    recommendations = recommendations.drop_duplicates(subset='track_name')
    recommendations = recommendations.head(num_recommendations)
    
    return recommendations

# Get user input
track_name = input("Enter the track name: ").strip()
artist_name = input("Enter the artist name(s): ").strip()

# Define the input song
input_song = {
    'track_name': track_name,
    'artist_name': artist_name
}

# Get recommendations using content-based filtering
recommendations = recommend_songs_by_content(input_song, num_recommendations=5)

print("\nContent-Based Recommendations:")
print(recommendations)


Enter the track name:  august
Enter the artist name(s):  Taylor swift



Content-Based Recommendations:
        similarity                          track_name  \
104232    1.000000             Por la boca vive el pez   
60025     0.990193                         Last Summer   
95043     0.988733                   Tú Me Haces Falta   
35205     0.984680                    Sem Se Apaixonar   
14358     0.981430  Vampirina Theme - From "Vampirina"   

                                  artists  
104232                  Fito y Fitipaldis  
60025   Tokyo Machine;Weird Genius;Lights  
95043                      Eddie Santiago  
35205                           Eric Land  
14358                    Cast - Vampirina  
