In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors
from datasets import load_dataset


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load dataset from Hugging Face
ds = load_dataset("maharshipandya/spotify-tracks-dataset")
df = ds['train'].to_pandas()


### **1️⃣ Data Cleaning & Preprocessing** ###


In [None]:
# Remove unnecessary columns
drop_columns = ["Unnamed: 0", "track_id", "album_name"]
df.drop(columns=[col for col in drop_columns if col in df.columns], errors="ignore", inplace=True)

In [4]:
# Drop missing values
df.dropna(inplace=True)

# Remove invalid tempo and time_signature values
df = df[(df.tempo > 0) & (df.time_signature > 0)]

# Remove unpopular songs (popularity = 0)
df = df[df.popularity > 0]


In [5]:
# Select numerical features for similarity calculations
features = ['duration_ms', 'danceability', 'energy', 'key', 'loudness',
            'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness',
            'valence', 'tempo', 'time_signature']
df_features = df[features]


In [6]:
# Store track names for reference
track_data = df[['track_name', 'artists', 'track_genre', 'popularity']]


In [7]:
# Standardize numerical features using RobustScaler
scaler = RobustScaler()
df_scaled = scaler.fit_transform(df_features)

In [8]:
# Apply PCA for dimensionality reduction (keeping 10 key components)
pca = PCA(n_components=10, whiten=True, random_state=42)
df_pca = pca.fit_transform(df_scaled)

In [9]:
# Fit KNN model for similarity search
knn = NearestNeighbors(n_neighbors=6, metric="cosine")
knn.fit(df_pca)

### **2️⃣ Finding Similar Songs** ###


In [None]:
def find_similar_songs(song_name, num_songs=5):
    """Searches for similar songs based on input song name."""
    matches = df[df["track_name"].str.contains(song_name, case=False, na=False)]

    if matches.empty:
        print("No matching song found. Try another name.")
        return None

    # Allow user to choose a song if multiple matches are found
    if len(matches) > 1:
        print("Multiple matches found. Choose the correct song:")
        for i, row in enumerate(matches.iterrows()):
            print(f"{i+1}: {row[1]['track_name']} by {row[1]['artists']}")
        choice = int(input("Enter the number of your choice: ")) - 1
        song_index = matches.index[choice]
    else:
        song_index = matches.index[0]

    print(f"✅ Found match: {df.loc[song_index, 'track_name']} by {df.loc[song_index, 'artists']}")

    # Get nearest neighbors
    distances, indices = knn.kneighbors([df_pca[song_index]], n_neighbors=num_songs+1)
    similar_songs = df.iloc[indices[0][1:]]  # Exclude the input song itself


### **3️⃣ Running in Terminal** ###

In [11]:
if __name__ == "__main__":
    song_input = input("🎵 Enter a song name to find similar tracks: ").strip()
    similar_songs = find_similar_songs(song_input)

    if similar_songs is not None:
        print("\n🎶 Similar Songs Found:")
        print(similar_songs)

        # Option to save results
        save_choice = input("Would you like to save these results? (yes/no): ").strip().lower()
        if save_choice == "yes":
            similar_songs.to_csv("similar_songs.csv", index=False)
            print("✅ Results saved as 'similar_songs.csv'.")


Multiple matches found. Choose the correct song:
1: Shape Of You by Andrew Foy
2: Shape of You - Abheri - Shudha Dhanyasi - Adi Tala - Carnatic Mix by IndianRaga;Mahesh Raghvan;Vinod Krishnan;Aditya Rao
3: Shape of You - Abheri - Shudha Dhanyasi - Adi Tala - Carnatic Mix by IndianRaga;Mahesh Raghvan;Vinod Krishnan;Aditya Rao
4: Shape of You - Rock by Our Last Night
5: Shape of You by Ed Sheeran


IndexError: index 44 is out of bounds for axis 0 with size 5