<a href="https://colab.research.google.com/github/wrldzero/UTD-Summer-Research/blob/main/knn_spotify_recommender_from_scratch_(kinda).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install spotipy --upgrade

Collecting spotipy
  Downloading spotipy-2.25.1-py3-none-any.whl.metadata (5.1 kB)
Collecting redis>=3.5.3 (from spotipy)
  Downloading redis-6.2.0-py3-none-any.whl.metadata (10 kB)
Downloading spotipy-2.25.1-py3-none-any.whl (31 kB)
Downloading redis-6.2.0-py3-none-any.whl (278 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m278.7/278.7 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: redis, spotipy
Successfully installed redis-6.2.0 spotipy-2.25.1


In [3]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
import numpy as np
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

client_credentials_manager = SpotifyClientCredentials(
    client_id='a8eead47c2564b5291db63522723a0df',
    client_secret='bfb4fd06cfeb40daaed1fcf96888d1d3'
)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

class Song:
  def __init__(self, index, title, artist, genre, popularity):
    self.index = index
    self.title = title
    self.artist = artist
    self.genre = genre
    self.popularity = popularity

  def __str__(self):
    return f"{self.index} {self.title} - {self.artist} ({self.genre}, popularity = {self.popularity})"

  def str_short(self):
    return f"{self.title} ({self.genre}, popularity = {self.popularity})"

def by_title(song):
  return song.title

class SpotifyRecommender:
  def __init__(self, df, k_neighbors = 25):
    # Data cleaning
    self.df = df.drop(columns=['Unnamed: 0'])
    self.df = self.df.dropna().copy()
    self.df = self.df[self.df['popularity'] > 0]
    self.df = self.df.reset_index(drop=True)

    self.features = [
            'danceability', 'energy', 'key', 'loudness', 'mode',
            'speechiness', 'acousticness', 'instrumentalness',
            'liveness', 'valence', 'tempo', 'time_signature', 'duration_ms'
        ]

    self.scaler = StandardScaler()
    self.scaled_features = self.scaler.fit_transform(self.df[self.features])

    # Build and fit KNN
    self.knn = NearestNeighbors(n_neighbors=k_neighbors, metric='euclidean')
    self.knn.fit(self.scaled_features)

  def get_song_from_index(self, index):
    return Song(index, self.df.iloc[index]['track_name'], self.df.iloc[index]['artist_name'],
                self.df.iloc[index]['genre'], self.df.iloc[index]['popularity'])

  def get_song(self, title, artist, printing = True):
    matches = self.df[
            (self.df['track_name'].str.lower() == title.lower()) &
            (self.df['artist_name'].str.lower() == artist.lower())
        ]
    if len(matches) == 0:
      if printing:
        print(f"Not found: {title} - {artist}")
      return None
    return self.get_song_from_index(matches.index[0])

  def get_song_from_id(self, id, printing = True):
    matches = self.df[self.df['track_id'] == id]
    if len(matches) == 0:
      if printing:
        print(f"Not found: {id}")
      return None
    return self.get_song_from_index(matches.index[0])

  def titles_to_songs(self, info, printing = True, limit = 300):
    songs = []
    for title, artist in info:
      song = self.get_song(title, artist, printing = printing)
      if song is not None:
        songs.append(song)
      if len(songs) >= limit:
        break
    return songs

  def ids_to_songs(self, ids, printing = True, limit = 300):
    songs = []
    for id in ids:
      song = self.get_song_from_id(id, printing = printing)
      if song is not None:
        songs.append(song)
      if len(songs) >= limit:
        break
    return songs

  def recommend_songs(self, liked_songs, n_recommendations = 10, printing = True):
    if len(liked_songs) == 0:
        if printing:
          print("No songs were found")
        return None

    liked_indexes = [song.index for song in liked_songs]
    liked_features = self.scaled_features[liked_indexes]
    avg_features = np.mean(liked_features, axis=0).reshape(1, -1)

    distances, indexes = self.knn.kneighbors(avg_features)

    recommended_songs = []
    for idx in indexes[0]:
        if idx not in liked_indexes:
            recommended_songs.append(self.get_song_from_index(idx))
        if len(recommended_songs) >= n_recommendations:
            break

    if printing:
      print("Liked songs:")
      for song in liked_songs:
        print(song)
      print("\nRecommended songs:")
      for song in recommended_songs:
        print(song)

    return recommended_songs

  def recommend_songs_from_titles(self, liked_titles, n_recommendations = 10, printing = True):
      liked_songs = self.titles_to_songs(liked_titles, printing = printing)

      if printing:
        print(f"Found {len(liked_songs)} out of {len(liked_titles)} songs")

      return self.recommend_songs(liked_songs, n_recommendations = n_recommendations, printing = printing)

  def recommend_songs_from_ids(self, liked_ids, n_recommendations = 10, printing = True):
      liked_songs = self.ids_to_songs(liked_ids, printing = printing)

      if printing:
        print(f"Found {len(liked_songs)} out of {len(liked_ids)} songs")

      return self.recommend_songs(liked_songs, n_recommendations = n_recommendations, printing = printing)

  def get_songs_from_Spotify(self, username):
    all_track_ids = []
    offset = 0

    # Get all public playlist IDs from the user
    while True:
        playlists = sp.user_playlists(username, offset=offset, limit=50)
        if not playlists['items']:
            break
        for playlist in playlists['items']:
            if playlist['public']:  # Only public playlists
                playlist_id = playlist['id']
                print(f"Scraping playlist: {playlist['name']}")

                # Scrape track IDs from this playlist
                track_offset = 0
                while True:
                    results = sp.playlist_tracks(playlist_id, offset=track_offset, limit=100)
                    if not results['items']:
                        break
                    for item in results['items']:
                        track = item.get('track')
                        if track and track.get('id'):
                            all_track_ids.append(track['id'])
                    track_offset += len(results['items'])
        offset += len(playlists['items'])

        print(f'Done! ({len(playlists)})')

    return all_track_ids

  def recommend_songs_from_Spotify(self, username, n_recommendations = 10, printing = True):
      return self.recommend_songs_from_ids(self.get_songs_from_Spotify(username), n_recommendations = n_recommendations, printing = printing)

  def select(self):
    songs = list()

    while True:
      next = input("Input an artist name to search\nInput 'view' to see current list\nInput 'done' or nothing at all to exit\n")

      if next == "view":
        for i in songs:
          print(i)
      elif next == "done" or next == "":
        break
      else:
        print()
        matches = [self.get_song_from_index(i) for i in self.df[self.df['artist_name'].str.lower() == next.lower()].index]
        matches.sort(key = by_title)
        if len(matches) == 0:
          print(f"Could not find songs from {next}")
        else:
          for i in range(len(matches)):
            print(f"{i + 1}. {matches[i].str_short()}")
          for i in input("Input numbers of songs you want to add\n").split():
            try:
              songs.append(matches[int(i) - 1])
            except:
              pass

    return songs

  def select_and_recommend(self, n_recommendations = 10, printing = True):
    return self.recommend_songs(self.select(), n_recommendations = n_recommendations, printing = printing)

In [4]:
# Install dependencies as needed:
# pip install kagglehub[pandas-datasets]
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Set the path to the file you'd like to load
file_path = "spotify_data.csv"

# Load the latest version
df = kagglehub.load_dataset(
  KaggleDatasetAdapter.PANDAS,
  "amitanshjoshi/spotify-1million-tracks",
  file_path,
  # Provide any additional arguments like
  # sql_query or pandas_kwargs. See the
  # documenation for more information:
  # https://github.com/Kaggle/kagglehub/blob/main/README.md#kaggledatasetadapterpandas
)

print("First 5 records:\n", df.head())

  df = kagglehub.load_dataset(


Downloading from https://www.kaggle.com/api/v1/datasets/download/amitanshjoshi/spotify-1million-tracks?dataset_version_number=1&file_name=spotify_data.csv...


100%|██████████| 77.1M/77.1M [00:01<00:00, 63.1MB/s]


Extracting zip of spotify_data.csv...
First 5 records:
    Unnamed: 0    artist_name        track_name                track_id  \
0           0     Jason Mraz   I Won't Give Up  53QF56cjZA9RTuuMZDrSA6   
1           1     Jason Mraz  93 Million Miles  1s8tP3jP4GZcyHDsjvw218   
2           2  Joshua Hyslop  Do Not Let Me Go  7BRCa8MPiyuvr2VU3O9W0F   
3           3   Boyce Avenue          Fast Car  63wsZUhUZLlh1OsyrZq7sz   
4           4   Andrew Belle  Sky's Still Blue  6nXIYClvJAfi6ujLiKqEq8   

   popularity  year     genre  danceability  energy  key  loudness  mode  \
0          68  2012  acoustic         0.483   0.303    4   -10.058     1   
1          50  2012  acoustic         0.572   0.454    3   -10.286     1   
2          57  2012  acoustic         0.409   0.234    3   -13.711     1   
3          58  2012  acoustic         0.392   0.251   10    -9.845     1   
4          54  2012  acoustic         0.430   0.791    6    -5.419     0   

   speechiness  acousticness  instrumental

In [5]:
recommender = SpotifyRecommender(df, k_neighbors = 25)

from spotipy.exceptions import SpotifyException

# recommender.recommend_songs_from_titles([("HUMBLE.", "Kendrick Lamar"), ("Alright", "Kendrick Lamar"), ("DNA.", "Kendrick Lamar")])

# recommender.select_and_recommend()

def is_valid_username(name):
    try:
        user = sp.user(name)
        return True
    except SpotifyException as e:
        if e.http_status == 404:
            return False
        else:
            print(f"⚠️ SpotifyException: {e}")
            return False
    except Exception as e:
        print(f"⚠️ Unexpected error: {e}")
        return False

while True:
    username = input("Enter Spotify Username: ")
    if is_valid_username(username):
        break
    else:
        print("❌ Invalid Spotify username. Please try again.")

display_name = sp.user(username).get("display_name")
print(f"✅ Display name: {display_name}")
recommendations = recommender.recommend_songs_from_Spotify(username)

# print((f"✅ Retrieved {len(ids)} track IDs from all playlists"))

Enter Spotify Username: yriz0480vij1yjg8rp61ipqku
✅ Display name: Arush
Scraping playlist: Gooners
Scraping playlist: 🚗
Scraping playlist: spiderverse
Scraping playlist: New Favs
Scraping playlist: Arush
Done! (7)
Not found: 15uq77MdhAjrnBuumngIsz
Not found: 20dP2DaMHIAmwWAbp7peSr
Not found: 7GX5flRQZVHRAGd6B4TmDO
Not found: 3eh51r6rFWAlGQRlHx9QnQ
Not found: 0vjeOZ3Ft5jvAi9SBFJm1j
Not found: 2Hh3ETdQKrmSI3QS0hme7g
Not found: 1pacwLXyRO47ka0v6LTIiY
Not found: 221qmpQeBNV87sUjQeUTVH
Not found: 5wG3HvLhF6Y5KTGlK0IW3J
Not found: 2dHHgzDwk4BJdRwy9uXhTO
Not found: 2tudvzsrR56uom6smgOcSf
Not found: 4daEMLSZCgZ2Mt7gNm2SRa
Not found: 5BM6yfBokOMMgD6h869lRc
Not found: 6i0V12jOa3mr6uu4WYhUBr
Not found: 4u4VElxO7JM4IR4jR4TL1s
Not found: 7mykoq6R3BArsSpNDjFQTm
Not found: 6qYkmqFsXbj8CQjAdbYz07
Not found: 0gWrMbx6pbdH3n3nsLjE55
Not found: 285pBltuF7vW8TeWk8hdRR
Not found: 6HgWWaMu31KdOpEG5l28BG
Not found: 5w40ZYhbBMAlHYNDaVJIUu
Not found: 7dt6x5M1jzdTEt8oCbisTK
Not found: 21jGcNKet2qwijlDFuPiPb
Not 