In [12]:
# imports
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import requests 
import json
import time
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize

In [2]:
# comma-separated, double quotes as enclosing fields, skip rows with completely broken formats
data = pd.read_csv('spotify_dataset.csv', sep=',', quotechar='"', engine='python', on_bad_lines='skip')
print(data.head())

                            user_id                      "artistname"  \
0  9cc0cfd4d7d7885102480dd99e7a90d6                    Elvis Costello   
1  9cc0cfd4d7d7885102480dd99e7a90d6  Elvis Costello & The Attractions   
2  9cc0cfd4d7d7885102480dd99e7a90d6                      Tiffany Page   
3  9cc0cfd4d7d7885102480dd99e7a90d6  Elvis Costello & The Attractions   
4  9cc0cfd4d7d7885102480dd99e7a90d6                    Elvis Costello   

                                         "trackname"  "playlistname"  
0               (The Angels Wanna Wear My) Red Shoes  HARD ROCK 2010  
1  (What's So Funny 'Bout) Peace, Love And Unders...  HARD ROCK 2010  
2                                   7 Years Too Late  HARD ROCK 2010  
3                              Accidents Will Happen  HARD ROCK 2010  
4                                             Alison  HARD ROCK 2010  


In [3]:
print(data.info()) 
print(data.isnull().sum())
print(data.describe())
print(data.columns)
data.columns = data.columns.str.strip().str.replace('"', '')
print(data.columns)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12791369 entries, 0 to 12791368
Data columns (total 4 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   user_id          object
 1    "artistname"    object
 2    "trackname"     object
 3    "playlistname"  object
dtypes: object(4)
memory usage: 390.4+ MB
None
user_id                0
 "artistname"      33536
 "trackname"          88
 "playlistname"       41
dtype: int64
                                 user_id  "artistname"  "trackname"  \
count                           12791369      12757833     12791281   
unique                             15910        287440      1999876   
top     4398de6902abde3351347b048fcdc287     Daft Punk        Intro   
freq                              294969         36004         6672   

        "playlistname"  
count         12791328  
unique          156888  
top            Starred  
freq           1331416  
Index(['user_id', ' "artistname"', ' "trackname"', ' "playlistname"'], 

In [4]:
# drop rows with missing track names
data = data.dropna(subset=['trackname'])  
data.reset_index(drop=True, inplace=True)

# Fill missing artist names and missing playlist
data['artistname'].fillna('Unknown Artist', inplace=True)  
data['playlistname'].fillna('Unknown Playlist', inplace=True)  

In [5]:
print(data.isnull().sum())
print(data.info())
print(data.describe())
print(data.nunique())  
print(data.shape)

user_id         0
artistname      0
trackname       0
playlistname    0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12791281 entries, 0 to 12791280
Data columns (total 4 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   user_id       object
 1   artistname    object
 2   trackname     object
 3   playlistname  object
dtypes: object(4)
memory usage: 390.4+ MB
None
                                 user_id artistname trackname playlistname
count                           12791281   12791281  12791281     12791281
unique                             15910     287435   1999876       156889
top     4398de6902abde3351347b048fcdc287  Daft Punk     Intro      Starred
freq                              294968      36004      6672      1331410
user_id           15910
artistname       287435
trackname       1999876
playlistname     156889
dtype: int64
(12791281, 4)


In [6]:
# sample unique users to preserve user-track interactions
unique_users = data['user_id'].unique()

# training data 5000 users -> 50000 interactions
selected_train_users = pd.Series(unique_users).sample(500, random_state=42)
train_data = data[data['user_id'].isin(selected_train_users)]
# if train_data.shape[0] > 50000:
#     train_data = train_data.sample(50000, random_state=42)

# testing data 500 users -> 5000 interactions
remaining_users = list(set(unique_users) - set(selected_train_users))
selected_test_users = pd.Series(remaining_users).sample(50, random_state=42)
test_data = data[data['user_id'].isin(selected_test_users)]
# if test_data.shape[0] > 5000:
#     test_data = test_data.sample(5000, random_state=42)

# Select exactly 3 random playlists per user
def select_random_playlists(user_df):
    unique_playlists = user_df['playlistname'].unique()
    if len(unique_playlists) > 3:
        selected_playlists = pd.Series(
            unique_playlists).sample(3, random_state=42)
        return user_df[user_df['playlistname'].isin(selected_playlists)]
    return user_df  # Keep all if user has 3 or fewer playlists
  
train_data = train_data.groupby('user_id', group_keys=False).apply(select_random_playlists)
test_data = test_data.groupby('user_id', group_keys=False).apply(select_random_playlists)

In [7]:
print("train data")
print(train_data.describe())
print(train_data.info())
print(train_data.nunique())
print(train_data.shape)
# print(train_data.head())

train data
                                 user_id     artistname trackname playlistname
count                              89547          89547     89547        89547
unique                               500          13580     64022         1205
top     7828c8c38a51637339cdb965dc1acc1b  John Williams     Intro      Starred
freq                                6817            423        47         8858
<class 'pandas.core.frame.DataFrame'>
Index: 89547 entries, 3624339 to 4883393
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   user_id       89547 non-null  object
 1   artistname    89547 non-null  object
 2   trackname     89547 non-null  object
 3   playlistname  89547 non-null  object
dtypes: object(4)
memory usage: 3.4+ MB
None
user_id           500
artistname      13580
trackname       64022
playlistname     1205
dtype: int64
(89547, 4)


In [8]:

print("test data")
print(test_data.info())
print(test_data.describe())
print(test_data.nunique())
print(test_data.shape)
# print(test_data.head())

test data
<class 'pandas.core.frame.DataFrame'>
Index: 9865 entries, 1802626 to 2012345
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   user_id       9865 non-null   object
 1   artistname    9865 non-null   object
 2   trackname     9865 non-null   object
 3   playlistname  9865 non-null   object
dtypes: object(4)
memory usage: 385.4+ KB
None
                                 user_id       artistname trackname  \
count                               9865             9865      9865   
unique                                50             3088      8443   
top     5f350bacfa1130865dc0745b2c40982d  The Cranberries    Dreams   
freq                                 743              175         7   

       playlistname  
count          9865  
unique          119  
top         Starred  
freq           3110  
user_id           50
artistname      3088
trackname       8443
playlistname     119
dtype: int64
(9865, 4)


In [9]:
# Create a User-Track Binary Interaction Matrix
interaction_matrix = train_data.pivot_table(index='user_id', columns='trackname', aggfunc='size', fill_value=0)
interaction_matrix = (interaction_matrix > 0).astype(int) 

print(interaction_matrix.shape) 

(500, 64022)


In [10]:
# Compute Cosine Similarity between users
user_similarity_matrix = cosine_similarity(interaction_matrix)
user_similarity_df = pd.DataFrame(user_similarity_matrix, index=interaction_matrix.index, columns=interaction_matrix.index)

print(user_similarity_df.head())

user_id                           00967afd6e687f2d07cab2e874fb929d  \
user_id                                                              
00967afd6e687f2d07cab2e874fb929d                          1.000000   
00abf74ef96b119cf0052a3d86fd4425                          0.000000   
00c64863e80532f599ebdd6e3e127a64                          0.004771   
016d025f7c4996a039244149d64ec0fa                          0.000000   
022cf127719796c6fdf8e0f4976c5330                          0.000000   

user_id                           00abf74ef96b119cf0052a3d86fd4425  \
user_id                                                              
00967afd6e687f2d07cab2e874fb929d                          0.000000   
00abf74ef96b119cf0052a3d86fd4425                          1.000000   
00c64863e80532f599ebdd6e3e127a64                          0.004791   
016d025f7c4996a039244149d64ec0fa                          0.000000   
022cf127719796c6fdf8e0f4976c5330                          0.000000   

user_id           

In [15]:
# Compute User-User Co-Occurrence Matrix
co_occurrence_matrix = interaction_matrix.dot(interaction_matrix.T)

# Set diagonal to 0 (remove self-co-occurrence)
np.fill_diagonal(co_occurrence_matrix.values, 0)

print(co_occurrence_matrix.head())  # View co-occurrence matrix

user_id                           00967afd6e687f2d07cab2e874fb929d  \
user_id                                                              
00967afd6e687f2d07cab2e874fb929d                                 0   
00abf74ef96b119cf0052a3d86fd4425                                 0   
00c64863e80532f599ebdd6e3e127a64                                 1   
016d025f7c4996a039244149d64ec0fa                                 0   
022cf127719796c6fdf8e0f4976c5330                                 0   

user_id                           00abf74ef96b119cf0052a3d86fd4425  \
user_id                                                              
00967afd6e687f2d07cab2e874fb929d                                 0   
00abf74ef96b119cf0052a3d86fd4425                                 0   
00c64863e80532f599ebdd6e3e127a64                                 1   
016d025f7c4996a039244149d64ec0fa                                 0   
022cf127719796c6fdf8e0f4976c5330                                 0   

user_id           

In [20]:
alpha = 0.8  # Adjust balance between Cosine Similarity and Co-Occurrence
final_similarity_matrix = alpha * user_similarity_df + (1 - alpha) * co_occurrence_matrix

print(final_similarity_matrix.head())  # View final similarity scores

user_id                           00967afd6e687f2d07cab2e874fb929d  \
user_id                                                              
00967afd6e687f2d07cab2e874fb929d                          0.800000   
00abf74ef96b119cf0052a3d86fd4425                          0.000000   
00c64863e80532f599ebdd6e3e127a64                          0.203817   
016d025f7c4996a039244149d64ec0fa                          0.000000   
022cf127719796c6fdf8e0f4976c5330                          0.000000   

user_id                           00abf74ef96b119cf0052a3d86fd4425  \
user_id                                                              
00967afd6e687f2d07cab2e874fb929d                          0.000000   
00abf74ef96b119cf0052a3d86fd4425                          0.800000   
00c64863e80532f599ebdd6e3e127a64                          0.203833   
016d025f7c4996a039244149d64ec0fa                          0.000000   
022cf127719796c6fdf8e0f4976c5330                          0.000000   

user_id           

In [24]:
def recommend_songs_for_user(user_id, data, similarity_df, top_k_users=20, top_k_songs=10):
    """
    Recommend songs based on similar users' listening history.

    Args:
        user_id (str): The target user.
        data (DataFrame): The original Spotify dataset.
        similarity_df (DataFrame): The user similarity matrix.
        top_k_users (int): Number of similar users to consider.
        top_k_songs (int): Number of songs to recommend.

    Returns:
        List of recommended songs.
    """
    if user_id not in similarity_df.index:
        return []

    # Find top-K similar users and get songs listened to by the similar users
    similar_users = similarity_df.loc[user_id].sort_values(ascending=False).iloc[1:top_k_users+1].index
    similar_users_songs = data[data['user_id'].isin(similar_users)]['trackname']

    # Exclude songs already listened to by the target user
    user_songs = set(data[data['user_id'] == user_id]['trackname'])
    recommended_songs = [song for song in similar_users_songs if song not in user_songs]

    return list(pd.Series(recommended_songs).value_counts().index[:top_k_songs])

sample_user = train_data['user_id'].sample(1).values[0]
recommended_songs = recommend_songs_for_user(sample_user, train_data, final_similarity_matrix)
print(f"Recommended songs for user {sample_user}: {recommended_songs}")


Recommended songs for user be59c3f44f06390364dfb5653d2edb25: ['Only Girl (In The World)', 'Danza Kuduro', 'Bad Romance', 'Chandelier', 'Diamonds', 'Around The World', 'Stay', 'Memories (feat. Kid Cudi)', 'Roar', 'Bounce - Radio Edit']


In [31]:
def get_similar_users(favorite_songs, interaction_matrix, similarity_matrix, top_k_users=10):
    # Find users who listened to at least one of the favorite songs
    relevant_users = interaction_matrix.loc[:, interaction_matrix.columns.isin(favorite_songs)].sum(axis=1)
    relevant_users = relevant_users[relevant_users > 0]  # Keep only users who listened to at least one song

    if relevant_users.empty:
        print("No users found with similar song preferences.")
        return pd.Index([])  

    # Get top-k most similar users to the generated user profile
    similar_users = similarity_matrix.loc[relevant_users.index].sum(axis=0).sort_values(ascending=False)
    return similar_users.index[:top_k_users]  


def recommend_songs_for_user_profile(favorite_songs, interaction_matrix, similarity_matrix, top_k_users=10, top_k_songs=10):
    similar_users = get_similar_users(favorite_songs, interaction_matrix, similarity_matrix, top_k_users)

    if similar_users.empty:  
        return ["No recommendations found. Try adding more favorite songs."]

    similar_users_songs = interaction_matrix.loc[similar_users].sum(axis=0)
    recommended_songs = similar_users_songs.drop(index=favorite_songs, errors='ignore')
    return recommended_songs.sort_values(ascending=False).index[:top_k_songs]

# Test 
favorite_songs = ["Shape of You", "Blinding Lights", "Bohemian Rhapsody", "Someone Like You"]
recommended_songs = recommend_songs_for_user_profile(favorite_songs, interaction_matrix, final_similarity_matrix)

print("Recommended Songs:", recommended_songs)

Recommended Songs: Index(['Hold On', 'Runaway', 'Kids', 'Closer', 'One', 'Afterlife', 'Dreams',
       'Crazy', 'Intro', 'Drive'],
      dtype='object', name='trackname')
