In [20]:
import numpy as np
import pandas as pd
import json
import os

import spotipyAPI

user_playlist = spotipyAPI.user_playlist

# Set path to the folder containing JSON files
json_folder_path = "data"
n_files = 1 #number of files to read

# Get list of JSON file paths
json_file_paths = [os.path.join(json_folder_path, filename) for filename in os.listdir(json_folder_path) if filename.endswith(".json")]

# Read the first n_files JSON files and store in a list
playlists_list = []
for i in range(n_files):
    with open(json_file_paths[i], "r") as f:
        data = json.load(f)
        playlists = data["playlists"]
        playlists_list.extend(playlists)

df = pd.DataFrame(playlists_list)

In [21]:
#drop unnecessary columns
df = df.drop(columns=['collaborative', 'modified_at', 'num_followers', 'num_edits', 'description'])

In [22]:
display(df)

Unnamed: 0,name,pid,num_tracks,num_albums,tracks,duration_ms,num_artists
0,Throwbacks,0,52,47,"[{'pos': 0, 'artist_name': 'Missy Elliott', 't...",11532414,37
1,Awesome Playlist,1,39,23,"[{'pos': 0, 'artist_name': 'Survivor', 'track_...",11656470,21
2,korean,2,64,51,"[{'pos': 0, 'artist_name': 'Hoody', 'track_uri...",14039958,31
3,mat,3,126,107,"[{'pos': 0, 'artist_name': 'Camille Saint-Saën...",28926058,86
4,90s,4,17,16,"[{'pos': 0, 'artist_name': 'The Smashing Pumpk...",4335282,16
...,...,...,...,...,...,...,...
995,old,995,41,40,"[{'pos': 0, 'artist_name': 'Katrina', 'track_u...",9917901,36
996,Daze,996,17,17,"[{'pos': 0, 'artist_name': 'PARTYNEXTDOOR', 't...",3699248,15
997,rap,997,119,98,"[{'pos': 0, 'artist_name': 'LoveRance', 'track...",27538723,82
998,Country,998,108,75,"[{'pos': 0, 'artist_name': 'Hunter Hayes', 'tr...",24950143,40


In [23]:
# Create empty lists to store track and interaction information
tracks = []
interactions = []

# Initialize a dictionary to store track_uri and its corresponding track_id
track_dict = {}

# Loop through each row in the DataFrame
for index, row in df.iterrows():
    # Loop through each track in the playlist
    for track in row['tracks']:
        # Check if the track_uri has already been assigned a track_id
        if track['track_uri'] in track_dict:
            # If so, assign the existing track_id to this track
            track_id = track_dict[track['track_uri']]
            
        # Otherwise, create a new track_id and add it to the dictionary    
        else:
            if len(track_dict) == 0:
                track_id = 0
            else:
                track_id = len(track_dict)      
                      
            track_dict[track['track_uri']] = track_id
        
        # Add the track_id to this track's dictionary
        track['track_id'] = track_id
        
        # Add the track and interaction information to the lists
        tracks.append({
            'pid': row['pid'],
            'track_id': track['track_id'],
            'track_name': track['track_name'],
            'track_uri': track['track_uri'],
            'album_name': track['album_name'],
            'album_uri': track['album_uri'],
            'artist_name': track['artist_name'],
            'artist_uri': track['artist_uri'],
            'duration_ms': track['duration_ms'],
        })
        interactions.append({
            'pid': row['pid'],
            'track_id': track['track_id'],
            'rating': 1,
        })


# Create DataFrames from the lists of tracks and interactions
df_tracks = pd.DataFrame(tracks)
df_interactions = pd.DataFrame(interactions)


In [24]:
# Add user_playlist tracks to the DataFrame
user_tracks = []
user_playlist_pid = len(df)  # Use a new pid for user_playlist
for track in user_playlist:
    track_uri = track['uri']
    if track_uri in track_dict:
        track_id = track_dict[track_uri]
    else:
        track_id = len(track_dict)
        track_dict[track_uri] = track_id

    user_tracks.append({
        'pid': user_playlist_pid,
        'track_id': track_id,
        'track_name': track['name'],
        'track_uri': track['uri'],
        'artist_name': track['artist'],
        'rating': 1,
    })
df_user_tracks = pd.DataFrame(user_tracks)
df_interactions = pd.concat([df_interactions, df_user_tracks], ignore_index=True)
df_tracks = pd.concat([df_tracks, df_user_tracks], ignore_index=True)

In [26]:
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import pairwise_distances

#create sparse matrix containing ratings for each pid and track_uri
sparse_matrix = csr_matrix((df_interactions['rating'], (df_interactions['pid'], df_interactions['track_id'])))

# Convert sparse matrix to dense matrix
dense_matrix = sparse_matrix.toarray()

# Calculate Jaccard similarity matrix
user_distances = pairwise_distances(dense_matrix, metric='jaccard')




In [27]:
# number of neighbors to consider
k = 5

# Find k nearest neighbors for user_id
user_id = 1000
user_distances_sorted = np.argsort(user_distances[user_id])[1:k+1]

# Retrieve track_ids of recommended items from nearest neighbors
recommended_track_ids = set()
for neighbor_id in user_distances_sorted:
    neighbor_tracks = df_interactions[df_interactions['pid'] == neighbor_id]['track_id']
    recommended_track_ids.update(neighbor_tracks)

# Exclude tracks already rated by the target user
rated_tracks = df_interactions[df_interactions['pid'] == user_id]['track_id']
recommended_track_ids = recommended_track_ids - set(rated_tracks)

# Limit the recommended tracks to the desired number (10 in this case)
recommended_track_ids = list(recommended_track_ids)[:10]

# Print user ID and their playlist
print(f"User ID: {user_id}")
print("Playlist:")
for track_id in rated_tracks[:10]:
    track_name = df_tracks.loc[df_tracks['track_id'] == track_id, 'track_name'].values[0]
    artist_name = df_tracks.loc[df_tracks['track_id'] == track_id, 'artist_name'].values[0]
    print(f"- {track_id}: {track_name}, from {artist_name}")

# Print recommended track IDs and their names
print("Recommended tracks:")
for track_id in recommended_track_ids:
    track_name = df_tracks.loc[df_tracks['track_id'] == track_id, 'track_name'].values[0]
    artist_name = df_tracks.loc[df_tracks['track_id'] == track_id, 'artist_name'].values[0]
    track_pid = None
    for pid in user_distances_sorted:
        if track_id in df_interactions[df_interactions['pid'] == pid]['track_id'].values:
            track_pid = pid
            break
    print(f"- {track_id}: {track_name}, from {artist_name}. pid = {track_pid}")
    


User ID: 1000
Playlist:
- 34443: Complicated, from Avril Lavigne
- 3387: Bring Me To Life, from Evanescence
- 12061: Oops!...I Did It Again, from Britney Spears
- 26338: Everybody (Backstreet's Back) - Radio Edit, from Backstreet Boys
- 34444: Boulevard of Broken Dreams, from Green Day
- 34445: Smack That, from Eminem
- 12060: ...Baby One More Time, from Britney Spears
- 34446: Irreplaceable, from Beyoncé
- 8690: Gangsta's Paradise, from Coolio
- 4103: Ridin', from Chamillionaire
Recommended tracks:
- 15361: Crush, from David Archuleta. pid = 747
- 3074: Hall of Fame, from The Script. pid = 262
- 1027: Halo, from Beyoncé. pid = 262
- 2: Crazy In Love, from Beyoncé. pid = 747
- 5: Yeah!, from Usher. pid = 747
- 4102: Gold Digger, from Kanye West. pid = 747
- 3: Rock Your Body, from Justin Timberlake. pid = 38
- 7: Buttons, from The Pussycat Dolls. pid = 944
- 28677: I'm a Slave 4 U, from Britney Spears. pid = 944
- 12: Beautiful Soul, from Jesse McCartney. pid = 663
