In [14]:
#import tensorflow as tf

#print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))


In [15]:
import numpy as np
import pandas as pd
import json
import os

# Set path to the folder containing JSON files
json_folder_path = "data"
n_files = 1

# Get list of JSON file paths
json_file_paths = [os.path.join(json_folder_path, filename) for filename in os.listdir(json_folder_path) if filename.endswith(".json")]

# Read the first n_files JSON files and store in a list
playlists_list = []
for i in range(n_files):
    with open(json_file_paths[i], "r") as f:
        data = json.load(f)
        #print(f.name, "loaded")
        playlists = data["playlists"]
        playlists_list.extend(playlists)

df = pd.DataFrame(playlists_list)


In [16]:
display(df)

Unnamed: 0,name,collaborative,pid,modified_at,num_tracks,num_albums,num_followers,tracks,num_edits,duration_ms,num_artists,description
0,Throwbacks,false,0,1493424000,52,47,1,"[{'pos': 0, 'artist_name': 'Missy Elliott', 't...",6,11532414,37,
1,Awesome Playlist,false,1,1506556800,39,23,1,"[{'pos': 0, 'artist_name': 'Survivor', 'track_...",5,11656470,21,
2,korean,false,2,1505692800,64,51,1,"[{'pos': 0, 'artist_name': 'Hoody', 'track_uri...",18,14039958,31,
3,mat,false,3,1501027200,126,107,1,"[{'pos': 0, 'artist_name': 'Camille Saint-Saën...",4,28926058,86,
4,90s,false,4,1401667200,17,16,2,"[{'pos': 0, 'artist_name': 'The Smashing Pumpk...",7,4335282,16,
...,...,...,...,...,...,...,...,...,...,...,...,...
995,old,false,995,1507852800,41,40,1,"[{'pos': 0, 'artist_name': 'Katrina', 'track_u...",8,9917901,36,
996,Daze,false,996,1479254400,17,17,1,"[{'pos': 0, 'artist_name': 'PARTYNEXTDOOR', 't...",13,3699248,15,
997,rap,false,997,1410307200,119,98,1,"[{'pos': 0, 'artist_name': 'LoveRance', 'track...",63,27538723,82,
998,Country,false,998,1507939200,108,75,1,"[{'pos': 0, 'artist_name': 'Hunter Hayes', 'tr...",37,24950143,40,


In [17]:
df = df.drop(columns=['collaborative', 'modified_at', 'num_followers', 'num_edits', 'description'])


In [18]:
# Create empty lists to store track and interaction information
tracks = []
interactions = []

# Initialize a dictionary to store track_uri and its corresponding track_id
track_dict = {}

# Loop through each row in the DataFrame
for index, row in df.iterrows():
    # Loop through each track in the playlist
    for track in row['tracks']:
        # Check if the track_uri has already been assigned a track_id
        if track['track_uri'] in track_dict:
            # If so, assign the existing track_id to this track
            track_id = track_dict[track['track_uri']]
        else:
            # Otherwise, create a new track_id and add it to the dictionary
            track_id = len(track_dict) + 1
            track_dict[track['track_uri']] = track_id
        
        # Add the track_id to this track's dictionary
        track['track_id'] = track_id
        
        # Add the track and interaction information to the lists
        tracks.append({
            'pid': row['pid'],
            'track_id': track['track_id'],
            'track_name': track['track_name'],
            'track_uri': track['track_uri'],
            'album_name': track['album_name'],
            'album_uri': track['album_uri'],
            'artist_name': track['artist_name'],
            'artist_uri': track['artist_uri'],
            'duration_ms': track['duration_ms'],
        })
        interactions.append({
            'pid': row['pid'],
            'track_id': track['track_id'],
            'rating': 1,
        })


# Create DataFrames from the lists of tracks and interactions
df_tracks = pd.DataFrame(tracks)
df_interactions = pd.DataFrame(interactions)


In [19]:
display(df_interactions)

Unnamed: 0,pid,track_id,rating
0,0,1,1
1,0,2,1
2,0,3,1
3,0,4,1
4,0,5,1
...,...,...,...
67498,999,2077,1
67499,999,34440,1
67500,999,34441,1
67501,999,34442,1


In [21]:
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import pairwise_distances

sparse_matrix = csr_matrix((df_interactions['rating'], (df_interactions['pid'], df_interactions['track_id'])))

# Convert sparse matrix to dense matrix
dense_matrix = sparse_matrix.toarray()

# Calculate Jaccard similarity matrix
user_distances = pairwise_distances(dense_matrix, metric='jaccard')




In [22]:
display(dense_matrix.size)

34444000

In [23]:
# Assuming k = 5 (number of neighbors to consider)
k = 1

# Find k nearest neighbors for user with id 0
user_id = 261
user_distances_sorted = np.argsort(user_distances[user_id])[1:k+1]

# Retrieve track_ids of recommended items from nearest neighbors
recommended_track_ids = set()
for neighbor_id in user_distances_sorted:
    neighbor_tracks = df_interactions[df_interactions['pid'] == neighbor_id]['track_id']
    recommended_track_ids.update(neighbor_tracks)

# Exclude tracks already rated by the target user
rated_tracks = df_interactions[df_interactions['pid'] == user_id]['track_id']
recommended_track_ids = recommended_track_ids - set(rated_tracks)

# Limit the recommended tracks to the desired number (10 in this case)
recommended_track_ids = list(recommended_track_ids)[:10]


# Print user ID and their playlist
print(f"User ID: {user_id}")
print("Playlist:")
for track_id in rated_tracks[:10]:
    track_name = df_tracks.loc[df_tracks['track_id'] == track_id, 'track_name'].values[0]
    display(f"- {track_id}: {track_name}")

# Print recommended track IDs and their names
print("Recommended tracks:")
for track_id in recommended_track_ids:
    track_name = df_tracks.loc[df_tracks['track_id'] == track_id, 'track_name'].values[0]
    display(f"- {track_id}: {track_name}")


User ID: 261
Playlist:


'- 1817: Stay'

'- 12055: Get Home (Get Right)'

'- 2931: Show Me Love (feat. Chance The Rapper, Moses Sumney and Robin Hannibal) - Skrillex Remix'

'- 3714: One Dance'

"- 7756: I'm Good"

'- 7781: Mind Your Manners (feat. Icona Pop)'

'- 12056: City Of Stars'

'- 1623: Work from Home'

'- 537: Go Flex'

'- 4756: What They Want'

Recommended tracks:


'- 14858: Fast Car'

'- 2702: Caroline'

'- 18710: Eleven 11: / 11'

"- 28055: I Can't Stop Drinking About You"

'- 3610: Time For That'

'- 1946: Exchange'

'- 22561: Freak Hoe'

'- 1406: You & Me'

'- 1827: Never Be Like You'

'- 4773: No Problem (feat. Lil Wayne & 2 Chainz)'