In [120]:
#import tensorflow as tf

#print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))


In [1]:
import numpy as np
import pandas as pd
import json
import os

# Set path to the folder containing JSON files
json_folder_path = "data"
n_files = 1

# Get list of JSON file paths
json_file_paths = [os.path.join(json_folder_path, filename) for filename in os.listdir(json_folder_path) if filename.endswith(".json")]

# Read the first n_files JSON files and store in a list
playlists_list = []
for i in range(n_files):
    with open(json_file_paths[i], "r") as f:
        data = json.load(f)
        #print(f.name, "loaded")
        playlists = data["playlists"]
        playlists_list.extend(playlists)

df = pd.DataFrame(playlists_list)


In [2]:
display(df)

Unnamed: 0,name,collaborative,pid,modified_at,num_tracks,num_albums,num_followers,tracks,num_edits,duration_ms,num_artists,description
0,Throwbacks,false,0,1493424000,52,47,1,"[{'pos': 0, 'artist_name': 'Missy Elliott', 't...",6,11532414,37,
1,Awesome Playlist,false,1,1506556800,39,23,1,"[{'pos': 0, 'artist_name': 'Survivor', 'track_...",5,11656470,21,
2,korean,false,2,1505692800,64,51,1,"[{'pos': 0, 'artist_name': 'Hoody', 'track_uri...",18,14039958,31,
3,mat,false,3,1501027200,126,107,1,"[{'pos': 0, 'artist_name': 'Camille Saint-Saën...",4,28926058,86,
4,90s,false,4,1401667200,17,16,2,"[{'pos': 0, 'artist_name': 'The Smashing Pumpk...",7,4335282,16,
...,...,...,...,...,...,...,...,...,...,...,...,...
995,old,false,995,1507852800,41,40,1,"[{'pos': 0, 'artist_name': 'Katrina', 'track_u...",8,9917901,36,
996,Daze,false,996,1479254400,17,17,1,"[{'pos': 0, 'artist_name': 'PARTYNEXTDOOR', 't...",13,3699248,15,
997,rap,false,997,1410307200,119,98,1,"[{'pos': 0, 'artist_name': 'LoveRance', 'track...",63,27538723,82,
998,Country,false,998,1507939200,108,75,1,"[{'pos': 0, 'artist_name': 'Hunter Hayes', 'tr...",37,24950143,40,


In [3]:
df = df.drop(columns=['collaborative', 'modified_at', 'num_followers', 'num_edits', 'description'])


In [4]:
# Create empty lists to store track and interaction information
tracks = []
interactions = []

# Initialize a dictionary to store track_uri and its corresponding track_id
track_dict = {}

# Loop through each row in the DataFrame
for index, row in df.iterrows():
    # Loop through each track in the playlist
    for track in row['tracks']:
        # Check if the track_uri has already been assigned a track_id
        if track['track_uri'] in track_dict:
            # If so, assign the existing track_id to this track
            track_id = track_dict[track['track_uri']]
        else:
            # Otherwise, create a new track_id and add it to the dictionary
            track_id = len(track_dict) + 1
            track_dict[track['track_uri']] = track_id
        
        # Add the track_id to this track's dictionary
        track['track_id'] = track_id
        
        # Add the track and interaction information to the lists
        tracks.append({
            'pid': row['pid'],
            'track_id': track['track_id'],
            'track_name': track['track_name'],
            'track_uri': track['track_uri'],
            'album_name': track['album_name'],
            'album_uri': track['album_uri'],
            'artist_name': track['artist_name'],
            'artist_uri': track['artist_uri'],
            'duration_ms': track['duration_ms'],
        })
        interactions.append({
            'pid': row['pid'],
            'track_id': track['track_id'],
            'rating': 1,
        })


# Create DataFrames from the lists of tracks and interactions
df_tracks = pd.DataFrame(tracks)
df_interactions = pd.DataFrame(interactions)


In [5]:
import pandas as pd

# Create a DataFrame with all possible combinations of pid and track_id
#all_pids = df_tracks['pid'].unique()
#all_track_ids = df_tracks['track_id'].unique()
#df_all_combinations = pd.DataFrame({'pid': all_pids})
#df_all_combinations['key'] = 1  # Key column to merge on
#df_all_combinations = df_all_combinations.merge(pd.DataFrame({'track_id': all_track_ids, 'key': 1}), on='key')


In [6]:
# merge with interactions dataframe to get the rating values
#df_ratings = pd.merge(df_all_combinations, df_interactions, on=['pid', 'track_id'], how='left')

# fill the missing rating values with 0
#df_ratings['rating'] = df_ratings['rating'].fillna(0)

# keep only pid, track_id and rating columns
#df_ratings = df_ratings[['pid', 'track_id', 'rating']]


In [7]:
#display(df_ratings)

In [8]:
from surprise import accuracy, Dataset, Reader, KNNWithMeans
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate

reader = Reader(rating_scale=(0, 1))

data = Dataset.load_from_df(df_interactions[["pid", "track_id", "rating"]], reader)
print('Dataset loaded!')

# Define similarity measure
sim_options = {'name': 'cosine', 'user_based': False}

# Train the model
model = KNNWithMeans(k=50, sim_options=sim_options)


Dataset loaded!


In [9]:
# Train the algorithm on the dataset
model.fit(data.build_full_trainset())

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x22b37a76d10>

In [13]:
# Get recommendations for a playlist
target_playlist_id = 0
target_track_ids = df_tracks[df_tracks['pid'] == target_playlist_id]['track_id'].unique()
target_playlist_tracks = set(target_track_ids)

all_track_ids = df_tracks['track_id'].unique()
all_unseen_track_ids = np.setdiff1d(all_track_ids, target_track_ids)

# Get the predicted ratings for the unseen tracks
testset = [[target_playlist_id, track_id, 1] for track_id in all_unseen_track_ids]
predictions = model.test(testset)
accuracy.rmse(predictions)

# Get the top 10 recommended track IDs
top_n = sorted(predictions, key=lambda x: x.est, reverse=True)[:10]
recommended_track_ids = [pred.iid for pred in top_n]

# Get the recommended tracks that are not already in the target playlist
recommended_tracks = df_tracks[df_tracks['track_id'].isin(recommended_track_ids)]
recommended_tracks = recommended_tracks[~recommended_tracks['track_id'].isin(target_playlist_tracks)]

print(recommended_tracks)

RMSE: 0.0000
       pid  track_id                                         track_name  \
52       1        52                                   Eye of the Tiger   
53       1        53   Libera Me From Hell (Tengen Toppa Gurren Lagann)   
54       1        54                                      Pokémon Theme   
55       1        55         Concerning Hobbits (The Lord of the Rings)   
56       1        56      The Blood of Cuchulainn (The Boondock Saints)   
57       1        57           He's a Pirate (Pirates of the Caribbean)   
58       1        58  Very Bloody Tears (Castlevania II: Simon's Quest)   
59       1        59                         U.N. Owen Was Her? (Remix)   
60       1        60                            I am the Doctor in Utah   
61       1        61                          The Room Where It Happens   
4848    84        52                                   Eye of the Tiger   
7390   120        52                                   Eye of the Tiger   
7598   121  