## Import and read the CSV file into Pandas DataFrame

In [1]:
import json
import pandas as pd

In [3]:
def load(i):
    start = i*1000
    path = "data/mpd.slice." + str(start) + "-" + str(start+999) + ".json"
    data = json.load(open(path,'r'))
    return pd.DataFrame.from_dict(data['playlists'], orient='columns')

---
# Choose one of the following load options:

## (OLD) Load data to table where each row represents a song within a playlist

In [3]:
# get tracks from playlists 

n = 2

def get_train():
    songPlaylistArray = []
    start = 0
    while start != n:
         thisSlice = load(start)
         songPlaylistArray.append(thisSlice)
         start+= 1
    return pd.concat(songPlaylistArray)

In [None]:
#train = get_train()
#train

In [None]:
# get table where each row represents a song within a playlist

def get_train_playlist_songs():
    exploded_train = train.explode('tracks')
    exploded_train.reset_index(drop=True, inplace=True)
    normalized_tracks = pd.json_normalize(exploded_train['tracks'])
    return pd.concat([exploded_train.drop(columns='tracks'), normalized_tracks], axis=1)

In [None]:
#train_playlist_songs = get_train_playlist_songs()

## Load data to table where each row represents a song within a playlist

In [4]:
# get table where each row represents a playlist with 1 song
# combination of get_train and get_train_playlist_songs

n = 1

def get_train_playlist_songs_combined():
    start = 0
    concatenated_data = None
    
    while start != n:
        this_slice = load(start)

        if concatenated_data is None:
            concatenated_data = this_slice
        else:
            concatenated_data = pd.concat([concatenated_data, this_slice])
        start += 1
    
    exploded_train = concatenated_data.explode('tracks')
    exploded_train.reset_index(drop=True, inplace=True)
    normalized_tracks = pd.json_normalize(exploded_train['tracks'])
    
    return pd.concat([exploded_train.drop(columns='tracks'), normalized_tracks], axis=1)


In [5]:
#train_playlist_songs = get_train_playlist_songs_combined()
#train_playlist_songs

## Load data to table where each row represents a song within a playlist, and which contains only pid and track_uri

In [6]:
# get table where each row represents a playlist with 1 song
# contains only pid and track_uri

import time

n = 1

def get_train_playlist_songs_combined_small():
    start = 0
    concatenated_data = None
    
    start_time = time.time()
    while start != n:
        this_slice = load(start)

        this_slice = this_slice[['pid', 'tracks']]

        exploded_slice = this_slice.explode('tracks')
        exploded_slice.reset_index(drop=True, inplace=True)
        normalized_tracks = pd.json_normalize(exploded_slice['tracks'])
        normalized_tracks = normalized_tracks[['track_uri']]

        playlist_songs = pd.concat([exploded_slice.drop(columns='tracks'), normalized_tracks], axis=1)

        if concatenated_data is None:
            concatenated_data = playlist_songs
        else:
            concatenated_data = pd.concat([concatenated_data, playlist_songs])
        start += 1

        if(start % 1 == 0):
            end_time = time.time()
            execution_time = end_time - start_time
            print(f"playlist {start*1000} loaded in {execution_time:.2f} seconds")
            start_time = time.time()

    print("all playlists loaded")

    concatenated_data.reset_index(drop=True, inplace=True)

    return concatenated_data

In [7]:
#train_playlist_songs = get_train_playlist_songs_combined_small()
#len(train_playlist_songs)

## Load data to table where each row represents a song within a playlist, and which contains only pid and track_uri from pickle file

In [None]:
# restore data from pickle file

#train_playlist_songs = pd.read_pickle("my_data.pkl")
#len(train_playlist_songs) 

---
## Save data to pickle file

In [None]:
# save data to pickle file

#train_playlist_songs.to_pickle("my_data.pkl")

---
## User-based recommendation (based on playlist)

In [18]:
# URI's of user's prefered tracks as list

def get_pref_tracksURI_list(df_user_PL):
    pref_tracksURI = [el['track_uri'] for el in df_user_PL['tracks']]

    # Remove doublons
    pref_tracksURI_list = list(set(pref_tracksURI))
    return pref_tracksURI_list

In [6]:
# get table where each row represents a song with number of times it appears in playlists

count_df = train_playlist_songs['track_uri'].value_counts().reset_index()

count_df.columns = ['track_uri', 'count']

In [7]:
# 500 most popular songs 

def get_first_tracks(n):
    return count_df[:n]['track_uri'].tolist()

#first_500

In [8]:
# Select playlists contained user's prefered tracks
def get_most_relevant(pref_tracksURI_list):
    pref_in_train = train_playlist_songs.loc[train_playlist_songs['track_uri'].isin(pref_tracksURI_list)]
    pref_in_train

    # Range them according to relevance
    most_relevant = pref_in_train['pid'].value_counts().rename_axis('pid').reset_index(name='Frequency')
    return most_relevant

In [9]:
# Range by most frequent songs
def get_most_freq_songs():
    most_freq_songs = pd.DataFrame(train_playlist_songs.groupby(['track_uri'])['pid'].count()).reset_index().rename(columns= {'pid':'song_freq'}).sort_values(by=['song_freq'], ascending=False)
    return most_freq_songs

most_freq_songs = get_most_freq_songs()

---
## User-based recommendation function

In [19]:
# Make list of playlists in range of relevance
def get_recommendation(playlist, n):
    pref_tracksURI_list = get_pref_tracksURI_list(playlist)
    
    # if initial playlist is empty, return 500 most frequent tracks
    if len(pref_tracksURI_list) == 0:
        return get_first_tracks(n)
    
    most_relevant = get_most_relevant(pref_tracksURI_list)

    pids = most_relevant['pid'].tolist()

    # Exclude preferencies from proposition
    sans_preferences = train_playlist_songs[train_playlist_songs.pid.isin(pids) & ~train_playlist_songs.track_uri.isin(pref_tracksURI_list)][['pid', 'track_uri']]

    # Add playlist frequency info
    new_one = sans_preferences.merge(most_relevant, left_on='pid', right_on='pid').rename(columns= {'Frequency':'playlist_freq'})

    # Add track frequency info, sort by playlist_freq first then by song_freq
    new_one_1 = new_one.merge(most_freq_songs, left_on='track_uri', right_on='track_uri').sort_values(by=['playlist_freq', 'song_freq'], ascending=False)

    # Exclude doublons from proposition
    sans_doublons = new_one_1[['track_uri', 'playlist_freq', 'song_freq']].drop_duplicates(['track_uri'])

    # First 500 tracks
    recommended = sans_doublons[:n]['track_uri'].tolist()
    
    if len(recommended) < n:
        songs_not_in_list = count_df[~count_df['track_uri'].isin(recommended)]['track_uri']
        songs_to_add = songs_not_in_list.head(n - len(recommended)).tolist()
        recommended.extend(songs_to_add)
            
    return recommended

In [None]:
#recommended = get_recommendation()
#recommended

---
## Using User-based recommendation with challenge dataset

In [21]:
# Load challenge set

path = "data_challenge/challenge_set.json"
data = json.load(open(path,'r'))
data_challenge = pd.DataFrame.from_dict(data['playlists'], orient='columns')

challenge_set = [data_challenge]
df_challenge = pd.concat(challenge_set)

In [None]:
# Make recommendation for each playlist in challenge set, write to file

import os

file_name = 'result.csv'
file_number = 1

while os.path.exists(f"{file_name[:-4]}_{file_number}.csv"):
    file_number += 1

new_file_name = f"{file_name[:-4]}_{file_number}.csv"


with open(new_file_name, mode='a', newline='', encoding='utf-8') as file:

    if file.tell() == 0:
        file.write('team_info,Bragina_Graff,vdfrtrp@gmail.com\n\n')

    for i in range(1004, 1005):
        playlist = df_challenge.iloc[i]

        recommended = get_recommendation(playlist)
        
        pid = playlist['pid']
        
        recommended_str = ', '.join(recommended)
        file.write(f"{pid}, {recommended_str}\n\n")

        if(i%100 == 0):
            print(f"playlist {i} was processed")

        

---
## Using User-based recommendation with 1 track 

In [32]:
def evaluate_algorithm(algorithm, playlists):
    # Divide each playlist into 2 parts: first 80% and last 20%
    parts = []
    for playlist in playlists:
        parts.append(playlist[:int(len(playlist)*0.8)])
        parts.append(playlist[int(len(playlist)*0.8):])

    print(parts)

    # Make recommendation for first 80% and compare to last 20%
    # Calculate precision and recall
    # Calculate average precision and recall for all playlists
    # Calculate F1 score
    # Return average F1 score

playlists = [load(10).iloc[0]]

algorithm = lambda playlist, n: get_recommendation(playlist, n)
evaluate_algorithm(algorithm, playlists)

[name                                                          Funk
collaborative                                                false
pid                                                          10000
modified_at                                             1470355200
num_tracks                                                      16
num_albums                                                      15
num_followers                                                    1
tracks           [{'pos': 0, 'artist_name': 'Coldplay', 'track_...
num_edits                                                        4
Name: 0, dtype: object, duration_ms    4007017
num_artists         11
description        NaN
Name: 0, dtype: object]
