In [51]:
import json
import pandas as pd

In [52]:
def load(i, data_dir="data"):
    """Load one 1000-playlist slice file into a DataFrame.

    Parameters
    ----------
    i : int
        Zero-based slice number. Slice ``i`` is read from
        ``<data_dir>/mpd.slice.<i*1000>-<i*1000+999>.json``.
    data_dir : str, optional
        Directory holding the slice files (default ``"data"``).

    Returns
    -------
    pandas.DataFrame
        One row per entry of the file's ``"playlists"`` list.

    Raises
    ------
    FileNotFoundError
        If the slice file does not exist.
    KeyError
        If the JSON document has no ``"playlists"`` key.
    """
    start = i * 1000
    path = f"{data_dir}/mpd.slice.{start}-{start + 999}.json"
    # Context manager: the handle is closed even when JSON parsing fails.
    with open(path, "r", encoding="utf-8") as slice_file:
        data = json.load(slice_file)
    return pd.DataFrame.from_dict(data["playlists"], orient="columns")

In [53]:
# Load the playlist slices that form the training set

def get_train(n_slices=1):
    """Build the training frame from the first ``n_slices`` slice files.

    Parameters
    ----------
    n_slices : int, optional
        How many consecutive slices to read, starting at slice 0. The default
        of 1 reads slice 0 only.

    Returns
    -------
    pandas.DataFrame
        The frames returned by ``load(0)`` .. ``load(n_slices - 1)``
        concatenated in that order; each slice keeps its own row index.

    Raises
    ------
    ValueError
        If ``n_slices`` is smaller than 1 (there would be nothing to
        concatenate).
    """
    if n_slices < 1:
        raise ValueError("n_slices must be at least 1")
    return pd.concat([load(slice_no) for slice_no in range(n_slices)])

In [54]:
# Materialise the training playlists once; get_train_playlist_songs() reads
# this module-level frame.
train = get_train()
#train

In [55]:
#train.loc[train['pid'] == 844]

In [56]:
# get table where each row represents a song within a playlist

def get_train_playlist_songs():
    """Flatten the module-level ``train`` frame to one row per playlist track.

    Each element of the ``tracks`` column becomes its own row, and the fields
    of that track dict are spread into columns placed after the playlist-level
    columns.

    Returns
    -------
    pandas.DataFrame
        Playlist columns (without ``tracks``) followed by the normalised
        track columns, indexed 0..n-1.
    """
    per_track = train.explode('tracks').reset_index(drop=True)
    track_fields = pd.json_normalize(per_track['tracks'])
    playlist_fields = per_track.drop(columns='tracks')
    return pd.concat([playlist_fields, track_fields], axis=1)

In [57]:
# One row per track occurrence in a training playlist; the recommendation
# helpers defined later read this module-level frame.
train_playlist_songs = get_train_playlist_songs()
#train_playlist_songs

In [58]:
#train_playlist_songs.columns.tolist()

In [59]:
# Data exploration
#train_playlist_songs.isnull().sum()

In [60]:
# Number of occurrences of every track URI in the training rows, as a
# two-column frame ('track_uri', 'count') in value_counts order.
count_df = (
    train_playlist_songs['track_uri']
    .value_counts()
    .rename_axis('track_uri')
    .reset_index(name='count')
)

#count_df


In [61]:
# URIs from the first 500 rows of count_df; used as the fallback recommendation.
first_500 = count_df['track_uri'].head(500).tolist()

#first_500

### Use a sample playlist as the user's preferences (TODO: temporary solution)

In [62]:
# Take an example slice to stand in for the user's playlist
def load2(i, data_dir="data"):
    """Load the slice file that starts at playlist ``i * 10000``.

    The start offset advances by 10000 per ``i`` while each file still spans
    1000 playlists, so consecutive ``i`` values select every tenth slice file
    (``i=1`` reads ``mpd.slice.10000-10999.json``).

    Parameters
    ----------
    i : int
        Multiplier for the 10000-playlist stride.
    data_dir : str, optional
        Directory holding the slice files (default ``"data"``).

    Returns
    -------
    pandas.DataFrame
        One row per entry of the file's ``"playlists"`` list.

    Raises
    ------
    FileNotFoundError
        If the slice file does not exist.
    KeyError
        If the JSON document has no ``"playlists"`` key.
    """
    start = i * 10000
    path = f"{data_dir}/mpd.slice.{start}-{start + 999}.json"
    # Context manager: the handle is closed even when JSON parsing fails.
    with open(path, "r", encoding="utf-8") as slice_file:
        data = json.load(slice_file)
    return pd.DataFrame.from_dict(data["playlists"], orient="columns")

#user_PL = []
#user_PL.append(load2(1))
#df_user_PL = pd.concat(user_PL)
#df_user_PL = df_user_PL.iloc[1]
#df_user_PL

#### Select all playlists that contain a track_uri from the user's playlist

In [77]:
# URIs of the user's preferred tracks, as a list

def get_pref_tracksURI_list(df_user_PL):
    """Return the distinct track URIs of a playlist in first-seen order.

    Parameters
    ----------
    df_user_PL : mapping-like
        Anything indexable with ``'tracks'`` that yields an iterable of dicts,
        each holding a ``'track_uri'`` key.

    Returns
    -------
    list of str
        Every ``track_uri`` once, ordered by first appearance; an empty list
        when the playlist has no tracks.
    """
    # dict.fromkeys removes duplicates while keeping insertion order, so the
    # result is reproducible between runs (iterating a set of str is not).
    return list(dict.fromkeys(el['track_uri'] for el in df_user_PL['tracks']))

In [64]:
#pref_tracksURI_list = get_pref_tracksURI_list()
#print("nb of prefered tracks = ", len(pref_tracksURI_list))
#pref_tracksURI_list

# User-based recommendation (based on playlist)

In [65]:
# Find the training playlists that contain the user's preferred tracks
def get_most_relevant(pref_tracksURI_list):
    """Count, per training playlist, the rows that match a preferred track.

    Parameters
    ----------
    pref_tracksURI_list : list of str
        Track URIs to look for in ``train_playlist_songs``.

    Returns
    -------
    pandas.DataFrame
        Columns ``pid`` and ``Frequency`` (number of matching rows for that
        playlist), in ``value_counts`` order.
    """
    is_preferred = train_playlist_songs['track_uri'].isin(pref_tracksURI_list)
    hits_per_playlist = train_playlist_songs.loc[is_preferred, 'pid'].value_counts()
    return hits_per_playlist.rename_axis('pid').reset_index(name='Frequency')

In [66]:
#most_relevant = get_most_relevant()
#print("inique values:\n", most_relevant.nunique())
#most_relevant

In [67]:
# Rank tracks by how often they occur in the training rows
def get_most_freq_songs():
    """Return every track URI with its occurrence count, most frequent first.

    Returns
    -------
    pandas.DataFrame
        Columns ``track_uri`` and ``song_freq`` (count of non-null ``pid``
        values for that track in ``train_playlist_songs``), sorted by
        ``song_freq`` in descending order.
    """
    occurrences = train_playlist_songs.groupby(['track_uri'])['pid'].count()
    per_track = occurrences.reset_index(name='song_freq')
    return per_track.sort_values(by=['song_freq'], ascending=False)

In [68]:
#most_freq_songs = get_most_freq_songs()
#most_freq_songs

## User-based recommendation RESULT

In [90]:
# Recommend tracks taken from the training playlists that share tracks with the input
def get_recommendation(playlist):
    """Recommend up to 500 track URIs for ``playlist``.

    Candidates are the tracks of every training playlist that contains at
    least one of the input playlist's tracks. They are ordered by how many
    input tracks their playlist matches (``playlist_freq``), then by how often
    the track occurs in the training rows (``song_freq``). If fewer than 500
    candidates exist, the list is padded from ``count_df`` in its row order.
    The input playlist's own tracks are never returned, neither as candidates
    nor as padding.

    Parameters
    ----------
    playlist : mapping-like
        Playlist record whose ``'tracks'`` entry is an iterable of dicts with
        a ``'track_uri'`` key.

    Returns
    -------
    list of str
        Distinct track URIs, at most 500. For a playlist without tracks this
        is a copy of ``first_500``.
    """
    pref_tracksURI_list = get_pref_tracksURI_list(playlist)

    # Nothing to match on: fall back to the most frequent tracks. A copy is
    # returned so callers cannot mutate the shared module-level list.
    if len(pref_tracksURI_list) == 0:
        return list(first_500)

    most_relevant = get_most_relevant(pref_tracksURI_list)
    most_freq_songs = get_most_freq_songs()

    # Candidate rows: tracks of the matching playlists, minus the input tracks.
    in_relevant_playlist = train_playlist_songs['pid'].isin(most_relevant['pid'])
    is_seed = train_playlist_songs['track_uri'].isin(pref_tracksURI_list)
    candidates = train_playlist_songs.loc[in_relevant_playlist & ~is_seed, ['pid', 'track_uri']]

    # Attach both frequencies, rank, then keep the best-ranked row per track.
    ranked = (
        candidates
        .merge(most_relevant, on='pid')
        .rename(columns={'Frequency': 'playlist_freq'})
        .merge(most_freq_songs, on='track_uri')
        .sort_values(by=['playlist_freq', 'song_freq'], ascending=False)
        .drop_duplicates(['track_uri'])
    )
    recommended = ranked['track_uri'].head(500).tolist()

    if len(recommended) < 500:
        # Padding must skip the input tracks as well as what is already
        # recommended; otherwise the tracks excluded above could come back.
        already_used = recommended + pref_tracksURI_list
        filler = count_df.loc[~count_df['track_uri'].isin(already_used), 'track_uri']
        recommended.extend(filler.head(500 - len(recommended)).tolist())

    return recommended

In [70]:
#recommended = get_recommendation()
#recommended

# Item-based recommendation (based on tracks)

In [71]:
# Load challenge set

path = "data_challenge/challenge_set.json"
# Read through a context manager so the file handle is closed as soon as the
# JSON has been parsed (the bare open() call left it to the garbage collector).
with open(path, 'r', encoding='utf-8') as challenge_file:
    data = json.load(challenge_file)
data_challenge = pd.DataFrame.from_dict(data['playlists'], orient='columns')

# Kept as a one-element list + concat so both module-level names stay available.
challenge_set = [data_challenge]
df_challenge = pd.concat(challenge_set)

In [91]:
import os

file_name = 'result.csv'
file_number = 1

# Pick the first unused "result_<n>.csv" so an existing result file is never
# written to again.
while os.path.exists(f"{file_name[:-4]}_{file_number}.csv"):
    file_number += 1

new_file_name = f"{file_name[:-4]}_{file_number}.csv"


with open(new_file_name, mode='a', newline='', encoding='utf-8') as file:

    # The header line is written only while the file is still empty.
    if file.tell() == 0:
        file.write('team_info,Bragina_Graff,vdfrtrp@gmail.com\n\n')

    # NOTE(review): only the challenge playlist at position 1004 is processed;
    # presumably a debugging range - confirm before producing a full result file.
    for i in range(1004, 1005):
        playlist = df_challenge.iloc[i]
        recommended = get_recommendation(playlist)
        # Print after the assignment: printing first raised NameError on a
        # fresh kernel, because `recommended` did not exist yet.
        print(len(recommended))

        pid = playlist['pid']

        recommended_str = ', '.join(recommended)
        file.write(f"{pid}, {recommended_str}\n\n")

        

500
