## Import libraries and read the JSON slice files into a Pandas DataFrame

In [None]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
import math

In [None]:
def load(i):
    """Read slice file number ``i`` into a DataFrame of playlists.

    The file path is ``data/mpd.slice.<start>-<start+999>.json`` with
    ``start = i * 1000``. The ``'playlists'`` entry of the parsed JSON is
    turned into a DataFrame (one row per entry, one column per key).

    The file is opened in a ``with`` block so the handle is closed right
    after parsing, and it is decoded as UTF-8 regardless of the platform's
    default encoding.

    Args:
        i: zero-based slice number.

    Returns:
        pandas.DataFrame built from ``data['playlists']``.

    Raises:
        FileNotFoundError: if the slice file does not exist.
        KeyError: if the JSON document has no ``'playlists'`` entry.
    """
    start = i * 1000
    path = f"data/mpd.slice.{start}-{start + 999}.json"
    with open(path, 'r', encoding='utf-8') as slice_file:
        data = json.load(slice_file)
    return pd.DataFrame.from_dict(data['playlists'], orient='columns')

---
## Choose one of the following load options:

### (OLD) Load data to table where each row represents a song within a playlist

In [None]:
# get tracks from playlists 

n = 2  # number of slice files that get_train() reads (slices 0 .. n-1)


def get_train():
    """Load the first ``n`` slices and stack them into one DataFrame.

    Calls ``load(i)`` for every ``i`` in ``range(n)`` and concatenates the
    results with ``pd.concat``; the row index of each slice is kept as is.

    ``range(n)`` replaces the manual "count until start == n" loop so that a
    negative ``n`` can no longer make the loop walk past the requested slices.

    Returns:
        pandas.DataFrame with the rows of all loaded slices.

    Raises:
        ValueError: raised by ``pd.concat`` when ``n`` is 0 or negative,
            because there is nothing to concatenate.
    """
    return pd.concat([load(slice_index) for slice_index in range(n)])

# get table where each row represents a song within a playlist

def get_train_playlist_songs():
    """Expand the global ``train`` frame to one row per (playlist, track).

    The ``tracks`` column is exploded, the index is renumbered from 0, and
    the per-track dictionaries are flattened into columns with
    ``pd.json_normalize``. The result holds the remaining playlist columns
    followed by the flattened track columns.
    """
    one_row_per_track = train.explode('tracks').reset_index(drop=True)
    track_fields = pd.json_normalize(one_row_per_track['tracks'])
    playlist_fields = one_row_per_track.drop(columns='tracks')
    return pd.concat([playlist_fields, track_fields], axis=1)

#train = get_train()

#train_playlist_songs = get_train_playlist_songs()

### Load data to table where each row represents a song within a playlist

In [None]:
# get table where each row represents a playlist with 1 song
# combination of get_train and get_train_playlist_songs

n = 1  # number of slice files to load (slices 0 .. n-1)


def get_train_playlist_songs_combined():
    """Load ``n`` slices and return one row per (playlist, track).

    Combines the loading step and the explode/flatten step: all slices are
    loaded with ``load``, concatenated, the ``tracks`` column is exploded,
    the index is renumbered from 0 and the per-track dictionaries are
    flattened into columns with ``pd.json_normalize``.

    The slices are collected in a list and concatenated once; concatenating
    inside the loop would copy the accumulated rows again for every slice.

    Returns:
        pandas.DataFrame holding the playlist columns (without ``tracks``)
        followed by the flattened track columns.

    Raises:
        ValueError: raised by ``pd.concat`` when ``n`` is 0 or negative,
            because there is nothing to concatenate.
    """
    playlists = pd.concat([load(slice_index) for slice_index in range(n)])

    exploded = playlists.explode('tracks').reset_index(drop=True)
    track_columns = pd.json_normalize(exploded['tracks'])

    return pd.concat([exploded.drop(columns='tracks'), track_columns], axis=1)


In [None]:
#train_playlist_songs = get_train_playlist_songs_combined()
#train_playlist_songs

### Load data to table where each row represents a song within a playlist, and which contains only pid and track_uri

In [None]:
# get table where each row represents a playlist with 1 song
# contains only pid and track_uri

import time

n = 1  # number of slice files to load (slices 0 .. n-1)


def get_train_playlist_songs_combined_small():
    """Load ``n`` slices into a two-column (``pid``, ``track_uri``) table.

    Each slice is reduced to ``pid`` and ``tracks`` right after loading,
    exploded to one row per track, and only ``track_uri`` is kept from the
    flattened track data. A progress line with the elapsed time is printed
    after every slice.

    The per-slice frames are collected in a list and concatenated once at
    the end (with a fresh 0..N-1 index) instead of re-concatenating the
    growing result on every iteration.

    Returns:
        pandas.DataFrame with columns ``pid`` and ``track_uri``.

    Raises:
        ValueError: raised by ``pd.concat`` when ``n`` is 0 or negative,
            because there is nothing to concatenate.
    """
    per_slice = []

    for slice_index in range(n):
        started = time.perf_counter()

        this_slice = load(slice_index)[['pid', 'tracks']]
        exploded = this_slice.explode('tracks').reset_index(drop=True)
        track_uris = pd.json_normalize(exploded['tracks'])[['track_uri']]
        per_slice.append(
            pd.concat([exploded.drop(columns='tracks'), track_uris], axis=1)
        )

        elapsed = time.perf_counter() - started
        print(f"playlist {(slice_index + 1) * 1000} loaded in {elapsed:.2f} seconds")

    print("all playlists loaded")

    return pd.concat(per_slice, ignore_index=True)

In [None]:
#train_playlist_songs = get_train_playlist_songs_combined_small()
#len(train_playlist_songs)

### Load data to table where each row represents a song within a playlist, and which contains only playlist name, pid and track_uri

In [None]:
# get table where each row represents a playlist with 1 song
# returns two tables: (pid, track_uri) rows and (pid, name) rows

import time

n = 1  # number of slice files to load (slices 0 .. n-1)


def get_train_playlist_songs_combined_small_2():
    """Load ``n`` slices into a track table and a playlist-name table.

    For every slice the ``pid``/``tracks`` columns are exploded to one row
    per track (keeping only ``track_uri`` from the flattened track data),
    and the ``pid``/``name`` columns are kept as the name table. A progress
    line with the elapsed time is printed after every slice.

    The per-slice frames are collected in lists and concatenated once at
    the end instead of re-concatenating the growing results on every
    iteration. The loaded slice is no longer bound to the name ``slice``,
    which shadowed the builtin.

    Returns:
        tuple ``(result_tracks, result_names)``:
        ``result_tracks`` has columns ``pid`` and ``track_uri`` and a fresh
        0..N-1 index; ``result_names`` has columns ``pid`` and ``name`` and
        keeps the row index of each loaded slice.

    Raises:
        ValueError: raised by ``pd.concat`` when ``n`` is 0 or negative,
            because there is nothing to concatenate.
    """
    track_frames = []
    name_frames = []

    for slice_index in range(n):
        started = time.perf_counter()

        playlists = load(slice_index)
        name_frames.append(playlists[['pid', 'name']])

        exploded = playlists[['pid', 'tracks']].explode('tracks').reset_index(drop=True)
        track_uris = pd.json_normalize(exploded['tracks'])[['track_uri']]
        track_frames.append(
            pd.concat([exploded.drop(columns='tracks'), track_uris], axis=1)
        )

        elapsed = time.perf_counter() - started
        print(f"playlist {(slice_index + 1) * 1000} loaded in {elapsed:.2f} seconds")

    print("all playlists loaded")

    result_tracks = pd.concat(track_frames, ignore_index=True)
    result_names = pd.concat(name_frames)

    return result_tracks, result_names

In [None]:
#train_playlist_songs, train_names = get_train_playlist_songs_combined_small_2()
#train_playlist_songs, train_names

### Load data to table where each row represents a song within a playlist, and which contains only pid and track_uri from pickle file

In [None]:
# restore data from pickle file

#train_playlist_songs = pd.read_pickle("my_data.pkl")
#len(train_playlist_songs) 

---
## Save data to pickle file

In [None]:
# save data to pickle file

#train_playlist_songs.to_pickle("my_data.pkl")

---
## Divide dataset to train and evaluate

In [None]:
# Divide data into train and evaluate

# Split on playlist ids (not on rows) so that all tracks of a playlist end
# up on the same side of the split.
unique_playlists = train_playlist_songs['pid'].unique()

train_playlists, evaluate_playlists = train_test_split(unique_playlists, test_size=0.2, random_state=42)

train = train_playlist_songs[train_playlist_songs['pid'].isin(train_playlists)]
evaluate = train_playlist_songs[train_playlist_songs['pid'].isin(evaluate_playlists)]

# NOTE(review): while the next line stays commented out, train_playlist_songs
# still contains the evaluation playlists, so anything computed from it also
# sees the held-out playlists -- confirm whether that is intended.
#train_playlist_songs = train

# The evaluation cells read `evaluate_playlists_songs`; define it here so they
# do not fail with a NameError.
evaluate_playlists_songs = evaluate

---
# Method 1: Cold start recommendation

In [None]:
# get table where each row represents a song with number of times it appears in playlists

# Occurrences of every track URI in train_playlist_songs, most frequent first,
# as a two-column table: 'track_uri' and 'count'.
count_df = (
    train_playlist_songs['track_uri']
    .value_counts()
    .rename_axis('track_uri')
    .reset_index(name='count')
)

In [None]:
# n most popular songs (the count is the function argument)

def get_first_tracks(n):
    """Return the ``track_uri`` values of the first ``n`` rows of ``count_df`` as a list."""
    leading_uris = count_df['track_uri'].iloc[:n]
    return leading_uris.tolist()

#first_500

---
# Method 2: User-based recommendation (based on playlist)

### Auxiliary functions

In [None]:
# Range by most frequent songs
def get_most_freq_songs():
    """Count the rows of ``train_playlist_songs`` per track URI.

    Returns a DataFrame with columns ``track_uri`` and ``song_freq`` (the
    number of non-null ``pid`` values for that URI), sorted by ``song_freq``
    in descending order.
    """
    rows_per_track = train_playlist_songs.groupby(['track_uri'])['pid'].count()
    frequency_table = rows_per_track.reset_index(name='song_freq')
    return frequency_table.sort_values(by=['song_freq'], ascending=False)

most_freq_songs = get_most_freq_songs()
#most_freq_songs

In [None]:
# URI's of user's prefered tracks as list

def get_pref_tracksURI_list(df_user_PL):
    """Return the distinct track URIs of one playlist, in first-seen order.

    Args:
        df_user_PL: mapping-like playlist record (for example a DataFrame
            row) whose ``'tracks'`` entry is an iterable of dictionaries
            with a ``'track_uri'`` key.

    Returns:
        list of track URIs without duplicates. ``dict.fromkeys`` is used for
        the de-duplication so the order is the order of first appearance and
        is the same on every run (a ``set`` gives an arbitrary order).
    """
    pref_tracksURI = (el['track_uri'] for el in df_user_PL['tracks'])
    return list(dict.fromkeys(pref_tracksURI))

In [None]:
# Select playlists contained user's prefered tracks
def get_most_relevant_playlist(pref_tracksURI_list):
    """Rank the playlists in ``train_playlist_songs`` by overlap with the given tracks.

    Keeps the rows whose ``track_uri`` is in ``pref_tracksURI_list`` and
    counts them per ``pid``.

    Returns:
        DataFrame with columns ``pid`` and ``Frequency`` (number of matching
        rows for that playlist), ordered as produced by ``value_counts``
        (highest count first).
    """
    has_pref_track = train_playlist_songs['track_uri'].isin(pref_tracksURI_list)
    matching_rows = train_playlist_songs[has_pref_track]

    matches_per_playlist = matching_rows['pid'].value_counts()
    return matches_per_playlist.rename_axis('pid').reset_index(name='Frequency')

### Main function

In [None]:
# Make recommendation for a playlist
def get_recommendation_for_playlist(playlist, n):
    """Recommend up to ``n`` track URIs for a playlist given its seed tracks.

    Steps:
      1. With no seed tracks, return the first ``n`` entries of the global
         popularity list (``get_first_tracks``).
      2. Find the playlists that share tracks with the seeds
         (``get_most_relevant_playlist``) and collect their other tracks.
      3. Rank those tracks by the overlap count of the playlist they came
         from (``playlist_freq``), then by their global frequency
         (``song_freq``), both descending, and keep each track once.
      4. If fewer than ``n`` tracks were found, fill up from ``count_df``.
         The fill-up skips tracks that are already recommended AND tracks
         that are among the seeds, so a track the playlist already contains
         is never recommended back.

    Args:
        playlist: sized collection of seed track URIs.
        n: number of tracks wanted.

    Returns:
        list of track URIs, at most ``n`` long.
    """
    # Cold start: nothing to compare against, use overall popularity.
    if len(playlist) == 0:
        return get_first_tracks(n)

    most_relevant = get_most_relevant_playlist(playlist)
    pids = most_relevant['pid'].tolist()

    # Tracks of the overlapping playlists, excluding the seeds themselves.
    in_relevant_playlist = train_playlist_songs['pid'].isin(pids)
    is_seed = train_playlist_songs['track_uri'].isin(playlist)
    candidates = train_playlist_songs.loc[in_relevant_playlist & ~is_seed, ['pid', 'track_uri']]

    # Attach both frequencies and sort by playlist_freq first, then song_freq.
    ranked = (
        candidates
        .merge(most_relevant, on='pid')
        .rename(columns={'Frequency': 'playlist_freq'})
        .merge(most_freq_songs, on='track_uri')
        .sort_values(by=['playlist_freq', 'song_freq'], ascending=False)
    )

    # After sorting, the first row of each track is its best-ranked occurrence.
    distinct = ranked[['track_uri', 'playlist_freq', 'song_freq']].drop_duplicates(['track_uri'])
    recommended = distinct['track_uri'].iloc[:n].tolist()

    if len(recommended) < n:
        popular = count_df['track_uri']
        unavailable = popular.isin(recommended) | popular.isin(playlist)
        recommended.extend(popular[~unavailable].head(n - len(recommended)).tolist())

    return recommended

---
# Method 3: Tracklist title analysis

In [None]:
from sentence_transformers import SentenceTransformer, util
import string

import re
import nltk
# Fetch NLTK data at notebook start.
# NOTE(review): 'all' presumably requests every package in the NLTK index; the
# code visible in this file only uses the wordnet corpus (stopwords is imported
# as well) -- confirm whether a narrower download is enough.
nltk.download('word2vec_sample')
nltk.download('all')
#nltk.download('punkt')            # if an error occurs, these need to be downloaded
#nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.corpus import wordnet
from gensim.models import Word2Vec

### Auxiliary functions

In [None]:
# A little cleaning
def prepare_titles(df_train):
    """Return the cleaned, distinct playlist titles from ``df_train['name']``.

    Cleaning: missing names are dropped, titles are lower-cased, and any mix
    of punctuation and whitespace is stripped from both ends in a single
    pass. Stripping punctuation and whitespace one after the other left
    residue when they were interleaved (``" !hello! "`` became
    ``"!hello!"``). Titles that are empty after cleaning are dropped.

    Args:
        df_train: DataFrame with a ``name`` column of strings.

    Returns:
        list of distinct cleaned titles in order of first appearance, so the
        result is the same on every run.
    """
    edge_chars = string.punctuation + string.whitespace
    cleaned = (
        df_train['name']
        .dropna()
        .str.lower()
        .str.strip(edge_chars)
    )
    cleaned = cleaned[cleaned != '']
    return cleaned.drop_duplicates().tolist()


# Function to calculate similarity score between two words
def get_synonym_score(word1, word2):
    """Score two words by the path similarity of their first WordNet synsets.

    Returns ``None`` when either word has no synsets; otherwise returns
    whatever ``path_similarity`` yields for the first synset of each word.
    """
    first_word_synsets = wordnet.synsets(word1)
    second_word_synsets = wordnet.synsets(word2)

    if not first_word_synsets or not second_word_synsets:
        return None

    return first_word_synsets[0].path_similarity(second_word_synsets[0])

# Calculate similarity between words from both sentences
def get_similarity_between_words(words_sentence1, words_sentence2):
    """Score two sentences by summing the synonym scores of all word pairs.

    Both sentences are split on whitespace. Every word of the first is
    compared with every word of the second via ``get_synonym_score``; pairs
    without a score are skipped. The sum is divided by the word count of
    the longer sentence.

    Args:
        words_sentence1: first sentence as a string.
        words_sentence2: second sentence as a string.

    Returns:
        The normalised score. ``0.0`` when neither sentence contains a word
        (previously this case raised ``ZeroDivisionError``).
    """
    first_words = words_sentence1.split()
    second_words = words_sentence2.split()

    longest = max(len(first_words), len(second_words))
    if longest == 0:
        # No words on either side: nothing to compare, and nothing to divide by.
        return 0.0

    total = 0
    for word1 in first_words:
        for word2 in second_words:
            similarity_score = get_synonym_score(word1, word2)
            if similarity_score is not None:
                total += similarity_score

    return total / longest


In [None]:
# Sentence-embedding model used to compare playlist titles.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Cleaned, de-duplicated playlist titles.
list_titles = prepare_titles(train_names)

# (title, embedding) pairs; every title is encoded with its own encode() call.
encoded_titles_pairs = list(zip(list_titles, map(model.encode, list_titles)))

### Main functions

In [None]:
# Find <k> the most similare titles of playlists for <title>
def find_similar_titles(title, k):
    """Find up to ``k`` known playlist titles most similar to ``title``.

    ``title`` is encoded with the module-level ``model`` and compared by
    cosine similarity with every entry of ``encoded_titles_pairs``. Only
    titles scoring above 0.5 are kept.

    The module-level ``model`` is reused on purpose: it is the same
    'all-MiniLM-L6-v2' model that produced ``encoded_titles_pairs``, and
    constructing a new SentenceTransformer on every call only repeats the
    model loading.

    Args:
        title: title text to look up.
        k: maximum number of titles to return.

    Returns:
        dict mapping title -> cosine score, ordered from most to least
        similar; truncated to the first ``k`` entries when more than ``k``
        titles pass the threshold. A summary line is printed either way.
    """
    sample_embed = model.encode(title, convert_to_tensor=True)

    scores = {}
    for known_title, known_embed in encoded_titles_pairs:
        # Compute the cosine similarity once and reuse the scalar.
        score = util.cos_sim(sample_embed, known_embed).item()
        if score > 0.5:
            scores[known_title] = score

    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)

    if k < len(ranked):
        print(f"Get first {k} simular titles from {len(ranked)} with similarity score > 0.5:")
        return dict(ranked[:k])

    print(f"{len(ranked)} Simular titles from with similarity score > 0.5:")
    return dict(ranked)


# Find in titles <=k synonymes of title
def get_similarity_for_title(title, titles, k):
    """Find up to ``k`` entries of ``titles`` that score above 0.5 against ``title``.

    Every candidate is scored with ``get_similarity_between_words``; scores
    that are ``None`` or not above 0.5 are discarded. A summary line is
    printed, and the surviving candidates are returned as a dict
    (candidate -> score) ordered from highest to lowest score, cut to the
    first ``k`` entries when more than ``k`` survived.
    """
    scored = {}
    for candidate in titles:
        score = get_similarity_between_words(title, candidate)
        if score is None or not (score > 0.5):
            continue
        scored[candidate] = score

    ranked = sorted(scored.items(), key=lambda pair: pair[1], reverse=True)

    if k < len(ranked):
        print(f"Get first {k} synonyme titles from {len(ranked)} with similarity score > 0.5:")
        return dict(ranked[0:k])

    print(f"{len(ranked)} synonyme titles from with similarity score > 0.5:")
    return dict(ranked)


In [None]:
def get_recommendation_by_title(pid, k):
    """Look up the name of playlist ``pid`` in ``train_names`` and return its similar titles.

    The first ``name`` among the rows whose ``pid`` matches is passed to
    ``find_similar_titles`` together with ``k``; that function's result is
    returned unchanged.
    """
    matches_pid = train_names['pid'] == pid
    title = train_names.loc[matches_pid, 'name'].iloc[0]
    return find_similar_titles(title, k)

get_recommendation_by_title(4, 5)

In [None]:
# Sample query words for trying out both title-similarity approaches.
pref1 = ['dance']
pref2 = ['uplift']
pref3 = ['Groovin']

list_titles = prepare_titles(train_names)

# find_similar_titles takes (title, k); it compares against the precomputed
# encoded_titles_pairs, so the title list is not passed in.
rez = find_similar_titles(pref1[0], 10)
print(rez)
print()

# The WordNet-based variant does take the candidate titles explicitly.
similarity_list = get_similarity_for_title(pref1[0], list_titles, 10)
print(similarity_list)

---
## Efficiency calculation

In [None]:
def calculate_dcg(recommended, evaluations):
    """Return a normalised discounted cumulative gain for a ranked list.

    Each recommended track has relevance 1 if it is in ``evaluations`` and 0
    otherwise. Relevance used to be the track's position inside
    ``evaluations``, which gave the first held-out track a relevance of 0
    and made the score depend on the arbitrary order of ``evaluations``.

    The gain at rank 1 is undiscounted; at rank ``p >= 2`` it is divided by
    ``log2(p)``. The sum is divided by the gain of a list of the same length
    in which every entry is relevant, so the result lies in [0, 1].

    Args:
        recommended: ranked sequence of track URIs.
        evaluations: collection of held-out (relevant) track URIs.

    Returns:
        float score; ``0.0`` for an empty ``recommended`` (previously an
        ``IndexError``).
    """
    if len(recommended) == 0:
        return 0.0

    relevant = set(evaluations)  # O(1) membership instead of scanning a list

    dcg = 0.0
    ideal = 0.0
    for position, track in enumerate(recommended, start=1):
        discount = 1.0 if position == 1 else 1.0 / math.log2(position)
        ideal += discount
        if track in relevant:
            dcg += discount

    return dcg / ideal

---
## Using User-based recommendation with challenge dataset

In [None]:
# Load challenge set

# Read the challenge set; the with-block closes the file right after parsing
# and the file is decoded as UTF-8 regardless of the platform default.
path = "data_challenge/challenge_set.json"
with open(path, 'r', encoding='utf-8') as challenge_file:
    data = json.load(challenge_file)
data_challenge = pd.DataFrame.from_dict(data['playlists'], orient='columns')

# Kept as a list so further frames could be appended before concatenating.
challenge_set = [data_challenge]
df_challenge = pd.concat(challenge_set)

In [None]:
# Make recommendation for each playlist in challenge set, write to file

import os

file_name = 'result.csv'
file_number = 1

# Pick the first result_<k>.csv name that is not taken yet.
while os.path.exists(f"{file_name[:-4]}_{file_number}.csv"):
    file_number += 1

new_file_name = f"{file_name[:-4]}_{file_number}.csv"


with open(new_file_name, mode='a', newline='', encoding='utf-8') as file:

    # Write the header line only when the file is still empty.
    if file.tell() == 0:
        file.write('team_info,Bragina_Graff,vdfrtrp@gmail.com\n\n')

    for i in range(1004, 1005):
        playlist = df_challenge.iloc[i]

        # The recommender expects the list of seed track URIs, not the whole
        # challenge row, so extract them from the row's 'tracks' entry first.
        seed_tracks = get_pref_tracksURI_list(playlist)
        recommended = get_recommendation_for_playlist(seed_tracks, 500)

        pid = playlist['pid']

        recommended_str = ', '.join(recommended)
        file.write(f"{pid}, {recommended_str}\n\n")

        if(i%100 == 0):
            print(f"playlist {i} was processed")

        

---
## Evaluate

In [None]:
def apply_algorithm(row, algorithm):
    """Score one playlist: hide 20% of its tracks and try to recommend them.

    The playlist's tracks are split 80/20 with a fixed random state. The
    80% part is passed to ``algorithm`` (asking for 500 tracks) and the
    recommendations are compared with the hidden 20%.

    Args:
        row: playlist record whose ``'tracks'`` entry is a list of track URIs.
        algorithm: callable ``(seed_tracks, n) -> list of track URIs``.

    Returns:
        tuple ``(precision, recall, dcgs)`` where precision is hits divided
        by the number of recommendations, recall is hits divided by the
        number of hidden tracks, and dcgs comes from ``calculate_dcg``.
        ``(0.0, 0.0, 0.0)`` when the algorithm returns nothing (previously a
        ``ZeroDivisionError``).
    """
    tracks = row['tracks']

    tests, evaluations = train_test_split(tracks, test_size=0.2, random_state=42)

    recommended = algorithm(tests, 500)

    # Nothing recommended: no hit is possible and there is nothing to rank.
    if len(recommended) == 0:
        return 0.0, 0.0, 0.0

    hidden = set(evaluations)  # O(1) membership instead of scanning a list
    hits = sum(1 for item in recommended if item in hidden)

    precision = hits / len(recommended)
    recall = hits / len(evaluations)
    dcgs = calculate_dcg(recommended, evaluations)

    return precision, recall, dcgs

def evaluate_algorithm(algorithm, tracks):
    """Run ``apply_algorithm`` on every row of ``tracks`` and print the mean metrics.

    Args:
        algorithm: recommender passed through to ``apply_algorithm``.
        tracks: DataFrame with one playlist per row.

    Prints the mean precision, recall and dcg over all rows; returns nothing.
    """
    def score_row(row):
        return apply_algorithm(row, algorithm)

    per_playlist = tracks.apply(score_row, axis=1)

    metric_columns = ['precision', 'recall', 'dcgs']
    results_df = pd.DataFrame(per_playlist.tolist(), columns=metric_columns)

    # (label in the printed line, column holding the metric)
    for label, column in (('precision', 'precision'), ('recall', 'recall'), ('dcg', 'dcgs')):
        print(f"Average {label}: {results_df[column].mean()}")

---
## Work

In [None]:
# Get list of list of tracks of each playlist

# One row per evaluation playlist: 'pid' plus the list of its track URIs in 'tracks'.
playlist_tracks = (
    evaluate_playlists_songs
    .groupby('pid')['track_uri']
    .apply(list)
    .reset_index(name='tracks')
)

In [None]:
# Evaluate on the first two playlists only.
tracks = playlist_tracks.iloc[:2]
print(len(tracks))

# The recommender already has the (playlist, n) signature the evaluator expects.
algorithm = get_recommendation_for_playlist
evaluate_algorithm(algorithm, tracks)