## Import and read the CSV file into Pandas DataFrame

In [43]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
import math

In [44]:
def load(i):
    start = i*1000
    path = "data/mpd.slice." + str(start) + "-" + str(start+999) + ".json"
    data = json.load(open(path,'r'))
    return pd.DataFrame.from_dict(data['playlists'], orient='columns')

---
## Choose one of the following load options:

### Load data to table where each row represents a song within a playlist, and which contains only playlist name, pid and track_uri

In [45]:
# get table where each row represents a playlist with 1 song
# contains only pid and track_uri

import time

n = 10

def get_train_playlist_songs_combined_small_2():
    start = 0
    result_tracks = None
    result_names = None
    
    start_time = time.time()
    while start != n:
        slice = load(start)

        slice_tracks = slice[['pid', 'tracks']]
        slice_names = slice[['pid', 'name']]

        exploded_slice = slice_tracks.explode('tracks')
        exploded_slice.reset_index(drop=True, inplace=True)
        normalized_tracks = pd.json_normalize(exploded_slice['tracks'])
        normalized_tracks = normalized_tracks[['track_uri']]

        playlist_songs = pd.concat([exploded_slice.drop(columns='tracks'), normalized_tracks], axis=1)

        if result_tracks is None:
            result_tracks = playlist_songs
            result_names = slice_names
        else:
            result_tracks = pd.concat([result_tracks, playlist_songs])
            result_names = pd.concat([result_names, slice_names])
        start += 1

        if(start % 1 == 0):
            end_time = time.time()
            execution_time = end_time - start_time
            print(f"playlist {start*1000} loaded in {execution_time:.2f} seconds")
            start_time = time.time()

    print("all playlists loaded")

    result_tracks.reset_index(drop=True, inplace=True) 
    result_tracks['track_id'] = result_tracks['track_uri'].astype('category').cat.codes

    return result_tracks, result_names 

In [46]:
train_playlist_songs, playlist_names = get_train_playlist_songs_combined_small_2()
train_playlist_songs, playlist_names
train_playlist_songs

playlist 1000 loaded in 0.61 seconds
playlist 2000 loaded in 0.59 seconds
playlist 3000 loaded in 0.63 seconds
playlist 4000 loaded in 0.61 seconds
playlist 5000 loaded in 0.65 seconds
playlist 6000 loaded in 0.61 seconds
playlist 7000 loaded in 0.64 seconds
playlist 8000 loaded in 0.64 seconds
playlist 9000 loaded in 0.64 seconds
playlist 10000 loaded in 0.61 seconds
all playlists loaded


Unnamed: 0,pid,track_uri,track_id
0,0,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,10959
1,0,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,137484
2,0,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,11768
3,0,spotify:track:1AWQoqb9bSvzTjaLralEkT,25799
4,0,spotify:track:1lzr43nnXAijIGYnCT8M8H,38891
...,...,...,...
664707,9999,spotify:track:6ZOPiKQeibCn7fP8dncucL,143416
664708,9999,spotify:track:7pxhKtuTwofDIdgHx2DcVK,166569
664709,9999,spotify:track:7mldq42yDuxiUNn08nvzHO,165451
664710,9999,spotify:track:23TxRN09aR1RB0G0tFoT0b,44962


### Load data to table where each row represents a song within a playlist, and which contains only pid and track_uri from pickle file

In [5]:
# restore data from pickle file

#train_playlist_songs = pd.read_pickle("my_data.pkl")
#len(train_playlist_songs) 

---
## Save data to pickle file

In [6]:
# save data to pickle file

#train_playlist_songs.to_pickle("my_data.pkl")

---
## Divide dataset to train and evaluate

In [47]:
# Divide data into train and evaluate

unique_playlists = train_playlist_songs['pid'].unique()

train_playlists, evaluate_playlists = train_test_split(unique_playlists, test_size=0.2, random_state=42)

train = train_playlist_songs[train_playlist_songs['pid'].isin(train_playlists)]
evaluate = train_playlist_songs[train_playlist_songs['pid'].isin(evaluate_playlists)]

train_names = playlist_names[playlist_names['pid'].isin(train_playlists)]
train_playlist_songs = train
evaluate_playlists_songs = evaluate



---
# Methode 1: Cold start recommendation

In [48]:
# get table where each row represents a song with number of times it appears in playlists

count_df = train_playlist_songs['track_uri'].value_counts().reset_index()

count_df.columns = ['track_uri', 'count']

def get_first_tracks_URI(n=500):
    return count_df[:n]['track_uri'].tolist()

count_df

Unnamed: 0,track_uri,count
0,spotify:track:7KXjTSCq5nL1LoYtL7XAwS,358
1,spotify:track:7BKLCZ1jbUBVqRi2FVlTVw,356
2,spotify:track:1xznGGDReH1oQq0xzbwXa3,337
3,spotify:track:7yyRTcZmCiyzzJlNzGC9Ol,320
4,spotify:track:3a1lNhkSLSkpJE4MSHpDu9,319
...,...,...
146794,spotify:track:7B7NVwjvqQVqnbKsRwir8h,1
146795,spotify:track:75N9csksJk1ViavIjzibGk,1
146796,spotify:track:7EBPgjH8H6CNsRU0026ew2,1
146797,spotify:track:7inTj6TyfkvXqTcSnj8w1n,1


---
# Methode 2: User-based recommendation (based on playlist)

### Auxiliary functions

In [49]:
# Range by most frequent songs
def get_most_freq_songs():
    most_freq_songs = pd.DataFrame(train_playlist_songs.groupby(['track_id'])['pid'].count()).reset_index().rename(columns= {'pid':'song_freq'}).sort_values(by=['song_freq'], ascending=False)
    return most_freq_songs

most_freq_songs = get_most_freq_songs()
most_freq_songs

Unnamed: 0,track_id,song_freq
138112,160047,358
135355,156834,356
37221,42999,337
146451,169684,320
67731,78291,319
...,...,...
59652,68970,1
59651,68968,1
59650,68967,1
59649,68966,1


In [50]:
# Select playlists contained user's prefered tracks
def get_most_relevant_playlist(pref_tracksID_list):
    pref_in_train = train_playlist_songs.loc[train_playlist_songs['track_id'].isin(pref_tracksID_list)]

    # Range them according to relevance
    most_relevant = pref_in_train['pid'].value_counts().rename_axis('pid').reset_index(name='Frequency')
    return most_relevant

### Main function

In [51]:
# Make recommendation for a playlist
def get_recommendation_for_playlist(pid, ids_tracks, n):

    # if initial playlist is empty, return 500 most frequent tracks
    if len(ids_tracks) == 0:
        return get_first_tracks_URI(n)
    
    most_relevant = get_most_relevant_playlist(ids_tracks)
    pids = most_relevant['pid'].tolist()

    # Exclude preferencies from proposition
    sans_preferences = train_playlist_songs[train_playlist_songs.pid.isin(pids) & ~train_playlist_songs.track_id.isin(ids_tracks)][['pid', 'track_id']]
    
    # Add playlist frequency info
    new_one = sans_preferences.merge(most_relevant, left_on='pid', right_on='pid').rename(columns= {'Frequency':'playlist_freq'})
 
    # Add track frequency info, sort by playlist_freq first then by song_freq
    new_one_1 = new_one.merge(most_freq_songs, left_on='track_id', right_on='track_id').sort_values(by=['playlist_freq', 'song_freq'], ascending=False)

    # Exclude doublons from proposition
    sans_doublons = new_one_1[['track_id', 'playlist_freq', 'song_freq']].drop_duplicates(['track_id'])

    # First 500 tracks
    recommended = sans_doublons[:n]['track_id'].tolist()    

    return train_playlist_songs.loc[train_playlist_songs['track_id'].isin(recommended)]['track_uri'].tolist()

---
# Methode 3: Tracklist title analysis

In [52]:
from sentence_transformers import SentenceTransformer, util
import string

import re
import nltk
nltk.download('word2vec_sample')
nltk.download('all')
#nltk.download('punkt')            #if arror need to download
#nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.corpus import wordnet
from gensim.models import Word2Vec

[nltk_data] Downloading package word2vec_sample to
[nltk_data]     C:\Users\pernata\AppData\Roaming\nltk_data...
[nltk_data]   Package word2vec_sample is already up-to-date!
[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\pernata\AppData\Roaming\nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\pernata\AppData\Roaming\nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     C:\Users\pernata\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     C:\Users\pernata\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is

### Auxiliary functions

In [53]:
# A little cleaning
def prepare_titles(df_train):
    liste = (df_train[['name']]
          .drop_duplicates(['name'])['name']
          .str.lower()
          .str.lstrip(string.punctuation)
          .str.lstrip(string.whitespace)
          .str.rstrip(string.punctuation)
          .str.rstrip(string.whitespace)
          .tolist()
        )   
    #remove doblicates again
    return list(set(liste))


# Function to calculate similarity score between two words
def get_synonym_score(word1, word2):
    synsets_word1 = wordnet.synsets(word1)
    synsets_word2 = wordnet.synsets(word2)

    if synsets_word1 and synsets_word2:
        syn_score = synsets_word1[0].path_similarity(synsets_word2[0])
        return syn_score
    else:
        return None

# Calculate similarity between words from both sentences
def get_similarity_between_words(words_sentence1, words_sentence2):
    words_sentence1 = words_sentence1.split()
    words_sentence2 = words_sentence2.split()
    symres = 0
    for word1 in words_sentence1:
        for word2 in words_sentence2:
            similarity_score = get_synonym_score(word1, word2)
            if similarity_score is not None:
                symres += similarity_score
    maxlen = max(len(words_sentence1), len(words_sentence2))
    return symres/maxlen


In [54]:
model = SentenceTransformer('all-MiniLM-L6-v2')

list_titles = prepare_titles(train_names)
original_titles = train_names['name'].tolist()
#original_titles

encoded_titles_pairs = [(item, model.encode(item)) for item in list_titles]

### Main functions

In [28]:
# Find <k> the most similare titles of playlists for <title>
def find_similar_titles(title):    
    simularity = {}
    
    sample_embed = model.encode(title, convert_to_tensor=True)
    
    for el,curr_embed in encoded_titles_pairs:
        if title != el :
            
            #Compute cosine-similarities
            cosine_scores = util.cos_sim(sample_embed, curr_embed)
            
            if (cosine_scores.item() > 0.3):
                simularity[el] = cosine_scores.item()

    # sort by values
    ret = dict(sorted(simularity.items(), key=lambda x:x[1], reverse = True))
    
    return ret


# Find in titles <=k synonymes of title
def find_synonyms_for_title(title):
    simularity = {}
    
    for word in original_titles:
        similarity_score = get_similarity_between_words(title, word)
        if similarity_score is not None:  
            if similarity_score > 0.3 :
                simularity[word] = similarity_score
                    
    ret = dict(sorted(simularity.items(), key=lambda x:x[1], reverse = True))
      
    return ret

In [55]:
def get_recommendation_by_title(pid, playlist, k, algo):

    title = playlist_names.loc[playlist_names['pid'] == pid]['name'].iloc[0]

    similar_titles = algo(title)

    playlist_pids = []
    for playlist_name in similar_titles:
        df_name = train_names.loc[train_names['name'] == playlist_name]
        pid = list(df_name['pid']) if not df_name.empty else []
        playlist_pids += pid
    
    result = []

    for pid in playlist_pids:
        tracks_to_add = train_playlist_songs.loc[train_playlist_songs['pid'] == playlist_pids[0], 'track_uri'].tolist()
        result += tracks_to_add
        if len(result) >= k:
            break
            
    return result[:k]

In [56]:
def get_recommendation_by_similarity(pid, playlist, k):
    algo = lambda title : find_similar_titles(title) 
    return get_recommendation_by_title(pid, playlist, k, algo)

def get_recommendation_by_synonymity(pid, playlist, k):
    algo = lambda title : find_synonyms_for_title(title) 
    return get_recommendation_by_title(pid, playlist, k, algo)

rec = get_recommendation_by_synonymity(0, [], 100)
len(rec)

100

In [57]:
pref1 = ['dance']
pref2 = ['uplift']
pref3 = ['Groovin']

list_titles = prepare_titles(playlist_names)

rez = find_similar_titles(pref1[0])
print(rez)
print()

rez2 = find_synonyms_for_title(pref1[0])
print(rez2)

{'dance dance': 0.9566716551780701, 'dancing': 0.9515113234519958, 'dancedancedance': 0.8280803561210632, 'dance music': 0.826194167137146, 'slow dance': 0.8002327680587769, 'dance workout': 0.7983101010322571, 'slow dancing': 0.7948652505874634, 'dancing on my own': 0.7875999808311462, 'dance songs': 0.7856455445289612, 'dance class': 0.7765056490898132, 'dance party': 0.7688067555427551, 'kitchen dancing': 0.7624962329864502, 'first dance': 0.7602494359016418, 'dance it out': 0.7564575672149658, 'lets dance': 0.7514907121658325, 'lose yourself to dance': 0.7451066970825195, 'dancey': 0.7252622842788696, 'dance mix': 0.7231075167655945, 'latin dance': 0.7207781672477722, 'dance pop': 0.7201910018920898, 'swing dance': 0.7096807360649109, "90's dance": 0.7063279151916504, "80's dance": 0.7032131552696228, 'dance club': 0.6809204816818237, 'line dances': 0.677971601486206, 'dance jams': 0.675937831401825, 'eurodance': 0.6519446969032288, 'dancehall': 0.6517083644866943, 'waltz': 0.61203

---
## Efficiency calculation

In [58]:
def calculate_dcg(recommended, evaluations):
    rel = []

    for track in recommended:
        if track in evaluations:
            rel.append(evaluations.index(track))
        else:
            rel.append(0)

    dcg = rel[0]
    dcgi = 1
    for i in range(1, len(rel)):
        dcg += rel[i] / math.log2(i + 1)
        dcgi += 1 / math.log2(i + 1)  
    return dcg/dcgi

---
## Using User-based recommendation with challenge dataset

In [None]:
## Load challenge set
#
#path = "data_challenge/challenge_set.json"
#data = json.load(open(path,'r'))
#data_challenge = pd.DataFrame.from_dict(data['playlists'], orient='columns')
#
#challenge_set = [data_challenge]
#df_challenge = pd.concat(challenge_set)

In [None]:
## Make recommendation for each playlist in challenge set, write to file
#
#import os
#
#file_name = 'result.csv'
#file_number = 1
#
#while os.path.exists(f"{file_name[:-4]}_{file_number}.csv"):
#    file_number += 1
#
#new_file_name = f"{file_name[:-4]}_{file_number}.csv"
#
#
#with open(new_file_name, mode='a', newline='', encoding='utf-8') as file:
#
#    if file.tell() == 0:
#        file.write('team_info,Bragina_Graff,vdfrtrp@gmail.com\n\n')
#
#    for i in range(1004, 1005):
#        playlist = df_challenge.iloc[i]
#
#        recommended = get_recommendation_for_playlist(playlist, 500)
#        
#        pid = playlist['pid']
#        
#        recommended_str = ', '.join(recommended)
#        file.write(f"{pid}, {recommended_str}\n\n")
#
#        if(i%100 == 0):
#            print(f"playlist {i} was processed")
#
#        

---
## Evaluate

In [59]:
def apply_algorithm(row, algorithms):

    k = 500
    
    pid = row['pid']

    tracks = row['tracks']

    tests, evaluations = train_test_split(tracks, test_size=0.2, random_state=42)

    recommended = []
    for algorithm in algorithms:
        recommended += [item for item in algorithm(pid, tests, k) if item not in recommended]
        if len(recommended) >= k:
            break
    
    if len(recommended) < k:
        recommended += [item for item in get_first_tracks_URI() if item not in recommended]
    
    recommended = recommended[:k]

    intersection = [item for item in recommended if item in evaluations]

    precision = len(intersection) / len(recommended)
    recall = len(intersection) / len(evaluations)
    dcgs = calculate_dcg(recommended, evaluations)
    #print(f"Algo applied for {pid}")

    return precision, recall, dcgs

def evaluate_algorithm(algorithms, tracks):

    results = tracks.apply(lambda row: apply_algorithm(row, algorithms), axis=1)

    results_df = pd.DataFrame(results.tolist(), columns=['precision', 'recall', 'dcgs'])
    
    average_precision = results_df['precision'].mean()
    average_recall = results_df['recall'].mean()
    average_dcgs = results_df['dcgs'].mean()
    
    print(f"Average precision: {average_precision}")
    print(f"Average recall: {average_recall}")
    print(f"Average dcg: {average_dcgs}")

---
## Work

In [60]:
# Get list of list of tracks of each playlist

playlist_tracks = evaluate_playlists_songs.groupby('pid')['track_uri'].apply(list).reset_index(name='tracks')

In [None]:
tracks = playlist_tracks     #TODO  Idea  what if  prepare_titles  it  ?  minimal cleaning

algorithm1 = lambda pid, playlist, n: get_recommendation_for_playlist(pid, playlist, n)
algorithm2 = lambda pid, playlist, n: get_recommendation_by_similarity(pid, playlist, n)
algorithm3 = lambda pid, playlist, n: get_recommendation_by_synonymity(pid, playlist, n)

algorithms = [algorithm1]
evaluate_algorithm(algorithms, tracks)

algorithms = [algorithm2]
evaluate_algorithm(algorithms, tracks)

algorithms = [algorithm2, algorithm1]
evaluate_algorithm(algorithms, tracks)

algorithms = [algorithm3]
evaluate_algorithm(algorithms, tracks)

Average precision: 0.0036340000000000005
Average recall: 0.13763761294145707
Average dcg: 0.042247280273250795
