#### Import packages

In [1]:
import numpy as np
import pandas as pd

from pandas.api.types import CategoricalDtype
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

#### Load and preprocess data 

In [2]:
# Read the raw playlist/track table and drop the useless 'Unnamed: 0' column.
df = pd.read_csv('songs.csv').drop(columns=['Unnamed: 0'])

# generate song ids: one integer per distinct (track, artist, album) key, numbered in order of first appearance
# NOTE(review): the three fields are concatenated without a separator, so distinct triples could in
# principle collide, and a missing value in any field makes the whole key NaN (which factorize encodes
# as -1) -- confirm whether such rows exist before relying on the ids.
df['tid'] = pd.factorize(df.track_name + df.artist_name + df.album_name)[0]

# remove duplicate songs from the same playlist, then order rows by playlist and song id
# (sort_values returns a new frame; the original call discarded it, so the sort never took effect)
# The sort runs while both ids are still numeric, so the order is numeric rather than lexicographic.
df = (
    df.drop_duplicates(subset=['pid', 'tid'])
    .sort_values(by=['pid', 'tid'])
)

# add prefixes to playlist and tracks ids
df['pid'] = 'pid_' + df['pid'].astype(str)
df['tid'] = 'tid_' + df['tid'].astype(str)

# add column to keep track of playlist-track pairs
df['count'] = 1

In [3]:
# Display the preprocessed playlist/track frame.
df

Unnamed: 0,pid,playlist,track_name,artist_name,album_name,tid,count
0,pid_0,Throwbacks,Sk8er Boi,Avril Lavigne,Let Go,tid_0,1
1,pid_0,Throwbacks,My Happy Ending,Avril Lavigne,Under My Skin,tid_1,1
2,pid_0,Throwbacks,Crazy In Love,Beyoncé,Dangerously In Love (Alben für die Ewigkeit),tid_2,1
3,pid_0,Throwbacks,Check On It - feat. Bun B and Slim Thug,Beyoncé,B'Day,tid_3,1
4,pid_0,Throwbacks,Stacy's Mom,Bowling For Soup,I've Never Done Anything Like This,tid_4,1
...,...,...,...,...,...,...,...
334482,pid_4999,Waltz,The Ludlows,Rob Folsom,Sentimental Serenade,tid_108732,1
334483,pid_4999,Waltz,Crested Hens,Solas,Solas,tid_108733,1
334484,pid_4999,Waltz,"Cavatina (From ""The Deer Hunter"")",Stanley Myers,John Williams - The Guitarist,tid_108734,1
334485,pid_4999,Waltz,"Theme from the Elephant Man - From ""The Elepha...",The Hollywood LA Soundtrack Orchestra,"Blockbuster Main Themes, Vol. 4 (Best of Sound...",tid_108735,1


#### Number of unique songs and playlists

In [4]:
# Report how many distinct values each id column holds.
for label, column in (('Unique playlists: ', 'pid'), ('Unique songs: ', 'tid')):
    print(label, df[column].nunique())

Unique playlists:  5000
Unique songs:  108737


#### Lengths of playlists

In [5]:
# Rows per playlist (one row per playlist-song pair), shown from shortest to longest.
playlist_lengths = df['pid'].value_counts()
playlist_lengths.sort_values()

pid_3311      4
pid_1231      5
pid_2594      5
pid_54        5
pid_278       5
           ... 
pid_4018    243
pid_1482    245
pid_1617    245
pid_2183    246
pid_3149    246
Name: pid, Length: 5000, dtype: int64

#### Item CF Class

In [6]:
class ItemCF:
    """Item-based collaborative filtering on a binary song-by-playlist matrix.

    Songs are compared through the playlists they appear in (cosine distance), and the
    nearest neighbours of a seed track are offered as recommendations for the target playlist.
    """

    def __init__(self, target_playlist_id, num_seed, k):
        """
        Parameters
        ----------
        target_playlist_id : str
            Unique identifier of a playlist for which we're trying to suggest songs
            (a column label of the song-by-playlist matrix, e.g. 'pid_4998')
        num_seed : int
            Number of seed tracks used to create recommendations
        k : int
            The parameter for kNN algorithm in order to find top k most similar songs to the seed track.
        """
        self.target_playlist_id = target_playlist_id
        self.num_seed = num_seed
        self.k = k

    def get_seed_tracks(self):
        """Placeholder for seed-track selection; not implemented yet.

        Returns
        -------
        None
        """
        # to be done
        return None

    def filter_by_playlist_length(self, df, len_threshold):
        """Filters a given dataframe to contain only playlists with more than len_threshold songs.

        Parameters
        ----------
        df : DataFrame
            Pandas dataframe containing full information about playlists and songs
        len_threshold : int
            Threshold used to filter a dataframe based on playlist length

        Returns
        -------
        filtered_df
            Copy of the rows belonging to playlists strictly longer than len_threshold,
            in their original order
        """
        # Per-row size of the row's playlist; avoids calling a Python lambda once per group.
        playlist_sizes = df['pid'].map(df['pid'].value_counts())
        filtered_df = df.loc[playlist_sizes > len_threshold].copy()
        return filtered_df

    def create_playlist_song_matrix(self, df):
        """Creates a binary song-by-playlist matrix with unique song identifiers as rows and unique playlist
        identifiers as columns. An entry of a matrix is 1 if a song is contained in the playlist, 0 otherwise.

        Parameters
        ----------
        df : DataFrame
            Pandas dataframe containing full information about playlists and songs;
            must provide the columns 'pid', 'tid' and 'count'

        Returns
        -------
        pt_df
            Dataframe which stores the matrix (index: song ids, columns: playlist ids),
            both in order of first appearance in df
        """
        # create playlist and song categories
        playlist_type = CategoricalDtype(df['pid'].unique(), ordered=True)
        song_type = CategoricalDtype(df['tid'].unique(), ordered=True)

        # creation of the matrix using a compressed row matrix format
        # this is necessary because sparsity prevents us from using groupby or pivot on dataframe directly
        song_codes = df['tid'].astype(song_type).cat.codes
        playlist_codes = df['pid'].astype(playlist_type).cat.codes
        sparse_matrix = csr_matrix(
            (df['count'], (song_codes, playlist_codes)),
            shape=(song_type.categories.size, playlist_type.categories.size),
        )

        # convert back to dataframe
        # CSR helps avoid issues with memory and computational complexity but has rather limiting indexing options
        # toarray() yields a plain ndarray instead of the np.matrix returned by todense()
        pt_df = pd.DataFrame(sparse_matrix.toarray(), index=song_type.categories, columns=playlist_type.categories)
        return pt_df

    def exclude_target_playlist_songs(self, df, seed_track_id):
        """Creates a slice of a dataframe containing a song-by-playlist matrix which excludes the songs which are already
        in the target playlist. This is necessary because the goal is to recommend songs which are not present in the playlist.
        The seed track is kept in order to compute the similarities with other songs.

        Parameters
        ----------
        df : DataFrame
            Pandas dataframe containing a binary song-by-playlist matrix

        seed_track_id : str
            Unique identifier of a seed track; it has to be one of the target playlist's songs

        Returns
        -------
        slice_df
            Copy of the matrix without the rows of songs already present in the target playlist
            (the seed track's row is retained)

        Raises
        ------
        KeyError
            If the target playlist is not a column of df
        ValueError
            If the seed track is not part of the target playlist
        """
        in_target = df[self.target_playlist_id] == 1
        is_seed = df.index == seed_track_id

        # Same exception type the bare list.remove() used to raise, but with an actionable message.
        if not (in_target & is_seed).any():
            raise ValueError(
                f'Seed track {seed_track_id!r} is not in target playlist {self.target_playlist_id!r}'
            )

        keep = ~in_target | is_seed
        slice_df = df.loc[keep].copy()
        return slice_df

    def find_k_most_similar_songs(self, df, seed_track_id):
        """Finds k most similar songs to the seed track using a kNN algorithm with a cosine similarity as a similarity measure.
        The songs are considered more similar if they co-occur more often across different playlists.

        Parameters
        ----------
        df : DataFrame
            Pandas dataframe containing a binary song-by-playlist matrix

        seed_track_id : str
            Unique identifier of a seed track

        Returns
        -------
        sim_tracks_id
            List of unique song identifiers for the top k most similar songs to the seed track.
            The seed track is part of the searched set, so it can appear in this list.

        track_distances
            List of distances between a seed track and top k most similar songs to it
        """
        # exclude songs already in the target playlist
        slice_df = self.exclude_target_playlist_songs(df, seed_track_id)

        # build the sparse representation once and reuse it for fitting and querying
        song_vectors = csr_matrix(slice_df.values)
        seed_track_loc = slice_df.index.get_loc(seed_track_id)

        # apply knn to get top k most similar tracks to the seed track
        # only the seed row is queried; querying every song just to read one row is quadratic in the number of songs
        knn = NearestNeighbors(metric='cosine', algorithm='brute')
        knn.fit(song_vectors)
        distances, indices = knn.kneighbors(song_vectors[seed_track_loc], n_neighbors=self.k)

        # retrieve ids of top k most similar tracks and their distances to the seed track
        track_distances = distances[0].tolist()
        sim_tracks_id = slice_df.index[indices[0]].tolist()

        return sim_tracks_id, track_distances

    def get_song_and_artist_name(self, df, song_id):
        """Given a unique song identifier, returns a song and artist name.

        Parameters
        ----------
        df : DataFrame
            Pandas dataframe containing full information about playlists and songs
        song_id : str
            Unique id of a song for which we need a name and an artist

        Returns
        -------
        song_name
            Name of a song
        artist_name
            Name of an artist performing a song

        Raises
        ------
        ValueError
            If no row of df carries song_id
        """
        # Select the matching rows directly; no need to sort and de-duplicate the whole frame per lookup.
        matches = df.loc[df['tid'] == song_id, ['track_name', 'artist_name']]
        if matches.empty:
            raise ValueError(f'Unknown song id: {song_id!r}')

        song_name, artist_name = matches.iloc[0]
        return song_name, artist_name

    def get_playlist_name(self, df, playlist_id):
        """Given a unique playlist identifier, returns a playlist name.

        Parameters
        ----------
        df : DataFrame
            Pandas dataframe containing full information about playlists and songs
        playlist_id : str
            Unique id of a playlist for which we need a name

        Returns
        -------
        playlist
            Name of a playlist

        Raises
        ------
        ValueError
            If no row of df carries playlist_id
        """
        names = df.loc[df['pid'] == playlist_id, 'playlist']
        if names.empty:
            raise ValueError(f'Unknown playlist id: {playlist_id!r}')

        playlist = names.iloc[0]
        return playlist

    def get_song_recommendation(self, df, seed_track_id, similar_song_id):
        """Given a dataframe, seed track identifier and an identifier of a similar song, it prints out a recommendation of
        a similar song based on its similarity to the seed track.

        Parameters
        ----------
        df : DataFrame
            Pandas dataframe containing full information about playlists and songs
        seed_track_id : str
            Unique id of the seed track the recommendation is based on
        similar_song_id : str
            Unique id of a song to be recommended based on its similarity to the seed track
        """
        seed_track_name, seed_track_artist = self.get_song_and_artist_name(df, seed_track_id)
        song_name, artist_name = self.get_song_and_artist_name(df, similar_song_id)
        playlist_name = self.get_playlist_name(df, self.target_playlist_id)

        # f-string formatting also copes with non-string names (e.g. missing values), where '+' would raise
        print(f'Suggested song for your playlist {playlist_name} is {song_name} by {artist_name} '
              f'based on its similarity to {seed_track_name} by {seed_track_artist}.')
        print()
        print()

#### Parameters

In [7]:
# Only playlists with strictly more than this many songs are kept (see filter_by_playlist_length).
len_threshold = 100
# Number of neighbours requested from kNN for each seed track.
k = 4
# Number of seed tracks; stored on ItemCF but not read by any of its methods as written.
num_seed = 5
# Playlist to generate recommendations for (prefixed string id, as built during preprocessing).
target_playlist_id = 'pid_4998'
# Hand-picked seed track ids.
# NOTE(review): presumably all of these belong to target_playlist_id -- exclude_target_playlist_songs
# raises ValueError for a seed that is not in the target playlist; confirm when changing either value.
seed_ids = ['tid_9133', 'tid_482', 'tid_14387', 'tid_16887', 'tid_40339']

#### Main part

In [8]:
# Build the recommender for the configured target playlist.
itemcf = ItemCF(target_playlist_id, num_seed, k)

# Keep only the longer playlists, then build the song-by-playlist matrix from them.
filtered_df = itemcf.filter_by_playlist_length(df, len_threshold)
pt_df = itemcf.create_playlist_song_matrix(filtered_df)

# For every seed track, look up its k nearest songs and print one recommendation per neighbour.
for seed_track_id in seed_ids:
    # track_distances is returned but not used in this cell.
    sim_tracks_id, track_distances = itemcf.find_k_most_similar_songs(pt_df, seed_track_id)
    
    for similar_song_id in sim_tracks_id:
        itemcf.get_song_recommendation(filtered_df, seed_track_id, similar_song_id)

Suggested song for your playlist poop is Breezeblocks by alt-J based on its similarity to Breezeblocks by alt-J.

Suggested song for your playlist poop is Tessellate by alt-J based on its similarity to Breezeblocks by alt-J.

Suggested song for your playlist poop is Fitzpleasure by alt-J based on its similarity to Breezeblocks by alt-J.

Suggested song for your playlist poop is Fairytale by Milky Chance based on its similarity to Breezeblocks by alt-J.

Suggested song for your playlist poop is Left Hand Free by alt-J based on its similarity to Left Hand Free by alt-J.

Suggested song for your playlist poop is Every Other Freckle by alt-J based on its similarity to Left Hand Free by alt-J.

Suggested song for your playlist poop is Tessellate by alt-J based on its similarity to Left Hand Free by alt-J.

Suggested song for your playlist poop is Seventeen Years by Ratatat based on its similarity to Left Hand Free by alt-J.

Suggested song for your playlist poop is Dammit by blink-182 based

#### TBD

- Add the seed track list to the class instead of passing it in as an argument from outside.
- Remove the seed track itself from the list of similar tracks; it was kept only to check that the algorithm works correctly.
- Do a train-test split.
- Maybe move the main part inside the class as well, under a `generate_recommendation_list` function?

#### If enough time:

- Add TF-IDF weights to compensate for the high popularity bias.
- Recommend based on the playlist title if the distance from the seed track to the recommended track is larger than a threshold.