In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the dataset used by the later cells from a path relative to the
# notebook's working directory.
df = pd.read_csv(filepath_or_buffer=r'data/processed_data.csv')

In [3]:
class item_similarity_recommender_py:
    """Item-similarity recommender built on the Jaccard index of listener sets.

    Two items are considered similar when many of the same users appear next
    to both of them in the training data. For a given user (or an explicit
    list of items) every item in the training data is scored by its mean
    Jaccard similarity to the seed items, and the best-scoring items that are
    not already seed items are returned.

    Typical use::

        model = item_similarity_recommender_py()
        model.create(train_data, 'user_id', 'release')
        model.recommend(some_user)
    """

    def __init__(self):
        # Training DataFrame and the names of its user / item columns;
        # populated by create().
        self.train_data = None
        self.user_id = None
        self.item_id = None
        # The attributes below are only initialised here; no method of this
        # class assigns them. They are kept so external code reading them
        # keeps working.
        self.cooccurence_matrix = None
        self.songs_dict = None
        self.rev_songs_dict = None
        self.item_similarity_recommendations = None

    #Get unique items (songs) corresponding to a given user
    def get_user_items(self, user):
        """Return the list of unique items found in the rows of ``user``."""
        user_data = self.train_data[self.train_data[self.user_id] == user]
        return list(user_data[self.item_id].unique())

    #Get unique users for a given item (song)
    def get_item_users(self, item):
        """Return the set of unique users found in the rows of ``item``."""
        item_data = self.train_data[self.train_data[self.item_id] == item]
        return set(item_data[self.user_id].unique())

    #Get unique items (songs) in the training data
    def get_all_items_train_data(self):
        """Return the list of unique items in the training data."""
        return list(self.train_data[self.item_id].unique())

    def _build_item_users_index(self):
        """Return a dict mapping each item in the training data to its set of users.

        A single groupby replaces filtering the whole DataFrame once per item,
        which is what made building the matrix slow on large data.
        """
        users_per_item = self.train_data.groupby(self.item_id, sort=False)[self.user_id].unique()
        return {item: set(users) for item, users in users_per_item.items()}

    #Construct cooccurence matrix
    def construct_cooccurence_matrix(self, user_songs, all_songs):
        """Build the Jaccard similarity matrix between seed items and all items.

        Parameters
        ----------
        user_songs : list
            Seed items (one matrix row each).
        all_songs : list
            Candidate items (one matrix column each).

        Returns
        -------
        numpy.matrix
            Matrix of shape ``(len(user_songs), len(all_songs))`` where entry
            ``[j, i]`` is ``|U_i & U_j| / |U_i | U_j|`` for the user sets of
            ``all_songs[i]`` and ``user_songs[j]``, or 0 when the sets do not
            overlap. ``numpy.matrix`` is kept as the return type so existing
            callers see the same object type as before.
        """
        item_users = self._build_item_users_index()
        no_users = set()  # items absent from the training data have no users

        #Users for all songs in user_songs
        user_songs_users = [item_users.get(song, no_users) for song in user_songs]

        #Item cooccurence matrix of size len(user_songs) X len(all_songs)
        cooccurence_matrix = np.matrix(np.zeros(shape=(len(user_songs), len(all_songs))), float)

        for i, song in enumerate(all_songs):
            users_i = item_users.get(song, no_users)
            if not users_i:
                # No users at all: every intersection is empty, the column stays 0.
                continue

            for j, users_j in enumerate(user_songs_users):
                shared = len(users_i & users_j)
                if shared:
                    # |A u B| = |A| + |B| - |A n B|, so the union set itself
                    # never has to be built.
                    total = len(users_i) + len(users_j) - shared
                    cooccurence_matrix[j, i] = float(shared) / float(total)

        return cooccurence_matrix

    #Use the cooccurence matrix to make top recommendations
    def generate_top_recommendations(self, user, cooccurence_matrix, all_songs, user_songs, top_n=10):
        """Rank candidate items by their mean similarity to the seed items.

        Parameters
        ----------
        user : object
            Value written to the ``user_id`` column of the result.
        cooccurence_matrix : array-like
            Similarity matrix with one row per seed item and one column per
            entry of ``all_songs``.
        all_songs : list
            Candidate items, aligned with the matrix columns.
        user_songs : list
            Seed items; these are never recommended back.
        top_n : int, optional
            Maximum number of recommendations to return (default 10).

        Returns
        -------
        pandas.DataFrame or int
            DataFrame with columns ``user_id``, ``song``, ``score`` and
            ``rank`` (rank starts at 1), or the sentinel ``-1`` when no
            recommendation could be produced.
        """
        print("Non zero values in cooccurence_matrix :%d" % np.count_nonzero(cooccurence_matrix))

        columns = ['user_id', 'song', 'score', 'rank']
        scores_2d = np.atleast_2d(np.asarray(cooccurence_matrix, dtype=float))

        records = []
        # With zero seed items there is nothing to average; skipping the
        # computation avoids a 0/0 division that would only yield NaN scores.
        if scores_2d.shape[0] > 0:
            #Mean of the scores in the cooccurence matrix over all seed songs
            user_sim_scores = (scores_2d.sum(axis=0) / float(scores_2d.shape[0])).tolist()

            #Sort (score, index) pairs so the best scores come first
            sort_index = sorted(((e, i) for i, e in enumerate(user_sim_scores)), reverse=True)

            already_known = set(user_songs)
            for score, song_idx in sort_index:
                if len(records) >= top_n:
                    # Enough rows collected; no need to scan the remaining candidates.
                    break
                song = all_songs[song_idx]
                if np.isnan(score) or song in already_known:
                    continue
                records.append([user, song, score, len(records) + 1])

        #Handle the case where there are no recommendations
        if not records:
            print("The current user has no songs for training the item similarity based recommendation model.")
            return -1

        # Build the frame once from the collected rows instead of appending
        # row by row, so score and rank get numeric dtypes.
        return pd.DataFrame(records, columns=columns)

    #Create the item similarity based recommender system model
    def create(self, train_data, user_id, item_id):
        """Store the training DataFrame and the names of its user and item columns."""
        self.train_data = train_data
        self.user_id = user_id
        self.item_id = item_id

    #Use the item similarity based recommender system model to
    #make recommendations
    def recommend(self, user, top_n=10):
        """Recommend up to ``top_n`` items for ``user``.

        Returns a DataFrame with the columns ``song`` and ``rank``, or the
        sentinel ``-1`` when no recommendation could be produced (for example
        when ``user`` has no rows in the training data).
        """
        #A. Get all unique songs for this user
        user_songs = self.get_user_items(user)

        #B. Get all unique items (songs) in the training data
        all_songs = self.get_all_items_train_data()

        #C. Construct item cooccurence matrix of size len(user_songs) X len(songs)
        cooccurence_matrix = self.construct_cooccurence_matrix(user_songs, all_songs)

        #D. Use the cooccurence matrix to make recommendations
        df_recommendations = self.generate_top_recommendations(
            user, cooccurence_matrix, all_songs, user_songs, top_n
        )

        # generate_top_recommendations signals "nothing to recommend" with -1;
        # pass that through instead of trying to select columns from it.
        if not isinstance(df_recommendations, pd.DataFrame):
            return df_recommendations

        return df_recommendations[['song', 'rank']]

    def get_similar_items(self, item_list, top_n=10):
        """Return up to ``top_n`` items most similar to the items in ``item_list``.

        Returns the full recommendation DataFrame (``user_id`` is an empty
        string in every row), or the sentinel ``-1`` when nothing could be
        recommended.
        """
        user_songs = item_list

        #B. Get all unique items (songs) in the training data
        all_songs = self.get_all_items_train_data()

        print("no. of unique songs in the training set: %d" % len(all_songs))

        #C. Construct item cooccurence matrix of size len(user_songs) X len(songs)
        cooccurence_matrix = self.construct_cooccurence_matrix(user_songs, all_songs)

        #D. Use the cooccurence matrix to make recommendations
        user = ""
        df_recommendations = self.generate_top_recommendations(
            user, cooccurence_matrix, all_songs, user_songs, top_n
        )

        return df_recommendations

In [4]:
# Sum of the 'freq' column over every row of the dataset.
total_play_count = sum(df['freq'])

# Total 'freq' per release, highest first, limited to the top 10,000 releases.
play_count = (
    df.groupby('release')[['freq']]
    .sum()
    .sort_values(by='freq', ascending=False)
    .head(10000)
)

# The 5,000 releases with the highest totals.
song_subset = [release for release in play_count.index[:5000]]

# Rows belonging to those releases, and the distinct users found in them.
user_song_list_count_sub = df[df['release'].isin(song_subset)]
user_subset = list(user_song_list_count_sub['user_id'].unique())

In [5]:
# Instantiate the recommender and hand it the filtered data, naming the
# columns that identify users ('user_id') and items ('release').
is_model = item_similarity_recommender_py()
is_model.create(
    train_data=user_song_list_count_sub,
    user_id='user_id',
    item_id='release',
)

In [6]:
# Pick a sample user: the user_id stored at row position 7 of the filtered data.
# .iloc reads that one value directly instead of first copying the whole
# column into a Python list.
user_id = user_song_list_count_sub['user_id'].iloc[7]

# Last expression of the cell, so the recommendations are displayed as output.
is_model.recommend(user_id)

Non zero values in cooccurence_matrix :16350


Unnamed: 0,song,rank
0,Undo,1
1,Revelry,2
2,Secrets,3
3,Fireflies,4
4,Horn Concerto No. 4 in E flat K495: II. Romanc...,5
5,Hey_ Soul Sister,6
6,Marry Me,7
7,OMG,8
8,Tive Sim,9
9,Drop The World,10


0          fd50c4007b68a3737fe052d5a4f78ce8aa117f3d
1          c34670d9c1718361feb93068a853cead3c95b76a
2          c5006d9f41f68ccccbf5ee29212b6af494110c5e
3          e4332e11f4df6dd26673bb6b085e9a2bbdc9b8a5
4          baf2fe5885ab93fbbdb7fecc6691788e70afb6c8
                             ...                   
1456559    9d2f78e2bda5a004879d3ee53ee7de5cdf99c730
1456560    987b654b09b239f7c47751e0cfaa2990834cbb55
1456561    987b654b09b239f7c47751e0cfaa2990834cbb55
1456562    fc072bde2043756ea30ff07e4a5311e34825b4df
1456563    fc072bde2043756ea30ff07e4a5311e34825b4df
Name: user_id, Length: 1456564, dtype: object

no. of unique songs in the training set: 5000
Non zero values in cooccurence_matrix :16350


Unnamed: 0,user_id,song,score,rank
0,,Undo,0.068731,1
1,,Revelry,0.06391,2
2,,Secrets,0.060298,3
3,,Fireflies,0.044531,4
4,,Horn Concerto No. 4 in E flat K495: II. Romanc...,0.044244,5
5,,Hey_ Soul Sister,0.043559,6
6,,Marry Me,0.043209,7
7,,OMG,0.036191,8
8,,Tive Sim,0.034483,9
9,,Drop The World,0.03411,10
