### Ref 
- https://github.com/llSourcell/recommender_live/blob/master/Song%20Recommender_Python.ipynb

- https://towardsdatascience.com/how-to-build-a-simple-song-recommender-296fcbc8c85

In [1]:
%matplotlib inline

import pandas as pd 
from sklearn.cross_validation import train_test_split
import numpy as np
import time
from sklearn.externals import joblib



In [2]:
cd .. 

/Users/yennanliu


In [3]:
cd movie_recommendation/

/Users/yennanliu/movie_recommendation


In [64]:
def get_data():
    route='datasets/ml-latest-small/'
    #df_movie = pd.read_csv(route +'movies.csv')
    df_ratings = pd.read_csv(route +'ratings.csv')
    return df_ratings

def data_preprocess(df):
    df['view_count']=df.groupby(['userId','movieId']).count().reset_index()['rating']
    movie_grouped = df.groupby(['movieId']).agg({'view_count': 'count'}).reset_index()
    grouped_sum = movie_grouped['view_count'].sum()
    movie_grouped['percentage']  = movie_grouped['view_count'].div(grouped_sum)*100
    movie_grouped.sort_values(['view_count', 'movieId'], ascending = [0,1])
    return movie_grouped



def get_train_test_data(df):
	train_data, test_data = train_test_split(df, test_size = 0.20, random_state=0)
	return train_data, test_data



# ML




#Class for Item similarity based Recommender System model
class item_similarity_recommender_py():
    def __init__(self):
        self.train_data = None
        self.userId = None
        self.movieId = None
        self.cooccurence_matrix = None
        self.movies_dict = None
        self.rev_movies_dict = None
        self.item_similarity_recommendations = None
        
    #Get unique items (songs) corresponding to a given user
    def get_user_items(self, user):
        user_data = self.train_data[self.train_data[self.userId] == user]
        user_items = list(user_data[self.movieId].unique())
        
        return user_items
        
    #Get unique users for a given item (song)
    def get_item_users(self, item):
        item_data = self.train_data[self.train_data[self.movieId] == item]
        item_users = set(item_data[self.userId].unique())
            
        return item_users
        
    #Get unique items (songs) in the training data
    def get_all_items_train_data(self):
        all_items = list(self.train_data[self.movieId].unique())
            
        return all_items
    

        
    #Construct cooccurence matrix
    def construct_cooccurence_matrix(self, user_movies, all_movies):
            
        ####################################
        #Get users for all songs in user_songs.
        ####################################
        user_movies_users = []        
        for i in range(0, len(user_movies)):
            user_movies_users.append(self.get_item_users(user_movies[i]))
            
        ###############################################
        #Initialize the item cooccurence matrix of size 
        #len(user_songs) X len(songs)
        ###############################################
        cooccurence_matrix = np.matrix(np.zeros(shape=(len(user_movies), len(all_movies))), float)
           
        #############################################################
        #Calculate similarity between user songs and all unique songs
        #in the training data
        #############################################################
        for i in range(0,len(all_movies)):
            #Calculate unique listeners (users) of song (item) i
            movie_i_data = self.train_data[self.train_data[self.movieId] == all_movies[i]]
            users_i = set(movie_i_data[self.userId].unique())
            
            for j in range(0,len(user_movies)):       
                    
                #Get unique listeners (users) of song (item) j
                users_j = user_movies_users[j]
                    
                #Calculate intersection of listeners of songs i and j
                users_intersection = users_i.intersection(users_j)
                
                #Calculate cooccurence_matrix[i,j] as Jaccard Index
                if len(users_intersection) != 0:
                    #Calculate union of listeners of songs i and j
                    users_union = users_i.union(users_j)
                    
                    cooccurence_matrix[j,i] = float(len(users_intersection))/float(len(users_union))
                else:
                    cooccurence_matrix[j,i] = 0
                    
        
        return cooccurence_matrix
    
    
    #Use the cooccurence matrix to make top recommendations
    def generate_top_recommendations(self, user, cooccurence_matrix, all_movies, user_movies):
        print("Non zero values in cooccurence_matrix :%d" % np.count_nonzero(cooccurence_matrix))
        
        #Calculate a weighted average of the scores in cooccurence matrix for all user songs.
        user_sim_scores = cooccurence_matrix.sum(axis=0)/float(cooccurence_matrix.shape[0])
        user_sim_scores = np.array(user_sim_scores)[0].tolist()
 
        #Sort the indices of user_sim_scores based upon their value
        #Also maintain the corresponding score
        sort_index = sorted(((e,i) for i,e in enumerate(list(user_sim_scores))), reverse=True)
    
        #Create a dataframe from the following
        #columns = ['user_id', 'song', 'score', 'rank']
        columns = ['userId', 'movieId' ,'rating' ,'view_count']
        #index = np.arange(1) # array of numbers for the number of samples
        df = pd.DataFrame(columns=columns)
         
        #Fill the dataframe with top 10 item based recommendations
        rank = 1 
        for i in range(0,len(sort_index)):
            if ~np.isnan(sort_index[i][0]) and all_movies[sort_index[i][1]] not in user_movies and rank <= 10:
                df.loc[len(df)]=[user,all_movies[sort_index[i][1]],sort_index[i][0],rank]
                rank = rank+1
        
        #Handle the case where there are no recommendations
        if df.shape[0] == 0:
            print("The current user has no songs for training the item similarity based recommendation model.")
            return -1
        else:
            return df
    

    #Create the item similarity based recommender system model
    def create(self, train_data, userId, movieId):
        self.train_data = train_data
        self.userId = userId
        self.movieId = movieId

    #Use the item similarity based recommender system model to
    #make recommendations
    def recommend(self, user):
        
        ########################################
        #A. Get all unique songs for this user
        ########################################
        user_movies = self.get_user_items(user)   
        print ('------------')
        print (user_movies)
        print ('------------')
            
        #print("No. of unique movies for the user: %d" % len(user_movies))
        
        ######################################################
        #B. Get all unique items (songs) in the training data
        ######################################################
        all_movies = self.get_all_items_train_data()
        
        print("no. of unique movies in the training set: %d" % len(all_movies))
         
        ###############################################
        #C. Construct item cooccurence matrix of size 
        #len(user_songs) X len(songs)
        ###############################################
        cooccurence_matrix = self.construct_cooccurence_matrix(user_movies, all_movies)
        
        #######################################################
        #D. Use the cooccurence matrix to make recommendations
        #######################################################
        df_recommendations = self.generate_top_recommendations(user, cooccurence_matrix, all_movies, user_movies)
                
        return df_recommendations
    
    #Get similar items to given items
    def get_similar_items(self, item_list):
        
        user_movies = item_list
        
        ######################################################
        #B. Get all unique items (songs) in the training data
        ######################################################
        all_songs = self.get_all_items_train_data()
        
        print("no. of unique songs in the training set: %d" % len(all_movies))
         
        ###############################################
        #C. Construct item cooccurence matrix of size 
        #len(user_songs) X len(songs)
        ###############################################
        cooccurence_matrix = self.construct_cooccurence_matrix(user_movies, all_movies)
        
        #######################################################
        #D. Use the cooccurence matrix to make recommendations
        #######################################################
        user = ""
        df_recommendations = self.generate_top_recommendations(user, cooccurence_matrix, all_movies, user_movies)
         
        return df_recommendations

In [29]:
train_data.head(3)

Unnamed: 0,userId,movieId,rating,timestamp,view_count
73174,509,7323,3.5,1093277659,1
30938,220,1961,4.0,970504853,1
79289,547,3467,4.5,1086010787,1


In [5]:
np.matrix(np.zeros(shape=(10, 5)), float)

matrix([[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]])

In [65]:
df_ratings = get_data()
movie_grouped = data_preprocess(df_ratings)
print (movie_grouped)
train_data, test_data = get_train_test_data(df_ratings)

      movieId  view_count  percentage
0           1         247    0.246990
1           2         107    0.106996
2           3          59    0.058998
3           4          13    0.012999
4           5          56    0.055998
5           6         104    0.103996
6           7          53    0.052998
7           8           5    0.005000
8           9          20    0.019999
9          10         122    0.121995
10         11          82    0.081997
11         12          18    0.017999
12         13           8    0.008000
13         14          31    0.030999
14         15          11    0.011000
15         16          88    0.087996
16         17          86    0.085997
17         18          26    0.025999
18         19          92    0.091996
19         20          13    0.012999
20         21          95    0.094996
21         22          38    0.037998
22         23          22    0.021999
23         24          34    0.033999
24         25         101    0.100996
25         2

In [36]:
is_model = item_similarity_recommender_py()
#is_model.create(train_data, 'user_id', 'song')

In [13]:
train_data.head(3)

Unnamed: 0,userId,movieId,rating,timestamp,view_count
73174,509,7323,3.5,1093277659,1
30938,220,1961,4.0,970504853,1
79289,547,3467,4.5,1086010787,1


In [14]:
test_data.head(3)

Unnamed: 0,userId,movieId,rating,timestamp,view_count
3582,20,1378,3.0,1224042777,1
60501,439,2648,3.0,1041113077,1
76461,529,6533,3.5,1083179789,1


In [9]:
is_model

<__main__.item_similarity_recommender_py at 0x109293c88>

In [57]:
is_model = item_similarity_recommender_py()
is_model.create(train_data, 'userId', 'movieId')

In [66]:
is_model = item_similarity_recommender_py()
is_model.create(train_data, 'userId', 'movieId')

In [68]:
#Print the songs for the user in training data
user_id = 30
user_items = is_model.get_user_items(user_id)
#
print("------------------------------------------------------------------------------------")
print("Training data songs for the user userid: %s:" % user_id)
print("------------------------------------------------------------------------------------")

for user_item in user_items:
    print(user_item)

print("----------------------------------------------------------------------")
print("Recommendation process going on:")
print("----------------------------------------------------------------------")

#Recommend songs for the user using personalized model
is_model.recommend(user_id)

------------------------------------------------------------------------------------
Training data songs for the user userid: 30:
------------------------------------------------------------------------------------
5956
4713
8
369
954
5247
2890
1721
3859
4352
100
5366
2
2858
1562
5694
1488
5161
2065
208
371
2750
296
2991
2947
4951
2391
1864
2193
5125
6422
953
3704
50
2577
3791
1018
1321
3494
3529
535
1073
2307
3638
4409
3551
2787
5308
1093
2398
661
2070
300
1523
2245
1884
3087
4464
903
2433
3370
2490
1193
1101
2551
3686
1086
2956
5269
4306
3498
2617
4608
1089
5729
2263
1962
2013
1028
2148
587
4498
4832
1370
3613
1082
3468
118
3688
6027
5506
1956
5968
3763
4932
1598
3526
597
3957
2259
515
2278
4886
4041
1304
5120
1945
1917
3263
454
317
2054
1378
2289
1219
2355
4128
2120
2002
593
4111
2952
4146
2517
3546
4489
1965
4941
2255
532
3360
4022
1250
1358
2739
1704
235
1263
3101
5703
1042
3483
4835
5303
2320
2883
5843
3359
2166
344
4776
3053
2819
1283
1231
4190
6084
4896
18
1798
517
1267
1327
13

Non zero values in cooccurence_matrix :3115810


Unnamed: 0,userId,movieId,rating,view_count
0,30.0,2915.0,0.105438,1.0
1,30.0,1953.0,0.103055,2.0
2,30.0,2248.0,0.098761,3.0
3,30.0,2463.0,0.098121,4.0
4,30.0,2303.0,0.09764,5.0
5,30.0,2717.0,0.095842,6.0
6,30.0,2779.0,0.095424,7.0
7,30.0,2797.0,0.093767,8.0
8,30.0,1387.0,0.091434,9.0
9,30.0,1288.0,0.091372,10.0
