In [1]:
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity
# import cProfile

In [2]:
# get original data
# transpose it to a sparse matrix where column is movie, row is user and cell is rating
def get_data(file,col_names):
    ratings_contents = pd.read_table(file, names = col_names)
    
    num_users = ratings_contents.user.max()
    num_movies = ratings_contents.movie.max()
    
    ratings_as_mat = sparse.lil_matrix((num_users, num_movies))
    
    for _, row in ratings_contents.iterrows():
        ratings_as_mat[row.user - 1, row.movie - 1] = row.rating
        
    return ratings_contents, ratings_as_mat

In [3]:
# define how many neighborhoods we need for calculating similarity
num_nei = 75

In [4]:
# calculate similarity matrix
# and neighborhoods' index
def get_sim_nei(rating_matrix, num_nei):
    
    sim_matrix = cosine_similarity(rating_matrix.T)
    nei_index = sim_matrix.argsort()[:,-num_nei:]
    
    return sim_matrix, nei_index

In [26]:
# predict for one user
def pred(user_id,ratings_matrix, sim_matrix, nei_index):
    '''
    Input: user_id 
           ratings_matrix: users by items
           sim_matrix: item by item
           nei_index: item by neighborhoods
    Output: 
           prediction: 1-D array, non-zero elements represent estimated score for item_to_rate
    '''
    # find index of movies that have been rated by this user, in this case index also is movie id
    rated_item_index = ratings_matrix[user_id].nonzero()[1]
    # initialize a prediction array
    prediction = np.zeros(ratings_matrix.shape[1])
    # make prediction on movies
    for item_to_rate in range(ratings_matrix.shape[1]):
        # only predect items not rated by this user yet
        if item_to_rate not in rated_item_index:
            # find index of intersetion between rated movie by this user and this item's neighborhoods
            relevant_item_index = np.intersect1d(nei_index[item_to_rate],rated_item_index, assume_unique = True)
            # make predictions on this item for this user
            prediction[item_to_rate] = (ratings_matrix[user_id,relevant_item_index] * \
                                        sim_matrix[item_to_rate,relevant_item_index]) / \
                                        sim_matrix[item_to_rate,relevant_item_index].sum()
    return prediction

In [29]:
def pred_all_user(ratings_matrix, sim_matrix, nei_index):
    return [pred(user_id, ratings_matrix, sim_matrix, nei_index) for user_id in range(ratings_matrix.shape[0])]

In [33]:
def top_n_recommendations(prediction, top_n, ratings_contents):
    top_index = np.argsort(prediction)[::-1][:top_n + 1]
    return ratings_contents.ix[top_index,"movie"].values

In [6]:
file = 'data/u.data'
col_names = ["user", "movie", "rating", "timestamp"]
ratings_contents, ratings_as_mat = get_data(file, col_names)

In [7]:
sim_matrix, nei_index = get_sim_nei(ratings_as_mat, num_nei)

In [11]:
user_id = np.random.randint(low = 0, high = ratings_as_mat.shape[0])
print(user_id)

539


In [27]:
prediction = pred(user_id, ratings_as_mat, sim_matrix,nei_index)
print(prediction)



[ 0.          4.          3.73137362 ...,  4.                 nan  4.        ]


In [35]:
print("top 10 movies' name:")
print(top_n_recommendations(prediction, 10, ratings_contents))

top 10 movies' name:
[  68  191  974  125  135   50  187  240  625  289 1245]
