In [21]:
import pandas as pd
import numpy as np
import math

ratings_path = "./ml-latest-small/ratings.csv"
movies_path = "./ml-latest-small/movies.csv"
tags_path = "./ml-latest-small/tags.csv"
similar_users_path = "./similar_users.csv"
user_coefficients_path = "./user_coefficients.csv"

Ratings = pd.DataFrame()
Movies = pd.DataFrame()
SimilarUsers = pd.DataFrame()
UserCoefficients = pd.DataFrame()

In [22]:
def load_dataset(dataset):
    dataset_name = dataset[0]
    dataset_path = dataset[1]
    try:
        data = pd.read_csv(dataset_path)
        print("{} is successfully read from memory.").format(dataset_name)
        return data
    except:
        print("CAUTION: {} cannot be read from memory.").format(dataset_name)
        return None

In [23]:
def unskewed_pearson_similarity(user1, user2):
    dot_product = user1.transpose().dot(user2)
    user1_vector_length = math.sqrt(user1.transpose().dot(user1))
    user2_vector_length = math.sqrt(user2.transpose().dot(user2))
    if user1_vector_length < 0.0000001 or user2_vector_length < 0.0000001 or (user1==user2).all():
        return 0
    else:
        return dot_product / user1_vector_length / user2_vector_length

In [24]:
def process_ratings(Ratings):
    MeanUserRating = Ratings.groupby(['userId'], as_index = False, sort = False).mean().rename(columns = {'rating': 'rating_mean'})[['userId','rating_mean']]
    Ratings = pd.merge(Ratings, MeanUserRating, on = 'userId', how = 'left', sort = False)
    Ratings['rating_adjusted'] = Ratings['rating'] - Ratings['rating_mean']
    Ratings['mean_skewness'] = (Ratings['rating_adjusted'])/(5-Ratings['rating_mean'])*(Ratings['rating_adjusted']>0).astype(float) - (Ratings['rating_adjusted'])/(Ratings['rating_mean']-0.5)*(Ratings['rating_adjusted']<0).astype(float)
    Ratings['rating_unskewed'] = Ratings['rating_adjusted'] * np.sqrt(1+(Ratings['mean_skewness']**2)*2) 
    PivotedUserMatrix = Ratings.pivot_table(index='movieId', columns='userId', values='rating_unskewed', fill_value=0)
    return Ratings, PivotedUserMatrix

In [25]:
def user_similarity_matrix():
    SimilarUsers = pd.DataFrame(0, index=np.arange(distinct_users.size), columns=np.arange(distinct_users.size), dtype='float')
    UserCoefficients = pd.DataFrame(0, index=np.arange(distinct_users.size), columns=np.arange(distinct_users.size), dtype='float')
    
    for user in distinct_users[:40]:
        userIndex = np.searchsorted(distinct_users, user)
        
        for user2 in distinct_users:
            user2Index = np.searchsorted(distinct_users, user2)
            
            proximity = unskewed_pearson_similarity(PivotedUserMatrix.iloc[:,userIndex], PivotedUserMatrix.iloc[:,user2Index])
            SimilarUsers[user2Index][userIndex] = proximity
            
        similarity_values = np.copy(SimilarUsers.values[userIndex])
        SimilarUsers.values[userIndex] = np.argsort(SimilarUsers.values[userIndex])[::-1]
        similarity_values.sort()
        similarity_values = similarity_values[::-1]
        UserCoefficients.values[userIndex] = np.where(similarity_values > 0, similarity_values, 0)
        SimilarUsers.values[userIndex] = np.where(similarity_values > 0, SimilarUsers.values[userIndex], 0)
        print("Calculated for {} users out of {}.").format(userIndex+1, distinct_users.size)
    
    UserCoefficients.to_csv(user_coefficients_path, index=False)
    SimilarUsers.to_csv(similar_users_path, index=False)
    return SimilarUsers, UserCoefficients

In [26]:
def load_precalculated(data, name, path, recalculator, arg):
    data = load_dataset((name, path))
    if data is None:
        print("{} data is being recalculated... It might take a while...").format(name)
        data = recalculator()[arg]
    return data

In [28]:
Ratings = load_dataset(("Ratings", ratings_path))
Ratings, PivotedUserMatrix = process_ratings(Ratings)
distinct_users = np.unique(Ratings['userId'])
distinct_movies = np.unique(Ratings['movieId'])
Movies = load_dataset(("Movies", movies_path))
SimilarUsers = load_precalculated(SimilarUsers, "SimilarUsers", similar_users_path, user_similarity_matrix, 0)
UserCoefficients = load_precalculated(UserCoefficients, "UserCoefficients", user_coefficients_path, user_similarity_matrix, 1)    

Ratings is successfully read from memory.
Movies is successfully read from memory.
CAUTION: SimilarUsers cannot be read from memory.
SimilarUsers data is being recalculated... It might take a while...
Calculated for 1 users out of 671.
Calculated for 2 users out of 671.
Calculated for 3 users out of 671.
Calculated for 4 users out of 671.
Calculated for 5 users out of 671.
Calculated for 6 users out of 671.
Calculated for 7 users out of 671.
Calculated for 8 users out of 671.
Calculated for 9 users out of 671.
Calculated for 10 users out of 671.
Calculated for 11 users out of 671.
Calculated for 12 users out of 671.
Calculated for 13 users out of 671.
Calculated for 14 users out of 671.
Calculated for 15 users out of 671.
Calculated for 16 users out of 671.
Calculated for 17 users out of 671.
Calculated for 18 users out of 671.
Calculated for 19 users out of 671.
Calculated for 20 users out of 671.
Calculated for 21 users out of 671.
Calculated for 22 users out of 671.
Calculated for 2

In [51]:
def accumulate_user_recommendations(userId, recommenders):
    recommendations = np.zeros(distinct_movies.size)
    userIndex = np.searchsorted(distinct_users, userId)
    recommendersProximity = UserCoefficients.values[userIndex]

    for recommender, proximity in zip(recommenders, recommendersProximity):
        recommenderIndex = np.searchsorted(distinct_users, recommender)
        recommenderRatings = PivotedUserMatrix.iloc[:,recommenderIndex]
        recommendation_vector = (recommenderRatings*proximity).values
        recommendations += recommendation_vector

    return recommendations

In [52]:
def user_collaborative_recommendations(userId):
    userIndex = np.searchsorted(np.unique(Ratings['userId']), userId)
    recommenders = np.extract(SimilarUsers.values[userIndex]>0, SimilarUsers.values[userIndex])
    acc_recommendations = accumulate_user_recommendations(userId, recommenders)
    user_recommendations = distinct_movies[np.argsort(acc_recommendations)][::-1]
    acc_recommendations.sort()
    sorted_recommendations = acc_recommendations[::-1]
    user_recommendations = np.extract(sorted_recommendations>0, user_recommendations)
    return user_recommendations

In [54]:
userId=5
res = user_collaborative_recommendations(userId)[:10]
res

array([ 296,  318,  260, 2571, 1196,  608,  858,   50,  593, 1198])

In [394]:
Movies[Movies['movieId'].isin(res)]

Unnamed: 0,movieId,title,genres
232,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi
266,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
284,318,"Shawshank Redemption, The (1994)",Crime|Drama
472,527,Schindler's List (1993),Drama|War
695,858,"Godfather, The (1972)",Crime|Drama
953,1196,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi
955,1198,Raiders of the Lost Ark (Indiana Jones and the...,Action|Adventure
3871,4993,"Lord of the Rings: The Fellowship of the Ring,...",Adventure|Fantasy
4395,5952,"Lord of the Rings: The Two Towers, The (2002)",Adventure|Fantasy
5026,7153,"Lord of the Rings: The Return of the King, The...",Action|Adventure|Drama|Fantasy


In [49]:
Movies[Movies['movieId'].isin(res)]

Unnamed: 0,movieId,title,genres
48,50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
232,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi
266,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
284,318,"Shawshank Redemption, The (1994)",Crime|Drama
525,593,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller
535,608,Fargo (1996),Comedy|Crime|Drama|Thriller
695,858,"Godfather, The (1972)",Crime|Drama
953,1196,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi
955,1198,Raiders of the Lost Ark (Indiana Jones and the...,Action|Adventure
2062,2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller
