In [1]:
import os, sys
sys.path.insert(0, '/Users/victornguyen/Sites/07.book_management')
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "book_management.settings")

import django
from django.db.models import Count

from scipy.sparse import dok_matrix
from sklearn.metrics import pairwise_distances
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib as mlt

mlt.use('TkAgg')
import matplotlib.pyplot as plt

import numpy as np

django.setup()

from recommendation.models import Rating, Cluster
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from scipy.spatial.distance import correlation, cosine
import ipywidgets as widgets
from IPython.display import display, clear_output
from sklearn.metrics import pairwise_distances
from sklearn.metrics import mean_squared_error

In [2]:
# One dict per user (user_id plus title_count), ordered from the most to the
# fewest rated titles.
user_ids = list(
    Rating.objects
    .values('user_id')
    .annotate(title_count=Count('title_id'))
    .order_by('-title_count')
)

# Distinct title ids, and the column position assigned to each of them.
content_ids = list(Rating.objects.values('title_id').distinct())
content_map = {
    entry['title_id']: column
    for column, entry in enumerate(content_ids)
}

num_users = len(user_ids)

# Empty (users x titles) sparse matrix in dictionary-of-keys format.
user_ratings = dok_matrix((num_users, len(content_ids)), dtype=np.float32)

In [3]:
# Load every Rating row into a DataFrame, then force the id and score columns
# to numeric dtypes; values that cannot be parsed become NaN.
df = pd.DataFrame(list(Rating.objects.all().values()))
df = df.assign(**{
    name: pd.to_numeric(df[name], errors='coerce')
    for name in ('user_id', 'title_id', 'rating')
})

In [4]:
df

Unnamed: 0,id,rating,rating_timestamp,title_id,type,user_id
0,1,9.43,2019-06-17 16:14:40.824420,20,calculate,8
1,2,9.43,2019-06-17 16:14:40.833565,119,calculate,8
2,3,4.32,2019-06-17 16:14:40.835441,140,calculate,8
3,4,9.08,2019-06-17 16:14:40.836554,113,calculate,9
4,5,9.96,2019-06-17 16:14:40.837826,14,calculate,10
5,6,3.04,2019-06-17 16:14:40.838531,164,calculate,11
6,7,9.52,2019-06-17 16:14:40.840193,143,calculate,13
7,8,4.50,2019-06-17 16:14:40.841104,54,calculate,14
8,9,4.50,2019-06-17 16:14:40.841901,66,calculate,15
9,10,7.86,2019-06-17 16:14:40.843328,88,calculate,16


In [5]:
# Largest user id and title id present in the ratings frame.
num_user = df['user_id'].max()
num_title = df['title_id'].max()

In [6]:
num_user

125

In [7]:
num_title

164

In [8]:
# Dense (user id x title id) table addressed directly by id: one extra row and
# column so that the largest id is a valid index. Cells without a rating stay 0.
rating_matrix = np.zeros((num_user + 1, num_title + 1))
for user, title, score in zip(df['user_id'], df['title_id'], df['rating']):
    rating_matrix[user, title] = score

In [9]:
rating_matrix[8]

array([0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 9.43, 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 9.43, 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 4.32, 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.

In [10]:
# Pairwise similarity between the rows of rating_matrix: pairwise_distances
# returns the "correlation" distance, and 1 - distance is used as similarity.
# NOTE(review): rating_matrix is zero-filled, so cells without a rating take
# part in the correlation as 0 — confirm this is intended.
pearson_sim = 1-pairwise_distances(rating_matrix, metric="correlation")

In [11]:
#This function finds k similar users given the user_id and ratings matrix M
#Note that the similarities are same as obtained via using pairwise_distances
def findksimilarusers(user_id, ratings, metric, k=4):
    """Find and print the ``k`` rows of ``ratings`` closest to one user's row.

    Args:
        user_id: 1-based user number; the user's ratings are read from row
            ``user_id - 1`` of ``ratings``.
        ratings: 2-D user x item ratings, either a pandas DataFrame or a
            numpy array.
        metric: distance metric name handed to ``NearestNeighbors``.
        k: number of neighbours requested from the model. The query row is
            itself part of the fitted data; a neighbour whose row number
            equals ``user_id`` is skipped when printing.

    Returns:
        Tuple ``(similarities, indices)``: ``similarities`` is the flattened
        ``1 - distance`` array, ``indices`` is the 0-based row index array
        exactly as returned by ``kneighbors``.
    """
    # Work on a plain array so DataFrames and ndarrays are both accepted.
    ratings_array = np.asarray(ratings)

    # `metric` must be passed by keyword: the first positional parameter of
    # NearestNeighbors is n_neighbors, so a positional metric is never applied.
    model_knn = NearestNeighbors(metric=metric, algorithm='brute')
    model_knn.fit(ratings_array)

    query = ratings_array[user_id - 1].reshape(1, -1)
    distances, indices = model_knn.kneighbors(query, n_neighbors=k)
    similarities = 1 - distances.flatten()

    print('{0} most similar users for User {1}:\n'.format(k-1,user_id))
    for rank, (row, similarity) in enumerate(zip(indices.flatten(), similarities)):
        # Rows are 0-based, user numbers are 1-based.
        if row + 1 == user_id:
            continue
        print('{0}: User {1}, with similarity of {2}'.format(rank, row + 1, similarity))

    return similarities, indices

In [117]:
# Toy ratings used by the functions below: one row per user, one column per
# item, NaN where no rating exists. Row 0 holds no ratings at all.
ratings_matrix = np.array([[np.nan,np.nan,np.nan,np.nan,np.nan], 
                           [5,1,np.nan,2,2], 
                           [1,5,2,5,5],
                           [2,np.nan,3,5,4],
                           [4,3,5,3,np.nan]])

In [118]:
def specified_rating_indices(u):
    """Locate the entries of ``u`` that hold an actual rating.

    An entry counts as a rating when it is finite (neither NaN nor +/-inf).

    Args:
        u: array of ratings.

    Returns:
        ``None`` when ``u`` has no finite entry; otherwise a list with one
        tuple of indices per axis of ``u`` (for a 1-D vector, a one-element
        list whose item is the tuple of rated positions).
    """
    rated = np.isfinite(u)
    # The emptiness test and the index lookup share one mask, so a vector
    # holding only NaN/inf is reported as "no ratings" instead of producing
    # an empty index tuple.
    if not rated.any():
        return None
    return [tuple(axis_indices) for axis_indices in np.where(rated)]

In [119]:
def mean(u):
    """Average of the specified (finite) ratings in ``u``.

    Args:
        u: array-like of ratings; NaN marks a missing rating.

    Returns:
        The arithmetic mean of the finite entries, or ``np.nan`` when there
        are none.
    """
    u = np.asarray(u, dtype=float)
    # A boolean mask is used instead of a list of index tuples: indexing an
    # ndarray with a list of tuples is not treated as a multi-index by current
    # NumPy and would yield a wrongly shaped selection.
    specified_ratings = u[np.isfinite(u)]
    if specified_ratings.size == 0:
        return np.nan
    return specified_ratings.sum() / specified_ratings.size

In [120]:
def all_user_mean_ratings(ratings_matrix):
    """Return a 1-D array holding ``mean`` of every row of ``ratings_matrix``.

    Args:
        ratings_matrix: 2-D array, one row per user.

    Returns:
        Array with one entry per row, in row order.
    """
    per_user_means = []
    for user_row in range(ratings_matrix.shape[0]):
        per_user_means.append(mean(ratings_matrix[user_row, :]))
    return np.array(per_user_means)

In [121]:
def get_mean_centered_ratings_matrix(ratings_matrix):
    """Subtract each user's mean rating from that user's row.

    Prints, in order: the input matrix, the per-user means, the means shaped
    as a column, and the centred result.

    Args:
        ratings_matrix: 2-D array, one row per user.

    Returns:
        Array of the same shape with every row shifted by its user's mean.
    """
    print(ratings_matrix)
    row_means = all_user_mean_ratings(ratings_matrix)
    print(row_means)
    # Column shape (n_users, 1) so the subtraction broadcasts across items.
    means_as_column = np.reshape(row_means, (-1, 1))
    print(means_as_column)
    centered = ratings_matrix - means_as_column
    print(centered)
    return centered

In [122]:
# Mean-centre the toy ratings once; predict() reads this module-level matrix.
mean_centered_ratings_matrix = get_mean_centered_ratings_matrix(ratings_matrix)

[[nan nan nan nan nan]
 [ 5.  1. nan  2.  2.]
 [ 1.  5.  2.  5.  5.]
 [ 2. nan  3.  5.  4.]
 [ 4.  3.  5.  3. nan]]
[ nan 2.5  3.6  3.5  3.75]
[[ nan]
 [2.5 ]
 [3.6 ]
 [3.5 ]
 [3.75]]
[[  nan   nan   nan   nan   nan]
 [ 2.5  -1.5    nan -0.5  -0.5 ]
 [-2.6   1.4  -1.6   1.4   1.4 ]
 [-1.5    nan -0.5   1.5   0.5 ]
 [ 0.25 -0.75  1.25 -0.75   nan]]


  """


In [126]:
def pearson(u, v):
    """Pearson correlation between two users over the items both have rated.

    Each vector is centred on the mean of *all* of that user's specified
    ratings (not only the co-rated ones); the sums in the numerator and the
    denominator then run over the co-rated items only.

    Args:
        u: 1-D ratings of the first user, NaN where unrated.
        v: 1-D ratings of the second user, same length as ``u``.

    Returns:
        The correlation coefficient, or ``np.nan`` when either user has no
        ratings, the users share no rated item, or one of the centred
        co-rated vectors is all zeros (the ratio would be 0/0).
    """
    u = np.asarray(u, dtype=float)
    v = np.asarray(v, dtype=float)
    rated_u = np.isfinite(u)
    rated_v = np.isfinite(v)
    if not rated_u.any() or not rated_v.any():
        return np.nan

    mean_u = u[rated_u].mean()
    mean_v = v[rated_v].mean()

    # Items rated by both users.
    both_rated = rated_u & rated_v
    centralized_mutually_u = u[both_rated] - mean_u
    centralized_mutually_v = v[both_rated] - mean_v

    denominator = (np.sqrt(np.sum(np.square(centralized_mutually_u)))
                   * np.sqrt(np.sum(np.square(centralized_mutually_v))))
    # A zero denominator forces a zero numerator as well, so the ratio is
    # undefined; return NaN explicitly instead of dividing 0 by 0.
    if denominator == 0:
        return np.nan
    return np.sum(centralized_mutually_u * centralized_mutually_v) / denominator

In [127]:
print(pearson(ratings_matrix[0, :], ratings_matrix[2, :]))

nan


  """


In [130]:
def get_user_similarity_value_for(u_index, ratings_matrix):
    """Similarity of every user to user ``u_index``.

    Args:
        u_index: row index of the reference user.
        ratings_matrix: 2-D array, one row per user.

    Returns:
        1-D array whose entry ``i`` is ``pearson(row i, row u_index)``.
    """
    reference_row = ratings_matrix[u_index, :]
    scores = []
    for other in range(ratings_matrix.shape[0]):
        scores.append(pearson(ratings_matrix[other, :], reference_row))
    return np.array(scores)

In [131]:
def get_user_similarity_matrix(ratings_matrix):
    """Build the user-by-user similarity matrix, printing each row as it is computed.

    Args:
        ratings_matrix: 2-D array, one row per user.

    Returns:
        2-D array whose row ``u`` is ``get_user_similarity_value_for(u, ratings_matrix)``.
    """
    per_user_rows = []
    for user in range(ratings_matrix.shape[0]):
        row = get_user_similarity_value_for(user, ratings_matrix)
        print(user,':',row)
        per_user_rows.append(row)
    return np.array(per_user_rows)

In [132]:
# Pairwise user similarities for the toy ratings; predict() reads this
# module-level matrix.
user_similarity_matrix = get_user_similarity_matrix(ratings_matrix)
print(user_similarity_matrix)

0 : [nan nan nan nan nan]
1 : [        nan  1.         -0.93757325 -0.83887049  0.65923172]
2 : [        nan -0.93757325  1.          0.92179059 -0.78719671]
3 : [        nan -0.83887049  0.92179059  1.         -0.65923172]
4 : [        nan  0.65923172 -0.78719671 -0.65923172  1.        ]
[[        nan         nan         nan         nan         nan]
 [        nan  1.         -0.93757325 -0.83887049  0.65923172]
 [        nan -0.93757325  1.          0.92179059 -0.78719671]
 [        nan -0.83887049  0.92179059  1.         -0.65923172]
 [        nan  0.65923172 -0.78719671 -0.65923172  1.        ]]


  """


In [141]:
def predict(u_index, i_index, k=None):
    """Predict the rating user ``u_index`` would give item ``i_index``.

    The prediction is the user's own mean rating plus the similarity-weighted
    average of the neighbours' mean-centred ratings of the item, normalised by
    the sum of the absolute similarities.

    Reads the module-level ``ratings_matrix``, ``user_similarity_matrix`` and
    ``mean_centered_ratings_matrix``.

    Args:
        u_index: row index of the user to predict for.
        i_index: column index of the item.
        k: number of most similar users (among those who rated the item) to
            use; freely tunable. ``None`` (default) uses every such user.

    Returns:
        The predicted rating. Falls back to the user's mean rating when no
        usable neighbour exists (nobody rated the item, or the similarities
        are undefined or sum to zero); that mean is NaN for a user without
        any rating.

    Raises:
        ValueError: if ``k`` is negative.
    """
    if k is not None and k < 0:
        raise ValueError("k must be non-negative")

    # Only this user's mean is needed, so it is computed directly rather than
    # recomputing every user's mean on each call.
    target_ratings = ratings_matrix[u_index, :]
    own_ratings = target_ratings[np.isfinite(target_ratings)]
    user_mean = own_ratings.sum() / own_ratings.size if own_ratings.size else np.nan

    similarity_value = user_similarity_matrix[u_index]

    # Usable neighbours rated the item AND have a defined similarity to the
    # target; a NaN similarity would otherwise turn both sums into NaN.
    # The target user is not excluded: if they already rated the item, their
    # own deviation takes part, weighted by their self-similarity.
    usable = np.isfinite(ratings_matrix[:, i_index]) & np.isfinite(similarity_value)
    neighbours = np.flatnonzero(usable)

    # Most similar first, then keep the top k if requested.
    neighbours = neighbours[np.argsort(-similarity_value[neighbours], kind='stable')]
    if k is not None:
        neighbours = neighbours[:k]

    top_k_similarity_value = similarity_value[neighbours]
    normaliser = np.sum(np.abs(top_k_similarity_value))
    if normaliser == 0:
        return user_mean

    top_k_ratings = mean_centered_ratings_matrix[neighbours, i_index]
    r_hat = user_mean + np.sum(top_k_ratings * top_k_similarity_value) / normaliser
    return r_hat

In [142]:
print(predict(3, 4))

4.300511907823464


  """


In [143]:
def predict_top_k_items_of_user(u_index, k_items):
    """Recommend the ``k_items`` best items the user has not rated yet.

    Every item whose rating is NaN for ``u_index`` in the module-level
    ``ratings_matrix`` is scored with ``predict``.

    Args:
        u_index: row index of the user.
        k_items: maximum number of items to return; ``None`` returns all.

    Returns:
        List of ``(item_index, predicted_rating)`` tuples, highest predicted
        rating first, truncated to ``k_items`` entries. Items whose prediction
        is NaN cannot be ranked and are placed after all ranked items.

    Raises:
        ValueError: if ``k_items`` is negative.
    """
    if k_items is not None and k_items < 0:
        raise ValueError("k_items must be non-negative")

    candidates = [
        (i_index, predict(u_index, i_index))
        for i_index in range(ratings_matrix.shape[1])
        if np.isnan(ratings_matrix[u_index][i_index])
    ]

    # NaN breaks comparison-based sorting, so rank only the defined
    # predictions and append the undefined ones at the tail.
    rankable = [item for item in candidates if not np.isnan(item[1])]
    unrankable = [item for item in candidates if np.isnan(item[1])]
    ranked = sorted(rankable, key=lambda tup: tup[1])[::-1] + unrankable

    # Honour the requested list length.
    return ranked[:k_items]

In [144]:
print(predict_top_k_items_of_user(3, 4))

[(1, 4.757591389605282)]


  """
