In [2]:
import os, sys
sys.path.insert(0, '/Users/victornguyen/Sites/07.book_management')
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "book_management.settings")

import django
from django.db.models import Count

from scipy.sparse import dok_matrix
from sklearn.metrics import pairwise_distances
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib as mlt

mlt.use('TkAgg')
import matplotlib.pyplot as plt

import numpy as np

django.setup()

from recommendation.models import Rating, Cluster
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from scipy.spatial.distance import correlation, cosine
import ipywidgets as widgets
from IPython.display import display, clear_output
from sklearn.metrics import pairwise_distances
from sklearn.metrics import mean_squared_error

In [2]:
user_ids = list(
            Rating.objects.values('user_id')
                .annotate(title_count=Count('title_id'))
                .order_by('-title_count'))
content_ids = list(Rating.objects.values('title_id').distinct())
content_map = {content_ids[i]['title_id']: i
               for i in range(len(content_ids))}
num_users = len(user_ids)
user_ratings = dok_matrix((num_users,
                           len(content_ids)),
                          dtype=np.float32)

In [3]:
df=pd.DataFrame(list(Rating.objects.all().values()))
df['user_id'] = pd.to_numeric(df['user_id'],errors='coerce')
df['title_id'] = pd.to_numeric(df['title_id'],errors='coerce')
df['rating'] = pd.to_numeric(df['rating'],errors='coerce')

In [4]:
num_user =df.user_id.max()
num_title =df.title_id.max()

In [5]:
num_user

125

In [6]:
num_title

164

In [7]:
rating_matrix = np.zeros((num_user+1, num_title+1))
for row in df.itertuples():
    rating_matrix[row.user_id, row.title_id] = row.rating

In [8]:
rating_matrix[8]

array([0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 9.43, 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 9.43, 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 4.32, 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.

In [9]:
pearson_sim = 1-pairwise_distances(rating_matrix, metric="correlation")

In [10]:
#This function finds k similar users given the user_id and ratings matrix M
#Note that the similarities are same as obtained via using pairwise_distances
def findksimilarusers(user_id, ratings, metric, k=4):
    similarities=[]
    indices=[]
    model_knn = NearestNeighbors(metric, algorithm = 'brute') 
    model_knn.fit(ratings)

    distances, indices = model_knn.kneighbors(ratings.iloc[user_id-1, :].values.reshape(1, -1), n_neighbors = k)
    similarities = 1-distances.flatten()
    print ('{0} most similar users for User {1}:\n'.format(k-1,user_id))
    for i in range(0, len(indices.flatten())):
        if indices.flatten()[i]+1 == user_id:
            continue;

        else:
            print ('{0}: User {1}, with similarity of {2}'.format(i, indices.flatten()[i]+1, similarities.flatten()[i]))
            
    return similarities,indices

In [12]:
similarities,indices = findksimilarusers(1,rating_matrix, metric='correlation')

TypeError: '<=' not supported between instances of 'str' and 'int'

In [10]:
ratings_matrix = np.array([[5,1,np.nan,2,2], 
                [1,5,2,5,5],
                [2,np.nan,3,5,4],
                [4,3,5,3,np.nan]])

In [11]:
def specified_rating_indices(u):
    return list(map(tuple, np.where(np.isfinite(u))))

In [12]:
def mean(u):
    # may use specified_rating_indices but use more time
    specified_ratings = u[specified_rating_indices(u)]#u[np.isfinite(u)]
    m = sum(specified_ratings)/np.shape(specified_ratings)[0]
    return m

In [13]:
def all_user_mean_ratings(ratings_matrix):
    return np.array([mean(ratings_matrix[u, :]) for u in range(ratings_matrix.shape[0])])


In [14]:
def get_mean_centered_ratings_matrix(ratings_matrix):
    users_mean_rating = all_user_mean_ratings(ratings_matrix)
    mean_centered_ratings_matrix = ratings_matrix - np.reshape(users_mean_rating, [-1, 1])
    return mean_centered_ratings_matrix

In [15]:
mean_centered_ratings_matrix = get_mean_centered_ratings_matrix(ratings_matrix)

  This is separate from the ipykernel package so we can avoid doing imports until


In [16]:
def pearson(u, v):
    mean_u = mean(u)
    mean_v = mean(v)
    
    specified_rating_indices_u = set(specified_rating_indices(u)[0])
    specified_rating_indices_v = set(specified_rating_indices(v)[0])
    
    mutually_specified_ratings_indices = specified_rating_indices_u.intersection(specified_rating_indices_v)
    mutually_specified_ratings_indices = list(mutually_specified_ratings_indices)
    
    u_mutually = u[mutually_specified_ratings_indices]
    v_mutually = v[mutually_specified_ratings_indices]
      
    centralized_mutually_u = u_mutually - mean_u
    centralized_mutually_v = v_mutually - mean_v

    result = np.sum(np.multiply(centralized_mutually_u, centralized_mutually_v)) 
    result = result / (np.sqrt(np.sum(np.square(centralized_mutually_u))) * np.sqrt(np.sum(np.square(centralized_mutually_v))))

    return result

In [17]:
print(pearson(ratings_matrix[1, :], ratings_matrix[2, :]))

0.9217905864112383


  This is separate from the ipykernel package so we can avoid doing imports until


In [19]:
def get_user_similarity_value_for(u_index, ratings_matrix):
    user_ratings = ratings_matrix[u_index, :]
    similarity_value = np.array([pearson(ratings_matrix[i, :], user_ratings) for i in range(ratings_matrix.shape[0])])
    return similarity_value

In [20]:
def get_user_similarity_matrix(ratings_matrix):
    similarity_matrix = []
    for u_index in range(ratings_matrix.shape[0]):
        similarity_value = get_user_similarity_value_for(u_index, ratings_matrix)
        similarity_matrix.append(similarity_value)
    return np.array(similarity_matrix)

user_similarity_matrix = get_user_similarity_matrix(ratings_matrix)
print(user_similarity_matrix)

[[ 1.         -0.93757325 -0.83887049  0.65923172]
 [-0.93757325  1.          0.92179059 -0.78719671]
 [-0.83887049  0.92179059  1.         -0.65923172]
 [ 0.65923172 -0.78719671 -0.65923172  1.        ]]


  This is separate from the ipykernel package so we can avoid doing imports until
