In [1]:
import numpy as np

In [2]:
# Toy user-item ratings matrix: one row per user, one column per item.
# np.nan marks a rating that has not been specified; the third column
# (item index 2) has no rating from any user.
ratings_matrix = np.array([
# Disabled all-NaN row (a user without any rating):
#                         [np.nan,np.nan,np.nan,np.nan,np.nan], 
                        [5,1,np.nan,2,2], 
                        [1,5,np.nan,5,5],
                        [2,np.nan,np.nan,5,4],
                        [4,3,np.nan,3,np.nan]])

In [3]:
def specified_rating_indices(u):
    """Return the positions of the specified (finite) ratings in ``u``.

    Parameters
    ----------
    u : numpy.ndarray
        Ratings array in which unspecified ratings are stored as ``np.nan``.

    Returns
    -------
    list of tuple or None
        One tuple of indices per axis of ``u`` (a 1-D vector therefore yields
        a one-element list and callers read ``[0]``), or ``None`` when ``u``
        contains no specified rating at all.
    """
    specified = np.isfinite(u)
    # Use one predicate for both the emptiness test and the index lookup.
    # Testing with isnan but indexing with isfinite made a vector holding only
    # NaN/inf return ``[()]`` instead of None.
    if not specified.any():
        return None
    return list(map(tuple, np.where(specified)))

In [4]:
def mean(u):
    """Return the average of the specified (finite) ratings in ``u``.

    Parameters
    ----------
    u : array_like
        Ratings in which unspecified entries are stored as ``np.nan``.

    Returns
    -------
    float
        Mean of the finite entries, or ``np.nan`` when there are none.
    """
    u = np.asarray(u, dtype=float)
    # Boolean-mask indexing is used on purpose: indexing with a list of index
    # tuples relied on a deprecated NumPy interpretation and, on current
    # releases, produces a 2-D result that breaks the average.
    specified_ratings = u[np.isfinite(u)]
    if specified_ratings.size == 0:
        # ``np.NaN`` no longer exists in NumPy 2.0; ``np.nan`` works everywhere.
        return np.nan
    return specified_ratings.mean()

In [5]:
def all_user_mean_ratings(ratings_matrix):
    """Compute ``mean`` for every row of ``ratings_matrix``.

    Returns a 1-D array with one entry per row (user), in row order.
    """
    user_count = ratings_matrix.shape[0]
    per_user_means = []
    for user in range(user_count):
        per_user_means.append(mean(ratings_matrix[user, :]))
    return np.array(per_user_means)

In [6]:
def get_mean_centered_ratings_matrix(ratings_matrix):
    """Subtract each row's mean rating from that row of ``ratings_matrix``.

    The input, the per-row means (flat and as a column) and the centered
    result are printed along the way. Unspecified (NaN) entries stay NaN.
    """
    print(ratings_matrix)

    row_means = all_user_mean_ratings(ratings_matrix)
    print(row_means)

    # A column vector broadcasts across the items of each row.
    means_as_column = np.reshape(row_means, [-1, 1])
    print(means_as_column)

    centered = ratings_matrix - means_as_column
    print(centered)
    return centered

In [7]:
# Center every user's ratings on that user's own mean. predict() reads this
# result as a module-level name, so this cell must run before predict().
mean_centered_ratings_matrix = get_mean_centered_ratings_matrix(ratings_matrix)

[[ 5.  1. nan  2.  2.]
 [ 1.  5. nan  5.  5.]
 [ 2. nan nan  5.  4.]
 [ 4.  3. nan  3. nan]]
[2.5        4.         3.66666667 3.33333333]
[[2.5       ]
 [4.        ]
 [3.66666667]
 [3.33333333]]
[[ 2.5        -1.5                nan -0.5        -0.5       ]
 [-3.          1.                 nan  1.          1.        ]
 [-1.66666667         nan         nan  1.33333333  0.33333333]
 [ 0.66666667 -0.33333333         nan -0.33333333         nan]]


  """


In [8]:
def pearson(u, v):
    """Pearson similarity between two rating vectors with missing entries.

    Each vector is centered on the mean of *all* of its own specified
    ratings; the correlation itself is then accumulated only over the
    positions where both vectors have a specified rating.

    Parameters
    ----------
    u, v : array_like
        Rating vectors of equal length; unspecified ratings are ``np.nan``.

    Returns
    -------
    float
        The similarity, or ``np.nan`` when either vector has no specified
        rating, when the vectors share no specified position, or when either
        centered co-rated vector has zero norm.
    """
    u = np.asarray(u, dtype=float)
    v = np.asarray(v, dtype=float)

    specified_u = np.isfinite(u)
    specified_v = np.isfinite(v)
    if not specified_u.any() or not specified_v.any():
        # ``np.NaN`` no longer exists in NumPy 2.0; ``np.nan`` works everywhere.
        return np.nan

    mean_u = u[specified_u].mean()
    mean_v = v[specified_v].mean()

    # Positions rated in both vectors.
    mutually_specified = specified_u & specified_v
    centralized_mutually_u = u[mutually_specified] - mean_u
    centralized_mutually_v = v[mutually_specified] - mean_v

    denominator = (np.sqrt(np.sum(np.square(centralized_mutually_u)))
                   * np.sqrt(np.sum(np.square(centralized_mutually_v))))
    if denominator == 0:
        # No overlap or no variation: undefined, and avoids a 0/0 RuntimeWarning.
        return np.nan

    return np.sum(centralized_mutually_u * centralized_mutually_v) / denominator

In [9]:
# Similarity between the users stored in rows 0 and 2 of the ratings matrix.
print(pearson(ratings_matrix[0, :], ratings_matrix[2, :]))

-0.8908708063747478


  """


In [10]:
def get_user_similarity_value_for(u_index, ratings_matrix):
    """Return the ``pearson`` value of every row against row ``u_index``.

    The result is a 1-D array with one entry per row of ``ratings_matrix``,
    in row order (the entry at ``u_index`` compares the row with itself).
    """
    target_row = ratings_matrix[u_index, :]
    scores = []
    for other in range(ratings_matrix.shape[0]):
        scores.append(pearson(ratings_matrix[other, :], target_row))
    return np.array(scores)

In [11]:
def get_user_similarity_matrix(ratings_matrix):
    """Stack the similarity vector of every row into one array.

    Row ``r`` of the result is ``get_user_similarity_value_for(r, ...)``;
    each vector is printed, prefixed with its row index, as it is computed.
    """
    n_rows = ratings_matrix.shape[0]
    collected = []
    for row in range(n_rows):
        row_scores = get_user_similarity_value_for(row, ratings_matrix)
        print(row,':',row_scores)
        collected.append(row_scores)
    return np.array(collected)

In [12]:
# Build the user-user similarity matrix once. predict() reads it as a
# module-level name, so this cell must run before predict().
user_similarity_matrix = get_user_similarity_matrix(ratings_matrix)
print(user_similarity_matrix)

0 : [ 1.         -0.96225045 -0.89087081  0.96609178]
1 : [-0.96225045  1.          0.93048421 -0.98473193]
2 : [-0.89087081  0.93048421  1.         -0.97780241]
3 : [ 0.96609178 -0.98473193 -0.97780241  1.        ]
[[ 1.         -0.96225045 -0.89087081  0.96609178]
 [-0.96225045  1.          0.93048421 -0.98473193]
 [-0.89087081  0.93048421  1.         -0.97780241]
 [ 0.96609178 -0.98473193 -0.97780241  1.        ]]


  """


In [13]:
def predict(u_index, i_index, k, ratings=None, similarities=None, centered_ratings=None):
    """Predict the rating of user ``u_index`` for item ``i_index``.

    The prediction is the user's own mean rating plus the similarity-weighted
    average of the mean-centered ratings given to the item by the ``k`` users
    most similar to the target user (weights are normalised by the sum of
    their absolute values).

    Parameters
    ----------
    u_index : int
        Row index of the user to predict for.
    i_index : int
        Column index of the item to predict.
    k : int
        Number of users similar to the target user to take into account;
        this value can be chosen freely. When fewer than ``k`` usable users
        rated the item, all of them are used.
    ratings : numpy.ndarray, optional
        User-by-item ratings with ``np.nan`` for unspecified entries.
        Defaults to the module-level ``ratings_matrix``.
    similarities : numpy.ndarray, optional
        User-by-user similarity matrix. Defaults to the module-level
        ``user_similarity_matrix``.
    centered_ratings : numpy.ndarray, optional
        Mean-centered version of ``ratings``. Defaults to the module-level
        ``mean_centered_ratings_matrix``.

    Returns
    -------
    float
        The predicted rating, or ``np.nan`` when nobody rated the item, when
        the target user has no rating, or when the selected weights sum to 0.

    Notes
    -----
    The target user is not removed from the neighbourhood, so for an item the
    user already rated their own rating takes part in the average.
    """
    if ratings is None:
        ratings = ratings_matrix
    if similarities is None:
        similarities = user_similarity_matrix
    if centered_ratings is None:
        centered_ratings = mean_centered_ratings_matrix

    rated_item = np.isfinite(ratings[:, i_index])
    if not rated_item.any():
        return np.nan
    print('users_rated_item', tuple(int(user) for user in np.flatnonzero(rated_item)))

    similarity_value = np.asarray(similarities[u_index], dtype=float)

    # argsort places NaN last, so after reversing NaN similarities would rank
    # first and turn the whole prediction into NaN; drop them explicitly.
    usable = rated_item & np.isfinite(similarity_value)
    ranked_users = np.argsort(similarity_value)[::-1]
    ranked_users = ranked_users[usable[ranked_users]]
    top_k_similar_user = ranked_users[:k]

    top_k_similarity_value = similarity_value[top_k_similar_user]
    weight_sum = np.sum(np.abs(top_k_similarity_value))
    if weight_sum == 0:
        # No usable neighbour (or only zero weights): undefined prediction.
        return np.nan

    # Only the target user's mean is needed, not the mean of every user.
    user_ratings = np.asarray(ratings[u_index], dtype=float)
    user_specified = user_ratings[np.isfinite(user_ratings)]
    if user_specified.size == 0:
        return np.nan
    user_mean = user_specified.mean()

    top_k_ratings = centered_ratings[top_k_similar_user, i_index]
    return user_mean + np.sum(top_k_ratings * top_k_similarity_value) / weight_sum

In [14]:
# Item 2 has no rating from any user, so this prediction is NaN.
print(predict(1, 2, 10))

nan


  """


In [21]:
def predict_top_k_items_of_user(u_index, k_users):
    """Rank the items user ``u_index`` has not rated by predicted rating.

    Every item without a rating from the user is passed to ``predict`` with
    ``k_users`` neighbours; items whose prediction is NaN are skipped.
    Reads the module-level ``ratings_matrix`` and prints it, followed by each
    item index as it is visited.

    Returns a list of ``(item_index, predicted_rating)`` tuples ordered from
    the highest to the lowest predicted rating.
    """
    print(ratings_matrix)
    predictions = []
    # Visit every item (column) in turn.
    for i_index in range(ratings_matrix.shape[1]):
        print(i_index)
        if not np.isnan(ratings_matrix[u_index][i_index]):
            continue
        rating = predict(u_index, i_index, k_users)
        if np.isnan(rating):
            continue
        predictions.append((i_index, rating))
    ascending = sorted(predictions, key=lambda pair: pair[1])
    return ascending[::-1]

In [22]:
# Predicted ratings for the items user 2 has not rated, using up to 10 similar users.
print(predict_top_k_items_of_user(2, 10))

[[ 5.  1. nan  2.  2.]
 [ 1.  5. nan  5.  5.]
 [ 2. nan nan  5.  4.]
 [ 4.  3. nan  3. nan]]
0
1
users_rated_item (0, 1, 3)
2
3
4
[(1, 4.592918448510355)]


  """
