In [57]:
# Reference article:
# https://juejin.im/post/5c6a20cbf265da2dcf62758a

import pandas as pd

# Ratings file: comma-separated, no header row, three columns per record.
data_url = 'https://gist.githubusercontent.com/guerbai/3f4964350678c84d359e3536a08f6d3a/raw/f62f26d9ac24d434b1a0be3b5aec57c8a08e7741/user_book_ratings.txt'
df = pd.read_csv(
    data_url,
    sep=',',
    header=None,
    names=['user_id', 'book_id', 'rating'],
)

In [58]:
# Show the first five rating records as plain text.
print(df.head(n=5))

    user_id   book_id  rating
0  user_001  book_001       4
1  user_001  book_002       3
2  user_001  book_005       5
3  user_002  book_001       5
4  user_002  book_003       4


In [59]:
# Number of distinct user ids / book ids present in the ratings table.
user_count = len(df['user_id'].unique())
book_count = len(df['book_id'].unique())

In [60]:
# Report the matrix dimensions derived from the data.
print('user_count: ', user_count)
print('book_count: ', book_count)

user_count:  6
book_count:  6


In [61]:
# Build the user-item relation matrix (first, map user/book ids to row/column indices)
# Map every user id / book id to its row / column position in the rating matrix.
# The labels are taken from the data itself (sorted for a stable order) instead of a
# hand-typed list, so the mapping always has exactly user_count / book_count entries
# and keeps working when the ratings file gains or loses users or books.
user_id_index_series = pd.Series(range(user_count), index=sorted(df['user_id'].unique()))
book_id_index_series = pd.Series(range(book_count), index=sorted(df['book_id'].unique()))

In [62]:
import numpy as np

def construct_user_item_matrix(df):
    """Build the dense user x book rating matrix from a ratings table.

    Args:
        df: DataFrame whose first three columns are user id, book id and rating.

    Returns:
        An int8 array of shape (user_count, book_count). Cell [u, b] holds the
        rating written for that user/book pair; pairs that never appear in
        ``df`` stay 0.
    """
    ratings_grid = np.zeros(shape=(user_count, book_count), dtype=np.int8)
    for record in df.itertuples():
        # record[0] is the DataFrame index; the columns start at position 1.
        user_label, book_label, score = record[1], record[2], record[3]
        row_pos = user_id_index_series[user_label]
        col_pos = book_id_index_series[book_label]
        ratings_grid[row_pos, col_pos] = score
    return ratings_grid

In [63]:
# Build the rating matrix once and print it under a short banner.
user_book_matrix = construct_user_item_matrix(df)
print(u'用户关系矩阵长这样： ', u'----------------', user_book_matrix, sep='\n')

用户关系矩阵长这样： 
----------------
[[4 3 0 0 5 0]
 [5 0 4 0 4 0]
 [4 0 5 3 4 0]
 [0 3 0 0 0 5]
 [0 4 0 0 0 4]
 [0 0 2 4 0 5]]


In [64]:
# Compute the similarity matrix
# sim(x, y) = x·y / (|x| |y|) = sum(x_i * y_i) / (sqrt(sum(x_i^2)) * sqrt(sum(y_i^2)))

def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two 1-D vectors, rounded to 2 decimals.

    Args:
        vec1: 1-D numeric array-like.
        vec2: 1-D numeric array-like of the same length.

    Returns:
        dot(vec1, vec2) / (|vec1| * |vec2|) rounded to two decimals, or 0.0
        when either vector has zero length (e.g. a user with no ratings),
        where the ratio is undefined.
    """
    # Promote to float64 before multiplying: the rating matrix is stored as
    # int8, and an int8 dot product wraps around silently once it exceeds 127.
    a = np.asarray(vec1, dtype=np.float64)
    b = np.asarray(vec2, dtype=np.float64)
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        # Avoid 0/0 -> nan (plus a RuntimeWarning) for all-zero vectors.
        return 0.0
    return round(a.dot(b) / denominator, 2)


In [65]:
def construct_similarity_matrix(user_item_matrix, dim=u'user'):
    """Build the symmetric pairwise cosine-similarity matrix for users or books.

    Args:
        user_item_matrix: 2-D array with one row per user and one column per book.
        dim: u'user' compares rows (users); any other value compares
            columns (books).

    Returns:
        A float array of shape (count, count), where count is the number of
        rows (for users) or columns (for books) of ``user_item_matrix``.
        Entry [i, j] is cosine_similarity of vectors i and j.
    """
    matrix = np.asarray(user_item_matrix)
    # Rows of the transpose are the book columns, so both cases iterate rows.
    vectors = matrix if dim == u'user' else matrix.T
    # Size everything from the matrix itself; the original looped over
    # range(user_count) even for books, which is wrong whenever the two
    # counts differ.
    count = vectors.shape[0]
    similarity_matrix = np.zeros((count, count))
    for i in range(count):
        # j starts at i, so the diagonal is filled by the same loop.
        for j in range(i, count):
            similarity = cosine_similarity(vectors[i], vectors[j])
            similarity_matrix[i, j] = similarity
            similarity_matrix[j, i] = similarity
    return similarity_matrix

In [66]:
# Pairwise similarities between users (rows) and between books (columns).
user_similarity_matrix = construct_similarity_matrix(user_book_matrix, dim='user')
book_similarity_matrix = construct_similarity_matrix(user_book_matrix, dim='book')
print('user_similarity_matrix: ', user_similarity_matrix, sep='\n')
print('book_similarity_matrix: ', book_similarity_matrix, sep='\n')

user_similarity_matrix: 
[[1.   0.75 0.63 0.22 0.3  0.  ]
 [0.75 1.   0.91 0.   0.   0.16]
 [0.63 0.91 1.   0.   0.   0.4 ]
 [0.22 0.   0.   1.   0.97 0.64]
 [0.3  0.   0.   0.97 1.   0.53]
 [0.   0.16 0.4  0.64 0.53 1.  ]]
book_similarity_matrix: 
[[1.   0.27 0.79 0.32 0.98 0.  ]
 [0.27 1.   0.   0.   0.34 0.65]
 [0.79 0.   1.   0.69 0.71 0.18]
 [0.32 0.   0.69 1.   0.32 0.49]
 [0.98 0.34 0.71 0.32 1.   0.  ]
 [0.   0.65 0.18 0.49 0.   1.  ]]


In [67]:
# Recommendation
def recommend_similar_users(user_id, n=3):
    """Return the matrix indices of the users most similar to ``user_id``.

    Args:
        user_id: label looked up in ``user_id_index_series``.
        n: maximum number of similar users to return.

    Returns:
        A numpy array of up to ``n`` user indices, ordered by descending
        similarity, with the user itself excluded.
    """
    target_pos = user_id_index_series[user_id]
    scores = pd.Series(user_similarity_matrix[target_pos])
    # Drop the user's own entry before ranking.
    ranked = scores.drop(index=target_pos).sort_values(ascending=False)
    return np.array(ranked.index[:n])

In [68]:
# Wrap the array in a 1-tuple so it is formatted as a single %s argument.
print('recommend user_indexes %s to user_001' % (recommend_similar_users('user_001'),))

recommend user_indexes [1 2 4] to user_001


In [69]:
def recommend_similar_items(item_id, n=3):
    """Return the matrix indices of the books most similar to ``item_id``.

    Args:
        item_id: key looked up in ``book_id_index_series``.
        n: maximum number of similar books to return.

    Returns:
        A numpy array of up to ``n`` book indices, ordered by descending
        similarity, with the book itself excluded.
    """
    target_pos = book_id_index_series[item_id]
    scores = pd.Series(book_similarity_matrix[target_pos])
    # Drop the book's own entry before ranking.
    ranked = scores.drop(index=target_pos).sort_values(ascending=False)
    return np.array(ranked.index[:n])

In [70]:
# Message typo fixed ("otem_indexes" -> "item_indexes"); re-run the cell to refresh its output.
# The array is wrapped in a 1-tuple so it is formatted as a single %s argument.
print('recommend item_indexes %s to book_001' % (recommend_similar_items('book_001'),))

recommend otem_indexes [4 2 3] to book_001


In [71]:
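# Recommend books to a user
# P(u,i) = sum_{j in N}(Sim(u,j) * R(j,i)) / sum_{j in N}(Sim(u,j)), where N is the set of similar users j who rated item i
# Use the user similarity matrix to recommend items to a user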

def recommend_item_to_user(user_id):
    """User-based recommendation: predict ratings for books the user has not rated.

    Candidate books are those rated by the user's two most similar users but
    not by the user. Each candidate's score is the similarity-weighted
    average of the ratings given by those similar users who rated it.

    Args:
        user_id: label looked up in ``user_id_index_series``.

    Returns:
        A pandas Series indexed by book index, holding predicted ratings
        rounded to 2 decimals and sorted in descending order.
    """
    target_pos = user_id_index_series[user_id]
    neighbours = recommend_similar_users(user_id, 2)

    # Everything a neighbour has rated, minus what the target user already rated.
    recommend_set = set()
    for neighbour in neighbours:
        neighbour_books = np.nonzero(user_book_matrix[neighbour])[0]
        recommend_set = recommend_set.union(neighbour_books)
    already_rated = np.nonzero(user_book_matrix[target_pos])[0]
    recommend_set = recommend_set.difference(already_rated)

    predict = pd.Series([0.0]*len(recommend_set), index=list(recommend_set))
    for book_pos in recommend_set:
        weighted_sum = 0
        weight_total = 0
        for neighbour in neighbours:
            rating = user_book_matrix[neighbour][book_pos]
            if rating != 0:
                # Only neighbours who actually rated this book contribute.
                weight = user_similarity_matrix[neighbour][target_pos]
                weighted_sum += rating * weight
                weight_total += weight
        if weight_total != 0:
            # With zero total weight the score stays at its initial 0.0.
            predict[book_pos] = round(weighted_sum/weight_total, 2)
    return predict.sort_values(ascending=False)

In [72]:
# User-based predictions for user_005 (displayed as the cell result).
recommend_item_to_user(user_id='user_005')

3    4.0
2    2.0
dtype: float64

In [73]:
# Item-based recommendation: recommend items to a user using item similarity

def recommend_item_to_user_ib(user_id):
    """Item-based recommendation: predict ratings for books the user has not rated.

    For every book the user rated, the two most similar books are collected.
    Each collected book the user has not rated yet is scored with the
    similarity-weighted average of the user's ratings on the rated books
    that listed it as similar.

    Args:
        user_id: label looked up in ``user_id_index_series``.

    Returns:
        A pandas Series indexed by book index, holding predicted ratings
        rounded to 2 decimals and sorted in descending order. A candidate
        whose total similarity weight is zero keeps a score of 0.0.
    """
    user_index = user_id_index_series[user_id]
    user_read_books = np.nonzero(user_book_matrix[user_index])[0]
    # recommend_similar_items expects a book id label, not a matrix index.
    # Passing the integer index only worked through pandas' deprecated
    # positional fallback on a string-indexed Series, so translate
    # index -> label explicitly.
    index_to_book_id = pd.Series(book_id_index_series.index, index=book_id_index_series.values)
    book_set = set()
    book_relation = dict()
    for book in user_read_books:
        relative_books = recommend_similar_items(index_to_book_id[book], 2)
        book_set = book_set.union(relative_books)
        book_relation[book] = relative_books
    # Never recommend something the user has already rated.
    book_set = book_set.difference(user_read_books)
    predict = pd.Series(0.0, index=list(book_set), dtype=float)
    for book in book_set:
        numerator = 0
        denominator = 0
        for similar_book, relative_books in book_relation.items():
            if book in relative_books:
                weight = book_similarity_matrix[book][similar_book]
                numerator += weight * user_book_matrix[user_index][similar_book]
                denominator += weight
        if denominator == 0:
            # All contributing similarities are zero: leave the 0.0 default
            # instead of dividing by zero.
            continue
        predict[book] = round(numerator/denominator, 2)
    return predict.sort_values(ascending=False)

In [74]:
# Item-based predictions for user_001 (displayed as the cell result).
recommend_item_to_user_ib(user_id='user_001')

2    4.47
5    3.00
dtype: float64