In [9]:
import openpyxl
import numpy as np

In [2]:
def build_u2i_matrix(user_item_score_data_path, item_name_data_path, write_file=False):
    #获取item id到电影名的对应关系
    item_id_to_item_name = {}
    with open(item_name_data_path, encoding="ISO-8859-1") as f:
        for line in f:
            item_id, item_name = line.split("|")[:2]
            item_id = int(item_id)
            item_id_to_item_name[item_id] = item_name
    total_movie_count = len(item_id_to_item_name)
    #读打分文件
    user_to_rating = {}
    with open(user_item_score_data_path, encoding="ISO-8859-1") as f:
        for line in f:
            user_id, item_id, score, time_stamp = line.split("\t")
            user_id, item_id, score = int(user_id), int(item_id), int(score)
            if user_id not in user_to_rating:
                user_to_rating[user_id] = [0] * total_movie_count
            user_to_rating[user_id][item_id - 1] = score
    
    if not write_file:
        return user_to_rating, item_id_to_item_name
    
    # 写入excel便于查看
    workbook = openpyxl.Workbook()
    sheet = workbook.create_sheet(index=0)
    #第一行：user_id, movie1, movie2...
    header = ["user_id"] + [item_id_to_item_name[i + 1] for i in range(total_movie_count)]
    sheet.append(header)
    for i in range(len(user_to_rating)):
        #每行：user_id, rate1, rate2...
        line = [i + 1] + user_to_rating[i + 1]
        sheet.append(line)
    workbook.save("user_movie_rating.xlsx")
    return user_to_rating, item_id_to_item_name

In [3]:
#向量余弦距离
def cosine_distance(vector1, vector2):
    ab = vector1.dot(vector2)
    a_norm = np.sqrt(np.sum(np.square(vector1)))
    b_norm = np.sqrt(np.sum(np.square(vector2)))
    return ab/(a_norm * b_norm)


#依照user对item的打分判断user之间的相似度
def find_similar_user(user_to_rating):
    user_to_similar_user = {}
    score_buffer = {}
    for user_a, ratings_a in user_to_rating.items():
        similar_user = []
        for user_b, ratings_b in user_to_rating.items():
            if user_b == user_a or user_b>100 or user_a > 100:
                continue
            #ab用户互换不用重新计算cos
            if "%d_%d"%(user_b, user_a) in score_buffer:
                similarity = score_buffer["%d_%d"%(user_b, user_a)]
            else:
                similarity = cosine_distance(np.array(ratings_a), np.array(ratings_b))
                score_buffer["%d_%d" % (user_a, user_b)] = similarity

            similar_user.append([user_b, similarity])
        similar_user = sorted(similar_user, reverse=True, key=lambda x:x[1])
        user_to_similar_user[user_a] = similar_user
    return user_to_similar_user

In [37]:
def get_user_favorites(user_id, user_to_rating):
    ratings = np.array(user_to_rating[user_id])
    top_ratings = np.argsort(ratings)[::-1]

    return top_ratings

def get_item_similarity(favo_item, item_id):
    favo_item_rating = np.array([rating[favo_item] for rating in user_to_rating.values()])
    item_rating = np.array([rating[item_id] for rating in user_to_rating.values()])

    return cosine_distance(favo_item_rating, item_rating)

def get_item_rating_by_user(user_id, item_id):
    return user_to_rating[user_id][item_id]

In [4]:
def movie_recommand(user_id, similar_user, user_to_rating, item_to_name, topn=10):
    unseen_items = [item_id for item_id, rating in enumerate(user_to_rating[user_id]) if rating == 0]
    res = []
    for item_id in unseen_items:
        score = user_cf(user_id, item_id, similar_user, user_to_rating)
        res.append([item_to_name[item_id + 1], score])
    res = sorted(res, key=lambda x:x[1], reverse=True)
    return res[:topn]

In [5]:
user_item_score_data_path = "ml-100k/u.data"
item_name_data_path = "ml-100k/u.item"
user_to_rating, item_to_name = build_u2i_matrix(user_item_score_data_path, item_name_data_path, False)

In [7]:
print(user_to_rating[196])

[0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 5, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 1, 4, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [29]:
def get_user_favorites(user_id, user_to_rating, topn):
    ratings_by_users = np.array([rating for rating in user_to_rating[user_id]])
    top_ratings = np.argsort(ratings_by_users)[::-1]
    top_scores = np.zeros(len(ratings_by_users), dtype=np.float32)

    for index in top_ratings[:topn]:
        top_scores[index] = ratings_by_users[index]
    return top_scores

In [16]:
a = np.array([5, 3, 1, 4, 2])
np.argsort(a)[::-1]

array([0, 3, 1, 4, 2])

In [30]:
def get_item_rating_by_user(user_id, item_id):
    return user_to_rating[user_id][item_id]

In [44]:
def item_cf(user_id, item_id, topn):
    favorite_items = get_user_favorites(user_id, user_to_rating)  #TODO  获取这个用户喜欢的前n个电影
    favorite_item = {}
    
    for favo_item in favorite_items:
        score = 0
        sim = get_item_similarity(favo_item, item_id) #TODO  对于两个电影，计算相似度
        score += sim * get_item_rating_by_user(user_id, item_id)  #TODO  获取已知喜欢的电影得分
        favorite_item[favo_item] = score
    favorite_item = sorted(favorite_item.items(), key=lambda x:x[1], reverse=True)[:topn]
    for key, val in favorite_item:
        print(item_id_to_item_name[key], val)
    # print(favorite_item)
    
item_cf(1, 1, 10)

NameError: name 'item_id_to_item_name' is not defined