In [None]:
'''
推荐引擎，需要量化相似度：
    欧式距离
    皮尔逊系数
'''

In [10]:
# 欧氏距离
import json
import numpy as np

def euclidean_score(dataset, user1, user2):
    if user1 not in dataset:
        raise TypeError('User '+user1+' not in dataset')
    if user2 not in dataset:
        raise TypeErrorr('User '+user2+' not in dataset')
    
    # 为了计算分数，需要提取两个用户均评估过分的电影
    rated_by_both = {}
    for item in dataset[user1]:
        for item in dataset[user2]:
            rated_by_both[item] = 1
    
    # 如果users之间没有共同评过的电影，说明没有相似度
    if not rated_by_both:
        return 0
    
    # 对于共同评分，计算平方和的平方根，并将该值归一化
    squared_differences = []
    for item in dataset[user1]:
        for item in dataset[user2]:
            squared_differences.append(np.square(dataset[user1][item] - dataset[user2][item]))
    return 1 / (1 + np.sqrt(np.sum(squared_differences)))


In [13]:
# 皮尔逊相关系数
def pearson_score(dataset, user1, user2):
    if user1 not in dataset:
        raise TypeError('User '+user1+' not in dataset')
    if user2 not in dataset:
        raise TypeErrorr('User '+user2+' not in dataset')
    
    # 为了计算分数，需要提取两个用户均评估过分的电影
    rated_by_both = {}
    for item in dataset[user1]:
        for item in dataset[user2]:
            rated_by_both[item] = 1
            
    num_ratings = len(rated_by_both)
    if not num_ratings :
        num_ratings == 0
        
    # 如果users之间没有共同评过的电影，说明没有相似度
    if not rated_by_both:
        return 0
    
    # 计算相同评分电影的平方值之和
    user1_sum = np.sum([dataset[user1][item] for item in rated_by_both])
    user2_sum = np.sum([dataset[user2][item] for item in rated_by_both])
    
    # 计算所有相同评分电影的评分的平方和
    user1_squared_sum = np.sum([np.square(dataset[user1][item]) for item in rated_by_both])
    user2_squared_sum = np.sum([np.square(dataset[user2][item]) for item in rated_by_both])
    
    # 计算数据集的乘积之和
    product_sum = np.sum([dataset[user1][item] * dataset[user2][item] for item in rated_by_both])
    
    # 计算皮尔逊相关度
    Sxy = product_sum - (user1_sum * user2_sum / num_ratings)
    Sxx = user1_squared_sum - np.square(user1_sum) / num_ratings
    Syy = user2_squared_sum - np.square(user2_sum) / num_ratings
    
    if Sxx * Syy == 0:
        return 0
    
    return Sxy / np.sqrt(Sxx * Syy)


In [25]:
# load datasets
input_file = 'movie_ratings.json'
with open(input_file,'r') as f:
    data = json.loads(f.read())

print(type(data))
i = 0
for key,val in data.items():
    if i < 5:
        print("No.",i,' : ',key, ' -> ',val)
        i += 1
           
user1 = 'John Carson'
user2 = 'Michelle Peterson'

print('\nEuclidean score: ',euclidean_score(data,user1,user2))
print('\nPearson score: ',pearson_score(data, user1, user2))

<class 'dict'>
No. 0  :  John Carson  ->  {'Inception': 2.5, 'Pulp Fiction': 3.5, 'Anger Management': 3.0, 'Fracture': 3.5, 'Serendipity': 2.5, 'Jerry Maguire': 3.0}
No. 1  :  Michelle Peterson  ->  {'Inception': 3.0, 'Pulp Fiction': 3.5, 'Anger Management': 1.5, 'Fracture': 5.0, 'Jerry Maguire': 3.0, 'Serendipity': 3.5}
No. 2  :  William Reynolds  ->  {'Inception': 2.5, 'Pulp Fiction': 3.0, 'Fracture': 3.5, 'Jerry Maguire': 4.0}
No. 3  :  Jillian Hobart  ->  {'Pulp Fiction': 3.5, 'Anger Management': 3.0, 'Jerry Maguire': 4.5, 'Fracture': 4.0, 'Serendipity': 2.5}
No. 4  :  Melissa Jones  ->  {'Inception': 3.0, 'Pulp Fiction': 4.0, 'Anger Management': 2.0, 'Fracture': 3.0, 'Jerry Maguire': 3.0, 'Serendipity': 2.0}

Euclidean score:  0.14548268842493628

Pearson score:  0.39605901719066977


In [23]:
# 构建推荐引擎中，一个非常重要的任务就是寻找相似的用户，
# 即为某位用户生成的推荐信息可以同时推荐给与其相似的用户
def find_similar_users(dataset, user, num_users):
    if user not in dataset:
        raise TypeError("User " + user + "not in dataset")
    
    # 计算所有user的Pearson相关度
    scores = np.array([[x, pearson_score(dataset, user, x)] for x in dataset if user != x])
    print("Scores :", scores[:5])
    # sorted
    scores_sorted = np.argsort(scores[:, 1])
    scored_sorted_dec = scores_sorted[::-1]
    print("\nScores_sorted: ",scores_sorted[:5])
    print("\nScored_sorted_dec: ",scored_sorted_dec[:5])
    
    top_k = scored_sorted_dec[:num_users]
    print("\nTop_k :", top_k)
    return scores[top_k]

data_file = 'movie_ratings.json'
with open(data_file,'r') as f:
    data = json.loads(f.read())
    
user = 'John Carson'
print("\nUsers similar to "+user+':\n')
similar_users = find_similar_users(data, user, 3)
print("User\t\tSimilarity score\n")
for item in similar_users:
    print(item[0],'\t\t',round(float(item[1]),2))


Users similar to John Carson:

Scores : [['Michelle Peterson' '0.39605901719066977']
 ['William Reynolds' '0.40451991747794525']
 ['Jillian Hobart' '0.5669467095138396']
 ['Melissa Jones' '0.5940885257860044']
 ['Alex Roberts' '0.7470178808339965']]

Scores_sorted:  [0 1 2 3 4]

Scored_sorted_dec:  [5 4 3 2 1]

Top_k : [5 4 3]
User		Similarity score

Michael Henry 		 0.99
Alex Roberts 		 0.75
Melissa Jones 		 0.59


In [24]:
# 生成电影推荐
def generate_recommendations(dataset, user):
    if user not in dataset:
        raise TypeError(user + "not in dataset")
    
    total_scores = {}
    similarity_sums = {}
    
    for u in dataset:
        if u == user:
            continue
        else:
            similarity_score = pearson_score(dataset, user, u)
        
        if similarity_score <=0 :
            continue
    
        for item in [x for x in dataset[u] if x not in dataset[user] or dataset[user][x] == 0]:
            total_scores.update({item:dataset[u][item] * similarity_score})
            similarity_sums.update({item: similarity_score})
    
    if not total_scores:
        return ["No recommendations possible"]
    
    # 生成一个电影评分标准化列表
    movie_ranks = np.array([[total/similarity_sums[item], item] for item, total in total_scores.items()])
    # 根据第一列对Pearson相关系数进行降序排列
    movie_ranks = movie_ranks[np.argsort(movie_ranks[:, 0])[::-1]]
    
    # recommendations
    recommendations = [movie for _, movie in movie_ranks]
    return recommedations

data_file = 'movie_ratings.json'
with open(data_file,'r') as f:
    data = json.loads(f.read())
    
users = ['Michael Henry', 'John Carson']
for user in users:
    print("\nRecommendations for "+user+":")
    movies = generate_recommendations(data, user)
    for i, movie in enumerate(movies):
        print(str(i+1) + '. '+ movie)




Recommendations for Michael Henry:


KeyError: 'Inception'