In [1]:

# Sample data set: critic name -> {movie title: rating}.
# Not every critic has rated every movie (e.g. 'Toby' has only three ratings).
critics = {
    'Lisa Rose': {
        'Lady in the Water': 2.5,
        'Snakes on a Plane': 3.5,
        'Just My Luck': 3.0,
        'Superman Returns': 3.5,
        'You, Me and Dupree': 2.5,
        'The Night Listener': 3.0,
    },
    'Gene Seymour': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 3.5,
        'Just My Luck': 1.5,
        'Superman Returns': 5.0,
        'The Night Listener': 3.0,
        'You, Me and Dupree': 3.5,
    },
    'Michael Phillips': {
        'Lady in the Water': 2.5,
        'Snakes on a Plane': 3.0,
        'Superman Returns': 3.5,
        'The Night Listener': 4.0,
    },
    'Claudia Puig': {
        'Snakes on a Plane': 3.5,
        'Just My Luck': 3.0,
        'The Night Listener': 4.5,
        'Superman Returns': 4.0,
        'You, Me and Dupree': 2.5,
    },
    'Mick LaSalle': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 4.0,
        'Just My Luck': 2.0,
        'Superman Returns': 3.0,
        'The Night Listener': 3.0,
        'You, Me and Dupree': 2.0,
    },
    'Jack Matthews': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 4.0,
        'The Night Listener': 3.0,
        'Superman Returns': 5.0,
        'You, Me and Dupree': 3.5,
    },
    'Toby': {
        'Snakes on a Plane': 4.5,
        'You, Me and Dupree': 1.0,
        'Superman Returns': 4.0,
    },
}

# Finding similar users

Q:用什么算法衡量两个用户的相似性？
A:欧几里得距离评价 或 皮尔逊相关度评价，当然还有其他算法，如Jaccard系数或曼哈顿距离算法

Q:皮尔逊相关度的优点
A:在数据不是很规范时（比如影评者对影片的评价总是高于平均水平）也能给出较好的结果

In [2]:
'''
Euclidean Distance Score:欧几里得距离评价
'''
from math import sqrt
def sim_distance(critics_data,user1,user2):
    """Euclidean-distance similarity between two users' ratings.

    Only items rated by both users contribute. The Euclidean distance ``d``
    over those shared ratings is mapped to ``1 / (1 + d)``, so identical
    ratings give 1 and larger disagreement gives values closer to 0.

    Args:
        critics_data: mapping of user -> {item: rating}.
        user1: key of the first user in ``critics_data``.
        user2: key of the second user in ``critics_data``.

    Returns:
        0 when the users share no rated item, otherwise ``1 / (1 + d)``.
    """
    # Walk user1's items in their stored order and keep those user2 also rated.
    common = [title for title in critics_data[user1] if title in critics_data[user2]]

    if not common:
        return 0

    squared_gap = sum(
        (critics_data[user1][title] - critics_data[user2][title]) ** 2
        for title in common
    )
    return 1 / (1 + sqrt(squared_gap))

# Demo: Euclidean-distance similarity between two critics from the sample data.
sim_distance(critics,'Lisa Rose','Toby')

0.3483314773547883

In [3]:
from math import sqrt
def sim_all_distance(critics_data,user1,user2):
    """Euclidean-distance similarity computed over a set intersection.

    The items rated by both users are found by intersecting the two users'
    key sets. The Euclidean distance ``d`` over those shared ratings is
    mapped to ``1 / (1 + d)``.

    Args:
        critics_data: mapping of user -> {item: rating}.
        user1: key of the first user in ``critics_data``.
        user2: key of the second user in ``critics_data``.

    Returns:
        0 when the users share no rated item, otherwise ``1 / (1 + d)``.
    """
    ratings_a = critics_data[user1]
    ratings_b = critics_data[user2]

    # Items both users have rated.
    common_titles = set(ratings_a.keys()) & set(ratings_b.keys())
    if not common_titles:
        return 0

    squared_gap = sum(
        (ratings_a[title] - ratings_b[title]) ** 2 for title in common_titles
    )
    return 1 / (1 + sqrt(squared_gap))
    
# Demo: the set-based variant on the same pair of critics used for sim_distance.
sim_all_distance(critics,'Lisa Rose','Toby')

0.3483314773547883

In [4]:
'''
Pearson Correlation Score - 皮尔逊相关度评价

该函数返回一个介于-1和1之间的数值：1表示两人的评分完全正相关（偏好高度一致），0表示两人的评分没有线性相关性，-1表示完全负相关（偏好正好相反）。
'''

def sim_pearson(critics_data,user1,user2):
    """Pearson correlation between two users' ratings.

    Only items rated by both users are used. A result near 1 means the two
    users' ratings rise and fall together, near 0 means no linear
    relationship, and near -1 means they rate in opposite directions.

    Args:
        critics_data: mapping of user -> {item: rating}.
        user1: key of the first user in ``critics_data``.
        user2: key of the second user in ``critics_data``.

    Returns:
        The correlation coefficient, or 0 when the users share no rated
        item or when either user's shared ratings have no spread (the
        correlation is undefined in that case).
    """
    user1_item = set(critics_data[user1].keys())
    user2_item = set(critics_data[user2].keys())

    shared_item = user1_item & user2_item
    shared_count = len(shared_item)

    if shared_count == 0:
        return 0

    sum_user1 = sum([critics_data[user1][item] for item in shared_item])
    sum_user2 = sum([critics_data[user2][item] for item in shared_item])

    sum_user1_square = sum([pow(critics_data[user1][item],2) for item in shared_item])
    sum_user2_square = sum([pow(critics_data[user2][item],2) for item in shared_item])

    sum_of_products = sum([critics_data[user1][item] * critics_data[user2][item] for item in shared_item])

    num = sum_of_products - (sum_user1*sum_user2/shared_count)

    # Each term is n * variance and is mathematically >= 0, but this
    # single-pass formula can round to a small negative number when a user's
    # shared ratings are (nearly) constant. Check the terms individually:
    # one negative term would make sqrt() raise ValueError, and two negative
    # terms would multiply into a positive, meaningless denominator.
    spread_user1 = sum_user1_square - pow(sum_user1,2)/shared_count
    spread_user2 = sum_user2_square - pow(sum_user2,2)/shared_count
    if spread_user1 <= 0 or spread_user2 <= 0:
        return 0

    den = sqrt(spread_user1 * spread_user2)
    # The product of two tiny positive spreads can still underflow to zero.
    if den == 0:
        return 0
    return num/den

# Demo: Pearson correlation between two critics who rated the same six movies.
sim_pearson(critics,'Lisa Rose','Gene Seymour')

0.39605901719066977

In [6]:
'''
获取指定用户的推荐者列表
'''
def top_matches(critics_data,user1,top_n,core=sim_pearson):
    """Return the users most similar to ``user1``, best match first.

    Every other user in ``critics_data`` is scored against ``user1`` with
    the ``core`` similarity function, and the highest-scoring ``top_n`` are
    returned.

    Args:
        critics_data: mapping of user -> {item: rating}.
        user1: the user to find matches for; excluded from the results.
        top_n: maximum number of matches to return.
        core: similarity function called as ``core(critics_data, a, b)``;
            a higher return value means more similar.

    Returns:
        A list of ``(score, user)`` tuples sorted by descending score.
        Users with equal scores keep the order in which they appear in
        ``critics_data`` (the sort is stable).
    """
    scores = [
        (core(critics_data, user1, user_other), user_other)
        for user_other in critics_data.keys()
        if user_other != user1
    ]
    # Sort on the score only so that tied users are never compared by name.
    return sorted(scores, key=lambda pair: pair[0], reverse=True)[:top_n]
    
# Demo: Lisa Rose's three closest critics under each similarity metric.
# The two metrics produce different rankings for the same data.
print(top_matches(critics,'Lisa Rose',3,sim_pearson))
print()
print(top_matches(critics,'Lisa Rose',3,sim_distance))

[(0.9912407071619299, 'Toby'), (0.7470178808339965, 'Jack Matthews'), (0.5940885257860044, 'Mick LaSalle')]

[(0.4721359549995794, 'Michael Phillips'), (0.4142135623730951, 'Mick LaSalle'), (0.38742588672279304, 'Claudia Puig')]
