In [1]:
# A dictionary of movie critics and their ratings of a small set of movies
critics = {
    'Lisa Rose': {
        'Lady in the Water': 2.5,
        'Snakes on a Plane': 3.5,
        'Just My Luck': 3.0,
        'Superman Returns': 3.5,
        'You, Me and Dupree': 2.5,
        'The Night Listener': 3.0,
    },
    'Gene Seymour': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 3.5,
        'Just My Luck': 1.5,
        'Superman Returns': 5.0,
        'The Night Listener': 3.0,
        'You, Me and Dupree': 3.5,
    },
    'Michael Phillips': {
        'Lady in the Water': 2.5,
        'Snakes on a Plane': 3.0,
        'Superman Returns': 3.5,
        'The Night Listener': 4.0,
    },
    'Claudia Puig': {
        'Snakes on a Plane': 3.5,
        'Just My Luck': 3.0,
        'The Night Listener': 4.5,
        'Superman Returns': 4.0,
        'You, Me and Dupree': 2.5,
    },
    'Mick LaSalle': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 4.0,
        'Just My Luck': 2.0,
        'Superman Returns': 3.0,
        'The Night Listener': 3.0,
        'You, Me and Dupree': 2.0,
    },
    'Jack Matthews': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 4.0,
        'The Night Listener': 3.0,
        'Superman Returns': 5.0,
        'You, Me and Dupree': 3.5,
    },
    'Toby': {
        'Snakes on a Plane': 4.5, 
        'You, Me and Dupree': 1.0,
        'Superman Returns': 4.0},
}

In [3]:
critics['Lisa Rose']['Lady in the Water']

2.5

## Euclidean Distance Score(欧几里得距离评价)

#### Compute the distance score of Toby and Lasalle

In [4]:
from math import sqrt
sqrt(pow(4.5-4,2))+pow(1-2,2)   

1.5

In [12]:
def sim_distance(prefs, p1, p2):
    '''
    Returns a distance-based similarity score for person1 and person2.
    '''

    # Get the list of shared_items
    si = {}
    for item in prefs[p1]:
        if item in prefs[p2]:
            si[item] = 1
    # If they have no ratings in common, return 0
    if len(si) == 0:
        return 0
    # Add up the squares of all the differences
    sum_of_squares = sum([pow(prefs[p1][item] - prefs[p2][item], 2) for item in si])  ##Noting:it's different from the code in the book
    return 1 / (1 + sqrt(sum_of_squares)) #To avoid divide-by-0 error, add 1 here 

In [13]:
sim_distance(critics,'Lisa Rose','Gene Seymour') #range between 0-1

0.29429805508554946

## Pearson Correlation Score(皮尔逊相关度评价)

**该相关系数是判断两组数据与某一直线拟合程度的一种度量，它在数据不是很规范时会给出更好的结果**


**它的作用是修正了“夸大分值”的情况，虽然Jack Matthews总是比Lisa Rose给出的分值高，但他们的品味其实是类似的**

In [14]:
def sim_pearson(prefs, p1, p2):
    '''
    Returns the Pearson correlation coefficient for p1 and p2.
    '''

    # Get the list of mutually rated items（返回p1和p2的皮尔逊相关系数）
    si = {}
    for item in prefs[p1]:
        if item in prefs[p2]:
            si[item] = 1
    # If they are no ratings in common, return 0
    if len(si) == 0:
        return 0
    # Sum calculations
    n = len(si)
    # Sums of all the preferences
    sum1 = sum([prefs[p1][it] for it in si])
    sum2 = sum([prefs[p2][it] for it in si])
    # Sums of the squares
    sum1Sq = sum([pow(prefs[p1][it], 2) for it in si])
    sum2Sq = sum([pow(prefs[p2][it], 2) for it in si])
    # Sum of the products
    pSum = sum([prefs[p1][it] * prefs[p2][it] for it in si])
    # Calculate r (Pearson score)
    num = pSum - sum1 * sum2 / n
    den = sqrt((sum1Sq - pow(sum1, 2) / n) * (sum2Sq - pow(sum2, 2) / n))
    if den == 0:
        return 0
    r = num / den
    return r

In [16]:
sim_pearson(critics,'Lisa Rose','Gene Seymour')

0.39605901719066977

## Ranking the Critics（为评论者打分）

In [18]:
def topMatches(prefs,person,n=5,similarity=sim_pearson):
    '''
    Returns the best matches for person from the prefs dictionary. (运用皮尔逊相关度，返回和你相似度最高的人)
    Number of results and similarity function are optional params.
    '''

    scores = [(similarity(prefs, person, other), other) for other in prefs
              if other != person]
    scores.sort() #从大到小进行排序
    scores.reverse() #翻转，从小到大进行排序
    return scores[0:n]

In [19]:
topMatches(critics,'Toby',n=3) #与Toby相似度最高的三个人

[(0.9912407071619299, 'Lisa Rose'),
 (0.9244734516419049, 'Mick LaSalle'),
 (0.8934051474415647, 'Claudia Puig')]

## Recommending Items(推荐物品)

**基本思路：找到与自己匹配度最高的几个人，将他们打过分的物品按照权重得到各自的单项分数（即匹配度高的人其权重也高，具体算法是相似度\*匹配值），然后进行排序**

In [25]:
def getRecommendations(prefs, person, similarity=sim_pearson):
    '''
    Gets recommendations for a person by using a weighted average
    of every other user's rankings
    '''

    totals = {}
    simSums = {}
    for other in prefs:
    # Don't compare me to myself
        if other == person:
            continue
        sim = similarity(prefs, person, other)
        # Ignore scores of zero or lower
        if sim <= 0:
            continue
        for item in prefs[other]:
            # Only score movies I haven't seen yet
            if item not in prefs[person] or prefs[person][item] == 0:
                # Similarity * Score
                totals.setdefault(item, 0)
                # The final score is calculated by multiplying each item by the
                #   similarity and adding these products together
                totals[item] += prefs[other][item] * sim
                # Sum of similarities
                simSums.setdefault(item, 0)
                simSums[item] += sim
    # Create the normalized list
    rankings = [(total / simSums[item], item) for (item, total) in
                totals.items()]        ##.items()以列表形式返回(键，值)
    # Return the sorted list
    rankings.sort()
    rankings.reverse()
    return rankings


In [26]:
getRecommendations(critics,'Toby') #Pearson Correlation

[(3.3477895267131013, 'The Night Listener'),
 (2.8325499182641614, 'Lady in the Water'),
 (2.530980703765565, 'Just My Luck')]

In [27]:
getRecommendations(critics,'Toby',similarity=sim_distance) #Euclidean Distance

[(3.457128694491423, 'The Night Listener'),
 (2.7785840038149234, 'Lady in the Water'),
 (2.422482042361917, 'Just My Luck')]

## Matching Products(匹配商品)

**和前面不同，现在我们要为一个商品给出top3相似度的商品，因此我们需要如下类型的数据:<br>'Lisa Rose': {'Lady in the Water': 2.5}<br>'Gene Seymour': {'Lady in the Water': 3.0}<br>变为<br>'Lady in the Water':{'Lisa Rose':2.5,'Gene Semour': 3.5}<br>之后我们就可以复用以前的topMatches()了**

In [28]:
def transformPrefs(prefs):
    '''
    Transform the recommendations into a mapping where persons are described
    with interest scores for a given title e.g. {title: person} instead of
    {person: title}.
    '''

    result = {}
    for person in prefs:
        for item in prefs[person]:
            result.setdefault(item, {})
            # Flip item and person
            result[item][person] = prefs[person][item]
    return result

In [29]:
movies = transformPrefs(critics)

In [30]:
print(movies)

{'Superman Returns': {'Claudia Puig': 4.0, 'Gene Seymour': 5.0, 'Lisa Rose': 3.5, 'Jack Matthews': 5.0, 'Mick LaSalle': 3.0, 'Toby': 4.0, 'Michael Phillips': 3.5}, 'You, Me and Dupree': {'Claudia Puig': 2.5, 'Gene Seymour': 3.5, 'Lisa Rose': 2.5, 'Jack Matthews': 3.5, 'Mick LaSalle': 2.0, 'Toby': 1.0}, 'Lady in the Water': {'Michael Phillips': 2.5, 'Gene Seymour': 3.0, 'Lisa Rose': 2.5, 'Jack Matthews': 3.0, 'Mick LaSalle': 3.0}, 'Snakes on a Plane': {'Claudia Puig': 3.5, 'Gene Seymour': 3.5, 'Lisa Rose': 3.5, 'Jack Matthews': 4.0, 'Mick LaSalle': 4.0, 'Toby': 4.5, 'Michael Phillips': 3.0}, 'Just My Luck': {'Claudia Puig': 3.0, 'Gene Seymour': 1.5, 'Lisa Rose': 3.0, 'Mick LaSalle': 2.0}, 'The Night Listener': {'Claudia Puig': 4.5, 'Gene Seymour': 3.0, 'Lisa Rose': 3.0, 'Jack Matthews': 3.0, 'Mick LaSalle': 3.0, 'Michael Phillips': 4.0}}


In [31]:
topMatches(movies,'Superman Returns') #负值代表不喜欢

[(0.6579516949597695, 'You, Me and Dupree'),
 (0.4879500364742689, 'Lady in the Water'),
 (0.11180339887498941, 'Snakes on a Plane'),
 (-0.1798471947990544, 'The Night Listener'),
 (-0.42289003161103106, 'Just My Luck')]

### 寻找这部影片的潜在爱好者

In [32]:
getRecommendations(movies,'Just My Luck')

[(4.0, 'Michael Phillips'), (3.0, 'Jack Matthews')]

## Building a del.icio.us Link Recommender(构建一个基于del.icio.us的推荐系统)