In [82]:
# A dictionary of movie critics and their ratings of a small set of movies
critics = {
    'Lisa Rose': {
        'Lady in the Water': 2.5,
        'Snakes on a Plane': 3.5,
        'Just My Luck': 3.0,
        'Superman Returns': 3.5,
        'You, Me and Dupree': 2.5,
        'The Night Listener': 3.0,
    },
    'Gene Seymour': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 3.5,
        'Just My Luck': 1.5,
        'Superman Returns': 5.0,
        'The Night Listener': 3.0,
        'You, Me and Dupree': 3.5,
    },
    'Michael Phillips': {
        'Lady in the Water': 2.5,
        'Snakes on a Plane': 3.0,
        'Superman Returns': 3.5,
        'The Night Listener': 4.0,
    },
    'Claudia Puig': {
        'Snakes on a Plane': 3.5,
        'Just My Luck': 3.0,
        'The Night Listener': 4.5,
        'Superman Returns': 4.0,
        'You, Me and Dupree': 2.5,
    },
    'Mick LaSalle': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 4.0,
        'Just My Luck': 2.0,
        'Superman Returns': 3.0,
        'The Night Listener': 3.0,
        'You, Me and Dupree': 2.0,
    },
    'Jack Matthews': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 4.0,
        'The Night Listener': 3.0,
        'Superman Returns': 5.0,
        'You, Me and Dupree': 3.5,
    },
    'Toby': {
        'Snakes on a Plane': 4.5, 
        'You, Me and Dupree': 1.0,
        'Superman Returns': 4.0},
}

In [83]:
critics['Lisa Rose']['Lady in the Water']

2.5

### Euclidean Distance Score(欧几里得距离评价)

#### Compute the distance score of Toby and Lasalle

In [84]:
from math import sqrt
sqrt(pow(4.5-4,2))+pow(1-2,2)   

1.5

In [85]:
def sim_distance(prefs, p1, p2):
    '''
    Returns a distance-based similarity score for person1 and person2.
    '''

    # Get the list of shared_items
    si = {}
    for item in prefs[p1]:
        if item in prefs[p2]:
            si[item] = 1
    # If they have no ratings in common, return 0
    if len(si) == 0:
        return 0
    # Add up the squares of all the differences
    sum_of_squares = sum([pow(prefs[p1][item] - prefs[p2][item], 2) for item in si])  ##Noting:it's different from the code in the book
    return 1 / (1 + sqrt(sum_of_squares)) #To avoid divide-by-0 error, add 1 here 

In [86]:
sim_distance(critics,'Lisa Rose','Gene Seymour') #range between 0-1

0.29429805508554946

### Pearson Correlation Score(皮尔逊相关度评价)

**该相关系数是判断两组数据与某一直线拟合程度的一种度量，它在数据不是很规范时会给出更好的结果**


**它的作用是修正了“夸大分值”的情况，虽然Jack Matthews总是比Lisa Rose给出的分值高，但他们的品味其实是类似的**

In [87]:
def sim_pearson(prefs, p1, p2):
    '''
    Returns the Pearson correlation coefficient for p1 and p2.
    '''

    # Get the list of mutually rated items（返回p1和p2的皮尔逊相关系数）
    si = {}
    for item in prefs[p1]:
        if item in prefs[p2]:
            si[item] = 1
    # If they are no ratings in common, return 0
    if len(si) == 0:
        return 0
    # Sum calculations
    n = len(si)
    # Sums of all the preferences
    sum1 = sum([prefs[p1][it] for it in si])
    sum2 = sum([prefs[p2][it] for it in si])
    # Sums of the squares
    sum1Sq = sum([pow(prefs[p1][it], 2) for it in si])
    sum2Sq = sum([pow(prefs[p2][it], 2) for it in si])
    # Sum of the products
    pSum = sum([prefs[p1][it] * prefs[p2][it] for it in si])
    # Calculate r (Pearson score)
    num = pSum - sum1 * sum2 / n
    den = sqrt((sum1Sq - pow(sum1, 2) / n) * (sum2Sq - pow(sum2, 2) / n))
    if den == 0:
        return 0
    r = num / den
    return r

In [88]:
sim_pearson(critics,'Lisa Rose','Gene Seymour')

0.39605901719066977

### Ranking the Critics（为评论者打分）

In [89]:
def topMatches(prefs,person,n=5,similarity=sim_pearson):
    '''
    Returns the best matches for person from the prefs dictionary. (运用皮尔逊相关度，返回和你相似度最高的人)
    Number of results and similarity function are optional params.
    '''

    scores = [(similarity(prefs, person, other), other) for other in prefs
              if other != person]
    scores.sort() #从大到小进行排序
    scores.reverse() #翻转，从小到大进行排序
    return scores[0:n]

In [90]:
topMatches(critics,'Toby',n=3) #与Toby相似度最高的三个人

[(0.9912407071619299, 'Lisa Rose'),
 (0.9244734516419049, 'Mick LaSalle'),
 (0.8934051474415647, 'Claudia Puig')]

### Recommending Items(推荐物品)

**基本思路：找到与自己匹配度最高的几个人，将他们打过分的物品按照权重得到各自的单项分数（即匹配度高的人其权重也高，具体算法是相似度\*匹配值），然后进行排序**

In [91]:
def getRecommendations(prefs, person, similarity=sim_pearson):
    '''
    Gets recommendations for a person by using a weighted average
    of every other user's rankings
    '''

    totals = {}
    simSums = {}
    for other in prefs:
    # Don't compare me to myself
        if other == person:
            continue
        sim = similarity(prefs, person, other)
        # Ignore scores of zero or lower
        if sim <= 0:
            continue
        for item in prefs[other]:
            # Only score movies I haven't seen yet
            if item not in prefs[person] or prefs[person][item] == 0:
                # Similarity * Score
                totals.setdefault(item, 0)
                # The final score is calculated by multiplying each item by the
                #   similarity and adding these products together
                totals[item] += prefs[other][item] * sim
                # Sum of similarities
                simSums.setdefault(item, 0)
                simSums[item] += sim
    # Create the normalized list
    rankings = [(total / simSums[item], item) for (item, total) in
                totals.items()]        ##.items()以列表形式返回(键，值)
    # Return the sorted list
    rankings.sort()
    rankings.reverse()
    return rankings


In [92]:
getRecommendations(critics,'Toby') #Pearson Correlation

[(3.3477895267131017, 'The Night Listener'),
 (2.8325499182641614, 'Lady in the Water'),
 (2.530980703765565, 'Just My Luck')]

In [93]:
getRecommendations(critics,'Toby',similarity=sim_distance) #Euclidean Distance

[(3.457128694491423, 'The Night Listener'),
 (2.778584003814924, 'Lady in the Water'),
 (2.422482042361917, 'Just My Luck')]

## Matching Products(匹配商品)

**和前面不同，现在我们要为一个商品给出top3相似度的商品，因此我们需要如下类型的数据:<br>'Lisa Rose': {'Lady in the Water': 2.5}<br>'Gene Seymour': {'Lady in the Water': 3.0}<br>变为<br>'Lady in the Water':{'Lisa Rose':2.5,'Gene Semour': 3.5}<br>之后我们就可以复用以前的topMatches()了**

In [94]:
def transformPrefs(prefs):
    '''
    Transform the recommendations into a mapping where persons are described
    with interest scores for a given title e.g. {title: person} instead of
    {person: title}.
    '''

    result = {}
    for person in prefs:
        for item in prefs[person]:
            result.setdefault(item, {})
            # Flip item and person
            result[item][person] = prefs[person][item]
    return result

In [95]:
movies = transformPrefs(critics)

In [96]:
print(movies)

{'Lady in the Water': {'Lisa Rose': 2.5, 'Gene Seymour': 3.0, 'Michael Phillips': 2.5, 'Mick LaSalle': 3.0, 'Jack Matthews': 3.0}, 'Snakes on a Plane': {'Lisa Rose': 3.5, 'Gene Seymour': 3.5, 'Michael Phillips': 3.0, 'Claudia Puig': 3.5, 'Mick LaSalle': 4.0, 'Jack Matthews': 4.0, 'Toby': 4.5}, 'Just My Luck': {'Lisa Rose': 3.0, 'Gene Seymour': 1.5, 'Claudia Puig': 3.0, 'Mick LaSalle': 2.0}, 'Superman Returns': {'Lisa Rose': 3.5, 'Gene Seymour': 5.0, 'Michael Phillips': 3.5, 'Claudia Puig': 4.0, 'Mick LaSalle': 3.0, 'Jack Matthews': 5.0, 'Toby': 4.0}, 'You, Me and Dupree': {'Lisa Rose': 2.5, 'Gene Seymour': 3.5, 'Claudia Puig': 2.5, 'Mick LaSalle': 2.0, 'Jack Matthews': 3.5, 'Toby': 1.0}, 'The Night Listener': {'Lisa Rose': 3.0, 'Gene Seymour': 3.0, 'Michael Phillips': 4.0, 'Claudia Puig': 4.5, 'Mick LaSalle': 3.0, 'Jack Matthews': 3.0}}


In [97]:
topMatches(movies,'Superman Returns') #负值代表不喜欢

[(0.6579516949597695, 'You, Me and Dupree'),
 (0.4879500364742689, 'Lady in the Water'),
 (0.11180339887498941, 'Snakes on a Plane'),
 (-0.1798471947990544, 'The Night Listener'),
 (-0.42289003161103106, 'Just My Luck')]

### 寻找这部影片的潜在爱好者

In [98]:
getRecommendations(movies,'Just My Luck')

[(4.0, 'Michael Phillips'), (3.0, 'Jack Matthews')]

## Item-based Filtering(基于物品的过滤)

### Building the Item Comparsion Dataset(构造物品比较数据集)

In [99]:
def calculateSimilarItems(prefs, n=10):
    '''
    Create a dictionary of items showing which other items they are
    most similar to.
    '''

    result = {}
    # Invert the preference matrix to be item-centric
    itemPrefs = transformPrefs(prefs)
    c = 0
    for item in itemPrefs:
        # Status updates for large datasets
        c += 1
        if c % 100 == 0:
            print('%d / %d' % (c, len(itemPrefs)))
        # Find the most similar items to this one
        scores = topMatches(itemPrefs, item, n=n, similarity=sim_distance)
        result[item] = scores
    return result

In [100]:
itemsim = calculateSimilarItems(critics)

###  Getting Recommendations(获得推荐)

In [101]:
def getRecommendedItems(prefs, itemMatch, user):
    userRatings = prefs[user]
    scores = {}
    totalSim = {}
    # Loop over items rated by this user(循环遍历由当前用户评分的物品)
    for (item, rating) in userRatings.items():
        # Loop over items similar to this one（循环遍历与当前物品相近的物品）
        for (similarity, item2) in itemMatch[item]:
            # Ignore if this user has already rated this item
            if item2 in userRatings:
                continue
            # Weighted sum of rating times similarity
            scores.setdefault(item2, 0)
            scores[item2] += similarity * rating
            # Sum of all the similarities
            totalSim.setdefault(item2, 0)
            totalSim[item2] += similarity
    # Divide each total score by total weighting to get an average
    rankings = [(score / totalSim[item], item) for (item, score) in
                scores.items()]
    # Return the rankings from highest to lowest
    rankings.sort()
    rankings.reverse()
    return rankings

In [102]:
getRecommendedItems(critics,itemsim,'Toby')

[(3.1667425234070894, 'The Night Listener'),
 (2.9366294028444346, 'Just My Luck'),
 (2.868767392626467, 'Lady in the Water')]

## Building a del.icio.us Link Recommender(构建一个基于del.icio.us的推荐系统)

*由于pydelicious失效，这部分内容略去*

## Using the  MovieLens Dataset(使用MovieLens数据集)

**这里有几个在Python3下比较匪夷所思的问题：<br>1.path地址有问题，总是找不到合适的path地址，解决方法是利用os.getcwd()得到当前目录，其余同书上<br>
2.line.split('.')看csv就可知道，新的数据集分隔符为'.'<br>3.csv file第一行总有user，movieid等提示信息，不能一股脑录入，解决方案是直接在csv文件里删去**

In [103]:
import os
def loadMovieLens(path=os.getcwd()+'/data/movielens'):
    movies = {}
    for line in open(path + '/movies.csv'):
        (id, title) = line.split(',')[0:2]
        movies[id] = title
  # Load data
    prefs = {}
    for line in open(path + '/ratings.csv'):
        (user, movieid, rating, ts) = line.split(',')
        prefs.setdefault(user, {})
        prefs[user][movies[movieid]] = float(rating)
    return prefs

In [104]:
prefs = loadMovieLens()

In [105]:
prefs['87'] #第87位用户的评分如下

{'"Birdcage': 4.0,
 '"Rock': 3.0,
 'Beavis and Butt-Head Do America (1996)': 2.0,
 'Black Sheep (1996)': 3.0,
 'Broken Arrow (1996)': 3.0,
 'Cold Comfort Farm (1995)': 5.0,
 'Eraser (1996)': 3.0,
 'Executive Decision (1996)': 4.0,
 'Fargo (1996)': 5.0,
 'Happy Gilmore (1996)': 4.0,
 'Independence Day (a.k.a. ID4) (1996)': 3.0,
 'Kids in the Hall: Brain Candy (1996)': 3.0,
 'Kingpin (1996)': 4.0,
 'Leaving Las Vegas (1995)': 4.0,
 'Léon: The Professional (a.k.a. The Professional) (Léon) (1994)': 5.0,
 'Mighty Aphrodite (1995)': 4.0,
 'Mission: Impossible (1996)': 3.0,
 "Mr. Holland's Opus (1995)": 1.0,
 'Phenomenon (1996)': 3.0,
 'Rumble in the Bronx (Hont faan kui) (1995)': 3.0,
 'Sabrina (1995)': 3.0,
 'Shine (1996)': 5.0,
 'Star Wars: Episode IV - A New Hope (1977)': 4.0,
 'Star Wars: Episode VI - Return of the Jedi (1983)': 3.0,
 'Striptease (1996)': 3.0,
 'Tin Cup (1996)': 1.0,
 'Toy Story (1995)': 3.0,
 'Trainspotting (1996)': 3.0,
 'Twelve Monkeys (a.k.a. 12 Monkeys) (1995)': 3.0

In [106]:
getRecommendations(prefs,'87',)[0:30]

[(5.0, 'Wrong Cops (2013)'),
 (5.0, 'Wrong (2012)'),
 (5.0, 'Wolf Children (Okami kodomo no ame to yuki) (2012)'),
 (5.0, 'Without a Clue (1988)'),
 (5.0, 'Wish Upon a Star (1996)'),
 (5.0, 'Willie & Phil (1980)'),
 (5.0, 'When Night Is Falling (1995)'),
 (5.0, 'Waltz with Bashir (Vals im Bashir) (2008)'),
 (5.0, 'Waiter (Ober) (2006)'),
 (5.0, 'Voyeur (Abel) (1986)'),
 (5.0, 'Visions of Light: The Art of Cinematography (1992)'),
 (5.0, 'Village of the Damned (1960)'),
 (5.0, 'Undertow (2004)'),
 (5.0, 'Trailer Park Boys (1999)'),
 (5.0, 'Through the Olive Trees (Zire darakhatan zeyton) (1994)'),
 (5.0, 'They All Laughed (1981)'),
 (5.0, 'Theremin: An Electronic Odyssey (1993)'),
 (5.0, 'The Slipper and the Rose: The Story of Cinderella (1976)'),
 (5.0, 'The Pacific (2010)'),
 (5.0, 'The Last Days of Emma Blank (2009)'),
 (5.0, 'The Earrings of Madame de... (1953)'),
 (5.0, 'The Dress (1996)'),
 (5.0, 'The Big Bus (1976)'),
 (5.0, 'The Beatles: Eight Days a Week - The Touring Years (20

In [107]:
itemsim = calculateSimilarItems(prefs,n=50)

100 / 8963
200 / 8963
300 / 8963
400 / 8963
500 / 8963
600 / 8963
700 / 8963
800 / 8963
900 / 8963
1000 / 8963
1100 / 8963
1200 / 8963
1300 / 8963
1400 / 8963
1500 / 8963
1600 / 8963
1700 / 8963
1800 / 8963
1900 / 8963
2000 / 8963
2100 / 8963
2200 / 8963
2300 / 8963
2400 / 8963
2500 / 8963
2600 / 8963
2700 / 8963
2800 / 8963
2900 / 8963
3000 / 8963
3100 / 8963
3200 / 8963
3300 / 8963
3400 / 8963
3500 / 8963
3600 / 8963
3700 / 8963
3800 / 8963
3900 / 8963
4000 / 8963
4100 / 8963
4200 / 8963
4300 / 8963
4400 / 8963
4500 / 8963
4600 / 8963
4700 / 8963
4800 / 8963
4900 / 8963
5000 / 8963
5100 / 8963
5200 / 8963
5300 / 8963
5400 / 8963
5500 / 8963
5600 / 8963
5700 / 8963
5800 / 8963
5900 / 8963
6000 / 8963
6100 / 8963
6200 / 8963
6300 / 8963
6400 / 8963
6500 / 8963
6600 / 8963
6700 / 8963
6800 / 8963
6900 / 8963
7000 / 8963
7100 / 8963
7200 / 8963
7300 / 8963
7400 / 8963
7500 / 8963
7600 / 8963
7700 / 8963
7800 / 8963
7900 / 8963
8000 / 8963
8100 / 8963
8200 / 8963
8300 / 8963
8400 / 8963
8

In [108]:
getRecommendedItems(prefs,itemsim,'87')[0:30]

[(5.0, 'xXx (2002)'),
 (5.0, 'loudQUIETloud: A Film About the Pixies (2006)'),
 (5.0, 'Zombieland (2009)'),
 (5.0, 'Zodiac (2007)'),
 (5.0, 'Zenon: Z3 (2004)'),
 (5.0, 'Zenon: The Zequel (2001)'),
 (5.0, 'Zenon: Girl of the 21st Century (1999)'),
 (5.0, 'Young People Fucking (a.k.a. YPF) (2007)'),
 (5.0, 'Yossi & Jagger (2002)'),
 (5.0, 'Wrong Cops (2013)'),
 (5.0, 'Wrong (2012)'),
 (5.0, 'Wonderland (2003)'),
 (5.0, 'Wonder Woman (2009)'),
 (5.0, "Winter's Bone (2010)"),
 (5.0, 'Winnie the Pooh and the Blustery Day (1968)'),
 (5.0, 'Winnebago Man (2009)'),
 (5.0, 'Willie & Phil (1980)'),
 (5.0, 'Wild at Heart (1990)'),
 (5.0, 'White Lightning (1973)'),
 (5.0, 'Whiplash (2014)'),
 (5.0, 'Wetlands (Feuchtgebiete) (2013)'),
 (5.0, 'Wedding Crashers (2005)'),
 (5.0, 'We Own the Night (2007)'),
 (5.0, 'We Bought a Zoo (2011)'),
 (5.0, 'Walking and Talking (1996)'),
 (5.0, 'Waiter (Ober) (2006)'),
 (5.0, 'Voyeur (Abel) (1986)'),
 (5.0, 'Visions of Light: The Art of Cinematography (1992)'),
