In [50]:
from recommendations import critics
from math import sqrt
# Display the sample ratings object imported from the recommendations module
critics

{'Claudia Puig': {'Just My Luck': 3.0,
  'Snakes on a Plane': 3.5,
  'Superman Returns': 4.0,
  'The Night Listener': 4.5,
  'You, Me and Dupree': 2.5},
 'Gene Seymour': {'Just My Luck': 1.5,
  'Lady in the Water': 3.0,
  'Snakes on a Plane': 3.5,
  'Superman Returns': 5.0,
  'The Night Listener': 3.0,
  'You, Me and Dupree': 3.5},
 'Jack Matthews': {'Lady in the Water': 3.0,
  'Snakes on a Plane': 4.0,
  'Superman Returns': 5.0,
  'The Night Listener': 3.0,
  'You, Me and Dupree': 3.5},
 'Lisa Rose': {'Just My Luck': 3.0,
  'Lady in the Water': 2.5,
  'Snakes on a Plane': 3.5,
  'Superman Returns': 3.5,
  'The Night Listener': 3.0,
  'You, Me and Dupree': 2.5},
 'Michael Phillips': {'Lady in the Water': 2.5,
  'Snakes on a Plane': 3.0,
  'Superman Returns': 3.5,
  'The Night Listener': 4.0},
 'Mick LaSalle': {'Just My Luck': 2.0,
  'Lady in the Water': 3.0,
  'Snakes on a Plane': 4.0,
  'Superman Returns': 3.0,
  'The Night Listener': 3.0,
  'You, Me and Dupree': 2.0},
 'Toby': {'Snak

### 欧几里得距离评价

In [24]:
# Distance-based similarity score between two people
def sim_distance(prefs, person1, person2):
    """Return a Euclidean-distance-based similarity for two entries of prefs.

    prefs maps each person to a dict of item -> rating. Only items rated by
    both person1 and person2 contribute. The score is 1 / (1 + distance):
    identical ratings give 1.0 and larger distances move towards 0.
    Returns 0 when the two have no items in common.
    """
    # Items rated by both, in person1's iteration order
    shared = [item for item in prefs[person1] if item in prefs[person2]]
    if not shared:
        return 0

    # Accumulate the squared rating differences over the shared items
    sum_of_squares = sum(
        (prefs[person1][item] - prefs[person2][item]) ** 2 for item in shared
    )

    # Adding 1 to the distance keeps the denominator non-zero before inverting
    return 1 / (1 + sqrt(sum_of_squares))

# Try it out: similarity score between Lisa Rose and Gene Seymour
sim_distance(critics, 'Lisa Rose', 'Gene Seymour')

0.29429805508554946

### 皮尔逊相关系数

In [25]:
def sim_pearson(prefs, p1, p2):
    """Return the Pearson correlation coefficient between p1 and p2.

    prefs maps each person to a dict of item -> rating; only items rated by
    both p1 and p2 are used. The result lies in the range -1 to 1.

    Returns 0 when the two share no items (there is no evidence of any
    correlation) and when either side has zero variance over the shared
    items, where the coefficient is undefined.
    """
    # Items rated by both people
    shared = [item for item in prefs[p1] if item in prefs[p2]]
    n = len(shared)

    # No overlap means "no correlation", the same convention sim_distance
    # uses. Returning 1 here would treat strangers as perfect matches and
    # give them the highest possible weight in weighted recommendations.
    if n == 0:
        return 0

    xs = [prefs[p1][item] for item in shared]
    ys = [prefs[p2][item] for item in shared]

    # Sums of the ratings
    sum1 = sum(xs)
    sum2 = sum(ys)

    # Sums of the squared ratings
    sum1sq = sum(x * x for x in xs)
    sum2sq = sum(y * y for y in ys)

    # Sum of the pairwise products
    psum = sum(x * y for x, y in zip(xs, ys))

    # Pearson score: covariance term over the product of the spreads
    num = psum - (sum1 * sum2 / n)
    den_sq = (sum1sq - pow(sum1, 2) / n) * (sum2sq - pow(sum2, 2) / n)

    # Zero variance makes the score undefined; floating-point rounding can
    # also push the product slightly below zero, which sqrt() would reject.
    if den_sq <= 0:
        return 0
    return num / sqrt(den_sq)

# Correlation coefficient between Lisa Rose and Gene Seymour (output ranges from -1 to 1)
sim_pearson(critics, 'Lisa Rose', 'Gene Seymour')

0.39605901719066977

### 使用上面的方法为评论者打分

In [18]:
# Return the best matches for one entry of the preference dictionary.
# Both the number of results and the similarity function are optional.
def topMatches(prefs, person, n=5, similarity=sim_pearson):
    """Rank every other key of prefs by its similarity to person.

    prefs: the ratings data set
    person: the key in prefs that the others are compared against
    n: how many of the closest matches to return
    similarity: callable taking (prefs, person, other) and returning a score

    Returns a list of at most n (score, other) tuples, highest score first.
    """
    ranked = []
    for other in prefs:
        if other != person:
            ranked.append((similarity(prefs, person, other), other))

    # Sort ascending, then flip so the highest score comes first
    ranked.sort()
    return ranked[::-1][:n]
# The three entries most similar to Toby, using the default similarity function
topMatches(critics, 'Toby', n=3)

[(0.9912407071619299, 'Lisa Rose'),
 (0.9244734516419049, 'Mick LaSalle'),
 (0.8934051474415647, 'Claudia Puig')]

In [39]:
def getRecommendations(prefs, person, simlarity=sim_pearson):
    """Recommend items for person from everyone else's ratings.

    Each candidate item is scored as the similarity-weighted average of the
    ratings given by the other people in prefs. People whose similarity to
    person is zero or negative are ignored, and only items that person has
    not rated (or has rated 0) are considered.

    simlarity: callable taking (prefs, person, other) and returning a score.
        The misspelled keyword name is kept because callers pass it by name.

    Returns a list of (predicted_rating, item) tuples, highest first.
    """
    weighted_totals = {}
    weight_sums = {}

    for other, other_ratings in prefs.items():
        # Never compare the person with themselves
        if other == person:
            continue

        sim = simlarity(prefs, person, other)
        # Skip scores of zero or below
        if sim <= 0:
            continue

        for item, rating in other_ratings.items():
            # Only score items the person has not already rated
            already_rated = item in prefs[person] and prefs[person][item] != 0
            if already_rated:
                continue
            # Running sum of similarity * rating
            weighted_totals[item] = weighted_totals.get(item, 0) + rating * sim
            # Running sum of the similarities themselves
            weight_sums[item] = weight_sums.get(item, 0) + sim

    # Normalise each total by the similarity mass that produced it
    rankings = sorted(
        (total / weight_sums[item], item) for item, total in weighted_totals.items()
    )

    # Highest predicted rating first
    rankings.reverse()
    return rankings

In [40]:
# Recommendations scored with the Pearson correlation coefficient
getRecommendations(critics, 'Toby')

[(3.3477895267131017, 'The Night Listener'),
 (2.8325499182641614, 'Lady in the Water'),
 (2.530980703765565, 'Just My Luck')]

In [41]:
# Recommendations scored with the Euclidean distance similarity
getRecommendations(critics, 'Toby', simlarity=sim_distance)

[(3.457128694491423, 'The Night Listener'),
 (2.778584003814924, 'Lady in the Water'),
 (2.422482042361917, 'Just My Luck')]

### 物品间的相关度

In [46]:
# Swap the roles of people and items so the same functions can be reused
def transformPrefs(prefs):
    """Invert prefs from person -> {item: rating} to item -> {person: rating}."""
    flipped = {}
    for person, ratings in prefs.items():
        for item, rating in ratings.items():
            # Create the per-item dict on first sight, then record this person's rating
            flipped.setdefault(item, {})[person] = rating
    return flipped

In [47]:
# Call topMatches on the transposed data to get the movies most similar to "Superman Returns"
movies = transformPrefs(critics)
topMatches(movies, 'Superman Returns')

[(0.6579516949597695, 'You, Me and Dupree'),
 (0.4879500364742689, 'Lady in the Water'),
 (0.11180339887498941, 'Snakes on a Plane'),
 (-0.1798471947990544, 'The Night Listener'),
 (-0.42289003161103106, 'Just My Luck')]

### 基于物品的过滤

In [58]:
# Build the item-to-item comparison data set
def calculateSimilarItems(prefs, n=10):
    """Map every item to its n most similar other items.

    prefs is person -> {item: rating}; it is transposed to be item-centric
    and each item is then matched against the rest with sim_distance.
    A "done / total" progress line is printed after every 100 items.

    Returns a dict of item -> list of (score, other_item) tuples as
    produced by topMatches.
    """
    # Flip the preference matrix so that items become the keys
    itemPrefs = transformPrefs(prefs)
    total = len(itemPrefs)

    similar = {}
    for count, item in enumerate(itemPrefs, start=1):
        # Status update for large data sets
        if count % 100 == 0:
            print("%d / %d" % (count, total))
        # Find the items closest to this one
        similar[item] = topMatches(itemPrefs, item, n=n, similarity=sim_distance)
    return similar

In [95]:
# Build the item-similarity table for the sample critics data and display it
itemsim = calculateSimilarItems(critics)
itemsim

{'Just My Luck': [(0.3483314773547883, 'Lady in the Water'),
  (0.32037724101704074, 'You, Me and Dupree'),
  (0.2989350844248255, 'The Night Listener'),
  (0.2553967929896867, 'Snakes on a Plane'),
  (0.20799159651347807, 'Superman Returns')],
 'Lady in the Water': [(0.4494897427831781, 'You, Me and Dupree'),
  (0.38742588672279304, 'The Night Listener'),
  (0.3483314773547883, 'Snakes on a Plane'),
  (0.3483314773547883, 'Just My Luck'),
  (0.2402530733520421, 'Superman Returns')],
 'Snakes on a Plane': [(0.3483314773547883, 'Lady in the Water'),
  (0.32037724101704074, 'The Night Listener'),
  (0.3090169943749474, 'Superman Returns'),
  (0.2553967929896867, 'Just My Luck'),
  (0.1886378647726465, 'You, Me and Dupree')],
 'Superman Returns': [(0.3090169943749474, 'Snakes on a Plane'),
  (0.252650308587072, 'The Night Listener'),
  (0.2402530733520421, 'Lady in the Water'),
  (0.20799159651347807, 'Just My Luck'),
  (0.1918253663634734, 'You, Me and Dupree')],
 'The Night Listener': [

In [106]:
# Item-based recommendations: normalised, similarity-weighted rating estimates
def getRecommendedItems(prefs, itemMatch, user):
    """Recommend unrated items to user from a precomputed item-similarity table.

    prefs: dict of user -> {item: rating}
    itemMatch: dict of item -> list of (similarity, other_item) tuples
    user: key in prefs to produce recommendations for

    For every item the user rated, each similar item the user has not rated
    accumulates similarity * rating; the total is then divided by the sum of
    the similarities to give a weighted-average rating estimate.

    Returns a list of (estimated_rating, item) tuples, highest first.
    Candidates whose similarities sum to zero are omitted, because no
    estimate can be formed for them.
    Raises KeyError if user is not in prefs or a rated item is missing
    from itemMatch.
    """
    userRatings = prefs[user]
    scores = {}
    totalSim = {}

    # Walk over the items this user has rated
    for item, rating in userRatings.items():
        # Walk over the items similar to this one
        for similarity, item2 in itemMatch[item]:
            # Skip anything the user has already rated
            if item2 in userRatings:
                continue
            # Weighted sum of rating times similarity
            scores[item2] = scores.get(item2, 0) + similarity * rating
            # Sum of all the similarities
            totalSim[item2] = totalSim.get(item2, 0) + similarity

    # Divide each weighted total by its similarity sum to get an average.
    # A zero sum (e.g. only zero-similarity neighbours) would raise
    # ZeroDivisionError, so such candidates are dropped instead.
    rankings = [
        (score / totalSim[item2], item2)
        for item2, score in scores.items()
        if totalSim[item2] != 0
    ]

    # Highest estimate first
    rankings.sort(reverse=True)
    return rankings

In [107]:
# Run the function to produce recommendations for Toby
getRecommendedItems(critics, itemsim, 'Toby')

[(3.1667425234070894, 'The Night Listener'),
 (2.9366294028444346, 'Just My Luck'),
 (2.868767392626467, 'Lady in the Water')]

### 使用Movielens数据集

In [123]:
def loadMovieLens(path='ml-100k'):
    """Load ratings from the `u.item` and `u.data` files found under path.

    `u.item` is read as pipe-separated lines whose first two fields are the
    movie id and title; `u.data` is read as tab-separated lines of
    user, movie id, rating, timestamp. Both are decoded as ISO-8859-1.
    Blank lines in either file are skipped.

    Returns a dict of user id (str) -> {movie title: rating as float}.
    Ratings are keyed by title, so distinct movie ids that share a title
    end up under the same key.
    Raises KeyError if `u.data` references a movie id missing from `u.item`.
    """
    # Movie id -> title lookup
    movies = {}
    with open(path + '/u.item', encoding='ISO-8859-1') as item_file:
        for line in item_file:
            if not line.strip():
                continue
            movie_id, title = line.split('|')[0:2]
            movies[movie_id] = title

    # Load the ratings themselves
    prefs = {}
    with open(path + '/u.data', encoding='ISO-8859-1') as data_file:
        for line in data_file:
            if not line.strip():
                continue
            # The timestamp column is read but not used
            user, movieid, rating, _timestamp = line.split('\t')
            prefs.setdefault(user, {})[movies[movieid]] = float(rating)
    return prefs
# Load the data set from the default path and display the ratings stored for user '1'
prefs = loadMovieLens()
prefs['1']

{'101 Dalmatians (1996)': 2.0,
 '12 Angry Men (1957)': 5.0,
 '20,000 Leagues Under the Sea (1954)': 3.0,
 '2001: A Space Odyssey (1968)': 4.0,
 'Abyss, The (1989)': 3.0,
 'Ace Ventura: Pet Detective (1994)': 3.0,
 'Air Bud (1997)': 1.0,
 'Akira (1988)': 4.0,
 'Aladdin (1992)': 4.0,
 'Alien (1979)': 5.0,
 'Aliens (1986)': 5.0,
 'All Dogs Go to Heaven 2 (1996)': 1.0,
 'Amadeus (1984)': 5.0,
 'Angels and Insects (1995)': 4.0,
 "Antonia's Line (1995)": 5.0,
 'Apocalypse Now (1979)': 3.0,
 'Apollo 13 (1995)': 4.0,
 'Aristocats, The (1970)': 2.0,
 'Army of Darkness (1993)': 4.0,
 'Austin Powers: International Man of Mystery (1997)': 4.0,
 'Babe (1995)': 1.0,
 'Back to the Future (1985)': 5.0,
 'Bad Boys (1995)': 2.0,
 'Basic Instinct (1992)': 3.0,
 'Batman & Robin (1997)': 1.0,
 'Batman Forever (1995)': 1.0,
 'Batman Returns (1992)': 1.0,
 'Beavis and Butt-head Do America (1996)': 3.0,
 'Bedknobs and Broomsticks (1971)': 2.0,
 'Belle de jour (1967)': 3.0,
 'Big Night (1996)': 5.0,
 'Billy Ma

In [129]:
# User-based recommendations: first 30 results for user '87'
getRecommendations(prefs, '87')[0:30]

[(5.0, 'They Made Me a Criminal (1939)'),
 (5.0, 'Star Kid (1997)'),
 (5.0, 'Santa with Muscles (1996)'),
 (5.0, 'Saint of Fort Washington, The (1993)'),
 (5.0, 'Marlene Dietrich: Shadow and Light (1996) '),
 (5.0, 'Great Day in Harlem, A (1994)'),
 (5.0, 'Entertaining Angels: The Dorothy Day Story (1996)'),
 (5.0, 'Boys, Les (1997)'),
 (4.89884443128923, 'Legal Deceit (1997)'),
 (4.815019082242709, 'Letter From Death Row, A (1998)'),
 (4.800260666069042, 'Mrs. Dalloway (1997)'),
 (4.771240079753505, 'Leading Man, The (1996)'),
 (4.7321082983941425, 'Hearts and Minds (1996)'),
 (4.707354190896574, 'Dangerous Beauty (1998)'),
 (4.696244466490867, 'Pather Panchali (1955)'),
 (4.652397061026758, 'Lamerica (1994)'),
 (4.532337612572981, 'Innocents, The (1961)'),
 (4.527998574747076, 'Casablanca (1942)'),
 (4.512903125553784, 'Four Days in September (1997)'),
 (4.510270149719864, 'Everest (1998)'),
 (4.485151301801341, 'Wallace & Gromit: The Best of Aardman Animation (1996)'),
 (4.463287461

In [127]:
# Item-based recommendations: precompute the 50 nearest items for every movie.
# NOTE: this rebinds `itemsim`, which an earlier cell built from `critics`.
itemsim = calculateSimilarItems(prefs, n=50)

100 / 1664
200 / 1664
300 / 1664
400 / 1664
500 / 1664
600 / 1664
700 / 1664
800 / 1664
900 / 1664
1000 / 1664
1100 / 1664
1200 / 1664
1300 / 1664
1400 / 1664
1500 / 1664
1600 / 1664


In [128]:
# Item-based recommendations: first 30 results for user '87'
getRecommendedItems(prefs, itemsim, '87')[0:30]

[(5.0, "What's Eating Gilbert Grape (1993)"),
 (5.0, 'Vertigo (1958)'),
 (5.0, 'Usual Suspects, The (1995)'),
 (5.0, 'Toy Story (1995)'),
 (5.0, 'Titanic (1997)'),
 (5.0, 'Sword in the Stone, The (1963)'),
 (5.0, 'Stand by Me (1986)'),
 (5.0, 'Sling Blade (1996)'),
 (5.0, 'Silence of the Lambs, The (1991)'),
 (5.0, 'Shining, The (1980)'),
 (5.0, 'Shine (1996)'),
 (5.0, 'Sense and Sensibility (1995)'),
 (5.0, 'Scream (1996)'),
 (5.0, 'Rumble in the Bronx (1995)'),
 (5.0, 'Rock, The (1996)'),
 (5.0, 'Robin Hood: Prince of Thieves (1991)'),
 (5.0, 'Reservoir Dogs (1992)'),
 (5.0, 'Police Story 4: Project S (Chao ji ji hua) (1993)'),
 (5.0, 'House of the Spirits, The (1993)'),
 (5.0, 'Fresh (1994)'),
 (5.0, 'Denise Calls Up (1995)'),
 (5.0, 'Day the Sun Turned Cold, The (Tianguo niezi) (1994)'),
 (5.0, 'Before the Rain (Pred dozhdot) (1994)'),
 (5.0, 'Assignment, The (1997)'),
 (5.0, '1-900 (1994)'),
 (4.875, "Ed's Next Move (1996)"),
 (4.833333333333333, 'Anna (1996)'),
 (4.8, 'Dark City 