# 推荐系统

## 协同过滤
* Item-Base
* User-Base

In [1]:
# Sample ratings used throughout the notebook: critic name -> {film title: score}.
critics = {
    'Lisa Rose': {
        'Lady in the Water': 2.5,
        'Snakes on a Plane': 3.5,
        'Just My Luck': 3.0,
        'Superman Returns': 3.5,
        'Yor, Me and Dupree': 2.5,
        'The Night Listener': 3.0,
    },
    'Gene Seymour': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 3.5,
        'Just My Luck': 1.5,
        'Superman Returns': 5.0,
        'The Night Listener': 3.0,
        'Yor, Me and Dupree': 3.5,
    },
    'Michale Phillips': {
        'Lady in the Water': 2.5,
        'Snakes on a Plane': 3.0,
        'Superman Returns': 3.5,
        'The Night Listener': 4.0,
    },
    'Claudia Puig': {
        'Snakes on a Plane': 3.5,
        'Just My Luck': 3.0,
        'The Night Listener': 4.5,
        'Superman Returns': 4.0,
        'Yor, Me and Dupree': 2.5,
    },
    'Mick LaSalle': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 4.0,
        'Just My Luck': 2.0,
        'Superman Returns': 3.0,
        'The Night Listener': 3.0,
        'Yor, Me and Dupree': 2.0,
    },
    'Jack Matthews': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 4.0,
        'The Night Listener': 3.0,
        'Superman Returns': 5.0,
        'Yor, Me and Dupree': 3.5,
    },
    'Toby': {
        'Snakes on a Plane': 4.5,
        'Yor, Me and Dupree': 1.0,
        'Superman Returns': 4.0,
    },
}

In [2]:
critics['Lisa Rose']['Lady in the Water']

2.5

## 欧氏距离来计算相似度

In [9]:
from math import sqrt

def sim_distance(prefs, person1, person2):
    """Return a Euclidean-distance-based similarity between two people.

    Only items that appear in both ``prefs[person1]`` and ``prefs[person2]``
    are compared.

    Args:
        prefs: Mapping of person -> {item: numeric rating}.
        person1: Key of the first person in ``prefs``.
        person2: Key of the second person in ``prefs``.

    Returns:
        ``0`` when the two people share no rated item; otherwise
        ``1 / (1 + d)`` where ``d`` is the Euclidean distance between their
        ratings of the shared items (so identical ratings give ``1.0``).
    """
    mine = prefs[person1]

    # One squared rating gap per item that both people have rated.
    squared_gaps = [
        pow(mine[title] - prefs[person2][title], 2)
        for title in mine
        if title in prefs[person2]
    ]

    if not squared_gaps:
        return 0

    distance = sqrt(sum(squared_gaps))
    return 1 / (1 + distance)
        

In [10]:
sim_distance(critics,'Lisa Rose', 'Gene Seymour')

0.29429805508554946

## 找出与某个person 最相近的

In [39]:
def find_most_clearly(prefs, person):
    """Print every similarity to ``person`` and return the highest one.

    Each other key in ``prefs`` is scored against ``person`` with
    ``sim_distance``; one line per comparison is printed, followed by a
    summary line naming the best match.

    Args:
        prefs: Mapping of person -> {item: numeric rating}.
        person: Key in ``prefs`` to find the closest match for.

    Returns:
        The highest similarity score found, or ``0`` when ``prefs`` contains
        nobody other than ``person``.
    """
    print ('>>>>>>>>>>>Find "%s" the most clearly>>>>>>>>>>>' % person)

    best_person = None
    best_score = None
    for other in prefs:
        if other == person:
            continue

        # Score against the ``prefs`` argument so the function works for any
        # data set, not only the module-level sample data.
        score = sim_distance(prefs, person, other)

        # Track the running maximum instead of re-scanning a list each time.
        if best_score is None or score > best_score:
            best_score = score
        # On a tie the most recently seen person is reported as the match.
        if score == best_score:
            best_person = other

        print('"%s" vs. "%s": %.8f' % (person, other, score))

    if best_score is None:
        return 0

    print('Vs."%s" most clearly is "%s", clearly value is: %.8f' % (person, best_person, best_score))
    return best_score

In [40]:
find_most_clearly(critics, 'Lisa Rose')

>>>>>>>>>>>Find "Lisa Rose" the most clearly>>>>>>>>>>>
"Lisa Rose" vs. "Jack Matthews": 0.34054243
"Lisa Rose" vs. "Mick LaSalle": 0.41421356
"Lisa Rose" vs. "Claudia Puig": 0.38742589
"Lisa Rose" vs. "Toby": 0.34833148
"Lisa Rose" vs. "Michale Phillips": 0.47213595
"Lisa Rose" vs. "Gene Seymour": 0.29429806
Vs. "Lisa Rose" most clearly is "Michale Phillips", clearly value is: 0.47213595


0.4721359549995794

In [41]:
find_most_clearly(critics, 'Gene Seymour')

>>>>>>>>>>>Find "Gene Seymour" the most clearly>>>>>>>>>>>
"Gene Seymour" vs. "Jack Matthews": 0.66666667
"Gene Seymour" vs. "Mick LaSalle": 0.27792630
"Gene Seymour" vs. "Claudia Puig": 0.28172905
"Gene Seymour" vs. "Lisa Rose": 0.29429806
"Gene Seymour" vs. "Toby": 0.25824570
"Gene Seymour" vs. "Michale Phillips": 0.34054243
Vs. "Gene Seymour" most clearly is "Jack Matthews", clearly value is: 0.66666667


0.6666666666666666

In [42]:
find_most_clearly(critics, 'Toby')

>>>>>>>>>>>Find "Toby" the most clearly>>>>>>>>>>>
"Toby" vs. "Jack Matthews": 0.26747889
"Toby" vs. "Mick LaSalle": 0.40000000
"Toby" vs. "Claudia Puig": 0.35678917
"Toby" vs. "Lisa Rose": 0.34833148
"Toby" vs. "Michale Phillips": 0.38742589
"Toby" vs. "Gene Seymour": 0.25824570
Vs. "Toby" most clearly is "Mick LaSalle", clearly value is: 0.40000000


0.4

# 皮尔逊相关度 来计算相似度
皮尔逊相关度：首先要找出两个 person 都评价过的物品，然后分别计算两者的评分总和与评分平方和，并求得两者评分的乘积之和，最后利用这些值计算出皮尔逊相关系数。

In [43]:
def sim_person(prefs, p1, p2):
    """Return the Pearson correlation score between two people's ratings.

    Only items present in both ``prefs[p1]`` and ``prefs[p2]`` take part in
    the calculation.

    Args:
        prefs: Mapping of person -> {item: numeric rating}.
        p1: Key of the first person in ``prefs``.
        p2: Key of the second person in ``prefs``.

    Returns:
        ``0`` when there is no shared item or when the denominator of the
        formula is not positive (e.g. one person gave every shared item the
        same rating); otherwise the correlation ``num / den``.
    """
    # Items rated by both people.
    shared = [item for item in prefs[p1] if item in prefs[p2]]

    n = len(shared)
    if n == 0:
        return 0

    # Sum of each person's ratings.
    sum1 = sum([prefs[p1][it] for it in shared])
    sum2 = sum([prefs[p2][it] for it in shared])

    # Sum of the squared ratings.
    sum1Sq = sum([pow(prefs[p1][it], 2) for it in shared])
    sum2Sq = sum([pow(prefs[p2][it], 2) for it in shared])

    # Sum of the products of paired ratings.
    pSum = sum([prefs[p1][it] * prefs[p2][it] for it in shared])

    # Pearson score: covariance term over the product of the spread terms.
    num = pSum - (sum1 * sum2 / n)
    den_sq = (sum1Sq - pow(sum1, 2) / n) * (sum2Sq - pow(sum2, 2) / n)

    # Floating-point rounding can leave a mathematically zero spread slightly
    # negative; check before sqrt() so that case returns 0 instead of raising
    # ValueError.
    if den_sq <= 0:
        return 0

    return num / sqrt(den_sq)

In [44]:
sim_person(critics,'Lisa Rose', 'Gene Seymour')

0.39605901719066977

### 一个通用方法，根据指定人员与每个人进行相似度比较 

In [64]:
def topMatches(prefs, person, n=5, similarity=sim_person):
    """Return the ``n`` entries of ``prefs`` most similar to ``person``.

    Args:
        prefs: Mapping of key -> {item: numeric rating}.
        person: Key in ``prefs`` to compare everyone else against.
        n: Maximum number of matches to return.
        similarity: Callable ``(prefs, a, b) -> score`` used for comparison.

    Returns:
        A list of ``(score, key)`` tuples, highest score first, at most ``n``
        long. ``person`` itself is never included.
    """
    ranked = []
    for candidate in prefs:
        if candidate == person:
            continue
        ranked.append((similarity(prefs, person, candidate), candidate))

    # Ascending sort followed by a reversal puts the best scores first.
    return sorted(ranked)[::-1][:n]

In [65]:
topMatches(critics, 'Lisa Rose')

[(0.9912407071619299, 'Toby'),
 (0.7470178808339965, 'Jack Matthews'),
 (0.5940885257860044, 'Mick LaSalle'),
 (0.5669467095138396, 'Claudia Puig'),
 (0.40451991747794525, 'Michale Phillips')]

## 如何为用户推荐还没有看过的电影？
从现有的用户看过的，且评分高的影片中推荐

In [71]:
def getRecommendations(prefs, person, similarity=sim_person):
    """Recommend items for ``person`` from everyone else's weighted ratings.

    Each other person's rating of an item is weighted by that person's
    similarity to ``person``; the weighted ratings are then averaged per item.

    Args:
        prefs: Mapping of person -> {item: numeric rating}.
        person: Key in ``prefs`` to produce recommendations for.
        similarity: Callable ``(prefs, a, b) -> score`` used as the weight.

    Returns:
        A list of ``(predicted_rating, item)`` tuples, highest prediction
        first, covering items ``person`` has not rated (or rated ``0``).
    """
    weighted_totals = {}
    weight_sums = {}

    for other in prefs:
        if other == person:
            continue

        weight = similarity(prefs, person, other)
        # People with zero or negative similarity contribute nothing.
        if weight <= 0:
            continue

        for item in prefs[other]:
            already_rated = item in prefs[person] and prefs[person][item] != 0
            if already_rated:
                continue
            # Rating scaled by how similar this person is.
            weighted_totals[item] = weighted_totals.get(item, 0) + prefs[other][item] * weight
            # Total weight, used below to normalise the score.
            weight_sums[item] = weight_sums.get(item, 0) + weight

    # Normalised list: weighted total divided by the total weight per item.
    rankings = []
    for item, total in weighted_totals.items():
        rankings.append((total / weight_sums[item], item))

    return sorted(rankings)[::-1]

In [74]:
# Show Toby's existing ratings, then the recommendations computed for him.
# print() is written as a call so the cell parses under Python 2 and Python 3,
# matching the other print calls in this file.
print(critics['Toby'])
getRecommendations(critics, 'Toby')

{'Snakes on a Plane': 4.5, 'Yor, Me and Dupree': 1.0, 'Superman Returns': 4.0}


[(3.3477895267131013, 'The Night Listener'),
 (2.8325499182641614, 'Lady in the Water'),
 (2.5309807037655645, 'Just My Luck')]

In [85]:
getRecommendations(critics, 'Toby', similarity=sim_distance)

[(3.4571286944914226, 'The Night Listener'),
 (2.778584003814923, 'Lady in the Water'),
 (2.4224820423619167, 'Just My Luck')]

In [78]:
def transformPrefs(prefs):
    """Swap the two levels of a nested ratings dictionary.

    Args:
        prefs: Mapping of person -> {item: rating}.

    Returns:
        A new mapping of item -> {person: rating}. ``prefs`` is not modified.
    """
    flipped = {}
    for person, ratings in prefs.items():
        for item, score in ratings.items():
            if item not in flipped:
                flipped[item] = {}
            flipped[item][person] = score
    return flipped

In [80]:
movies = transformPrefs(critics)
movies

{'Just My Luck': {'Claudia Puig': 3.0,
  'Gene Seymour': 1.5,
  'Lisa Rose': 3.0,
  'Mick LaSalle': 2.0},
 'Lady in the Water': {'Gene Seymour': 3.0,
  'Jack Matthews': 3.0,
  'Lisa Rose': 2.5,
  'Michale Phillips': 2.5,
  'Mick LaSalle': 3.0},
 'Snakes on a Plane': {'Claudia Puig': 3.5,
  'Gene Seymour': 3.5,
  'Jack Matthews': 4.0,
  'Lisa Rose': 3.5,
  'Michale Phillips': 3.0,
  'Mick LaSalle': 4.0,
  'Toby': 4.5},
 'Superman Returns': {'Claudia Puig': 4.0,
  'Gene Seymour': 5.0,
  'Jack Matthews': 5.0,
  'Lisa Rose': 3.5,
  'Michale Phillips': 3.5,
  'Mick LaSalle': 3.0,
  'Toby': 4.0},
 'The Night Listener': {'Claudia Puig': 4.5,
  'Gene Seymour': 3.0,
  'Jack Matthews': 3.0,
  'Lisa Rose': 3.0,
  'Michale Phillips': 4.0,
  'Mick LaSalle': 3.0},
 'Yor, Me and Dupree': {'Claudia Puig': 2.5,
  'Gene Seymour': 3.5,
  'Jack Matthews': 3.5,
  'Lisa Rose': 2.5,
  'Mick LaSalle': 2.0,
  'Toby': 1.0}}

以上结果便是 每部电影，用户的评分

In [81]:
topMatches(movies, 'Just My Luck')

[(0.5555555555555556, 'The Night Listener'),
 (-0.3333333333333333, 'Snakes on a Plane'),
 (-0.42289003161103106, 'Superman Returns'),
 (-0.4856618642571827, 'Yor, Me and Dupree'),
 (-0.9449111825230676, 'Lady in the Water')]

In [82]:
topMatches(movies, 'Superman Returns')

[(0.6579516949597695, 'Yor, Me and Dupree'),
 (0.4879500364742689, 'Lady in the Water'),
 (0.11180339887498941, 'Snakes on a Plane'),
 (-0.1798471947990544, 'The Night Listener'),
 (-0.42289003161103106, 'Just My Luck')]

以上简单的代码便可实现，在用户购买一个商品时，为用户推荐 **买过该商品的用户也买/看过这些商品** 的推荐!

# 基于 物品 的推荐 (Item-Base)

In [207]:
def caculateSimilarItems(prefs, n=10, printLogCount=100):
    """Build the item-to-item similarity table used for item-based filtering.

    Args:
        prefs: Mapping of person -> {item: numeric rating}.
        n: Number of most-similar items to keep for each item.
        printLogCount: A progress line is printed every this many items.

    Returns:
        Mapping of item -> list of ``(similarity, other_item)`` tuples as
        returned by ``topMatches`` with ``sim_distance``.
    """
    # Flip the data so that it is keyed by item instead of by person.
    itemPrefs = transformPrefs(prefs)
    total = len(itemPrefs)

    results = {}
    for position, item in enumerate(itemPrefs, 1):
        if position % printLogCount == 0:
            print('%d / %d' %(position, total))

        # Items most similar to the current one.
        results[item] = topMatches(itemPrefs, item, n=n, similarity=sim_distance)

    return results

In [86]:
itemSim = caculateSimilarItems(critics)
itemSim

{'Just My Luck': [(0.3483314773547883, 'Lady in the Water'),
  (0.32037724101704074, 'Yor, Me and Dupree'),
  (0.2989350844248255, 'The Night Listener'),
  (0.2553967929896867, 'Snakes on a Plane'),
  (0.20799159651347807, 'Superman Returns')],
 'Lady in the Water': [(0.4494897427831781, 'Yor, Me and Dupree'),
  (0.38742588672279304, 'The Night Listener'),
  (0.3483314773547883, 'Snakes on a Plane'),
  (0.3483314773547883, 'Just My Luck'),
  (0.2402530733520421, 'Superman Returns')],
 'Snakes on a Plane': [(0.3483314773547883, 'Lady in the Water'),
  (0.32037724101704074, 'The Night Listener'),
  (0.3090169943749474, 'Superman Returns'),
  (0.2553967929896867, 'Just My Luck'),
  (0.1886378647726465, 'Yor, Me and Dupree')],
 'Superman Returns': [(0.3090169943749474, 'Snakes on a Plane'),
  (0.252650308587072, 'The Night Listener'),
  (0.2402530733520421, 'Lady in the Water'),
  (0.20799159651347807, 'Just My Luck'),
  (0.1918253663634734, 'Yor, Me and Dupree')],
 'The Night Listener': [

In [98]:
def getRecommendedItems(prefs, itemMatch, user):
    """Recommend items for ``user`` from a precomputed item-similarity table.

    For every item the user has rated, each similar item the user has not
    rated receives ``similarity * rating``; the totals are then divided by the
    summed similarity to give a weighted average.

    Args:
        prefs: Mapping of user -> {item: numeric rating}.
        itemMatch: Mapping of item -> list of ``(similarity, other_item)``
            tuples, e.g. the result of ``caculateSimilarItems``.
        user: Key in ``prefs`` to produce recommendations for.

    Returns:
        A list of ``(predicted_rating, item)`` tuples, highest prediction
        first. Candidates whose similarities sum to zero are omitted because
        no weighted average can be computed for them.
    """
    userRatings = prefs[user]
    scores = {}
    totalSim = {}
    for (item, rating) in userRatings.items():
        for (similarity, item2) in itemMatch[item]:
            # Never recommend something the user has already rated.
            if item2 in userRatings:
                continue

            # Weighted sum of rating times similarity.
            scores[item2] = scores.get(item2, 0) + similarity * rating

            # Sum of all the similarities.
            totalSim[item2] = totalSim.get(item2, 0) + similarity

    # Divide each weighted total by the similarity total to get an average.
    # A zero total would raise ZeroDivisionError, so those candidates are
    # left out.
    rankings = [
        (score / totalSim[item], item)
        for (item, score) in scores.items()
        if totalSim[item] != 0
    ]

    rankings.sort(reverse=True)
    return rankings

In [99]:
getRecommendedItems(critics, itemSim, 'Toby')

[(3.1667425234070894, 'The Night Listener'),
 (2.9366294028444346, 'Just My Luck'),
 (2.868767392626467, 'Lady in the Water')]

# 使用 MovieLens 数据集

* MovieLens 数据集: https://grouplens.org/datasets/movielens/


In [163]:
import pandas as pd

In [178]:
def loadMovieLens(path='data/MovieLens/'):
    """Load MovieLens ratings into a nested ``user -> {title: rating}`` dict.

    Reads ``movies.csv`` (columns ``movieId`` and ``title``) and
    ``ratings.csv`` (columns ``userId``, ``movieId`` and ``rating``) from
    ``path``.

    Args:
        path: Directory containing both CSV files; a trailing separator is
            optional.

    Returns:
        Mapping of user id (as a string) -> {movie title: rating as float}.
    """
    import os

    # movieId -> title lookup table.
    moviesData = pd.read_csv(os.path.join(path, 'movies.csv'))
    movies = dict(zip(moviesData['movieId'], moviesData['title']))

    # Both files are read from ``path`` so a non-default location works.
    ratingsData = pd.read_csv(os.path.join(path, 'ratings.csv'))

    prefs = {}
    # Walk the three columns in lockstep rather than slicing one row at a time.
    rows = zip(ratingsData['userId'], ratingsData['movieId'], ratingsData['rating'])
    for userId, movieId, rating in rows:
        prefs.setdefault(str(userId), {})[movies[movieId]] = float(rating)

    return prefs

In [179]:
moviePrefs = loadMovieLens()

In [181]:
# 基于用户推荐
# moviePrefs['87']
#moviePrefs.items()[0:10]
getRecommendations(moviePrefs, '87')[0:10]

[(5.0, 'Wrong Cops (2013)'),
 (5.0, 'Wrong (2012)'),
 (5.0, 'Wolf Children (Okami kodomo no ame to yuki) (2012)'),
 (5.0, 'Without a Clue (1988)'),
 (5.0, 'Wish Upon a Star (1996)'),
 (5.0, 'Willie & Phil (1980)'),
 (5.0, 'White Sound, The (Das wei\xc3\x9fe Rauschen) (2001)'),
 (5.0, 'When Night Is Falling (1995)'),
 (5.0, 'Waltz with Bashir (Vals im Bashir) (2008)'),
 (5.0, 'Waiter (Ober) (2006)')]

In [208]:
# Time the item-based pipeline end to end: build the item similarity table,
# then print the first ten recommendations for user '87'.
import datetime
starTime = datetime.datetime.now()
# printLogCount=1000 makes caculateSimilarItems print progress every 1000 items.
itemSim = caculateSimilarItems(moviePrefs,printLogCount=1000)
print (getRecommendedItems(moviePrefs,itemSim,'87')[0:10])
# total_seconds() is a float; %d reports it as whole seconds.
print ('Spent %d seconds' % (datetime.datetime.now() - starTime).total_seconds())

1000 / 9064
2000 / 9064
3000 / 9064
4000 / 9064
5000 / 9064
6000 / 9064
7000 / 9064
8000 / 9064
9000 / 9064
[(5.0, 'xXx (2002)'), (5.0, 'loudQUIETloud: A Film About the Pixies (2006)'), (5.0, 'Zombieland (2009)'), (5.0, 'Zodiac (2007)'), (5.0, 'Zenon: Z3 (2004)'), (5.0, 'Zenon: The Zequel (2001)'), (5.0, 'Zenon: Girl of the 21st Century (1999)'), (5.0, 'Young People Fucking (a.k.a. YPF) (2007)'), (5.0, 'Yossi & Jagger (2002)'), (5.0, 'Wrong Cops (2013)')]
Spent 249 seconds


从上面可以看到，**基于物品过滤**所花费的时间还是比较长的，但只要计算出了 **物品的相似度**以后，`getRecommendedItems` 方法基本上很快就可以出结果!

In [209]:
print (getRecommendedItems(moviePrefs,itemSim,'3')[0:10])

[(5.0, 'Yes Man (2008)'), (5.0, 'Win/win (2010)'), (5.0, 'Wife, The (1995)'), (4.5, 'Yu-Gi-Oh! (2004)'), (4.5, 'Young Guns II (1990)'), (4.5, "World's Fastest Indian, The (2005)"), (4.5, 'Wonder Boys (2000)'), (4.5, 'Wolf Children (Okami kodomo no ame to yuki) (2012)'), (4.5, 'Wizard of Oz, The (1939)'), (4.5, 'Witless Protection (2008)')]


In [210]:
print (getRecommendedItems(moviePrefs,itemSim,'140')[0:10])

[(5.0, 'Wings of Hope (Julianes Sturz in den Dschungel) (2000)'), (5.0, 'Win/win (2010)'), (5.0, 'Willie & Phil (1980)'), (5.0, 'Wife, The (1995)'), (4.75, 'Witchfinder General (Conquerer Worm, The) (1968)'), (4.5, 'loudQUIETloud: A Film About the Pixies (2006)'), (4.5, 'Zootopia (2016)'), (4.5, 'Zoolander 2 (2016)'), (4.5, 'Zatoichi on the Road (Zat\xc3\xb4ichi kenka-tabi) (Zat\xc3\xb4ichi 5) (1963)'), (4.5, 'Young People Fucking (a.k.a. YPF) (2007)')]


## 基于用户的协同过滤还是基于物品的协同过滤?

在针对**大数据集**生成推荐列表时，**Item-Base** 的方式明显要比 **User-Base** 更快，只不过它的确有维护物品相似度表的额外开销。
对于 **稀疏数据集**，**Item-Base** 通常要 **优于**  **User-Base**，而对于密集数据集而言，两者的效果几乎是一样的。
尽管如此，**User-Base** 的方法更易于实现，且无需额外步骤，因此通常更适用于规模较小且变化非常频繁的内存数据集。
