# 示例：餐馆菜肴推荐引擎

In [1]:
import numpy as np
from numpy import linalg as la

**推荐系统的工作过程：**给定一个用户，系统会为此用户返回N个最好的推荐菜。
1. 寻找用户没有评级的菜肴，即在用户－物品矩阵中的0值；
2. 在用户没有评级的所有物品中，对每个物品预计一个可能的评级分数，也就是预测用户可能会给该物品打出的分数（这正是相似度计算的初衷）；
3. 对这些物品的评分从高到低进行排序，返回前N个物品。

## 数据导入1

In [2]:
def loadExData():
    """Return the small 7x5 sample rating table as a list of rows.

    Each inner list is one row of the user-item matrix; a 0 marks an
    item the user has not rated.
    """
    ratings = [
        [1, 1, 0, 2, 2],
        [0, 0, 0, 3, 3],
        [0, 0, 0, 1, 1],
        [1, 1, 1, 0, 0],
        [2, 2, 2, 0, 0],
        [1, 1, 1, 0, 0],
        [5, 5, 5, 0, 0],
    ]
    return ratings

## 相似度计算

In [3]:
def ecludSim(inA,inB):
    """Similarity derived from Euclidean distance.

    Returns 1 / (1 + ||inA - inB||), so identical inputs score 1.0 and
    the score shrinks towards 0 as the distance grows.
    """
    gap = inA - inB
    distance = np.linalg.norm(gap)
    return 1.0 / (distance + 1.0)

def pearsSim(inA,inB):
    """Similarity derived from the Pearson correlation coefficient.

    With fewer than three entries the inputs are treated as perfectly
    similar and 1.0 is returned. Otherwise the correlation (in [-1, 1])
    is rescaled to [0, 1] via 0.5 + 0.5 * r.
    """
    if len(inA) >= 3:
        correlation = np.corrcoef(inA, inB, rowvar = 0)
        return 0.5 + 0.5 * correlation[0][1]
    return 1.0

def cosSim(inA,inB):
    """Cosine similarity rescaled from [-1, 1] to [0, 1].

    Accepts column matrices (as passed by the estimators in this file)
    as well as plain 1-D arrays; both inputs are flattened before the
    dot product is taken.

    Returns 0.5 + 0.5 * cos(inA, inB). If either input has zero norm the
    cosine is undefined; 0.0 is returned in that case so the pair carries
    no weight instead of injecting NaN into a weighted average.
    """
    vecA = np.asarray(inA, dtype=float).ravel()
    vecB = np.asarray(inB, dtype=float).ravel()
    denom = np.linalg.norm(vecA) * np.linalg.norm(vecB)
    if denom == 0:
        return 0.0
    # np.dot on 1-D vectors yields a scalar, avoiding float() on a 1x1 matrix
    num = float(np.dot(vecA, vecB))
    return 0.5+0.5*(num/denom)

## 基于物品相似度的推荐引擎

In [4]:
def standEst(dataMat, user, simMeas, item):
    """Estimate the rating `user` would give `item` from item-item similarity.

    dataMat: rating matrix (rows are users, columns are items, 0 = unrated);
             indexed with the np.matrix API (`.A`).
    user:    row index of the user.
    simMeas: similarity function taking two column vectors.
    item:    column index of the item to estimate.

    Returns the similarity-weighted average of the user's existing
    ratings, or 0 when the accumulated similarity is zero.
    """
    itemCount = np.shape(dataMat)[1]
    weightSum = 0.0
    weightedRatings = 0.0
    for other in range(itemCount):
        rating = dataMat[user, other]
        # Only items this user has already rated contribute to the estimate
        if rating == 0:
            continue
        # Rows (users) that rated both the target item and the other item
        bothRated = np.logical_and(dataMat[:, item].A > 0, dataMat[:, other].A > 0)
        sharedRows = np.nonzero(bothRated)[0]
        if len(sharedRows) == 0:
            # No user rated both items, so the pair carries no weight
            sim = 0
        else:
            sim = simMeas(dataMat[sharedRows, item], dataMat[sharedRows, other])
        weightSum += sim
        weightedRatings += sim * rating
    if weightSum == 0:
        return 0
    # Normalise so the estimate stays on the rating scale
    return weightedRatings / weightSum

def recommend(dataMat, user, N=3, simMeas=cosSim, estMethod=standEst):
    """Return up to N (item, estimated score) pairs for the user's unrated items.

    dataMat:   rating matrix (np.matrix API; 0 = unrated).
    user:      row index of the user.
    N:         maximum number of recommendations to return.
    simMeas:   similarity function handed to the estimator.
    estMethod: estimator called as estMethod(dataMat, user, simMeas, item).

    The pairs are ordered by estimated score, highest first. If the user
    has no unrated item, the string 'you rated everything' is returned
    instead of a list.
    """
    # Column indices where this user's rating is 0
    candidates = np.nonzero(dataMat[user, :].A == 0)[1]
    if len(candidates) == 0:
        return 'you rated everything'
    scored = [
        (item, estMethod(dataMat, user, simMeas, item))
        for item in candidates
    ]
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored[:N]

In [5]:
# np.mat was removed in NumPy 2.0; np.asmatrix builds the same np.matrix
myMat = np.asmatrix(loadExData())

In [6]:
# Set four entries to a rating of 4: users 0-2 for item 0, and user 0 for item 1
myMat[0, 1] = myMat[0, 0] = myMat[1, 0] = myMat[2, 0] = 4

In [7]:
# Give user 3 a rating of 2 for item 3
myMat[3, 3] = 2

In [8]:
# Display the rating matrix after the edits above
myMat

matrix([[4, 4, 0, 2, 2],
        [4, 0, 0, 3, 3],
        [4, 0, 0, 1, 1],
        [1, 1, 1, 2, 0],
        [2, 2, 2, 0, 0],
        [1, 1, 1, 0, 0],
        [5, 5, 5, 0, 0]])

In [9]:
# Recommendations for user 2 with the defaults (cosine similarity, standEst)
recommend(myMat, 2)

[(2, 2.5), (1, 2.0243290220056256)]

In [10]:
# Same user, using the Euclidean-distance similarity
recommend(myMat, 2, simMeas=ecludSim)

[(2, 3.0), (1, 2.8266504712098603)]

In [11]:
# Same user, using the Pearson-correlation similarity
recommend(myMat, 2, simMeas=pearsSim)

[(2, 2.5), (1, 2.0)]

## 数据导入2

In [12]:
def loadExData2():
    """Return the larger, sparser 11x11 sample rating table as a list of rows.

    Each inner list is one row of the user-item matrix; a 0 marks an
    item the user has not rated.
    """
    ratings = [
        [0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 5],
        [0, 0, 0, 3, 0, 4, 0, 0, 0, 0, 3],
        [0, 0, 0, 0, 4, 0, 0, 1, 0, 4, 0],
        [3, 3, 4, 0, 0, 0, 0, 2, 2, 0, 0],
        [5, 4, 5, 0, 0, 0, 0, 5, 5, 0, 0],
        [0, 0, 0, 0, 5, 0, 1, 0, 0, 5, 0],
        [4, 3, 4, 0, 0, 0, 0, 5, 5, 0, 1],
        [0, 0, 0, 4, 0, 4, 0, 0, 0, 0, 4],
        [0, 0, 0, 2, 0, 2, 5, 0, 0, 1, 2],
        [0, 0, 0, 0, 5, 0, 0, 0, 0, 4, 0],
        [1, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0],
    ]
    return ratings

In [13]:
# np.mat was removed in NumPy 2.0; np.asmatrix builds the same np.matrix
U, Sigma, VT = la.svd(np.asmatrix(loadExData2()))

In [14]:
# Inspect the singular values returned by the decomposition
Sigma

array([15.77075346, 11.40670395, 11.03044558,  4.84639758,  3.09292055,
        2.58097379,  1.00413543,  0.72817072,  0.43800353,  0.22082113,
        0.07367823])

In [15]:
# Total energy: the sum of the squared singular values
Sig2 = Sigma ** 2
sum(Sig2)

541.9999999999995

In [16]:
# 90% of the total energy
sum(Sig2) * 0.9

487.7999999999996

In [17]:
# Energy held by the first two singular values; this is below 90% of the total
sum(Sig2[:2])

378.8295595113579

In [18]:
# Energy held by the first three singular values; this is above 90% of the total
sum(Sig2[:3])

500.50028912757926

前三个奇异值所包含的能量已超过总能量的90%，因此可以将这个11维的矩阵近似转换成一个3维的矩阵。

## 基于SVD的评分估计
在转换后的低维空间中计算物品之间的相似度，并据此构造评分估计函数（上面的分析表明3维即可，下面的代码实际保留了前4个奇异值）。

In [19]:
def svdEst(dataMat, user, simMeas, item, k=4):
    """Estimate the rating `user` would give `item` using SVD-reduced items.

    dataMat: rating matrix (np.matrix; rows are users, columns are items,
             0 = unrated).
    user:    row index of the user.
    simMeas: similarity function taking two column vectors.
    item:    column index of the item to estimate.
    k:       number of leading singular values to keep (default 4, the
             value that was previously hard-coded). It is capped at the
             number of singular values the decomposition produces.

    Returns the similarity-weighted average of the user's existing
    ratings, or 0 when the accumulated similarity is zero.

    Raises ValueError if k is smaller than 1.

    Side effect: prints every item-pair similarity it computes.
    """
    if k < 1:
        raise ValueError('k must be at least 1')
    # Number of items (columns) in the data set
    n = np.shape(dataMat)[1]
    simTotal = 0.0
    ratSimTotal = 0.0
    # NOTE: the decomposition is recomputed on every call, i.e. once per
    # candidate item when driven by recommend()
    U, Sigma, VT = la.svd(dataMat)
    # Small matrices yield fewer than k singular values; keep what exists
    k = min(k, len(Sigma))
    # Diagonal matrix of the k leading singular values
    # (np.asmatrix rather than np.mat, which was removed in NumPy 2.0)
    SigK = np.asmatrix(np.diag(Sigma[:k]))
    # Map every item into the k-dimensional space: one row per item
    xformedItems = dataMat.T * U[:, :k] * SigK.I
    # Walk over every item the user has rated, except the target itself
    for j in range(n):
        userRating = dataMat[user, j]
        if userRating == 0 or j == item:
            continue
        # Similarity is measured between the reduced item vectors
        similarity = simMeas(xformedItems[item, :].T, xformedItems[j, :].T)
        print('the %d and %d similarity is: %f' % (item, j, similarity))
        simTotal += similarity
        ratSimTotal += similarity * userRating
    if simTotal == 0:
        return 0
    # Normalise so the estimate stays on the rating scale
    return ratSimTotal / simTotal

In [20]:
# Build the matrix once and reuse it for the decomposition
# (np.asmatrix rather than np.mat, which was removed in NumPy 2.0)
myMat = np.asmatrix(loadExData2())
U, Sigma, VT = la.svd(myMat)

In [21]:
# Recommendations for user 1 using the SVD-based estimator with the default cosine similarity
recommend(myMat, 1, estMethod=svdEst)

the 0 and 3 similarity is: 0.490950
the 0 and 5 similarity is: 0.484274
the 0 and 10 similarity is: 0.512755
the 1 and 3 similarity is: 0.491294
the 1 and 5 similarity is: 0.481516
the 1 and 10 similarity is: 0.509709
the 2 and 3 similarity is: 0.491573
the 2 and 5 similarity is: 0.482346
the 2 and 10 similarity is: 0.510584
the 4 and 3 similarity is: 0.450495
the 4 and 5 similarity is: 0.506795
the 4 and 10 similarity is: 0.512896
the 6 and 3 similarity is: 0.743699
the 6 and 5 similarity is: 0.468366
the 6 and 10 similarity is: 0.439465
the 7 and 3 similarity is: 0.482175
the 7 and 5 similarity is: 0.494716
the 7 and 10 similarity is: 0.524970
the 8 and 3 similarity is: 0.491307
the 8 and 5 similarity is: 0.491228
the 8 and 10 similarity is: 0.520290
the 9 and 3 similarity is: 0.522379
the 9 and 5 similarity is: 0.496130
the 9 and 10 similarity is: 0.493617


[(4, 3.344714938469228), (7, 3.329402072452697), (9, 3.328100876390069)]

In [22]:
# Same user and estimator, using the Pearson-correlation similarity
recommend(myMat, 1, estMethod=svdEst, simMeas=pearsSim)

the 0 and 3 similarity is: 0.341942
the 0 and 5 similarity is: 0.124132
the 0 and 10 similarity is: 0.116698
the 1 and 3 similarity is: 0.345560
the 1 and 5 similarity is: 0.126456
the 1 and 10 similarity is: 0.118892
the 2 and 3 similarity is: 0.345149
the 2 and 5 similarity is: 0.126190
the 2 and 10 similarity is: 0.118640
the 4 and 3 similarity is: 0.450126
the 4 and 5 similarity is: 0.528504
the 4 and 10 similarity is: 0.544647
the 6 and 3 similarity is: 0.923822
the 6 and 5 similarity is: 0.724840
the 6 and 10 similarity is: 0.710896
the 7 and 3 similarity is: 0.319482
the 7 and 5 similarity is: 0.118324
the 7 and 10 similarity is: 0.113370
the 8 and 3 similarity is: 0.334910
the 8 and 5 similarity is: 0.119673
the 8 and 10 similarity is: 0.112497
the 9 and 3 similarity is: 0.566918
the 9 and 5 similarity is: 0.590049
the 9 and 10 similarity is: 0.602380


[(4, 3.346952186702173), (9, 3.3353796573274694), (6, 3.3071930278130366)]